Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
498 changes: 498 additions & 0 deletions docs/assets/recipes/trace_ingestion/agent_rollout_distillation.py

Large diffs are not rendered by default.

17 changes: 17 additions & 0 deletions docs/recipes/cards.md
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,23 @@ Each recipe is a self-contained example that can be run independently.
[:material-book-open-page-variant: View Recipe](qa_and_chat/multi_turn_chat.md){ .md-button }
[Download Code :octicons-download-24:](../assets/recipes/qa_and_chat/multi_turn_chat.py){ .md-button download="multi_turn_chat.py" }

- :material-source-branch:{ .lg .middle } **Agent Rollout Trace Distillation**

Read agent rollout traces from disk and turn each imported rollout into a structured workflow record inside a Data Designer pipeline.

---

**Demonstrates:**

- `AgentRolloutSeedSource` across Claude Code and Codex rollout formats
- Using normalized trace columns in generation prompts
- Distilling agent traces into reusable structured records

---

[:material-book-open-page-variant: View Recipe](trace_ingestion/agent_rollout_distillation.md){ .md-button }
[Download Code :octicons-download-24:](../assets/recipes/trace_ingestion/agent_rollout_distillation.py){ .md-button download="agent_rollout_distillation.py" }


- :material-tools:{ .lg .middle } **Basic MCP Tool Use**

Expand Down
12 changes: 12 additions & 0 deletions docs/recipes/trace_ingestion/agent_rollout_distillation.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
[Download Code :octicons-download-24:](../../assets/recipes/trace_ingestion/agent_rollout_distillation.py){ .md-button download="agent_rollout_distillation.py" }

This recipe ingests agent rollout traces with `AgentRolloutSeedSource(...)`, selecting one of the built-in trace
formats with `--format` and optionally overriding the input directory with `--trace-dir`. The supported formats are
`claude_code` and `codex`; when `--trace-dir` is omitted, each format falls back to its default location
(`~/.claude/projects` for Claude Code, `~/.codex/sessions` for Codex). The pipeline turns each imported trace into a
compact task digest, a standalone instruction-response pair for coding-assistant SFT, and a judge-scored quality
signal you can use for downstream filtering. It supports both full dataset creation and in-memory preview mode via
`--preview`.

```python
--8<-- "assets/recipes/trace_ingestion/agent_rollout_distillation.py"
```
2 changes: 2 additions & 0 deletions mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ nav:
- QA and Chat:
- Product Info QA: recipes/qa_and_chat/product_info_qa.md
- Multi-Turn Chat: recipes/qa_and_chat/multi_turn_chat.md
- Trace Ingestion:
- Agent Rollout Trace Distillation: recipes/trace_ingestion/agent_rollout_distillation.md
- MCP and Tool Use:
- "Basic MCP Tool Use": recipes/mcp_and_tooluse/basic_mcp.md
- "PDF Document QA": recipes/mcp_and_tooluse/pdf_qa.md
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,8 @@
SeedConfig,
)
from data_designer.config.seed_source import ( # noqa: F401
AgentRolloutFormat,
AgentRolloutSeedSource,
DirectorySeedSource,
FileContentsSeedSource,
HuggingFaceSeedSource,
Expand Down Expand Up @@ -199,6 +201,8 @@
"SeedConfig": (_MOD_SEED, "SeedConfig"),
# seed_source
"DataFrameSeedSource": (f"{_MOD_BASE}.seed_source_dataframe", "DataFrameSeedSource"),
"AgentRolloutFormat": (_MOD_SEED_SOURCE, "AgentRolloutFormat"),
"AgentRolloutSeedSource": (_MOD_SEED_SOURCE, "AgentRolloutSeedSource"),
"DirectorySeedSource": (_MOD_SEED_SOURCE, "DirectorySeedSource"),
"FileContentsSeedSource": (_MOD_SEED_SOURCE, "FileContentsSeedSource"),
"HuggingFaceSeedSource": (_MOD_SEED_SOURCE, "HuggingFaceSeedSource"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
validate_dataset_file_path,
validate_path_contains_files_of_type,
)
from data_designer.config.utils.type_helpers import StrEnum

if TYPE_CHECKING:
import pandas as pd
Expand Down Expand Up @@ -109,14 +110,14 @@ class FileSystemSeedSource(SeedSource, ABC):
)

@field_validator("path", mode="after")
def validate_path(cls, value: str) -> str:
path = Path(value).expanduser().resolve()
if not path.is_dir():
raise InvalidFilePathError(f"🛑 Path {path} is not a directory.")
return value
def validate_path(cls, value: str | None) -> str | None:
# Signature is str | None because AgentRolloutSeedSource overrides path to str | None
# and inherited validators fire for all subclasses.
return _validate_filesystem_seed_source_path(value)

def model_post_init(self, __context: Any) -> None:
self._runtime_path = _resolve_filesystem_runtime_path(self.path)
# None guard is exercised by AgentRolloutSeedSource (path: str | None) via inheritance.
self._runtime_path = None if self.path is None else _resolve_filesystem_runtime_path(self.path)

@property
def runtime_path(self) -> str:
Expand All @@ -125,12 +126,8 @@ def runtime_path(self) -> str:
return self._runtime_path

@field_validator("file_pattern", mode="after")
def validate_file_pattern(cls, value: str) -> str:
if not value.strip():
raise ValueError("🛑 FileSystemSeedSource.file_pattern must be a non-empty string.")
if "/" in value or "\\" in value:
raise ValueError("🛑 FileSystemSeedSource.file_pattern must match file names, not relative paths.")
return value
def validate_file_pattern(cls, value: str | None) -> str | None:
return _validate_filesystem_seed_source_file_pattern(value)


class DirectorySeedSource(FileSystemSeedSource):
Expand Down Expand Up @@ -165,3 +162,85 @@ def _resolve_local_file_runtime_path(path: str) -> str:
path_prefix, glob_suffix = path.split("*", 1)
resolved_prefix = Path(path_prefix or ".").expanduser().resolve()
return str(resolved_prefix / f"*{glob_suffix}")


def get_claude_code_default_path() -> str:
    """Return the default Claude Code rollout directory with ``~`` expanded.

    The location is ``~/.claude/projects``. The directory is not checked for
    existence here; only the home-directory prefix is expanded.
    """
    default_dir = Path("~/.claude/projects")
    return str(default_dir.expanduser())


def get_codex_default_path() -> str:
    """Return the default Codex rollout directory with ``~`` expanded.

    The location is ``~/.codex/sessions``. The directory is not checked for
    existence here; only the home-directory prefix is expanded.
    """
    default_dir = Path("~/.codex/sessions")
    return str(default_dir.expanduser())


def _validate_filesystem_seed_source_path(value: str | None) -> str | None:
if value is None:
return None
path = Path(value).expanduser().resolve()
if not path.is_dir():
raise InvalidFilePathError(f"🛑 Path {path} is not a directory.")
return value


def _validate_filesystem_seed_source_file_pattern(value: str | None) -> str | None:
if value is None:
return None
if not value.strip():
raise ValueError("🛑 FileSystemSeedSource.file_pattern must be a non-empty string.")
if "/" in value or "\\" in value:
raise ValueError("🛑 FileSystemSeedSource.file_pattern must match file names, not relative paths.")
return value


# Identifiers for the agent rollout trace formats that have built-in defaults.
# Members are string-valued, so the values below are what appears in configs.
class AgentRolloutFormat(StrEnum):
    # Claude Code rollouts; default directory is ~/.claude/projects.
    CLAUDE_CODE = "claude_code"
    # Codex rollouts; default directory is ~/.codex/sessions.
    CODEX = "codex"


def get_agent_rollout_format_defaults(fmt: AgentRolloutFormat) -> tuple[str, str]:
    """Return the default ``(directory, file_pattern)`` pair for a rollout format.

    Args:
        fmt: The agent rollout format to look up.

    Returns:
        A tuple of the format's default directory (home-expanded) and its
        default file-name pattern, which is ``"*.jsonl"`` for both known formats.

    Raises:
        ValueError: If ``fmt`` does not compare equal to any known format.
    """
    # Pair each format with its path getter so the path is only computed for
    # the format that actually matches.
    known_formats = (
        (AgentRolloutFormat.CLAUDE_CODE, get_claude_code_default_path),
        (AgentRolloutFormat.CODEX, get_codex_default_path),
    )
    for candidate, default_path_getter in known_formats:
        if fmt == candidate:
            return (default_path_getter(), "*.jsonl")
    raise ValueError(f"🛑 Unknown agent rollout format: {fmt!r}")


# Seed source that reads agent rollout trace files in one of the built-in formats.
# `path` and `file_pattern` may both be left as None; in that case the per-format
# defaults from get_agent_rollout_format_defaults() are used when they are read.
class AgentRolloutSeedSource(FileSystemSeedSource):
    seed_type: Literal["agent_rollout"] = "agent_rollout"

    # Required field: there is no default format.
    format: AgentRolloutFormat = Field(
        ...,
        description="Built-in agent rollout format to use for parsing trace files.",
    )

    # Optional override of the trace directory.
    path: str | None = Field(
        default=None,
        description=(
            "Directory containing agent rollout artifacts. When omitted, built-in defaults are used: "
            "Claude Code defaults to ~/.claude/projects and Codex defaults to ~/.codex/sessions. "
            "Relative paths are resolved from the current working directory when the config is loaded, "
            "not from the config file location."
        ),
    )

    # Optional override of the file-name pattern.
    file_pattern: str | None = Field(
        default=None,
        description=(
            "Case-sensitive filename pattern used to match agent rollout files. When omitted, defaults to '*.jsonl'."
        ),
    )

    @property
    def runtime_path(self) -> str:
        """Return the resolved directory to read traces from.

        The value is computed on first access and stored in ``_runtime_path``;
        later reads return the stored value. When ``path`` is ``None`` the
        format's default directory is used instead.
        """
        if self._runtime_path is None:
            format_default_dir = get_agent_rollout_format_defaults(self.format)[0]
            chosen_dir = format_default_dir if self.path is None else self.path
            self._runtime_path = _resolve_filesystem_runtime_path(chosen_dir)
        return self._runtime_path

    @property
    def resolved_file_pattern(self) -> str:
        """Return ``file_pattern`` if set, otherwise the format's default pattern."""
        pattern = self.file_pattern
        if pattern is None:
            pattern = get_agent_rollout_format_defaults(self.format)[1]
        return pattern
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from typing_extensions import TypeAlias

from data_designer.config.seed_source import (
AgentRolloutSeedSource,
DirectorySeedSource,
FileContentsSeedSource,
HuggingFaceSeedSource,
Expand All @@ -20,7 +21,12 @@
plugin_manager = PluginManager()

_SeedSourceT: TypeAlias = (
LocalFileSeedSource | HuggingFaceSeedSource | DataFrameSeedSource | DirectorySeedSource | FileContentsSeedSource
LocalFileSeedSource
| HuggingFaceSeedSource
| DataFrameSeedSource
| DirectorySeedSource
| FileContentsSeedSource
| AgentRolloutSeedSource
)
_SeedSourceT = plugin_manager.inject_into_seed_source_type_union(_SeedSourceT)

Expand Down
Loading
Loading