From 3a1c9347fbb1c12fef37f811cf2595975eff2a6b Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Tue, 17 Mar 2026 22:12:56 -0700 Subject: [PATCH 01/27] add skill --- skills/data-designer/SKILL.md | 99 +++++++++++++++++++ .../references/person-sampling.md | 70 +++++++++++++ .../data-designer/references/seed-datasets.md | 14 +++ skills/data-designer/workflows/autopilot.md | 27 +++++ skills/data-designer/workflows/interactive.md | 30 ++++++ 5 files changed, 240 insertions(+) create mode 100644 skills/data-designer/SKILL.md create mode 100644 skills/data-designer/references/person-sampling.md create mode 100644 skills/data-designer/references/seed-datasets.md create mode 100644 skills/data-designer/workflows/autopilot.md create mode 100644 skills/data-designer/workflows/interactive.md diff --git a/skills/data-designer/SKILL.md b/skills/data-designer/SKILL.md new file mode 100644 index 00000000..42088dea --- /dev/null +++ b/skills/data-designer/SKILL.md @@ -0,0 +1,99 @@ +--- +name: data-designer +description: Use when the user wants to create a dataset, generate synthetic data, or build a data generation pipeline. Contains the essential workflow and discovery commands for the Data Designer library. Always invoke before exploring the workspace or writing code. +argument-hint: "[describe the dataset you want to generate]" +--- + +# Before You Start + +Do not explore the workspace, browse files, run `ls`/`find`/`Glob`, check git history, or spawn Agent subagents before starting the workflow below. + +# Goal + +Build a synthetic dataset using the Data Designer library that matches this description: + +$ARGUMENTS + +# Agent CLI + +Always run this command before starting attempting to design or build a dataset. 
This command is your single discovery mechanism — it tells you exactly which files to read: + +```bash +data-designer agent context +``` + +# Workflow + +Use **Autopilot** mode if the user implies they don't want to answer questions — e.g., they say something like "be opinionated", "you decide", "make reasonable assumptions", "just build it", "surprise me", etc. Otherwise, use **Interactive** mode (default). + +Read **only** the workflow file that matches the selected mode, then follow it: + +- **Interactive** → read `workflows/interactive.md` +- **Autopilot** → read `workflows/autopilot.md` + +# Rules + +- Do not drop columns unless the user explicitly asks. Keep all columns in the output by default. +- Do not suggest or ask about seed datasets. Only use one when the user explicitly provides seed data or asks to build from existing records. When using a seed, read `references/seed-datasets.md`. +- When the dataset requires person data (names, demographics, addresses), read `references/person-sampling.md`. +- If a dataset script that matches the dataset description already exists, ask the user whether to edit it or create a new one. + +# Usage Tips and Common Pitfalls + +- **Sampler and validation columns need both a type and params.** E.g., `sampler_type="category"` with `params=dd.CategorySamplerParams(...)`. +- **Jinja2 templates** in `prompt`, `system_prompt`, and `expr` fields: reference columns with `{{ column_name }}`, nested fields with `{{ column_name.field }}`. +- **`SamplerColumnConfig`:** Takes `params`, not `sampler_params`. +- **LLM judge score access:** `LLMJudgeColumnConfig` produces a nested dict where each score name maps to `{reasoning: str, score: int}`. To get the numeric score, use the `.score` attribute. For example, for a judge column named `quality` with a score named `correctness`, use `{{ quality.correctness.score }}`. Using `{{ quality.correctness }}` returns the full dict, not the numeric score. 
+- **Nested field access in `SchemaTransformProcessorConfig`:** Nested field access (e.g., `{{ column.field }}`) does **not** work inside schema transform templates because the processor sees column values as serialized strings, not parsed dicts. This affects structured columns, judge columns, and any column with nested output. To use nested fields in a schema transform, first extract them into intermediate `ExpressionColumnConfig` columns (e.g., `expr="{{ column.field }}"` with `drop=True`), then reference those flat columns in the template. + +# Troubleshooting + +- **`data-designer` command not found:** The package is not on the PATH. The user needs to either install it (`pip install data-designer`) or activate their virtual environment. +- **Network errors during preview:** A sandbox environment may be blocking outbound requests. Let the user know and ask them to either run the preview command outside the sandbox or grant the necessary permissions. + +# Output Template + +Write a Python file to the current directory with a `load_config_builder()` function returning a `DataDesignerConfigBuilder`. Use PEP 723 inline metadata for dependencies. 
+ +```python +# /// script +# dependencies = [ +# "data-designer", +# "pydantic", +# ] +# /// +import data_designer.config as dd +from pydantic import BaseModel, Field + + +# Define Pydantic models when a column needs structured output +class MyEntity(BaseModel): + field_one: str = Field(description="...") + field_two: int = Field(description="...") + + +# Use custom generators when built-in column types aren't enough +@dd.custom_column_generator( + required_columns=["col_a"], + side_effect_columns=["extra_col"], +) +def my_custom_generator(row: dict) -> dict: + # add custom logic here and update row in place + row["custom_field"] = "custom value" + row["extra_col"] = "extra value" + return row + + +def load_config_builder() -> dd.DataDesignerConfigBuilder: + config_builder = dd.DataDesignerConfigBuilder() + + # Seed dataset (only if the user explicitly mentions a seed dataset path) + # config_builder.with_seed_dataset(dd.LocalFileSeedSource(path="path/to/seed.parquet")) + + # config_builder.add_column(...) + # config_builder.add_processor(...) + + return config_builder +``` + +Only include Pydantic models, custom generators, seed datasets, and extra dependencies when the task requires them. diff --git a/skills/data-designer/references/person-sampling.md b/skills/data-designer/references/person-sampling.md new file mode 100644 index 00000000..f8114718 --- /dev/null +++ b/skills/data-designer/references/person-sampling.md @@ -0,0 +1,70 @@ +# Person Sampling Reference + +## Sampler types + +Prefer `"person"` when the locale is downloaded — it provides census-grounded demographics and optional personality traits. Fall back to `"person_from_faker"` when the locale isn't available. + +| `sampler_type` | Params class | When to use | +|---|---|---| +| `"person"` | `PersonSamplerParams` | **Preferred.** Locale downloaded to `~/.data-designer/managed-assets/datasets/` by default. 
| +| `"person_from_faker"` | `PersonFromFakerSamplerParams` | Fallback when locale not downloaded. Basic names/addresses via Faker, not demographically accurate. | + +## Available persona datasets + +Before using, always check for installed persona datasets with this command: + +```bash +data-designer agent state persona-datasets +``` + +## Usage + +The sampled person column is a nested dict. You can keep it as-is in the final dataset, or set `drop=True` to remove it and extract only the fields you need via `ExpressionColumnConfig`: + +```python +# Keep the full person dict in the output +config_builder.add_column(dd.SamplerColumnConfig( + name="person", sampler_type="person", + params=dd.PersonSamplerParams(locale="en_US"), +)) + +# Or drop it and extract specific fields +config_builder.add_column(dd.SamplerColumnConfig( + name="person", sampler_type="person", + params=dd.PersonSamplerParams(locale="en_US"), drop=True, +)) +config_builder.add_column(dd.ExpressionColumnConfig( + name="full_name", + expr="{{ person.first_name }} {{ person.last_name }}", dtype="str", +)) +``` + +## PersonSamplerParams + +| Parameter | Type | Default | Notes | +|---|---|---|---| +| `locale` | `str` | `"en_US"` | Must be a downloaded managed-dataset locale | +| `sex` | `"Male" \| "Female" \| None` | `None` | Filter by sex | +| `city` | `str \| list[str] \| None` | `None` | Filter by city | +| `age_range` | `list[int]` | `[18, 114]` | `[min, max]` inclusive | +| `select_field_values` | `dict[str, list[str]] \| None` | `None` | Flexible field filtering | +| `with_synthetic_personas` | `bool` | `False` | Append Big Five + persona fields | + +Available managed-dataset locales: `en_US`, `en_IN`, `en_SG`, `hi_Deva_IN`, `hi_Latn_IN`, `ja_JP`, `pt_BR` + +## PersonFromFakerSamplerParams + +| Parameter | Type | Default | Notes | +|---|---|---|---| +| `locale` | `str` | `"en_US"` | Any Faker-supported locale | +| `sex` | `"Male" \| "Female" \| None` | `None` | Filter by sex | +| `city` | `str \| 
list[str] \| None` | `None` | Filter by city | +| `age_range` | `list[int]` | `[18, 114]` | `[min, max]` inclusive | + +## Person fields (keys in sampled dict) + +**Standard fields:** `uuid`, `first_name`, `middle_name`, `last_name`, `sex`, `age`, `birth_date`, `marital_status`, `postcode`, `city`, `region`, `country`, `locale`, `education_level`, `bachelors_field`, `occupation`, `national_id`, `street_name`, `street_number`, `email_address`, `phone_number` + +**Locale-specific:** `unit`/`state` (US), `area`/`prefecture`/`zone` (JP), `race` (BR), `district`/`education_degree`/`first_language`/`second_language`/`third_language` (IN), `religion` (BR, IN) + +**Persona fields** (when `with_synthetic_personas=True`): `persona`, `detailed_persona`, `cultural_background`, `career_goals_and_ambitions`, `hobbies_and_interests`, `skills_and_expertise`, Big Five scores (`openness`, `conscientiousness`, `extraversion`, `agreeableness`, `neuroticism`), plus domain personas (`professional_persona`, `finance_persona`, `healthcare_persona`, etc.) diff --git a/skills/data-designer/references/seed-datasets.md b/skills/data-designer/references/seed-datasets.md new file mode 100644 index 00000000..077cb1e6 --- /dev/null +++ b/skills/data-designer/references/seed-datasets.md @@ -0,0 +1,14 @@ +# Seed Datasets Reference + +Seed datasets bootstrap synthetic data generation from existing data. Every column from the seed becomes a Jinja2 variable you can reference in prompts and expressions — the seed provides realism and domain specificity, and Data Designer adds volume and variation on top. + +## Before configuring a seed source + +1. **Read the source code.** Read `{config_root}/seed_source.py` for all seed source classes and their parameters. Do not guess types or parameters. + +2. **Verify the dataset is readable and fetch column names.** Before wiring the seed into the config, confirm the file can be read and extract its column names. 
This catches bad paths and corrupt files, and gives you the exact column names available for downstream prompts. + +## Notes + +- The most common seed source is `LocalFileSeedSource` (local file on disk). Supported formats: `.parquet`, `.csv`, `.json`, `.jsonl`. +- Seed columns are automatically registered as `SeedDatasetColumnConfig` entries — you do **not** add them manually. Just reference them by name in downstream prompts and expressions. diff --git a/skills/data-designer/workflows/autopilot.md b/skills/data-designer/workflows/autopilot.md new file mode 100644 index 00000000..80f1b0e5 --- /dev/null +++ b/skills/data-designer/workflows/autopilot.md @@ -0,0 +1,27 @@ +# Autopilot Workflow + +In this mode, make reasonable design decisions autonomously based on the dataset description. Do not ask clarifying questions — infer sensible defaults and move straight through to a working preview. + +1. **Learn** — Run `data-designer agent context`. + - If no model aliases are configured, stop and ask the user. + - Inspect schemas for every column, sampler type, validator, and processor you plan to use. + - Never guess types or parameters — read the relevant config files first. + - Always read `base.py` for inherited fields shared by all config objects. +2. **Infer** — Based on the dataset description, make reasonable decisions for: + - Axes of diversity and what should be well represented. + - Which variables to randomize. + - The schema of the final dataset. + - The structure of any structured output columns. + - Briefly state the key decisions you made so the user can course-correct if needed. +3. **Plan** — Determine columns, samplers, processors, validators, and other dataset features needed. +4. **Build** — Write the Python script with `load_config_builder()` (see Output Template in SKILL.md). +5. **Validate** — Run `data-designer validate `. Address any warnings or errors and re-validate until it passes. +6. 
**Preview** — Run `data-designer preview --save-results` to generate sample records as HTML files. + - `cd` into the sample records directory (which is printed out by the `data-designer preview` command) + - Run `python -m http.server 8787` (in background) + - Tell the user to open `http://localhost:8787/sample_records_browser.html` to review them +7. **Create** — If the user specified a record count: + - 50 or fewer: run `data-designer create --num-records ` directly. + - More than 50: warn that generation can take a long time and ask for confirmation before running. + - If no record count was specified, skip this step. +8. **Present** — Summarize what was built: columns, samplers used, key design choices. If the create command was run, share the results. Ask the user if they want any changes. If so, edit the script, re-validate, re-preview, and iterate. diff --git a/skills/data-designer/workflows/interactive.md b/skills/data-designer/workflows/interactive.md new file mode 100644 index 00000000..b2e193d6 --- /dev/null +++ b/skills/data-designer/workflows/interactive.md @@ -0,0 +1,30 @@ +# Interactive Workflow + +This is an interactive, iterative design process. Do not disengage from the loop unless the user says they are satisfied. + +1. **Learn** — Run `data-designer agent context`. + - If no model aliases are configured, stop and ask the user. + - Inspect schemas for every column, sampler type, validator, and processor you plan to use. + - Never guess types or parameters — read the relevant config files first. + - Always read `base.py` for inherited fields shared by all config objects. +2. **Clarify** — Ask the user clarifying questions to narrow down precisely what they want. + - Use a question-asking UX tool if available. + - Optimize for a great user experience: batch related questions together, keep the set short, provide concrete options/examples/defaults where possible, and use structured inputs (single-select, multi-select, forms, etc.) 
when they make answering easier. + - Common things to make precise: + - What the "axes of diversity" are — what should be well represented and diverse in the resulting dataset. + - The kind and nature of any input data. + - What variables should be randomized. + - The schema of the final dataset. + - The structure of any required structured output columns. + - What facets of the output dataset are important to capture. +3. **Plan** — Determine columns, samplers, processors, validators, and other dataset features needed. +4. **Build** — Write the Python script with `load_config_builder()` (see Output Template in SKILL.md). +5. **Validate** — Run `data-designer validate `. Address any warnings or errors and re-validate until it passes. +6. **Preview** — Run `data-designer preview --save-results` to generate sample records as HTML files. + - `cd` into the sample records directory (which is printed out by the `data-designer preview` command) + - Run `python -m http.server 8787` (in background) + - Tell the user to open `http://localhost:8787/sample_records_browser.html` to review them +7. **Iterate** — Ask the user for feedback. Edit the script, re-validate, re-preview, and serve again. Repeat until they are satisfied. +8. **Finalize** — Once the user is happy, tell them they can run the following command to create the dataset: + - `data-designer create --num-records `. + - Do not run this command yourself. It requires model endpoints and can take a long time. 
From 15836399f67d736deff5a13823d06e498e38d5cc Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Tue, 17 Mar 2026 22:28:37 -0700 Subject: [PATCH 02/27] remove quotes from hint --- skills/data-designer/SKILL.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skills/data-designer/SKILL.md b/skills/data-designer/SKILL.md index 42088dea..bf48ba77 100644 --- a/skills/data-designer/SKILL.md +++ b/skills/data-designer/SKILL.md @@ -1,7 +1,7 @@ --- name: data-designer description: Use when the user wants to create a dataset, generate synthetic data, or build a data generation pipeline. Contains the essential workflow and discovery commands for the Data Designer library. Always invoke before exploring the workspace or writing code. -argument-hint: "[describe the dataset you want to generate]" +argument-hint: [describe the dataset you want to generate] --- # Before You Start From 47bd0854c1268af15417e03f100ca516c81cf6d2 Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Tue, 17 Mar 2026 22:31:40 -0700 Subject: [PATCH 03/27] add internal metadata to .claude skills --- .claude/skills/commit/SKILL.md | 2 ++ .claude/skills/create-pr/SKILL.md | 2 ++ .claude/skills/new-sdg/SKILL.md | 22 ++++++++++++---------- .claude/skills/review-code/SKILL.md | 2 ++ .claude/skills/search-docs/SKILL.md | 2 ++ .claude/skills/search-github/SKILL.md | 2 ++ .claude/skills/update-pr/SKILL.md | 2 ++ 7 files changed, 24 insertions(+), 10 deletions(-) diff --git a/.claude/skills/commit/SKILL.md b/.claude/skills/commit/SKILL.md index 8bb69a13..b2e1c15d 100644 --- a/.claude/skills/commit/SKILL.md +++ b/.claude/skills/commit/SKILL.md @@ -3,6 +3,8 @@ name: commit description: Commit current changes with a clear, descriptive message argument-hint: [special instructions] disable-model-invocation: true +metadata: + internal: true --- # Commit Changes diff --git a/.claude/skills/create-pr/SKILL.md b/.claude/skills/create-pr/SKILL.md index 85ca1b3a..b3f4fe94 100644 --- 
a/.claude/skills/create-pr/SKILL.md +++ b/.claude/skills/create-pr/SKILL.md @@ -3,6 +3,8 @@ name: create-pr description: Create a GitHub PR with a well-formatted description including summary, categorized changes, and attention areas argument-hint: [special instructions] disable-model-invocation: true +metadata: + internal: true --- # Create Pull Request diff --git a/.claude/skills/new-sdg/SKILL.md b/.claude/skills/new-sdg/SKILL.md index ce889bff..db908193 100644 --- a/.claude/skills/new-sdg/SKILL.md +++ b/.claude/skills/new-sdg/SKILL.md @@ -3,6 +3,8 @@ name: new-sdg description: Implement a new synthetic data generator using NeMo Data Designer by defining its configuration and executing a preview job. argument-hint: disable-model-invocation: true +metadata: + internal: true --- # Your Goal @@ -18,13 +20,13 @@ Implement a new synthetic data generator using NeMo Data Designer to match the u The user will provide you with some description, but it is likely that you do not have enough information to precisely define what they want. It is hard for a user to define everything up front. Ask follow up questions to the user -using the AskUser tool to narrow down on precisely what they want. +using the AskUser tool to narrow down on precisely what they want. Common things to make precise are: - IMPORTANT: What the "axes of diversity" are -- e.g. what should be well represented and diverse in the resulting dataset. - The kind an nature of any input data to the dataset. -- What variables should be randomized. +- What variables should be randomized. - The schema of the final dataset. - The structure of any required structured output columns. - What facets of the output dataset are important to capture. @@ -40,22 +42,22 @@ Common things to make precise are: > USER: Respond > YOU: ...repeat... -Very often, the initial implementation will not conform precisely to what the user wants. You are to engage in an **iterative design loop** with the user. 
As shown +Very often, the initial implementation will not conform precisely to what the user wants. You are to engage in an **iterative design loop** with the user. As shown in the example below, you will construct a configuration, then review its outputs, -present those outputs to the user, and ask follow up questions. +present those outputs to the user, and ask follow up questions. Depending on the user responses, you will then edit the script, re-run it, and present the user with the results and ask followups and so. When showing results to the user DO NOT SUMMARIZE content, it is *very important* that you show them the records as-is so they can make thoughtful decisions. DO NOT disengage from this **iterative design loop** unless commanded by the user. -## Implementing a NeMo Data Designer Synthetic Data Generator +## Implementing a NeMo Data Designer Synthetic Data Generator - You will be writing a new python script for execution. - The script should be made in the current working directory, so `$(pwd)/script-name.py`. - Implement the script as a stand-alone, `uv`-executable script (https://docs.astral.sh/uv/guides/scripts/#creating-a-python-script). - The script should depend on the latest version of `data-designer`. -- Include other third-party dependencies only if the job requires it. +- Include other third-party dependencies only if the job requires it. - Model aliases are required when definining LLM generation columns. - Before implementing, make sure to use the Explore tool to understand the src/ and docs/. - Review available model aliases and providers. @@ -73,7 +75,7 @@ uv run --with data-designer data-designer config list ### Real World Seed Data -Depending on user requirements, you may need to access real-world datasets to serve as Seed datasets for your Data Designer SDG. +Depending on user requirements, you may need to access real-world datasets to serve as Seed datasets for your Data Designer SDG. 
In these cases, you may use Web Search tools to search for datasets available on HuggingFace, and use the `datasets` python library to load them. You will have to convert them to Pandas DataFrames in these cases. @@ -88,7 +90,7 @@ If you do use real-world data, pay attention to file sizes and avoid large file # ] # /// -# ... data designer config_builder implementation +# ... data designer config_builder implementation def build_config() -> DataDesignerConfigBuilder: """Implements the definition of the synthetic data generator. @@ -112,7 +114,7 @@ if __name__ == "__main__": preview.display_sample_record() # The raw data is located in this Pandas DataFrame object. - # You can implenent code to display some or all of this + # You can implement code to display some or all of this # to STDOUT so you can see the outputs and report to the user. preview.dataset -``` \ No newline at end of file +``` diff --git a/.claude/skills/review-code/SKILL.md b/.claude/skills/review-code/SKILL.md index ee817f50..922afb0b 100644 --- a/.claude/skills/review-code/SKILL.md +++ b/.claude/skills/review-code/SKILL.md @@ -3,6 +3,8 @@ name: review-code description: Perform a thorough code review of the current branch or a GitHub PR by number. 
argument-hint: [pr-number] [special instructions] disable-model-invocation: true +metadata: + internal: true --- # Review Code Changes diff --git a/.claude/skills/search-docs/SKILL.md b/.claude/skills/search-docs/SKILL.md index f0898a46..00989683 100644 --- a/.claude/skills/search-docs/SKILL.md +++ b/.claude/skills/search-docs/SKILL.md @@ -2,6 +2,8 @@ name: search-docs description: Search local documentation in the docs/ folder for content related to a topic argument-hint: +metadata: + internal: true --- # Documentation Search diff --git a/.claude/skills/search-github/SKILL.md b/.claude/skills/search-github/SKILL.md index 9c00e422..324d6c36 100644 --- a/.claude/skills/search-github/SKILL.md +++ b/.claude/skills/search-github/SKILL.md @@ -2,6 +2,8 @@ name: search-github description: Search GitHub issues, discussions, and PRs for content related to a topic argument-hint: +metadata: + internal: true --- # GitHub Search diff --git a/.claude/skills/update-pr/SKILL.md b/.claude/skills/update-pr/SKILL.md index 0f4b7775..69bf944f 100644 --- a/.claude/skills/update-pr/SKILL.md +++ b/.claude/skills/update-pr/SKILL.md @@ -3,6 +3,8 @@ name: update-pr description: Update an existing GitHub PR description to reflect current changes after incorporating feedback argument-hint: [special instructions] disable-model-invocation: true +metadata: + internal: true --- # Update Pull Request From f4a04e5ff843e0cf2a07e7b930f31b27bf1e77ec Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Tue, 17 Mar 2026 22:39:21 -0700 Subject: [PATCH 04/27] address review feedback: fix typo, clarify config_root, use dynamic port --- skills/data-designer/SKILL.md | 2 +- skills/data-designer/references/seed-datasets.md | 2 +- skills/data-designer/workflows/autopilot.md | 4 ++-- skills/data-designer/workflows/interactive.md | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/skills/data-designer/SKILL.md b/skills/data-designer/SKILL.md index bf48ba77..9a06b207 100644 --- 
a/skills/data-designer/SKILL.md +++ b/skills/data-designer/SKILL.md @@ -16,7 +16,7 @@ $ARGUMENTS # Agent CLI -Always run this command before starting attempting to design or build a dataset. This command is your single discovery mechanism — it tells you exactly which files to read: +Always run this command before attempting to design or build a dataset. This command is your single discovery mechanism — it tells you exactly which files to read: ```bash data-designer agent context diff --git a/skills/data-designer/references/seed-datasets.md b/skills/data-designer/references/seed-datasets.md index 077cb1e6..86e96c74 100644 --- a/skills/data-designer/references/seed-datasets.md +++ b/skills/data-designer/references/seed-datasets.md @@ -4,7 +4,7 @@ Seed datasets bootstrap synthetic data generation from existing data. Every colu ## Before configuring a seed source -1. **Read the source code.** Read `{config_root}/seed_source.py` for all seed source classes and their parameters. Do not guess types or parameters. +1. **Read the source code.** Read `seed_source.py` under the config root directory printed by `data-designer agent context`. This file contains all seed source classes and their parameters. Do not guess types or parameters. 2. **Verify the dataset is readable and fetch column names.** Before wiring the seed into the config, confirm the file can be read and extract its column names. This catches bad paths and corrupt files, and gives you the exact column names available for downstream prompts. diff --git a/skills/data-designer/workflows/autopilot.md b/skills/data-designer/workflows/autopilot.md index 80f1b0e5..be8d786f 100644 --- a/skills/data-designer/workflows/autopilot.md +++ b/skills/data-designer/workflows/autopilot.md @@ -18,8 +18,8 @@ In this mode, make reasonable design decisions autonomously based on the dataset 5. **Validate** — Run `data-designer validate `. Address any warnings or errors and re-validate until it passes. 6. 
**Preview** — Run `data-designer preview --save-results` to generate sample records as HTML files. - `cd` into the sample records directory (which is printed out by the `data-designer preview` command) - - Run `python -m http.server 8787` (in background) - - Tell the user to open `http://localhost:8787/sample_records_browser.html` to review them + - Run `python -m http.server 0` (in background) and note the port it prints + - Tell the user to open `http://localhost:<port>/sample_records_browser.html` to review them 7. **Create** — If the user specified a record count: - 50 or fewer: run `data-designer create --num-records ` directly. - More than 50: warn that generation can take a long time and ask for confirmation before running. diff --git a/skills/data-designer/workflows/interactive.md b/skills/data-designer/workflows/interactive.md index b2e193d6..db7ea307 100644 --- a/skills/data-designer/workflows/interactive.md +++ b/skills/data-designer/workflows/interactive.md @@ -22,8 +22,8 @@ This is an interactive, iterative design process. Do not disengage from the loop 5. **Validate** — Run `data-designer validate `. Address any warnings or errors and re-validate until it passes. 6. **Preview** — Run `data-designer preview --save-results` to generate sample records as HTML files. - `cd` into the sample records directory (which is printed out by the `data-designer preview` command) - - Run `python -m http.server 8787` (in background) - - Tell the user to open `http://localhost:8787/sample_records_browser.html` to review them + - Run `python -m http.server 0` (in background) and note the port it prints + - Tell the user to open `http://localhost:<port>/sample_records_browser.html` to review them 7. **Iterate** — Ask the user for feedback. Edit the script, re-validate, re-preview, and serve again. Repeat until they are satisfied. 8. **Finalize** — Once the user is happy, tell them they can run the following command to create the dataset: - `data-designer create --num-records `. 
From ba3327bd33a8c87d25c01d5963b64c7b38af427d Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Wed, 18 Mar 2026 09:36:13 -0700 Subject: [PATCH 05/27] address review feedback: use --directory flag, add server cleanup note - Replace `cd` + bare http.server with `--directory` flag to keep CWD stable for subsequent steps - Add note to stop the background server after review - Add large-record-count warning to interactive finalize step --- skills/data-designer/workflows/autopilot.md | 5 +++-- skills/data-designer/workflows/interactive.md | 6 ++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/skills/data-designer/workflows/autopilot.md b/skills/data-designer/workflows/autopilot.md index be8d786f..7bbb76f5 100644 --- a/skills/data-designer/workflows/autopilot.md +++ b/skills/data-designer/workflows/autopilot.md @@ -17,9 +17,10 @@ In this mode, make reasonable design decisions autonomously based on the dataset 4. **Build** — Write the Python script with `load_config_builder()` (see Output Template in SKILL.md). 5. **Validate** — Run `data-designer validate `. Address any warnings or errors and re-validate until it passes. 6. **Preview** — Run `data-designer preview --save-results` to generate sample records as HTML files. - - `cd` into the sample records directory (which is printed out by the `data-designer preview` command) - - Run `python -m http.server 0` (in background) and note the port it prints + - Note the sample records directory printed by the `data-designer preview` command + - Run `python -m http.server 0 --directory ` (in background) and note the port it prints - Tell the user to open `http://localhost:<port>/sample_records_browser.html` to review them + - When the user is done reviewing, stop the background server 7. **Create** — If the user specified a record count: - 50 or fewer: run `data-designer create --num-records ` directly. - More than 50: warn that generation can take a long time and ask for confirmation before running. 
diff --git a/skills/data-designer/workflows/interactive.md b/skills/data-designer/workflows/interactive.md index db7ea307..c00281d4 100644 --- a/skills/data-designer/workflows/interactive.md +++ b/skills/data-designer/workflows/interactive.md @@ -21,10 +21,12 @@ This is an interactive, iterative design process. Do not disengage from the loop 4. **Build** — Write the Python script with `load_config_builder()` (see Output Template in SKILL.md). 5. **Validate** — Run `data-designer validate `. Address any warnings or errors and re-validate until it passes. 6. **Preview** — Run `data-designer preview --save-results` to generate sample records as HTML files. - - `cd` into the sample records directory (which is printed out by the `data-designer preview` command) - - Run `python -m http.server 0` (in background) and note the port it prints + - Note the sample records directory printed by the `data-designer preview` command + - Run `python -m http.server 0 --directory ` (in background) and note the port it prints - Tell the user to open `http://localhost:<port>/sample_records_browser.html` to review them + - When the user is done reviewing, stop the background server 7. **Iterate** — Ask the user for feedback. Edit the script, re-validate, re-preview, and serve again. Repeat until they are satisfied. 8. **Finalize** — Once the user is happy, tell them they can run the following command to create the dataset: - `data-designer create --num-records `. + - Warn the user that generation can take a long time for large record counts (more than 50). - Do not run this command yourself. It requires model endpoints and can take a long time. 
From aefd07268da132a61919b9ac906824d2a30623cd Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Wed, 18 Mar 2026 09:59:06 -0700 Subject: [PATCH 06/27] improve preview server reliability and sandbox error handling - Use fixed port 8741 with fallback to port 0 - Require verifying server startup from background task output - Clarify sandbox network error guidance: ask to retry without sandbox before telling user to run manually --- skills/data-designer/SKILL.md | 4 ++-- skills/data-designer/workflows/autopilot.md | 3 ++- skills/data-designer/workflows/interactive.md | 3 ++- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/skills/data-designer/SKILL.md b/skills/data-designer/SKILL.md index 9a06b207..320e603f 100644 --- a/skills/data-designer/SKILL.md +++ b/skills/data-designer/SKILL.md @@ -48,8 +48,8 @@ Read **only** the workflow file that matches the selected mode, then follow it: # Troubleshooting -- **`data-designer` command not found:** The package is not on the PATH. The user needs to either install it (`pip install data-designer`) or activate their virtual environment. -- **Network errors during preview:** A sandbox environment may be blocking outbound requests. Let the user know and ask them to either run the preview command outside the sandbox or grant the necessary permissions. +- **`data-designer` command not found:** The package is not in your current Python environment's PATH. The user needs to either install it (`pip install data-designer`) or activate their virtual environment. +- **Network errors during preview:** A sandbox environment may be blocking outbound requests. Ask the user for permission to retry the command with the sandbox disabled. Only as a last resort, if retrying outside the sandbox also fails, tell the user to run the command themselves. 
# Output Template diff --git a/skills/data-designer/workflows/autopilot.md b/skills/data-designer/workflows/autopilot.md index 7bbb76f5..ef48cae7 100644 --- a/skills/data-designer/workflows/autopilot.md +++ b/skills/data-designer/workflows/autopilot.md @@ -18,7 +18,8 @@ In this mode, make reasonable design decisions autonomously based on the dataset 5. **Validate** — Run `data-designer validate `. Address any warnings or errors and re-validate until it passes. 6. **Preview** — Run `data-designer preview --save-results` to generate sample records as HTML files. - Note the sample records directory printed by the `data-designer preview` command - - Run `python -m http.server 0 --directory ` (in background) and note the port it prints + - Run `python -m http.server 8741 --directory ` (in background). If port 8741 is taken, retry with port 0 and note the port it prints + - **Verify the server started** by reading the background task output. Confirm the port from the server's own output — do not guess or scan for other Python processes - Tell the user to open `http://localhost:/sample_records_browser.html` to review them - When the user is done reviewing, stop the background server 7. **Create** — If the user specified a record count: diff --git a/skills/data-designer/workflows/interactive.md b/skills/data-designer/workflows/interactive.md index c00281d4..f0dfa2ed 100644 --- a/skills/data-designer/workflows/interactive.md +++ b/skills/data-designer/workflows/interactive.md @@ -22,7 +22,8 @@ This is an interactive, iterative design process. Do not disengage from the loop 5. **Validate** — Run `data-designer validate `. Address any warnings or errors and re-validate until it passes. 6. **Preview** — Run `data-designer preview --save-results` to generate sample records as HTML files. 
- Note the sample records directory printed by the `data-designer preview` command - - Run `python -m http.server 0 --directory ` (in background) and note the port it prints + - Run `python -m http.server 8741 --directory ` (in background). If port 8741 is taken, retry with port 0 and note the port it prints + - **Verify the server started** by reading the background task output. Confirm the port from the server's own output — do not guess or scan for other Python processes - Tell the user to open `http://localhost:/sample_records_browser.html` to review them - When the user is done reviewing, stop the background server 7. **Iterate** — Ask the user for feedback. Edit the script, re-validate, re-preview, and serve again. Repeat until they are satisfied. From 42b8dd599812ef1ff8fe876bfca63e0f1b17755c Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Wed, 18 Mar 2026 10:04:40 -0700 Subject: [PATCH 07/27] ensure venv creation before installing data-designer --- skills/data-designer/SKILL.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skills/data-designer/SKILL.md b/skills/data-designer/SKILL.md index 320e603f..f4faf138 100644 --- a/skills/data-designer/SKILL.md +++ b/skills/data-designer/SKILL.md @@ -48,7 +48,7 @@ Read **only** the workflow file that matches the selected mode, then follow it: # Troubleshooting -- **`data-designer` command not found:** The package is not in your current Python environment's PATH. The user needs to either install it (`pip install data-designer`) or activate their virtual environment. +- **`data-designer` command not found:** If no virtual environment exists, create one first (`python -m venv .venv && source .venv/bin/activate`), then install (`pip install data-designer`). If a virtual environment already exists, activate it and verify the package is installed. - **Network errors during preview:** A sandbox environment may be blocking outbound requests. 
Ask the user for permission to retry the command with the sandbox disabled. Only as a last resort, if retrying outside the sandbox also fails, tell the user to run the command themselves. # Output Template From 6031d843e034bf06f0a3f1ce58d5e64df04b9081 Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Wed, 18 Mar 2026 10:10:39 -0700 Subject: [PATCH 08/27] verify server via background task output, not curl probing --- skills/data-designer/workflows/autopilot.md | 2 +- skills/data-designer/workflows/interactive.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/skills/data-designer/workflows/autopilot.md b/skills/data-designer/workflows/autopilot.md index ef48cae7..2ed470a1 100644 --- a/skills/data-designer/workflows/autopilot.md +++ b/skills/data-designer/workflows/autopilot.md @@ -19,7 +19,7 @@ In this mode, make reasonable design decisions autonomously based on the dataset 6. **Preview** — Run `data-designer preview --save-results` to generate sample records as HTML files. - Note the sample records directory printed by the `data-designer preview` command - Run `python -m http.server 8741 --directory ` (in background). If port 8741 is taken, retry with port 0 and note the port it prints - - **Verify the server started** by reading the background task output. Confirm the port from the server's own output — do not guess or scan for other Python processes + - **Verify the server started** by reading the background task output until you see `Serving HTTP on ...`. Get the port from that output line. Do not use curl, lsof, or any other probing method - Tell the user to open `http://localhost:/sample_records_browser.html` to review them - When the user is done reviewing, stop the background server 7. 
**Create** — If the user specified a record count: diff --git a/skills/data-designer/workflows/interactive.md b/skills/data-designer/workflows/interactive.md index f0dfa2ed..ebf00fe7 100644 --- a/skills/data-designer/workflows/interactive.md +++ b/skills/data-designer/workflows/interactive.md @@ -23,7 +23,7 @@ This is an interactive, iterative design process. Do not disengage from the loop 6. **Preview** — Run `data-designer preview --save-results` to generate sample records as HTML files. - Note the sample records directory printed by the `data-designer preview` command - Run `python -m http.server 8741 --directory ` (in background). If port 8741 is taken, retry with port 0 and note the port it prints - - **Verify the server started** by reading the background task output. Confirm the port from the server's own output — do not guess or scan for other Python processes + - **Verify the server started** by reading the background task output until you see `Serving HTTP on ...`. Get the port from that output line. Do not use curl, lsof, or any other probing method - Tell the user to open `http://localhost:/sample_records_browser.html` to review them - When the user is done reviewing, stop the background server 7. **Iterate** — Ask the user for feedback. Edit the script, re-validate, re-preview, and serve again. Repeat until they are satisfied. 
From a81afa818226bacdba386eee747e4ddd16f76133 Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Wed, 18 Mar 2026 10:26:53 -0700 Subject: [PATCH 09/27] replace HTTP server with file:// link for preview, add push_to_hub guidance --- skills/data-designer/workflows/autopilot.md | 7 ++----- skills/data-designer/workflows/interactive.md | 6 ++---- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/skills/data-designer/workflows/autopilot.md b/skills/data-designer/workflows/autopilot.md index 2ed470a1..0efe5ade 100644 --- a/skills/data-designer/workflows/autopilot.md +++ b/skills/data-designer/workflows/autopilot.md @@ -18,12 +18,9 @@ In this mode, make reasonable design decisions autonomously based on the dataset 5. **Validate** — Run `data-designer validate <path>`. Address any warnings or errors and re-validate until it passes. 6. **Preview** — Run `data-designer preview --save-results` to generate sample records as HTML files. - Note the sample records directory printed by the `data-designer preview` command - - Run `python -m http.server 8741 --directory <sample-records-dir>` (in background). If port 8741 is taken, retry with port 0 and note the port it prints - - **Verify the server started** by reading the background task output until you see `Serving HTTP on ...`. Get the port from that output line. Do not use curl, lsof, or any other probing method - - Tell the user to open `http://localhost:<port>/sample_records_browser.html` to review them - - When the user is done reviewing, stop the background server + - Give the user a clickable link: `file://<sample-records-dir>/sample_records_browser.html` 7. **Create** — If the user specified a record count: - 50 or fewer: run `data-designer create --num-records <N>` directly. - More than 50: warn that generation can take a long time and ask for confirmation before running. - If no record count was specified, skip this step. -8. **Present** — Summarize what was built: columns, samplers used, key design choices. If the create command was run, share the results.
Ask the user if they want any changes. If so, edit the script, re-validate, re-preview, and iterate. +8. **Present** — Summarize what was built: columns, samplers used, key design choices. If the create command was run, share the results. Let the user know they can push the dataset to Hugging Face using `results.push_to_hub("org/dataset-name", "description", tags=["tag1", "tag2"])`, or from a saved folder using `HuggingFaceHubClient.push_to_hub_from_folder(dataset_path="{artifact_path}/{dataset_name}", repo_id="org/name", description="...")`. Ask the user if they want any changes. If so, edit the script, re-validate, re-preview, and iterate. diff --git a/skills/data-designer/workflows/interactive.md b/skills/data-designer/workflows/interactive.md index ebf00fe7..3335eec7 100644 --- a/skills/data-designer/workflows/interactive.md +++ b/skills/data-designer/workflows/interactive.md @@ -22,12 +22,10 @@ This is an interactive, iterative design process. Do not disengage from the loop 5. **Validate** — Run `data-designer validate <path>`. Address any warnings or errors and re-validate until it passes. 6. **Preview** — Run `data-designer preview --save-results` to generate sample records as HTML files. - Note the sample records directory printed by the `data-designer preview` command - - Run `python -m http.server 8741 --directory <sample-records-dir>` (in background). If port 8741 is taken, retry with port 0 and note the port it prints - - **Verify the server started** by reading the background task output until you see `Serving HTTP on ...`. Get the port from that output line. Do not use curl, lsof, or any other probing method - - Tell the user to open `http://localhost:<port>/sample_records_browser.html` to review them - - When the user is done reviewing, stop the background server + - Give the user a clickable link: `file://<sample-records-dir>/sample_records_browser.html` 7. **Iterate** — Ask the user for feedback. Edit the script, re-validate, re-preview, and serve again. Repeat until they are satisfied. 8.
**Finalize** — Once the user is happy, tell them they can run the following command to create the dataset: - `data-designer create --num-records <N>`. - Warn the user that generation can take a long time for large record counts (50+). - Do not run this command yourself. It requires model endpoints and can take a long time. + - Let the user know they can push the dataset to Hugging Face using `results.push_to_hub("org/dataset-name", "description", tags=["tag1", "tag2"])`, or from a saved folder using `HuggingFaceHubClient.push_to_hub_from_folder(dataset_path="{artifact_path}/{dataset_name}", repo_id="org/name", description="...")`. From 4068a76db691e0eb71521ce29d5fb0fbebf0ee5a Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Wed, 18 Mar 2026 10:42:57 -0700 Subject: [PATCH 10/27] add --dataset-name to create command, remove push_to_hub notes --- skills/data-designer/workflows/autopilot.md | 4 ++-- skills/data-designer/workflows/interactive.md | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/skills/data-designer/workflows/autopilot.md b/skills/data-designer/workflows/autopilot.md index 0efe5ade..40bdcec4 100644 --- a/skills/data-designer/workflows/autopilot.md +++ b/skills/data-designer/workflows/autopilot.md @@ -20,7 +20,7 @@ In this mode, make reasonable design decisions autonomously based on the dataset - Note the sample records directory printed by the `data-designer preview` command - Give the user a clickable link: `file://<sample-records-dir>/sample_records_browser.html` 7. **Create** — If the user specified a record count: - - 50 or fewer: run `data-designer create --num-records <N>` directly. + - 50 or fewer: run `data-designer create --num-records <N> --dataset-name <name>` directly. - More than 50: warn that generation can take a long time and ask for confirmation before running. - If no record count was specified, skip this step. -8. **Present** — Summarize what was built: columns, samplers used, key design choices. If the create command was run, share the results.
Let the user know they can push the dataset to Hugging Face using `results.push_to_hub("org/dataset-name", "description", tags=["tag1", "tag2"])`, or from a saved folder using `HuggingFaceHubClient.push_to_hub_from_folder(dataset_path="{artifact_path}/{dataset_name}", repo_id="org/name", description="...")`. Ask the user if they want any changes. If so, edit the script, re-validate, re-preview, and iterate. +8. **Present** — Summarize what was built: columns, samplers used, key design choices. If the create command was run, share the results. Ask the user if they want any changes. If so, edit the script, re-validate, re-preview, and iterate. diff --git a/skills/data-designer/workflows/interactive.md b/skills/data-designer/workflows/interactive.md index 3335eec7..44710b8e 100644 --- a/skills/data-designer/workflows/interactive.md +++ b/skills/data-designer/workflows/interactive.md @@ -25,7 +25,6 @@ This is an interactive, iterative design process. Do not disengage from the loop - Give the user a clickable link: `file://<sample-records-dir>/sample_records_browser.html` 7. **Iterate** — Ask the user for feedback. Edit the script, re-validate, re-preview, and serve again. Repeat until they are satisfied. 8. **Finalize** — Once the user is happy, tell them they can run the following command to create the dataset: - - `data-designer create --num-records <N>`. + - `data-designer create --num-records <N> --dataset-name <name>`. - Warn the user that generation can take a long time for large record counts (50+). - Do not run this command yourself. It requires model endpoints and can take a long time. - - Let the user know they can push the dataset to Hugging Face using `results.push_to_hub("org/dataset-name", "description", tags=["tag1", "tag2"])`, or from a saved folder using `HuggingFaceHubClient.push_to_hub_from_folder(dataset_path="{artifact_path}/{dataset_name}", repo_id="org/name", description="...")`.
From d6878b8cd2895544c3be334f79a3604799bfc475 Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Wed, 18 Mar 2026 10:50:58 -0700 Subject: [PATCH 11/27] update custom column example --- skills/data-designer/SKILL.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/skills/data-designer/SKILL.md b/skills/data-designer/SKILL.md index f4faf138..82120334 100644 --- a/skills/data-designer/SKILL.md +++ b/skills/data-designer/SKILL.md @@ -78,8 +78,8 @@ class MyEntity(BaseModel): side_effect_columns=["extra_col"], ) def my_custom_generator(row: dict) -> dict: - # add custom logic here and update row in place - row["custom_field"] = "custom value" + # add custom logic here that depends on "col_a" and update row in place + row["name_in_custom_column_config"] = "custom value" row["extra_col"] = "extra value" return row From acff87fb88de71937ad9a3d5ea1f5ba0a7da4358 Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Wed, 18 Mar 2026 10:52:08 -0700 Subject: [PATCH 12/27] remove schema transform pitfall which is about to be fixed --- skills/data-designer/SKILL.md | 1 - 1 file changed, 1 deletion(-) diff --git a/skills/data-designer/SKILL.md b/skills/data-designer/SKILL.md index 82120334..f1f98895 100644 --- a/skills/data-designer/SKILL.md +++ b/skills/data-designer/SKILL.md @@ -44,7 +44,6 @@ Read **only** the workflow file that matches the selected mode, then follow it: - **Jinja2 templates** in `prompt`, `system_prompt`, and `expr` fields: reference columns with `{{ column_name }}`, nested fields with `{{ column_name.field }}`. - **`SamplerColumnConfig`:** Takes `params`, not `sampler_params`. - **LLM judge score access:** `LLMJudgeColumnConfig` produces a nested dict where each score name maps to `{reasoning: str, score: int}`. To get the numeric score, use the `.score` attribute. For example, for a judge column named `quality` with a score named `correctness`, use `{{ quality.correctness.score }}`. 
Using `{{ quality.correctness }}` returns the full dict, not the numeric score. -- **Nested field access in `SchemaTransformProcessorConfig`:** Nested field access (e.g., `{{ column.field }}`) does **not** work inside schema transform templates because the processor sees column values as serialized strings, not parsed dicts. This affects structured columns, judge columns, and any column with nested output. To use nested fields in a schema transform, first extract them into intermediate `ExpressionColumnConfig` columns (e.g., `expr="{{ column.field }}"` with `drop=True`), then reference those flat columns in the template. # Troubleshooting From 656c6a038a169f2d603b92992adc7a3b74adf8f7 Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Wed, 18 Mar 2026 11:08:31 -0700 Subject: [PATCH 13/27] tighten agent skill: remove redundancy, add missing interactive guidance --- skills/data-designer/SKILL.md | 14 +++----------- skills/data-designer/references/person-sampling.md | 2 -- skills/data-designer/workflows/interactive.md | 5 +++-- 3 files changed, 6 insertions(+), 15 deletions(-) diff --git a/skills/data-designer/SKILL.md b/skills/data-designer/SKILL.md index f1f98895..e40905ff 100644 --- a/skills/data-designer/SKILL.md +++ b/skills/data-designer/SKILL.md @@ -1,12 +1,12 @@ --- name: data-designer -description: Use when the user wants to create a dataset, generate synthetic data, or build a data generation pipeline. Contains the essential workflow and discovery commands for the Data Designer library. Always invoke before exploring the workspace or writing code. +description: Use when the user wants to create a dataset, generate synthetic data, or build a data generation pipeline. argument-hint: [describe the dataset you want to generate] --- # Before You Start -Do not explore the workspace, browse files, run `ls`/`find`/`Glob`, check git history, or spawn Agent subagents before starting the workflow below. +Do not explore the workspace first. 
The workflow's Learn step gives you everything you need. # Goal @@ -14,14 +14,6 @@ Build a synthetic dataset using the Data Designer library that matches this desc $ARGUMENTS -# Agent CLI - -Always run this command before attempting to design or build a dataset. This command is your single discovery mechanism — it tells you exactly which files to read: - -```bash -data-designer agent context -``` - # Workflow Use **Autopilot** mode if the user implies they don't want to answer questions — e.g., they say something like "be opinionated", "you decide", "make reasonable assumptions", "just build it", "surprise me", etc. Otherwise, use **Interactive** mode (default). @@ -52,7 +44,7 @@ Read **only** the workflow file that matches the selected mode, then follow it: # Output Template -Write a Python file to the current directory with a `load_config_builder()` function returning a `DataDesignerConfigBuilder`. Use PEP 723 inline metadata for dependencies. +Write a Python file to the current directory with a `load_config_builder()` function returning a `DataDesignerConfigBuilder`. Name the file descriptively (e.g., `customer_reviews.py`). Use PEP 723 inline metadata for dependencies. 
```python # /// script diff --git a/skills/data-designer/references/person-sampling.md b/skills/data-designer/references/person-sampling.md index f8114718..72d17472 100644 --- a/skills/data-designer/references/person-sampling.md +++ b/skills/data-designer/references/person-sampling.md @@ -50,8 +50,6 @@ config_builder.add_column(dd.ExpressionColumnConfig( | `select_field_values` | `dict[str, list[str]] \| None` | `None` | Flexible field filtering | | `with_synthetic_personas` | `bool` | `False` | Append Big Five + persona fields | -Available managed-dataset locales: `en_US`, `en_IN`, `en_SG`, `hi_Deva_IN`, `hi_Latn_IN`, `ja_JP`, `pt_BR` - ## PersonFromFakerSamplerParams | Parameter | Type | Default | Notes | diff --git a/skills/data-designer/workflows/interactive.md b/skills/data-designer/workflows/interactive.md index 44710b8e..5636a507 100644 --- a/skills/data-designer/workflows/interactive.md +++ b/skills/data-designer/workflows/interactive.md @@ -10,6 +10,7 @@ This is an interactive, iterative design process. Do not disengage from the loop 2. **Clarify** — Ask the user clarifying questions to narrow down precisely what they want. - Use a question-asking UX tool if available. - Optimize for a great user experience: batch related questions together, keep the set short, provide concrete options/examples/defaults where possible, and use structured inputs (single-select, multi-select, forms, etc.) when they make answering easier. + - If multiple model aliases are available, ask which one(s) to use (or default to the first usable alias). - Common things to make precise: - What the "axes of diversity" are — what should be well represented and diverse in the resulting dataset. - The kind and nature of any input data. @@ -17,7 +18,7 @@ This is an interactive, iterative design process. Do not disengage from the loop - The schema of the final dataset. - The structure of any required structured output columns. - What facets of the output dataset are important to capture. 
-3. **Plan** — Determine columns, samplers, processors, validators, and other dataset features needed. +3. **Plan** — Determine columns, samplers, processors, validators, and other dataset features needed. Present the plan to the user for approval before proceeding to Build. 4. **Build** — Write the Python script with `load_config_builder()` (see Output Template in SKILL.md). 5. **Validate** — Run `data-designer validate `. Address any warnings or errors and re-validate until it passes. 6. **Preview** — Run `data-designer preview --save-results` to generate sample records as HTML files. @@ -27,4 +28,4 @@ This is an interactive, iterative design process. Do not disengage from the loop 8. **Finalize** — Once the user is happy, tell them they can run the following command to create the dataset: - `data-designer create --num-records --dataset-name `. - Warn the user that generation can take a long time for large record counts (50+). - - Do not run this command yourself. It requires model endpoints and can take a long time. + - Do not run this command yourself — it can take a long time for large datasets and the user should control when it runs. From 0ae610c1ec80f4ca2f1e67a543589ec402e05eed Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Wed, 18 Mar 2026 11:14:39 -0700 Subject: [PATCH 14/27] clarify interactive plan step: ask for changes before generating preview --- skills/data-designer/workflows/interactive.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skills/data-designer/workflows/interactive.md b/skills/data-designer/workflows/interactive.md index 5636a507..e94a75c8 100644 --- a/skills/data-designer/workflows/interactive.md +++ b/skills/data-designer/workflows/interactive.md @@ -18,7 +18,7 @@ This is an interactive, iterative design process. Do not disengage from the loop - The schema of the final dataset. - The structure of any required structured output columns. - What facets of the output dataset are important to capture. -3. 
**Plan** — Determine columns, samplers, processors, validators, and other dataset features needed. Present the plan to the user for approval before proceeding to Build. +3. **Plan** — Determine columns, samplers, processors, validators, and other dataset features needed. Present the plan to the user and ask if they want any changes before generating a preview. 4. **Build** — Write the Python script with `load_config_builder()` (see Output Template in SKILL.md). 5. **Validate** — Run `data-designer validate `. Address any warnings or errors and re-validate until it passes. 6. **Preview** — Run `data-designer preview --save-results` to generate sample records as HTML files. From 666e7430d5314400848643993361a296a02059e4 Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Wed, 18 Mar 2026 13:12:08 -0700 Subject: [PATCH 15/27] improve structured question tool guidance in interactive workflow --- skills/data-designer/workflows/interactive.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skills/data-designer/workflows/interactive.md b/skills/data-designer/workflows/interactive.md index e94a75c8..a3c531b8 100644 --- a/skills/data-designer/workflows/interactive.md +++ b/skills/data-designer/workflows/interactive.md @@ -8,7 +8,7 @@ This is an interactive, iterative design process. Do not disengage from the loop - Never guess types or parameters — read the relevant config files first. - Always read `base.py` for inherited fields shared by all config objects. 2. **Clarify** — Ask the user clarifying questions to narrow down precisely what they want. - - Use a question-asking UX tool if available. + - Prefer a structured question tool over plain text if one is available (look for tools that support input types like single-select, multi-select, or free text). 
- Optimize for a great user experience: batch related questions together, keep the set short, provide concrete options/examples/defaults where possible, and use structured inputs (single-select, multi-select, forms, etc.) when they make answering easier. - If multiple model aliases are available, ask which one(s) to use (or default to the first usable alias). - Common things to make precise: From fbb11d69355c03e3fb73abf1bbeafc3de3512dab Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Wed, 18 Mar 2026 13:13:17 -0700 Subject: [PATCH 16/27] merge structured question tool guidance with UX bullet point --- skills/data-designer/workflows/interactive.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/skills/data-designer/workflows/interactive.md b/skills/data-designer/workflows/interactive.md index a3c531b8..1e40f61c 100644 --- a/skills/data-designer/workflows/interactive.md +++ b/skills/data-designer/workflows/interactive.md @@ -8,8 +8,7 @@ This is an interactive, iterative design process. Do not disengage from the loop - Never guess types or parameters — read the relevant config files first. - Always read `base.py` for inherited fields shared by all config objects. 2. **Clarify** — Ask the user clarifying questions to narrow down precisely what they want. - - Prefer a structured question tool over plain text if one is available (look for tools that support input types like single-select, multi-select, or free text). - - Optimize for a great user experience: batch related questions together, keep the set short, provide concrete options/examples/defaults where possible, and use structured inputs (single-select, multi-select, forms, etc.) when they make answering easier. 
+ - Optimize for a great user experience: prefer a structured question tool over plain text if one is available, batch related questions together, keep the set short, provide concrete options/examples/defaults where possible, and use structured inputs (single-select, multi-select, free text, etc.) when they make answering easier. - If multiple model aliases are available, ask which one(s) to use (or default to the first usable alias). - Common things to make precise: - What the "axes of diversity" are — what should be well represented and diverse in the resulting dataset. From b81572ef36d653d8438649ef5a7b0db6f30b8f66 Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Wed, 18 Mar 2026 14:21:16 -0700 Subject: [PATCH 17/27] soften column-dropping rule to allow dropping helper columns Allow dropping internal/helper columns (e.g., sampled person objects) that exist solely to derive other columns, while still defaulting to keeping everything else. --- skills/data-designer/SKILL.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skills/data-designer/SKILL.md b/skills/data-designer/SKILL.md index e40905ff..01d02326 100644 --- a/skills/data-designer/SKILL.md +++ b/skills/data-designer/SKILL.md @@ -25,7 +25,7 @@ Read **only** the workflow file that matches the selected mode, then follow it: # Rules -- Do not drop columns unless the user explicitly asks. Keep all columns in the output by default. +- Keep all columns in the output by default. The only exceptions for dropping a column are: (1) the user explicitly asks, or (2) it is a helper column that exists solely to derive other columns (e.g., a sampled person object used to extract name, city, etc.). When in doubt, keep the column. - Do not suggest or ask about seed datasets. Only use one when the user explicitly provides seed data or asks to build from existing records. When using a seed, read `references/seed-datasets.md`. 
- When the dataset requires person data (names, demographics, addresses), read `references/person-sampling.md`. - If a dataset script that matches the dataset description already exists, ask the user whether to edit it or create a new one. From fe70161451694bbe2eb81a93c02f68e077c1e777 Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Wed, 18 Mar 2026 14:27:04 -0700 Subject: [PATCH 18/27] default model alias to appropriate generation_type per column Instead of defaulting to the first usable alias (which could be an embedding model), default to an alias with the appropriate generation_type for each column. --- skills/data-designer/workflows/interactive.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skills/data-designer/workflows/interactive.md b/skills/data-designer/workflows/interactive.md index 1e40f61c..fd96eedd 100644 --- a/skills/data-designer/workflows/interactive.md +++ b/skills/data-designer/workflows/interactive.md @@ -9,7 +9,7 @@ This is an interactive, iterative design process. Do not disengage from the loop - Always read `base.py` for inherited fields shared by all config objects. 2. **Clarify** — Ask the user clarifying questions to narrow down precisely what they want. - Optimize for a great user experience: prefer a structured question tool over plain text if one is available, batch related questions together, keep the set short, provide concrete options/examples/defaults where possible, and use structured inputs (single-select, multi-select, free text, etc.) when they make answering easier. - - If multiple model aliases are available, ask which one(s) to use (or default to the first usable alias). + - If multiple model aliases are available, ask which one(s) to use (or default to an alias with the appropriate `generation_type` for each column). - Common things to make precise: - What the "axes of diversity" are — what should be well represented and diverse in the resulting dataset. - The kind and nature of any input data. 
From 1a1ed2ff0dee5fe03cbdf3771ed14197a2390aaa Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Wed, 18 Mar 2026 14:28:50 -0700 Subject: [PATCH 19/27] clarify missing model aliases: suggest running data-designer config --- skills/data-designer/workflows/autopilot.md | 2 +- skills/data-designer/workflows/interactive.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/skills/data-designer/workflows/autopilot.md b/skills/data-designer/workflows/autopilot.md index 40bdcec4..4fd08489 100644 --- a/skills/data-designer/workflows/autopilot.md +++ b/skills/data-designer/workflows/autopilot.md @@ -3,7 +3,7 @@ In this mode, make reasonable design decisions autonomously based on the dataset description. Do not ask clarifying questions — infer sensible defaults and move straight through to a working preview. 1. **Learn** — Run `data-designer agent context`. - - If no model aliases are configured, stop and ask the user. + - If no model aliases are configured, stop and tell the user to run `data-designer config` to set them up before proceeding. - Inspect schemas for every column, sampler type, validator, and processor you plan to use. - Never guess types or parameters — read the relevant config files first. - Always read `base.py` for inherited fields shared by all config objects. diff --git a/skills/data-designer/workflows/interactive.md b/skills/data-designer/workflows/interactive.md index fd96eedd..81d22c94 100644 --- a/skills/data-designer/workflows/interactive.md +++ b/skills/data-designer/workflows/interactive.md @@ -3,7 +3,7 @@ This is an interactive, iterative design process. Do not disengage from the loop unless the user says they are satisfied. 1. **Learn** — Run `data-designer agent context`. - - If no model aliases are configured, stop and ask the user. + - If no model aliases are configured, stop and tell the user to run `data-designer config` to set them up before proceeding. 
- Inspect schemas for every column, sampler type, validator, and processor you plan to use. - Never guess types or parameters — read the relevant config files first. - Always read `base.py` for inherited fields shared by all config objects. From 8f10ea95a7aa10a3eda6abdfead24a86eecf51ff Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Wed, 18 Mar 2026 14:33:28 -0700 Subject: [PATCH 20/27] close the loop on persona dataset locale check Make the check section explicitly state what to do when the needed locale is not installed: use person_from_faker instead. --- skills/data-designer/references/person-sampling.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/skills/data-designer/references/person-sampling.md b/skills/data-designer/references/person-sampling.md index 72d17472..37bbc5a6 100644 --- a/skills/data-designer/references/person-sampling.md +++ b/skills/data-designer/references/person-sampling.md @@ -11,12 +11,14 @@ Prefer `"person"` when the locale is downloaded — it provides census-grounded ## Available persona datasets -Before using, always check for installed persona datasets with this command: +Before using `"person"`, check which locales are installed: ```bash data-designer agent state persona-datasets ``` +If the needed locale is not listed, use `"person_from_faker"` instead. + ## Usage The sampled person column is a nested dict. You can keep it as-is in the final dataset, or set `drop=True` to remove it and extract only the fields you need via `ExpressionColumnConfig`: From 05d4c248e6a0e7a05422f33cf06c6df94bfd2c77 Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Wed, 18 Mar 2026 15:32:30 -0700 Subject: [PATCH 21/27] add locale schema script and simplify person-sampling reference Add get_person_object_schema.py script that prints PII and synthetic persona fields for a given locale's managed dataset. 
Update person-sampling.md to use this script instead of hardcoded field lists, and remove redundant param tables already available via agent context. --- .../references/person-sampling.md | 42 +++++----------- .../scripts/get_person_object_schema.py | 48 +++++++++++++++++++ 2 files changed, 61 insertions(+), 29 deletions(-) create mode 100644 skills/data-designer/scripts/get_person_object_schema.py diff --git a/skills/data-designer/references/person-sampling.md b/skills/data-designer/references/person-sampling.md index 37bbc5a6..75bc78ee 100644 --- a/skills/data-designer/references/person-sampling.md +++ b/skills/data-designer/references/person-sampling.md @@ -4,12 +4,14 @@ Prefer `"person"` when the locale is downloaded — it provides census-grounded demographics and optional personality traits. Fall back to `"person_from_faker"` when the locale isn't available. -| `sampler_type` | Params class | When to use | -|---|---|---| -| `"person"` | `PersonSamplerParams` | **Preferred.** Locale downloaded to `~/.data-designer/managed-assets/datasets/` by default. | + +| `sampler_type` | Params class | When to use | +| --------------------- | ------------------------------ | --------------------------------------------------------------------------------------------------- | +| `"person"` | `PersonSamplerParams` | **Preferred.** Locale downloaded to `~/.data-designer/managed-assets/datasets/` by default. | | `"person_from_faker"` | `PersonFromFakerSamplerParams` | Fallback when locale not downloaded. Basic names/addresses via Faker, not demographically accurate. | -## Available persona datasets + +## Available locales Before using `"person"`, check which locales are installed: @@ -17,8 +19,6 @@ Before using `"person"`, check which locales are installed: data-designer agent state persona-datasets ``` -If the needed locale is not listed, use `"person_from_faker"` instead. - ## Usage The sampled person column is a nested dict. 
You can keep it as-is in the final dataset, or set `drop=True` to remove it and extract only the fields you need via `ExpressionColumnConfig`: @@ -41,30 +41,14 @@ config_builder.add_column(dd.ExpressionColumnConfig( )) ``` -## PersonSamplerParams - -| Parameter | Type | Default | Notes | -|---|---|---|---| -| `locale` | `str` | `"en_US"` | Must be a downloaded managed-dataset locale | -| `sex` | `"Male" \| "Female" \| None` | `None` | Filter by sex | -| `city` | `str \| list[str] \| None` | `None` | Filter by city | -| `age_range` | `list[int]` | `[18, 114]` | `[min, max]` inclusive | -| `select_field_values` | `dict[str, list[str]] \| None` | `None` | Flexible field filtering | -| `with_synthetic_personas` | `bool` | `False` | Append Big Five + persona fields | +Set `with_synthetic_personas=True` when the dataset benefits from personality traits, interests, cultural background, or detailed persona descriptions (e.g., for realistic user simulation or persona-driven prompting). This option is only available with `"person"` — `"person_from_faker"` does not support it. -## PersonFromFakerSamplerParams +## Person Object Schema -| Parameter | Type | Default | Notes | -|---|---|---|---| -| `locale` | `str` | `"en_US"` | Any Faker-supported locale | -| `sex` | `"Male" \| "Female" \| None` | `None` | Filter by sex | -| `city` | `str \| list[str] \| None` | `None` | Filter by city | -| `age_range` | `list[int]` | `[18, 114]` | `[min, max]` inclusive | +Fields vary by locale. 
Always run the following script to get the exact schema for the locale you are using: -## Person fields (keys in sampled dict) - -**Standard fields:** `uuid`, `first_name`, `middle_name`, `last_name`, `sex`, `age`, `birth_date`, `marital_status`, `postcode`, `city`, `region`, `country`, `locale`, `education_level`, `bachelors_field`, `occupation`, `national_id`, `street_name`, `street_number`, `email_address`, `phone_number` - -**Locale-specific:** `unit`/`state` (US), `area`/`prefecture`/`zone` (JP), `race` (BR), `district`/`education_degree`/`first_language`/`second_language`/`third_language` (IN), `religion` (BR, IN) +```bash +.venv/bin/python scripts/get_person_object_schema.py <locale> +``` -**Persona fields** (when `with_synthetic_personas=True`): `persona`, `detailed_persona`, `cultural_background`, `career_goals_and_ambitions`, `hobbies_and_interests`, `skills_and_expertise`, Big Five scores (`openness`, `conscientiousness`, `extraversion`, `agreeableness`, `neuroticism`), plus domain personas (`professional_persona`, `finance_persona`, `healthcare_persona`, etc.) +This prints the PII fields (always included) and synthetic persona fields (only included when `with_synthetic_personas=True`) available for that locale. diff --git a/skills/data-designer/scripts/get_person_object_schema.py b/skills/data-designer/scripts/get_person_object_schema.py new file mode 100644 index 00000000..ed2b4202 --- /dev/null +++ b/skills/data-designer/scripts/get_person_object_schema.py @@ -0,0 +1,48 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Inspect a locale's managed persona dataset and print its available fields.
+ +Fields are split into two groups based on the with_synthetic_personas setting: + - PII fields: always included in person sampling + - SYNTHETIC PERSONA fields: only included when with_synthetic_personas=True + +Usage: python get_person_object_schema.py <locale> +Example: python get_person_object_schema.py en_US +""" + +from __future__ import annotations + +import sys + +import pyarrow.parquet as pq + +from data_designer.config.utils.constants import MANAGED_ASSETS_PATH +from data_designer.engine.sampling_gen.entities.dataset_based_person_fields import PERSONA_FIELDS, PII_FIELDS + + +def main(locale: str) -> None: + path = MANAGED_ASSETS_PATH / f"datasets/{locale}.parquet" + if not path.exists(): + print(f"Error: locale '{locale}' does not exist (no dataset at {path})", file=sys.stderr) + sys.exit(1) + + schema = {field.name: str(field.type) for field in pq.read_schema(path)} + + pii = {k: v for k, v in schema.items() if k in PII_FIELDS and v != "null"} + persona = {k: v for k, v in schema.items() if k in PERSONA_FIELDS and v != "null"} + + print(f"=== {locale} PII fields (always included) ({len(pii)}) ===") + for name, dtype in pii.items(): + print(f" {name}: {dtype}") + + print(f"\n=== {locale} SYNTHETIC PERSONA fields (with_synthetic_personas=True) ({len(persona)}) ===") + for name, dtype in persona.items(): + print(f" {name}: {dtype}") + + +if __name__ == "__main__": + if len(sys.argv) != 2: + print(f"Usage: {sys.argv[0]} <locale>", file=sys.stderr) + sys.exit(1) + main(sys.argv[1]) From d3ec31b863703bdd1b43a89f6c8355101ee4d60b Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Wed, 18 Mar 2026 15:52:52 -0700 Subject: [PATCH 22/27] clarify script path is relative to skill directory --- skills/data-designer/references/person-sampling.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skills/data-designer/references/person-sampling.md b/skills/data-designer/references/person-sampling.md index 75bc78ee..5f2f0697 100644 ---
a/skills/data-designer/references/person-sampling.md +++ b/skills/data-designer/references/person-sampling.md @@ -45,7 +45,7 @@ Set `with_synthetic_personas=True` when the dataset benefits from personality tr ## Person Object Schema -Fields vary by locale. Always run the following script to get the exact schema for the locale you are using: +Fields vary by locale. Always run the following script to get the exact schema for the locale you are using (script path is relative to the skill directory): ```bash .venv/bin/python scripts/get_person_object_schema.py <locale> From d5dd6421eea5c9e5648cb4a05c72732704aaf429 Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Wed, 18 Mar 2026 15:53:24 -0700 Subject: [PATCH 23/27] minor wording tweak in person-sampling reference --- skills/data-designer/references/person-sampling.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skills/data-designer/references/person-sampling.md b/skills/data-designer/references/person-sampling.md index 5f2f0697..2761c9a4 100644 --- a/skills/data-designer/references/person-sampling.md +++ b/skills/data-designer/references/person-sampling.md @@ -45,7 +45,7 @@ Set `with_synthetic_personas=True` when the dataset benefits from personality tr ## Person Object Schema -Fields vary by locale. Always run the following script to get the exact schema for the locale you are using (script path is relative to the skill directory): +Fields vary by locale. Always run the following script to get the exact schema for the locale you are using (script path is relative to this skill's directory): ```bash .venv/bin/python scripts/get_person_object_schema.py <locale> From 866ac711d45025694ba1c90c228781f6d318f7d0 Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Wed, 18 Mar 2026 16:00:15 -0700 Subject: [PATCH 24/27] remove redundant available locales section from person-sampling ref The locale install status is already printed by `data-designer agent context`, which the agent runs at the start of every workflow.
--- skills/data-designer/references/person-sampling.md | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/skills/data-designer/references/person-sampling.md b/skills/data-designer/references/person-sampling.md index 2761c9a4..0410da76 100644 --- a/skills/data-designer/references/person-sampling.md +++ b/skills/data-designer/references/person-sampling.md @@ -11,14 +11,6 @@ Prefer `"person"` when the locale is downloaded — it provides census-grounded | `"person_from_faker"` | `PersonFromFakerSamplerParams` | Fallback when locale not downloaded. Basic names/addresses via Faker, not demographically accurate. | -## Available locales - -Before using `"person"`, check which locales are installed: - -```bash -data-designer agent state persona-datasets -``` - ## Usage The sampled person column is a nested dict. You can keep it as-is in the final dataset, or set `drop=True` to remove it and extract only the fields you need via `ExpressionColumnConfig`: @@ -48,7 +40,7 @@ Set `with_synthetic_personas=True` when the dataset benefits from personality tr Fields vary by locale. Always run the following script to get the exact schema for the locale you are using (script path is relative to this skill's directory): ```bash -.venv/bin/python scripts/get_person_object_schema.py <locale> +python scripts/get_person_object_schema.py <locale> ``` This prints the PII fields (always included) and synthetic persona fields (only included when `with_synthetic_personas=True`) available for that locale.
From 285dfff87b9cfd964b102e51216a90c259277f3f Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Thu, 19 Mar 2026 00:13:22 -0700 Subject: [PATCH 25/27] tweak --- skills/data-designer/SKILL.md | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/skills/data-designer/SKILL.md b/skills/data-designer/SKILL.md index 01d02326..9ee99fd9 100644 --- a/skills/data-designer/SKILL.md +++ b/skills/data-designer/SKILL.md @@ -49,26 +49,19 @@ Write a Python file to the current directory with a `load_config_builder()` func ```python # /// script # dependencies = [ -# "data-designer", -# "pydantic", +# "data-designer", # required +# # add any additional dependencies here # ] # /// import data_designer.config as dd -from pydantic import BaseModel, Field -# Define Pydantic models when a column needs structured output -class MyEntity(BaseModel): - field_one: str = Field(description="...") - field_two: int = Field(description="...") - - -# Use custom generators when built-in column types aren't enough +# Use custom generators when bespoke generation logic is needed @dd.custom_column_generator( required_columns=["col_a"], side_effect_columns=["extra_col"], ) -def my_custom_generator(row: dict) -> dict: +def generator_function(row: dict) -> dict: # add custom logic here that depends on "col_a" and update row in place row["name_in_custom_column_config"] = "custom value" row["extra_col"] = "extra value" From 91bd843ef3a20fd82bc1bae3eabf0d28d1e18156 Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Thu, 19 Mar 2026 00:17:41 -0700 Subject: [PATCH 26/27] pydantic is always included with data-designer --- skills/data-designer/SKILL.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/skills/data-designer/SKILL.md b/skills/data-designer/SKILL.md index 9ee99fd9..8f5c01e9 100644 --- a/skills/data-designer/SKILL.md +++ b/skills/data-designer/SKILL.md @@ -54,9 +54,16 @@ Write a Python file to the current directory with a `load_config_builder()` 
func # ] # /// import data_designer.config as dd +from pydantic import BaseModel, Field -# Use custom generators when bespoke generation logic is needed +# Use Pydantic models when the output needs to conform to a specific schema +class MyStructuredOutput(BaseModel): + field_one: str = Field(description="...") + field_two: int = Field(description="...") + + +# Use custom generators when built-in column types aren't enough @dd.custom_column_generator( required_columns=["col_a"], side_effect_columns=["extra_col"], From 88f7c68aaefa0501d157864b3cadfa25c1558e49 Mon Sep 17 00:00:00 2001 From: Johnny Greco Date: Thu, 19 Mar 2026 00:27:25 -0700 Subject: [PATCH 27/27] imports tweak --- skills/data-designer/SKILL.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/skills/data-designer/SKILL.md b/skills/data-designer/SKILL.md index 8f5c01e9..ddee328a 100644 --- a/skills/data-designer/SKILL.md +++ b/skills/data-designer/SKILL.md @@ -49,8 +49,9 @@ Write a Python file to the current directory with a `load_config_builder()` func ```python # /// script # dependencies = [ -# "data-designer", # required -# # add any additional dependencies here +# "data-designer", # always required +# "pydantic", # only if this script imports from pydantic +# # add additional dependencies here # ] # /// import data_designer.config as dd