diff --git a/.claude/skills/address-pr-review/SKILL.md b/.claude/skills/address-pr-review/SKILL.md
new file mode 100644
index 0000000..53f0f11
--- /dev/null
+++ b/.claude/skills/address-pr-review/SKILL.md
@@ -0,0 +1,99 @@
+---
+name: address-pr-review
+description: Use when you have PR review comments to address and want to evaluate each comment's validity before deciding to fix, reply, or skip
+---
+
+# Address PR Review Comments
+
+## Overview
+
+Interactive workflow: analyze PR review comment validity, recommend action, let user decide (fix/reply/skip).
+
+## When to Use
+
+- PR has review comments needing evaluation before action
+- Reviewer feedback might be incorrect or needs discussion
+- Comments require varied responses (fix/reply/skip)
+- Need to balance code quality with respectful reviewer engagement
+
+## When NOT to Use
+
+- All comments are clearly valid and straightforward to fix
+- No comments yet or doing pre-review self-review
+- Comments only on non-code files without technical analysis needed
+
+## Workflow Overview
+
+```dot
+digraph pr_review_flow {
+ "Fetch PR comments" [shape=box];
+ "More comments?" [shape=diamond];
+ "Show comment + file context" [shape=box];
+ "Analyze validity" [shape=box];
+ "Recommend action" [shape=box];
+ "Ask user: Fix/Reply/Skip/Quit?" [shape=diamond];
+ "Make code changes" [shape=box];
+ "Draft reply" [shape=box];
+ "Track as skipped" [shape=box];
+ "Show summary" [shape=box];
+
+ "Fetch PR comments" -> "More comments?";
+ "More comments?" -> "Show comment + file context" [label="yes"];
+ "More comments?" -> "Show summary" [label="no"];
+ "Show comment + file context" -> "Analyze validity";
+ "Analyze validity" -> "Recommend action";
+ "Recommend action" -> "Ask user: Fix/Reply/Skip/Quit?";
+ "Ask user: Fix/Reply/Skip/Quit?" -> "Make code changes" [label="Fix"];
+ "Ask user: Fix/Reply/Skip/Quit?" -> "Draft reply" [label="Reply"];
+ "Ask user: Fix/Reply/Skip/Quit?" -> "Track as skipped" [label="Skip"];
+ "Ask user: Fix/Reply/Skip/Quit?" -> "Show summary" [label="Quit"];
+ "Make code changes" -> "More comments?";
+ "Draft reply" -> "More comments?";
+ "Track as skipped" -> "More comments?";
+}
+```
+
+## Quick Reference
+
+**Critical principle:** Reviewer may be wrong - analyze validity before recommending action.
+
+| Phase | Actions |
+|-------|---------|
+| **Fetch** | `gh api repos/{owner}/{repo}/pulls/$PR/comments`<br>Extract: path, line, body, user.login, id<br>Exit if no comments |
+| **Per Comment** | Show: file:line, author, comment, ±10 lines context<br>Analyze: Valid/Nitpick/Disagree/Question<br>Recommend: Fix/Reply/Skip with reasoning |
+| **Fix** | Minimal changes per llm/rules-*.md<br>Offer reply draft: `Fixed: [what]. [why]`<br>Show: `gh api --method POST repos/{owner}/{repo}/pulls/comments/$ID/replies -f body="..."` |
+| **Reply** | Draft based on type: Question/Suggestion/Disagreement<br>Let user edit<br>Show gh command (never auto-post) |
+| **Summary** | Processed X/N: Fixed Y, Replied Z, Skipped W<br>List: files modified, reply drafts, next steps |
+
+## Critical Principles
+
+| Principle | Violation Pattern |
+|-----------|-------------------|
+| **Analyze first** | Accepting all feedback as valid without critical analysis |
+| **Never auto-post** | Posting replies automatically instead of showing gh command |
+| **One at a time** | Batch processing all comments without individual analysis |
+| **Show context** | Making changes without displaying ±10 lines around code |
+| **Minimal changes** | Large refactors in response to small comments |
+| **Follow standards** | Ignoring llm/rules-*.md when fixing |
+| **Respectful honesty** | Being defensive/dismissive when reviewer is wrong |
+| **User control** | Posting drafts without letting user edit first |
+
+## Reply Formats
+
+- Fix: `Fixed: [what]. [why]`
+- Update: `Updated: [what]`
+- Answer: `[explanation]`
+- Acknowledge: `Good catch, [action/reason]`
+- Disagree: `[respectful reasoning]`
+
+## Setup & Usage
+
+Requires: `gh` CLI authenticated, GitHub remote configured
+
+```bash
+# Start session
+"use address-pr-review for PR "
+
+# Or list PRs first
+"use address-pr-review"
+```
diff --git a/.claude/skills/code-review/SKILL.md b/.claude/skills/code-review/SKILL.md
new file mode 100644
index 0000000..3292752
--- /dev/null
+++ b/.claude/skills/code-review/SKILL.md
@@ -0,0 +1,170 @@
+---
+name: code-review
+description: Use after completing implementation to review code quality, user impact, test coverage, and documentation before creating a PR
+---
+
+# Code Review
+
+## Overview
+
+Review code for quality, user impact, tests, and documentation. Balance technical excellence with practical simplicity.
+
+**Core principle:** Clean code should serve users, not just developers.
+
+## When to Use
+
+- After completing a feature or bug fix
+- Before creating a pull request
+- When reviewing changes before merge
+
+## When NOT to Use
+
+- Trivial changes (typo fixes)
+- Documentation-only changes
+- Initial exploration/prototyping
+
+## Review Process
+
+| Phase | Focus | Key Question |
+|-------|-------|--------------|
+| 1. Identify | What changed? | `git diff --name-only develop` |
+| 2. User Impact | How does this affect users? | Is UX better or worse? |
+| 3. Code Quality | Does it follow standards? | KISS + no anti-patterns? |
+| 4. Tests | Is it covered? | New code = new tests? |
+| 5. Docs | What needs updating? | llm/state-*.md current? |
+
+---
+
+## Phase 1: Identify Changes
+
+Categorize changed files:
+- **Backend:** `lib/`, `app.py`
+- **Frontend:** `frontend/src/`
+- **Tests:** `tests/`
+- **Docs:** `llm/`, `*.md`
+
+Note change type: new feature | bug fix | refactoring | enhancement
+
+---
+
+## Phase 2: User Impact
+
+**Ask for each change:**
+1. Does this affect what users see or do?
+2. Are error messages user-friendly (not technical jargon)?
+3. Are loading states shown?
+4. Can users recover from errors?
+5. Is this the simplest UX possible?
+
+**Red flags:**
+- Silent failures (user doesn't know something failed)
+- Lost work on errors
+- Unclear feedback ("Error: 500" vs "Could not save")
+- Unnecessary complexity exposed to users
+
+---
+
+## Phase 3: Code Quality
+
+### KISS Check
+
+Can each function be explained in one sentence? If not, it's too complex.
+
+### Backend Anti-patterns (blocking)
+
+- [ ] Silent failures (empty except blocks)
+- [ ] God functions (>30 lines, >3 params)
+- [ ] SQL injection (f-strings in queries)
+- [ ] Missing error context
+- [ ] Walrus operators / complex one-liners
+
+### Frontend Anti-patterns (blocking)
+
+- [ ] Empty catch blocks
+- [ ] Inline fetch (not in service layer)
+- [ ] Missing useEffect cleanup
+- [ ] `any` types or `as` assertions
+- [ ] Hardcoded colors (use theme: fg.*, canvas.*)
+- [ ] Prop drilling (>5 props)
+
+### Security
+
+- [ ] Inputs validated at API boundary
+- [ ] SQL parameterized (`?` placeholders)
+- [ ] No secrets in code/logs
+
+---
+
+## Phase 4: Test Coverage
+
+| Change Type | Required Test |
+|-------------|---------------|
+| New API endpoint | Unit test |
+| New block | `tests/blocks/test_*.py` |
+| Bug fix | Regression test |
+| User workflow change | E2E test |
+| Refactoring | Existing tests pass |
+
+**Test quality:**
+- Naming: `test_<function>_<condition>_<expected>`
+- One behavior per test
+- Error cases tested, not just happy path
+
+---
+
+## Phase 5: Documentation
+
+**Update llm/state-*.md when:**
+- New API endpoint → `state-backend.md`
+- New block → `state-backend.md`
+- New component/page → `state-frontend.md`
+- Architecture change → `state-project.md`
+
+**Code comments:** explain WHY, not what. Lowercase, concise.
+
+---
+
+## Output Format
+
+```markdown
+### User Impact
+[UX improvements or issues found]
+
+### Anti-patterns
+[location + violation + fix, or "none"]
+
+### Code Quality Issues
+[severity + location + fix, or "none"]
+
+### Test Coverage
+[required: present/missing | gaps if any]
+
+### Documentation Updates
+[files needing update, or "none"]
+
+### Verdict
+[BLOCK | REQUEST CHANGES | APPROVE]
+Reason: [brief explanation]
+```
+
+---
+
+## Verdict Rules
+
+| Condition | Verdict |
+|-----------|---------|
+| Anti-patterns found | BLOCK |
+| Security issues | BLOCK |
+| Missing required tests | REQUEST CHANGES |
+| Needs doc updates | REQUEST CHANGES |
+| All checks pass | APPROVE |
+
+---
+
+## Golden Rules
+
+1. Anti-patterns are blocking - always reject
+2. User experience matters - clean code that hurts UX is bad code
+3. KISS wins - one sentence explanation or it's too complex
+4. Tests are not optional - new code needs tests
+5. Fail loudly - silent failures are never acceptable
diff --git a/.claude/skills/debugging-pipelines/SKILL.md b/.claude/skills/debugging-pipelines/SKILL.md
new file mode 100644
index 0000000..a8ab8e8
--- /dev/null
+++ b/.claude/skills/debugging-pipelines/SKILL.md
@@ -0,0 +1,282 @@
+---
+name: debugging-pipelines
+description: Use when pipelines fail, produce unexpected output, or need systematic troubleshooting
+---
+
+# Debugging DataGenFlow Pipelines
+
+## Overview
+
+Systematic debugging workflow for any DataGenFlow pipeline failure or unexpected output. This skill provides a structured four-phase process to identify and fix root causes rather than guessing at solutions.
+
+**Core Principle:** Find the root cause before attempting fixes. Random fixes waste time and create new bugs.
+
+## When to Use
+
+Use this skill when:
+- Pipeline execution fails with unclear errors
+- Pipeline produces "bad data" or unexpected output
+- Need to isolate which block is causing issues
+- LLM generates duplicates or poor quality content
+- Output has unexpected fields (metadata pollution)
+- Results are missing expected fields
+- Performance issues or slow execution
+- Integration test failures
+
+## When NOT to Use
+
+Skip this skill for:
+- Simple configuration errors (typos in config)
+- Documentation lookup (how to use a specific block)
+- Feature requests (adding new functionality)
+- Questions about architecture (use codebase exploration instead)
+
+## The Four-Phase Debugging Process
+
+### Phase 1: Observe & Gather Evidence
+
+**Goal:** Understand what's wrong and collect data
+
+**Steps:**
+1. **Run the pipeline and capture full output**
+ - Use pytest for tests: `pytest tests/integration/test_X.py -v -s`
+ - For API, check logs and response data
+ - Save the complete error message and stack trace
+
+2. **Identify what makes output "bad"**
+ - Missing fields? (expected `price` but not in output)
+ - Wrong values? (all prices are 0)
+ - Extra fields? (input metadata leaking: `samples`, `target_count`)
+ - Duplicates? (similarity_score = 1.0, exact copies)
+ - Type errors? (expected dict, got list)
+
+3. **Check recent changes**
+ - Run `git diff` to see what changed
+ - Review recent commits that might affect this pipeline
+ - Check if tests passed before the change
+
+4. **Review error messages completely**
+ - Read the full stack trace, not just the last line
+ - Note file paths, line numbers, and error types
+ - Check for validation errors with detail context
+
+**Red Flags to Stop:**
+- "I think I know the problem" (without evidence)
+- "Let me try changing X" (before tracing data flow)
+- Skipping logs because "error is obvious"
+
+### Phase 2: Trace Data Flow
+
+**Goal:** Understand how data transforms through the pipeline
+
+**Steps:**
+1. **Identify which blocks touch the problematic data**
+ - Check pipeline definition (YAML or dict)
+ - List all blocks in execution order
+ - Note which blocks read/write the affected fields
+
+2. **Read block implementations**
+ - Open `lib/blocks/builtin/[block_name].py`
+ - Review the `execute()` method
+ - Check what inputs it expects and outputs it returns
+ - Look for data transformations or filtering logic
+
+3. **Trace data transformation between blocks**
+ - Check `lib/workflow.py:_process_single_seed()` for multiplier pipelines
+ - See how `accumulated_state` merges block outputs
+ - Identify where data gets added, modified, or removed
+
+4. **Check workflow execution flow**
+ - Normal pipeline: `lib/workflow.py:85-224`
+ - Multiplier pipeline: `lib/workflow.py:305-449`
+ - Understand seed processing vs result filtering
+
+**Key Files to Check:**
+- `lib/workflow.py` - Pipeline execution engine
+- `lib/blocks/builtin/` - All block implementations
+- `lib/entities/block_execution_context.py` - Context passed between blocks
+
+### Phase 3: Root Cause Analysis
+
+**Goal:** Form a specific, testable hypothesis
+
+**Steps:**
+1. **Form specific hypothesis**
+ - Format: "I think X causes Y because Z"
+ - Example: "I think input metadata leaks to output because workflow.py line 323 merges all initial_data without filtering"
+ - Be specific, not vague
+
+2. **Don't assume - verify with evidence**
+ - Read the actual code at the suspected line
+ - Check logs or traces confirming the behavior
+ - Look for similar patterns in other files
+
+3. **Use logs, traces, and execution results**
+ - Check test output for actual vs expected values
+ - Review trace data showing block inputs/outputs
+ - Examine execution_time for performance issues
+
+**Red Flags:**
+- "It's probably just..." (guessing)
+- "This usually means..." (pattern matching without verification)
+- Proposing fixes before understanding the cause
+
+### Phase 4: Fix & Verify
+
+**Goal:** Implement minimal fix targeting the root cause
+
+**Steps:**
+1. **Make minimal fix**
+ - Change only what's necessary to fix the root cause
+ - Don't refactor or "improve" surrounding code
+ - One logical change at a time
+
+2. **Run tests to verify fix**
+ - Run the specific failing test
+ - Check for test passing
+ - Run related tests to catch regressions
+
+3. **Check for side effects**
+ - Did the fix break other tests?
+ - Are there related features that might be affected?
+ - Review the change for unintended consequences
+
+4. **If fix doesn't work**
+ - Count: How many fixes have you tried?
+ - If < 3: Return to Phase 1, re-analyze with new information
+ - If ≥ 3: Question the architecture - might need design discussion
+
+**Success Criteria:**
+- Tests pass
+- Root cause addressed (not just symptoms)
+- No new bugs introduced
+- Code follows project guidelines (KISS, minimal changes)
+
+## Common Pipeline Issues
+
+| Issue Pattern | Where to Look | Typical Root Causes | Fix Pattern |
+|--------------|---------------|---------------------|-------------|
+| Output has unexpected fields | `lib/workflow.py` data merging | Input metadata leaking to output | Filter `initial_data_keys` before returning results |
+| Block returns wrong data type | Block's `execute()` method | Incorrect return type (dict vs list) | Fix block to return declared type |
+| LLM generates poor quality | Block's prompt building | Unclear instructions, low temperature, copying examples | Improve prompt, add diversity instructions |
+| LLM copying examples verbatim | SemanticInfiller prompt | Prompt doesn't emphasize creating NEW content | Add "do NOT copy" instruction to prompt |
+| Pipeline crashes on specific input | Block's validation logic | Missing input validation or type checking | Add validation in block's execute() |
+| Results missing fields | Block's output filtering or merging | Overly aggressive filtering or incorrect merge | Check field filtering logic |
+| All duplicates flagged | DuplicateRemover threshold | Threshold too low or embedding model issues | Check similarity_threshold config |
+| Metadata pollution | Workflow seed processing | Initial seed data not filtered from output | Use `_filter_output_data()` helper |
+
+## Critical Files Reference
+
+**Pipeline Execution:**
+- `lib/workflow.py:85-224` - Normal pipeline execution flow
+- `lib/workflow.py:305-449` - Multiplier pipeline (1→N expansion) with seed processing
+- `lib/workflow.py:275-284` - `_filter_output_data()` helper (filters metadata from results)
+
+**Built-in Blocks:**
+- `lib/blocks/builtin/structure_sampler.py` - Statistical sampling (multiplier block)
+- `lib/blocks/builtin/semantic_infiller.py:59-109` - LLM prompt building
+- `lib/blocks/builtin/semantic_infiller.py:146-165` - Metadata filtering in SemanticInfiller
+- `lib/blocks/builtin/duplicate_remover.py` - Embedding-based similarity detection
+
+**Core Infrastructure:**
+- `lib/entities/block_execution_context.py` - Context passed between blocks
+- `lib/blocks/base.py` - BaseBlock interface all blocks inherit from
+- `lib/entities/pipeline.py` - ExecutionResult, Usage models
+- `lib/template_renderer.py` - Jinja2 template rendering
+
+**Tests:**
+- `tests/integration/` - Integration tests for end-to-end verification
+- `tests/blocks/` - Unit tests for individual blocks
+
+## Debugging Checklist
+
+Use this checklist to ensure systematic debugging:
+
+```
+Phase 1: Observe & Gather Evidence
+□ Run pipeline and capture full output
+□ Identify specific problem (what's wrong?)
+□ Read error messages completely (full stack trace)
+□ Check recent git changes (git diff, git log)
+
+Phase 2: Trace Data Flow
+□ Check which blocks are in the pipeline
+□ Read those block implementations (execute methods)
+□ Trace data flow through blocks (accumulated_state)
+□ Understand workflow execution (normal vs multiplier)
+
+Phase 3: Root Cause Analysis
+□ Form specific hypothesis ("X causes Y because Z")
+□ Verify hypothesis with evidence (code, logs, traces)
+□ Don't assume - read actual code
+□ Check for similar patterns elsewhere
+
+Phase 4: Fix & Verify
+□ Make minimal fix targeting root cause
+□ Run tests to verify fix works
+□ Check for unintended side effects
+□ Follow project guidelines (KISS, simplicity)
+```
+
+## Real-World Example: Data Augmentation Metadata Pollution
+
+**Problem Observed:**
+Pipeline output contained input configuration fields (`samples`, `target_count`, `categorical_fields`) mixed with generated data.
+
+**Phase 1 - Evidence:**
+```json
+// Expected output:
+{"category": "electronics", "price": 449, "description": "...", "is_duplicate": false}
+
+// Actual output:
+{"category": "electronics", "price": 449, "description": "...",
+ "samples": [...], "target_count": 10, "categorical_fields": [...]} // ❌ Bad!
+```
+
+**Phase 2 - Trace:**
+- Traced workflow.py seed processing
+- Found `merged_state = {**initial_data, **seed_data}` at line 323
+- Merged state flows through all blocks
+- No filtering before returning results
+
+**Phase 3 - Root Cause:**
+Hypothesis: "Input metadata leaks to output because workflow.py merges all initial_data into accumulated_state without filtering configuration fields before returning results"
+
+**Phase 4 - Fix:**
+1. Added `_filter_output_data()` helper method
+2. Track `initial_data_keys` at merge time
+3. Filter those keys before returning `ExecutionResult`
+4. Tests passed, metadata removed from output
+
+**Lessons:**
+- Data flow tracing revealed the merge point
+- Minimal fix (filter helper) solved the root cause
+- No refactoring needed - targeted change only
+
+## Tips for Effective Debugging
+
+1. **Start with the simplest explanation**
+ - Don't assume complex bugs when simple causes are more likely
+ - Check configuration before code logic
+
+2. **Use the scientific method**
+ - Observe → Hypothesize → Test → Verify
+ - One variable at a time
+
+3. **Trust but verify**
+ - Don't trust assumptions about what code does
+ - Read the actual implementation
+
+4. **Leverage existing patterns**
+ - Look for similar working code in the codebase
+ - Compare broken vs working implementations
+
+5. **Document as you go**
+ - Keep notes on what you've checked
+ - Record hypotheses and test results
+ - Helps if you need to ask for help
+
+## Related Skills
+
+- `implementing-datagenflow-blocks` - For understanding block structure and creation
+- `address-pr-review` - For evaluating whether debugging revealed design issues
diff --git a/.claude/skills/implementing-datagenflow-blocks/SKILL.md b/.claude/skills/implementing-datagenflow-blocks/SKILL.md
new file mode 100644
index 0000000..f9cbaca
--- /dev/null
+++ b/.claude/skills/implementing-datagenflow-blocks/SKILL.md
@@ -0,0 +1,623 @@
+---
+name: implementing-datagenflow-blocks
+description: Use when creating new blocks for DataGenFlow pipeline system or modifying existing blocks to ensure consistency with established patterns
+---
+
+# Implementing DataGenFlow Blocks
+
+## Overview
+
+DataGenFlow blocks are composable pipeline components. Follow KISS principles: write minimal functions, make code self-explanatory, keep it simple.
+
+## When to Use
+
+- Creating a new block
+- Modifying existing block behavior
+- Reviewing block implementations
+- Debugging block execution issues
+
+**When NOT to use:**
+- General backend code (use llm/rules-backend.md)
+- Frontend development (use llm/rules-frontend.md)
+
+## Block Structure
+
+```python
+import logging
+from typing import Any
+
+import litellm # if using LLM
+
+from lib.blocks.base import BaseBlock
+from lib.entities import pipeline
+from lib.entities.block_execution_context import BlockExecutionContext
+from lib.template_renderer import render_template # if using templates
+
+logger = logging.getLogger(__name__)
+
+
+class MyBlock(BaseBlock):
+ name = "My Block"
+ description = "Short description of what this block does"
+ category = "generators" # generators|transformers|validators|utilities
+ inputs = ["field1"] # or ["*"] for any input fields
+ outputs = ["field2"] # or ["*"] for dynamic outputs
+
+ _config_descriptions = {
+ "param_name": "Help text shown in UI",
+ }
+
+ def __init__(
+ self,
+ param1: str,
+ model: str | None = None, # EXACTLY "model" for LLM selection UI
+ temperature: float = 0.7,
+ ):
+ self.param1 = param1
+ self.model_name = model # store as model_name
+ self.temperature = temperature
+
+ async def execute(self, context: BlockExecutionContext) -> dict[str, Any]:
+ from app import llm_config_manager # import inside execute
+
+ # your logic here
+
+ return {"field": value, "_usage": usage_info}
+```
+
+## UI Integration Patterns
+
+The frontend automatically renders different UI controls based on parameter names, types, and class attributes.
+
+### Model Dropdown (LLM)
+
+**Parameter MUST be named exactly `model`** for automatic dropdown:
+
+```python
+def __init__(
+ self,
+ model: str | None = None, # MUST be "model" and str|None
+ temperature: float = 0.7,
+ max_tokens: int = 2048,
+):
+ self.model_name = model # store as model_name
+```
+
+**Config description:**
+```python
+_config_descriptions = {
+ "model": "Select LLM model to use (leave empty for default)",
+}
+```
+
+**Usage in execute:**
+```python
+async def execute(self, context: BlockExecutionContext) -> dict[str, Any]:
+ from app import llm_config_manager
+
+ llm_config = await llm_config_manager.get_llm_model(self.model_name)
+ llm_params = llm_config_manager.prepare_llm_call(
+ llm_config,
+ messages=messages,
+ temperature=self.temperature,
+ max_tokens=self.max_tokens,
+ )
+```
+
+### Embedding Model Dropdown
+
+**Parameter MUST be named exactly `embedding_model`**:
+
+```python
+def __init__(
+ self,
+ embedding_model: str | None = None, # MUST be "embedding_model"
+):
+ self.embedding_model_name = embedding_model
+```
+
+**Config description:**
+```python
+_config_descriptions = {
+ "embedding_model": "Embedding model to use (leave empty for default)",
+}
+```
+
+**Usage:**
+```python
+embedding_config = await llm_config_manager.get_embedding_model(
+ self.embedding_model_name
+)
+```
+
+### Enum Dropdown
+
+Use `_config_enums` class attribute to create dropdown with predefined options:
+
+```python
+class MyBlock(BaseBlock):
+ _config_enums = {
+ "mode": ["strict", "lenient", "auto"],
+ "format": ["json", "yaml", "xml"],
+ }
+
+ def __init__(
+ self,
+ mode: str = "auto",
+ format: str = "json",
+ ):
+ self.mode = mode
+ self.format = format
+```
+
+### Multi-Select Checkboxes
+
+For array parameters with enum values:
+
+```python
+class MyBlock(BaseBlock):
+ _config_enums = {
+ "features": ["feature_a", "feature_b", "feature_c"],
+ }
+
+ def __init__(
+ self,
+ features: list[str] | None = None,
+ ):
+ self.features = features or []
+```
+
+### Field Reference Dropdown
+
+Use `_field_references` to create dropdown showing available fields from pipeline:
+
+```python
+class MyBlock(BaseBlock):
+ _field_references = ["source_field", "target_field"]
+
+ _config_descriptions = {
+ "source_field": "Field to read from",
+ "target_field": "Field to write to",
+ }
+
+ def __init__(
+ self,
+ source_field: str,
+ target_field: str,
+ ):
+ self.source_field = source_field
+ self.target_field = target_field
+```
+
+### Template Fields (Monaco Editor)
+
+Parameters with these patterns automatically get Monaco editor:
+- Name contains "prompt", "template", or "instruction"
+- Or set `schema.format = "jinja2"` via config
+
+```python
+def __init__(
+ self,
+ user_prompt: str = "", # automatically gets editor
+ system_prompt: str = "", # automatically gets editor
+ custom_template: str = "", # automatically gets editor
+):
+ self.user_prompt = user_prompt
+```
+
+**Config description should mention Jinja2:**
+```python
+_config_descriptions = {
+ "user_prompt": (
+ "Jinja2 template. Reference fields with {{ field_name }} or "
+ "{{ metadata.field_name }}"
+ ),
+}
+```
+
+**Rendering:**
+```python
+from lib.template_renderer import render_template
+
+rendered = render_template(self.user_prompt, context.accumulated_state)
+```
+
+### JSON Object/Array (Monaco Editor)
+
+Parameters typed as `dict` or `list` get JSON Monaco editor:
+
+```python
+def __init__(
+ self,
+ json_schema: dict[str, Any], # JSON editor
+ field_list: list[str], # JSON editor
+):
+ self.json_schema = json_schema
+ self.field_list = field_list
+```
+
+### Number Input
+
+Parameters typed as `int` or `float` get number input:
+
+```python
+def __init__(
+ self,
+ temperature: float = 0.7, # number input
+ max_tokens: int = 2048, # number input
+):
+ self.temperature = temperature
+```
+
+### Textarea
+
+Parameters with these patterns get multi-line textarea:
+- String length > 100 characters
+- Name contains "description"
+- Type has long content
+
+```python
+def __init__(
+ self,
+ description: str = "", # automatically gets textarea
+):
+ self.description = description
+```
+
+### Text Input (Default)
+
+Short string parameters get single-line text input:
+
+```python
+def __init__(
+ self,
+ name: str,
+ label: str = "",
+):
+ self.name = name
+```
+
+## JSON Array as String Pattern
+
+For parameters that should accept either JSON array or Jinja template (like `fields_to_generate`):
+
+```python
+def __init__(
+ self,
+ fields_to_generate: str, # str, not list[str]
+):
+ self.fields_to_generate_template = fields_to_generate
+
+_config_descriptions = {
+ "fields_to_generate": (
+ 'JSON array or Jinja template. Examples: ["bio", "storage"] or '
+ '{{ fields_to_generate | tojson }}'
+ ),
+}
+```
+
+**Parsing in execute:**
+```python
+import json
+
+fields_rendered = render_template(
+ self.fields_to_generate_template,
+ context.accumulated_state
+)
+try:
+ fields_list = json.loads(fields_rendered)
+ if not isinstance(fields_list, list):
+ raise BlockExecutionError("Must be JSON array")
+except json.JSONDecodeError as e:
+ raise BlockExecutionError(f"Invalid JSON: {str(e)}")
+```
+
+**Template usage:**
+```yaml
+fields_to_generate: "{{ fields_to_generate | tojson }}"
+```
+
+## LLM Integration Pattern
+
+Full pattern for blocks that call LLM:
+
+```python
+async def execute(self, context: BlockExecutionContext) -> dict[str, Any]:
+ from app import llm_config_manager
+
+ # prepare messages
+ messages = [
+ {"role": "system", "content": system_prompt},
+ {"role": "user", "content": user_prompt},
+ ]
+
+ # get llm config
+ llm_config = await llm_config_manager.get_llm_model(self.model_name)
+ llm_params = llm_config_manager.prepare_llm_call(
+ llm_config,
+ messages=messages,
+ temperature=self.temperature,
+ max_tokens=self.max_tokens,
+ )
+
+ # add trace metadata for langfuse grouping
+ llm_params["metadata"] = {
+ "trace_id": context.trace_id,
+ "tags": ["datagenflow"],
+ }
+
+ logger.info(f"Calling LiteLLM with model={llm_params.get('model')}")
+
+ try:
+ response = await litellm.acompletion(**llm_params)
+ except Exception as e:
+ logger.error(f"LLM call failed for {self.name}: {e}")
+ raise
+
+ content = response.choices[0].message.content
+
+ # extract usage info
+ usage_info = pipeline.Usage(
+ input_tokens=response.usage.prompt_tokens or 0,
+ output_tokens=response.usage.completion_tokens or 0,
+ cached_tokens=getattr(response.usage, "cache_read_input_tokens", 0) or 0,
+ )
+
+ return {
+ "generated": content,
+ "_usage": usage_info.model_dump(),
+ }
+```
+
+## State Management
+
+### Reading State
+
+```python
+async def execute(self, context: BlockExecutionContext) -> dict[str, Any]:
+ # get current record
+ current = context.accumulated_state.copy()
+
+ # remove internal fields
+ current.pop("_usage", None)
+ current.pop("_hints", None)
+
+ # get reference data from initial state
+ samples = context.get_state("samples", [])
+```
+
+### Caching Per Execution
+
+**Never use instance-level state that persists across jobs.** Use trace_id-keyed caching:
+
+```python
+def __init__(self):
+ # cache per trace_id (one cache per pipeline execution)
+ self._embeddings_cache: dict[str, list[list[float]]] = {}
+
+async def execute(self, context: BlockExecutionContext) -> dict[str, Any]:
+ trace_id = context.trace_id
+
+ # build cache once per pipeline execution
+ if trace_id not in self._embeddings_cache:
+ # compute embeddings
+ self._embeddings_cache[trace_id] = embeddings
+
+ # use cached data
+ cached_embeddings = self._embeddings_cache[trace_id]
+```
+
+## Multiplier Blocks
+
+Blocks that generate multiple items from one input:
+
+```python
+from lib.blocks.base import BaseMultiplierBlock
+from lib.entities.block_execution_context import BlockExecutionContext
+
+class StructureSampler(BaseMultiplierBlock):
+ name = "Structure Sampler"
+ category = "seeders"
+
+ async def execute(
+ self,
+ context: BlockExecutionContext
+ ) -> list[dict[str, Any]]:
+ # read from context and return list of records
+ return [record1, record2, record3]
+```
+
+## Code Quality
+
+### KISS Principle
+
+Write minimal number of functions, make code self-explanatory:
+
+```python
+# ✅ good - simple and clear
+def _prepare_prompts(self, data: dict[str, Any]) -> tuple[str, str]:
+ """render jinja2 templates with data context"""
+ system_template = self.system_prompt or data.get("system", "")
+ user_template = self.user_prompt or data.get("user", "")
+
+ system = render_template(system_template, data) if system_template else ""
+ user = render_template(user_template, data) if user_template else ""
+
+ return system, user
+
+# ❌ bad - over-engineered with too many tiny functions
+def _get_system(self, data): ...
+def _get_user(self, data): ...
+def _render_system(self, template, data): ...
+def _render_user(self, template, data): ...
+```
+
+### Comments
+
+Comments in lowercase, explain WHY not WHAT:
+
+```python
+# ✅ good - explains why
+def _extract_text(self, record: dict[str, Any]) -> str:
+ """
+ extract text from specified fields or all string fields
+ joins with spaces for embedding
+ """
+
+# ❌ bad - just describes what code does
+def _extract_text(self, record: dict[str, Any]) -> str:
+ """Extract text from record fields"""
+ # Loop through fields and get string values
+```
+
+### Imports
+
+All imports at top of file, not inside functions (except `from app import llm_config_manager`):
+
+```python
+# ✅ good
+import json
+import logging
+from typing import Any
+
+import litellm
+
+from lib.blocks.base import BaseBlock
+
+# ❌ bad
+def execute(self, context):
+ import json # wrong place
+```
+
+**Exception:** `from app import llm_config_manager` goes inside `execute()` to avoid circular imports.
+
+## Testing
+
+### Unit Tests
+
+Create `tests/blocks/test_<block_name>.py`:
+
+```python
+from unittest.mock import AsyncMock, MagicMock, patch
+import pytest
+
+from lib.blocks.builtin.my_block import MyBlock
+from lib.entities.block_execution_context import BlockExecutionContext
+
+
+def make_context(state: dict) -> BlockExecutionContext:
+ """helper to create test context"""
+ return BlockExecutionContext(
+ trace_id="test-trace",
+ pipeline_id=1,
+ accumulated_state=state,
+ )
+
+
+class TestMyBlockInit:
+ def test_init_basic(self):
+ block = MyBlock(param="value")
+ assert block.param == "value"
+
+
+class TestMyBlockExecution:
+ @pytest.mark.asyncio
+ @patch("litellm.acompletion")
+ @patch("app.llm_config_manager")
+ async def test_execute_basic(self, mock_config_manager, mock_completion):
+ # setup mocks
+ mock_config_manager.get_llm_model = AsyncMock(...)
+ mock_completion.return_value = MagicMock(...)
+
+ block = MyBlock(param="value")
+ context = make_context({"field": "value"})
+
+ result = await block.execute(context)
+
+ assert result["field"] == "expected"
+```
+
+### Integration Tests
+
+Add to `tests/integration/test_data_augmentation.py`.
+
+## Documentation Updates
+
+**Always update after implementing:**
+
+1. **llm/state-project.md** - block count, description
+2. **llm/state-backend.md** - block count, details
+3. **lib/templates/** - template YAML if applicable
+
+## Common Mistakes
+
+| Mistake | Problem | Fix |
+|---------|---------|-----|
+| Parameter named `model_name` | No dropdown UI | Name it exactly `model` |
+| Parameter named `embedding` | No dropdown UI | Name it exactly `embedding_model` |
+| `list[str]` for JSON arrays | Can't use templates | Use `str`, render + parse |
+| Instance-level cache | Data leaks between jobs | Use `dict[str, T]` keyed by `trace_id` |
+| Imports inside functions | Not the codebase style | Move to top (except llm_config_manager) |
+| Over-engineering | Too many tiny functions | KISS - keep it simple |
+| Comments describe what | Obvious from code | Explain WHY, lowercase |
+| Forgot `_usage` | Usage not tracked | Always return `_usage` from LLM |
+| Missing `_config_descriptions` | No help text in UI | Add descriptions for all params |
+| Wrong enum format | UI doesn't render dropdown | Use `_config_enums` class attribute |
+
+## Implementation Checklist
+
+**Design:**
+- [ ] Choose block type (BaseBlock vs BaseMultiplierBlock)
+- [ ] Define inputs/outputs
+- [ ] Identify parameters and their types
+- [ ] Name model parameters correctly (`model`, `embedding_model`)
+- [ ] Decide which params need enum dropdowns or field references
+
+**Implementation:**
+- [ ] Add all imports at top (except llm_config_manager)
+- [ ] Create class with `name`, `description`, `category`, `inputs`, `outputs`
+- [ ] Add `_config_descriptions` with helpful UI text
+- [ ] Add `_config_enums` if using dropdowns
+- [ ] Add `_field_references` if using field selection
+- [ ] Implement `__init__` with correct parameter types
+- [ ] Implement `execute()` method
+- [ ] Add template rendering if needed
+- [ ] Use `llm_config_manager.get_llm_model()` for LLM
+- [ ] Use `llm_config_manager.get_embedding_model()` for embeddings
+- [ ] Add trace metadata to `llm_params["metadata"]`
+- [ ] Track usage with `pipeline.Usage()` and return `_usage`
+- [ ] Use trace_id-keyed caching if needed
+- [ ] Write lowercase comments explaining WHY
+
+**Testing:**
+- [ ] Create unit test file `tests/blocks/test_<block_name>.py`
+- [ ] Test initialization variants
+- [ ] Test execution with mocked LLM config
+- [ ] Test edge cases and error handling
+- [ ] Add integration test
+- [ ] Run `pytest tests/` - all pass
+
+**Documentation:**
+- [ ] Update `llm/state-project.md`
+- [ ] Update `llm/state-backend.md`
+- [ ] Create template YAML if applicable
+
+**Review:**
+- [ ] Model parameters named exactly right
+- [ ] Imports at top (except llm_config_manager)
+- [ ] No instance-level state
+- [ ] KISS principle followed
+- [ ] `_usage` returned if using LLM
+- [ ] All UI integrations correct (enums, field refs, descriptions)
+
+## Reference Examples
+
+**Simple:** `lib/blocks/builtin/field_mapper.py`
+
+**LLM:** `lib/blocks/builtin/text_generator.py`
+
+**Structured:** `lib/blocks/builtin/structured_generator.py`
+
+**Multiplier:** `lib/blocks/builtin/structure_sampler.py`
+
+**Embedding:** `lib/blocks/builtin/duplicate_remover.py`
diff --git a/.coderabbit.yaml b/.coderabbit.yaml
new file mode 100644
index 0000000..8973983
--- /dev/null
+++ b/.coderabbit.yaml
@@ -0,0 +1,97 @@
+language: en-US
+early_access: false
+enable_free_tier: true
+
+reviews:
+ request_changes_workflow: true
+
+ high_level_summary: true
+ poem: false
+ review_status: true
+ collapse_walkthrough: false
+
+ auto_review:
+ enabled: true
+ auto_incremental_review: true
+ ignore_title_keywords: []
+ branches:
+ - "*"
+
+ # Path-based instructions
+ path_instructions:
+ # Backend code
+ - path: "**/*.py"
+ instructions: |
+ Apply backend code review checklist from llm/rules-backend.md:
+ Identify which llm/*.md files need updates:
+ - New API endpoints → update llm/state-backend.md
+ - New blocks → update llm/state-backend.md and llm/state-project.md
+ - Changed patterns → update relevant llm/state-*.md
+ Identify if the docs need updates.
+ Golden rule: if code cannot be explained in one sentence, it's too complex.
+
+ # Frontend code
+ - path: "frontend/**/*.{ts,tsx,js,jsx}"
+ instructions: |
+ Apply frontend code review checklist from llm/rules-frontend.md:
+ Identify which llm/*.md files need updates:
+ - New pages/components → update llm/state-frontend.md
+ - Changed UI flow → update llm/state-frontend.md
+ - New patterns → update llm/state-frontend.md
+ Identify if the docs need updates.
+ Golden rule: keep components focused and maintainable.
+
+ # Block implementations
+ - path: "lib/blocks/**/*.py"
+ instructions: |
+ Apply block implementation checklist from .claude/skills/implementing-datagenflow-blocks/SKILL.md:
+ Identify which llm/*.md files need updates:
+ - New blocks → update llm/state-backend.md and llm/state-project.md
+ - Changed block behavior → update relevant llm/state-*.md
+ Identify if the docs need updates.
+ Golden rule: blocks should be single-responsibility and reusable.
+ # Tests
+ - path: "tests/**/*.py"
+ instructions: |
+ Review test quality:
+ - One behavior per test
+ - Test names: test_<function>_<scenario>_<expected_result>
+ - Error cases tested (not just happy path)
+ - Proper use of fixtures
+ - Mocks used appropriately
+ - Tests are focused and maintainable
+
+ # Documentation files
+ - path: "llm/**/*.md"
+ instructions: |
+ Review documentation updates:
+ - Changes reflect actual code (not aspirational designs)
+ - Updates are gradual and incremental (not complete rewrites)
+ - Technical and concise
+ - Explain what changed and why
+ - Note any breaking changes
+
+ # Configuration files
+ - path: "**/*.{yaml,yml,json,toml}"
+ instructions: |
+ Review configuration changes:
+ - No secrets committed
+ - Valid syntax
+ - Changes documented if needed
+ - Backwards compatible or migration documented
+
+chat:
+ auto_reply: true
+
+knowledge_base:
+ learnings:
+ scope: "auto"
+
+ opt_out: false
+
+tone_instructions: |
+ Be direct, technical, and concise:
+ 1. Blocking issues (anti-patterns, security, broken tests) - must fix
+ 2. Code quality violations - should fix
+ 3. Documentation updates needed
+ 4. Improvements - nice to have
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
index 2ed1de9..a7e9c01 100644
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -23,4 +23,4 @@ Title Format: copy the one of the issue keep this format
- [ ] `make format` passes
- [ ] `make pre-merge` passes
- [ ] PR update from develop branch
-- [ ] Copilot review run and addressed
+- [ ] CodeRabbit review requested and addressed (comment `@coderabbitai review`)
diff --git a/.gitignore b/.gitignore
index cf68d8c..737f956 100644
--- a/.gitignore
+++ b/.gitignore
@@ -19,7 +19,8 @@ data/*.db-journal
# ide
.vscode/
.idea/
-.claude/
+.claude/*
+!.claude/skills/
.worktrees/
# cache
diff --git a/Makefile b/Makefile
index 7a875a9..8233cf5 100644
--- a/Makefile
+++ b/Makefile
@@ -91,6 +91,12 @@ test:
test-integration:
uv run pytest -m integration -v
+test-e2e:
+ ./tests/e2e/run_all_tests.sh
+
+test-e2e-ui:
+ ./tests/e2e/run_all_tests.sh --ui
+
pre-merge: format-all lint-all typecheck-all test
@echo "✅ Pre-merge checks completed successfully. Ready to merge!"
diff --git a/app.py b/app.py
index bbf4c74..ee8e978 100644
--- a/app.py
+++ b/app.py
@@ -534,6 +534,7 @@ async def get_pipeline(pipeline_id: int) -> dict[str, Any]:
blocks = pipeline.definition.get("blocks", [])
pipeline_dict = pipeline.model_dump()
pipeline_dict["first_block_is_multiplier"] = is_multiplier_pipeline(blocks)
+ pipeline_dict["first_block_type"] = blocks[0].get("type") if blocks else None
return pipeline_dict
diff --git a/docs/template_data_augmentation.md b/docs/template_data_augmentation.md
new file mode 100644
index 0000000..152ef6e
--- /dev/null
+++ b/docs/template_data_augmentation.md
@@ -0,0 +1,429 @@
+---
+title: Data Augmentation Template
+description: Generate synthetic records preserving statistical distributions from sample data
+---
+
+# Data Augmentation Template
+
+## Table of Contents
+- [Overview](#overview)
+- [Pipeline Architecture](#pipeline-architecture)
+- [Seed Format](#seed-format)
+- [Output Format](#output-format)
+- [How It Works](#how-it-works)
+- [Use Cases](#use-cases)
+- [Customization](#customization)
+- [Filtering Duplicates](#filtering-duplicates)
+- [Tuning Parameters](#tuning-parameters)
+- [Common Issues](#common-issues)
+- [Example Workflow](#example-workflow)
+- [Related Documentation](#related-documentation)
+
+## Overview
+
+**Complexity:** Advanced (3 blocks with multiplier)
+**Use Case:** Generate synthetic data that preserves statistical patterns from samples
+
+This template creates realistic synthetic records from sample data while maintaining:
+- Statistical distributions (e.g., "electronics" appears 50% of the time)
+- Numeric range constraints (e.g., electronics prices $299-$899, furniture prices $199-$349)
+- Semantic coherence (LLM-generated fields match context)
+- Output diversity (duplicate detection via embeddings)
+
+**Special Features:**
+- Statistical sampling preserves distributions
+- LLM-powered semantic field generation
+- Embedding-based duplicate detection
+- Supports field dependencies
+
+## Pipeline Architecture
+
+```text
+┌─────────────┐ ┌─────────────┐ ┌─────────────┐
+│ Structure │──►│ Semantic │──►│ Duplicate │
+│ Sampler │ │ Infiller │ │ Remover │
+└─────────────┘ └─────────────┘ └─────────────┘
+
+Input: samples array
+ ↓
++ category, _hints (multiplies: 1 seed → N skeletons)
+ ↓
++ description, price (LLM-generated fields)
+ ↓
++ is_duplicate, similarity_to_seeds, similarity_to_generated
+```
+
+**Blocks:**
+1. **StructureSampler** - Learns distributions from samples, generates statistical skeletons
+2. **SemanticInfiller** - Completes skeletons with LLM-generated semantic fields
+3. **DuplicateRemover** - Filters similar records using embedding similarity
+
+**Key Concept:** The StructureSampler is a multiplier block that generates N skeletons from one seed. Each skeleton flows through the remaining blocks to create one record.
+
+## Seed Format
+
+**Required fields:**
+- `samples` - Array of example records (minimum 3 recommended)
+- `target_count` - Number of synthetic records to generate
+- `categorical_fields` - Fields to preserve distribution
+- `fields_to_generate` - Fields for LLM to generate
+
+**Optional fields:**
+- `numeric_fields` - Numeric distributions to preserve
+- `dependencies` - Field relationships (e.g., role depends on plan)
+- `comparison_fields` - Fields for duplicate detection
+
+**Example seed (Product Catalog):**
+```json
+[
+ {
+ "repetitions": 1,
+ "metadata": {
+ "samples": [
+ {"category": "electronics", "price": 299, "description": "Wireless noise-canceling headphones with premium sound quality"},
+ {"category": "electronics", "price": 899, "description": "13-inch laptop with high-resolution display"},
+ {"category": "furniture", "price": 199, "description": "Ergonomic office chair with lumbar support"},
+ {"category": "furniture", "price": 349, "description": "Adjustable standing desk with memory presets"}
+ ],
+ "target_count": 10,
+ "categorical_fields": ["category"],
+ "numeric_fields": ["price"],
+ "fields_to_generate": ["description", "price"],
+ "comparison_fields": ["description"]
+ }
+ }
+]
+```
+
+**Field Explanations:**
+- **`samples`** - Example products showing the data structure (4 samples provided)
+- **`target_count`** - How many new products to generate (10 in this example)
+- **`categorical_fields`** - Fields with discrete values that preserve distribution (50% electronics, 50% furniture)
+- **`numeric_fields`** - Fields with numeric ranges that provide hints to the LLM (electronics: $299-$899, furniture: $199-$349)
+- **`fields_to_generate`** - Fields for the LLM to create NEW content for (description and price)
+- **`comparison_fields`** - Fields to check for duplicates using embedding similarity (description)
+
+> **Note:** `price` appears in both `numeric_fields` and `fields_to_generate`. This provides range hints to guide the LLM while letting it generate contextually appropriate prices.
+
+> **Tip:** Use 4-10 diverse samples for best results. More samples = better distribution learning.
+
+## Output Format
+
+The pipeline outputs a `generated_samples` array containing the final records.
+
+Each generated record contains:
+- Sampled categorical fields (preserving distribution)
+- LLM-generated semantic fields
+- Duplicate detection metadata
+
+**Example output:**
+```json
+{
+ "generated_samples": [
+ {
+ "category": "electronics",
+ "price": 449,
+ "description": "Bluetooth speaker with 360-degree sound and waterproof design",
+ "is_duplicate": false,
+ "similarity_to_seeds": 0.45,
+ "similarity_to_generated": 0.42
+ }
+ ]
+}
+```
+
+**Each record contains:**
+- Sampled categorical fields (`category`)
+- LLM-generated fields (`price`, `description`)
+- Duplicate detection metadata:
+ - `similarity_to_seeds`: highest similarity to original seed samples
+ - `similarity_to_generated`: highest similarity to other generated records
+ - `is_duplicate`: true if either similarity exceeds threshold
+
+**Note:** Input configuration fields like `samples`, `target_count`, `categorical_fields`, etc. are NOT included in the output.
+
+## How It Works
+
+### Stage 1: StructureSampler (Statistical Skeleton Generation)
+
+**What it does:**
+- Analyzes sample data to learn categorical frequencies
+- Computes numeric statistics (min, max, mean) for range hints
+- Respects field dependencies (e.g., role depends on plan)
+- Generates N skeletons respecting learned distributions
+
+**Example:** If samples show "Free" plan 40% and "Pro" 30%, generated skeletons maintain these ratios.
+
+**Output per skeleton:**
+```json
+{
+ "category": "electronics",
+ "_hints": {
+ "price_range": [199.0, 899.0],
+ "exemplars": [
+ {"category": "electronics", "price": 299, "description": "Wireless headphones"},
+ {"category": "electronics", "price": 899, "description": "13-inch laptop"}
+ ]
+ }
+}
+```
+
+### Stage 2: SemanticInfiller (LLM-Powered Field Completion)
+
+**What it does:**
+- Receives skeleton with locked statistical fields
+- Builds contextual prompt with numeric hints and exemplar examples
+- Calls LLM to generate semantic fields (bio, description, etc.)
+- Restores locked fields if LLM overwrites them
+
+**Prompt structure:**
+```text
+You are a data generator. Complete the following record skeleton.
+
+Skeleton: {category: "electronics"}
+
+Numeric hints:
+- price should be between 199-899
+
+Matching examples:
+- {category: "electronics", price: 299, description: "Wireless headphones"}
+
+Generate: ["description", "price"]
+Return JSON: {"description": "...", "price": ...}
+```
+
+**Locked fields behavior:** Categorical fields sampled by StructureSampler (e.g., `category`) are preserved even if the LLM tries to modify them.
+
+### Stage 3: DuplicateRemover (Similarity Filtering)
+
+**What it does:**
+- Extracts text from comparison fields
+- Generates embeddings via embedding model
+- Computes cosine similarity with cached embeddings
+- Marks records as duplicates if similarity > threshold
+
+**Output:**
+```json
+{
+ "category": "electronics",
+ "price": 549,
+ "description": "Portable bluetooth speaker with waterproof design",
+ "is_duplicate": false,
+ "similarity_to_seeds": 0.72,
+ "similarity_to_generated": 0.45
+}
+```
+
+**Output fields:**
+- `similarity_to_seeds`: highest similarity to any original sample
+- `similarity_to_generated`: highest similarity to previously generated records
+- `is_duplicate`: true if either similarity exceeds threshold
+
+> **Note:** DuplicateRemover gracefully degrades if embedding model is unavailable - marks all records as `is_duplicate: false` with similarity scores of 0.0.
+
+## Use Cases
+
+**Perfect for:**
+- Expanding training datasets while maintaining patterns
+- Creating realistic test data for applications
+- Generating synthetic user profiles with distributions
+- Data augmentation for ML training sets
+- Privacy-preserving data generation (learn from real, generate synthetic)
+
+**Not ideal for:**
+- Time-series data (no temporal modeling)
+- Graph/network data (no relationship modeling)
+- Highly correlated numeric fields (limited correlation preservation)
+
+## Customization
+
+Modify the template in `lib/templates/data_augmentation.yaml`:
+
+**Adjust generation count:**
+```yaml
+blocks:
+ - type: StructureSampler
+ config:
+ target_count: 100 # Generate 100 records
+```
+
+**Change LLM creativity:**
+```yaml
+ - type: SemanticInfiller
+ config:
+ temperature: 0.9 # Higher = more creative (0.7-0.9 recommended)
+ max_tokens: 300 # Longer outputs
+```
+
+**Adjust duplicate threshold:**
+```yaml
+ - type: DuplicateRemover
+ config:
+ similarity_threshold: 0.9 # Stricter (0.8-0.9 recommended)
+```
+
+**Add more dependencies:**
+```json
+{
+ "dependencies": {
+ "role": ["plan"],
+ "storage": ["plan"]
+ }
+}
+```
+
+## Filtering Duplicates
+
+Records marked as `is_duplicate: true` should be filtered post-generation:
+
+**Via API:**
+```python
+result = await pipeline.execute(seed_data)
+generated = result.result.get("generated_samples", [])
+unique_records = [r for r in generated if not r.get("is_duplicate")]
+```
+
+**Via export (manual filter):**
+```bash
+# Export all records
+curl http://localhost:8000/api/export?job_id=1 > output.jsonl
+
+# Filter duplicates from generated_samples
+jq '.generated_samples[] | select(.is_duplicate == false)' output.jsonl > unique.jsonl
+```
+
+> **Note:** Keeping duplicates in the trace allows adjusting the threshold post-generation and analyzing similarity score distributions (`similarity_to_seeds` and `similarity_to_generated`).
+
+## Tuning Parameters
+
+### Quality vs Speed
+
+**High quality (slower):**
+```yaml
+target_count: 100
+temperature: 0.9
+max_tokens: 300
+similarity_threshold: 0.9
+```
+
+**Fast iteration (lower quality):**
+```yaml
+target_count: 20
+temperature: 0.7
+max_tokens: 150
+similarity_threshold: 0.75
+```
+
+### Diversity vs Fidelity
+
+**Preserve distributions (higher fidelity):**
+- Include all important `categorical_fields`
+- Specify `dependencies` accurately
+- Include `numeric_fields` with tight ranges
+
+**Increase diversity (creative generation):**
+- Omit some `categorical_fields` (LLM generates freely)
+- Higher temperature (0.8-0.9)
+- Lower `similarity_threshold` (0.75-0.8)
+
+## Common Issues
+
+### Low diversity (many duplicates)
+
+**Causes:**
+- Too few samples (<5)
+- Temperature too low (<0.5)
+- Fields too restrictive
+
+**Fixes:**
+- Add more diverse samples
+- Increase temperature to 0.8-0.9
+- Generate more semantic fields
+- Increase similarity_threshold to 0.85-0.9
+
+### Unrealistic outputs
+
+**Causes:**
+- Dependencies not specified
+- Numeric hints too broad
+- Temperature too high (>0.95)
+
+**Fixes:**
+- Add dependencies config
+- Provide numeric_fields for constraints
+- Reduce temperature to 0.7-0.8
+- Include exemplar samples matching target patterns
+
+### LLM errors (invalid JSON)
+
+**Causes:**
+- max_tokens too low (truncated JSON)
+- Complex nested structures
+
+**Fixes:**
+- Increase max_tokens to 200-300
+- Simplify fields (fewer nested objects)
+- SemanticInfiller handles markdown wrappers automatically
+
+### Missing embeddings
+
+**Cause:** Embedding model not configured
+
+**Behavior:** DuplicateRemover marks all as `is_duplicate: false`
+
+**Fix:** Configure default embedding model in Settings page
+
+## Example Workflow
+
+**Goal:** Generate 100 synthetic user profiles
+
+### Step 1: Prepare samples (6 examples)
+```json
+[
+ {"plan": "Free", "role": "Viewer", "storage": 1, "bio": "Student learning"},
+ {"plan": "Free", "role": "Viewer", "storage": 2, "bio": "Just exploring"},
+ {"plan": "Pro", "role": "Editor", "storage": 50, "bio": "Freelance designer"},
+ {"plan": "Pro", "role": "Editor", "storage": 75, "bio": "Agency owner"},
+ {"plan": "Pro", "role": "Admin", "storage": 100, "bio": "Team lead"},
+ {"plan": "Enterprise", "role": "Admin", "storage": 500, "bio": "CTO"}
+]
+```
+
+### Step 2: Create pipeline from template
+```bash
+curl -X POST http://localhost:8000/api/pipelines/from_template/data_augmentation \
+ -H "Content-Type: application/json" \
+ -d '{"name": "User Profile Augmentation"}'
+```
+
+### Step 3: Start generation
+```bash
+curl -X POST http://localhost:8000/api/generate \
+ -F "file=@seed_data_augmentation.json" \
+ -F "pipeline_id=1"
+```
+
+### Step 4: Monitor progress
+```bash
+# Poll job status
+curl http://localhost:8000/api/jobs/1
+```
+
+### Step 5: Review and export
+```bash
+# Export unique records only
+curl http://localhost:8000/api/export?job_id=1 | jq 'select(.is_duplicate == false)' > unique_users.jsonl
+```
+
+**Result:** 100 synthetic user profiles preserving original distributions
+
+> **Tip:** For large datasets, start with 20 records to verify quality before scaling up.
+
+## Related Documentation
+
+- [Templates Overview](templates) - All available templates
+- [How to Use](how_to_use) - Running pipelines with templates
+- [Custom Blocks](how_to_create_blocks) - Understanding multiplier blocks
+- [StructureSampler Block](how_to_create_blocks#structuresampler)
+- [SemanticInfiller Block](how_to_create_blocks#semanticinfiller)
+- [DuplicateRemover Block](how_to_create_blocks#duplicateremover)
diff --git a/frontend/src/components/pipeline-editor/BlockConfigPanel.tsx b/frontend/src/components/pipeline-editor/BlockConfigPanel.tsx
index c8cd3dd..f1bb56d 100644
--- a/frontend/src/components/pipeline-editor/BlockConfigPanel.tsx
+++ b/frontend/src/components/pipeline-editor/BlockConfigPanel.tsx
@@ -32,9 +32,12 @@ export default function BlockConfigPanel({
 const [formData, setFormData] = useState<Record<string, any>>(config || {});
const { resolvedColorScheme } = useTheme();
const [wordWrap, setWordWrap] = useState(false);
+ const [jsonMode, setJsonMode] = useState<Record<string, boolean>>({});
const [errors, setErrors] = useState>({});
const [panelWidth, setPanelWidth] = useState(400);
const [isResizing, setIsResizing] = useState(false);
+ const [llmModels, setLlmModels] = useState<string[]>([]);
+ const [embeddingModels, setEmbeddingModels] = useState<string[]>([]);
// sync formData with parent config changes
// this ensures that saved config persists when panel is reopened
@@ -75,6 +78,42 @@ export default function BlockConfigPanel({
};
}, []);
+ // fetch available LLM and embedding models
+ useEffect(() => {
+ const controller = new AbortController();
+ const { signal } = controller;
+
+ const fetchModels = async () => {
+ try {
+ const [llmResponse, embeddingResponse] = await Promise.all([
+ fetch("/api/llm-models", { signal }),
+ fetch("/api/embedding-models", { signal }),
+ ]);
+
+ if (llmResponse.ok) {
+ const llmData = await llmResponse.json();
+ if (Array.isArray(llmData)) {
+ setLlmModels(llmData.map((m: any) => m.name).filter(Boolean));
+ }
+ }
+
+ if (embeddingResponse.ok) {
+ const embeddingData = await embeddingResponse.json();
+ if (Array.isArray(embeddingData)) {
+ setEmbeddingModels(embeddingData.map((m: any) => m.name).filter(Boolean));
+ }
+ }
+ } catch (error) {
+ if ((error as any)?.name !== "AbortError") {
+ console.error("Failed to fetch models:", error);
+ }
+ }
+ };
+
+ fetchModels();
+ return () => controller.abort();
+ }, []);
+
// handle resize
useEffect(() => {
if (!isResizing) return;
@@ -115,6 +154,12 @@ export default function BlockConfigPanel({
Object.entries(schema).forEach(([key, fieldSchema]: [string, any]) => {
const value = processedData[key];
+
+ // skip json-or-template fields - they stay as strings
+ if (fieldSchema.format === "json-or-template") {
+ return;
+ }
+
if (
(fieldSchema.type === "array" || fieldSchema.type === "object") &&
typeof value === "string"
@@ -206,6 +251,42 @@ export default function BlockConfigPanel({
);
}
+ // llm model dropdown
+ if (key === "model" && llmModels.length > 0) {
+ return (
+
+ );
+ }
+
+ // embedding model dropdown
+ if (key === "embedding_model" && embeddingModels.length > 0) {
+ return (
+
+ );
+ }
+
// field reference dropdown (references to accumulated_state fields)
if (schema.isFieldReference) {
if (availableFields.length > 0) {
@@ -309,6 +390,80 @@ export default function BlockConfigPanel({
);
}
+ // json-or-template field - use monaco editor with toggle
+ if (schema.format === "json-or-template") {
+ const isJsonMode = jsonMode[key] ?? true; // default to JSON mode
+ const jsonValue = typeof value === "string" ? value : JSON.stringify(value, null, 2);
+
+ return (
+
+
+ setJsonMode((prev) => ({ ...prev, [key]: e.target.checked }))}
+ id={`jsonmode-${key}`}
+ sx={{ m: 0 }}
+ />
+
+ JSON mode
+
+
+ {isJsonMode ? "(JSON syntax)" : "(Jinja2 template)"}
+
+
+
+ {
+ // keep as string during editing, will be parsed on save if needed
+ handleChange(key, newValue || "");
+ }}
+ theme={resolvedColorScheme === "dark" ? "vs-dark" : "light"}
+ options={{
+ minimap: { enabled: false },
+ scrollbar: {
+ vertical: "auto",
+ horizontal: "auto",
+ verticalScrollbarSize: 10,
+ horizontalScrollbarSize: 10,
+ },
+ lineNumbers: "on",
+ lineNumbersMinChars: 3,
+ glyphMargin: false,
+ folding: true,
+ lineDecorationsWidth: 5,
+ scrollBeyondLastLine: false,
+ renderLineHighlight: "none",
+ overviewRulerLanes: 0,
+ hideCursorInOverviewRuler: true,
+ overviewRulerBorder: false,
+ wordWrap: wordWrap ? "on" : "off",
+ fontSize: 13,
+ fontFamily:
+ "ui-monospace, SFMono-Regular, SF Mono, Menlo, Consolas, Liberation Mono, monospace",
+ tabSize: 2,
+ padding: { top: 8, bottom: 8 },
+ }}
+ />
+
+
+ );
+ }
+
// object or array field - use monaco editor with JSON
if (schema.type === "object" || schema.type === "array") {
const jsonValue = typeof value === "string" ? value : JSON.stringify(value, null, 2);
diff --git a/frontend/src/components/pipeline-editor/BlockNode.tsx b/frontend/src/components/pipeline-editor/BlockNode.tsx
index 92e7922..82437b4 100644
--- a/frontend/src/components/pipeline-editor/BlockNode.tsx
+++ b/frontend/src/components/pipeline-editor/BlockNode.tsx
@@ -61,12 +61,41 @@ function getPreviewFields(blockType: string, config: Record<string, any>): Array<[string, string]>
// priority fields based on block type
let priorityKeys: string[] = [];
- if (type.includes("generator")) {
+ // data augmentation blocks
+ if (type.includes("sampler")) {
+ priorityKeys = ["target_count", "categorical_fields"];
+ } else if (type.includes("infiller")) {
+ priorityKeys = ["fields_to_generate", "model", "temperature"];
+ } else if (type.includes("remover")) {
+ priorityKeys = ["similarity_threshold", "comparison_fields", "embedding_model"];
+ }
+ // multiplier blocks
+ else if (type.includes("multiplier")) {
+ priorityKeys = ["parser_type", "chunk_size"];
+ }
+ // langfuse integration
+ else if (type.includes("langfuse")) {
+ priorityKeys = ["dataset_name"];
+ }
+ // field mapper
+ else if (type.includes("mapper")) {
+ priorityKeys = ["mappings"];
+ }
+ // ragas metrics
+ else if (type.includes("ragas")) {
+ priorityKeys = ["metrics", "model", "score_threshold"];
+ }
+ // generators (text/structured)
+ else if (type.includes("generator")) {
priorityKeys = ["model", "temperature", "max_tokens"];
- } else if (type.includes("validator")) {
- priorityKeys = ["min_length", "max_length", "required_fields"];
- } else if (type.includes("score")) {
- priorityKeys = ["generated_field", "reference_field", "metric"];
+ }
+ // validators
+ else if (type.includes("validator")) {
+ priorityKeys = ["min_length", "max_length", "required_fields", "field_name"];
+ }
+ // score blocks
+ else if (type.includes("score")) {
+ priorityKeys = ["generated_field", "reference_field", "field_name", "metric"];
}
// find up to 2 configured values from priority keys
@@ -76,9 +105,27 @@ function getPreviewFields(blockType: string, config: Record): Array
if (config[key] !== undefined && config[key] !== null && config[key] !== "") {
let displayValue = String(config[key]);
+ // special handling for fields_to_generate (JSON string)
+ if (key === "fields_to_generate" && typeof config[key] === "string") {
+ try {
+ const parsed = JSON.parse(config[key]);
+ if (Array.isArray(parsed)) {
+ displayValue = `[${parsed.length} items]`;
+ }
+ } catch {
+ // if not valid JSON, treat as template string
+ }
+ }
+ // special formatting for arrays/objects
+ else if (Array.isArray(config[key])) {
+ displayValue = `[${config[key].length} items]`;
+ } else if (typeof config[key] === "object") {
+ displayValue = `{${Object.keys(config[key]).length} keys}`;
+ }
+
// truncate long values
- if (displayValue.length > 20) {
- displayValue = displayValue.slice(0, 20) + "...";
+ if (displayValue.length > 25) {
+ displayValue = displayValue.slice(0, 25) + "...";
}
preview.push([key, displayValue]);
diff --git a/frontend/src/pages/Generator.tsx b/frontend/src/pages/Generator.tsx
index b62c2da..769a5f5 100644
--- a/frontend/src/pages/Generator.tsx
+++ b/frontend/src/pages/Generator.tsx
@@ -41,6 +41,7 @@ export default function Generator() {
const [pipelines, setPipelines] = useState([]);
const [selectedPipeline, setSelectedPipeline] = useState(null);
const [isMultiplierPipeline, setIsMultiplierPipeline] = useState(false);
+ const [needsMarkdown, setNeedsMarkdown] = useState(false);
const [validationResult, setValidationResult] = useState<{
valid: boolean;
errors: string[];
@@ -112,6 +113,7 @@ export default function Generator() {
if (!selectedPipeline) {
if (mounted) {
setIsMultiplierPipeline(false);
+ setNeedsMarkdown(false);
setValidationResult(null);
}
return;
@@ -123,6 +125,8 @@ export default function Generator() {
});
const data = await res.json();
const isMultiplier = data.first_block_is_multiplier || false;
+ const firstBlockType = data.first_block_type || "";
+ const needsMd = firstBlockType === "MarkdownMultiplierBlock";
if (!mounted) return;
@@ -130,7 +134,7 @@ export default function Generator() {
const isMarkdown = file.name.endsWith(".md");
const isJson = file.name.endsWith(".json");
- if ((isMultiplier && isJson) || (!isMultiplier && isMarkdown)) {
+ if ((needsMd && isJson) || (!needsMd && isMarkdown)) {
setFile(null);
setValidationResult(null);
setValidated(false);
@@ -138,10 +142,14 @@ export default function Generator() {
}
setIsMultiplierPipeline(isMultiplier);
+ setNeedsMarkdown(needsMd);
} catch (err) {
if (err instanceof Error && err.name !== "AbortError") {
console.error("Failed to load pipeline details:", err);
- if (mounted) setIsMultiplierPipeline(false);
+ if (mounted) {
+ setIsMultiplierPipeline(false);
+ setNeedsMarkdown(false);
+ }
}
}
};
@@ -199,7 +207,7 @@ export default function Generator() {
const isJson = droppedFile.type === "application/json" || droppedFile.name.endsWith(".json");
const isMarkdown = droppedFile.name.endsWith(".md");
- const isValidFile = isMultiplierPipeline ? isMarkdown : isJson;
+ const isValidFile = needsMarkdown ? isMarkdown : isJson;
if (isValidFile) {
const input = fileInputRef.current;
@@ -210,7 +218,7 @@ export default function Generator() {
input.dispatchEvent(new Event("change", { bubbles: true }));
}
} else {
- const expected = isMultiplierPipeline ? "Markdown (.md) file" : "JSON (.json) file";
+ const expected = needsMarkdown ? "Markdown (.md) file" : "JSON (.json) file";
toast.error(`Please drop a ${expected}`);
}
}
@@ -223,12 +231,12 @@ export default function Generator() {
const isMarkdown = selectedFile.name.endsWith(".md");
const isJson = selectedFile.name.endsWith(".json");
- if (isMultiplierPipeline && isJson) {
+ if (needsMarkdown && isJson) {
toast.error("Please upload a Markdown (.md) file for this pipeline.");
return;
}
- if (!isMultiplierPipeline && isMarkdown) {
+ if (!needsMarkdown && isMarkdown) {
toast.error("Please upload a JSON (.json) file for this pipeline.");
return;
}
@@ -650,7 +658,7 @@ export default function Generator() {
@@ -663,7 +671,7 @@ export default function Generator() {
? "Select a pipeline first"
: file
? file.name
- : isMultiplierPipeline
+ : needsMarkdown
? "Drop Markdown file here or click to browse"
: "Drop JSON seed file here or click to browse"}
@@ -672,7 +680,7 @@ export default function Generator() {
? "Choose a pipeline from the configuration panel"
: file
? `Size: ${(file.size / 1024).toFixed(2)} KB`
- : isMultiplierPipeline
+ : needsMarkdown
? "Markdown (.md) format"
: 'Format: {"repetitions": N, "metadata": {...}}'}
@@ -725,7 +733,7 @@ export default function Generator() {
{/* Verify Seeds Button */}
- {file && selectedPipeline && !isMultiplierPipeline && file.name.endsWith(".json") && (
+ {file && selectedPipeline && !needsMarkdown && file.name.endsWith(".json") && (