diff --git a/.claude/skills/address-pr-review/SKILL.md b/.claude/skills/address-pr-review/SKILL.md new file mode 100644 index 0000000..53f0f11 --- /dev/null +++ b/.claude/skills/address-pr-review/SKILL.md @@ -0,0 +1,99 @@ +--- +name: address-pr-review +description: Use when you have PR review comments to address and want to evaluate each comment's validity before deciding to fix, reply, or skip +--- + +# Address PR Review Comments + +## Overview + +Interactive workflow: analyze PR review comment validity, recommend action, let user decide (fix/reply/skip). + +## When to Use + +- PR has review comments needing evaluation before action +- Reviewer feedback might be incorrect or needs discussion +- Comments require varied responses (fix/reply/skip) +- Need to balance code quality with respectful reviewer engagement + +## When NOT to Use + +- All comments are clearly valid and straightforward to fix +- No comments yet or doing pre-review self-review +- Comments only on non-code files without technical analysis needed + +## Workflow Overview + +```dot +digraph pr_review_flow { + "Fetch PR comments" [shape=box]; + "More comments?" [shape=diamond]; + "Show comment + file context" [shape=box]; + "Analyze validity" [shape=box]; + "Recommend action" [shape=box]; + "Ask user: Fix/Reply/Skip/Quit?" [shape=diamond]; + "Make code changes" [shape=box]; + "Draft reply" [shape=box]; + "Track as skipped" [shape=box]; + "Show summary" [shape=box]; + + "Fetch PR comments" -> "More comments?"; + "More comments?" -> "Show comment + file context" [label="yes"]; + "More comments?" -> "Show summary" [label="no"]; + "Show comment + file context" -> "Analyze validity"; + "Analyze validity" -> "Recommend action"; + "Recommend action" -> "Ask user: Fix/Reply/Skip/Quit?"; + "Ask user: Fix/Reply/Skip/Quit?" -> "Make code changes" [label="Fix"]; + "Ask user: Fix/Reply/Skip/Quit?" -> "Draft reply" [label="Reply"]; + "Ask user: Fix/Reply/Skip/Quit?" 
-> "Track as skipped" [label="Skip"]; + "Ask user: Fix/Reply/Skip/Quit?" -> "Show summary" [label="Quit"]; + "Make code changes" -> "More comments?"; + "Draft reply" -> "More comments?"; + "Track as skipped" -> "More comments?"; +} +``` + +## Quick Reference + +**Critical principle:** Reviewer may be wrong - analyze validity before recommending action. + +| Phase | Actions | +|-------|---------| +| **Fetch** | `gh api repos/{owner}/{repo}/pulls/$PR/comments`
Extract: path, line, body, user.login, id
Exit if no comments | +| **Per Comment** | Show: file:line, author, comment, ±10 lines context
Analyze: Valid/Nitpick/Disagree/Question
Recommend: Fix/Reply/Skip with reasoning | +| **Fix** | Minimal changes per llm/rules-*.md
Offer reply draft: `Fixed: [what]. [why]`
Show: `gh api --method POST repos/{owner}/{repo}/pulls/comments/$ID/replies -f body="..."` | +| **Reply** | Draft based on type: Question/Suggestion/Disagreement
Let user edit
Show gh command (never auto-post) | +| **Summary** | Processed X/N: Fixed Y, Replied Z, Skipped W
List: files modified, reply drafts, next steps |
+
+## Critical Principles
+
+| Principle | Violation Pattern |
+|-----------|-------------------|
+| **Analyze first** | Accepting all feedback as valid without critical analysis |
+| **Never auto-post** | Posting replies automatically instead of showing gh command |
+| **One at a time** | Batch processing all comments without individual analysis |
+| **Show context** | Making changes without displaying ±10 lines around code |
+| **Minimal changes** | Large refactors in response to small comments |
+| **Follow standards** | Ignoring llm/rules-*.md when fixing |
+| **Respectful honesty** | Being defensive/dismissive when reviewer is wrong |
+| **User control** | Posting drafts without letting user edit first |
+
+## Reply Formats
+
+- Fix: `Fixed: [what]. [why]`
+- Update: `Updated: [what]`
+- Answer: `[explanation]`
+- Acknowledge: `Good catch, [action/reason]`
+- Disagree: `[respectful reasoning]`
+
+## Setup & Usage
+
+Requires: `gh` CLI authenticated, GitHub remote configured
+
+```bash
+# Start session
+"use address-pr-review for PR <number>"
+
+# Or list PRs first
+"use address-pr-review"
+```
diff --git a/.claude/skills/code-review/SKILL.md b/.claude/skills/code-review/SKILL.md
new file mode 100644
index 0000000..3292752
--- /dev/null
+++ b/.claude/skills/code-review/SKILL.md
@@ -0,0 +1,170 @@
+---
+name: code-review
+description: Use after completing implementation to review code quality, user impact, test coverage, and documentation before creating a PR
+---
+
+# Code Review
+
+## Overview
+
+Review code for quality, user impact, tests, and documentation. Balance technical excellence with practical simplicity.
+
+**Core principle:** Clean code should serve users, not just developers.
+ +## When to Use + +- After completing a feature or bug fix +- Before creating a pull request +- When reviewing changes before merge + +## When NOT to Use + +- Trivial changes (typo fixes) +- Documentation-only changes +- Initial exploration/prototyping + +## Review Process + +| Phase | Focus | Key Question | +|-------|-------|--------------| +| 1. Identify | What changed? | `git diff --name-only develop` | +| 2. User Impact | How does this affect users? | Is UX better or worse? | +| 3. Code Quality | Does it follow standards? | KISS + no anti-patterns? | +| 4. Tests | Is it covered? | New code = new tests? | +| 5. Docs | What needs updating? | llm/state-*.md current? | + +--- + +## Phase 1: Identify Changes + +Categorize changed files: +- **Backend:** `lib/`, `app.py` +- **Frontend:** `frontend/src/` +- **Tests:** `tests/` +- **Docs:** `llm/`, `*.md` + +Note change type: new feature | bug fix | refactoring | enhancement + +--- + +## Phase 2: User Impact + +**Ask for each change:** +1. Does this affect what users see or do? +2. Are error messages user-friendly (not technical jargon)? +3. Are loading states shown? +4. Can users recover from errors? +5. Is this the simplest UX possible? + +**Red flags:** +- Silent failures (user doesn't know something failed) +- Lost work on errors +- Unclear feedback ("Error: 500" vs "Could not save") +- Unnecessary complexity exposed to users + +--- + +## Phase 3: Code Quality + +### KISS Check + +Can each function be explained in one sentence? If not, it's too complex. 
### Backend Anti-patterns (blocking)
+
+- [ ] Silent failures (empty except blocks)
+- [ ] God functions (>30 lines, >3 params)
+- [ ] SQL injection (f-strings in queries)
+- [ ] Missing error context
+- [ ] Walrus operators / complex one-liners
+
+### Frontend Anti-patterns (blocking)
+
+- [ ] Empty catch blocks
+- [ ] Inline fetch (not in service layer)
+- [ ] Missing useEffect cleanup
+- [ ] `any` types or `as` assertions
+- [ ] Hardcoded colors (use theme: fg.*, canvas.*)
+- [ ] Prop drilling (>5 props)
+
+### Security
+
+- [ ] Inputs validated at API boundary
+- [ ] SQL parameterized (`?` placeholders)
+- [ ] No secrets in code/logs
+
+---
+
+## Phase 4: Test Coverage
+
+| Change Type | Required Test |
+|-------------|---------------|
+| New API endpoint | Unit test |
+| New block | `tests/blocks/test_*.py` |
+| Bug fix | Regression test |
+| User workflow change | E2E test |
+| Refactoring | Existing tests pass |
+
+**Test quality:**
+- Naming: `test_<unit>_<scenario>_<expected>`
+- One behavior per test
+- Error cases tested, not just happy path
+
+---
+
+## Phase 5: Documentation
+
+**Update llm/state-*.md when:**
+- New API endpoint → `state-backend.md`
+- New block → `state-backend.md`
+- New component/page → `state-frontend.md`
+- Architecture change → `state-project.md`
+
+**Code comments:** explain WHY, not what. Lowercase, concise.
+ +--- + +## Output Format + +```markdown +### User Impact +[UX improvements or issues found] + +### Anti-patterns +[location + violation + fix, or "none"] + +### Code Quality Issues +[severity + location + fix, or "none"] + +### Test Coverage +[required: present/missing | gaps if any] + +### Documentation Updates +[files needing update, or "none"] + +### Verdict +[BLOCK | REQUEST CHANGES | APPROVE] +Reason: [brief explanation] +``` + +--- + +## Verdict Rules + +| Condition | Verdict | +|-----------|---------| +| Anti-patterns found | BLOCK | +| Security issues | BLOCK | +| Missing required tests | REQUEST CHANGES | +| Needs doc updates | REQUEST CHANGES | +| All checks pass | APPROVE | + +--- + +## Golden Rules + +1. Anti-patterns are blocking - always reject +2. User experience matters - clean code that hurts UX is bad code +3. KISS wins - one sentence explanation or it's too complex +4. Tests are not optional - new code needs tests +5. Fail loudly - silent failures are never acceptable diff --git a/.claude/skills/debugging-pipelines/SKILL.md b/.claude/skills/debugging-pipelines/SKILL.md new file mode 100644 index 0000000..a8ab8e8 --- /dev/null +++ b/.claude/skills/debugging-pipelines/SKILL.md @@ -0,0 +1,282 @@ +--- +name: debugging-pipelines +description: Use when pipelines fail, produce unexpected output, or need systematic troubleshooting +--- + +# Debugging DataGenFlow Pipelines + +## Overview + +Systematic debugging workflow for any DataGenFlow pipeline failure or unexpected output. This skill provides a structured four-phase process to identify and fix root causes rather than guessing at solutions. + +**Core Principle:** Find the root cause before attempting fixes. Random fixes waste time and create new bugs. 
+ +## When to Use + +Use this skill when: +- Pipeline execution fails with unclear errors +- Pipeline produces "bad data" or unexpected output +- Need to isolate which block is causing issues +- LLM generates duplicates or poor quality content +- Output has unexpected fields (metadata pollution) +- Results are missing expected fields +- Performance issues or slow execution +- Integration test failures + +## When NOT to Use + +Skip this skill for: +- Simple configuration errors (typos in config) +- Documentation lookup (how to use a specific block) +- Feature requests (adding new functionality) +- Questions about architecture (use codebase exploration instead) + +## The Four-Phase Debugging Process + +### Phase 1: Observe & Gather Evidence + +**Goal:** Understand what's wrong and collect data + +**Steps:** +1. **Run the pipeline and capture full output** + - Use pytest for tests: `pytest tests/integration/test_X.py -v -s` + - For API, check logs and response data + - Save the complete error message and stack trace + +2. **Identify what makes output "bad"** + - Missing fields? (expected `price` but not in output) + - Wrong values? (all prices are 0) + - Extra fields? (input metadata leaking: `samples`, `target_count`) + - Duplicates? (similarity_score = 1.0, exact copies) + - Type errors? (expected dict, got list) + +3. **Check recent changes** + - Run `git diff` to see what changed + - Review recent commits that might affect this pipeline + - Check if tests passed before the change + +4. **Review error messages completely** + - Read the full stack trace, not just the last line + - Note file paths, line numbers, and error types + - Check for validation errors with detail context + +**Red Flags to Stop:** +- "I think I know the problem" (without evidence) +- "Let me try changing X" (before tracing data flow) +- Skipping logs because "error is obvious" + +### Phase 2: Trace Data Flow + +**Goal:** Understand how data transforms through the pipeline + +**Steps:** +1. 
**Identify which blocks touch the problematic data** + - Check pipeline definition (YAML or dict) + - List all blocks in execution order + - Note which blocks read/write the affected fields + +2. **Read block implementations** + - Open `lib/blocks/builtin/[block_name].py` + - Review the `execute()` method + - Check what inputs it expects and outputs it returns + - Look for data transformations or filtering logic + +3. **Trace data transformation between blocks** + - Check `lib/workflow.py:_process_single_seed()` for multiplier pipelines + - See how `accumulated_state` merges block outputs + - Identify where data gets added, modified, or removed + +4. **Check workflow execution flow** + - Normal pipeline: `lib/workflow.py:85-224` + - Multiplier pipeline: `lib/workflow.py:305-449` + - Understand seed processing vs result filtering + +**Key Files to Check:** +- `lib/workflow.py` - Pipeline execution engine +- `lib/blocks/builtin/` - All block implementations +- `lib/entities/block_execution_context.py` - Context passed between blocks + +### Phase 3: Root Cause Analysis + +**Goal:** Form a specific, testable hypothesis + +**Steps:** +1. **Form specific hypothesis** + - Format: "I think X causes Y because Z" + - Example: "I think input metadata leaks to output because workflow.py line 323 merges all initial_data without filtering" + - Be specific, not vague + +2. **Don't assume - verify with evidence** + - Read the actual code at the suspected line + - Check logs or traces confirming the behavior + - Look for similar patterns in other files + +3. **Use logs, traces, and execution results** + - Check test output for actual vs expected values + - Review trace data showing block inputs/outputs + - Examine execution_time for performance issues + +**Red Flags:** +- "It's probably just..." (guessing) +- "This usually means..." 
(pattern matching without verification) +- Proposing fixes before understanding the cause + +### Phase 4: Fix & Verify + +**Goal:** Implement minimal fix targeting the root cause + +**Steps:** +1. **Make minimal fix** + - Change only what's necessary to fix the root cause + - Don't refactor or "improve" surrounding code + - One logical change at a time + +2. **Run tests to verify fix** + - Run the specific failing test + - Check for test passing + - Run related tests to catch regressions + +3. **Check for side effects** + - Did the fix break other tests? + - Are there related features that might be affected? + - Review the change for unintended consequences + +4. **If fix doesn't work** + - Count: How many fixes have you tried? + - If < 3: Return to Phase 1, re-analyze with new information + - If ≥ 3: Question the architecture - might need design discussion + +**Success Criteria:** +- Tests pass +- Root cause addressed (not just symptoms) +- No new bugs introduced +- Code follows project guidelines (KISS, minimal changes) + +## Common Pipeline Issues + +| Issue Pattern | Where to Look | Typical Root Causes | Fix Pattern | +|--------------|---------------|---------------------|-------------| +| Output has unexpected fields | `lib/workflow.py` data merging | Input metadata leaking to output | Filter `initial_data_keys` before returning results | +| Block returns wrong data type | Block's `execute()` method | Incorrect return type (dict vs list) | Fix block to return declared type | +| LLM generates poor quality | Block's prompt building | Unclear instructions, low temperature, copying examples | Improve prompt, add diversity instructions | +| LLM copying examples verbatim | SemanticInfiller prompt | Prompt doesn't emphasize creating NEW content | Add "do NOT copy" instruction to prompt | +| Pipeline crashes on specific input | Block's validation logic | Missing input validation or type checking | Add validation in block's execute() | +| Results missing fields | 
Block's output filtering or merging | Overly aggressive filtering or incorrect merge | Check field filtering logic | +| All duplicates flagged | DuplicateRemover threshold | Threshold too low or embedding model issues | Check similarity_threshold config | +| Metadata pollution | Workflow seed processing | Initial seed data not filtered from output | Use `_filter_output_data()` helper | + +## Critical Files Reference + +**Pipeline Execution:** +- `lib/workflow.py:85-224` - Normal pipeline execution flow +- `lib/workflow.py:305-449` - Multiplier pipeline (1→N expansion) with seed processing +- `lib/workflow.py:275-284` - `_filter_output_data()` helper (filters metadata from results) + +**Built-in Blocks:** +- `lib/blocks/builtin/structure_sampler.py` - Statistical sampling (multiplier block) +- `lib/blocks/builtin/semantic_infiller.py:59-109` - LLM prompt building +- `lib/blocks/builtin/semantic_infiller.py:146-165` - Metadata filtering in SemanticInfiller +- `lib/blocks/builtin/duplicate_remover.py` - Embedding-based similarity detection + +**Core Infrastructure:** +- `lib/entities/block_execution_context.py` - Context passed between blocks +- `lib/blocks/base.py` - BaseBlock interface all blocks inherit from +- `lib/entities/pipeline.py` - ExecutionResult, Usage models +- `lib/template_renderer.py` - Jinja2 template rendering + +**Tests:** +- `tests/integration/` - Integration tests for end-to-end verification +- `tests/blocks/` - Unit tests for individual blocks + +## Debugging Checklist + +Use this checklist to ensure systematic debugging: + +``` +Phase 1: Observe & Gather Evidence +□ Run pipeline and capture full output +□ Identify specific problem (what's wrong?) 
+□ Read error messages completely (full stack trace) +□ Check recent git changes (git diff, git log) + +Phase 2: Trace Data Flow +□ Check which blocks are in the pipeline +□ Read those block implementations (execute methods) +□ Trace data flow through blocks (accumulated_state) +□ Understand workflow execution (normal vs multiplier) + +Phase 3: Root Cause Analysis +□ Form specific hypothesis ("X causes Y because Z") +□ Verify hypothesis with evidence (code, logs, traces) +□ Don't assume - read actual code +□ Check for similar patterns elsewhere + +Phase 4: Fix & Verify +□ Make minimal fix targeting root cause +□ Run tests to verify fix works +□ Check for unintended side effects +□ Follow project guidelines (KISS, simplicity) +``` + +## Real-World Example: Data Augmentation Metadata Pollution + +**Problem Observed:** +Pipeline output contained input configuration fields (`samples`, `target_count`, `categorical_fields`) mixed with generated data. + +**Phase 1 - Evidence:** +```json +// Expected output: +{"category": "electronics", "price": 449, "description": "...", "is_duplicate": false} + +// Actual output: +{"category": "electronics", "price": 449, "description": "...", + "samples": [...], "target_count": 10, "categorical_fields": [...]} // ❌ Bad! +``` + +**Phase 2 - Trace:** +- Traced workflow.py seed processing +- Found `merged_state = {**initial_data, **seed_data}` at line 323 +- Merged state flows through all blocks +- No filtering before returning results + +**Phase 3 - Root Cause:** +Hypothesis: "Input metadata leaks to output because workflow.py merges all initial_data into accumulated_state without filtering configuration fields before returning results" + +**Phase 4 - Fix:** +1. Added `_filter_output_data()` helper method +2. Track `initial_data_keys` at merge time +3. Filter those keys before returning `ExecutionResult` +4. 
Tests passed, metadata removed from output + +**Lessons:** +- Data flow tracing revealed the merge point +- Minimal fix (filter helper) solved the root cause +- No refactoring needed - targeted change only + +## Tips for Effective Debugging + +1. **Start with the simplest explanation** + - Don't assume complex bugs when simple causes are more likely + - Check configuration before code logic + +2. **Use the scientific method** + - Observe → Hypothesize → Test → Verify + - One variable at a time + +3. **Trust but verify** + - Don't trust assumptions about what code does + - Read the actual implementation + +4. **Leverage existing patterns** + - Look for similar working code in the codebase + - Compare broken vs working implementations + +5. **Document as you go** + - Keep notes on what you've checked + - Record hypotheses and test results + - Helps if you need to ask for help + +## Related Skills + +- `implementing-datagenflow-blocks` - For understanding block structure and creation +- `address-pr-review` - For evaluating whether debugging revealed design issues diff --git a/.claude/skills/implementing-datagenflow-blocks/SKILL.md b/.claude/skills/implementing-datagenflow-blocks/SKILL.md new file mode 100644 index 0000000..f9cbaca --- /dev/null +++ b/.claude/skills/implementing-datagenflow-blocks/SKILL.md @@ -0,0 +1,623 @@ +--- +name: implementing-datagenflow-blocks +description: Use when creating new blocks for DataGenFlow pipeline system or modifying existing blocks to ensure consistency with established patterns +--- + +# Implementing DataGenFlow Blocks + +## Overview + +DataGenFlow blocks are composable pipeline components. Follow KISS principles: write minimal functions, make code self-explanatory, keep it simple. 
+ +## When to Use + +- Creating a new block +- Modifying existing block behavior +- Reviewing block implementations +- Debugging block execution issues + +**When NOT to use:** +- General backend code (use llm/rules-backend.md) +- Frontend development (use llm/rules-frontend.md) + +## Block Structure + +```python +import logging +from typing import Any + +import litellm # if using LLM + +from lib.blocks.base import BaseBlock +from lib.entities import pipeline +from lib.entities.block_execution_context import BlockExecutionContext +from lib.template_renderer import render_template # if using templates + +logger = logging.getLogger(__name__) + + +class MyBlock(BaseBlock): + name = "My Block" + description = "Short description of what this block does" + category = "generators" # generators|transformers|validators|utilities + inputs = ["field1"] # or ["*"] for any input fields + outputs = ["field2"] # or ["*"] for dynamic outputs + + _config_descriptions = { + "param_name": "Help text shown in UI", + } + + def __init__( + self, + param1: str, + model: str | None = None, # EXACTLY "model" for LLM selection UI + temperature: float = 0.7, + ): + self.param1 = param1 + self.model_name = model # store as model_name + self.temperature = temperature + + async def execute(self, context: BlockExecutionContext) -> dict[str, Any]: + from app import llm_config_manager # import inside execute + + # your logic here + + return {"field": value, "_usage": usage_info} +``` + +## UI Integration Patterns + +The frontend automatically renders different UI controls based on parameter names, types, and class attributes. 
+ +### Model Dropdown (LLM) + +**Parameter MUST be named exactly `model`** for automatic dropdown: + +```python +def __init__( + self, + model: str | None = None, # MUST be "model" and str|None + temperature: float = 0.7, + max_tokens: int = 2048, +): + self.model_name = model # store as model_name +``` + +**Config description:** +```python +_config_descriptions = { + "model": "Select LLM model to use (leave empty for default)", +} +``` + +**Usage in execute:** +```python +async def execute(self, context: BlockExecutionContext) -> dict[str, Any]: + from app import llm_config_manager + + llm_config = await llm_config_manager.get_llm_model(self.model_name) + llm_params = llm_config_manager.prepare_llm_call( + llm_config, + messages=messages, + temperature=self.temperature, + max_tokens=self.max_tokens, + ) +``` + +### Embedding Model Dropdown + +**Parameter MUST be named exactly `embedding_model`**: + +```python +def __init__( + self, + embedding_model: str | None = None, # MUST be "embedding_model" +): + self.embedding_model_name = embedding_model +``` + +**Config description:** +```python +_config_descriptions = { + "embedding_model": "Embedding model to use (leave empty for default)", +} +``` + +**Usage:** +```python +embedding_config = await llm_config_manager.get_embedding_model( + self.embedding_model_name +) +``` + +### Enum Dropdown + +Use `_config_enums` class attribute to create dropdown with predefined options: + +```python +class MyBlock(BaseBlock): + _config_enums = { + "mode": ["strict", "lenient", "auto"], + "format": ["json", "yaml", "xml"], + } + + def __init__( + self, + mode: str = "auto", + format: str = "json", + ): + self.mode = mode + self.format = format +``` + +### Multi-Select Checkboxes + +For array parameters with enum values: + +```python +class MyBlock(BaseBlock): + _config_enums = { + "features": ["feature_a", "feature_b", "feature_c"], + } + + def __init__( + self, + features: list[str] | None = None, + ): + self.features = features or 
[] +``` + +### Field Reference Dropdown + +Use `_field_references` to create dropdown showing available fields from pipeline: + +```python +class MyBlock(BaseBlock): + _field_references = ["source_field", "target_field"] + + _config_descriptions = { + "source_field": "Field to read from", + "target_field": "Field to write to", + } + + def __init__( + self, + source_field: str, + target_field: str, + ): + self.source_field = source_field + self.target_field = target_field +``` + +### Template Fields (Monaco Editor) + +Parameters with these patterns automatically get Monaco editor: +- Name contains "prompt", "template", or "instruction" +- Or set `schema.format = "jinja2"` via config + +```python +def __init__( + self, + user_prompt: str = "", # automatically gets editor + system_prompt: str = "", # automatically gets editor + custom_template: str = "", # automatically gets editor +): + self.user_prompt = user_prompt +``` + +**Config description should mention Jinja2:** +```python +_config_descriptions = { + "user_prompt": ( + "Jinja2 template. 
Reference fields with {{ field_name }} or " + "{{ metadata.field_name }}" + ), +} +``` + +**Rendering:** +```python +from lib.template_renderer import render_template + +rendered = render_template(self.user_prompt, context.accumulated_state) +``` + +### JSON Object/Array (Monaco Editor) + +Parameters typed as `dict` or `list` get JSON Monaco editor: + +```python +def __init__( + self, + json_schema: dict[str, Any], # JSON editor + field_list: list[str], # JSON editor +): + self.json_schema = json_schema + self.field_list = field_list +``` + +### Number Input + +Parameters typed as `int` or `float` get number input: + +```python +def __init__( + self, + temperature: float = 0.7, # number input + max_tokens: int = 2048, # number input +): + self.temperature = temperature +``` + +### Textarea + +Parameters with these patterns get multi-line textarea: +- String length > 100 characters +- Name contains "description" +- Type has long content + +```python +def __init__( + self, + description: str = "", # automatically gets textarea +): + self.description = description +``` + +### Text Input (Default) + +Short string parameters get single-line text input: + +```python +def __init__( + self, + name: str, + label: str = "", +): + self.name = name +``` + +## JSON Array as String Pattern + +For parameters that should accept either JSON array or Jinja template (like `fields_to_generate`): + +```python +def __init__( + self, + fields_to_generate: str, # str, not list[str] +): + self.fields_to_generate_template = fields_to_generate + +_config_descriptions = { + "fields_to_generate": ( + 'JSON array or Jinja template. 
Examples: ["bio", "storage"] or ' + '{{ fields_to_generate | tojson }}' + ), +} +``` + +**Parsing in execute:** +```python +import json + +fields_rendered = render_template( + self.fields_to_generate_template, + context.accumulated_state +) +try: + fields_list = json.loads(fields_rendered) + if not isinstance(fields_list, list): + raise BlockExecutionError("Must be JSON array") +except json.JSONDecodeError as e: + raise BlockExecutionError(f"Invalid JSON: {str(e)}") +``` + +**Template usage:** +```yaml +fields_to_generate: "{{ fields_to_generate | tojson }}" +``` + +## LLM Integration Pattern + +Full pattern for blocks that call LLM: + +```python +async def execute(self, context: BlockExecutionContext) -> dict[str, Any]: + from app import llm_config_manager + + # prepare messages + messages = [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt}, + ] + + # get llm config + llm_config = await llm_config_manager.get_llm_model(self.model_name) + llm_params = llm_config_manager.prepare_llm_call( + llm_config, + messages=messages, + temperature=self.temperature, + max_tokens=self.max_tokens, + ) + + # add trace metadata for langfuse grouping + llm_params["metadata"] = { + "trace_id": context.trace_id, + "tags": ["datagenflow"], + } + + logger.info(f"Calling LiteLLM with model={llm_params.get('model')}") + + try: + response = await litellm.acompletion(**llm_params) + except Exception as e: + logger.error(f"LLM call failed for {self.name}: {e}") + raise + + content = response.choices[0].message.content + + # extract usage info + usage_info = pipeline.Usage( + input_tokens=response.usage.prompt_tokens or 0, + output_tokens=response.usage.completion_tokens or 0, + cached_tokens=getattr(response.usage, "cache_read_input_tokens", 0) or 0, + ) + + return { + "generated": content, + "_usage": usage_info.model_dump(), + } +``` + +## State Management + +### Reading State + +```python +async def execute(self, context: BlockExecutionContext) 
-> dict[str, Any]: + # get current record + current = context.accumulated_state.copy() + + # remove internal fields + current.pop("_usage", None) + current.pop("_hints", None) + + # get reference data from initial state + samples = context.get_state("samples", []) +``` + +### Caching Per Execution + +**Never use instance-level state that persists across jobs.** Use trace_id-keyed caching: + +```python +def __init__(self): + # cache per trace_id (one cache per pipeline execution) + self._embeddings_cache: dict[str, list[list[float]]] = {} + +async def execute(self, context: BlockExecutionContext) -> dict[str, Any]: + trace_id = context.trace_id + + # build cache once per pipeline execution + if trace_id not in self._embeddings_cache: + # compute embeddings + self._embeddings_cache[trace_id] = embeddings + + # use cached data + cached_embeddings = self._embeddings_cache[trace_id] +``` + +## Multiplier Blocks + +Blocks that generate multiple items from one input: + +```python +from lib.blocks.base import BaseMultiplierBlock +from lib.entities.block_execution_context import BlockExecutionContext + +class StructureSampler(BaseMultiplierBlock): + name = "Structure Sampler" + category = "seeders" + + async def execute( + self, + context: BlockExecutionContext + ) -> list[dict[str, Any]]: + # read from context and return list of records + return [record1, record2, record3] +``` + +## Code Quality + +### KISS Principle + +Write minimal number of functions, make code self-explanatory: + +```python +# ✅ good - simple and clear +def _prepare_prompts(self, data: dict[str, Any]) -> tuple[str, str]: + """render jinja2 templates with data context""" + system_template = self.system_prompt or data.get("system", "") + user_template = self.user_prompt or data.get("user", "") + + system = render_template(system_template, data) if system_template else "" + user = render_template(user_template, data) if user_template else "" + + return system, user + +# ❌ bad - over-engineered with too 
many tiny functions
+def _get_system(self, data): ...
+def _get_user(self, data): ...
+def _render_system(self, template, data): ...
+def _render_user(self, template, data): ...
+```
+
+### Comments
+
+Comments in lowercase, explain WHY not WHAT:
+
+```python
+# ✅ good - explains why
+def _extract_text(self, record: dict[str, Any]) -> str:
+    """
+    extract text from specified fields or all string fields
+    joins with spaces for embedding
+    """
+
+# ❌ bad - just describes what code does
+def _extract_text(self, record: dict[str, Any]) -> str:
+    """Extract text from record fields"""
+    # Loop through fields and get string values
+```
+
+### Imports
+
+All imports at top of file, not inside functions (except `from app import llm_config_manager`):
+
+```python
+# ✅ good
+import json
+import logging
+from typing import Any
+
+import litellm
+
+from lib.blocks.base import BaseBlock
+
+# ❌ bad
+def execute(self, context):
+    import json # wrong place
+```
+
+**Exception:** `from app import llm_config_manager` goes inside `execute()` to avoid circular imports.
+
+## Testing
+
+### Unit Tests
+
+Create `tests/blocks/test_<block_name>.py`:
+
+```python
+from unittest.mock import AsyncMock, MagicMock, patch
+import pytest
+
+from lib.blocks.builtin.my_block import MyBlock
+from lib.entities.block_execution_context import BlockExecutionContext
+
+
+def make_context(state: dict) -> BlockExecutionContext:
+    """helper to create test context"""
+    return BlockExecutionContext(
+        trace_id="test-trace",
+        pipeline_id=1,
+        accumulated_state=state,
+    )
+
+
+class TestMyBlockInit:
+    def test_init_basic(self):
+        block = MyBlock(param="value")
+        assert block.param == "value"
+
+
+class TestMyBlockExecution:
+    @pytest.mark.asyncio
+    @patch("litellm.acompletion")
+    @patch("app.llm_config_manager")
+    async def test_execute_basic(self, mock_config_manager, mock_completion):
+        # setup mocks
+        mock_config_manager.get_llm_model = AsyncMock(...)
+        mock_completion.return_value = MagicMock(...)
+ + block = MyBlock(param="value") + context = make_context({"field": "value"}) + + result = await block.execute(context) + + assert result["field"] == "expected" +``` + +### Integration Tests + +Add to `tests/integration/test_data_augmentation.py`. + +## Documentation Updates + +**Always update after implementing:** + +1. **llm/state-project.md** - block count, description +2. **llm/state-backend.md** - block count, details +3. **lib/templates/** - template YAML if applicable + +## Common Mistakes + +| Mistake | Problem | Fix | +|---------|---------|-----| +| Parameter named `model_name` | No dropdown UI | Name it exactly `model` | +| Parameter named `embedding` | No dropdown UI | Name it exactly `embedding_model` | +| `list[str]` for JSON arrays | Can't use templates | Use `str`, render + parse | +| Instance-level cache | Data leaks between jobs | Use `dict[str, T]` keyed by `trace_id` | +| Imports inside functions | Not the codebase style | Move to top (except llm_config_manager) | +| Over-engineering | Too many tiny functions | KISS - keep it simple | +| Comments describe what | Obvious from code | Explain WHY, lowercase | +| Forgot `_usage` | Usage not tracked | Always return `_usage` from LLM | +| Missing `_config_descriptions` | No help text in UI | Add descriptions for all params | +| Wrong enum format | UI doesn't render dropdown | Use `_config_enums` class attribute | + +## Implementation Checklist + +**Design:** +- [ ] Choose block type (BaseBlock vs BaseMultiplierBlock) +- [ ] Define inputs/outputs +- [ ] Identify parameters and their types +- [ ] Name model parameters correctly (`model`, `embedding_model`) +- [ ] Decide which params need enum dropdowns or field references + +**Implementation:** +- [ ] Add all imports at top (except llm_config_manager) +- [ ] Create class with `name`, `description`, `category`, `inputs`, `outputs` +- [ ] Add `_config_descriptions` with helpful UI text +- [ ] Add `_config_enums` if using dropdowns +- [ ] Add 
`_field_references` if using field selection +- [ ] Implement `__init__` with correct parameter types +- [ ] Implement `execute()` method +- [ ] Add template rendering if needed +- [ ] Use `llm_config_manager.get_llm_model()` for LLM +- [ ] Use `llm_config_manager.get_embedding_model()` for embeddings +- [ ] Add trace metadata to `llm_params["metadata"]` +- [ ] Track usage with `pipeline.Usage()` and return `_usage` +- [ ] Use trace_id-keyed caching if needed +- [ ] Write lowercase comments explaining WHY + +**Testing:** +- [ ] Create unit test file `tests/blocks/test_.py` +- [ ] Test initialization variants +- [ ] Test execution with mocked LLM config +- [ ] Test edge cases and error handling +- [ ] Add integration test +- [ ] Run `pytest tests/` - all pass + +**Documentation:** +- [ ] Update `llm/state-project.md` +- [ ] Update `llm/state-backend.md` +- [ ] Create template YAML if applicable + +**Review:** +- [ ] Model parameters named exactly right +- [ ] Imports at top (except llm_config_manager) +- [ ] No instance-level state +- [ ] KISS principle followed +- [ ] `_usage` returned if using LLM +- [ ] All UI integrations correct (enums, field refs, descriptions) + +## Reference Examples + +**Simple:** `lib/blocks/builtin/field_mapper.py` + +**LLM:** `lib/blocks/builtin/text_generator.py` + +**Structured:** `lib/blocks/builtin/structured_generator.py` + +**Multiplier:** `lib/blocks/builtin/structure_sampler.py` + +**Embedding:** `lib/blocks/builtin/duplicate_remover.py` diff --git a/.coderabbit.yaml b/.coderabbit.yaml new file mode 100644 index 0000000..8973983 --- /dev/null +++ b/.coderabbit.yaml @@ -0,0 +1,97 @@ +language: en-US +early_access: false +enable_free_tier: true + +reviews: + request_changes_workflow: true + + high_level_summary: true + poem: false + review_status: true + collapse_walkthrough: false + + auto_review: + enabled: true + auto_incremental_review: true + ignore_title_keywords: [] + branches: + - "*" + + # Path-based instructions + 
path_instructions: + # Backend code + - path: "**/*.py" + instructions: | + Apply backend code review checklist from llm/rules-backend.md: + Identify which llm/*.md files need updates: + - New API endpoints → update llm/state-backend.md + - New blocks → update llm/state-backend.md and llm/state-project.md + - Changed patterns → update relevant llm/state-*.md + Identify if the docs need updates. + Golden rule: if code cannot be explained in one sentence, it's too complex. + + # Frontend code + - path: "frontend/**/*.{ts,tsx,js,jsx}" + instructions: | + Apply frontend code review checklist from llm/rules-frontend.md: + Identify which llm/*.md files need updates: + - New pages/components → update llm/state-frontend.md + - Changed UI flow → update llm/state-frontend.md + - New patterns → update llm/state-frontend.md + Identify if the docs need updates. + Golden rule: keep components focused and maintainable. + + # Block implementations + - path: "lib/blocks/**/*.py" + instructions: | + Apply block implementation checklist from .claude/skills/implementing-datagenflow-blocks/SKILL.md: + Identify which llm/*.md files need updates: + - New blocks → update llm/state-backend.md and llm/state-project.md + - Changed block behavior → update relevant llm/state-*.md + Identify if the docs need updates. + Golden rule: blocks should be single-responsibility and reusable. 
+ # Tests + - path: "tests/**/*.py" + instructions: | + Review test quality: + - One behavior per test + - Test names: test_<unit>_<scenario>_<expected> + - Error cases tested (not just happy path) + - Proper use of fixtures + - Mocks used appropriately + - Tests are focused and maintainable + + # Documentation files + - path: "llm/**/*.md" + instructions: | + Review documentation updates: + - Changes reflect actual code (not aspirational designs) + - Updates are gradual and incremental (not complete rewrites) + - Technical and concise + - Explain what changed and why + - Note any breaking changes + + # Configuration files + - path: "**/*.{yaml,yml,json,toml}" + instructions: | + Review configuration changes: + - No secrets committed + - Valid syntax + - Changes documented if needed + - Backwards compatible or migration documented + +chat: + auto_reply: true + +knowledge_base: + learnings: + scope: "auto" + + opt_out: false + +tone_instructions: | + Be direct, technical, and concise: + 1. Blocking issues (anti-patterns, security, broken tests) - must fix + 2. Code quality violations - should fix + 3. Documentation updates needed + 4. 
Improvements - nice to have diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 2ed1de9..a7e9c01 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -23,4 +23,4 @@ Title Format: copy the one of the issue keep this format - [ ] `make format` passes - [ ] `make pre-merge` passes - [ ] PR update from develop branch -- [ ] Copilot review run and addressed +- [ ] CodeRabbit review requested and addressed (comment `@coderabbitai review`) diff --git a/.gitignore b/.gitignore index cf68d8c..737f956 100644 --- a/.gitignore +++ b/.gitignore @@ -19,7 +19,8 @@ data/*.db-journal # ide .vscode/ .idea/ -.claude/ +.claude/* +!.claude/skills/ .worktrees/ # cache diff --git a/Makefile b/Makefile index 7a875a9..8233cf5 100644 --- a/Makefile +++ b/Makefile @@ -91,6 +91,12 @@ test: test-integration: uv run pytest -m integration -v +test-e2e: + ./tests/e2e/run_all_tests.sh + +test-e2e-ui: + ./tests/e2e/run_all_tests.sh --ui + pre-merge: format-all lint-all typecheck-all test @echo "✅ Pre-merge checks completed successfully. Ready to merge!" 
diff --git a/app.py b/app.py index bbf4c74..ee8e978 100644 --- a/app.py +++ b/app.py @@ -534,6 +534,7 @@ async def get_pipeline(pipeline_id: int) -> dict[str, Any]: blocks = pipeline.definition.get("blocks", []) pipeline_dict = pipeline.model_dump() pipeline_dict["first_block_is_multiplier"] = is_multiplier_pipeline(blocks) + pipeline_dict["first_block_type"] = blocks[0].get("type") if blocks else None return pipeline_dict diff --git a/docs/template_data_augmentation.md b/docs/template_data_augmentation.md new file mode 100644 index 0000000..152ef6e --- /dev/null +++ b/docs/template_data_augmentation.md @@ -0,0 +1,429 @@ +--- +title: Data Augmentation Template +description: Generate synthetic records preserving statistical distributions from sample data +--- + +# Data Augmentation Template + +## Table of Contents +- [Overview](#overview) +- [Pipeline Architecture](#pipeline-architecture) +- [Seed Format](#seed-format) +- [Output Format](#output-format) +- [How It Works](#how-it-works) +- [Use Cases](#use-cases) +- [Customization](#customization) +- [Filtering Duplicates](#filtering-duplicates) +- [Tuning Parameters](#tuning-parameters) +- [Common Issues](#common-issues) +- [Example Workflow](#example-workflow) +- [Related Documentation](#related-documentation) + +## Overview + +**Complexity:** Advanced (3 blocks with multiplier) +**Use Case:** Generate synthetic data that preserves statistical patterns from samples + +This template creates realistic synthetic records from sample data while maintaining: +- Statistical distributions (e.g., "electronics" appears 50% of the time) +- Numeric range constraints (e.g., electronics prices $299-$899, furniture prices $199-$349) +- Semantic coherence (LLM-generated fields match context) +- Output diversity (duplicate detection via embeddings) + +**Special Features:** +- Statistical sampling preserves distributions +- LLM-powered semantic field generation +- Embedding-based duplicate detection +- Supports field dependencies + 
+## Pipeline Architecture + +```text +┌─────────────┐ ┌─────────────┐ ┌─────────────┐ +│ Structure │──►│ Semantic │──►│ Duplicate │ +│ Sampler │ │ Infiller │ │ Remover │ +└─────────────┘ └─────────────┘ └─────────────┘ + +Input: samples array + ↓ ++ category, _hints (multiplies: 1 seed → N skeletons) + ↓ ++ description, price (LLM-generated fields) + ↓ ++ is_duplicate, similarity_to_seeds, similarity_to_generated +``` + +**Blocks:** +1. **StructureSampler** - Learns distributions from samples, generates statistical skeletons +2. **SemanticInfiller** - Completes skeletons with LLM-generated semantic fields +3. **DuplicateRemover** - Filters similar records using embedding similarity + +**Key Concept:** The StructureSampler is a multiplier block that generates N skeletons from one seed. Each skeleton flows through the remaining blocks to create one record. + +## Seed Format + +**Required fields:** +- `samples` - Array of example records (minimum 3 recommended) +- `target_count` - Number of synthetic records to generate +- `categorical_fields` - Fields to preserve distribution +- `fields_to_generate` - Fields for LLM to generate + +**Optional fields:** +- `numeric_fields` - Numeric distributions to preserve +- `dependencies` - Field relationships (e.g., role depends on plan) +- `comparison_fields` - Fields for duplicate detection + +**Example seed (Product Catalog):** +```json +[ + { + "repetitions": 1, + "metadata": { + "samples": [ + {"category": "electronics", "price": 299, "description": "Wireless noise-canceling headphones with premium sound quality"}, + {"category": "electronics", "price": 899, "description": "13-inch laptop with high-resolution display"}, + {"category": "furniture", "price": 199, "description": "Ergonomic office chair with lumbar support"}, + {"category": "furniture", "price": 349, "description": "Adjustable standing desk with memory presets"} + ], + "target_count": 10, + "categorical_fields": ["category"], + "numeric_fields": ["price"], + 
"fields_to_generate": ["description", "price"], + "comparison_fields": ["description"] + } + } +] +``` + +**Field Explanations:** +- **`samples`** - Example products showing the data structure (4 samples provided) +- **`target_count`** - How many new products to generate (10 in this example) +- **`categorical_fields`** - Fields with discrete values that preserve distribution (50% electronics, 50% furniture) +- **`numeric_fields`** - Fields with numeric ranges that provide hints to the LLM (electronics: $299-$899, furniture: $199-$349) +- **`fields_to_generate`** - Fields for the LLM to create NEW content for (description and price) +- **`comparison_fields`** - Fields to check for duplicates using embedding similarity (description) + +> **Note:** `price` appears in both `numeric_fields` and `fields_to_generate`. This provides range hints to guide the LLM while letting it generate contextually appropriate prices. + +> **Tip:** Use 4-10 diverse samples for best results. More samples = better distribution learning. + +## Output Format + +The pipeline outputs a `generated_samples` array containing the final records. 
+ +Each generated record contains: +- Sampled categorical fields (preserving distribution) +- LLM-generated semantic fields +- Duplicate detection metadata + +**Example output:** +```json +{ + "generated_samples": [ + { + "category": "electronics", + "price": 449, + "description": "Bluetooth speaker with 360-degree sound and waterproof design", + "is_duplicate": false, + "similarity_to_seeds": 0.45, + "similarity_to_generated": 0.42 + } + ] +} +``` + +**Each record contains:** +- Sampled categorical fields (`category`) +- LLM-generated fields (`price`, `description`) +- Duplicate detection metadata: + - `similarity_to_seeds`: highest similarity to original seed samples + - `similarity_to_generated`: highest similarity to other generated records + - `is_duplicate`: true if either similarity exceeds threshold + +**Note:** Input configuration fields like `samples`, `target_count`, `categorical_fields`, etc. are NOT included in the output. + +## How It Works + +### Stage 1: StructureSampler (Statistical Skeleton Generation) + +**What it does:** +- Analyzes sample data to learn categorical frequencies +- Computes numeric statistics (min, max, mean) for range hints +- Respects field dependencies (e.g., role depends on plan) +- Generates N skeletons respecting learned distributions + +**Example:** If samples show "Free" plan 40% and "Pro" 30%, generated skeletons maintain these ratios. + +**Output per skeleton:** +```json +{ + "category": "electronics", + "_hints": { + "price_range": [199.0, 899.0], + "exemplars": [ + {"category": "electronics", "price": 299, "description": "Wireless headphones"}, + {"category": "electronics", "price": 899, "description": "13-inch laptop"} + ] + } +} +``` + +### Stage 2: SemanticInfiller (LLM-Powered Field Completion) + +**What it does:** +- Receives skeleton with locked statistical fields +- Builds contextual prompt with numeric hints and exemplar examples +- Calls LLM to generate semantic fields (bio, description, etc.) 
+- Restores locked fields if LLM overwrites them + +**Prompt structure:** +```text +You are a data generator. Complete the following record skeleton. + +Skeleton: {category: "electronics"} + +Numeric hints: +- price should be between 199-899 + +Matching examples: +- {category: "electronics", price: 299, description: "Wireless headphones"} + +Generate: ["description", "price"] +Return JSON: {"description": "...", "price": ...} +``` + +**Locked fields behavior:** Categorical fields sampled by StructureSampler (e.g., `category`) are preserved even if the LLM tries to modify them. + +### Stage 3: DuplicateRemover (Similarity Filtering) + +**What it does:** +- Extracts text from comparison fields +- Generates embeddings via embedding model +- Computes cosine similarity with cached embeddings +- Marks records as duplicates if similarity > threshold + +**Output:** +```json +{ + "category": "electronics", + "price": 549, + "description": "Portable bluetooth speaker with waterproof design", + "is_duplicate": false, + "similarity_to_seeds": 0.72, + "similarity_to_generated": 0.45 +} +``` + +**Output fields:** +- `similarity_to_seeds`: highest similarity to any original sample +- `similarity_to_generated`: highest similarity to previously generated records +- `is_duplicate`: true if either similarity exceeds threshold + +> **Note:** DuplicateRemover gracefully degrades if embedding model is unavailable - marks all records as `is_duplicate: false` with similarity scores of 0.0. 
+ +## Use Cases + +**Perfect for:** +- Expanding training datasets while maintaining patterns +- Creating realistic test data for applications +- Generating synthetic user profiles with distributions +- Data augmentation for ML training sets +- Privacy-preserving data generation (learn from real, generate synthetic) + +**Not ideal for:** +- Time-series data (no temporal modeling) +- Graph/network data (no relationship modeling) +- Highly correlated numeric fields (limited correlation preservation) + +## Customization + +Modify the template in `lib/templates/data_augmentation.yaml`: + +**Adjust generation count:** +```yaml +blocks: + - type: StructureSampler + config: + target_count: 100 # Generate 100 records +``` + +**Change LLM creativity:** +```yaml + - type: SemanticInfiller + config: + temperature: 0.9 # Higher = more creative (0.7-0.9 recommended) + max_tokens: 300 # Longer outputs +``` + +**Adjust duplicate threshold:** +```yaml + - type: DuplicateRemover + config: + similarity_threshold: 0.9 # Stricter (0.8-0.9 recommended) +``` + +**Add more dependencies:** +```json +{ + "dependencies": { + "role": ["plan"], + "storage": ["plan"] + } +} +``` + +## Filtering Duplicates + +Records marked as `is_duplicate: true` should be filtered post-generation: + +**Via API:** +```python +result = await pipeline.execute(seed_data) +generated = result.result.get("generated_samples", []) +unique_records = [r for r in generated if not r.get("is_duplicate")] +``` + +**Via export (manual filter):** +```bash +# Export all records +curl http://localhost:8000/api/export?job_id=1 > output.jsonl + +# Filter duplicates from generated_samples +jq '.generated_samples[] | select(.is_duplicate == false)' output.jsonl > unique.jsonl +``` + +> **Note:** Keeping duplicates in the trace allows adjusting the threshold post-generation and analyzing similarity score distributions (`similarity_to_seeds` and `similarity_to_generated`). 
+ +## Tuning Parameters + +### Quality vs Speed + +**High quality (slower):** +```yaml +target_count: 100 +temperature: 0.9 +max_tokens: 300 +similarity_threshold: 0.9 +``` + +**Fast iteration (lower quality):** +```yaml +target_count: 20 +temperature: 0.7 +max_tokens: 150 +similarity_threshold: 0.75 +``` + +### Diversity vs Fidelity + +**Preserve distributions (higher fidelity):** +- Include all important `categorical_fields` +- Specify `dependencies` accurately +- Include `numeric_fields` with tight ranges + +**Increase diversity (creative generation):** +- Omit some `categorical_fields` (LLM generates freely) +- Higher temperature (0.8-0.9) +- Lower `similarity_threshold` (0.75-0.8) + +## Common Issues + +### Low diversity (many duplicates) + +**Causes:** +- Too few samples (<5) +- Temperature too low (<0.5) +- Fields too restrictive + +**Fixes:** +- Add more diverse samples +- Increase temperature to 0.8-0.9 +- Generate more semantic fields +- Increase similarity_threshold to 0.85-0.9 + +### Unrealistic outputs + +**Causes:** +- Dependencies not specified +- Numeric hints too broad +- Temperature too high (>0.95) + +**Fixes:** +- Add dependencies config +- Provide numeric_fields for constraints +- Reduce temperature to 0.7-0.8 +- Include exemplar samples matching target patterns + +### LLM errors (invalid JSON) + +**Causes:** +- max_tokens too low (truncated JSON) +- Complex nested structures + +**Fixes:** +- Increase max_tokens to 200-300 +- Simplify fields (fewer nested objects) +- SemanticInfiller handles markdown wrappers automatically + +### Missing embeddings + +**Cause:** Embedding model not configured + +**Behavior:** DuplicateRemover marks all as `is_duplicate: false` + +**Fix:** Configure default embedding model in Settings page + +## Example Workflow + +**Goal:** Generate 100 synthetic user profiles + +### Step 1: Prepare samples (6 examples) +```json +[ + {"plan": "Free", "role": "Viewer", "storage": 1, "bio": "Student learning"}, + {"plan": "Free", 
"role": "Viewer", "storage": 2, "bio": "Just exploring"}, + {"plan": "Pro", "role": "Editor", "storage": 50, "bio": "Freelance designer"}, + {"plan": "Pro", "role": "Editor", "storage": 75, "bio": "Agency owner"}, + {"plan": "Pro", "role": "Admin", "storage": 100, "bio": "Team lead"}, + {"plan": "Enterprise", "role": "Admin", "storage": 500, "bio": "CTO"} +] +``` + +### Step 2: Create pipeline from template +```bash +curl -X POST http://localhost:8000/api/pipelines/from_template/data_augmentation \ + -H "Content-Type: application/json" \ + -d '{"name": "User Profile Augmentation"}' +``` + +### Step 3: Start generation +```bash +curl -X POST http://localhost:8000/api/generate \ + -F "file=@seed_data_augmentation.json" \ + -F "pipeline_id=1" +``` + +### Step 4: Monitor progress +```bash +# Poll job status +curl http://localhost:8000/api/jobs/1 +``` + +### Step 5: Review and export +```bash +# Export unique records only +curl http://localhost:8000/api/export?job_id=1 | jq 'select(.is_duplicate == false)' > unique_users.jsonl +``` + +**Result:** 100 synthetic user profiles preserving original distributions + +> **Tip:** For large datasets, start with 20 records to verify quality before scaling up. 
+ +## Related Documentation + +- [Templates Overview](templates) - All available templates +- [How to Use](how_to_use) - Running pipelines with templates +- [Custom Blocks](how_to_create_blocks) - Understanding multiplier blocks +- [StructureSampler Block](how_to_create_blocks#structuresampler) +- [SemanticInfiller Block](how_to_create_blocks#semanticinfiller) +- [DuplicateRemover Block](how_to_create_blocks#duplicateremover) diff --git a/frontend/src/components/pipeline-editor/BlockConfigPanel.tsx b/frontend/src/components/pipeline-editor/BlockConfigPanel.tsx index c8cd3dd..f1bb56d 100644 --- a/frontend/src/components/pipeline-editor/BlockConfigPanel.tsx +++ b/frontend/src/components/pipeline-editor/BlockConfigPanel.tsx @@ -32,9 +32,12 @@ export default function BlockConfigPanel({ const [formData, setFormData] = useState>(config || {}); const { resolvedColorScheme } = useTheme(); const [wordWrap, setWordWrap] = useState(false); + const [jsonMode, setJsonMode] = useState>({}); const [errors, setErrors] = useState>({}); const [panelWidth, setPanelWidth] = useState(400); const [isResizing, setIsResizing] = useState(false); + const [llmModels, setLlmModels] = useState([]); + const [embeddingModels, setEmbeddingModels] = useState([]); // sync formData with parent config changes // this ensures that saved config persists when panel is reopened @@ -75,6 +78,42 @@ export default function BlockConfigPanel({ }; }, []); + // fetch available LLM and embedding models + useEffect(() => { + const controller = new AbortController(); + const { signal } = controller; + + const fetchModels = async () => { + try { + const [llmResponse, embeddingResponse] = await Promise.all([ + fetch("/api/llm-models", { signal }), + fetch("/api/embedding-models", { signal }), + ]); + + if (llmResponse.ok) { + const llmData = await llmResponse.json(); + if (Array.isArray(llmData)) { + setLlmModels(llmData.map((m: any) => m.name).filter(Boolean)); + } + } + + if (embeddingResponse.ok) { + const 
embeddingData = await embeddingResponse.json(); + if (Array.isArray(embeddingData)) { + setEmbeddingModels(embeddingData.map((m: any) => m.name).filter(Boolean)); + } + } + } catch (error) { + if ((error as any)?.name !== "AbortError") { + console.error("Failed to fetch models:", error); + } + } + }; + + fetchModels(); + return () => controller.abort(); + }, []); + // handle resize useEffect(() => { if (!isResizing) return; @@ -115,6 +154,12 @@ export default function BlockConfigPanel({ Object.entries(schema).forEach(([key, fieldSchema]: [string, any]) => { const value = processedData[key]; + + // skip json-or-template fields - they stay as strings + if (fieldSchema.format === "json-or-template") { + return; + } + if ( (fieldSchema.type === "array" || fieldSchema.type === "object") && typeof value === "string" @@ -206,6 +251,42 @@ export default function BlockConfigPanel({ ); } + // llm model dropdown + if (key === "model" && llmModels.length > 0) { + return ( + + ); + } + + // embedding model dropdown + if (key === "embedding_model" && embeddingModels.length > 0) { + return ( + + ); + } + // field reference dropdown (references to accumulated_state fields) if (schema.isFieldReference) { if (availableFields.length > 0) { @@ -309,6 +390,80 @@ export default function BlockConfigPanel({ ); } + // json-or-template field - use monaco editor with toggle + if (schema.format === "json-or-template") { + const isJsonMode = jsonMode[key] ?? true; // default to JSON mode + const jsonValue = typeof value === "string" ? value : JSON.stringify(value, null, 2); + + return ( + + + setJsonMode((prev) => ({ ...prev, [key]: e.target.checked }))} + id={`jsonmode-${key}`} + sx={{ m: 0 }} + /> + + JSON mode + + + {isJsonMode ? "(JSON syntax)" : "(Jinja2 template)"} + + + + { + // keep as string during editing, will be parsed on save if needed + handleChange(key, newValue || ""); + }} + theme={resolvedColorScheme === "dark" ? 
"vs-dark" : "light"} + options={{ + minimap: { enabled: false }, + scrollbar: { + vertical: "auto", + horizontal: "auto", + verticalScrollbarSize: 10, + horizontalScrollbarSize: 10, + }, + lineNumbers: "on", + lineNumbersMinChars: 3, + glyphMargin: false, + folding: true, + lineDecorationsWidth: 5, + scrollBeyondLastLine: false, + renderLineHighlight: "none", + overviewRulerLanes: 0, + hideCursorInOverviewRuler: true, + overviewRulerBorder: false, + wordWrap: wordWrap ? "on" : "off", + fontSize: 13, + fontFamily: + "ui-monospace, SFMono-Regular, SF Mono, Menlo, Consolas, Liberation Mono, monospace", + tabSize: 2, + padding: { top: 8, bottom: 8 }, + }} + /> + + + ); + } + // object or array field - use monaco editor with JSON if (schema.type === "object" || schema.type === "array") { const jsonValue = typeof value === "string" ? value : JSON.stringify(value, null, 2); diff --git a/frontend/src/components/pipeline-editor/BlockNode.tsx b/frontend/src/components/pipeline-editor/BlockNode.tsx index 92e7922..82437b4 100644 --- a/frontend/src/components/pipeline-editor/BlockNode.tsx +++ b/frontend/src/components/pipeline-editor/BlockNode.tsx @@ -61,12 +61,41 @@ function getPreviewFields(blockType: string, config: Record): Array // priority fields based on block type let priorityKeys: string[] = []; - if (type.includes("generator")) { + // data augmentation blocks + if (type.includes("sampler")) { + priorityKeys = ["target_count", "categorical_fields"]; + } else if (type.includes("infiller")) { + priorityKeys = ["fields_to_generate", "model", "temperature"]; + } else if (type.includes("remover")) { + priorityKeys = ["similarity_threshold", "comparison_fields", "embedding_model"]; + } + // multiplier blocks + else if (type.includes("multiplier")) { + priorityKeys = ["parser_type", "chunk_size"]; + } + // langfuse integration + else if (type.includes("langfuse")) { + priorityKeys = ["dataset_name"]; + } + // field mapper + else if (type.includes("mapper")) { + priorityKeys = 
["mappings"]; + } + // ragas metrics + else if (type.includes("ragas")) { + priorityKeys = ["metrics", "model", "score_threshold"]; + } + // generators (text/structured) + else if (type.includes("generator")) { priorityKeys = ["model", "temperature", "max_tokens"]; - } else if (type.includes("validator")) { - priorityKeys = ["min_length", "max_length", "required_fields"]; - } else if (type.includes("score")) { - priorityKeys = ["generated_field", "reference_field", "metric"]; + } + // validators + else if (type.includes("validator")) { + priorityKeys = ["min_length", "max_length", "required_fields", "field_name"]; + } + // score blocks + else if (type.includes("score")) { + priorityKeys = ["generated_field", "reference_field", "field_name", "metric"]; } // find up to 2 configured values from priority keys @@ -76,9 +105,27 @@ function getPreviewFields(blockType: string, config: Record): Array if (config[key] !== undefined && config[key] !== null && config[key] !== "") { let displayValue = String(config[key]); + // special handling for fields_to_generate (JSON string) + if (key === "fields_to_generate" && typeof config[key] === "string") { + try { + const parsed = JSON.parse(config[key]); + if (Array.isArray(parsed)) { + displayValue = `[${parsed.length} items]`; + } + } catch { + // if not valid JSON, treat as template string + } + } + // special formatting for arrays/objects + else if (Array.isArray(config[key])) { + displayValue = `[${config[key].length} items]`; + } else if (typeof config[key] === "object") { + displayValue = `{${Object.keys(config[key]).length} keys}`; + } + // truncate long values - if (displayValue.length > 20) { - displayValue = displayValue.slice(0, 20) + "..."; + if (displayValue.length > 25) { + displayValue = displayValue.slice(0, 25) + "..."; } preview.push([key, displayValue]); diff --git a/frontend/src/pages/Generator.tsx b/frontend/src/pages/Generator.tsx index b62c2da..769a5f5 100644 --- a/frontend/src/pages/Generator.tsx +++ 
b/frontend/src/pages/Generator.tsx @@ -41,6 +41,7 @@ export default function Generator() { const [pipelines, setPipelines] = useState([]); const [selectedPipeline, setSelectedPipeline] = useState(null); const [isMultiplierPipeline, setIsMultiplierPipeline] = useState(false); + const [needsMarkdown, setNeedsMarkdown] = useState(false); const [validationResult, setValidationResult] = useState<{ valid: boolean; errors: string[]; @@ -112,6 +113,7 @@ export default function Generator() { if (!selectedPipeline) { if (mounted) { setIsMultiplierPipeline(false); + setNeedsMarkdown(false); setValidationResult(null); } return; @@ -123,6 +125,8 @@ export default function Generator() { }); const data = await res.json(); const isMultiplier = data.first_block_is_multiplier || false; + const firstBlockType = data.first_block_type || ""; + const needsMd = firstBlockType === "MarkdownMultiplierBlock"; if (!mounted) return; @@ -130,7 +134,7 @@ export default function Generator() { const isMarkdown = file.name.endsWith(".md"); const isJson = file.name.endsWith(".json"); - if ((isMultiplier && isJson) || (!isMultiplier && isMarkdown)) { + if ((needsMd && isJson) || (!needsMd && isMarkdown)) { setFile(null); setValidationResult(null); setValidated(false); @@ -138,10 +142,14 @@ export default function Generator() { } setIsMultiplierPipeline(isMultiplier); + setNeedsMarkdown(needsMd); } catch (err) { if (err instanceof Error && err.name !== "AbortError") { console.error("Failed to load pipeline details:", err); - if (mounted) setIsMultiplierPipeline(false); + if (mounted) { + setIsMultiplierPipeline(false); + setNeedsMarkdown(false); + } } } }; @@ -199,7 +207,7 @@ export default function Generator() { const isJson = droppedFile.type === "application/json" || droppedFile.name.endsWith(".json"); const isMarkdown = droppedFile.name.endsWith(".md"); - const isValidFile = isMultiplierPipeline ? isMarkdown : isJson; + const isValidFile = needsMarkdown ? 
isMarkdown : isJson; if (isValidFile) { const input = fileInputRef.current; @@ -210,7 +218,7 @@ export default function Generator() { input.dispatchEvent(new Event("change", { bubbles: true })); } } else { - const expected = isMultiplierPipeline ? "Markdown (.md) file" : "JSON (.json) file"; + const expected = needsMarkdown ? "Markdown (.md) file" : "JSON (.json) file"; toast.error(`Please drop a ${expected}`); } } @@ -223,12 +231,12 @@ export default function Generator() { const isMarkdown = selectedFile.name.endsWith(".md"); const isJson = selectedFile.name.endsWith(".json"); - if (isMultiplierPipeline && isJson) { + if (needsMarkdown && isJson) { toast.error("Please upload a Markdown (.md) file for this pipeline."); return; } - if (!isMultiplierPipeline && isMarkdown) { + if (!needsMarkdown && isMarkdown) { toast.error("Please upload a JSON (.json) file for this pipeline."); return; } @@ -650,7 +658,7 @@ export default function Generator() { @@ -663,7 +671,7 @@ export default function Generator() { ? "Select a pipeline first" : file ? file.name - : isMultiplierPipeline + : needsMarkdown ? "Drop Markdown file here or click to browse" : "Drop JSON seed file here or click to browse"} @@ -672,7 +680,7 @@ export default function Generator() { ? "Choose a pipeline from the configuration panel" : file ? `Size: ${(file.size / 1024).toFixed(2)} KB` - : isMultiplierPipeline + : needsMarkdown ? "Markdown (.md) format" : 'Format: {"repetitions": N, "metadata": {...}}'} @@ -725,7 +733,7 @@ export default function Generator() { {/* Verify Seeds Button */} - {file && selectedPipeline && !isMultiplierPipeline && file.name.endsWith(".json") && ( + {file && selectedPipeline && !needsMarkdown && file.name.endsWith(".json") && (