From 35c48c65037045a82af00d35aacddab923632b9f Mon Sep 17 00:00:00 2001 From: CAholder Date: Fri, 20 Feb 2026 23:10:09 -0800 Subject: [PATCH 01/20] adding skill evals and skill optimization capabilities using GEPA optimize_anything --- .test/pyproject.toml | 3 +- .test/scripts/optimize.py | 150 ++++++++ .test/src/skill_test/cli/__init__.py | 37 ++ .test/src/skill_test/cli/commands.py | 78 ++++ .test/src/skill_test/optimize/__init__.py | 23 ++ .test/src/skill_test/optimize/asi.py | 95 +++++ .test/src/skill_test/optimize/config.py | 55 +++ .test/src/skill_test/optimize/evaluator.py | 334 ++++++++++++++++++ .../src/skill_test/optimize/gskill/README.md | 128 +++++++ .../skill_test/optimize/gskill/__init__.py | 5 + .../skill_test/optimize/gskill/pipeline.py | 209 +++++++++++ .test/src/skill_test/optimize/review.py | 191 ++++++++++ .test/src/skill_test/optimize/runner.py | 284 +++++++++++++++ .test/src/skill_test/optimize/splitter.py | 200 +++++++++++ .test/tests/test_optimize_e2e.py | 311 ++++++++++++++++ 15 files changed, 2102 insertions(+), 1 deletion(-) create mode 100644 .test/scripts/optimize.py create mode 100644 .test/src/skill_test/optimize/__init__.py create mode 100644 .test/src/skill_test/optimize/asi.py create mode 100644 .test/src/skill_test/optimize/config.py create mode 100644 .test/src/skill_test/optimize/evaluator.py create mode 100644 .test/src/skill_test/optimize/gskill/README.md create mode 100644 .test/src/skill_test/optimize/gskill/__init__.py create mode 100644 .test/src/skill_test/optimize/gskill/pipeline.py create mode 100644 .test/src/skill_test/optimize/review.py create mode 100644 .test/src/skill_test/optimize/runner.py create mode 100644 .test/src/skill_test/optimize/splitter.py create mode 100644 .test/tests/test_optimize_e2e.py diff --git a/.test/pyproject.toml b/.test/pyproject.toml index 8feaeed3..5e114eb4 100644 --- a/.test/pyproject.toml +++ b/.test/pyproject.toml @@ -17,7 +17,8 @@ dependencies = [ [project.optional-dependencies] 
databricks = ["databricks-sdk>=0.20.0"] dev = ["pytest>=8.0", "pytest-asyncio>=0.23"] -all = ["skill-test[databricks,dev]"] +optimize = ["gepa>=0.1.0", "tiktoken>=0.7.0"] +all = ["skill-test[databricks,dev,optimize]"] [project.scripts] skill-test = "skill_test.cli:main" diff --git a/.test/scripts/optimize.py b/.test/scripts/optimize.py new file mode 100644 index 00000000..f0a84212 --- /dev/null +++ b/.test/scripts/optimize.py @@ -0,0 +1,150 @@ +#!/usr/bin/env python3 +"""CLI entry point for GEPA skill optimization. + +Usage: + # Standard workflow: evaluate + optimize a skill + uv run python .test/scripts/optimize.py databricks-metric-views + + # Quick pass (15 iterations) + uv run python .test/scripts/optimize.py databricks-metric-views --preset quick + + # Thorough optimization (150 iterations) + uv run python .test/scripts/optimize.py databricks-metric-views --preset thorough + + # Generative mode (generates fresh responses, more expensive) + uv run python .test/scripts/optimize.py databricks-metric-views --mode generative + + # Apply the optimized result + uv run python .test/scripts/optimize.py databricks-metric-views --apply + + # Dry run (show config, dataset info, estimate cost) + uv run python .test/scripts/optimize.py databricks-metric-views --dry-run + + # Optimize all skills that have ground_truth.yaml test cases + uv run python .test/scripts/optimize.py --all +""" + +import argparse +import sys +from pathlib import Path + +# Setup path using shared utilities +sys.path.insert(0, str(Path(__file__).resolve().parent)) +from _common import setup_path, handle_error, print_result + +setup_path() + + +def main(): + parser = argparse.ArgumentParser( + description="Optimize Databricks skills using GEPA", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + parser.add_argument( + "skill_name", + nargs="?", + help="Name of the skill to optimize (e.g., databricks-model-serving)", + ) + parser.add_argument( + "--all", + 
action="store_true", + help="Optimize all skills that have ground_truth.yaml", + ) + parser.add_argument( + "--preset", "-p", + choices=["quick", "standard", "thorough"], + default="standard", + help="GEPA optimization preset (default: standard)", + ) + parser.add_argument( + "--mode", "-m", + choices=["static", "generative"], + default="static", + help="Evaluation mode (default: static)", + ) + parser.add_argument( + "--task-lm", + default=None, + help="LLM model for generative mode (e.g., openai/gpt-4o)", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Show config and cost estimate without running optimization", + ) + parser.add_argument( + "--apply", + action="store_true", + help="Apply the optimized SKILL.md (overwrites original)", + ) + + args = parser.parse_args() + + if not args.skill_name and not args.all: + parser.error("Either provide a skill name or use --all") + + from skill_test.optimize.runner import optimize_skill + from skill_test.optimize.review import review_optimization, apply_optimization + + if args.all: + # Find all skills with ground_truth.yaml + skills_dir = Path(".test/skills") + skill_names = [ + d.name + for d in sorted(skills_dir.iterdir()) + if d.is_dir() and (d / "ground_truth.yaml").exists() and not d.name.startswith("_") + ] + print(f"Found {len(skill_names)} skills to optimize: {', '.join(skill_names)}\n") + + results = [] + for name in skill_names: + print(f"\n{'=' * 60}") + print(f" Optimizing: {name}") + print(f"{'=' * 60}") + try: + result = optimize_skill( + skill_name=name, + mode=args.mode, + preset=args.preset, + task_lm=args.task_lm, + dry_run=args.dry_run, + ) + review_optimization(result) + if args.apply and not args.dry_run: + apply_optimization(result) + results.append({"skill": name, "success": True, "improvement": result.improvement}) + except Exception as e: + print(f" ERROR: {e}") + results.append({"skill": name, "success": False, "error": str(e)}) + + # Summary + print(f"\n{'=' * 60}") + 
print(" Summary") + print(f"{'=' * 60}") + for r in results: + status = "OK" if r["success"] else "FAIL" + detail = f"+{r['improvement']:.3f}" if r["success"] else r["error"] + print(f" [{status}] {r['skill']}: {detail}") + + sys.exit(0 if all(r["success"] for r in results) else 1) + + else: + try: + result = optimize_skill( + skill_name=args.skill_name, + mode=args.mode, + preset=args.preset, + task_lm=args.task_lm, + dry_run=args.dry_run, + ) + review_optimization(result) + if args.apply and not args.dry_run: + apply_optimization(result) + sys.exit(0) + except Exception as e: + sys.exit(handle_error(e, args.skill_name)) + + +if __name__ == "__main__": + main() diff --git a/.test/src/skill_test/cli/__init__.py b/.test/src/skill_test/cli/__init__.py index b8352981..9e79785e 100644 --- a/.test/src/skill_test/cli/__init__.py +++ b/.test/src/skill_test/cli/__init__.py @@ -17,6 +17,7 @@ review, trace_eval, list_traces, + optimize, ) @@ -36,6 +37,7 @@ def main(): review - Review pending candidates interactively trace-eval - Evaluate trace against skill expectations list-traces - List available trace runs from MLflow + optimize - Optimize a skill using GEPA """ args = sys.argv[1:] @@ -51,6 +53,7 @@ def main(): print(" review Review pending candidates interactively") print(" trace-eval Evaluate trace against skill expectations") print(" list-traces List available trace runs from MLflow") + print(" optimize Optimize a skill using GEPA") sys.exit(0) skill_name = args[0] @@ -142,6 +145,39 @@ def main(): } else: result = list_traces(experiment, ctx, limit) + elif subcommand == "optimize": + # Parse optimize-specific arguments + opt_preset = "standard" + opt_mode = "static" + opt_task_lm = None + opt_dry_run = False + opt_apply = False + + i = 2 + while i < len(args): + if args[i] in ("--preset", "-p") and i + 1 < len(args): + opt_preset = args[i + 1] + i += 2 + elif args[i] in ("--mode", "-m") and i + 1 < len(args): + opt_mode = args[i + 1] + i += 2 + elif args[i] == 
"--task-lm" and i + 1 < len(args): + opt_task_lm = args[i + 1] + i += 2 + elif args[i] == "--dry-run": + opt_dry_run = True + i += 1 + elif args[i] == "--apply": + opt_apply = True + i += 1 + else: + i += 1 + + result = optimize( + skill_name, ctx, + preset=opt_preset, mode=opt_mode, task_lm=opt_task_lm, + dry_run=opt_dry_run, apply=opt_apply, + ) else: print(f"Unknown subcommand: {subcommand}") sys.exit(1) @@ -171,5 +207,6 @@ def main(): "review", "trace_eval", "list_traces", + "optimize", "main", ] diff --git a/.test/src/skill_test/cli/commands.py b/.test/src/skill_test/cli/commands.py index e84637e2..6798d68c 100644 --- a/.test/src/skill_test/cli/commands.py +++ b/.test/src/skill_test/cli/commands.py @@ -1399,3 +1399,81 @@ def list_traces( "experiment_name": experiment_name, "hint": "Check experiment name and MLflow connection", } + + +def optimize( + skill_name: str, + ctx: CLIContext, + preset: str = "standard", + mode: str = "static", + task_lm: Optional[str] = None, + dry_run: bool = False, + apply: bool = False, +) -> Dict[str, Any]: + """Optimize a skill using GEPA's optimize_anything. + + Runs the full optimization pipeline: evaluate -> optimize -> review. + Optionally applies the optimized result to the SKILL.md. 
+ + Args: + skill_name: Name of the skill to optimize + ctx: CLI context + preset: GEPA preset ("quick", "standard", "thorough") + mode: "static" (uses ground truth) or "generative" (generates fresh responses) + task_lm: LLM model for generative mode + dry_run: Show config and estimate cost without running + apply: Apply the optimized result to SKILL.md + + Returns: + Dictionary with optimization results + """ + try: + from ..optimize.runner import optimize_skill + from ..optimize.review import review_optimization, apply_optimization + except ImportError as e: + return { + "success": False, + "error": f"GEPA optimization requires the 'optimize' extra: {e}", + "hint": "Install with: pip install skill-test[optimize]", + } + + try: + result = optimize_skill( + skill_name=skill_name, + mode=mode, + preset=preset, + task_lm=task_lm, + dry_run=dry_run, + ) + + review_optimization(result) + + if apply and not dry_run: + apply_optimization(result) + + return { + "success": True, + "skill_name": skill_name, + "original_score": result.original_score, + "optimized_score": result.optimized_score, + "improvement": result.improvement, + "original_tokens": result.original_token_count, + "optimized_tokens": result.optimized_token_count, + "token_reduction_pct": result.token_reduction_pct, + "applied": apply and not dry_run, + "dry_run": dry_run, + "mlflow_run_id": result.mlflow_run_id, + } + except FileNotFoundError as e: + return { + "success": False, + "error": str(e), + "skill_name": skill_name, + } + except Exception as e: + return { + "success": False, + "error": str(e), + "skill_name": skill_name, + "hint": "Check GEPA installation and API keys", + } diff --git a/.test/src/skill_test/optimize/__init__.py b/.test/src/skill_test/optimize/__init__.py new file mode 100644 index 00000000..5dd5cc9f --- /dev/null +++ b/.test/src/skill_test/optimize/__init__.py @@ -0,0 +1,23 @@ +"""GEPA-powered skill optimization for the skill-test framework. 
+ +Public API: + optimize_skill() - End-to-end optimize a SKILL.md + create_skill_adapter() - Create a GEPA adapter for a skill + OptimizationResult - Dataclass with optimization results + PRESETS - GEPA config presets (quick, standard, thorough) +""" + +from .runner import optimize_skill, OptimizationResult +from .evaluator import create_skill_adapter, SkillAdapter +from .config import PRESETS +from .review import review_optimization, apply_optimization + +__all__ = [ + "optimize_skill", + "OptimizationResult", + "create_skill_adapter", + "SkillAdapter", + "PRESETS", + "review_optimization", + "apply_optimization", +] diff --git a/.test/src/skill_test/optimize/asi.py b/.test/src/skill_test/optimize/asi.py new file mode 100644 index 00000000..fea0fd70 --- /dev/null +++ b/.test/src/skill_test/optimize/asi.py @@ -0,0 +1,95 @@ +"""ASI diagnostics: convert MLflow Feedback to GEPA (score, diagnostics) contract. + +Collects failure diagnostics so the adapter's make_reflective_dataset() can +provide actionable context to GEPA's reflection LM. +""" + +from typing import Any + +from mlflow.entities import Feedback + + +def feedback_to_score(feedback: Feedback) -> float | None: + """Convert a single MLflow Feedback to a numeric score. + + Mapping: + "yes" -> 1.0 + "no" -> 0.0 + "skip" -> None (excluded from scoring) + numeric -> float(value) + + Returns: + Float score or None if the feedback should be excluded. + """ + value = feedback.value + if value == "yes": + return 1.0 + elif value == "no": + return 0.0 + elif value == "skip": + return None + else: + try: + return float(value) + except (TypeError, ValueError): + return None + + +def feedback_to_asi(feedbacks: list[Feedback]) -> tuple[float, dict[str, Any]]: + """Convert a list of MLflow Feedback objects to GEPA (score, diagnostics). + + Computes the mean score across all non-skipped feedbacks and builds + a diagnostics dict with per-scorer results. 
+ + Args: + feedbacks: List of MLflow Feedback objects from running scorers + + Returns: + Tuple of (composite_score, diagnostics_dict) + - composite_score: 0.0-1.0 mean across all scorable feedbacks + - diagnostics_dict: per-scorer name -> {score, rationale, value} + """ + scores = [] + diagnostics: dict[str, Any] = {} + failure_messages: list[str] = [] + + for fb in feedbacks: + score = feedback_to_score(fb) + name = fb.name or "unnamed" + + if score is None: + diagnostics[name] = { + "score": None, + "value": fb.value, + "rationale": fb.rationale or "", + "status": "skipped", + } + continue + + scores.append(score) + diagnostics[name] = { + "score": score, + "value": fb.value, + "rationale": fb.rationale or "", + "status": "pass" if score >= 0.5 else "fail", + } + + # Collect failure messages for reflection dataset + if score < 1.0: + failure_messages.append( + f"Scorer '{name}' returned {fb.value}: {fb.rationale or 'no rationale'}" + ) + + composite = sum(scores) / len(scores) if scores else 0.0 + + diagnostics["_summary"] = { + "composite_score": composite, + "total_scorers": len(feedbacks), + "scored": len(scores), + "skipped": len(feedbacks) - len(scores), + "passed": sum(1 for s in scores if s >= 0.5), + "failed": sum(1 for s in scores if s < 0.5), + } + diagnostics["_failure_messages"] = failure_messages + + return composite, diagnostics diff --git a/.test/src/skill_test/optimize/config.py b/.test/src/skill_test/optimize/config.py new file mode 100644 index 00000000..63636275 --- /dev/null +++ b/.test/src/skill_test/optimize/config.py @@ -0,0 +1,55 @@ +"""GEPA configuration presets for skill optimization. + +GEPA's optimize() accepts flat kwargs. Presets are stored as dicts +that get unpacked into gepa.optimize(**preset). 
+""" + +from dataclasses import dataclass +from typing import Any + + +@dataclass +class GEPAPreset: + """Configuration preset for gepa.optimize() calls.""" + + max_metric_calls: int + reflection_lm: str = "openai/gpt-4o" + candidate_selection_strategy: str = "pareto" + reflection_minibatch_size: int = 3 + skip_perfect_score: bool = True + display_progress_bar: bool = True + + def to_kwargs(self) -> dict[str, Any]: + """Convert to kwargs dict for gepa.optimize().""" + return { + "max_metric_calls": self.max_metric_calls, + "reflection_lm": self.reflection_lm, + "candidate_selection_strategy": self.candidate_selection_strategy, + "reflection_minibatch_size": self.reflection_minibatch_size, + "skip_perfect_score": self.skip_perfect_score, + "display_progress_bar": self.display_progress_bar, + } + + +PRESETS: dict[str, GEPAPreset] = { + "quick": GEPAPreset(max_metric_calls=15), + "standard": GEPAPreset(max_metric_calls=50), + "thorough": GEPAPreset(max_metric_calls=150), +} + + +def get_preset(name: str) -> GEPAPreset: + """Get a GEPA config preset by name. + + Args: + name: One of "quick", "standard", "thorough" + + Returns: + GEPAPreset instance + + Raises: + KeyError: If preset name is not recognized + """ + if name not in PRESETS: + raise KeyError(f"Unknown preset '{name}'. Choose from: {list(PRESETS.keys())}") + return PRESETS[name] diff --git a/.test/src/skill_test/optimize/evaluator.py b/.test/src/skill_test/optimize/evaluator.py new file mode 100644 index 00000000..20150938 --- /dev/null +++ b/.test/src/skill_test/optimize/evaluator.py @@ -0,0 +1,334 @@ +"""Evaluator bridge: wrap existing MLflow scorers into a GEPA adapter. + +This is the core integration point between the skill-test scorer framework +and GEPA's optimize(). It implements GEPAAdapter to evaluate candidate +SKILL.md texts against test cases using existing scorers. 
+""" + +import inspect +from pathlib import Path +from typing import Any, Literal + +import tiktoken +from gepa import EvaluationBatch, GEPAAdapter +from mlflow.entities import Feedback + +from ..runners.evaluate import build_scorers, load_scorer_config +from ..scorers.universal import ( + python_syntax, + sql_syntax, + no_hallucinated_apis, + pattern_adherence, + expected_facts_present, +) +from .asi import feedback_to_asi +from .splitter import SkillTask + + +def _find_repo_root() -> Path: + """Find the repo root by searching upward for .test/src/.""" + current = Path(__file__).resolve().parent + while current != current.parent: + if (current / ".test" / "src").exists(): + return current + if (current / "src" / "skill_test").exists() and current.name == ".test": + return current.parent + current = current.parent + return Path.cwd() + + +def _find_skill_md(skill_name: str) -> Path | None: + """Locate the SKILL.md file for a given skill name.""" + repo_root = _find_repo_root() + candidates = [ + repo_root / ".claude" / "skills" / skill_name / "SKILL.md", + repo_root / "databricks-skills" / skill_name / "SKILL.md", + ] + for p in candidates: + if p.exists(): + return p + return None + + +def token_efficiency_score(candidate_text: str, original_token_count: int) -> float: + """Score 0-1 based on how concise the candidate is vs. the original. + + - Same size or smaller = 1.0 + - 10% larger = 0.9, 20% larger = 0.8, etc. 
+ - Capped at 0.0 for 100%+ bloat + """ + if original_token_count <= 0: + return 1.0 + enc = tiktoken.get_encoding("cl100k_base") + candidate_tokens = len(enc.encode(candidate_text)) + ratio = candidate_tokens / original_token_count + return max(0.0, min(1.0, 2.0 - ratio)) + + +def count_tokens(text: str) -> int: + """Count tokens using cl100k_base encoding.""" + enc = tiktoken.get_encoding("cl100k_base") + return len(enc.encode(text)) + + +def _run_scorer(scorer_fn: Any, outputs: dict, expectations: dict, inputs: dict) -> list[Feedback]: + """Run a single scorer and normalize the result to a list of Feedbacks.""" + sig = inspect.signature(scorer_fn) + params = list(sig.parameters.keys()) + + kwargs = {} + if "outputs" in params: + kwargs["outputs"] = outputs + if "expectations" in params: + kwargs["expectations"] = expectations + if "inputs" in params: + kwargs["inputs"] = inputs + + try: + result = scorer_fn(**kwargs) + except Exception as e: + return [Feedback(name=getattr(scorer_fn, "__name__", "unknown"), value="no", rationale=str(e))] + + if isinstance(result, list): + return result + elif isinstance(result, Feedback): + return [result] + return [] + + +def _run_deterministic_scorers( + task: SkillTask, + scorer_config: dict[str, Any], +) -> list[Feedback]: + """Run deterministic scorers against a task's expected response.""" + outputs = {"response": task.get("answer", "")} + expectations = task.get("expectations", {}) + inputs = {"prompt": task.get("input", "")} + + if scorer_config: + scorers = build_scorers(scorer_config) + else: + scorers = [ + python_syntax, + sql_syntax, + pattern_adherence, + no_hallucinated_apis, + expected_facts_present, + ] + + all_feedbacks = [] + for scorer_fn in scorers: + # Skip LLM-based scorers -- only deterministic + scorer_name = getattr(scorer_fn, "__name__", "") or getattr(scorer_fn, "name", "") + if scorer_name in ("Safety", "Guidelines", "skill_quality"): + continue + feedbacks = _run_scorer(scorer_fn, outputs, 
expectations, inputs) + all_feedbacks.extend(feedbacks) + + return all_feedbacks + + +def _validate_skill_structure(candidate_text: str) -> list[Feedback]: + """Validate the SKILL.md structure itself.""" + outputs = {"response": candidate_text} + feedbacks = [] + + for scorer_fn in [python_syntax, sql_syntax, no_hallucinated_apis]: + result = _run_scorer(scorer_fn, outputs, {}, {}) + for fb in result: + feedbacks.append(Feedback(name=f"skill_{fb.name}", value=fb.value, rationale=fb.rationale)) + + return feedbacks + + +# Type aliases for GEPA adapter generics +_Trajectory = dict[str, Any] +_RolloutOutput = dict[str, Any] + + +class SkillAdapter(GEPAAdapter): + """GEPA adapter that evaluates candidate SKILL.md texts using existing scorers. + + The seed_candidate dict has a single key "skill_md" whose value is the + SKILL.md content. The adapter evaluates each data instance (test case) + by running the configured scorers. + """ + + SKILL_KEY = "skill_md" + + def __init__( + self, + skill_name: str, + mode: Literal["static", "generative"] = "static", + task_lm: str | None = None, + ): + self.skill_name = skill_name + self.mode = mode + self.task_lm = task_lm + self.scorer_config = load_scorer_config(skill_name) + + # Get original token count for efficiency scoring + skill_path = _find_skill_md(skill_name) + self.original_token_count = count_tokens(skill_path.read_text()) if skill_path else 0 + + def evaluate( + self, + batch: list[dict[str, Any]], + candidate: dict[str, str], + capture_traces: bool = False, + ) -> EvaluationBatch: + """Evaluate candidate SKILL.md against a batch of test cases. 
+ + Args: + batch: List of DefaultDataInst dicts ({input, additional_context, answer}) + candidate: Dict with key "skill_md" -> SKILL.md content + capture_traces: Whether to capture traces (for reflection dataset) + + Returns: + EvaluationBatch with trajectories containing scores and diagnostics + """ + candidate_text = candidate.get(self.SKILL_KEY, "") + outputs: list[dict[str, Any]] = [] + scores: list[float] = [] + trajectories = [] + + for data_inst in batch: + all_feedbacks = [] + + # Build a SkillTask from the data instance for scorer compatibility + task: SkillTask = { + "id": data_inst.get("additional_context", {}).get("id", ""), + "input": data_inst.get("input", ""), + "answer": data_inst.get("answer", ""), + "additional_context": data_inst.get("additional_context", {}), + "metadata": {}, + } + + # Decode expectations if stored in additional_context + expectations_json = data_inst.get("additional_context", {}).get("expectations", "") + if expectations_json: + import json + try: + task["expectations"] = json.loads(expectations_json) + except (json.JSONDecodeError, TypeError): + task["expectations"] = {} + + if self.mode == "generative" and self.task_lm: + # Generate a fresh response using the candidate skill + import litellm + messages = [ + { + "role": "system", + "content": ( + "You are an AI assistant with the following skill documentation:\n\n" + f"{candidate_text}\n\n" + "Use this documentation to answer the user's question." + ), + }, + {"role": "user", "content": task.get("input", "")}, + ] + response = litellm.completion(model=self.task_lm, messages=messages) + task["answer"] = response.choices[0].message.content + + # 1. Run deterministic scorers against the response + response_feedbacks = _run_deterministic_scorers(task, self.scorer_config) + all_feedbacks.extend(response_feedbacks) + + # 2. Validate the skill structure itself + structure_feedbacks = _validate_skill_structure(candidate_text) + all_feedbacks.extend(structure_feedbacks) + + # 3. 
Convert to score + diagnostics + composite, diagnostics = feedback_to_asi(all_feedbacks) + + # 4. Factor in token efficiency + efficiency = token_efficiency_score(candidate_text, self.original_token_count) + + # Weighted composite: 80% quality, 20% token efficiency + final_score = 0.8 * composite + 0.2 * efficiency + + output = {"full_assistant_response": task.get("answer", "")} + outputs.append(output) + scores.append(final_score) + + trajectory = { + "data": data_inst, + "full_assistant_response": task.get("answer", ""), + "score": final_score, + "quality_score": composite, + "efficiency_score": efficiency, + "diagnostics": diagnostics, + "failure_messages": diagnostics.get("_failure_messages", []), + } + trajectories.append(trajectory) + + return EvaluationBatch(outputs=outputs, scores=scores, trajectories=trajectories) + + def make_reflective_dataset( + self, + candidate: dict[str, str], + eval_batch: EvaluationBatch, + components_to_update: list[str], + ) -> dict[str, list[dict[str, Any]]]: + """Build reflective dataset from evaluation results for GEPA's mutation proposer. + + Extracts failure messages from diagnostics so the reflection LM + knows exactly what went wrong and can propose targeted mutations. + """ + reflective_data: dict[str, list[dict[str, Any]]] = {} + + for component in components_to_update: + examples = [] + for traj in eval_batch.trajectories: + failure_msgs = traj.get("failure_messages", []) + if not failure_msgs: + continue + + examples.append({ + "input": traj.get("data", {}).get("input", ""), + "current_text": candidate.get(component, ""), + "feedback": "\n".join(failure_msgs), + "score": traj.get("score", 0.0), + }) + + reflective_data[component] = examples + + return reflective_data + + +def create_skill_adapter( + skill_name: str, + mode: Literal["static", "generative"] = "static", + task_lm: str | None = None, +) -> SkillAdapter: + """Create a SkillAdapter for GEPA optimization. 
+ + Args: + skill_name: Name of the skill being optimized + mode: "static" or "generative" + task_lm: LLM model string for generative mode + + Returns: + Configured SkillAdapter instance + """ + return SkillAdapter(skill_name=skill_name, mode=mode, task_lm=task_lm) + + +def build_optimization_background(skill_name: str, original_token_count: int) -> str: + """Build the background context string for GEPA's reflection LM.""" + return ( + f"You are optimizing a SKILL.md file for the '{skill_name}' Databricks skill. " + "SKILL.md files teach AI agents (like Claude Code) how to use specific Databricks features. " + "They contain patterns, code examples, API references, and best practices.\n\n" + "The skill is evaluated against test cases that check:\n" + "- Python/SQL code syntax validity\n" + "- Adherence to expected patterns (regex matches)\n" + "- Absence of hallucinated/deprecated APIs\n" + "- Presence of expected factual information\n" + "- Overall structural quality of the skill document\n\n" + f"IMPORTANT: The current skill is {original_token_count:,} tokens. " + "Optimized skills should be MORE CONCISE, not larger. " + "Remove redundant examples, consolidate similar patterns, " + "and eliminate verbose explanations that don't add value. " + "Every token consumed is agent context window budget -- keep skills lean and focused." + ) diff --git a/.test/src/skill_test/optimize/gskill/README.md b/.test/src/skill_test/optimize/gskill/README.md new file mode 100644 index 00000000..1c4621b0 --- /dev/null +++ b/.test/src/skill_test/optimize/gskill/README.md @@ -0,0 +1,128 @@ +# gskill: Auto-Generate Optimized Skills for Your Repository + +`gskill` uses [GEPA](https://github.com/databricks/gepa) to automatically generate optimized SKILL.md files that teach Claude Code how to work with your specific Databricks project patterns. + +## What It Does + +1. **Scans your repository** for Databricks patterns (SDK usage, SQL, notebooks, configs) +2. 
**Generates a SKILL.md** optimized for AI agent consumption +3. **Validates quality** using the skill-test evaluation framework +4. **Outputs to `.claude/skills/`** so Claude Code automatically picks it up + +## Quick Start + +### Prerequisites + +```bash +# Install GEPA +pip install gepa>=0.1.0 + +# Set up LLM API keys (for GEPA reflection) +export OPENAI_API_KEY=your-key-here +``` + +### Generate a Skill + +```bash +# From the ai-dev-kit repository +cd /path/to/ai-dev-kit + +# Generate a skill for your project repo +uv run python -c " +from skill_test.optimize.gskill import run_gskill +result = run_gskill('/path/to/your/databricks-project') +print(f'Generated: {result[\"skill_path\"]}') +" +``` + +### Using with Claude Code + +Once generated, the skill is automatically available to Claude Code: + +``` +your-repo/ +├── .claude/ +│ └── skills/ +│ └── your-repo/ +│ └── SKILL.md # <- Generated skill +├── src/ +│ └── ... +``` + +Claude Code reads `.claude/skills/*/SKILL.md` files and uses them as context when helping with your code. + +## Configuration + +### Presets + +| Preset | Iterations | Best For | +|--------|-----------|----------| +| `quick` | 15 | Initial generation, small repos | +| `standard` | 50 | Most repos (default) | +| `thorough` | 150 | Large repos, production quality | + +### Custom Context + +Provide additional files for gskill to consider: + +```python +from skill_test.optimize.gskill import run_gskill + +result = run_gskill( + repo_path="/path/to/your/repo", + preset="standard", + context_files=[ + "docs/architecture.md", + "README.md", + "src/config.py", + ], +) +``` + +## Evaluating Generated Skills + +Use the ai-dev-kit evaluation framework to validate generated skills: + +```bash +# 1. Add test cases for the generated skill +uv run python .test/scripts/init_skill.py your-skill-name + +# 2. Add ground truth test cases +uv run python .test/scripts/add.py your-skill-name + +# 3. 
Run evaluation +uv run python .test/scripts/run_eval.py your-skill-name + +# 4. Optimize further with GEPA +uv run python .test/scripts/optimize.py your-skill-name +``` + +## How It Works + +``` +Your Repository + │ + ▼ +┌──────────┐ ┌──────────┐ ┌──────────┐ +│ Scan for │ --> │ Generate │ --> │ Validate │ --> SKILL.md +│ patterns │ │ SKILL.md │ │ quality │ +└──────────┘ └──────────┘ └──────────┘ + │ + ▼ + GEPA optimize_anything + (reflects on quality, + iterates to improve) +``` + +GEPA's `optimize_anything` treats the SKILL.md as the artifact to optimize. It: +- Starts with patterns found in your repo as the seed +- Uses GEPA's reflection LM to propose improvements +- Scores each iteration for quality, correctness, and conciseness +- Selects the best candidate via Pareto frontier optimization + +## Tips + +- **Keep skills focused**: One skill per domain (e.g., separate skills for "data pipeline" and "model serving") +- **Add test cases**: Skills with ground truth test cases optimize much better than bootstrap mode +- **Iterate**: Run `optimize.py` after adding test cases for incremental improvement +- **Token budget**: Skills should be as concise as possible -- every token consumed is agent context window budget diff --git a/.test/src/skill_test/optimize/gskill/__init__.py b/.test/src/skill_test/optimize/gskill/__init__.py new file mode 100644 index 00000000..d27b7b51 --- /dev/null +++ b/.test/src/skill_test/optimize/gskill/__init__.py @@ -0,0 +1,5 @@ +"""gskill: Generate optimized skills for customer repositories using GEPA.""" + +from .pipeline import run_gskill + +__all__ = ["run_gskill"] diff --git a/.test/src/skill_test/optimize/gskill/pipeline.py b/.test/src/skill_test/optimize/gskill/pipeline.py new file mode 100644 index 00000000..fdc0a8fb --- /dev/null +++ b/.test/src/skill_test/optimize/gskill/pipeline.py @@ -0,0 +1,209 @@ +"""gskill pipeline: wraps GEPA optimize for customer repository skill generation. 
+ +Configures GEPA with Databricks-appropriate defaults, generates an optimized +SKILL.md, and outputs it in the standard format. +""" + +import re +from pathlib import Path +from typing import Any + +import gepa +from gepa import GEPAAdapter, EvaluationBatch + +from ..config import get_preset + + +class _GSkillAdapter(GEPAAdapter): + """Minimal adapter for gskill that scores SKILL.md structural quality.""" + + SKILL_KEY = "skill_md" + + def evaluate( + self, + batch: list[dict[str, Any]], + candidate: dict[str, str], + capture_traces: bool = False, + ) -> EvaluationBatch: + """Score candidate skill based on structural quality metrics.""" + import ast + + candidate_text = candidate.get(self.SKILL_KEY, "") + trajectories = [] + + for data_inst in batch: + score = 0.0 + parts = 0 + + # Has markdown headers + if re.search(r"^#{1,3}\s+", candidate_text, re.MULTILINE): + score += 1.0 + parts += 1 + + # Has code blocks + code_blocks = re.findall(r"```(\w+)\n(.*?)```", candidate_text, re.DOTALL) + if code_blocks: + score += 1.0 + parts += 1 + + # Python blocks parse + py_blocks = [b for lang, b in code_blocks if lang == "python"] + if py_blocks: + valid = sum(1 for b in py_blocks if _parses(b)) + score += valid / len(py_blocks) + parts += 1 + + # Reasonable length (not too short, not too long) + word_count = len(candidate_text.split()) + if 200 <= word_count <= 5000: + score += 1.0 + parts += 1 + + final = score / parts if parts > 0 else 0.0 + + trajectories.append({ + "data": data_inst, + "full_assistant_response": candidate_text[:200], + "score": final, + }) + + return EvaluationBatch(trajectories=trajectories) + + def make_reflective_dataset( + self, + candidate: dict[str, str], + eval_batch: EvaluationBatch, + components_to_update: list[str], + ) -> dict[str, list[dict[str, Any]]]: + reflective_data: dict[str, list[dict[str, Any]]] = {} + for component in components_to_update: + examples = [] + for traj in eval_batch.trajectories: + if traj.get("score", 1.0) < 0.8: + 
examples.append({ + "input": traj.get("data", {}).get("input", ""), + "current_text": candidate.get(component, ""), + "feedback": "Skill structural quality is below threshold.", + "score": traj.get("score", 0.0), + }) + reflective_data[component] = examples + return reflective_data + + +def _parses(code: str) -> bool: + """Check if Python code parses without syntax errors.""" + import ast + try: + ast.parse(code) + return True + except SyntaxError: + return False + + +def run_gskill( + repo_path: str | Path, + skill_name: str | None = None, + output_dir: str | Path | None = None, + preset: str = "standard", + context_files: list[str] | None = None, +) -> dict[str, Any]: + """Generate an optimized skill for a customer repository using GEPA. + + Scans the repo for Databricks patterns, generates a SKILL.md optimized + for Claude Code consumption, and outputs to .claude/skills//SKILL.md. + + Args: + repo_path: Path to the customer's repository + skill_name: Name for the generated skill (auto-detected if None) + output_dir: Override output directory + preset: GEPA optimization preset + context_files: Additional files to provide as context + + Returns: + Dict with generated skill path, quality score, and metadata + """ + repo_path = Path(repo_path).resolve() + if not repo_path.exists(): + raise FileNotFoundError(f"Repository not found: {repo_path}") + + preset_config = get_preset(preset) + + if skill_name is None: + skill_name = repo_path.name + + # Gather repo context + repo_context = _scan_repo(repo_path, context_files) + + # Build seed candidate + seed_content = ( + f"# {skill_name}\n\n" + "## Overview\n\n" + f"Patterns and best practices for the {skill_name} project.\n\n" + + repo_context + ) + + seed_candidate = {_GSkillAdapter.SKILL_KEY: seed_content} + + # Build a synthetic trainset from the repo + trainset = [ + { + "input": f"Help me understand the patterns in {skill_name}", + "additional_context": {}, + "answer": "", + }, + { + "input": f"Show me code examples 
from {skill_name}", + "additional_context": {}, + "answer": "", + }, + ] + + adapter = _GSkillAdapter() + + # Run GEPA + result = gepa.optimize( + seed_candidate=seed_candidate, + trainset=trainset, + adapter=adapter, + **preset_config.to_kwargs(), + ) + + generated_content = result.best_candidate.get(_GSkillAdapter.SKILL_KEY, seed_content) + + # Write output + if output_dir is None: + output_dir = repo_path / ".claude" / "skills" / skill_name + else: + output_dir = Path(output_dir) + + output_dir.mkdir(parents=True, exist_ok=True) + skill_path = output_dir / "SKILL.md" + skill_path.write_text(generated_content) + + return { + "skill_name": skill_name, + "skill_path": str(skill_path), + "content_length": len(generated_content), + "repo_path": str(repo_path), + "preset": preset, + } + + +def _scan_repo(repo_path: Path, context_files: list[str] | None = None) -> str: + """Scan repository for Databricks-relevant patterns and build context.""" + context_parts = [] + + # Read explicitly provided context files + if context_files: + for f in context_files: + p = Path(f) if Path(f).is_absolute() else repo_path / f + if p.exists(): + content = p.read_text()[:5000] # Cap at 5K per file + context_parts.append(f"### {p.name}\n\n```\n{content}\n```\n") + + # Auto-scan for README + readme = repo_path / "README.md" + if readme.exists() and not context_files: + context_parts.append(f"### README\n\n{readme.read_text()[:3000]}\n") + + return "\n".join(context_parts) if context_parts else "" diff --git a/.test/src/skill_test/optimize/review.py b/.test/src/skill_test/optimize/review.py new file mode 100644 index 00000000..0b5f1115 --- /dev/null +++ b/.test/src/skill_test/optimize/review.py @@ -0,0 +1,191 @@ +"""Review and apply workflow for optimization results. + +Provides human-readable output of optimization results and the ability +to apply the optimized SKILL.md to the repository. 
+""" + +import difflib +from pathlib import Path + +from .runner import OptimizationResult +from .evaluator import _find_skill_md + + +def review_optimization(result: OptimizationResult) -> None: + """Print optimization summary for human review. + + Shows: score improvement, token reduction, diff of changed sections, + per-test-case score breakdown, validation set performance. + """ + print(f"\n{'=' * 60}") + print(f" Optimization Results: {result.skill_name}") + print(f"{'=' * 60}") + + # Quality scores + improvement_sign = "+" if result.improvement >= 0 else "" + print(f" Quality: {result.original_score:.3f} -> {result.optimized_score:.3f} " + f"({improvement_sign}{result.improvement:.3f})") + + # Token counts + reduction_sign = "+" if result.token_reduction_pct >= 0 else "" + print(f" Tokens: {result.original_token_count:,} -> {result.optimized_token_count:,} " + f"({reduction_sign}{result.token_reduction_pct:.1f}%)") + + # Validation scores + if result.val_scores: + avg_val = sum(result.val_scores.values()) / len(result.val_scores) + print(f" Validation: avg={avg_val:.3f} ({len(result.val_scores)} cases)") + + # GEPA iterations + if result.gepa_result and hasattr(result.gepa_result, "iterations"): + print(f" Iterations: {result.gepa_result.iterations}") + + # MLflow run + if result.mlflow_run_id: + print(f" MLflow run: {result.mlflow_run_id}") + + print() + + # Diff summary + if result.diff_summary and result.diff_summary != "No changes": + print(" Changes:") + for line in result.diff_summary.split("\n"): + print(f" {line}") + print() + + # Detailed diff (first 50 lines) + if result.original_content != result.optimized_content: + diff_lines = list(difflib.unified_diff( + result.original_content.splitlines(keepends=True), + result.optimized_content.splitlines(keepends=True), + fromfile="original SKILL.md", + tofile="optimized SKILL.md", + n=2, + )) + if len(diff_lines) > 50: + print(f" Diff (first 50 of {len(diff_lines)} lines):") + for line in diff_lines[:50]: 
+ print(f" {line}", end="") + print(f"\n ... ({len(diff_lines) - 50} more lines)") + else: + print(" Diff:") + for line in diff_lines: + print(f" {line}", end="") + print() + else: + print(" No changes to SKILL.md content.") + + # Validation breakdown + if result.val_scores: + print(" Validation scores by test case:") + for task_id, score in sorted(result.val_scores.items()): + status = "PASS" if score >= 0.5 else "FAIL" + print(f" {status} {task_id}: {score:.3f}") + print() + + # Apply hint + print(f" To apply: uv run python .test/scripts/optimize.py {result.skill_name} --apply") + print(f"{'=' * 60}\n") + + +def apply_optimization(result: OptimizationResult) -> Path: + """Overwrite the original SKILL.md with the optimized version. + + Also updates baseline via existing baseline workflow if possible. + + Args: + result: OptimizationResult from optimize_skill() + + Returns: + Path to the updated SKILL.md + + Raises: + FileNotFoundError: If original SKILL.md cannot be found + ValueError: If optimization did not improve the skill + """ + skill_path = _find_skill_md(result.skill_name) + if skill_path is None: + raise FileNotFoundError(f"Cannot find SKILL.md for '{result.skill_name}'") + + if result.optimized_content == result.original_content: + print(f"No changes to apply for '{result.skill_name}'.") + return skill_path + + if result.improvement < 0: + raise ValueError( + f"Optimization regressed quality ({result.improvement:+.3f}). " + "Refusing to apply. Use --force to override." 
+ ) + + # Write optimized content + skill_path.write_text(result.optimized_content) + + print(f"Applied optimized SKILL.md to {skill_path}") + print(f" Quality: {result.original_score:.3f} -> {result.optimized_score:.3f} " + f"({result.improvement:+.3f})") + print(f" Tokens: {result.original_token_count:,} -> {result.optimized_token_count:,} " + f"({result.token_reduction_pct:+.1f}%)") + + # Try to update baseline + try: + from ..runners.compare import save_baseline + + if result.mlflow_run_id: + save_baseline( + skill_name=result.skill_name, + run_id=result.mlflow_run_id, + metrics={"optimized_score": result.optimized_score}, + test_count=len(result.val_scores) if result.val_scores else 0, + ) + print(f" Baseline updated.") + except Exception: + pass + + return skill_path + + +def format_cost_estimate( + train_count: int, + val_count: int | None, + preset: str, + mode: str, +) -> str: + """Estimate the cost of running optimization. + + Args: + train_count: Number of training tasks + val_count: Number of validation tasks (or None) + preset: Preset name + mode: "static" or "generative" + + Returns: + Human-readable cost estimate string + """ + # Rough estimates based on preset + max_calls = {"quick": 15, "standard": 50, "thorough": 150}.get(preset, 50) + + # Each metric call runs all scorers on all train tasks + calls_per_iteration = train_count + if val_count: + calls_per_iteration += val_count + + total_scorer_calls = max_calls * calls_per_iteration + + if mode == "static": + # Static mode: ~$0.001 per scorer call (just deterministic checks) + est_cost = total_scorer_calls * 0.001 + else: + # Generative mode: ~$0.01 per call (LLM generation + scoring) + est_cost = total_scorer_calls * 0.01 + + # GEPA reflection calls + reflection_cost = max_calls * 0.02 # ~$0.02 per reflection + + total = est_cost + reflection_cost + + return ( + f"Estimated cost: ~${total:.2f}\n" + f" Scorer calls: {total_scorer_calls:,} x {'$0.001' if mode == 'static' else '$0.01'}\n" + f" 
Reflection calls: {max_calls} x $0.02\n" + f" Max iterations: {max_calls}" + ) diff --git a/.test/src/skill_test/optimize/runner.py b/.test/src/skill_test/optimize/runner.py new file mode 100644 index 00000000..b811bcb3 --- /dev/null +++ b/.test/src/skill_test/optimize/runner.py @@ -0,0 +1,284 @@ +"""End-to-end orchestrator for GEPA skill optimization. + +Workflow: load skill -> split dataset -> build adapter -> optimize -> log results +""" + +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Literal + +import gepa + +from ..config import SkillTestConfig +from ..runners.evaluate import setup_mlflow +from .config import get_preset +from .evaluator import ( + SkillAdapter, + create_skill_adapter, + count_tokens, + build_optimization_background, + _find_skill_md, + token_efficiency_score, +) +from .splitter import create_gepa_datasets, generate_bootstrap_tasks, to_gepa_instances + + +@dataclass +class OptimizationResult: + """Result of a GEPA optimization run.""" + + skill_name: str + original_score: float + optimized_score: float + improvement: float + original_content: str + optimized_content: str + original_token_count: int + optimized_token_count: int + token_reduction_pct: float + diff_summary: str + val_scores: dict[str, float] + mlflow_run_id: str | None + gepa_result: Any + + +def _compute_diff_summary(original: str, optimized: str) -> str: + """Generate a human-readable summary of changes between original and optimized.""" + import difflib + import re + + original_lines = original.splitlines(keepends=True) + optimized_lines = optimized.splitlines(keepends=True) + + diff = list(difflib.unified_diff(original_lines, optimized_lines, fromfile="original", tofile="optimized", n=1)) + + if not diff: + return "No changes" + + added = sum(1 for line in diff if line.startswith("+") and not line.startswith("+++")) + removed = sum(1 for line in diff if line.startswith("-") and not line.startswith("---")) + + summary_parts = [] + if 
def _evaluate_on_tasks(
    adapter: "SkillAdapter",
    candidate: "dict[str, str]",
    tasks: "list[dict[str, Any]]",
) -> "tuple[float, dict[str, float]]":
    """Run adapter on a set of tasks and return mean score + per-task scores."""
    gepa_instances = to_gepa_instances(tasks)
    eval_batch = adapter.evaluate(gepa_instances, candidate)

    per_task: dict[str, float] = {}
    for i, score in enumerate(eval_batch.scores):
        task_id = tasks[i].get("id", f"task_{i}")
        per_task[task_id] = score

    # Guard against empty task lists to avoid ZeroDivisionError.
    mean_score = sum(per_task.values()) / len(per_task) if per_task else 0.0
    return mean_score, per_task


def optimize_skill(
    skill_name: str,
    mode: "Literal['static', 'generative']" = "static",
    preset: "Literal['quick', 'standard', 'thorough']" = "standard",
    task_lm: "str | None" = None,
    dry_run: bool = False,
) -> "OptimizationResult":
    """Run end-to-end GEPA optimization on a skill.

    1. Load current SKILL.md as seed_candidate
    2. Create train/val datasets from ground_truth.yaml
    3. Build adapter from existing scorers
    4. Run gepa.optimize()
    5. Log results to MLflow
    6. Return OptimizationResult with original/optimized scores and diff

    Args:
        skill_name: Name of the skill to optimize
        mode: "static" (uses ground truth responses) or "generative" (generates fresh)
        preset: GEPA config preset ("quick", "standard", "thorough")
        task_lm: LLM model for generative mode
        dry_run: If True, show config and dataset info without running optimization

    Returns:
        OptimizationResult with scores, content, and diff

    Raises:
        FileNotFoundError: If SKILL.md cannot be found
        ValueError: If no training tasks could be loaded or generated
    """
    # 1. Load current SKILL.md
    skill_path = _find_skill_md(skill_name)
    if skill_path is None:
        raise FileNotFoundError(
            f"Could not find SKILL.md for '{skill_name}'. "
            "Expected at .claude/skills/{name}/SKILL.md or databricks-skills/{name}/SKILL.md"
        )

    original_content = skill_path.read_text()
    original_token_count = count_tokens(original_content)

    # 2. Create train/val datasets
    try:
        train, val = create_gepa_datasets(skill_name)
    except FileNotFoundError:
        train, val = [], None

    if not train:
        # Bootstrap mode: synthesize tasks from the SKILL.md itself.
        train = generate_bootstrap_tasks(skill_name)
        val = None
        print(f"No test cases found for '{skill_name}'. Using {len(train)} auto-generated tasks.")
        print(f"For better results, add test cases: skill-test add {skill_name}")

    if not train:
        # Bootstrap can also come back empty (e.g. SKILL.md without headers);
        # running GEPA on an empty trainset would be meaningless or crash.
        raise ValueError(
            f"No training tasks available for '{skill_name}'. "
            "Add test cases or ensure the SKILL.md has section headers."
        )

    # 3. Build adapter
    adapter = create_skill_adapter(skill_name, mode=mode, task_lm=task_lm)

    # seed_candidate is a dict with our SKILL_KEY
    seed_candidate = {SkillAdapter.SKILL_KEY: original_content}

    # 4. Get preset config
    preset_config = get_preset(preset)

    # Dry run: show info and exit
    if dry_run:
        print(f"\n=== Dry Run: {skill_name} ===")
        print(f"SKILL.md path: {skill_path}")
        print(f"Original tokens: {original_token_count:,}")
        print(f"Train tasks: {len(train)}")
        print(f"Val tasks: {len(val) if val else 'None (single-task mode)'}")
        print(f"Mode: {mode}")
        print(f"Preset: {preset} (max_metric_calls={preset_config.max_metric_calls})")
        if mode == "generative":
            print(f"Task LM: {task_lm or 'not set'}")

        # Evaluate current score
        original_score, _ = _evaluate_on_tasks(adapter, seed_candidate, train)
        print(f"Current score: {original_score:.3f}")

        return OptimizationResult(
            skill_name=skill_name,
            original_score=original_score,
            optimized_score=original_score,
            improvement=0.0,
            original_content=original_content,
            optimized_content=original_content,
            original_token_count=original_token_count,
            optimized_token_count=original_token_count,
            token_reduction_pct=0.0,
            diff_summary="Dry run - no optimization performed",
            val_scores={},
            mlflow_run_id=None,
            gepa_result=None,
        )

    # Evaluate original score
    original_score, _ = _evaluate_on_tasks(adapter, seed_candidate, train)

    # 5. Convert datasets to GEPA format
    trainset = to_gepa_instances(train)
    valset = to_gepa_instances(val) if val else None

    # 6. Run gepa.optimize()
    gepa_kwargs = {
        "seed_candidate": seed_candidate,
        "trainset": trainset,
        "adapter": adapter,
        **preset_config.to_kwargs(),
    }
    if valset:
        gepa_kwargs["valset"] = valset
    if task_lm:
        gepa_kwargs["task_lm"] = task_lm

    result = gepa.optimize(**gepa_kwargs)

    # result.best_candidate is dict[str, str]
    optimized_content = result.best_candidate.get(SkillAdapter.SKILL_KEY, original_content)
    optimized_token_count = count_tokens(optimized_content)

    # Evaluate optimized on train
    optimized_candidate = {SkillAdapter.SKILL_KEY: optimized_content}
    optimized_score, _ = _evaluate_on_tasks(adapter, optimized_candidate, train)

    # Evaluate on val if available
    val_scores: dict[str, float] = {}
    if val:
        _, val_scores = _evaluate_on_tasks(adapter, optimized_candidate, val)

    # Token reduction (positive means the optimized skill is smaller)
    token_reduction_pct = (
        (original_token_count - optimized_token_count) / original_token_count * 100
        if original_token_count > 0
        else 0.0
    )

    # Diff summary
    diff_summary = _compute_diff_summary(original_content, optimized_content)

    # 7. Log to MLflow (best-effort: optimization results are returned even
    # when tracking is unavailable or misconfigured)
    mlflow_run_id = None
    try:
        import mlflow

        stc = SkillTestConfig()
        setup_mlflow(stc)

        with mlflow.start_run(run_name=f"{skill_name}_optimize_{preset}"):
            mlflow.set_tags(
                {
                    "optimizer": "gepa",
                    "skill_name": skill_name,
                    "preset": preset,
                    "mode": mode,
                }
            )
            mlflow.log_metrics(
                {
                    "original_score": original_score,
                    "optimized_score": optimized_score,
                    "improvement": optimized_score - original_score,
                    "original_tokens": float(original_token_count),
                    "optimized_tokens": float(optimized_token_count),
                    "token_reduction_pct": token_reduction_pct,
                    "total_metric_calls": float(result.total_metric_calls or 0),
                }
            )
            mlflow_run_id = mlflow.active_run().info.run_id
    except Exception:
        pass

    return OptimizationResult(
        skill_name=skill_name,
        original_score=original_score,
        optimized_score=optimized_score,
        improvement=optimized_score - original_score,
        original_content=original_content,
        optimized_content=optimized_content,
        original_token_count=original_token_count,
        optimized_token_count=optimized_token_count,
        token_reduction_pct=token_reduction_pct,
        diff_summary=diff_summary,
        val_scores=val_scores,
        mlflow_run_id=mlflow_run_id,
        gepa_result=result,
    )
+""" + +import json +import random +import re +from collections import defaultdict +from pathlib import Path +from typing import Any, TypedDict + +from ..dataset import EvalRecord, get_dataset_source + + +class SkillTask(TypedDict, total=False): + """Internal task representation (superset of GEPA DefaultDataInst).""" + + id: str + input: str # The prompt (maps to DefaultDataInst.input) + answer: str # Expected response (maps to DefaultDataInst.answer) + additional_context: dict[str, str] # Extra context (maps to DefaultDataInst.additional_context) + expectations: dict[str, Any] # Scorer expectations (not sent to GEPA directly) + metadata: dict[str, Any] # Category, difficulty, etc. + + +def _record_to_task(record: EvalRecord) -> SkillTask: + """Convert an EvalRecord to our internal task format.""" + task: SkillTask = { + "id": record.id, + "input": record.inputs.get("prompt", ""), + "additional_context": {}, + "answer": "", + "metadata": record.metadata or {}, + } + if record.outputs: + task["answer"] = record.outputs.get("response", "") + if record.expectations: + task["expectations"] = record.expectations + # Also encode expectations into additional_context for GEPA reflection + task["additional_context"]["expectations"] = json.dumps(record.expectations) + return task + + +def to_gepa_instances(tasks: list[SkillTask]) -> list[dict[str, Any]]: + """Convert internal tasks to GEPA DefaultDataInst format. + + Returns list of {"input": str, "additional_context": dict[str,str], "answer": str} + """ + return [ + { + "input": t["input"], + "additional_context": t.get("additional_context", {}), + "answer": t.get("answer", ""), + } + for t in tasks + ] + + +def create_gepa_datasets( + skill_name: str, + val_ratio: float = 0.2, + base_path: Path | None = None, + seed: int = 42, +) -> tuple[list[SkillTask], list[SkillTask] | None]: + """Load ground_truth.yaml, stratify by metadata.category, split into train/val. 
+ + For skills with <5 test cases: uses all as train, val=None (single-task mode). + For skills with >=5 test cases: stratified train/val split (generalization mode). + + Args: + skill_name: Name of the skill to load test cases for + val_ratio: Fraction of test cases to hold out for validation + base_path: Override base path for skills directory + seed: Random seed for reproducible splits + + Returns: + Tuple of (train_tasks, val_tasks). val_tasks is None if <5 test cases. + """ + source = get_dataset_source(skill_name, base_path) + records = source.load() + + if not records: + return [], None + + tasks = [_record_to_task(r) for r in records] + + # Too few for a meaningful val split + if len(tasks) < 5: + return tasks, None + + # Stratify by category + by_category: dict[str, list[SkillTask]] = defaultdict(list) + for task in tasks: + cat = task.get("metadata", {}).get("category", "_uncategorized") + by_category[cat].append(task) + + rng = random.Random(seed) + train: list[SkillTask] = [] + val: list[SkillTask] = [] + + for cat, cat_tasks in by_category.items(): + rng.shuffle(cat_tasks) + n_val = max(1, int(len(cat_tasks) * val_ratio)) + + # Ensure at least 1 train sample per category + if len(cat_tasks) - n_val < 1: + n_val = len(cat_tasks) - 1 + + if n_val <= 0: + train.extend(cat_tasks) + else: + val.extend(cat_tasks[:n_val]) + train.extend(cat_tasks[n_val:]) + + # If val ended up empty, fall back + if not val: + return tasks, None + + return train, val + + +def generate_bootstrap_tasks(skill_name: str, base_path: Path | None = None) -> list[SkillTask]: + """Generate synthetic tasks from a SKILL.md when no ground_truth.yaml exists. + + Parses the SKILL.md for documented patterns and generates basic test prompts + that exercise each pattern. 
+ + Args: + skill_name: Name of the skill + base_path: Override base path for skills directory + + Returns: + List of synthetic SkillTask dicts + """ + if base_path is None: + # Find repo root for path resolution + from .evaluator import _find_repo_root + + repo_root = _find_repo_root() + skill_md_candidates = [ + repo_root / ".claude" / "skills" / skill_name / "SKILL.md", + repo_root / "databricks-skills" / skill_name / "SKILL.md", + ] + else: + skill_md_candidates = [base_path.parent / skill_name / "SKILL.md"] + + skill_content = None + for path in skill_md_candidates: + if path.exists(): + skill_content = path.read_text() + break + + if not skill_content: + return [] + + tasks: list[SkillTask] = [] + + # Extract h2/h3 headers as topic areas + headers = re.findall(r"^#{2,3}\s+(.+)$", skill_content, re.MULTILINE) + + for i, header in enumerate(headers): + tasks.append( + { + "id": f"bootstrap_{i:03d}", + "input": f"Using the {skill_name} skill, help me with: {header}", + "additional_context": {}, + "answer": "", + "metadata": {"category": "bootstrap", "source": "auto_generated"}, + } + ) + + # Extract code block language hints for targeted prompts + code_langs = set(re.findall(r"```(\w+)\n", skill_content)) + for lang in code_langs: + tasks.append( + { + "id": f"bootstrap_lang_{lang}", + "input": f"Show me a {lang} example using {skill_name} patterns", + "additional_context": {}, + "answer": "", + "metadata": {"category": "bootstrap", "source": "auto_generated"}, + } + ) + + return tasks or [ + { + "id": "bootstrap_general", + "input": f"Explain the key patterns in {skill_name}", + "additional_context": {}, + "answer": "", + "metadata": {"category": "bootstrap", "source": "auto_generated"}, + } + ] diff --git a/.test/tests/test_optimize_e2e.py b/.test/tests/test_optimize_e2e.py new file mode 100644 index 00000000..9299cb0f --- /dev/null +++ b/.test/tests/test_optimize_e2e.py @@ -0,0 +1,311 @@ +"""End-to-end test: optimize an existing skill, assert quality up + 
import pytest

from skill_test.optimize.evaluator import token_efficiency_score, count_tokens
from skill_test.optimize.splitter import create_gepa_datasets, generate_bootstrap_tasks, to_gepa_instances
from skill_test.optimize.asi import feedback_to_score, feedback_to_asi
from skill_test.optimize.config import PRESETS, GEPAPreset

# Optional dependencies: tests degrade gracefully when mlflow/gepa are absent.
try:
    from mlflow.entities import Feedback

    HAS_MLFLOW = True
except ImportError:
    Feedback = None
    HAS_MLFLOW = False

try:
    import gepa

    HAS_GEPA = True
except ImportError:
    HAS_GEPA = False


# --------------------------------------------------------------------------
# Step 1: Unit tests (no GEPA/LLM required)
# --------------------------------------------------------------------------


class TestTokenEfficiency:
    """Verification: token counting and efficiency scoring."""

    def test_same_size_scores_one(self):
        text = "Hello world, this is a test."
        tokens = count_tokens(text)
        assert token_efficiency_score(text, tokens) == 1.0

    def test_smaller_scores_one(self):
        assert token_efficiency_score("short", 100) == 1.0

    def test_double_size_scores_zero(self):
        # Doubling the text should drive the efficiency score to ~0.
        text = "word " * 200
        tokens = count_tokens(text)
        assert token_efficiency_score(text + text, tokens) == pytest.approx(0.0, abs=0.05)

    def test_ten_pct_larger(self):
        # Score is expected to follow max(0, min(1, 2 - ratio)).
        base = "word " * 100
        base_tokens = count_tokens(base)
        larger = base + "extra " * 10
        larger_tokens = count_tokens(larger)
        ratio = larger_tokens / base_tokens
        expected = max(0.0, min(1.0, 2.0 - ratio))
        assert token_efficiency_score(larger, base_tokens) == pytest.approx(expected, abs=0.05)

    def test_zero_original_returns_one(self):
        assert token_efficiency_score("anything", 0) == 1.0


class TestSplitter:
    """Verification: dataset splitting and bootstrap task generation."""

    def test_small_dataset_no_val(self):
        """Skills with <5 test cases should use all as train, val=None."""
        try:
            train, val = create_gepa_datasets("databricks-genie")
            if len(train) < 5:
                assert val is None
        except FileNotFoundError:
            pytest.skip("No ground_truth.yaml for databricks-genie")

    def test_model_serving_has_split(self):
        """databricks-model-serving should have enough cases for a split."""
        try:
            train, val = create_gepa_datasets("databricks-model-serving")
            assert len(train) > 0
            if len(train) + (len(val) if val else 0) >= 5:
                assert val is not None
                assert len(val) > 0
        except FileNotFoundError:
            pytest.skip("No ground_truth.yaml for databricks-model-serving")

    def test_reproducible_splits(self):
        """Same seed should produce identical splits."""
        try:
            train1, val1 = create_gepa_datasets("databricks-model-serving", seed=42)
            train2, val2 = create_gepa_datasets("databricks-model-serving", seed=42)
            assert [t["id"] for t in train1] == [t["id"] for t in train2]
            if val1 and val2:
                assert [t["id"] for t in val1] == [t["id"] for t in val2]
        except FileNotFoundError:
            pytest.skip("No ground_truth.yaml for databricks-model-serving")

    def test_tasks_have_correct_keys(self):
        """Tasks should have the expected keys for GEPA compatibility."""
        try:
            train, _ = create_gepa_datasets("databricks-model-serving")
            assert len(train) > 0
            for task in train:
                assert "id" in task
                assert "input" in task
                assert "answer" in task
                assert "additional_context" in task
        except FileNotFoundError:
            pytest.skip("No ground_truth.yaml for databricks-model-serving")

    def test_to_gepa_instances(self):
        """to_gepa_instances should produce DefaultDataInst-compatible dicts."""
        try:
            train, _ = create_gepa_datasets("databricks-model-serving")
            instances = to_gepa_instances(train)
            assert len(instances) == len(train)
            for inst in instances:
                assert "input" in inst
                assert "additional_context" in inst
                assert "answer" in inst
                # Should NOT have internal-only keys
                assert "id" not in inst
                assert "metadata" not in inst
        except FileNotFoundError:
            pytest.skip("No ground_truth.yaml for databricks-model-serving")

    def test_bootstrap_tasks_generated(self):
        """Bootstrap should generate tasks from SKILL.md headers."""
        tasks = generate_bootstrap_tasks("databricks-model-serving")
        assert len(tasks) > 0
        for task in tasks:
            assert "id" in task
            assert "input" in task
            assert "additional_context" in task
            assert "metadata" in task


@pytest.mark.skipif(not HAS_MLFLOW, reason="mlflow not installed")
class TestASI:
    """Verification: Feedback -> GEPA score conversion."""

    def test_yes_scores_one(self):
        assert feedback_to_score(Feedback(name="test", value="yes")) == 1.0

    def test_no_scores_zero(self):
        assert feedback_to_score(Feedback(name="test", value="no")) == 0.0

    def test_skip_returns_none(self):
        assert feedback_to_score(Feedback(name="test", value="skip")) is None

    def test_numeric_value(self):
        assert feedback_to_score(Feedback(name="test", value="0.75")) == 0.75

    def test_feedback_to_asi_composite(self):
        feedbacks = [
            Feedback(name="syntax", value="yes", rationale="Valid"),
            Feedback(name="pattern", value="no", rationale="Missing X"),
            Feedback(name="optional", value="skip", rationale="N/A"),
        ]
        score, diag = feedback_to_asi(feedbacks)
        # Mean of [1.0, 0.0] = 0.5 (skipped feedbacks excluded)
        assert score == pytest.approx(0.5)
        assert diag["syntax"]["score"] == 1.0
        assert diag["pattern"]["score"] == 0.0
        assert diag["optional"]["status"] == "skipped"
        assert diag["_summary"]["scored"] == 2
        assert diag["_summary"]["skipped"] == 1
        # Failure messages collected
        assert len(diag["_failure_messages"]) >= 1


class TestConfig:
    """Verification: GEPA config presets."""

    def test_presets_exist(self):
        assert "quick" in PRESETS
        assert "standard" in PRESETS
        assert "thorough" in PRESETS

    def test_quick_has_fewer_calls(self):
        assert PRESETS["quick"].max_metric_calls < PRESETS["standard"].max_metric_calls

    def test_thorough_has_most_calls(self):
        assert PRESETS["thorough"].max_metric_calls > PRESETS["standard"].max_metric_calls

    def test_to_kwargs(self):
        kwargs = PRESETS["quick"].to_kwargs()
        assert "max_metric_calls" in kwargs
        assert "reflection_lm" in kwargs
        assert kwargs["max_metric_calls"] == 15


# --------------------------------------------------------------------------
# Step 6: New skill test (bootstrap mode)
# --------------------------------------------------------------------------


class TestBootstrapMode:
    """Verification: new skills without ground_truth.yaml can still bootstrap."""

    def test_nonexistent_skill_returns_empty(self):
        tasks = generate_bootstrap_tasks("nonexistent-skill-xyz")
        # No SKILL.md found -> empty list
        assert tasks == []

    def test_bootstrap_has_gepa_format(self):
        """Bootstrap tasks should be GEPA-compatible after conversion."""
        tasks = generate_bootstrap_tasks("databricks-model-serving")
        if not tasks:
            pytest.skip("No SKILL.md found for databricks-model-serving")
        instances = to_gepa_instances(tasks)
        for inst in instances:
            assert isinstance(inst["input"], str)
            assert isinstance(inst["additional_context"], dict)
            assert isinstance(inst["answer"], str)


# --------------------------------------------------------------------------
# Step 2: Dry run (requires adapter but not GEPA optimization)
# --------------------------------------------------------------------------


@pytest.mark.skipif(not HAS_GEPA, reason="gepa not installed")
class TestDryRun:
    """Verification: dry run shows config without calling GEPA."""

    def test_dry_run_returns_result(self):
        from skill_test.optimize.runner import optimize_skill

        try:
            result = optimize_skill(
                skill_name="databricks-model-serving",
                mode="static",
                preset="quick",
                dry_run=True,
            )
            # Dry run must be a no-op: identical content, no GEPA result.
            assert result.improvement == 0.0
            assert result.original_content == result.optimized_content
            assert result.gepa_result is None
            assert result.original_token_count > 0
            print(f"\nDry run score: {result.original_score:.3f}")
            print(f"Original tokens: {result.original_token_count:,}")
        except FileNotFoundError:
            pytest.skip("SKILL.md not found for databricks-model-serving")


# --------------------------------------------------------------------------
# Steps 3-5, 7-8: E2E integration (requires GEPA + LLM API key)
# --------------------------------------------------------------------------


@pytest.mark.skipif(not HAS_GEPA, reason="gepa not installed")
@pytest.mark.slow
class TestOptimizeE2E:
    """End-to-end optimization test.

    Picks an existing skill, runs GEPA optimization, and asserts both
    quality improvement and token reduction.
    """

    def test_optimize_improves_quality_and_reduces_tokens(self):
        """Optimize databricks-spark-declarative-pipelines (largest skill).

        Asserts:
        1. Quality score does not regress
        2. Token count does not increase by >5%
        3. Validation set score within 5% of train (no overfitting)
        """
        from skill_test.optimize.runner import optimize_skill

        result = optimize_skill(
            skill_name="databricks-spark-declarative-pipelines",
            mode="static",
            preset="quick",
        )

        # 1. Quality must not regress
        assert result.optimized_score >= result.original_score, (
            f"Quality regressed: {result.original_score:.3f} -> {result.optimized_score:.3f}"
        )

        # 2. Token count must not increase significantly
        assert result.optimized_token_count <= result.original_token_count * 1.05, (
            f"Tokens grew: {result.original_token_count:,} -> {result.optimized_token_count:,}"
        )

        # 3. No overfitting
        if result.val_scores:
            avg_val = sum(result.val_scores.values()) / len(result.val_scores)
            assert avg_val >= result.optimized_score - 0.05, (
                f"Overfitting: train={result.optimized_score:.3f}, val={avg_val:.3f}"
            )

        print(f"\n=== E2E Results ===")
        print(f"Quality: {result.original_score:.3f} -> {result.optimized_score:.3f} "
              f"({result.improvement:+.3f})")
        print(f"Tokens: {result.original_token_count:,} -> {result.optimized_token_count:,} "
              f"({result.token_reduction_pct:+.1f}%)")
        if result.mlflow_run_id:
            print(f"MLflow: {result.mlflow_run_id}")
Validation set score within 5% of train (no overfitting) + """ + from skill_test.optimize.runner import optimize_skill + + result = optimize_skill( + skill_name="databricks-spark-declarative-pipelines", + mode="static", + preset="quick", + ) + + # 1. Quality must not regress + assert result.optimized_score >= result.original_score, ( + f"Quality regressed: {result.original_score:.3f} -> {result.optimized_score:.3f}" + ) + + # 2. Token count must not increase significantly + assert result.optimized_token_count <= result.original_token_count * 1.05, ( + f"Tokens grew: {result.original_token_count:,} -> {result.optimized_token_count:,}" + ) + + # 3. No overfitting + if result.val_scores: + avg_val = sum(result.val_scores.values()) / len(result.val_scores) + assert avg_val >= result.optimized_score - 0.05, ( + f"Overfitting: train={result.optimized_score:.3f}, val={avg_val:.3f}" + ) + + print(f"\n=== E2E Results ===") + print(f"Quality: {result.original_score:.3f} -> {result.optimized_score:.3f} " + f"({result.improvement:+.3f})") + print(f"Tokens: {result.original_token_count:,} -> {result.optimized_token_count:,} " + f"({result.token_reduction_pct:+.1f}%)") + if result.mlflow_run_id: + print(f"MLflow: {result.mlflow_run_id}") From 24419cb55712610ee7bd5b0da252012b55840c38 Mon Sep 17 00:00:00 2001 From: CAholder Date: Sat, 21 Feb 2026 21:42:14 -0800 Subject: [PATCH 02/20] optimization gepa updates and a demo/tutorial notebook --- .test/README.md | 230 +++++----- .../gepa_skill_optimization_demo.ipynb | 406 ++++++++++++++++++ .test/scripts/optimize.py | 7 + .test/src/skill_test/cli/__init__.py | 5 + .test/src/skill_test/cli/commands.py | 5 +- .test/src/skill_test/optimize/config.py | 32 +- .test/src/skill_test/optimize/runner.py | 7 +- 7 files changed, 551 insertions(+), 141 deletions(-) create mode 100644 .test/notebooks/gepa_skill_optimization_demo.ipynb diff --git a/.test/README.md b/.test/README.md index d5c8fe46..91043d66 100644 --- a/.test/README.md +++ 
b/.test/README.md @@ -1,23 +1,95 @@ -# Skill Testing Framework +# Skill Testing & Optimization Framework -Test Databricks skills with real execution on serverless compute. +Evaluate and optimize Databricks SKILL.md files using automated scorers and [GEPA](https://github.com/gepa-ai/gepa)-powered optimization. -**Note:** This framework is for contributors only and is not distributed via install_skills.sh. +## Quick Start: Optimize a Skill + +One command evaluates a skill's current quality, runs GEPA optimization, and shows the results: + +```bash +uv run python .test/scripts/optimize.py databricks-model-serving --preset quick --apply +``` + +This will: +1. Load the SKILL.md and its test cases from `ground_truth.yaml` +2. Score the current skill against deterministic scorers (syntax, patterns, APIs, facts) +3. Run GEPA's optimization loop (reflect on failures, propose mutations, select via Pareto frontier) +4. Show a diff with quality improvement and token reduction +5. Apply the optimized SKILL.md back to disk ## Setup ```bash -uv pip install -e ".test/[dev]" -.test/install_skill_test.sh +# Install with optimization dependencies +uv pip install -e ".test/[all]" + +# Authentication for the reflection model (pick one) +# Option A: Databricks Model Serving (default) +export DATABRICKS_API_KEY="dapi..." +export DATABRICKS_API_BASE="https://<workspace>.cloud.databricks.com" + +# Option B: OpenAI +export OPENAI_API_KEY="sk-..." 
+export GEPA_REFLECTION_LM="openai/gpt-4o" +``` + +--- + +## Optimization Commands + +### Evaluate + Optimize a Skill + +```bash +# Standard optimization (50 iterations) +uv run python .test/scripts/optimize.py <skill-name> + +# Quick pass (15 iterations, good for initial check) +uv run python .test/scripts/optimize.py <skill-name> --preset quick + +# Thorough optimization (150 iterations, production quality) +uv run python .test/scripts/optimize.py <skill-name> --preset thorough + +# Dry run: see scores and config without calling GEPA +uv run python .test/scripts/optimize.py <skill-name> --dry-run + +# Optimize and apply the result +uv run python .test/scripts/optimize.py <skill-name> --apply + +# Optimize all skills that have test cases +uv run python .test/scripts/optimize.py --all --preset quick ``` -Requires a Databricks workspace with serverless SQL/compute enabled. +### Changing the Reflection Model + +GEPA uses a reflection LM to analyze scorer failures and propose skill improvements. The default is **Databricks Model Serving** (`databricks-gpt-5-2`). 
+ +| Method | Example | +|--------|---------| +| Environment variable | `export GEPA_REFLECTION_LM="databricks/databricks-gpt-5-2"` | +| CLI flag | `--reflection-lm "openai/gpt-4o"` | +| Python | `optimize_skill("my-skill", reflection_lm="anthropic/claude-sonnet-4-5-20250514")` | + +Model strings use [litellm provider prefixes](https://docs.litellm.ai/docs/providers): + +| Provider | Prefix | Example | +|----------|--------|---------| +| Databricks Model Serving | `databricks/` | `databricks/databricks-gpt-5-2` | +| OpenAI | `openai/` | `openai/gpt-4o` | +| Anthropic | `anthropic/` | `anthropic/claude-sonnet-4-5-20250514` | + +### Authentication + +| Provider | Required Environment Variables | +|----------|-------------------------------| +| Databricks | `DATABRICKS_API_KEY`, `DATABRICKS_API_BASE` | +| OpenAI | `OPENAI_API_KEY` | +| Anthropic | `ANTHROPIC_API_KEY` | --- -## New Skill Journey +## Building Test Cases for a Skill -Complete workflow for testing a skill from scratch (e.g., `mlflow-evaluation`). +Skills need test cases before optimization can work well. The workflow: ### 1. Initialize Test Scaffolding @@ -25,11 +97,7 @@ Complete workflow for testing a skill from scratch (e.g., `mlflow-evaluation`). /skill-test init ``` -Claude will: -1. Read the skill's SKILL.md documentation -2. Generate `manifest.yaml` with appropriate scorers -3. Create empty `ground_truth.yaml` and `candidates.yaml` templates -4. Recommend test prompts based on documentation +Generates `manifest.yaml` with scorer config, empty `ground_truth.yaml`, and `candidates.yaml`. ### 2. Add Test Cases @@ -37,87 +105,38 @@ Claude will: /skill-test add ``` -Run this with the recommended prompts from init. Claude will: -1. Ask for your test prompt -2. Invoke the skill to generate a response -3. Execute code blocks on Databricks -4. Auto-save passing tests to `ground_truth.yaml` -5. Save failing tests to `candidates.yaml` for review - -Repeat for each recommended prompt. 
+Interactively generates test cases. Passing tests go to `ground_truth.yaml`, failing ones to `candidates.yaml` for review. ### 3. Review Candidates ``` /skill-test review -``` - -Review any tests that failed execution and were saved to candidates: -1. Load pending tests from `candidates.yaml` -2. Present each with prompt, response, and execution results -3. Allow you to approve, reject, skip, or edit -4. Promote approved candidates to `ground_truth.yaml` - -For batch approval of successful tests: -``` /skill-test review --batch --filter-success ``` ### 4. Configure Scorers (Optional) -``` -/skill-test scorers -``` - -View current scorer configuration. To update: - +Edit `.test/skills//manifest.yaml` or: ``` /skill-test scorers update --add-guideline "Must use CLUSTER BY" ``` -Or edit `.test/skills//manifest.yaml` directly to: -- Add/remove scorers -- Update default guidelines -- Configure trace expectations - ### 5. Run Evaluation ``` /skill-test run ``` -Executes code blocks on Databricks or locally (depends on SKILLS, MCP, etc.) and reports pass/fail for each test in `ground_truth.yaml`. - -**Note:** Requires test cases in ground_truth.yaml (from steps 2-3). - -### 6. MLflow Evaluation (Optional) - -``` -/skill-test mlflow -``` - -Runs full evaluation with LLM judges and logs results to MLflow. Provides deeper quality assessment beyond pass/fail execution. - -### 7. Save Baseline +### 6. Save Baseline + Check Regressions ``` /skill-test baseline -``` - -Saves current metrics to `baselines//baseline.yaml`. - -### 8. Check Regressions - -After skill changes: -``` /skill-test regression ``` -Compares current pass rate against the saved baseline. - --- -## Trace Evaluation (In Progress) +## Trace Evaluation Capture Claude Code sessions and evaluate against skill expectations. @@ -133,60 +152,29 @@ mlflow autolog claude -u databricks -n "$MLFLOW_EXPERIMENT_NAME" . 
### Evaluate Traces -**Local trace file:** ``` /skill-test trace-eval --trace ~/.claude/projects/.../session.jsonl -``` - -**From MLflow run ID** (from `mlflow.search_runs`): -``` /skill-test trace-eval --run-id abc123 -``` - -**From MLflow trace ID** (from `mlflow.get_trace`): -``` -/skill-test trace-eval --trace-id tr-d416fccdab46e2dea6bad1d0bd8aaaa8 -``` - -**List available traces:** -``` -/skill-test list-traces --local /skill-test list-traces --experiment "$MLFLOW_EXPERIMENT_NAME" ``` -### Configure Expectations - -In `manifest.yaml`: -```yaml -scorers: - trace_expectations: - tool_limits: - Bash: 15 - mcp__databricks__execute_sql: 10 - token_budget: - max_total: 150000 - required_tools: - - Read - banned_tools: - - "DROP DATABASE" -``` - --- ## Command Reference -| Command | Description | -|---------|-------------| -| `run` | Execute tests against ground truth (default) | -| `init` | Generate test scaffolding from skill docs | -| `add` | Add test cases interactively | -| `review` | Review and promote candidates | -| `baseline` | Save current results as baseline | -| `regression` | Compare against baseline | -| `mlflow` | Full evaluation with LLM judges | -| `trace-eval` | Evaluate session traces | -| `list-traces` | List available traces | -| `scorers` | View/update scorer config | +| Command | Description | +|------------|------------------------------------------| +| `run` | Execute tests against ground truth | +| `init` | Generate test scaffolding from skill docs| +| `add` | Add test cases interactively | +| `review` | Review and promote candidates | +| `baseline` | Save current results as baseline | +| `regression` | Compare against baseline | +| `mlflow` | Full evaluation with LLM judges | +| `optimize` | Optimize skill with GEPA | +| `trace-eval` | Evaluate session traces | +| `list-traces` | List available traces | +| `scorers` | View/update scorer config | --- @@ -204,28 +192,6 @@ scorers: --- -## Test Case Format - -```yaml -test_cases: - - id: 
"eval_basic_001" - inputs: - prompt: "Create a scorer for response length" - outputs: - response: | - ```python - @scorer - def response_length(outputs): - return Feedback(name="length", value=len(outputs["response"])) - ``` - execution_success: true - expectations: - expected_facts: ["@scorer", "Feedback"] - guidelines: ["Must use mlflow.genai.scorers"] -``` - ---- - ## CI/CD ```bash diff --git a/.test/notebooks/gepa_skill_optimization_demo.ipynb b/.test/notebooks/gepa_skill_optimization_demo.ipynb new file mode 100644 index 00000000..7d8cba92 --- /dev/null +++ b/.test/notebooks/gepa_skill_optimization_demo.ipynb @@ -0,0 +1,406 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# GEPA Skill Optimization Demo\n", + "\n", + "This notebook demonstrates how the skill-test framework uses [GEPA](https://github.com/gepa-ai/gepa) to automatically optimize Databricks SKILL.md files for **quality** and **token efficiency**.\n", + "\n", + "SKILL.md files teach AI agents (like Claude Code) Databricks patterns. Every token in a skill consumes agent context window budget, so skills should be as concise and high-quality as possible.\n", + "\n", + "**What GEPA does:**\n", + "1. Scores the current SKILL.md against deterministic scorers (syntax, patterns, APIs, facts)\n", + "2. Reflects on failures and proposes mutations to improve the skill\n", + "3. Selects the best candidate via Pareto frontier optimization\n", + "4. 
Repeats until quality converges or budget is exhausted" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "from pathlib import Path\n", + "\n", + "# Add skill-test to path\n", + "repo_root = Path(\".\").resolve()\n", + "while not (repo_root / \".test\" / \"src\").exists() and repo_root != repo_root.parent:\n", + " repo_root = repo_root.parent\n", + "sys.path.insert(0, str(repo_root / \".test\" / \"src\"))\n", + "\n", + "print(f\"Repo root: {repo_root}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "# Configure the reflection model -- pick ONE:\n", + "\n", + "# Option A: Databricks Model Serving (default, recommended)\n", + "# os.environ[\"DATABRICKS_API_KEY\"] = \"dapi...\" \n", + "# os.environ[\"DATABRICKS_API_BASE\"] = \"https://.cloud.databricks.com\"\n", + "# os.environ[\"GEPA_REFLECTION_LM\"] = \"databricks/databricks-gpt-5-2\"\n", + "\n", + "# Option B: OpenAI\n", + "# os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"\n", + "# os.environ[\"GEPA_REFLECTION_LM\"] = \"openai/gpt-4o\"\n", + "\n", + "print(f\"Reflection LM: {os.environ.get('GEPA_REFLECTION_LM', 'databricks/databricks-gpt-5-2 (default)')}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 1: Inspect the Skill\n", + "\n", + "Let's look at the `databricks-model-serving` skill -- its current size, test cases, and baseline score." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "SKILL_NAME = \"databricks-model-serving\"\n", + "\n", + "from skill_test.optimize.evaluator import _find_skill_md, count_tokens\n", + "from skill_test.optimize.splitter import create_gepa_datasets\n", + "\n", + "# Load skill\n", + "skill_path = _find_skill_md(SKILL_NAME)\n", + "original_content = skill_path.read_text()\n", + "original_tokens = count_tokens(original_content)\n", + "\n", + "# Load test cases\n", + "train, val = create_gepa_datasets(SKILL_NAME)\n", + "\n", + "print(f\"Skill: {SKILL_NAME}\")\n", + "print(f\"Path: {skill_path}\")\n", + "print(f\"Lines: {len(original_content.splitlines())}\")\n", + "print(f\"Tokens: {original_tokens:,}\")\n", + "print(f\"Train cases: {len(train)}\")\n", + "print(f\"Val cases: {len(val) if val else 'None'}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Show first few test cases\n", + "for t in train[:3]:\n", + " print(f\"\\n--- {t['id']} ---\")\n", + " print(f\"Prompt: {t['input'][:100]}...\")\n", + " if t.get('answer'):\n", + " print(f\"Answer: {t['answer'][:100]}...\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 2: Evaluate Current Quality (Baseline)\n", + "\n", + "Before optimizing, measure the current skill quality using the scorer pipeline." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from skill_test.optimize.evaluator import create_skill_adapter, SkillAdapter\n", + "from skill_test.optimize.splitter import to_gepa_instances\n", + "\n", + "adapter = create_skill_adapter(SKILL_NAME)\n", + "seed_candidate = {SkillAdapter.SKILL_KEY: original_content}\n", + "\n", + "# Evaluate on all train tasks\n", + "gepa_instances = to_gepa_instances(train)\n", + "eval_batch = adapter.evaluate(gepa_instances, seed_candidate)\n", + "\n", + "print(f\"{'Task ID':<35} {'Score':>8}\")\n", + "print(\"-\" * 45)\n", + "for i, score in enumerate(eval_batch.scores):\n", + " task_id = train[i]['id']\n", + " status = 'PASS' if score >= 0.5 else 'FAIL'\n", + " print(f\"{task_id:<35} {score:>7.3f} {status}\")\n", + "\n", + "baseline_score = sum(eval_batch.scores) / len(eval_batch.scores)\n", + "print(f\"\\nBaseline Score: {baseline_score:.3f}\")\n", + "print(f\"Token Count: {original_tokens:,}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 3: Run GEPA Optimization\n", + "\n", + "Now run the optimization. 
GEPA will:\n", + "- Use the current SKILL.md as the seed candidate\n", + "- Run scorers against each test case\n", + "- Reflect on failures to propose mutations\n", + "- Select the best candidate via Pareto frontier\n", + "- Penalize token bloat (80% quality, 20% efficiency weighting)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from skill_test.optimize.runner import optimize_skill\n", + "\n", + "result = optimize_skill(\n", + " skill_name=SKILL_NAME,\n", + " mode=\"static\",\n", + " preset=\"quick\", # 15 iterations -- increase to \"standard\" (50) or \"thorough\" (150) for better results\n", + ")\n", + "\n", + "print(f\"Optimization complete!\")\n", + "print(f\"GEPA metric calls: {result.gepa_result.total_metric_calls}\")\n", + "print(f\"Candidates explored: {result.gepa_result.num_candidates}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 4: Results Comparison\n", + "\n", + "Compare the original vs. optimized skill across quality and token efficiency." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"=\" * 60)\n", + "print(f\" OPTIMIZATION RESULTS: {SKILL_NAME}\")\n", + "print(\"=\" * 60)\n", + "print()\n", + "\n", + "# Quality comparison\n", + "quality_delta = result.improvement\n", + "quality_pct = (quality_delta / result.original_score * 100) if result.original_score > 0 else 0\n", + "print(f\" Quality Score\")\n", + "print(f\" Before: {result.original_score:.3f}\")\n", + "print(f\" After: {result.optimized_score:.3f}\")\n", + "print(f\" Delta: {quality_delta:+.3f} ({quality_pct:+.1f}%)\")\n", + "print()\n", + "\n", + "# Token comparison \n", + "token_delta = result.original_token_count - result.optimized_token_count\n", + "print(f\" Token Count\")\n", + "print(f\" Before: {result.original_token_count:,}\")\n", + "print(f\" After: {result.optimized_token_count:,}\")\n", + "print(f\" Saved: {token_delta:,} tokens ({result.token_reduction_pct:.1f}% reduction)\")\n", + "print()\n", + "\n", + "# Line count comparison\n", + "orig_lines = len(result.original_content.splitlines())\n", + "opt_lines = len(result.optimized_content.splitlines())\n", + "print(f\" Lines\")\n", + "print(f\" Before: {orig_lines}\")\n", + "print(f\" After: {opt_lines}\")\n", + "print(f\" Saved: {orig_lines - opt_lines} lines\")\n", + "print()\n", + "\n", + "# Validation scores\n", + "if result.val_scores:\n", + " avg_val = sum(result.val_scores.values()) / len(result.val_scores)\n", + " print(f\" Validation (held-out test cases)\")\n", + " for tid, score in result.val_scores.items():\n", + " print(f\" {tid}: {score:.3f}\")\n", + " print(f\" Average: {avg_val:.3f}\")\n", + "\n", + "print()\n", + "print(\"=\" * 60)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Visual comparison bar chart\n", + "try:\n", + " import matplotlib.pyplot as plt\n", + " import matplotlib\n", + " 
matplotlib.rcParams['font.family'] = 'monospace'\n", + "\n", + " fig, axes = plt.subplots(1, 2, figsize=(12, 5))\n", + "\n", + " # Quality scores\n", + " ax = axes[0]\n", + " bars = ax.bar(\n", + " ['Before', 'After'],\n", + " [result.original_score, result.optimized_score],\n", + " color=['#d4534b', '#4a9c5d'],\n", + " width=0.5\n", + " )\n", + " ax.set_ylim(0, 1.1)\n", + " ax.set_ylabel('Quality Score')\n", + " ax.set_title(f'Quality: {result.original_score:.3f} → {result.optimized_score:.3f}')\n", + " for bar, val in zip(bars, [result.original_score, result.optimized_score]):\n", + " ax.text(bar.get_x() + bar.get_width()/2., bar.get_height() + 0.02,\n", + " f'{val:.3f}', ha='center', fontweight='bold')\n", + "\n", + " # Token counts\n", + " ax = axes[1]\n", + " bars = ax.bar(\n", + " ['Before', 'After'],\n", + " [result.original_token_count, result.optimized_token_count],\n", + " color=['#d4534b', '#4a9c5d'],\n", + " width=0.5\n", + " )\n", + " ax.set_ylabel('Token Count')\n", + " ax.set_title(f'Tokens: {result.original_token_count:,} → {result.optimized_token_count:,} ({result.token_reduction_pct:.0f}% reduction)')\n", + " for bar, val in zip(bars, [result.original_token_count, result.optimized_token_count]):\n", + " ax.text(bar.get_x() + bar.get_width()/2., bar.get_height() + 50,\n", + " f'{val:,}', ha='center', fontweight='bold')\n", + "\n", + " fig.suptitle(f'GEPA Optimization: {SKILL_NAME}', fontsize=14, fontweight='bold')\n", + " plt.tight_layout()\n", + " plt.show()\n", + "except ImportError:\n", + " print(\"(matplotlib not installed -- skipping chart)\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 5: Review the Diff\n", + "\n", + "Inspect what GEPA changed in the SKILL.md." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from skill_test.optimize.review import review_optimization\n", + "\n", + "review_optimization(result)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 6: Apply (Optional)\n", + "\n", + "If the results look good, apply the optimized SKILL.md. Uncomment the cell below to write it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Uncomment to apply:\n", + "# from skill_test.optimize.review import apply_optimization\n", + "# apply_optimization(result)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Using a Different Reflection Model\n", + "\n", + "By default, GEPA uses `databricks/databricks-gpt-5-2` via Databricks Model Serving.\n", + "You can change this per-call:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Example: use OpenAI instead of Databricks\n", + "# result = optimize_skill(\n", + "# skill_name=\"databricks-model-serving\",\n", + "# preset=\"quick\",\n", + "# reflection_lm=\"openai/gpt-4o\",\n", + "# )\n", + "\n", + "# Example: use Anthropic\n", + "# result = optimize_skill(\n", + "# skill_name=\"databricks-model-serving\",\n", + "# preset=\"quick\",\n", + "# reflection_lm=\"anthropic/claude-sonnet-4-5-20250514\",\n", + "# )\n", + "\n", + "# Or set the default for all calls:\n", + "# os.environ[\"GEPA_REFLECTION_LM\"] = \"databricks/databricks-gpt-5-2\"\n", + "\n", + "print(\"See README.md for full model configuration options.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Summary\n", + "\n", + "The GEPA optimization pipeline:\n", + "\n", + "| Metric | Before | After | Change |\n", + "|--------|--------|-------|--------|\n", + "| Quality Score | `result.original_score` | `result.optimized_score` | 
`result.improvement` |\n", + "| Token Count | `result.original_token_count` | `result.optimized_token_count` | `result.token_reduction_pct`% |\n", + "\n", + "Key points:\n", + "- **Quality gate**: Existing scorers (syntax, patterns, APIs, facts) are reused as-is\n", + "- **Token efficiency**: 80/20 quality/efficiency weighting penalizes bloated skills\n", + "- **Validation split**: Held-out test cases detect overfitting\n", + "- **Reflection LM**: Configurable via `--reflection-lm` flag or `GEPA_REFLECTION_LM` env var\n", + "- **Default model**: `databricks/databricks-gpt-5-2` via Databricks Model Serving" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/.test/scripts/optimize.py b/.test/scripts/optimize.py index f0a84212..4dd29819 100644 --- a/.test/scripts/optimize.py +++ b/.test/scripts/optimize.py @@ -68,6 +68,11 @@ def main(): default=None, help="LLM model for generative mode (e.g., openai/gpt-4o)", ) + parser.add_argument( + "--reflection-lm", + default=None, + help="Override GEPA reflection model (default: GEPA_REFLECTION_LM env or databricks/databricks-gpt-5-2)", + ) parser.add_argument( "--dry-run", action="store_true", @@ -108,6 +113,7 @@ def main(): mode=args.mode, preset=args.preset, task_lm=args.task_lm, + reflection_lm=args.reflection_lm, dry_run=args.dry_run, ) review_optimization(result) @@ -136,6 +142,7 @@ def main(): mode=args.mode, preset=args.preset, task_lm=args.task_lm, + reflection_lm=args.reflection_lm, dry_run=args.dry_run, ) review_optimization(result) diff --git a/.test/src/skill_test/cli/__init__.py b/.test/src/skill_test/cli/__init__.py index 9e79785e..246d7113 100644 --- a/.test/src/skill_test/cli/__init__.py +++ b/.test/src/skill_test/cli/__init__.py @@ -150,6 +150,7 @@ def main(): opt_preset = "standard" opt_mode = "static" 
opt_task_lm = None + opt_reflection_lm = None opt_dry_run = False opt_apply = False @@ -164,6 +165,9 @@ def main(): elif args[i] == "--task-lm" and i + 1 < len(args): opt_task_lm = args[i + 1] i += 2 + elif args[i] == "--reflection-lm" and i + 1 < len(args): + opt_reflection_lm = args[i + 1] + i += 2 elif args[i] == "--dry-run": opt_dry_run = True i += 1 @@ -176,6 +180,7 @@ def main(): result = optimize( skill_name, ctx, preset=opt_preset, mode=opt_mode, task_lm=opt_task_lm, + reflection_lm=opt_reflection_lm, dry_run=opt_dry_run, apply=opt_apply, ) else: diff --git a/.test/src/skill_test/cli/commands.py b/.test/src/skill_test/cli/commands.py index 6798d68c..be619d5a 100644 --- a/.test/src/skill_test/cli/commands.py +++ b/.test/src/skill_test/cli/commands.py @@ -1407,10 +1407,11 @@ def optimize( preset: str = "standard", mode: str = "static", task_lm: Optional[str] = None, + reflection_lm: Optional[str] = None, dry_run: bool = False, apply: bool = False, ) -> Dict[str, Any]: - """Optimize a skill using GEPA's optimize_anything. + """Optimize a skill using GEPA. Runs the full optimization pipeline: evaluate -> optimize -> review. Optionally applies the optimized result to the SKILL.md. @@ -1421,6 +1422,7 @@ def optimize( preset: GEPA preset ("quick", "standard", "thorough") mode: "static" (uses ground truth) or "generative" (generates fresh responses) task_lm: LLM model for generative mode + reflection_lm: Override GEPA reflection model dry_run: Show config and estimate cost without running apply: Apply the optimized result to SKILL.md @@ -1443,6 +1445,7 @@ def optimize( mode=mode, preset=preset, task_lm=task_lm, + reflection_lm=reflection_lm, dry_run=dry_run, ) diff --git a/.test/src/skill_test/optimize/config.py b/.test/src/skill_test/optimize/config.py index 63636275..8fc59a0b 100644 --- a/.test/src/skill_test/optimize/config.py +++ b/.test/src/skill_test/optimize/config.py @@ -1,19 +1,28 @@ """GEPA configuration presets for skill optimization. 
-GEPA's optimize() accepts flat kwargs. Presets are stored as dicts -that get unpacked into gepa.optimize(**preset). +GEPA's optimize() accepts flat kwargs. Presets are stored as dataclasses +that get unpacked into gepa.optimize(**preset.to_kwargs()). + +The reflection LM defaults to Databricks Model Serving (databricks-gpt-5-2). +Override via the GEPA_REFLECTION_LM environment variable or the --reflection-lm flag. """ +import os from dataclasses import dataclass from typing import Any +DEFAULT_REFLECTION_LM = os.environ.get( + "GEPA_REFLECTION_LM", "databricks/databricks-gpt-5-2" +) + + @dataclass class GEPAPreset: """Configuration preset for gepa.optimize() calls.""" max_metric_calls: int - reflection_lm: str = "openai/gpt-4o" + reflection_lm: str = DEFAULT_REFLECTION_LM candidate_selection_strategy: str = "pareto" reflection_minibatch_size: int = 3 skip_perfect_score: bool = True @@ -38,11 +47,12 @@ def to_kwargs(self) -> dict[str, Any]: } -def get_preset(name: str) -> GEPAPreset: - """Get a GEPA config preset by name. +def get_preset(name: str, reflection_lm: str | None = None) -> GEPAPreset: + """Get a GEPA config preset by name, optionally overriding the reflection LM. Args: name: One of "quick", "standard", "thorough" + reflection_lm: Override reflection LM model string (e.g., "databricks/databricks-gpt-5-2") Returns: GEPAPreset instance @@ -52,4 +62,14 @@ def get_preset(name: str) -> GEPAPreset: """ if name not in PRESETS: raise KeyError(f"Unknown preset '{name}'. 
Choose from: {list(PRESETS.keys())}") - return PRESETS[name] + preset = PRESETS[name] + if reflection_lm: + preset = GEPAPreset( + max_metric_calls=preset.max_metric_calls, + reflection_lm=reflection_lm, + candidate_selection_strategy=preset.candidate_selection_strategy, + reflection_minibatch_size=preset.reflection_minibatch_size, + skip_perfect_score=preset.skip_perfect_score, + display_progress_bar=preset.display_progress_bar, + ) + return preset diff --git a/.test/src/skill_test/optimize/runner.py b/.test/src/skill_test/optimize/runner.py index b811bcb3..0bcf3eb1 100644 --- a/.test/src/skill_test/optimize/runner.py +++ b/.test/src/skill_test/optimize/runner.py @@ -102,6 +102,7 @@ def optimize_skill( mode: Literal["static", "generative"] = "static", preset: Literal["quick", "standard", "thorough"] = "standard", task_lm: str | None = None, + reflection_lm: str | None = None, dry_run: bool = False, ) -> OptimizationResult: """Run end-to-end GEPA optimization on a skill. @@ -118,6 +119,7 @@ def optimize_skill( mode: "static" (uses ground truth responses) or "generative" (generates fresh) preset: GEPA config preset ("quick", "standard", "thorough") task_lm: LLM model for generative mode + reflection_lm: Override reflection LM (default: GEPA_REFLECTION_LM env or databricks/databricks-gpt-5-2) dry_run: If True, show config and dataset info without running optimization Returns: @@ -156,8 +158,8 @@ def optimize_skill( # seed_candidate is a dict with our SKILL_KEY seed_candidate = {SkillAdapter.SKILL_KEY: original_content} - # 4. Get preset config - preset_config = get_preset(preset) + # 4. 
Get preset config (with optional reflection LM override) + preset_config = get_preset(preset, reflection_lm=reflection_lm) # Dry run: show info and exit if dry_run: @@ -168,6 +170,7 @@ def optimize_skill( print(f"Val tasks: {len(val) if val else 'None (single-task mode)'}") print(f"Mode: {mode}") print(f"Preset: {preset} (max_metric_calls={preset_config.max_metric_calls})") + print(f"Reflection LM: {preset_config.reflection_lm}") if mode == "generative": print(f"Task LM: {task_lm or 'not set'}") From 5f0c102010b3fa6f0922791b0404a5299fd63c1a Mon Sep 17 00:00:00 2001 From: CAholder Date: Sat, 21 Feb 2026 22:48:30 -0800 Subject: [PATCH 03/20] fixed config json extra parameter issues --- databricks-builder-app/app.yaml | 4 ++-- databricks-builder-app/requirements.txt | 2 +- databricks-builder-app/server/services/agent.py | 10 +++++++--- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/databricks-builder-app/app.yaml b/databricks-builder-app/app.yaml index 5efe3f00..94694545 100644 --- a/databricks-builder-app/app.yaml +++ b/databricks-builder-app/app.yaml @@ -92,9 +92,9 @@ env: # instead of hitting Anthropic directly. The app dynamically sets ANTHROPIC_BASE_URL # and ANTHROPIC_AUTH_TOKEN from the user's Databricks credentials at runtime. 
- name: ANTHROPIC_MODEL - value: "databricks-claude-opus-4-5" + value: "databricks-claude-opus-4-6" - name: ANTHROPIC_MODEL_MINI - value: "databricks-claude-sonnet-4-5" + value: "databricks-claude-sonnet-4-6" - name: CLAUDE_CODE_STREAM_CLOSE_TIMEOUT value: "3600000" diff --git a/databricks-builder-app/requirements.txt b/databricks-builder-app/requirements.txt index e9287669..48b5cb7c 100644 --- a/databricks-builder-app/requirements.txt +++ b/databricks-builder-app/requirements.txt @@ -16,7 +16,7 @@ greenlet>=3.0.0 psycopg2-binary>=2.9.11 # Claude Agent SDK -claude-agent-sdk>=0.1.19 +claude-agent-sdk>=0.1.39 anthropic>=0.42.0 # MLflow for tracing Claude Code conversations diff --git a/databricks-builder-app/server/services/agent.py b/databricks-builder-app/server/services/agent.py index 1f3cd0f7..a7e0f42c 100644 --- a/databricks-builder-app/server/services/agent.py +++ b/databricks-builder-app/server/services/agent.py @@ -412,11 +412,15 @@ async def stream_agent_response( claude_env['ANTHROPIC_AUTH_TOKEN'] = databricks_token # Set the model to use (required for Databricks FMAPI) - anthropic_model = os.environ.get('ANTHROPIC_MODEL', 'databricks-claude-opus-4-5') + anthropic_model = os.environ.get('ANTHROPIC_MODEL', 'databricks-claude-opus-4-6') claude_env['ANTHROPIC_MODEL'] = anthropic_model - # Disable beta headers for Databricks FMAPI compatibility - claude_env['ANTHROPIC_CUSTOM_HEADERS'] = 'x-databricks-disable-beta-headers: true' + # Disable beta headers and experimental betas for Databricks FMAPI compatibility + # ANTHROPIC_CUSTOM_HEADERS enables coding agent mode on FMAPI + # CLAUDE_CODE_DISABLE_EXPERIMENTAL_BETAS prevents context_management and other + # experimental body parameters that FMAPI doesn't support (400: Extra inputs not permitted) + claude_env['ANTHROPIC_CUSTOM_HEADERS'] = 'x-databricks-use-coding-agent-mode: true' + claude_env['CLAUDE_CODE_DISABLE_EXPERIMENTAL_BETAS'] = '1' logger.info(f'Configured Databricks model serving: {anthropic_base_url} 
with model {anthropic_model}') logger.info(f'Claude env vars: BASE_URL={claude_env.get("ANTHROPIC_BASE_URL")}, MODEL={claude_env.get("ANTHROPIC_MODEL")}') From 33ae5a52175a0c65f58060cf26ab117c4ddfa0f4 Mon Sep 17 00:00:00 2001 From: CAholder Date: Sat, 21 Feb 2026 22:58:50 -0800 Subject: [PATCH 04/20] added optimizations for tools --- .test/README.md | 26 ++ .../gepa_skill_optimization_demo.ipynb | 37 +-- .test/scripts/optimize.py | 24 +- .test/src/skill_test/optimize/evaluator.py | 69 ++++- .test/src/skill_test/optimize/review.py | 47 +-- .test/src/skill_test/optimize/runner.py | 124 +++++--- .test/src/skill_test/optimize/tools.py | 271 ++++++++++++++++++ 7 files changed, 510 insertions(+), 88 deletions(-) create mode 100644 .test/src/skill_test/optimize/tools.py diff --git a/.test/README.md b/.test/README.md index 91043d66..d7626622 100644 --- a/.test/README.md +++ b/.test/README.md @@ -59,6 +59,32 @@ uv run python .test/scripts/optimize.py --apply uv run python .test/scripts/optimize.py --all --preset quick ``` +### Optimize MCP Tool Descriptions + +GEPA can also optimize the `@mcp.tool` docstrings in `databricks-mcp-server/`. Tool descriptions are what the AI agent sees when deciding which tool to call -- concise, accurate descriptions lead to better tool selection. 
+ +```bash +# Optimize a skill AND its related tool modules together +uv run python .test/scripts/optimize.py databricks-model-serving --include-tools --tool-modules serving sql + +# Optimize specific tool modules alongside a skill +uv run python .test/scripts/optimize.py databricks-model-serving --include-tools --tool-modules serving compute jobs + +# Optimize ALL tool modules alongside a skill +uv run python .test/scripts/optimize.py databricks-model-serving --include-tools + +# Optimize ONLY tool descriptions (no SKILL.md) +uv run python .test/scripts/optimize.py databricks-model-serving --tools-only --tool-modules serving + +# Dry run to see components and token counts +uv run python .test/scripts/optimize.py databricks-model-serving --include-tools --dry-run +``` + +When `--include-tools` is used, GEPA creates one component per tool module (e.g., `tools_sql`, `tools_serving`) and round-robins through them alongside `skill_md`. The `--apply` flag writes optimized docstrings back to the MCP server source files. + +Available tool modules (88 tools across 16 modules): +`agent_bricks`, `aibi_dashboards`, `apps`, `compute`, `file`, `genie`, `jobs`, `lakebase`, `manifest`, `pipelines`, `serving`, `sql`, `unity_catalog`, `user`, `vector_search`, `volume_files` + ### Changing the Reflection Model GEPA uses a reflection LM to analyze scorer failures and propose skill improvements. The default is **Databricks Model Serving** (`databricks-gpt-5-2`). 
diff --git a/.test/notebooks/gepa_skill_optimization_demo.ipynb b/.test/notebooks/gepa_skill_optimization_demo.ipynb index 7d8cba92..f63a0d93 100644 --- a/.test/notebooks/gepa_skill_optimization_demo.ipynb +++ b/.test/notebooks/gepa_skill_optimization_demo.ipynb @@ -335,38 +335,21 @@ { "cell_type": "markdown", "metadata": {}, - "source": [ - "## Using a Different Reflection Model\n", - "\n", - "By default, GEPA uses `databricks/databricks-gpt-5-2` via Databricks Model Serving.\n", - "You can change this per-call:" - ] + "source": "## Multi-Component Optimization: Skills + Tools\n\nGEPA supports optimizing multiple text components simultaneously. You can optimize SKILL.md files **alongside** MCP tool descriptions in a single run.\n\nGEPA's `RoundRobinReflectionComponentSelector` cycles through components one at a time, so each gets dedicated reflection and mutation." }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "# Example: use OpenAI instead of Databricks\n", - "# result = optimize_skill(\n", - "# skill_name=\"databricks-model-serving\",\n", - "# preset=\"quick\",\n", - "# reflection_lm=\"openai/gpt-4o\",\n", - "# )\n", - "\n", - "# Example: use Anthropic\n", - "# result = optimize_skill(\n", - "# skill_name=\"databricks-model-serving\",\n", - "# preset=\"quick\",\n", - "# reflection_lm=\"anthropic/claude-sonnet-4-5-20250514\",\n", - "# )\n", - "\n", - "# Or set the default for all calls:\n", - "# os.environ[\"GEPA_REFLECTION_LM\"] = \"databricks/databricks-gpt-5-2\"\n", - "\n", - "print(\"See README.md for full model configuration options.\")" - ] + "source": "# Inspect available MCP tools\nfrom skill_test.optimize.tools import get_tool_stats, extract_tool_descriptions, tools_to_gepa_components\n\nstats = get_tool_stats()\nprint(f\"MCP Tool Modules: {stats['modules']}\")\nprint(f\"Total Tools: {stats['total_tools']}\")\nprint(f\"Total Chars: {stats['total_description_chars']:,}\")\nprint()\nfor mod, info in 
stats[\"per_module\"].items():\n print(f\" {mod:<20} {info['tools']:>2} tools {info['chars']:>6,} chars\")\n\n# Show what GEPA components look like for selected modules\ntool_map = extract_tool_descriptions(modules=[\"serving\", \"sql\"])\ncomponents = tools_to_gepa_components(tool_map, per_module=True)\nprint(f\"\\nGEPA components for serving + sql: {list(components.keys())}\")\nfor name, text in components.items():\n from skill_test.optimize.evaluator import count_tokens\n print(f\" {name}: {count_tokens(text):,} tokens\")" + }, + { + "cell_type": "code", + "source": "## Changing the Reflection Model\n\nBy default, GEPA uses `databricks/databricks-gpt-5-2` via Databricks Model Serving.\nOverride per-call or via environment variable:\n\n```python\n# Per-call\nresult = optimize_skill(\"my-skill\", reflection_lm=\"openai/gpt-4o\")\n\n# Environment variable (persistent)\nos.environ[\"GEPA_REFLECTION_LM\"] = \"databricks/databricks-gpt-5-2\"\n```\n\nSee README.md for full model configuration options.", + "metadata": {}, + "execution_count": null, + "outputs": [] }, { "cell_type": "markdown", @@ -403,4 +386,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file diff --git a/.test/scripts/optimize.py b/.test/scripts/optimize.py index 4dd29819..b5b04c35 100644 --- a/.test/scripts/optimize.py +++ b/.test/scripts/optimize.py @@ -81,7 +81,23 @@ def main(): parser.add_argument( "--apply", action="store_true", - help="Apply the optimized SKILL.md (overwrites original)", + help="Apply the optimized SKILL.md and/or tool descriptions", + ) + parser.add_argument( + "--include-tools", + action="store_true", + help="Include MCP tool descriptions as additional optimization components", + ) + parser.add_argument( + "--tool-modules", + nargs="*", + default=None, + help="Specific tool modules to optimize (e.g., sql compute serving). 
Default: all.", + ) + parser.add_argument( + "--tools-only", + action="store_true", + help="Optimize ONLY tool descriptions, not the SKILL.md", ) args = parser.parse_args() @@ -114,6 +130,9 @@ def main(): preset=args.preset, task_lm=args.task_lm, reflection_lm=args.reflection_lm, + include_tools=args.include_tools, + tool_modules=args.tool_modules, + tools_only=args.tools_only, dry_run=args.dry_run, ) review_optimization(result) @@ -143,6 +162,9 @@ def main(): preset=args.preset, task_lm=args.task_lm, reflection_lm=args.reflection_lm, + include_tools=args.include_tools, + tool_modules=args.tool_modules, + tools_only=args.tools_only, dry_run=args.dry_run, ) review_optimization(result) diff --git a/.test/src/skill_test/optimize/evaluator.py b/.test/src/skill_test/optimize/evaluator.py index 20150938..4e5df7f7 100644 --- a/.test/src/skill_test/optimize/evaluator.py +++ b/.test/src/skill_test/optimize/evaluator.py @@ -71,6 +71,14 @@ def count_tokens(text: str) -> int: return len(enc.encode(text)) +def token_efficiency_score_raw(candidate_tokens: int, original_tokens: int) -> float: + """Score 0-1 based on token count ratio. Same logic as token_efficiency_score.""" + if original_tokens <= 0: + return 1.0 + ratio = candidate_tokens / original_tokens + return max(0.0, min(1.0, 2.0 - ratio)) + + def _run_scorer(scorer_fn: Any, outputs: dict, expectations: dict, inputs: dict) -> list[Feedback]: """Run a single scorer and normalize the result to a list of Feedbacks.""" sig = inspect.signature(scorer_fn) @@ -147,11 +155,13 @@ def _validate_skill_structure(candidate_text: str) -> list[Feedback]: class SkillAdapter(GEPAAdapter): - """GEPA adapter that evaluates candidate SKILL.md texts using existing scorers. + """GEPA adapter that evaluates candidate texts using existing scorers. 
+ + Supports multi-component optimization: + - "skill_md": SKILL.md content (primary) + - "tools_*": MCP tool description blocks (optional, one per module) - The seed_candidate dict has a single key "skill_md" whose value is the - SKILL.md content. The adapter evaluates each data instance (test case) - by running the configured scorers. + GEPA's RoundRobinReflectionComponentSelector cycles through all components. """ SKILL_KEY = "skill_md" @@ -161,15 +171,26 @@ def __init__( skill_name: str, mode: Literal["static", "generative"] = "static", task_lm: str | None = None, + original_token_counts: dict[str, int] | None = None, ): self.skill_name = skill_name self.mode = mode self.task_lm = task_lm self.scorer_config = load_scorer_config(skill_name) - # Get original token count for efficiency scoring - skill_path = _find_skill_md(skill_name) - self.original_token_count = count_tokens(skill_path.read_text()) if skill_path else 0 + # Per-component original token counts for efficiency scoring + if original_token_counts: + self.original_token_counts = original_token_counts + else: + skill_path = _find_skill_md(skill_name) + self.original_token_counts = { + self.SKILL_KEY: count_tokens(skill_path.read_text()) if skill_path else 0 + } + + @property + def original_token_count(self) -> int: + """Total original token count across all components.""" + return sum(self.original_token_counts.values()) def evaluate( self, @@ -241,8 +262,10 @@ def evaluate( # 3. Convert to score + diagnostics composite, diagnostics = feedback_to_asi(all_feedbacks) - # 4. Factor in token efficiency - efficiency = token_efficiency_score(candidate_text, self.original_token_count) + # 4. 
Factor in token efficiency (across ALL components) + total_candidate_tokens = sum(count_tokens(v) for v in candidate.values()) + total_original_tokens = self.original_token_count + efficiency = token_efficiency_score_raw(total_candidate_tokens, total_original_tokens) # Weighted composite: 80% quality, 20% token efficiency final_score = 0.8 * composite + 0.2 * efficiency @@ -300,6 +323,7 @@ def create_skill_adapter( skill_name: str, mode: Literal["static", "generative"] = "static", task_lm: str | None = None, + original_token_counts: dict[str, int] | None = None, ) -> SkillAdapter: """Create a SkillAdapter for GEPA optimization. @@ -307,15 +331,37 @@ def create_skill_adapter( skill_name: Name of the skill being optimized mode: "static" or "generative" task_lm: LLM model string for generative mode + original_token_counts: Per-component original token counts (for multi-component) Returns: Configured SkillAdapter instance """ - return SkillAdapter(skill_name=skill_name, mode=mode, task_lm=task_lm) + return SkillAdapter( + skill_name=skill_name, + mode=mode, + task_lm=task_lm, + original_token_counts=original_token_counts, + ) -def build_optimization_background(skill_name: str, original_token_count: int) -> str: +def build_optimization_background( + skill_name: str, + original_token_count: int, + component_names: list[str] | None = None, +) -> str: """Build the background context string for GEPA's reflection LM.""" + components_desc = "" + if component_names and any(c.startswith("tools_") for c in component_names): + tool_modules = [c.replace("tools_", "") for c in component_names if c.startswith("tools_")] + components_desc = ( + "\n\nYou are also optimizing MCP tool descriptions for these modules: " + f"{', '.join(tool_modules)}. " + "Tool descriptions are docstrings on @mcp.tool functions. They tell the AI agent " + "what each tool does, its parameters, and return values. Keep descriptions " + "accurate, concise, and action-oriented. 
Include usage hints that help the agent " + "choose the right tool.\n" + ) + return ( f"You are optimizing a SKILL.md file for the '{skill_name}' Databricks skill. " "SKILL.md files teach AI agents (like Claude Code) how to use specific Databricks features. " @@ -331,4 +377,5 @@ def build_optimization_background(skill_name: str, original_token_count: int) -> "Remove redundant examples, consolidate similar patterns, " "and eliminate verbose explanations that don't add value. " "Every token consumed is agent context window budget -- keep skills lean and focused." + f"{components_desc}" ) diff --git a/.test/src/skill_test/optimize/review.py b/.test/src/skill_test/optimize/review.py index 0b5f1115..908b279a 100644 --- a/.test/src/skill_test/optimize/review.py +++ b/.test/src/skill_test/optimize/review.py @@ -88,39 +88,54 @@ def review_optimization(result: OptimizationResult) -> None: print(f"{'=' * 60}\n") -def apply_optimization(result: OptimizationResult) -> Path: - """Overwrite the original SKILL.md with the optimized version. +def apply_optimization(result: OptimizationResult) -> Path | None: + """Apply optimized SKILL.md and/or tool descriptions. - Also updates baseline via existing baseline workflow if possible. 
+ Writes back: + - SKILL.md (if changed) + - MCP tool docstrings (if tools were included in optimization) Args: result: OptimizationResult from optimize_skill() Returns: - Path to the updated SKILL.md + Path to the updated SKILL.md (or None if tools_only) Raises: - FileNotFoundError: If original SKILL.md cannot be found ValueError: If optimization did not improve the skill """ - skill_path = _find_skill_md(result.skill_name) - if skill_path is None: - raise FileNotFoundError(f"Cannot find SKILL.md for '{result.skill_name}'") - - if result.optimized_content == result.original_content: - print(f"No changes to apply for '{result.skill_name}'.") - return skill_path - if result.improvement < 0: raise ValueError( f"Optimization regressed quality ({result.improvement:+.3f}). " "Refusing to apply. Use --force to override." ) - # Write optimized content - skill_path.write_text(result.optimized_content) + skill_path = None + + # Apply SKILL.md changes + if result.optimized_content and result.optimized_content != result.original_content: + skill_path = _find_skill_md(result.skill_name) + if skill_path: + skill_path.write_text(result.optimized_content) + print(f"Applied optimized SKILL.md to {skill_path}") + + # Apply tool description changes + if result.tool_map and result.components: + from .tools import parse_gepa_component, write_tool_descriptions + + all_optimized_tools = {} + for comp_name, comp_text in result.components.items(): + if comp_name.startswith("tools_"): + parsed = parse_gepa_component(comp_text) + all_optimized_tools.update(parsed) + + if all_optimized_tools: + modified = write_tool_descriptions(all_optimized_tools, result.tool_map) + if modified: + print(f"Applied optimized tool descriptions to {len(modified)} files:") + for f in modified: + print(f" {f}") - print(f"Applied optimized SKILL.md to {skill_path}") print(f" Quality: {result.original_score:.3f} -> {result.optimized_score:.3f} " f"({result.improvement:+.3f})") print(f" Tokens: 
{result.original_token_count:,} -> {result.optimized_token_count:,} " diff --git a/.test/src/skill_test/optimize/runner.py b/.test/src/skill_test/optimize/runner.py index 0bcf3eb1..bbf019ba 100644 --- a/.test/src/skill_test/optimize/runner.py +++ b/.test/src/skill_test/optimize/runner.py @@ -21,6 +21,13 @@ token_efficiency_score, ) from .splitter import create_gepa_datasets, generate_bootstrap_tasks, to_gepa_instances +from .tools import ( + extract_tool_descriptions, + tools_to_gepa_components, + parse_gepa_component, + write_tool_descriptions, + get_tool_stats, +) @dataclass @@ -31,15 +38,19 @@ class OptimizationResult: original_score: float optimized_score: float improvement: float - original_content: str - optimized_content: str - original_token_count: int + original_content: str # SKILL.md content (may be empty for tools_only) + optimized_content: str # Optimized SKILL.md content + original_token_count: int # Total tokens across ALL components optimized_token_count: int token_reduction_pct: float diff_summary: str val_scores: dict[str, float] mlflow_run_id: str | None gepa_result: Any + # Multi-component data + components: dict[str, str] | None = None # All optimized component texts + original_components: dict[str, str] | None = None # All original component texts + tool_map: Any = None # For writing back tool descriptions def _compute_diff_summary(original: str, optimized: str) -> str: @@ -103,16 +114,20 @@ def optimize_skill( preset: Literal["quick", "standard", "thorough"] = "standard", task_lm: str | None = None, reflection_lm: str | None = None, + include_tools: bool = False, + tool_modules: list[str] | None = None, + tools_only: bool = False, dry_run: bool = False, ) -> OptimizationResult: - """Run end-to-end GEPA optimization on a skill. + """Run end-to-end GEPA optimization on a skill, optionally with MCP tools. 1. Load current SKILL.md as seed_candidate - 2. Create train/val datasets from ground_truth.yaml - 3. Build adapter from existing scorers - 4. 
Run gepa.optimize() - 5. Log results to MLflow - 6. Return OptimizationResult with original/optimized scores and diff + 2. Optionally load MCP tool descriptions as additional components + 3. Create train/val datasets from ground_truth.yaml + 4. Build adapter from existing scorers + 5. Run gepa.optimize() (round-robin across all components) + 6. Log results to MLflow + 7. Return OptimizationResult with original/optimized scores and diff Args: skill_name: Name of the skill to optimize @@ -120,6 +135,9 @@ def optimize_skill( preset: GEPA config preset ("quick", "standard", "thorough") task_lm: LLM model for generative mode reflection_lm: Override reflection LM (default: GEPA_REFLECTION_LM env or databricks/databricks-gpt-5-2) + include_tools: If True, include MCP tool descriptions as additional GEPA components + tool_modules: Specific tool modules to include (e.g., ["sql", "compute"]). None = all. + tools_only: If True, optimize ONLY tool descriptions (no SKILL.md) dry_run: If True, show config and dataset info without running optimization Returns: @@ -130,14 +148,24 @@ def optimize_skill( """ # 1. Load current SKILL.md skill_path = _find_skill_md(skill_name) - if skill_path is None: + if not tools_only and skill_path is None: raise FileNotFoundError( f"Could not find SKILL.md for '{skill_name}'. " "Expected at .claude/skills/{name}/SKILL.md or databricks-skills/{name}/SKILL.md" ) - original_content = skill_path.read_text() - original_token_count = count_tokens(original_content) + original_content = skill_path.read_text() if skill_path else "" + original_token_count = count_tokens(original_content) if original_content else 0 + + # 1b. 
Load MCP tool descriptions if requested + tool_map = None + tool_components: dict[str, str] = {} + if include_tools or tools_only: + tool_map = extract_tool_descriptions(modules=tool_modules) + tool_components = tools_to_gepa_components(tool_map, per_module=True) + stats = get_tool_stats() + print(f"Tool modules: {stats['modules']}, tools: {stats['total_tools']}, " + f"description chars: {stats['total_description_chars']:,}") # 2. Create train/val datasets try: @@ -146,17 +174,30 @@ def optimize_skill( train, val = [], None if not train: - # Bootstrap mode train = generate_bootstrap_tasks(skill_name) val = None print(f"No test cases found for '{skill_name}'. Using {len(train)} auto-generated tasks.") print(f"For better results, add test cases: skill-test add {skill_name}") - # 3. Build adapter - adapter = create_skill_adapter(skill_name, mode=mode, task_lm=task_lm) + # 3. Build seed_candidate (multi-component if tools included) + original_token_counts: dict[str, int] = {} + seed_candidate: dict[str, str] = {} + + if not tools_only: + seed_candidate[SkillAdapter.SKILL_KEY] = original_content + original_token_counts[SkillAdapter.SKILL_KEY] = original_token_count + + for comp_name, comp_text in tool_components.items(): + seed_candidate[comp_name] = comp_text + original_token_counts[comp_name] = count_tokens(comp_text) - # seed_candidate is a dict with our SKILL_KEY - seed_candidate = {SkillAdapter.SKILL_KEY: original_content} + total_original_tokens = sum(original_token_counts.values()) + + # 3b. Build adapter with per-component token counts + adapter = create_skill_adapter( + skill_name, mode=mode, task_lm=task_lm, + original_token_counts=original_token_counts, + ) # 4. 
Get preset config (with optional reflection LM override) preset_config = get_preset(preset, reflection_lm=reflection_lm) @@ -164,8 +205,12 @@ def optimize_skill( # Dry run: show info and exit if dry_run: print(f"\n=== Dry Run: {skill_name} ===") - print(f"SKILL.md path: {skill_path}") - print(f"Original tokens: {original_token_count:,}") + if not tools_only: + print(f"SKILL.md path: {skill_path}") + print(f"Components: {list(seed_candidate.keys())}") + print(f"Total original tokens: {total_original_tokens:,}") + for comp, tokens in original_token_counts.items(): + print(f" {comp}: {tokens:,} tokens") print(f"Train tasks: {len(train)}") print(f"Val tasks: {len(val) if val else 'None (single-task mode)'}") print(f"Mode: {mode}") @@ -185,13 +230,16 @@ def optimize_skill( improvement=0.0, original_content=original_content, optimized_content=original_content, - original_token_count=original_token_count, - optimized_token_count=original_token_count, + original_token_count=total_original_tokens, + optimized_token_count=total_original_tokens, token_reduction_pct=0.0, diff_summary="Dry run - no optimization performed", val_scores={}, mlflow_run_id=None, gepa_result=None, + components=dict(seed_candidate), + original_components=dict(seed_candidate), + tool_map=tool_map, ) # Evaluate original score @@ -215,26 +263,33 @@ def optimize_skill( result = gepa.optimize(**gepa_kwargs) - # result.best_candidate is dict[str, str] - optimized_content = result.best_candidate.get(SkillAdapter.SKILL_KEY, original_content) - optimized_token_count = count_tokens(optimized_content) + # result.best_candidate is dict[str, str] (all components) + best = result.best_candidate + optimized_content = best.get(SkillAdapter.SKILL_KEY, original_content) + optimized_token_count = sum(count_tokens(v) for v in best.values()) - # Evaluate optimized on train - optimized_candidate = {SkillAdapter.SKILL_KEY: optimized_content} - optimized_score, _ = _evaluate_on_tasks(adapter, optimized_candidate, train) + # 
Evaluate optimized on train (using full multi-component candidate) + optimized_score, _ = _evaluate_on_tasks(adapter, best, train) # Evaluate on val if available val_scores: dict[str, float] = {} if val: - _, val_scores = _evaluate_on_tasks(adapter, optimized_candidate, val) + _, val_scores = _evaluate_on_tasks(adapter, best, val) - # Token reduction + # Token reduction (total across all components) token_reduction_pct = ( - (original_token_count - optimized_token_count) / original_token_count * 100 - if original_token_count > 0 + (total_original_tokens - optimized_token_count) / total_original_tokens * 100 + if total_original_tokens > 0 else 0.0 ) + # Write optimized tool descriptions back if tools were included + optimized_tool_components: dict[str, str] = {} + if tool_map and not dry_run: + for comp_name, comp_text in best.items(): + if comp_name.startswith("tools_"): + optimized_tool_components[comp_name] = comp_text + # Diff summary diff_summary = _compute_diff_summary(original_content, optimized_content) @@ -260,7 +315,7 @@ def optimize_skill( "original_score": original_score, "optimized_score": optimized_score, "improvement": optimized_score - original_score, - "original_tokens": float(original_token_count), + "original_tokens": float(total_original_tokens), "optimized_tokens": float(optimized_token_count), "token_reduction_pct": token_reduction_pct, "total_metric_calls": float(result.total_metric_calls or 0), @@ -277,11 +332,14 @@ def optimize_skill( improvement=optimized_score - original_score, original_content=original_content, optimized_content=optimized_content, - original_token_count=original_token_count, + original_token_count=total_original_tokens, optimized_token_count=optimized_token_count, token_reduction_pct=token_reduction_pct, diff_summary=diff_summary, val_scores=val_scores, mlflow_run_id=mlflow_run_id, gepa_result=result, + components=dict(best), + original_components=dict(seed_candidate), + tool_map=tool_map, ) diff --git 
a/.test/src/skill_test/optimize/tools.py b/.test/src/skill_test/optimize/tools.py new file mode 100644 index 00000000..29ef6c24 --- /dev/null +++ b/.test/src/skill_test/optimize/tools.py @@ -0,0 +1,271 @@ +"""Tool description extraction and writing for MCP server tools. + +Extracts @mcp.tool docstrings from Python source files, formats them for GEPA +optimization, and writes optimized descriptions back to source files. + +Each tool module (sql.py, compute.py, etc.) becomes one GEPA component so +GEPA's round-robin selector cycles through modules efficiently. +""" + +import ast +import re +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +# Separator used between tools within a module's GEPA component text +TOOL_SEPARATOR = "\n\n### TOOL: " + +MCP_TOOLS_DIR = Path(__file__).resolve().parents[5] / "databricks-mcp-server" / "databricks_mcp_server" / "tools" + + +@dataclass +class ToolDescription: + """A single tool's description extracted from source.""" + + name: str # Function name + docstring: str # The full docstring text + module: str # Module name (e.g., "sql", "compute") + lineno: int # Line number of the function def + source_path: Path # Path to the source file + + +def _find_tools_dir() -> Path: + """Find the MCP tools directory.""" + if MCP_TOOLS_DIR.exists(): + return MCP_TOOLS_DIR + # Fallback: search from repo root + from .evaluator import _find_repo_root + + repo_root = _find_repo_root() + candidate = repo_root / "databricks-mcp-server" / "databricks_mcp_server" / "tools" + if candidate.exists(): + return candidate + raise FileNotFoundError( + "Could not find MCP tools directory. " + "Expected at databricks-mcp-server/databricks_mcp_server/tools/" + ) + + +def extract_tool_descriptions( + modules: list[str] | None = None, + tools_dir: Path | None = None, +) -> dict[str, list[ToolDescription]]: + """Extract all @mcp.tool docstrings from MCP server tool files. 
+ + Args: + modules: Optional list of module names to extract (e.g., ["sql", "compute"]). + If None, extracts all modules. + tools_dir: Override path to tools directory. + + Returns: + Dict mapping module_name -> list of ToolDescription. + """ + if tools_dir is None: + tools_dir = _find_tools_dir() + + results: dict[str, list[ToolDescription]] = {} + + for py_file in sorted(tools_dir.glob("*.py")): + module_name = py_file.stem + if module_name == "__init__": + continue + if modules and module_name not in modules: + continue + + source = py_file.read_text() + tree = ast.parse(source) + + tool_descs = [] + for node in ast.walk(tree): + if not isinstance(node, ast.FunctionDef): + continue + # Check if decorated with @mcp.tool + for dec in node.decorator_list: + is_mcp_tool = False + if isinstance(dec, ast.Attribute) and isinstance(dec.value, ast.Name): + if dec.value.id == "mcp" and dec.attr == "tool": + is_mcp_tool = True + elif isinstance(dec, ast.Name) and dec.id == "mcp": + is_mcp_tool = True + if is_mcp_tool: + docstring = ast.get_docstring(node) or "" + tool_descs.append( + ToolDescription( + name=node.name, + docstring=docstring, + module=module_name, + lineno=node.lineno, + source_path=py_file, + ) + ) + break + + if tool_descs: + results[module_name] = tool_descs + + return results + + +def tools_to_gepa_components( + tool_map: dict[str, list[ToolDescription]], + per_module: bool = True, +) -> dict[str, str]: + """Convert extracted tool descriptions into GEPA component text blocks. + + Args: + tool_map: Output of extract_tool_descriptions() + per_module: If True, one GEPA component per module (e.g., "tools_sql"). + If False, all tools in a single "tool_descriptions" component. + + Returns: + Dict mapping component_name -> text block. 
+ """ + if per_module: + components = {} + for module_name, tools in tool_map.items(): + text_parts = [] + for td in tools: + text_parts.append(f"### TOOL: {td.name}\n{td.docstring}") + components[f"tools_{module_name}"] = "\n\n".join(text_parts) + return components + else: + all_parts = [] + for module_name, tools in sorted(tool_map.items()): + for td in tools: + all_parts.append(f"### TOOL: {td.name} (module: {module_name})\n{td.docstring}") + return {"tool_descriptions": "\n\n".join(all_parts)} + + +def parse_gepa_component(component_text: str) -> dict[str, str]: + """Parse a GEPA component text block back into individual tool descriptions. + + Args: + component_text: Text block with ### TOOL: markers + + Returns: + Dict mapping tool_name -> optimized docstring + """ + tools = {} + parts = re.split(r"### TOOL:\s*", component_text) + for part in parts: + part = part.strip() + if not part: + continue + # First line has the tool name (possibly with module annotation) + lines = part.split("\n", 1) + name_line = lines[0].strip() + # Remove module annotation if present: "execute_sql (module: sql)" + name = re.match(r"(\w+)", name_line).group(1) if re.match(r"(\w+)", name_line) else name_line + docstring = lines[1].strip() if len(lines) > 1 else "" + tools[name] = docstring + return tools + + +def write_tool_descriptions( + optimized: dict[str, str], + tool_map: dict[str, list[ToolDescription]], +) -> list[Path]: + """Write optimized docstrings back to MCP server source files. + + Uses AST to locate the exact docstring positions and replaces them + in the source text while preserving all other code. 
+ + Args: + optimized: Dict mapping tool_name -> optimized docstring text + tool_map: Original extraction map (for source file locations) + + Returns: + List of modified file paths + """ + # Group updates by file + updates_by_file: dict[Path, list[tuple[ToolDescription, str]]] = {} + for module_name, tools in tool_map.items(): + for td in tools: + if td.name in optimized: + updates_by_file.setdefault(td.source_path, []).append((td, optimized[td.name])) + + modified_files = [] + for file_path, updates in updates_by_file.items(): + source = file_path.read_text() + tree = ast.parse(source) + source_lines = source.splitlines(keepends=True) + + # Process updates in reverse line order to preserve positions + updates_sorted = sorted(updates, key=lambda x: x[0].lineno, reverse=True) + + for td, new_docstring in updates_sorted: + # Find the function node + for node in ast.walk(tree): + if isinstance(node, ast.FunctionDef) and node.name == td.name and node.lineno == td.lineno: + # Find the docstring node (first Expr with a Constant string) + if ( + node.body + and isinstance(node.body[0], ast.Expr) + and isinstance(node.body[0].value, ast.Constant) + and isinstance(node.body[0].value.value, str) + ): + doc_node = node.body[0] + # Get the docstring's line range + start_line = doc_node.lineno - 1 # 0-indexed + end_line = doc_node.end_lineno # exclusive + + # Detect indentation from the original docstring line + original_line = source_lines[start_line] + indent = re.match(r"(\s*)", original_line).group(1) + + # Build new docstring with proper indentation + new_doc_lines = [f'{indent}"""\n'] + for line in new_docstring.split("\n"): + if line.strip(): + new_doc_lines.append(f"{indent}{line}\n") + else: + new_doc_lines.append("\n") + new_doc_lines.append(f'{indent}"""\n') + + # Replace lines + source_lines[start_line:end_line] = new_doc_lines + break + + new_source = "".join(source_lines) + + # Validate the new source parses + try: + ast.parse(new_source) + except SyntaxError as 
e: + print(f"WARNING: Optimized source for {file_path.name} has syntax error: {e}") + print("Skipping this file.") + continue + + file_path.write_text(new_source) + modified_files.append(file_path) + + return modified_files + + +def list_tool_modules(tools_dir: Path | None = None) -> list[str]: + """List available tool module names.""" + if tools_dir is None: + tools_dir = _find_tools_dir() + return sorted( + f.stem for f in tools_dir.glob("*.py") + if f.stem != "__init__" + ) + + +def get_tool_stats(tools_dir: Path | None = None) -> dict[str, Any]: + """Get statistics about available MCP tools.""" + tool_map = extract_tool_descriptions(tools_dir=tools_dir) + total_tools = sum(len(tools) for tools in tool_map.values()) + total_chars = sum( + len(td.docstring) for tools in tool_map.values() for td in tools + ) + return { + "modules": len(tool_map), + "total_tools": total_tools, + "total_description_chars": total_chars, + "per_module": { + name: {"tools": len(tools), "chars": sum(len(td.docstring) for td in tools)} + for name, tools in tool_map.items() + }, + } From 7b6e1eaa5f0b1089d49d2c9069296cd89ae6eee4 Mon Sep 17 00:00:00 2001 From: CAholder Date: Sat, 21 Feb 2026 23:15:43 -0800 Subject: [PATCH 05/20] fix to use gepa 0.1.0 --- .test/src/skill_test/optimize/__init__.py | 15 +- .test/src/skill_test/optimize/asi.py | 39 +- .test/src/skill_test/optimize/config.py | 80 ++--- .test/src/skill_test/optimize/evaluator.py | 339 ++++++------------ .../skill_test/optimize/gskill/pipeline.py | 187 ++++------ .test/src/skill_test/optimize/runner.py | 252 ++++++------- .test/tests/test_optimize_e2e.py | 216 ++++------- 7 files changed, 400 insertions(+), 728 deletions(-) diff --git a/.test/src/skill_test/optimize/__init__.py b/.test/src/skill_test/optimize/__init__.py index 5dd5cc9f..87cab788 100644 --- a/.test/src/skill_test/optimize/__init__.py +++ b/.test/src/skill_test/optimize/__init__.py @@ -1,22 +1,21 @@ -"""GEPA-powered skill optimization for the skill-test framework. 
+"""GEPA-powered skill optimization using optimize_anything API. Public API: - optimize_skill() - End-to-end optimize a SKILL.md - create_skill_adapter() - Create a GEPA adapter for a skill - OptimizationResult - Dataclass with optimization results - PRESETS - GEPA config presets (quick, standard, thorough) + optimize_skill() - End-to-end optimize a SKILL.md (and optionally tools) + create_skill_evaluator() - Create a GEPA evaluator for a skill + OptimizationResult - Dataclass with optimization results + PRESETS - GEPA config presets (quick, standard, thorough) """ from .runner import optimize_skill, OptimizationResult -from .evaluator import create_skill_adapter, SkillAdapter +from .evaluator import create_skill_evaluator from .config import PRESETS from .review import review_optimization, apply_optimization __all__ = [ "optimize_skill", "OptimizationResult", - "create_skill_adapter", - "SkillAdapter", + "create_skill_evaluator", "PRESETS", "review_optimization", "apply_optimization", diff --git a/.test/src/skill_test/optimize/asi.py b/.test/src/skill_test/optimize/asi.py index fea0fd70..d8ff506d 100644 --- a/.test/src/skill_test/optimize/asi.py +++ b/.test/src/skill_test/optimize/asi.py @@ -1,11 +1,12 @@ -"""ASI diagnostics: convert MLflow Feedback to GEPA (score, diagnostics) contract. +"""ASI diagnostics: convert MLflow Feedback to optimize_anything SideInfo. -Collects failure diagnostics so the adapter's make_reflective_dataset() can -provide actionable context to GEPA's reflection LM. +Routes failure diagnostics through oa.log() so GEPA's reflection LM gets +actionable context about what went wrong with each scorer. """ from typing import Any +import gepa.optimize_anything as oa from mlflow.entities import Feedback @@ -17,9 +18,6 @@ def feedback_to_score(feedback: Feedback) -> float | None: "no" -> 0.0 "skip" -> None (excluded from scoring) numeric -> float(value) - - Returns: - Float score or None if the feedback should be excluded. 
""" value = feedback.value if value == "yes": @@ -36,29 +34,27 @@ def feedback_to_score(feedback: Feedback) -> float | None: def feedback_to_asi(feedbacks: list[Feedback]) -> tuple[float, dict[str, Any]]: - """Convert a list of MLflow Feedback objects to GEPA (score, diagnostics). + """Convert MLflow Feedback objects to optimize_anything (score, SideInfo). - Computes the mean score across all non-skipped feedbacks and builds - a diagnostics dict with per-scorer results. + Computes the mean score across non-skipped feedbacks and builds a + SideInfo dict. Logs failures via oa.log() so GEPA's reflection LM + sees actionable failure context. Args: feedbacks: List of MLflow Feedback objects from running scorers Returns: - Tuple of (composite_score, diagnostics_dict) - - composite_score: 0.0-1.0 mean across all scorable feedbacks - - diagnostics_dict: per-scorer name -> {score, rationale, value} + Tuple of (composite_score, side_info_dict) """ scores = [] - diagnostics: dict[str, Any] = {} - failure_messages: list[str] = [] + side_info: dict[str, Any] = {} for fb in feedbacks: score = feedback_to_score(fb) name = fb.name or "unnamed" if score is None: - diagnostics[name] = { + side_info[name] = { "score": None, "value": fb.value, "rationale": fb.rationale or "", @@ -67,22 +63,20 @@ def feedback_to_asi(feedbacks: list[Feedback]) -> tuple[float, dict[str, Any]]: continue scores.append(score) - diagnostics[name] = { + side_info[name] = { "score": score, "value": fb.value, "rationale": fb.rationale or "", "status": "pass" if score >= 0.5 else "fail", } - # Collect failure messages for reflection dataset + # Route failures through oa.log() for GEPA reflection if score < 1.0: - failure_messages.append( - f"Scorer '{name}' returned {fb.value}: {fb.rationale or 'no rationale'}" - ) + oa.log(f"Scorer '{name}' returned {fb.value}: {fb.rationale or 'no rationale'}") composite = sum(scores) / len(scores) if scores else 0.0 - diagnostics["_summary"] = { + side_info["_summary"] = { 
"composite_score": composite, "total_scorers": len(feedbacks), "scored": len(scores), @@ -90,6 +84,5 @@ def feedback_to_asi(feedbacks: list[Feedback]) -> tuple[float, dict[str, Any]]: "passed": sum(1 for s in scores if s >= 0.5), "failed": sum(1 for s in scores if s < 0.5), } - diagnostics["_failure_messages"] = failure_messages - return composite, diagnostics + return composite, side_info diff --git a/.test/src/skill_test/optimize/config.py b/.test/src/skill_test/optimize/config.py index 8fc59a0b..c0ed12c6 100644 --- a/.test/src/skill_test/optimize/config.py +++ b/.test/src/skill_test/optimize/config.py @@ -1,75 +1,55 @@ """GEPA configuration presets for skill optimization. -GEPA's optimize() accepts flat kwargs. Presets are stored as dataclasses -that get unpacked into gepa.optimize(**preset.to_kwargs()). - -The reflection LM defaults to Databricks Model Serving (databricks-gpt-5-2). -Override via the GEPA_REFLECTION_LM environment variable or the --reflection-lm flag. +Uses the optimize_anything API with GEPAConfig/EngineConfig/ReflectionConfig. 
""" import os -from dataclasses import dataclass -from typing import Any +from gepa.optimize_anything import GEPAConfig, EngineConfig, ReflectionConfig DEFAULT_REFLECTION_LM = os.environ.get( "GEPA_REFLECTION_LM", "databricks/databricks-gpt-5-2" ) - -@dataclass -class GEPAPreset: - """Configuration preset for gepa.optimize() calls.""" - - max_metric_calls: int - reflection_lm: str = DEFAULT_REFLECTION_LM - candidate_selection_strategy: str = "pareto" - reflection_minibatch_size: int = 3 - skip_perfect_score: bool = True - display_progress_bar: bool = True - - def to_kwargs(self) -> dict[str, Any]: - """Convert to kwargs dict for gepa.optimize().""" - return { - "max_metric_calls": self.max_metric_calls, - "reflection_lm": self.reflection_lm, - "candidate_selection_strategy": self.candidate_selection_strategy, - "reflection_minibatch_size": self.reflection_minibatch_size, - "skip_perfect_score": self.skip_perfect_score, - "display_progress_bar": self.display_progress_bar, - } - - -PRESETS: dict[str, GEPAPreset] = { - "quick": GEPAPreset(max_metric_calls=15), - "standard": GEPAPreset(max_metric_calls=50), - "thorough": GEPAPreset(max_metric_calls=150), +PRESETS: dict[str, GEPAConfig] = { + "quick": GEPAConfig( + engine=EngineConfig(max_metric_calls=15, parallel=True), + reflection=ReflectionConfig(reflection_lm=DEFAULT_REFLECTION_LM), + ), + "standard": GEPAConfig( + engine=EngineConfig(max_metric_calls=50, parallel=True), + reflection=ReflectionConfig(reflection_lm=DEFAULT_REFLECTION_LM), + ), + "thorough": GEPAConfig( + engine=EngineConfig(max_metric_calls=150, parallel=True), + reflection=ReflectionConfig(reflection_lm=DEFAULT_REFLECTION_LM), + ), } -def get_preset(name: str, reflection_lm: str | None = None) -> GEPAPreset: +def get_preset(name: str, reflection_lm: str | None = None) -> GEPAConfig: """Get a GEPA config preset by name, optionally overriding the reflection LM. 
Args: name: One of "quick", "standard", "thorough" - reflection_lm: Override reflection LM model string (e.g., "databricks/databricks-gpt-5-2") + reflection_lm: Override reflection LM model string Returns: - GEPAPreset instance - - Raises: - KeyError: If preset name is not recognized + GEPAConfig instance """ if name not in PRESETS: raise KeyError(f"Unknown preset '{name}'. Choose from: {list(PRESETS.keys())}") - preset = PRESETS[name] + config = PRESETS[name] if reflection_lm: - preset = GEPAPreset( - max_metric_calls=preset.max_metric_calls, - reflection_lm=reflection_lm, - candidate_selection_strategy=preset.candidate_selection_strategy, - reflection_minibatch_size=preset.reflection_minibatch_size, - skip_perfect_score=preset.skip_perfect_score, - display_progress_bar=preset.display_progress_bar, + config = GEPAConfig( + engine=config.engine, + reflection=ReflectionConfig( + reflection_lm=reflection_lm, + reflection_minibatch_size=config.reflection.reflection_minibatch_size, + skip_perfect_score=config.reflection.skip_perfect_score, + ), + merge=config.merge, + refiner=config.refiner, + tracking=config.tracking, ) - return preset + return config diff --git a/.test/src/skill_test/optimize/evaluator.py b/.test/src/skill_test/optimize/evaluator.py index 4e5df7f7..288bc368 100644 --- a/.test/src/skill_test/optimize/evaluator.py +++ b/.test/src/skill_test/optimize/evaluator.py @@ -1,16 +1,15 @@ -"""Evaluator bridge: wrap existing MLflow scorers into a GEPA adapter. +"""Evaluator bridge: wrap existing MLflow scorers into optimize_anything evaluators. -This is the core integration point between the skill-test scorer framework -and GEPA's optimize(). It implements GEPAAdapter to evaluate candidate -SKILL.md texts against test cases using existing scorers. +Creates GEPA-compatible evaluator functions that take a candidate (str or dict) +and a task example, run existing scorers, and return (score, SideInfo). 
""" import inspect from pathlib import Path -from typing import Any, Literal +from typing import Any, Callable, Literal import tiktoken -from gepa import EvaluationBatch, GEPAAdapter +import gepa.optimize_anything as oa from mlflow.entities import Feedback from ..runners.evaluate import build_scorers, load_scorer_config @@ -22,9 +21,12 @@ expected_facts_present, ) from .asi import feedback_to_asi -from .splitter import SkillTask +# --------------------------------------------------------------------------- +# Path utilities +# --------------------------------------------------------------------------- + def _find_repo_root() -> Path: """Find the repo root by searching upward for .test/src/.""" current = Path(__file__).resolve().parent @@ -50,12 +52,20 @@ def _find_skill_md(skill_name: str) -> Path | None: return None +# --------------------------------------------------------------------------- +# Token utilities +# --------------------------------------------------------------------------- + +def count_tokens(text: str) -> int: + """Count tokens using cl100k_base encoding.""" + enc = tiktoken.get_encoding("cl100k_base") + return len(enc.encode(text)) + + def token_efficiency_score(candidate_text: str, original_token_count: int) -> float: """Score 0-1 based on how concise the candidate is vs. the original. - - Same size or smaller = 1.0 - - 10% larger = 0.9, 20% larger = 0.8, etc. - - Capped at 0.0 for 100%+ bloat + Same size or smaller = 1.0, linear penalty up to 0.0 at 2x. """ if original_token_count <= 0: return 1.0 @@ -65,22 +75,12 @@ def token_efficiency_score(candidate_text: str, original_token_count: int) -> fl return max(0.0, min(1.0, 2.0 - ratio)) -def count_tokens(text: str) -> int: - """Count tokens using cl100k_base encoding.""" - enc = tiktoken.get_encoding("cl100k_base") - return len(enc.encode(text)) - - -def token_efficiency_score_raw(candidate_tokens: int, original_tokens: int) -> float: - """Score 0-1 based on token count ratio. 
Same logic as token_efficiency_score.""" - if original_tokens <= 0: - return 1.0 - ratio = candidate_tokens / original_tokens - return max(0.0, min(1.0, 2.0 - ratio)) - +# --------------------------------------------------------------------------- +# Scorer execution +# --------------------------------------------------------------------------- def _run_scorer(scorer_fn: Any, outputs: dict, expectations: dict, inputs: dict) -> list[Feedback]: - """Run a single scorer and normalize the result to a list of Feedbacks.""" + """Run a single scorer and normalize result to a list of Feedbacks.""" sig = inspect.signature(scorer_fn) params = list(sig.parameters.keys()) @@ -105,33 +105,26 @@ def _run_scorer(scorer_fn: Any, outputs: dict, expectations: dict, inputs: dict) def _run_deterministic_scorers( - task: SkillTask, + response: str, + expectations: dict[str, Any], + prompt: str, scorer_config: dict[str, Any], ) -> list[Feedback]: - """Run deterministic scorers against a task's expected response.""" - outputs = {"response": task.get("answer", "")} - expectations = task.get("expectations", {}) - inputs = {"prompt": task.get("input", "")} + """Run deterministic scorers against a response.""" + outputs = {"response": response} + inputs = {"prompt": prompt} if scorer_config: scorers = build_scorers(scorer_config) else: - scorers = [ - python_syntax, - sql_syntax, - pattern_adherence, - no_hallucinated_apis, - expected_facts_present, - ] + scorers = [python_syntax, sql_syntax, pattern_adherence, no_hallucinated_apis, expected_facts_present] all_feedbacks = [] for scorer_fn in scorers: - # Skip LLM-based scorers -- only deterministic scorer_name = getattr(scorer_fn, "__name__", "") or getattr(scorer_fn, "name", "") if scorer_name in ("Safety", "Guidelines", "skill_quality"): continue - feedbacks = _run_scorer(scorer_fn, outputs, expectations, inputs) - all_feedbacks.extend(feedbacks) + all_feedbacks.extend(_run_scorer(scorer_fn, outputs, expectations, inputs)) return 
all_feedbacks @@ -140,208 +133,114 @@ def _validate_skill_structure(candidate_text: str) -> list[Feedback]: """Validate the SKILL.md structure itself.""" outputs = {"response": candidate_text} feedbacks = [] - for scorer_fn in [python_syntax, sql_syntax, no_hallucinated_apis]: result = _run_scorer(scorer_fn, outputs, {}, {}) for fb in result: feedbacks.append(Feedback(name=f"skill_{fb.name}", value=fb.value, rationale=fb.rationale)) - return feedbacks -# Type aliases for GEPA adapter generics -_Trajectory = dict[str, Any] -_RolloutOutput = dict[str, Any] +# --------------------------------------------------------------------------- +# Evaluator factory (optimize_anything compatible) +# --------------------------------------------------------------------------- +SKILL_KEY = "skill_md" -class SkillAdapter(GEPAAdapter): - """GEPA adapter that evaluates candidate texts using existing scorers. - Supports multi-component optimization: - - "skill_md": SKILL.md content (primary) - - "tools_*": MCP tool description blocks (optional, one per module) +def create_skill_evaluator( + skill_name: str, + mode: Literal["static", "generative"] = "static", + task_lm: str | None = None, + original_token_counts: dict[str, int] | None = None, +) -> Callable: + """Create an optimize_anything-compatible evaluator for a skill. - GEPA's RoundRobinReflectionComponentSelector cycles through all components. + Returns a function: (candidate, example) -> (score, side_info) + + The candidate is dict[str, str] (may have "skill_md" + "tools_*" keys). + The example is a task dict from the dataset. 
""" + scorer_config = load_scorer_config(skill_name) - SKILL_KEY = "skill_md" - - def __init__( - self, - skill_name: str, - mode: Literal["static", "generative"] = "static", - task_lm: str | None = None, - original_token_counts: dict[str, int] | None = None, - ): - self.skill_name = skill_name - self.mode = mode - self.task_lm = task_lm - self.scorer_config = load_scorer_config(skill_name) - - # Per-component original token counts for efficiency scoring - if original_token_counts: - self.original_token_counts = original_token_counts - else: - skill_path = _find_skill_md(skill_name) - self.original_token_counts = { - self.SKILL_KEY: count_tokens(skill_path.read_text()) if skill_path else 0 - } - - @property - def original_token_count(self) -> int: - """Total original token count across all components.""" - return sum(self.original_token_counts.values()) - - def evaluate( - self, - batch: list[dict[str, Any]], - candidate: dict[str, str], - capture_traces: bool = False, - ) -> EvaluationBatch: - """Evaluate candidate SKILL.md against a batch of test cases. + # Compute original token count for efficiency scoring + if original_token_counts is None: + skill_path = _find_skill_md(skill_name) + original_token_counts = { + SKILL_KEY: count_tokens(skill_path.read_text()) if skill_path else 0 + } + total_original_tokens = sum(original_token_counts.values()) + + def evaluator(candidate: dict[str, str], example: dict) -> tuple[float, dict]: + """Evaluate a candidate against a single task example. 
Args: - batch: List of DefaultDataInst dicts ({input, additional_context, answer}) - candidate: Dict with key "skill_md" -> SKILL.md content - capture_traces: Whether to capture traces (for reflection dataset) + candidate: dict[str, str] with "skill_md" and/or "tools_*" keys + example: Task dict with "input", "answer", "additional_context" Returns: - EvaluationBatch with trajectories containing scores and diagnostics + (score, side_info) tuple for optimize_anything """ - candidate_text = candidate.get(self.SKILL_KEY, "") - outputs: list[dict[str, Any]] = [] - scores: list[float] = [] - trajectories = [] - - for data_inst in batch: - all_feedbacks = [] - - # Build a SkillTask from the data instance for scorer compatibility - task: SkillTask = { - "id": data_inst.get("additional_context", {}).get("id", ""), - "input": data_inst.get("input", ""), - "answer": data_inst.get("answer", ""), - "additional_context": data_inst.get("additional_context", {}), - "metadata": {}, - } - - # Decode expectations if stored in additional_context - expectations_json = data_inst.get("additional_context", {}).get("expectations", "") - if expectations_json: - import json - try: - task["expectations"] = json.loads(expectations_json) - except (json.JSONDecodeError, TypeError): - task["expectations"] = {} - - if self.mode == "generative" and self.task_lm: - # Generate a fresh response using the candidate skill - import litellm - messages = [ - { - "role": "system", - "content": ( - "You are an AI assistant with the following skill documentation:\n\n" - f"{candidate_text}\n\n" - "Use this documentation to answer the user's question." - ), - }, - {"role": "user", "content": task.get("input", "")}, - ] - response = litellm.completion(model=self.task_lm, messages=messages) - task["answer"] = response.choices[0].message.content - - # 1. 
Run deterministic scorers against the response - response_feedbacks = _run_deterministic_scorers(task, self.scorer_config) - all_feedbacks.extend(response_feedbacks) - - # 2. Validate the skill structure itself + candidate_text = candidate.get(SKILL_KEY, "") + all_feedbacks: list[Feedback] = [] + + # Decode expectations from additional_context + expectations = {} + expectations_json = example.get("additional_context", {}).get("expectations", "") + if expectations_json: + import json + try: + expectations = json.loads(expectations_json) + except (json.JSONDecodeError, TypeError): + pass + + response = example.get("answer", "") + + if mode == "generative" and task_lm: + import litellm + messages = [ + {"role": "system", "content": f"Skill documentation:\n\n{candidate_text}\n\nAnswer the user's question."}, + {"role": "user", "content": example.get("input", "")}, + ] + resp = litellm.completion(model=task_lm, messages=messages) + response = resp.choices[0].message.content + + # 1. Score the response against test expectations + response_feedbacks = _run_deterministic_scorers( + response, expectations, example.get("input", ""), scorer_config + ) + all_feedbacks.extend(response_feedbacks) + + # 2. Validate skill structure + if candidate_text: structure_feedbacks = _validate_skill_structure(candidate_text) all_feedbacks.extend(structure_feedbacks) - # 3. Convert to score + diagnostics - composite, diagnostics = feedback_to_asi(all_feedbacks) - - # 4. 
Factor in token efficiency (across ALL components) - total_candidate_tokens = sum(count_tokens(v) for v in candidate.values()) - total_original_tokens = self.original_token_count - efficiency = token_efficiency_score_raw(total_candidate_tokens, total_original_tokens) - - # Weighted composite: 80% quality, 20% token efficiency - final_score = 0.8 * composite + 0.2 * efficiency - - output = {"full_assistant_response": task.get("answer", "")} - outputs.append(output) - scores.append(final_score) - - trajectory = { - "data": data_inst, - "full_assistant_response": task.get("answer", ""), - "score": final_score, - "quality_score": composite, - "efficiency_score": efficiency, - "diagnostics": diagnostics, - "failure_messages": diagnostics.get("_failure_messages", []), - } - trajectories.append(trajectory) - - return EvaluationBatch(outputs=outputs, scores=scores, trajectories=trajectories) - - def make_reflective_dataset( - self, - candidate: dict[str, str], - eval_batch: EvaluationBatch, - components_to_update: list[str], - ) -> dict[str, list[dict[str, Any]]]: - """Build reflective dataset from evaluation results for GEPA's mutation proposer. - - Extracts failure messages from diagnostics so the reflection LM - knows exactly what went wrong and can propose targeted mutations. - """ - reflective_data: dict[str, list[dict[str, Any]]] = {} - - for component in components_to_update: - examples = [] - for traj in eval_batch.trajectories: - failure_msgs = traj.get("failure_messages", []) - if not failure_msgs: - continue + # 3. Convert to score + side_info (with oa.log() for failures) + composite, side_info = feedback_to_asi(all_feedbacks) - examples.append({ - "input": traj.get("data", {}).get("input", ""), - "current_text": candidate.get(component, ""), - "feedback": "\n".join(failure_msgs), - "score": traj.get("score", 0.0), - }) + # 4. 
Token efficiency across ALL components + total_candidate_tokens = sum(count_tokens(v) for v in candidate.values()) + if total_original_tokens > 0: + ratio = total_candidate_tokens / total_original_tokens + efficiency = max(0.0, min(1.0, 2.0 - ratio)) + else: + efficiency = 1.0 - reflective_data[component] = examples + # Weighted composite: 80% quality, 20% token efficiency + final_score = 0.8 * composite + 0.2 * efficiency - return reflective_data + side_info["scores"] = { + "quality": composite, + "token_efficiency": efficiency, + } + side_info["token_counts"] = { + "candidate_total": total_candidate_tokens, + "original_total": total_original_tokens, + } + return final_score, side_info -def create_skill_adapter( - skill_name: str, - mode: Literal["static", "generative"] = "static", - task_lm: str | None = None, - original_token_counts: dict[str, int] | None = None, -) -> SkillAdapter: - """Create a SkillAdapter for GEPA optimization. - - Args: - skill_name: Name of the skill being optimized - mode: "static" or "generative" - task_lm: LLM model string for generative mode - original_token_counts: Per-component original token counts (for multi-component) - - Returns: - Configured SkillAdapter instance - """ - return SkillAdapter( - skill_name=skill_name, - mode=mode, - task_lm=task_lm, - original_token_counts=original_token_counts, - ) + return evaluator def build_optimization_background( @@ -356,10 +255,8 @@ def build_optimization_background( components_desc = ( "\n\nYou are also optimizing MCP tool descriptions for these modules: " f"{', '.join(tool_modules)}. " - "Tool descriptions are docstrings on @mcp.tool functions. They tell the AI agent " - "what each tool does, its parameters, and return values. Keep descriptions " - "accurate, concise, and action-oriented. Include usage hints that help the agent " - "choose the right tool.\n" + "Tool descriptions are docstrings on @mcp.tool functions. 
Keep them " + "accurate, concise, and action-oriented.\n" ) return ( @@ -372,8 +269,8 @@ def build_optimization_background( "- Absence of hallucinated/deprecated APIs\n" "- Presence of expected factual information\n" "- Overall structural quality of the skill document\n\n" - f"IMPORTANT: The current skill is {original_token_count:,} tokens. " - "Optimized skills should be MORE CONCISE, not larger. " + f"IMPORTANT: The current artifacts total {original_token_count:,} tokens. " + "Optimized versions should be MORE CONCISE, not larger. " "Remove redundant examples, consolidate similar patterns, " "and eliminate verbose explanations that don't add value. " "Every token consumed is agent context window budget -- keep skills lean and focused." diff --git a/.test/src/skill_test/optimize/gskill/pipeline.py b/.test/src/skill_test/optimize/gskill/pipeline.py index fdc0a8fb..7badfe7c 100644 --- a/.test/src/skill_test/optimize/gskill/pipeline.py +++ b/.test/src/skill_test/optimize/gskill/pipeline.py @@ -1,105 +1,19 @@ -"""gskill pipeline: wraps GEPA optimize for customer repository skill generation. +"""gskill pipeline: generate optimized skills for customer repositories. -Configures GEPA with Databricks-appropriate defaults, generates an optimized -SKILL.md, and outputs it in the standard format. +Uses optimize_anything to produce SKILL.md files from repository context. 
""" +import ast import re from pathlib import Path from typing import Any -import gepa -from gepa import GEPAAdapter, EvaluationBatch +from gepa.optimize_anything import optimize_anything, GEPAConfig, EngineConfig, ReflectionConfig +import gepa.optimize_anything as oa from ..config import get_preset -class _GSkillAdapter(GEPAAdapter): - """Minimal adapter for gskill that scores SKILL.md structural quality.""" - - SKILL_KEY = "skill_md" - - def evaluate( - self, - batch: list[dict[str, Any]], - candidate: dict[str, str], - capture_traces: bool = False, - ) -> EvaluationBatch: - """Score candidate skill based on structural quality metrics.""" - import ast - - candidate_text = candidate.get(self.SKILL_KEY, "") - trajectories = [] - - for data_inst in batch: - score = 0.0 - parts = 0 - - # Has markdown headers - if re.search(r"^#{1,3}\s+", candidate_text, re.MULTILINE): - score += 1.0 - parts += 1 - - # Has code blocks - code_blocks = re.findall(r"```(\w+)\n(.*?)```", candidate_text, re.DOTALL) - if code_blocks: - score += 1.0 - parts += 1 - - # Python blocks parse - py_blocks = [b for lang, b in code_blocks if lang == "python"] - if py_blocks: - valid = sum(1 for b in py_blocks if _parses(b)) - score += valid / len(py_blocks) - parts += 1 - - # Reasonable length (not too short, not too long) - word_count = len(candidate_text.split()) - if 200 <= word_count <= 5000: - score += 1.0 - parts += 1 - - final = score / parts if parts > 0 else 0.0 - - trajectories.append({ - "data": data_inst, - "full_assistant_response": candidate_text[:200], - "score": final, - }) - - return EvaluationBatch(trajectories=trajectories) - - def make_reflective_dataset( - self, - candidate: dict[str, str], - eval_batch: EvaluationBatch, - components_to_update: list[str], - ) -> dict[str, list[dict[str, Any]]]: - reflective_data: dict[str, list[dict[str, Any]]] = {} - for component in components_to_update: - examples = [] - for traj in eval_batch.trajectories: - if traj.get("score", 1.0) < 0.8: 
- examples.append({ - "input": traj.get("data", {}).get("input", ""), - "current_text": candidate.get(component, ""), - "feedback": "Skill structural quality is below threshold.", - "score": traj.get("score", 0.0), - }) - reflective_data[component] = examples - return reflective_data - - -def _parses(code: str) -> bool: - """Check if Python code parses without syntax errors.""" - import ast - try: - ast.parse(code) - return True - except SyntaxError: - return False - - def run_gskill( repo_path: str | Path, skill_name: str | None = None, @@ -107,10 +21,7 @@ def run_gskill( preset: str = "standard", context_files: list[str] | None = None, ) -> dict[str, Any]: - """Generate an optimized skill for a customer repository using GEPA. - - Scans the repo for Databricks patterns, generates a SKILL.md optimized - for Claude Code consumption, and outputs to .claude/skills//SKILL.md. + """Generate an optimized skill for a customer repository. Args: repo_path: Path to the customer's repository @@ -120,21 +31,19 @@ def run_gskill( context_files: Additional files to provide as context Returns: - Dict with generated skill path, quality score, and metadata + Dict with generated skill path and metadata """ repo_path = Path(repo_path).resolve() if not repo_path.exists(): raise FileNotFoundError(f"Repository not found: {repo_path}") - preset_config = get_preset(preset) + config = get_preset(preset) if skill_name is None: skill_name = repo_path.name - # Gather repo context repo_context = _scan_repo(repo_path, context_files) - # Build seed candidate seed_content = ( f"# {skill_name}\n\n" "## Overview\n\n" @@ -142,35 +51,57 @@ def run_gskill( + repo_context ) - seed_candidate = {_GSkillAdapter.SKILL_KEY: seed_content} + def evaluate(candidate: str, example: dict) -> tuple[float, dict]: + """Score structural quality of generated skill.""" + score = 0.0 + parts = 0 + + if re.search(r"^#{1,3}\s+", candidate, re.MULTILINE): + score += 1.0 + parts += 1 + + code_blocks = 
re.findall(r"```(\w+)\n(.*?)```", candidate, re.DOTALL) + if code_blocks: + score += 1.0 + parts += 1 + py_blocks = [b for lang, b in code_blocks if lang == "python"] + if py_blocks: + valid = sum(1 for b in py_blocks if _parses(b)) + score += valid / len(py_blocks) + parts += 1 + + word_count = len(candidate.split()) + if 200 <= word_count <= 5000: + score += 1.0 + parts += 1 + + final = score / parts if parts > 0 else 0.0 + oa.log(f"Structure score: {final:.2f}, words: {word_count}") + + return final, {"structure_score": final, "word_count": word_count} - # Build a synthetic trainset from the repo trainset = [ - { - "input": f"Help me understand the patterns in {skill_name}", - "additional_context": {}, - "answer": "", - }, - { - "input": f"Show me code examples from {skill_name}", - "additional_context": {}, - "answer": "", - }, + {"input": f"Help me understand patterns in {skill_name}", "additional_context": {}, "answer": ""}, + {"input": f"Show code examples from {skill_name}", "additional_context": {}, "answer": ""}, ] - adapter = _GSkillAdapter() - - # Run GEPA - result = gepa.optimize( - seed_candidate=seed_candidate, - trainset=trainset, - adapter=adapter, - **preset_config.to_kwargs(), + result = optimize_anything( + seed_candidate=seed_content, + evaluator=evaluate, + dataset=trainset, + objective=f"Generate a SKILL.md that teaches an AI coding agent the patterns in {skill_name}.", + background=( + "SKILL.md files teach AI agents (Claude Code) repository-specific patterns. " + "Focus on Databricks patterns: Unity Catalog, MLflow, Spark, Delta Lake, etc. " + "Be CONCISE and ACTION-ORIENTED. Lead with code examples." 
+ ), + config=config, ) - generated_content = result.best_candidate.get(_GSkillAdapter.SKILL_KEY, seed_content) + generated_content = result.best_candidate + if isinstance(generated_content, dict): + generated_content = list(generated_content.values())[0] - # Write output if output_dir is None: output_dir = repo_path / ".claude" / "skills" / skill_name else: @@ -189,19 +120,23 @@ def run_gskill( } +def _parses(code: str) -> bool: + try: + ast.parse(code) + return True + except SyntaxError: + return False + + def _scan_repo(repo_path: Path, context_files: list[str] | None = None) -> str: - """Scan repository for Databricks-relevant patterns and build context.""" context_parts = [] - - # Read explicitly provided context files if context_files: for f in context_files: p = Path(f) if Path(f).is_absolute() else repo_path / f if p.exists(): - content = p.read_text()[:5000] # Cap at 5K per file + content = p.read_text()[:5000] context_parts.append(f"### {p.name}\n\n```\n{content}\n```\n") - # Auto-scan for README readme = repo_path / "README.md" if readme.exists() and not context_files: context_parts.append(f"### README\n\n{readme.read_text()[:3000]}\n") diff --git a/.test/src/skill_test/optimize/runner.py b/.test/src/skill_test/optimize/runner.py index bbf019ba..e364ca7b 100644 --- a/.test/src/skill_test/optimize/runner.py +++ b/.test/src/skill_test/optimize/runner.py @@ -1,31 +1,30 @@ """End-to-end orchestrator for GEPA skill optimization. -Workflow: load skill -> split dataset -> build adapter -> optimize -> log results +Uses optimize_anything API: evaluator function + GEPAConfig. 
""" +import difflib +import re from dataclasses import dataclass from pathlib import Path from typing import Any, Literal -import gepa +from gepa.optimize_anything import optimize_anything, GEPAConfig from ..config import SkillTestConfig from ..runners.evaluate import setup_mlflow from .config import get_preset from .evaluator import ( - SkillAdapter, - create_skill_adapter, + SKILL_KEY, + create_skill_evaluator, count_tokens, build_optimization_background, _find_skill_md, - token_efficiency_score, ) from .splitter import create_gepa_datasets, generate_bootstrap_tasks, to_gepa_instances from .tools import ( extract_tool_descriptions, tools_to_gepa_components, - parse_gepa_component, - write_tool_descriptions, get_tool_stats, ) @@ -38,29 +37,24 @@ class OptimizationResult: original_score: float optimized_score: float improvement: float - original_content: str # SKILL.md content (may be empty for tools_only) - optimized_content: str # Optimized SKILL.md content - original_token_count: int # Total tokens across ALL components + original_content: str + optimized_content: str + original_token_count: int optimized_token_count: int token_reduction_pct: float diff_summary: str val_scores: dict[str, float] mlflow_run_id: str | None gepa_result: Any - # Multi-component data - components: dict[str, str] | None = None # All optimized component texts - original_components: dict[str, str] | None = None # All original component texts - tool_map: Any = None # For writing back tool descriptions + components: dict[str, str] | None = None + original_components: dict[str, str] | None = None + tool_map: Any = None def _compute_diff_summary(original: str, optimized: str) -> str: - """Generate a human-readable summary of changes between original and optimized.""" - import difflib - import re - + """Generate a human-readable summary of changes.""" original_lines = original.splitlines(keepends=True) optimized_lines = optimized.splitlines(keepends=True) - diff = 
list(difflib.unified_diff(original_lines, optimized_lines, fromfile="original", tofile="optimized", n=1)) if not diff: @@ -69,20 +63,19 @@ def _compute_diff_summary(original: str, optimized: str) -> str: added = sum(1 for line in diff if line.startswith("+") and not line.startswith("+++")) removed = sum(1 for line in diff if line.startswith("-") and not line.startswith("---")) - summary_parts = [] + parts = [] if added: - summary_parts.append(f"+{added} lines added") + parts.append(f"+{added} lines added") if removed: - summary_parts.append(f"-{removed} lines removed") + parts.append(f"-{removed} lines removed") - # Extract changed section headers for context changed_sections = set() for line in diff: content = line[1:].strip() if line.startswith(("+", "-")) and not line.startswith(("+++", "---")) else "" if content.startswith("#"): changed_sections.add(content) - summary = ", ".join(summary_parts) + summary = ", ".join(parts) if changed_sections: sections = "\n".join(f" ~ {s}" for s in sorted(changed_sections)[:10]) summary += f"\n\nChanged sections:\n{sections}" @@ -90,22 +83,15 @@ def _compute_diff_summary(original: str, optimized: str) -> str: return summary -def _evaluate_on_tasks( - adapter: SkillAdapter, - candidate: dict[str, str], - tasks: list[dict[str, Any]], -) -> tuple[float, dict[str, float]]: - """Run adapter on a set of tasks and return mean score + per-task scores.""" +def _evaluate_on_tasks(evaluator, candidate, tasks): + """Run evaluator on tasks and return mean score + per-task scores.""" gepa_instances = to_gepa_instances(tasks) - eval_batch = adapter.evaluate(gepa_instances, candidate) - - per_task: dict[str, float] = {} - for i, score in enumerate(eval_batch.scores): - task_id = tasks[i].get("id", f"task_{i}") - per_task[task_id] = score - - mean_score = sum(per_task.values()) / len(per_task) if per_task else 0.0 - return mean_score, per_task + per_task = {} + for i, inst in enumerate(gepa_instances): + score, _ = evaluator(candidate, inst) + 
per_task[tasks[i].get("id", f"task_{i}")] = score + mean = sum(per_task.values()) / len(per_task) if per_task else 0.0 + return mean, per_task def optimize_skill( @@ -119,45 +105,29 @@ def optimize_skill( tools_only: bool = False, dry_run: bool = False, ) -> OptimizationResult: - """Run end-to-end GEPA optimization on a skill, optionally with MCP tools. + """Run end-to-end GEPA optimization on a skill and/or tools. - 1. Load current SKILL.md as seed_candidate - 2. Optionally load MCP tool descriptions as additional components - 3. Create train/val datasets from ground_truth.yaml - 4. Build adapter from existing scorers - 5. Run gepa.optimize() (round-robin across all components) - 6. Log results to MLflow - 7. Return OptimizationResult with original/optimized scores and diff + Uses optimize_anything API with a simple evaluator function. Args: skill_name: Name of the skill to optimize - mode: "static" (uses ground truth responses) or "generative" (generates fresh) - preset: GEPA config preset ("quick", "standard", "thorough") - task_lm: LLM model for generative mode - reflection_lm: Override reflection LM (default: GEPA_REFLECTION_LM env or databricks/databricks-gpt-5-2) - include_tools: If True, include MCP tool descriptions as additional GEPA components - tool_modules: Specific tool modules to include (e.g., ["sql", "compute"]). None = all. 
- tools_only: If True, optimize ONLY tool descriptions (no SKILL.md) - dry_run: If True, show config and dataset info without running optimization - - Returns: - OptimizationResult with scores, content, and diff - - Raises: - FileNotFoundError: If SKILL.md cannot be found + mode: "static" or "generative" + preset: "quick" (15), "standard" (50), "thorough" (150) + task_lm: LLM for generative mode + reflection_lm: Override reflection LM + include_tools: Include MCP tool descriptions as additional components + tool_modules: Specific tool modules (None = all) + tools_only: Optimize ONLY tool descriptions + dry_run: Show config without running """ - # 1. Load current SKILL.md + # 1. Load SKILL.md skill_path = _find_skill_md(skill_name) if not tools_only and skill_path is None: - raise FileNotFoundError( - f"Could not find SKILL.md for '{skill_name}'. " - "Expected at .claude/skills/{name}/SKILL.md or databricks-skills/{name}/SKILL.md" - ) + raise FileNotFoundError(f"Could not find SKILL.md for '{skill_name}'") original_content = skill_path.read_text() if skill_path else "" - original_token_count = count_tokens(original_content) if original_content else 0 - # 1b. Load MCP tool descriptions if requested + # 1b. Load MCP tool descriptions tool_map = None tool_components: dict[str, str] = {} if include_tools or tools_only: @@ -167,25 +137,13 @@ def optimize_skill( print(f"Tool modules: {stats['modules']}, tools: {stats['total_tools']}, " f"description chars: {stats['total_description_chars']:,}") - # 2. Create train/val datasets - try: - train, val = create_gepa_datasets(skill_name) - except FileNotFoundError: - train, val = [], None - - if not train: - train = generate_bootstrap_tasks(skill_name) - val = None - print(f"No test cases found for '{skill_name}'. Using {len(train)} auto-generated tasks.") - print(f"For better results, add test cases: skill-test add {skill_name}") - - # 3. 
Build seed_candidate (multi-component if tools included) - original_token_counts: dict[str, int] = {} + # 2. Build seed_candidate (multi-component dict) seed_candidate: dict[str, str] = {} + original_token_counts: dict[str, int] = {} if not tools_only: - seed_candidate[SkillAdapter.SKILL_KEY] = original_content - original_token_counts[SkillAdapter.SKILL_KEY] = original_token_count + seed_candidate[SKILL_KEY] = original_content + original_token_counts[SKILL_KEY] = count_tokens(original_content) for comp_name, comp_text in tool_components.items(): seed_candidate[comp_name] = comp_text @@ -193,16 +151,27 @@ def optimize_skill( total_original_tokens = sum(original_token_counts.values()) - # 3b. Build adapter with per-component token counts - adapter = create_skill_adapter( + # 3. Load datasets + try: + train, val = create_gepa_datasets(skill_name) + except FileNotFoundError: + train, val = [], None + + if not train: + train = generate_bootstrap_tasks(skill_name) + val = None + print(f"No test cases found. Using {len(train)} auto-generated tasks.") + + # 4. Build evaluator + evaluator = create_skill_evaluator( skill_name, mode=mode, task_lm=task_lm, original_token_counts=original_token_counts, ) - # 4. Get preset config (with optional reflection LM override) - preset_config = get_preset(preset, reflection_lm=reflection_lm) + # 5. 
Get config + config = get_preset(preset, reflection_lm=reflection_lm) - # Dry run: show info and exit + # Dry run if dry_run: print(f"\n=== Dry Run: {skill_name} ===") if not tools_only: @@ -214,13 +183,10 @@ def optimize_skill( print(f"Train tasks: {len(train)}") print(f"Val tasks: {len(val) if val else 'None (single-task mode)'}") print(f"Mode: {mode}") - print(f"Preset: {preset} (max_metric_calls={preset_config.max_metric_calls})") - print(f"Reflection LM: {preset_config.reflection_lm}") - if mode == "generative": - print(f"Task LM: {task_lm or 'not set'}") + print(f"Preset: {preset} (max_metric_calls={config.engine.max_metric_calls})") + print(f"Reflection LM: {config.reflection.reflection_lm}") - # Evaluate current score - original_score, _ = _evaluate_on_tasks(adapter, seed_candidate, train) + original_score, _ = _evaluate_on_tasks(evaluator, seed_candidate, train) print(f"Current score: {original_score:.3f}") return OptimizationResult( @@ -242,85 +208,69 @@ def optimize_skill( tool_map=tool_map, ) - # Evaluate original score - original_score, _ = _evaluate_on_tasks(adapter, seed_candidate, train) + # Evaluate original + original_score, _ = _evaluate_on_tasks(evaluator, seed_candidate, train) - # 5. Convert datasets to GEPA format + # 6. Build background and objective + background = build_optimization_background( + skill_name, total_original_tokens, + component_names=list(seed_candidate.keys()), + ) + objective = ( + f"Optimize the '{skill_name}' skill for maximum quality and minimum token count. " + "Higher quality scores and fewer tokens are both better." + ) + + # 7. Convert datasets to GEPA format trainset = to_gepa_instances(train) valset = to_gepa_instances(val) if val else None - # 6. 
Run gepa.optimize() - gepa_kwargs = { - "seed_candidate": seed_candidate, - "trainset": trainset, - "adapter": adapter, - **preset_config.to_kwargs(), - } - if valset: - gepa_kwargs["valset"] = valset - if task_lm: - gepa_kwargs["task_lm"] = task_lm - - result = gepa.optimize(**gepa_kwargs) - - # result.best_candidate is dict[str, str] (all components) + # 8. Run optimize_anything + result = optimize_anything( + seed_candidate=seed_candidate, + evaluator=evaluator, + dataset=trainset, + valset=valset, + objective=objective, + background=background, + config=config, + ) + + # 9. Extract results best = result.best_candidate - optimized_content = best.get(SkillAdapter.SKILL_KEY, original_content) + optimized_content = best.get(SKILL_KEY, original_content) optimized_token_count = sum(count_tokens(v) for v in best.values()) - # Evaluate optimized on train (using full multi-component candidate) - optimized_score, _ = _evaluate_on_tasks(adapter, best, train) + optimized_score, _ = _evaluate_on_tasks(evaluator, best, train) - # Evaluate on val if available val_scores: dict[str, float] = {} if val: - _, val_scores = _evaluate_on_tasks(adapter, best, val) + _, val_scores = _evaluate_on_tasks(evaluator, best, val) - # Token reduction (total across all components) token_reduction_pct = ( (total_original_tokens - optimized_token_count) / total_original_tokens * 100 - if total_original_tokens > 0 - else 0.0 + if total_original_tokens > 0 else 0.0 ) - # Write optimized tool descriptions back if tools were included - optimized_tool_components: dict[str, str] = {} - if tool_map and not dry_run: - for comp_name, comp_text in best.items(): - if comp_name.startswith("tools_"): - optimized_tool_components[comp_name] = comp_text - - # Diff summary diff_summary = _compute_diff_summary(original_content, optimized_content) - # 7. Log to MLflow (best-effort) + # 10. 
MLflow logging (best-effort) mlflow_run_id = None try: import mlflow - stc = SkillTestConfig() setup_mlflow(stc) - with mlflow.start_run(run_name=f"{skill_name}_optimize_{preset}"): - mlflow.set_tags( - { - "optimizer": "gepa", - "skill_name": skill_name, - "preset": preset, - "mode": mode, - } - ) - mlflow.log_metrics( - { - "original_score": original_score, - "optimized_score": optimized_score, - "improvement": optimized_score - original_score, - "original_tokens": float(total_original_tokens), - "optimized_tokens": float(optimized_token_count), - "token_reduction_pct": token_reduction_pct, - "total_metric_calls": float(result.total_metric_calls or 0), - } - ) + mlflow.set_tags({"optimizer": "gepa", "skill_name": skill_name, "preset": preset, "mode": mode}) + mlflow.log_metrics({ + "original_score": original_score, + "optimized_score": optimized_score, + "improvement": optimized_score - original_score, + "original_tokens": float(total_original_tokens), + "optimized_tokens": float(optimized_token_count), + "token_reduction_pct": token_reduction_pct, + "total_metric_calls": float(result.total_metric_calls or 0), + }) mlflow_run_id = mlflow.active_run().info.run_id except Exception: pass diff --git a/.test/tests/test_optimize_e2e.py b/.test/tests/test_optimize_e2e.py index 9299cb0f..355beba0 100644 --- a/.test/tests/test_optimize_e2e.py +++ b/.test/tests/test_optimize_e2e.py @@ -1,51 +1,39 @@ -"""End-to-end test: optimize an existing skill, assert quality up + tokens down. +"""End-to-end tests for GEPA skill optimization using optimize_anything API. -This test validates the entire GEPA optimization pipeline works and that -GEPA actually produces better, leaner skills. +Unit tests run without API keys. E2E tests require GEPA reflection LM access. -NOTE: The E2E class is a slow integration test that calls GEPA with real LLM -reflection. 
It requires: - - gepa>=0.0.7 installed - - OPENAI_API_KEY set (for GEPA reflection LM) +Run unit tests: + cd .test && uv run pytest tests/test_optimize_e2e.py -v -k "not TestOptimizeE2E" -Run unit tests only: - cd .test && uv run pytest tests/test_optimize_e2e.py -v -k "not E2E" - -Run everything (slow): +Run everything (slow, requires API key): cd .test && uv run pytest tests/test_optimize_e2e.py -v -s """ import pytest -from skill_test.optimize.evaluator import token_efficiency_score, count_tokens +from skill_test.optimize.evaluator import token_efficiency_score, count_tokens, SKILL_KEY from skill_test.optimize.splitter import create_gepa_datasets, generate_bootstrap_tasks, to_gepa_instances from skill_test.optimize.asi import feedback_to_score, feedback_to_asi -from skill_test.optimize.config import PRESETS, GEPAPreset try: from mlflow.entities import Feedback - HAS_MLFLOW = True except ImportError: Feedback = None HAS_MLFLOW = False try: - import gepa - + from gepa.optimize_anything import GEPAConfig, EngineConfig, ReflectionConfig HAS_GEPA = True except ImportError: HAS_GEPA = False # -------------------------------------------------------------------------- -# Step 1: Unit tests (no GEPA/LLM required) +# Unit tests (no GEPA/LLM required) # -------------------------------------------------------------------------- - class TestTokenEfficiency: - """Verification: token counting and efficiency scoring.""" - def test_same_size_scores_one(self): text = "Hello world, this is a test." 
tokens = count_tokens(text) @@ -59,68 +47,40 @@ def test_double_size_scores_zero(self): tokens = count_tokens(text) assert token_efficiency_score(text + text, tokens) == pytest.approx(0.0, abs=0.05) - def test_ten_pct_larger(self): - base = "word " * 100 - base_tokens = count_tokens(base) - larger = base + "extra " * 10 - larger_tokens = count_tokens(larger) - ratio = larger_tokens / base_tokens - expected = max(0.0, min(1.0, 2.0 - ratio)) - assert token_efficiency_score(larger, base_tokens) == pytest.approx(expected, abs=0.05) - def test_zero_original_returns_one(self): assert token_efficiency_score("anything", 0) == 1.0 class TestSplitter: - """Verification: dataset splitting and bootstrap task generation.""" - - def test_small_dataset_no_val(self): - """Skills with <5 test cases should use all as train, val=None.""" - try: - train, val = create_gepa_datasets("databricks-genie") - if len(train) < 5: - assert val is None - except FileNotFoundError: - pytest.skip("No ground_truth.yaml for databricks-genie") - def test_model_serving_has_split(self): - """databricks-model-serving should have enough cases for a split.""" try: train, val = create_gepa_datasets("databricks-model-serving") assert len(train) > 0 if len(train) + (len(val) if val else 0) >= 5: assert val is not None - assert len(val) > 0 except FileNotFoundError: - pytest.skip("No ground_truth.yaml for databricks-model-serving") + pytest.skip("No ground_truth.yaml") def test_reproducible_splits(self): - """Same seed should produce identical splits.""" try: - train1, val1 = create_gepa_datasets("databricks-model-serving", seed=42) - train2, val2 = create_gepa_datasets("databricks-model-serving", seed=42) - assert [t["id"] for t in train1] == [t["id"] for t in train2] - if val1 and val2: - assert [t["id"] for t in val1] == [t["id"] for t in val2] + t1, v1 = create_gepa_datasets("databricks-model-serving", seed=42) + t2, v2 = create_gepa_datasets("databricks-model-serving", seed=42) + assert [t["id"] for t in 
t1] == [t["id"] for t in t2] except FileNotFoundError: - pytest.skip("No ground_truth.yaml for databricks-model-serving") + pytest.skip("No ground_truth.yaml") def test_tasks_have_correct_keys(self): - """Tasks should have the expected keys for GEPA compatibility.""" try: train, _ = create_gepa_datasets("databricks-model-serving") - assert len(train) > 0 for task in train: assert "id" in task assert "input" in task assert "answer" in task assert "additional_context" in task except FileNotFoundError: - pytest.skip("No ground_truth.yaml for databricks-model-serving") + pytest.skip("No ground_truth.yaml") def test_to_gepa_instances(self): - """to_gepa_instances should produce DefaultDataInst-compatible dicts.""" try: train, _ = create_gepa_datasets("databricks-model-serving") instances = to_gepa_instances(train) @@ -129,27 +89,20 @@ def test_to_gepa_instances(self): assert "input" in inst assert "additional_context" in inst assert "answer" in inst - # Should NOT have internal-only keys assert "id" not in inst - assert "metadata" not in inst except FileNotFoundError: - pytest.skip("No ground_truth.yaml for databricks-model-serving") + pytest.skip("No ground_truth.yaml") def test_bootstrap_tasks_generated(self): - """Bootstrap should generate tasks from SKILL.md headers.""" tasks = generate_bootstrap_tasks("databricks-model-serving") assert len(tasks) > 0 for task in tasks: assert "id" in task assert "input" in task - assert "additional_context" in task - assert "metadata" in task @pytest.mark.skipif(not HAS_MLFLOW, reason="mlflow not installed") class TestASI: - """Verification: Feedback -> GEPA score conversion.""" - def test_yes_scores_one(self): assert feedback_to_score(Feedback(name="test", value="yes")) == 1.0 @@ -159,153 +112,118 @@ def test_no_scores_zero(self): def test_skip_returns_none(self): assert feedback_to_score(Feedback(name="test", value="skip")) is None - def test_numeric_value(self): - assert feedback_to_score(Feedback(name="test", value="0.75")) == 
0.75 - def test_feedback_to_asi_composite(self): feedbacks = [ Feedback(name="syntax", value="yes", rationale="Valid"), Feedback(name="pattern", value="no", rationale="Missing X"), Feedback(name="optional", value="skip", rationale="N/A"), ] - score, diag = feedback_to_asi(feedbacks) - # Mean of [1.0, 0.0] = 0.5 + score, si = feedback_to_asi(feedbacks) assert score == pytest.approx(0.5) - assert diag["syntax"]["score"] == 1.0 - assert diag["pattern"]["score"] == 0.0 - assert diag["optional"]["status"] == "skipped" - assert diag["_summary"]["scored"] == 2 - assert diag["_summary"]["skipped"] == 1 - # Failure messages collected - assert len(diag["_failure_messages"]) >= 1 + assert si["syntax"]["score"] == 1.0 + assert si["pattern"]["score"] == 0.0 + assert si["optional"]["status"] == "skipped" + assert si["_summary"]["scored"] == 2 +@pytest.mark.skipif(not HAS_GEPA, reason="gepa not installed") class TestConfig: - """Verification: GEPA config presets.""" - def test_presets_exist(self): + from skill_test.optimize.config import PRESETS assert "quick" in PRESETS assert "standard" in PRESETS assert "thorough" in PRESETS def test_quick_has_fewer_calls(self): - assert PRESETS["quick"].max_metric_calls < PRESETS["standard"].max_metric_calls + from skill_test.optimize.config import PRESETS + assert PRESETS["quick"].engine.max_metric_calls < PRESETS["standard"].engine.max_metric_calls - def test_thorough_has_most_calls(self): - assert PRESETS["thorough"].max_metric_calls > PRESETS["standard"].max_metric_calls - - def test_to_kwargs(self): - kwargs = PRESETS["quick"].to_kwargs() - assert "max_metric_calls" in kwargs - assert "reflection_lm" in kwargs - assert kwargs["max_metric_calls"] == 15 - - -# -------------------------------------------------------------------------- -# Step 6: New skill test (bootstrap mode) -# -------------------------------------------------------------------------- + def test_presets_are_gepa_configs(self): + from skill_test.optimize.config import 
PRESETS + for name, cfg in PRESETS.items(): + assert isinstance(cfg, GEPAConfig), f"{name} is not GEPAConfig" + assert isinstance(cfg.engine, EngineConfig) + assert isinstance(cfg.reflection, ReflectionConfig) class TestBootstrapMode: - """Verification: new skills without ground_truth.yaml can still bootstrap.""" - def test_nonexistent_skill_returns_empty(self): tasks = generate_bootstrap_tasks("nonexistent-skill-xyz") - # No SKILL.md found -> empty list assert tasks == [] def test_bootstrap_has_gepa_format(self): - """Bootstrap tasks should be GEPA-compatible after conversion.""" tasks = generate_bootstrap_tasks("databricks-model-serving") if not tasks: - pytest.skip("No SKILL.md found for databricks-model-serving") + pytest.skip("No SKILL.md found") instances = to_gepa_instances(tasks) for inst in instances: assert isinstance(inst["input"], str) assert isinstance(inst["additional_context"], dict) - assert isinstance(inst["answer"], str) -# -------------------------------------------------------------------------- -# Step 2: Dry run (requires adapter but not GEPA optimization) -# -------------------------------------------------------------------------- +@pytest.mark.skipif(not HAS_GEPA, reason="gepa not installed") +class TestToolExtraction: + def test_extract_tools(self): + from skill_test.optimize.tools import extract_tool_descriptions, get_tool_stats + stats = get_tool_stats() + assert stats["modules"] > 0 + assert stats["total_tools"] > 0 + + def test_tools_to_gepa_components(self): + from skill_test.optimize.tools import extract_tool_descriptions, tools_to_gepa_components + tool_map = extract_tool_descriptions(modules=["sql"]) + components = tools_to_gepa_components(tool_map) + assert "tools_sql" in components + assert "### TOOL:" in components["tools_sql"] @pytest.mark.skipif(not HAS_GEPA, reason="gepa not installed") class TestDryRun: - """Verification: dry run shows config without calling GEPA.""" - - def test_dry_run_returns_result(self): + def 
test_dry_run_skill_only(self): from skill_test.optimize.runner import optimize_skill - try: - result = optimize_skill( - skill_name="databricks-model-serving", - mode="static", - preset="quick", - dry_run=True, - ) + result = optimize_skill("databricks-model-serving", preset="quick", dry_run=True) assert result.improvement == 0.0 assert result.original_content == result.optimized_content assert result.gepa_result is None assert result.original_token_count > 0 - print(f"\nDry run score: {result.original_score:.3f}") - print(f"Original tokens: {result.original_token_count:,}") except FileNotFoundError: - pytest.skip("SKILL.md not found for databricks-model-serving") + pytest.skip("SKILL.md not found") + + def test_dry_run_with_tools(self): + from skill_test.optimize.runner import optimize_skill + try: + result = optimize_skill( + "databricks-model-serving", preset="quick", dry_run=True, + include_tools=True, tool_modules=["serving"], + ) + assert SKILL_KEY in result.components + assert "tools_serving" in result.components + assert result.original_token_count > 0 + except FileNotFoundError: + pytest.skip("SKILL.md not found") # -------------------------------------------------------------------------- -# Steps 3-5, 7-8: E2E integration (requires GEPA + LLM API key) +# E2E integration (requires GEPA + LLM API key) # -------------------------------------------------------------------------- - @pytest.mark.skipif(not HAS_GEPA, reason="gepa not installed") @pytest.mark.slow class TestOptimizeE2E: - """End-to-end optimization test. - - Picks an existing skill, runs GEPA optimization, and asserts both - quality improvement and token reduction. - """ - def test_optimize_improves_quality_and_reduces_tokens(self): - """Optimize databricks-spark-declarative-pipelines (largest skill). - - Asserts: - 1. Quality score does not regress - 2. Token count does not increase by >5% - 3. 
Validation set score within 5% of train (no overfitting) - """ from skill_test.optimize.runner import optimize_skill - result = optimize_skill( skill_name="databricks-spark-declarative-pipelines", mode="static", preset="quick", ) + assert result.optimized_score >= result.original_score + assert result.optimized_token_count <= result.original_token_count * 1.05 - # 1. Quality must not regress - assert result.optimized_score >= result.original_score, ( - f"Quality regressed: {result.original_score:.3f} -> {result.optimized_score:.3f}" - ) - - # 2. Token count must not increase significantly - assert result.optimized_token_count <= result.original_token_count * 1.05, ( - f"Tokens grew: {result.original_token_count:,} -> {result.optimized_token_count:,}" - ) - - # 3. No overfitting if result.val_scores: avg_val = sum(result.val_scores.values()) / len(result.val_scores) - assert avg_val >= result.optimized_score - 0.05, ( - f"Overfitting: train={result.optimized_score:.3f}, val={avg_val:.3f}" - ) + assert avg_val >= result.optimized_score - 0.05 - print(f"\n=== E2E Results ===") - print(f"Quality: {result.original_score:.3f} -> {result.optimized_score:.3f} " - f"({result.improvement:+.3f})") - print(f"Tokens: {result.original_token_count:,} -> {result.optimized_token_count:,} " - f"({result.token_reduction_pct:+.1f}%)") - if result.mlflow_run_id: - print(f"MLflow: {result.mlflow_run_id}") + print(f"\nQuality: {result.original_score:.3f} -> {result.optimized_score:.3f}") + print(f"Tokens: {result.original_token_count:,} -> {result.optimized_token_count:,}") From bafd451828f2492549e7285bab6e3aec8c32d90a Mon Sep 17 00:00:00 2001 From: CAholder Date: Tue, 24 Feb 2026 10:07:09 -0800 Subject: [PATCH 06/20] addition and updates to evaluation powering GEPA. 
added capability to add examples from MLflow traces or manual --- .test/README.md | 579 +++++++++++++- .../gepa_skill_optimization_demo.ipynb | 40 +- .test/scripts/add_example.py | 272 +++++++ .test/scripts/generate_ground_truth.py | 645 ++++++++++++++++ .test/scripts/optimize.py | 31 +- .test/scripts/trace_to_examples.py | 710 ++++++++++++++++++ .../databricks-app-apx/ground_truth.yaml | 311 ++++++++ .test/skills/databricks-app-apx/manifest.yaml | 28 + .../databricks-config/ground_truth.yaml | 199 +++++ .test/skills/databricks-config/manifest.yaml | 28 + .../skills/databricks-dbsql/ground_truth.yaml | 366 +++++++++ .test/skills/databricks-dbsql/manifest.yaml | 34 + .../skills/databricks-docs/ground_truth.yaml | 170 +++++ .test/skills/databricks-docs/manifest.yaml | 29 + .../skills/databricks-jobs/ground_truth.yaml | 348 +++++++++ .test/skills/databricks-jobs/manifest.yaml | 33 + .../ground_truth.yaml | 317 ++++++++ .../manifest.yaml | 30 + .../ground_truth.yaml | 387 ++++++++++ .../manifest.yaml | 29 + .../databricks-metric-views/ground_truth.yaml | 346 +++++++++ .../databricks-metric-views/manifest.yaml | 32 + .../ground_truth.yaml | 365 +++++++++ .../manifest.yaml | 31 + .../databricks-python-sdk/ground_truth.yaml | 378 ++++++++++ .../databricks-python-sdk/manifest.yaml | 30 + .../ground_truth.yaml | 227 ++++++ .../manifest.yaml | 30 + .../ground_truth.yaml | 392 ++++++++++ .../manifest.yaml | 30 + .../ground_truth.yaml | 255 +++++++ .../databricks-unity-catalog/manifest.yaml | 31 + .../ground_truth.yaml | 271 +++++++ .../manifest.yaml | 30 + .../ground_truth.yaml | 345 +++++++++ .../databricks-vector-search/manifest.yaml | 31 + .test/src/skill_test/optimize/asi.py | 85 ++- .test/src/skill_test/optimize/config.py | 275 ++++++- .test/src/skill_test/optimize/evaluator.py | 292 ++++++- .test/src/skill_test/optimize/runner.py | 180 ++++- .test/src/skill_test/optimize/splitter.py | 33 + .test/uv.lock | 23 +- 42 files changed, 8158 insertions(+), 140 deletions(-) 
create mode 100644 .test/scripts/add_example.py create mode 100644 .test/scripts/generate_ground_truth.py create mode 100644 .test/scripts/trace_to_examples.py create mode 100644 .test/skills/databricks-app-apx/ground_truth.yaml create mode 100644 .test/skills/databricks-app-apx/manifest.yaml create mode 100644 .test/skills/databricks-config/ground_truth.yaml create mode 100644 .test/skills/databricks-config/manifest.yaml create mode 100644 .test/skills/databricks-dbsql/ground_truth.yaml create mode 100644 .test/skills/databricks-dbsql/manifest.yaml create mode 100644 .test/skills/databricks-docs/ground_truth.yaml create mode 100644 .test/skills/databricks-docs/manifest.yaml create mode 100644 .test/skills/databricks-jobs/ground_truth.yaml create mode 100644 .test/skills/databricks-jobs/manifest.yaml create mode 100644 .test/skills/databricks-lakebase-autoscale/ground_truth.yaml create mode 100644 .test/skills/databricks-lakebase-autoscale/manifest.yaml create mode 100644 .test/skills/databricks-lakebase-provisioned/ground_truth.yaml create mode 100644 .test/skills/databricks-lakebase-provisioned/manifest.yaml create mode 100644 .test/skills/databricks-metric-views/ground_truth.yaml create mode 100644 .test/skills/databricks-metric-views/manifest.yaml create mode 100644 .test/skills/databricks-mlflow-evaluation/ground_truth.yaml create mode 100644 .test/skills/databricks-mlflow-evaluation/manifest.yaml create mode 100644 .test/skills/databricks-python-sdk/ground_truth.yaml create mode 100644 .test/skills/databricks-python-sdk/manifest.yaml create mode 100644 .test/skills/databricks-spark-structured-streaming/ground_truth.yaml create mode 100644 .test/skills/databricks-spark-structured-streaming/manifest.yaml create mode 100644 .test/skills/databricks-synthetic-data-generation/ground_truth.yaml create mode 100644 .test/skills/databricks-synthetic-data-generation/manifest.yaml create mode 100644 .test/skills/databricks-unity-catalog/ground_truth.yaml create mode 
100644 .test/skills/databricks-unity-catalog/manifest.yaml create mode 100644 .test/skills/databricks-unstructured-pdf-generation/ground_truth.yaml create mode 100644 .test/skills/databricks-unstructured-pdf-generation/manifest.yaml create mode 100644 .test/skills/databricks-vector-search/ground_truth.yaml create mode 100644 .test/skills/databricks-vector-search/manifest.yaml diff --git a/.test/README.md b/.test/README.md index d7626622..4750874f 100644 --- a/.test/README.md +++ b/.test/README.md @@ -12,10 +12,11 @@ uv run python .test/scripts/optimize.py databricks-model-serving --preset quick This will: 1. Load the SKILL.md and its test cases from `ground_truth.yaml` -2. Score the current skill against deterministic scorers (syntax, patterns, APIs, facts) -3. Run GEPA's optimization loop (reflect on failures, propose mutations, select via Pareto frontier) -4. Show a diff with quality improvement and token reduction -5. Apply the optimized SKILL.md back to disk +2. Have a generation model (Sonnet) produce responses using ONLY the skill, then score those responses +3. Also score the SKILL.md itself for pattern/fact coverage +4. Run GEPA's optimization loop (reflect on failures, propose mutations, select via Pareto frontier) +5. Show a diff with quality improvement and token reduction +6. Apply the optimized SKILL.md back to disk ## Setup @@ -23,14 +24,18 @@ This will: # Install with optimization dependencies uv pip install -e ".test/[all]" -# Authentication for the reflection model (pick one) -# Option A: Databricks Model Serving (default) +# Authentication for models (pick one) +# Option A: Databricks Model Serving (default for both gen + reflection) export DATABRICKS_API_KEY="dapi..." -export DATABRICKS_API_BASE="https://<workspace>.cloud.databricks.com" +export DATABRICKS_API_BASE="https://<workspace>.cloud.databricks.com/serving-endpoints" # Option B: OpenAI export OPENAI_API_KEY="sk-..." 
export GEPA_REFLECTION_LM="openai/gpt-4o" +export GEPA_GEN_LM="openai/gpt-4o" + +# Optional: override generation model (default: databricks/databricks-claude-sonnet-4-6) +# export GEPA_GEN_LM="databricks/databricks-claude-sonnet-4-6" ``` --- @@ -40,7 +45,7 @@ export GEPA_REFLECTION_LM="openai/gpt-4o" ### Evaluate + Optimize a Skill ```bash -# Standard optimization (50 iterations) +# Standard optimization (50 iterations per component, up to 5 passes) uv run python .test/scripts/optimize.py # Quick pass (15 iterations, good for initial check) @@ -55,6 +60,12 @@ uv run python .test/scripts/optimize.py --dry-run # Optimize and apply the result uv run python .test/scripts/optimize.py --apply +# Use a specific generation model for evaluation +uv run python .test/scripts/optimize.py --gen-model "openai/gpt-4o" + +# Control iteration depth (default: 5 passes) +uv run python .test/scripts/optimize.py --max-passes 3 + # Optimize all skills that have test cases uv run python .test/scripts/optimize.py --all --preset quick ``` @@ -82,40 +93,223 @@ uv run python .test/scripts/optimize.py databricks-model-serving --include-tools When `--include-tools` is used, GEPA creates one component per tool module (e.g., `tools_sql`, `tools_serving`) and round-robins through them alongside `skill_md`. The `--apply` flag writes optimized docstrings back to the MCP server source files. +The iteration budget (`max_metric_calls`) is automatically scaled by the number of components so each one gets the preset's full budget. Additionally, the optimizer runs **up to 5 passes** (configurable with `--max-passes`), re-seeding from the previous best each time. It stops early if a pass produces no improvement. 
+ Available tool modules (88 tools across 16 modules): `agent_bricks`, `aibi_dashboards`, `apps`, `compute`, `file`, `genie`, `jobs`, `lakebase`, `manifest`, `pipelines`, `serving`, `sql`, `unity_catalog`, `user`, `vector_search`, `volume_files` +### Changing the Generation Model + +The evaluator uses a **generation model** to simulate an agent reading the SKILL.md and producing a response. Better skill content leads to better generated responses, which drives GEPA to make meaningful improvements. + +The default is `databricks/databricks-claude-sonnet-4-6`. + +| Method | Example | +|--------|---------| +| Environment variable | `export GEPA_GEN_LM="databricks/databricks-claude-sonnet-4-6"` | +| CLI flag | `--gen-model "openai/gpt-4o"` | +| Python | `optimize_skill("my-skill", gen_model="anthropic/claude-sonnet-4-5-20250514")` | + ### Changing the Reflection Model -GEPA uses a reflection LM to analyze scorer failures and propose skill improvements. The default is **Databricks Model Serving** (`databricks-gpt-5-2`). +GEPA uses a reflection LM to analyze scorer failures and propose skill improvements. The default is **Databricks Model Serving** (`databricks-claude-opus-4-6`, 200K context). 
| Method | Example | |--------|---------| -| Environment variable | `export GEPA_REFLECTION_LM="databricks/databricks-gpt-5-2"` | +| Environment variable | `export GEPA_REFLECTION_LM="databricks/databricks-claude-opus-4-6"` | | CLI flag | `--reflection-lm "openai/gpt-4o"` | | Python | `optimize_skill("my-skill", reflection_lm="anthropic/claude-sonnet-4-5-20250514")` | Model strings use [litellm provider prefixes](https://docs.litellm.ai/docs/providers): -| Provider | Prefix | Example | -|----------|--------|---------| -| Databricks Model Serving | `databricks/` | `databricks/databricks-gpt-5-2` | -| OpenAI | `openai/` | `openai/gpt-4o` | -| Anthropic | `anthropic/` | `anthropic/claude-sonnet-4-5-20250514` | +| Provider | Prefix | Example | Context | +|----------|--------|---------|---------| +| Databricks Model Serving | `databricks/` | `databricks/databricks-claude-opus-4-6` | 200K | +| OpenAI | `openai/` | `openai/gpt-4o` | 128K | +| Anthropic | `anthropic/` | `anthropic/claude-sonnet-4-5-20250514` | 200K | + +**Context window requirement:** The reflection model must have a context window large enough to hold the full candidate (all components) plus GEPA's reflection overhead (~3x the raw candidate tokens). Models with small context windows (e.g., 8K) will fail with `BadRequestError` during reflection. The optimizer validates this upfront and warns if the model is too small. ### Authentication | Provider | Required Environment Variables | |----------|-------------------------------| -| Databricks | `DATABRICKS_API_KEY`, `DATABRICKS_API_BASE` | +| Databricks | `DATABRICKS_API_KEY`, `DATABRICKS_API_BASE` (must end with `/serving-endpoints`) | | OpenAI | `OPENAI_API_KEY` | | Anthropic | `ANTHROPIC_API_KEY` | +Optional overrides: `GEPA_REFLECTION_LM` (reflection model), `GEPA_GEN_LM` (generation model for evaluation). 
+ +--- + +## Best Practices for Optimization + +These practices are derived from the [optimize_anything API guide](https://gepa-ai.github.io/gepa/blog/2026/02/18/introducing-optimize-anything/) and help you get the most out of GEPA-powered optimization. + +### 1. Write Rich Evaluators with Actionable Side Information (ASI) + +GEPA's reflection LM learns from diagnostic feedback, not just scores. The more context your evaluator surfaces, the better the proposals. + +- **Return diagnostics**, not just a number. Use `oa.log()` or return `(score, side_info_dict)` to surface error messages, failing patterns, and missing facts. +- **Break scoring into multiple dimensions**. Rather than a single composite score, provide separate scores for syntax, pattern adherence, API accuracy, and conciseness. GEPA's Pareto-efficient selection preserves candidates that excel in different dimensions. +- Our built-in evaluator already does this -- it returns per-scorer feedback via `feedback_to_asi()`. + +### 2. Build a Diverse Dataset of Test Cases + +GEPA operates in three modes depending on what data you provide: +- **Single-task** (no dataset): evaluator scores the artifact directly +- **Multi-task** (dataset only): Pareto-efficient search across tasks +- **Generalization** (dataset + valset): trains on tasks, validates on held-out examples + +For best results: +- Aim for **5+ test cases** to enable a train/val split (Generalization mode). Fewer than 5 defaults to single-task mode. +- Cover **different categories** of usage (e.g., simple queries, complex joins, error handling). The automatic stratified splitter ensures balanced representation. +- Use `/skill-test add` to interactively generate test cases, then review with `/skill-test review`. + +### 3. Tune reflection_minibatch_size for Focused Improvement + +GEPA's default `reflection_minibatch_size=2` shows the reflection LM feedback from 2 tasks per iteration. 
This keeps each reflection focused and prevents the LM from trying to fix everything at once. + +- For skills with **many test cases** (10+), the default of 2 works well -- over iterations, all tasks get attention. +- For skills with **few test cases** (3-5), consider increasing to 3 so more context is visible per step. + +### 4. Use Multi-Component Optimization for Skills + Tools + +When you optimize a SKILL.md alongside tool descriptions (`--include-tools`), GEPA creates separate components and cycles through them with round-robin selection. This means: + +- Each component gets its **own** optimization budget (the preset's `max_metric_calls` is multiplied by the component count). +- Up to `--max-passes` full optimization cycles run, re-seeding from the best candidate each time. +- Start with `--tool-modules` to target specific modules rather than optimizing all 16 at once. + +### 5. Choose the Right Preset + +| Preset | Budget per Component | Use Case | +|--------|---------------------|----------| +| `quick` | 15 calls | Fast feedback loop, initial exploration | +| `standard` | 50 calls | Default, good balance of quality and cost | +| `thorough` | 150 calls | Production-quality optimization | + +For multi-component runs, the actual `max_metric_calls` = budget x number of components. + +### 6. Leverage the Background Context + +The `background` parameter tells the reflection LM domain-specific constraints. Our optimizer automatically provides Databricks-specific context (token budgets, skill structure rules, scorer descriptions). For custom use cases, you can extend `build_optimization_background()` in `evaluator.py`. + +### 7. Iterate with Dry Runs First + +Always start with `--dry-run` to verify your setup: +```bash +uv run python .test/scripts/optimize.py --include-tools --dry-run +``` +This shows the component list, token counts, current score, and config without calling GEPA. Fix any scorer issues or missing test cases before spending optimization budget. 
+ --- ## Building Test Cases for a Skill -Skills need test cases before optimization can work well. The workflow: +Skills need test cases before optimization can work well. There are three ways to add them: + +### Quick: Extract from MLflow Traces + +If you have MLflow traces (from `mlflow autolog claude`), extract test cases directly from them. You can find your traces at your workspace's MLflow experiment page, e.g.: +`https://<workspace>.cloud.databricks.com/ml/experiments/<experiment-id>/traces` + +```bash +# Step 1: Set authentication +export DATABRICKS_HOST="https://e2-demo-field-eng.cloud.databricks.com" +export DATABRICKS_TOKEN="dapi..." + +# Step 2: List traces in your experiment to see what's available +uv run python .test/scripts/trace_to_examples.py \ + --experiment-id 2452310130108632 --list + +# Step 3: Extract examples from all recent traces +uv run python .test/scripts/trace_to_examples.py \ + --experiment-id 2452310130108632 \ + --skill databricks-model-serving + +# Or extract from a specific trace ID (from the UI or --list output) +uv run python .test/scripts/trace_to_examples.py \ + --trace-id tr-d416fccdab46e2dea6bad1d0bd8aaaa8 \ + --skill databricks-model-serving + +# Or extract from a specific MLflow run ID +uv run python .test/scripts/trace_to_examples.py \ + --run-id abc123def456 \ + --skill databricks-model-serving + +# Refine auto-extracted expectations with an LLM +uv run python .test/scripts/trace_to_examples.py \ + --experiment-id 2452310130108632 \ + --skill databricks-model-serving --refine + +# Auto-append directly to ground_truth.yaml (skip manual review) +uv run python .test/scripts/trace_to_examples.py \ + --experiment-id 2452310130108632 \ + --skill databricks-model-serving --trust + +# Limit how many traces to process +uv run python .test/scripts/trace_to_examples.py \ + --experiment-id 2452310130108632 \ + --skill databricks-model-serving --limit 5 +``` + +You can also extract from local session.jsonl files (Claude Code stores these at 
`~/.claude/projects/`): + +```bash +uv run python .test/scripts/trace_to_examples.py \ + --trace ~/.claude/projects/.../session.jsonl \ + --skill databricks-model-serving +``` + +The script extracts user prompt / assistant response pairs, auto-generates `expected_patterns` from code blocks and `expected_facts` from API references, and saves to `candidates.yaml` for review (or directly to `ground_truth.yaml` with `--trust`). + +#### Workflow: MLflow Traces to Optimized Skill + +The end-to-end workflow for turning real agent sessions into skill improvements: + +```bash +# 1. Extract examples from your traces +uv run python .test/scripts/trace_to_examples.py \ + --experiment-id 2452310130108632 \ + --skill databricks-model-serving --refine + +# 2. Review the extracted candidates +cat .test/skills/databricks-model-serving/candidates.yaml + +# 3. Promote good candidates to ground_truth.yaml +# (edit candidates.yaml, keep the good ones, then) +uv run python .test/scripts/trace_to_examples.py \ + --experiment-id 2452310130108632 \ + --skill databricks-model-serving --trust + +# 4. Run optimization with the enriched dataset +uv run python .test/scripts/optimize.py databricks-model-serving --preset quick + +# 5. 
If score improves, apply +uv run python .test/scripts/optimize.py databricks-model-serving --preset standard --apply +``` + +### Quick: Add a Single Example Manually + +```bash +# Interactive mode — prompts for each field +uv run python .test/scripts/add_example.py databricks-model-serving + +# Inline mode — provide prompt and response directly +uv run python .test/scripts/add_example.py databricks-model-serving \ + --prompt "Create a ChatAgent with tool calling" \ + --response-file /path/to/response.md \ + --facts "Uses ChatAgent class" "Implements predict method" \ + --patterns "ChatAgent" "def predict" + +# From clipboard (paste prompt + response separated by ---) +uv run python .test/scripts/add_example.py databricks-model-serving --from-clipboard +``` + +The script auto-generates an ID, detects code language, extracts patterns from code blocks, and confirms before saving. + +### Full Workflow: Initialize + Add + Review ### 1. Initialize Test Scaffolding @@ -162,6 +356,351 @@ Edit `.test/skills//manifest.yaml` or: --- +## Test Case Examples + +Test cases live in `ground_truth.yaml` and tell GEPA what the skill should produce for a given prompt. Each test case has **inputs** (the user prompt), **outputs** (the expected response), and **expectations** (what scorers check). Here are real examples from the repo. 
+ +### Example 1: Code Generation Test Case + +A test case that checks whether the skill produces correct Python code with the right API calls: + +```yaml +test_cases: + - id: serving_responses_agent_001 + inputs: + prompt: "Create a ResponsesAgent that uses UC functions for tool calling" + outputs: + response: | + ```python + from databricks.agents import ResponsesAgent + + agent = ResponsesAgent( + model="databricks-claude-sonnet-4", + tools=[{"type": "function", "function": {"name": "catalog.schema.my_function"}}], + instructions="You are a helpful assistant.", + ) + ``` + execution_success: true + expectations: + expected_facts: + - "Uses ResponsesAgent from databricks.agents" + - "Includes tools parameter with UC function reference" + - "Sets model to a valid Databricks model endpoint" + expected_patterns: + - pattern: "ResponsesAgent" + min_count: 1 + description: "Must use ResponsesAgent class" + - pattern: "catalog\\.\\w+\\.\\w+" + min_count: 1 + description: "UC function in three-level namespace" + metadata: + category: happy_path + difficulty: medium +``` + +**What GEPA sees:** The evaluator scores the skill against this test case. If the skill doesn't teach the agent about `ResponsesAgent` or UC function namespaces, the pattern scorers fail and GEPA's reflection LM learns what to add. 
+ +### Example 2: SQL/Pipeline Test Case + +A test case for Spark Declarative Pipelines (SDP) with SQL syntax validation: + +```yaml + - id: sdp_bronze_ingestion_001 + inputs: + prompt: "Create a bronze ingestion streaming table that reads from cloud storage using Auto Loader" + outputs: + response: | + ```sql + CREATE OR REFRESH STREAMING TABLE bronze_events + CLUSTER BY (event_date) + AS SELECT * + FROM STREAM read_files('s3://bucket/events/', format => 'json') + ``` + execution_success: null + expectations: + expected_facts: + - "Uses CREATE OR REFRESH STREAMING TABLE syntax" + - "Uses CLUSTER BY for data layout optimization" + - "Uses STREAM read_files() for Auto Loader ingestion" + expected_patterns: + - pattern: "CREATE OR REFRESH STREAMING TABLE" + min_count: 1 + description: "SDP streaming table DDL" + - pattern: "CLUSTER BY" + min_count: 1 + description: "Liquid clustering" + - pattern: "read_files\\(" + min_count: 1 + description: "Auto Loader function" + guidelines: + - "Must use SDP syntax (CREATE OR REFRESH), not legacy DLT syntax" + metadata: + category: happy_path + difficulty: easy +``` + +### Example 3: Error Handling Test Case + +Test cases don't have to be happy paths. Testing that a skill warns about common mistakes: + +```yaml + - id: serving_error_raw_dict_001 + inputs: + prompt: "Why does my ChatAgent return raw dicts instead of proper messages?" + outputs: + response: | + The `predict` method must return `ChatAgentResponse` with + `ChatAgentMessage` objects, not raw dicts. Use + `self.create_text_output_item(text)` to build response items. 
+ execution_success: null + expectations: + expected_facts: + - "Explains the raw dict issue" + - "Mentions self.create_text_output_item()" + expected_patterns: + - pattern: "create_text_output_item" + min_count: 1 + description: "Correct helper method" + metadata: + category: error_handling + difficulty: easy +``` + +### Example 4: Minimal Test Case + +The simplest possible test case -- just a prompt and expected facts: + +```yaml + - id: genie_create_space_001 + inputs: + prompt: "Create a Genie Space for our sales data" + outputs: + response: "I'll create a Genie Space connected to your sales tables." + expectations: + expected_facts: + - "Creates a Genie Space" + - "Connects to data tables" + metadata: + category: happy_path + difficulty: easy +``` + +### How Test Cases Map to GEPA + +The optimizer converts each test case into a GEPA dataset instance: + +| ground_truth.yaml field | GEPA field | Used by | +|------------------------|------------|---------| +| `inputs.prompt` | `input` | Reflection LM sees the task | +| `outputs.response` | `answer` | Deterministic scorers compare against this | +| `expectations.*` | `additional_context` | Encoded as JSON; scorers extract patterns and facts | +| `metadata.category` | (stratification) | Ensures balanced train/val split | + +**Scoring pipeline per test case (layered evaluation):** + +| Layer | Weight | What it does | +|-------|--------|-------------| +| Generated response quality | 40% | An LLM reads ONLY the SKILL.md and answers the test prompt. Its response is scored against expected patterns/facts. | +| Skill content coverage | 25% | Checks if the SKILL.md itself contains the patterns and facts needed to answer test prompts. | +| Reference response check | 5% | Scores the ground truth response as a sanity baseline. | +| Structure validation | 10% | Python/SQL syntax in code blocks, no hallucinated APIs. | +| Token efficiency | 20% | Penalizes bloated skill content (smaller is better). 
| + +The key insight: if a pattern or fact is missing from the SKILL.md, the generation model cannot produce it. This gives GEPA immediate, dynamic signal when content changes. + +--- + +## Manifest Configuration Examples + +The `manifest.yaml` controls which scorers run and what trace expectations apply. Here are patterns for different skill types: + +### Python SDK Skill + +```yaml +skill_name: databricks-model-serving +description: Deploy and query Databricks Model Serving endpoints + +scorers: + enabled: + - python_syntax + - pattern_adherence + - no_hallucinated_apis + - expected_facts_present + default_guidelines: + - "Must use ResponsesAgent pattern for GenAI agents" + - "Must use self.create_text_output_item() for ChatAgent responses" + - "Job-based deployment preferred over synchronous" + +quality_gates: + syntax_valid: 1.0 + pattern_adherence: 0.9 +``` + +### SQL-Heavy Skill + +```yaml +skill_name: databricks-spark-declarative-pipelines +description: Create SDP/LDP pipelines with streaming tables and materialized views + +scorers: + enabled: + - python_syntax + - sql_syntax + - pattern_adherence + - no_hallucinated_apis + - expected_facts_present + default_guidelines: + - "Must use SDP syntax (CREATE OR REFRESH STREAMING TABLE)" + +quality_gates: + syntax_valid: 1.0 + pattern_adherence: 0.9 + execution_success: 0.8 +``` + +### Skill with Trace Expectations + +```yaml +skill_name: databricks-agent-bricks +description: Create Agent Bricks (Knowledge Assistants, Genie, Multi-Agent) + +scorers: + enabled: + - python_syntax + - pattern_adherence + - no_hallucinated_apis + - expected_facts_present + default_guidelines: + - "Must use correct MCP tools (manage_ka, manage_mas)" + - "Must use ka_tile_id not endpoint_name for Knowledge Assistants" + - "Must use find_by_name helper for entity lookups" + trace_expectations: + tool_limits: + manage_ka: 10 + manage_mas: 10 + create_or_update_genie: 5 +``` + +--- + +## End-to-End Walkthrough + +Here's a complete example 
of adding test cases and running optimization for a new skill. + +### Step 1: Initialize scaffolding + +```bash +# Creates manifest.yaml, ground_truth.yaml, candidates.yaml +/skill-test my-new-skill init +``` + +### Step 2: Write test cases + +Edit `.test/skills/my-new-skill/ground_truth.yaml`: + +```yaml +metadata: + skill_name: my-new-skill + version: "1.0" + +test_cases: + - id: basic_001 + inputs: + prompt: "Show me how to create a Delta table with liquid clustering" + outputs: + response: | + ```sql + CREATE TABLE catalog.schema.events ( + event_id BIGINT, + event_date DATE, + payload STRING + ) + CLUSTER BY (event_date) + ``` + expectations: + expected_facts: + - "Uses CREATE TABLE with CLUSTER BY" + expected_patterns: + - pattern: "CLUSTER BY" + min_count: 1 + description: "Liquid clustering syntax" + metadata: + category: happy_path + difficulty: easy + + - id: basic_002 + inputs: + prompt: "How do I read from a Delta table using Spark?" + outputs: + response: | + ```python + df = spark.read.table("catalog.schema.my_table") + ``` + expectations: + expected_facts: + - "Uses three-level namespace" + expected_patterns: + - pattern: "spark\\.read\\.table" + min_count: 1 + description: "Spark table reader" + metadata: + category: happy_path + difficulty: easy + + # ... 
add at least 5 test cases for train/val split +``` + +### Step 3: Verify setup with a dry run + +```bash +uv run python .test/scripts/optimize.py my-new-skill --dry-run +``` + +Output: +``` +=== Dry Run: my-new-skill === +SKILL.md path: .claude/skills/my-new-skill/SKILL.md +Components: ['skill_md'] +Total original tokens: 2,847 + skill_md: 2,847 tokens +Train tasks: 4 +Val tasks: None (single-task mode) +Mode: static +Preset: standard (max_metric_calls=50, scaled for 1 component(s)) +Max passes: 5 +Reflection LM: databricks/databricks-claude-opus-4-6 +Current score: 0.723 +``` + +### Step 4: Run optimization + +```bash +# Quick first pass to see if GEPA can improve +uv run python .test/scripts/optimize.py my-new-skill --preset quick + +# If score improves, run standard for better results +uv run python .test/scripts/optimize.py my-new-skill --preset standard --apply +``` + +### Step 5: Save baseline for regression checking + +```bash +/skill-test my-new-skill baseline + +# Later, after making changes: +/skill-test my-new-skill regression +``` + +### Tips for Writing Good Test Cases + +- **Cover different categories**: happy_path, error_handling, edge cases. The splitter stratifies by `metadata.category` so each category is represented in both train and val sets. +- **Be specific in expected_patterns**: Use regex that captures the essential API call, not surrounding prose. `"ResponsesAgent"` is better than `".*ResponsesAgent.*"`. +- **Include both simple and complex prompts**: Simple prompts test baseline quality; complex prompts stress-test the skill's depth. +- **Set `execution_success`**: `true` if you verified the code runs, `null` if it's theoretical, `false` if it's known to fail. This helps scorers weight results. +- **Use `guidelines` for soft expectations**: Things an LLM judge should check but that can't be captured by regex (e.g., "Should explain why CLUSTER BY is preferred over partitioning"). 
+ +--- + ## Trace Evaluation Capture Claude Code sessions and evaluate against skill expectations. @@ -214,6 +753,12 @@ mlflow autolog claude -u databricks -n "$MLFLOW_EXPERIMENT_NAME" . .test/baselines// └── baseline.yaml # Regression baseline + +.test/scripts/ +├── optimize.py # CLI for GEPA optimization +├── trace_to_examples.py # Extract test cases from session.jsonl traces +├── add_example.py # Manually add test cases to ground_truth.yaml +└── _common.py # Shared CLI utilities ``` --- diff --git a/.test/notebooks/gepa_skill_optimization_demo.ipynb b/.test/notebooks/gepa_skill_optimization_demo.ipynb index f63a0d93..b403bf1f 100644 --- a/.test/notebooks/gepa_skill_optimization_demo.ipynb +++ b/.test/notebooks/gepa_skill_optimization_demo.ipynb @@ -47,22 +47,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "import os\n", - "\n", - "# Configure the reflection model -- pick ONE:\n", - "\n", - "# Option A: Databricks Model Serving (default, recommended)\n", - "# os.environ[\"DATABRICKS_API_KEY\"] = \"dapi...\" \n", - "# os.environ[\"DATABRICKS_API_BASE\"] = \"https://.cloud.databricks.com\"\n", - "# os.environ[\"GEPA_REFLECTION_LM\"] = \"databricks/databricks-gpt-5-2\"\n", - "\n", - "# Option B: OpenAI\n", - "# os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"\n", - "# os.environ[\"GEPA_REFLECTION_LM\"] = \"openai/gpt-4o\"\n", - "\n", - "print(f\"Reflection LM: {os.environ.get('GEPA_REFLECTION_LM', 'databricks/databricks-gpt-5-2 (default)')}\")" - ] + "source": "import os\n\n# Configure the reflection model -- pick ONE:\n\n# Option A: Databricks Model Serving (default, recommended)\n# IMPORTANT: DATABRICKS_API_BASE must end with /serving-endpoints\n# os.environ[\"DATABRICKS_API_KEY\"] = \"dapi...\" \n# os.environ[\"DATABRICKS_API_BASE\"] = \"https://.cloud.databricks.com/serving-endpoints\"\n# os.environ[\"GEPA_REFLECTION_LM\"] = \"databricks/databricks-gpt-5-2\"\n\n# Option B: OpenAI\n# os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"\n# 
os.environ[\"GEPA_REFLECTION_LM\"] = \"openai/gpt-4o\"\n\nprint(f\"Reflection LM: {os.environ.get('GEPA_REFLECTION_LM', 'databricks/databricks-gpt-5-2 (default)')}\")" }, { "cell_type": "markdown", @@ -128,28 +113,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "from skill_test.optimize.evaluator import create_skill_adapter, SkillAdapter\n", - "from skill_test.optimize.splitter import to_gepa_instances\n", - "\n", - "adapter = create_skill_adapter(SKILL_NAME)\n", - "seed_candidate = {SkillAdapter.SKILL_KEY: original_content}\n", - "\n", - "# Evaluate on all train tasks\n", - "gepa_instances = to_gepa_instances(train)\n", - "eval_batch = adapter.evaluate(gepa_instances, seed_candidate)\n", - "\n", - "print(f\"{'Task ID':<35} {'Score':>8}\")\n", - "print(\"-\" * 45)\n", - "for i, score in enumerate(eval_batch.scores):\n", - " task_id = train[i]['id']\n", - " status = 'PASS' if score >= 0.5 else 'FAIL'\n", - " print(f\"{task_id:<35} {score:>7.3f} {status}\")\n", - "\n", - "baseline_score = sum(eval_batch.scores) / len(eval_batch.scores)\n", - "print(f\"\\nBaseline Score: {baseline_score:.3f}\")\n", - "print(f\"Token Count: {original_tokens:,}\")" - ] + "source": "from skill_test.optimize.evaluator import create_skill_evaluator, SKILL_KEY\nfrom skill_test.optimize.splitter import to_gepa_instances\n\nevaluator = create_skill_evaluator(SKILL_NAME)\nseed_candidate = {SKILL_KEY: original_content}\n\n# Evaluate on all train tasks\ngepa_instances = to_gepa_instances(train)\n\nprint(f\"{'Task ID':<35} {'Score':>8}\")\nprint(\"-\" * 45)\nfor i, inst in enumerate(gepa_instances):\n score, side_info = evaluator(seed_candidate, inst)\n task_id = train[i]['id']\n status = 'PASS' if score >= 0.5 else 'FAIL'\n print(f\"{task_id:<35} {score:>7.3f} {status}\")\n\n# Quick baseline\nscores = [evaluator(seed_candidate, inst)[0] for inst in gepa_instances]\nbaseline_score = sum(scores) / len(scores)\nprint(f\"\\nBaseline Score: 
{baseline_score:.3f}\")\nprint(f\"Token Count: {original_tokens:,}\")" }, { "cell_type": "markdown", diff --git a/.test/scripts/add_example.py b/.test/scripts/add_example.py new file mode 100644 index 00000000..68f5c920 --- /dev/null +++ b/.test/scripts/add_example.py @@ -0,0 +1,272 @@ +#!/usr/bin/env python3 +"""Manually add a test case to a skill's ground_truth.yaml. + +Usage: + # Interactive mode — prompts for each field + uv run python .test/scripts/add_example.py databricks-model-serving + + # Inline mode — provide prompt and response directly + uv run python .test/scripts/add_example.py databricks-model-serving \ + --prompt "Create a ChatAgent with tool calling" \ + --response-file /path/to/response.md \ + --facts "Uses ChatAgent class" "Implements predict method" \ + --patterns "ChatAgent" "def predict" + + # From clipboard + uv run python .test/scripts/add_example.py databricks-model-serving --from-clipboard +""" + +import argparse +import re +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).resolve().parent)) +from _common import setup_path + +setup_path() + + +def _detect_language(response: str) -> str | None: + """Auto-detect code language from response code blocks.""" + langs = re.findall(r"```(\w+)\n", response) + if not langs: + return None + # Most common language wins + from collections import Counter + counts = Counter(l for l in langs if l != "text") + return counts.most_common(1)[0][0] if counts else None + + +def _auto_extract_patterns(response: str) -> list[str]: + """Extract patterns from code blocks.""" + patterns = [] + for match in re.finditer(r"```(?:python)\n(.*?)```", response, re.DOTALL): + code = match.group(1) + for m in re.finditer(r"class\s+(\w+)", code): + patterns.append(m.group(1)) + for m in re.finditer(r"def\s+(\w+)", code): + patterns.append(m.group(1)) + + for match in re.finditer(r"```(?:sql)\n(.*?)```", response, re.DOTALL): + code = match.group(1) + for m in 
re.finditer(r"(?:CREATE|ALTER)\s+(?:TABLE|VIEW)\s+(\S+)", code, re.I): + patterns.append(m.group(1)) + + return list(dict.fromkeys(patterns)) + + +def _next_id(skill_name: str, existing_ids: set[str]) -> str: + """Generate the next sequential ID for a skill.""" + prefix = skill_name.replace("-", "_") + idx = 1 + while True: + candidate = f"{prefix}_{idx:03d}" + if candidate not in existing_ids: + return candidate + idx += 1 + + +def _read_clipboard() -> str: + """Read text from system clipboard.""" + import subprocess + try: + result = subprocess.run(["pbpaste"], capture_output=True, text=True, check=True) + return result.stdout + except (subprocess.CalledProcessError, FileNotFoundError): + try: + result = subprocess.run( + ["xclip", "-selection", "clipboard", "-o"], + capture_output=True, text=True, check=True, + ) + return result.stdout + except (subprocess.CalledProcessError, FileNotFoundError): + print("Error: Could not read clipboard (tried pbpaste and xclip)") + sys.exit(1) + + +def main(): + parser = argparse.ArgumentParser( + description="Add a test case to a skill's ground_truth.yaml", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + parser.add_argument( + "skill_name", + help="Name of the skill (e.g., databricks-model-serving)", + ) + parser.add_argument( + "--prompt", "-p", + default=None, + help="The user prompt for the test case", + ) + parser.add_argument( + "--response", "-r", + default=None, + help="The expected response text (inline)", + ) + parser.add_argument( + "--response-file", + type=Path, + default=None, + help="Path to a file containing the expected response", + ) + parser.add_argument( + "--facts", "-f", + nargs="*", + default=None, + help="Expected facts that must appear in the response", + ) + parser.add_argument( + "--patterns", + nargs="*", + default=None, + help="Expected patterns (regex) that must match in the response", + ) + parser.add_argument( + "--category", "-c", + default="happy_path", + 
help="Test case category (default: happy_path)", + ) + parser.add_argument( + "--from-clipboard", + action="store_true", + help="Read prompt and response from clipboard (separated by ---)", + ) + parser.add_argument( + "--id", + default=None, + help="Override the auto-generated test case ID", + ) + + args = parser.parse_args() + + import yaml + from skill_test.dataset import get_dataset_source, YAMLDatasetSource + + # Validate skill exists + skill_dir = Path(".test/skills") / args.skill_name + gt_path = skill_dir / "ground_truth.yaml" + + if not skill_dir.exists(): + print(f"Error: skill directory not found: {skill_dir}") + print("Available skills:") + for d in sorted(Path(".test/skills").iterdir()): + if d.is_dir() and not d.name.startswith("_"): + print(f" {d.name}") + sys.exit(1) + + # Load existing records + existing_ids = set() + if gt_path.exists(): + try: + source = YAMLDatasetSource(gt_path) + existing = source.load() + existing_ids = {r.id for r in existing} + except Exception: + pass + + # Get prompt + prompt = args.prompt + response = args.response + + if args.from_clipboard: + clipboard = _read_clipboard() + if "---" in clipboard: + parts = clipboard.split("---", 1) + prompt = parts[0].strip() + response = parts[1].strip() + else: + prompt = clipboard.strip() + print("Clipboard content set as prompt (no --- separator found for response)") + + if args.response_file: + response = args.response_file.read_text() + + if not prompt: + print("Enter the user prompt (Ctrl+D to finish):") + prompt = sys.stdin.read().strip() + + if not prompt: + print("Error: prompt is required") + sys.exit(1) + + if not response: + print("Enter the expected response (Ctrl+D to finish):") + response = sys.stdin.read().strip() + + # Generate ID + test_id = args.id or _next_id(args.skill_name, existing_ids) + + # Auto-extract patterns and facts + auto_patterns = _auto_extract_patterns(response) if response else [] + auto_facts = args.facts or [] + user_patterns = args.patterns or [] 
+ + # Merge auto and user patterns + all_patterns = list(dict.fromkeys(user_patterns + auto_patterns)) + + # Detect language + language = _detect_language(response) if response else None + + # Build test case + test_case = { + "id": test_id, + "inputs": {"prompt": prompt}, + "metadata": { + "category": args.category, + "source": "manual", + }, + } + + if response: + test_case["outputs"] = {"response": response} + if language: + test_case["metadata"]["language"] = language + + expectations = {} + if auto_facts: + expectations["expected_facts"] = auto_facts + if all_patterns: + expectations["expected_patterns"] = all_patterns + if expectations: + test_case["expectations"] = expectations + + # Show summary + print(f"\n--- Test Case Preview ---") + print(f"ID: {test_id}") + print(f"Skill: {args.skill_name}") + print(f"Prompt: {prompt[:100]}{'...' if len(prompt) > 100 else ''}") + if response: + print(f"Response: {len(response)} chars") + if all_patterns: + print(f"Patterns: {all_patterns}") + if auto_facts: + print(f"Facts: {auto_facts}") + print(f"Category: {args.category}") + + # Confirm + if sys.stdin.isatty(): + confirm = input("\nAppend to ground_truth.yaml? 
[Y/n] ").strip().lower() + if confirm and confirm != "y": + print("Aborted.") + sys.exit(0) + + # Save + if gt_path.exists(): + with open(gt_path) as f: + data = yaml.safe_load(f) or {"test_cases": []} + else: + gt_path.parent.mkdir(parents=True, exist_ok=True) + data = {"test_cases": []} + + data["test_cases"].append(test_case) + + with open(gt_path, "w") as f: + yaml.dump(data, f, default_flow_style=False, sort_keys=False, allow_unicode=True) + + print(f"Added test case '{test_id}' to {gt_path}") + + +if __name__ == "__main__": + main() diff --git a/.test/scripts/generate_ground_truth.py b/.test/scripts/generate_ground_truth.py new file mode 100644 index 00000000..083b08a2 --- /dev/null +++ b/.test/scripts/generate_ground_truth.py @@ -0,0 +1,645 @@ +#!/usr/bin/env python3 +"""Generate ground_truth.yaml and manifest.yaml for skills missing test cases. + +Reads each SKILL.md, extracts code examples, headers, and key patterns, +then generates test cases that enable GEPA scorers to produce real signal. 
+ +Usage: + # Generate for a specific skill + uv run python .test/scripts/generate_ground_truth.py databricks-metric-views + + # Generate for all missing skills + uv run python .test/scripts/generate_ground_truth.py --all + + # Preview without writing (dry run) + uv run python .test/scripts/generate_ground_truth.py --all --dry-run +""" + +import argparse +import re +import sys +import textwrap +from dataclasses import dataclass, field +from datetime import datetime, timezone +from pathlib import Path + +import yaml + + +# --------------------------------------------------------------------------- +# Data structures +# --------------------------------------------------------------------------- + +@dataclass +class CodeBlock: + language: str + code: str + context: str # surrounding prose/header text + + +@dataclass +class Section: + level: int # 2 for ##, 3 for ### + title: str + content: str + code_blocks: list[CodeBlock] = field(default_factory=list) + + +@dataclass +class Pattern: + pattern: str + description: str + min_count: int = 1 + + +# --------------------------------------------------------------------------- +# SKILL.md parsing +# --------------------------------------------------------------------------- + +def extract_frontmatter(content: str) -> dict[str, str]: + """Extract YAML frontmatter from SKILL.md.""" + m = re.match(r"^---\s*\n(.*?)\n---", content, re.DOTALL) + if m: + try: + return yaml.safe_load(m.group(1)) or {} + except yaml.YAMLError: + return {} + return {} + + +def extract_code_blocks(content: str) -> list[CodeBlock]: + """Extract fenced code blocks with their language and surrounding context.""" + blocks = [] + # Match ```lang ... 
``` + for m in re.finditer( + r"```(\w+)\s*\n(.*?)```", + content, + re.DOTALL, + ): + lang = m.group(1).lower() + code = m.group(2).strip() + # Get surrounding context (up to 200 chars before) + start = max(0, m.start() - 200) + ctx = content[start : m.start()].strip() + # Find the nearest header + header_match = re.search(r"#+\s+(.+)", ctx) + context = header_match.group(1) if header_match else ctx[-100:] if ctx else "" + blocks.append(CodeBlock(language=lang, code=code, context=context)) + return blocks + + +def extract_sections(content: str) -> list[Section]: + """Extract H2 and H3 sections with their content and code blocks.""" + # Remove frontmatter + content = re.sub(r"^---\s*\n.*?\n---\s*\n", "", content, flags=re.DOTALL) + + sections = [] + # Split by headers + parts = re.split(r"(^#{2,3}\s+.+$)", content, flags=re.MULTILINE) + + current_title = "" + current_level = 0 + current_content = "" + + for part in parts: + header_match = re.match(r"^(#{2,3})\s+(.+)$", part) + if header_match: + # Save previous section + if current_title: + code_blocks = extract_code_blocks(current_content) + sections.append(Section( + level=current_level, + title=current_title, + content=current_content.strip(), + code_blocks=code_blocks, + )) + current_level = len(header_match.group(1)) + current_title = header_match.group(2).strip() + current_content = "" + else: + current_content += part + + # Don't forget last section + if current_title: + code_blocks = extract_code_blocks(current_content) + sections.append(Section( + level=current_level, + title=current_title, + content=current_content.strip(), + code_blocks=code_blocks, + )) + + return sections + + +def extract_patterns_from_code(code: str, language: str) -> list[Pattern]: + """Extract function/class/keyword patterns from a code block.""" + patterns = [] + + if language in ("python", "py"): + # Function calls: word( + for m in re.finditer(r"\b([a-z_]\w+)\s*\(", code): + name = m.group(1) + if name not in ("print", "str", 
"int", "float", "len", "range", "list", + "dict", "set", "tuple", "type", "isinstance", "if", "for", + "while", "return", "import", "from", "as", "with", "round", + "max", "min", "abs", "sum", "enumerate", "zip", "map", + "filter", "sorted", "any", "all", "open", "format", "bool", + "append", "extend"): + patterns.append(Pattern( + pattern=re.escape(name), + description=f"Uses {name}()", + )) + # Class names: CapitalWord + for m in re.finditer(r"\b([A-Z][a-zA-Z]+(?:[A-Z][a-zA-Z]+)*)\b", code): + name = m.group(1) + if name not in ("True", "False", "None", "String", "Int", "Float", + "IMPORTANT", "NOTE", "WARNING", "TODO"): + patterns.append(Pattern( + pattern=re.escape(name), + description=f"References {name}", + )) + elif language in ("sql",): + # SQL keywords and functions + for m in re.finditer(r"\b(CREATE\s+(?:OR\s+REPLACE\s+)?(?:MATERIALIZED\s+VIEW|VIEW|TABLE|PROCEDURE|CONNECTION))", code, re.IGNORECASE): + patterns.append(Pattern( + pattern=m.group(1).replace(" ", " "), + description=f"Uses {m.group(1).strip()}", + )) + # Function calls in SQL + for m in re.finditer(r"\b([a-z_]\w+)\s*\(", code, re.IGNORECASE): + name = m.group(1).lower() + if name not in ("select", "from", "where", "and", "or", "not", + "group", "order", "having", "limit", "insert", + "update", "delete", "values", "into", "set", + "join", "on", "as", "case", "when", "then", + "else", "end", "in", "between", "like", "is", + "null", "exists", "count", "sum", "avg", "min", "max"): + patterns.append(Pattern( + pattern=re.escape(name), + description=f"Uses {name}() function", + )) + elif language in ("yaml", "yml"): + # Key YAML keys + for m in re.finditer(r"^\s*(\w[\w_-]+):", code, re.MULTILINE): + key = m.group(1) + if key not in ("name", "description", "type", "default", "value", + "true", "false"): + patterns.append(Pattern( + pattern=re.escape(key), + description=f"Includes {key} configuration", + )) + elif language in ("bash", "sh"): + # CLI commands + for m in 
def extract_facts_from_section(section: "Section") -> list[str]:
    """Extract up to five key factual statements from a section's prose.

    Recognizes two markdown shapes:
      * bullet points with a bold lead term:  ``- **Term** - explanation``
      * table rows:                           ``| `name` | description |``
    """
    facts = []
    for line in section.content.split("\n"):
        line = line.strip()
        # Bullet points with bold terms
        m = re.match(r"[-*]\s+\*\*(.+?)\*\*\s*[-:]\s*(.+)", line)
        if m:
            facts.append(f"{m.group(1)}: {m.group(2).strip()}")
            continue
        # Table rows with useful info. Separator rows (|---|) cannot match
        # because the first cell must start with \w, so no extra guard needed
        # (the previous startswith("-") check was dead code).
        m = re.match(r"\|\s*`?(\w[\w_.-]+)`?\s*\|\s*(.+?)\s*\|", line)
        if m:
            facts.append(f"{m.group(1)}: {m.group(2).strip()}")

    return facts[:5]  # Limit to top 5


def generate_prompt_from_section(section: "Section", skill_name: str) -> str:
    """Generate a natural user prompt from a section's content.

    Resolution order: title-keyword template, then the first code block's
    language, then a generic "How do I ..." fallback.
    """
    title = section.title
    short_name = skill_name.replace("databricks-", "")

    # Map section titles to natural prompts
    prompt_templates = {
        "Quick Start": f"Show me how to get started with {short_name}",
        "Create": f"Create a {title.lower().replace('create ', '')}",
        "Common Patterns": f"Show me common patterns for {short_name}",
        "Configuration": f"How do I configure {short_name}?",
        "Filtering": "How do I filter results when querying?",
        "Common Issues": f"What are common issues with {short_name}?",
    }

    # First template whose key appears in the title wins.
    for key, template in prompt_templates.items():
        if key.lower() in title.lower():
            return template

    # Generate from code blocks if present
    if section.code_blocks:
        block = section.code_blocks[0]
        if block.language in ("python", "py"):
            return f"Write Python code to {title.lower()}"
        elif block.language == "sql":
            return f"Write SQL to {title.lower()}"
        elif block.language in ("yaml", "yml"):
            return f"Show me the YAML configuration for {title.lower()}"
        elif block.language in ("bash", "sh"):
            return f"Show me the CLI commands to {title.lower()}"

    # Default: use section title
    return f"How do I {title.lower()} with {short_name}?"


def generate_response_from_section(section: "Section") -> str:
    """Generate an expected response from a section's code blocks and content.

    Builds "short prose summary + up to two fenced code blocks"; falls back
    to the raw section content (truncated) when neither exists.
    """
    parts = []

    # Add a brief explanation: the first three non-table/non-fence/non-header
    # prose lines of the section.
    prose_lines = []
    for line in section.content.split("\n"):
        stripped = line.strip()
        if stripped and not stripped.startswith("|") and not stripped.startswith("```"):
            if not stripped.startswith("#"):
                prose_lines.append(stripped)
                if len(prose_lines) >= 3:
                    break

    if prose_lines:
        parts.append(" ".join(prose_lines[:3]))

    # Add code blocks
    for block in section.code_blocks[:2]:  # Max 2 code blocks per test case
        parts.append(f"\n```{block.language}\n{block.code}\n```")

    return "\n\n".join(parts) if parts else section.content[:500]


def build_test_case(
    skill_name: str,
    section: "Section",
    index: int,
    difficulty: str = "easy",
) -> dict:
    """Build a single ground-truth test case dict from a section."""
    # Generate a stable, filesystem-safe ID from the section title.
    feature = re.sub(r"[^a-z0-9]+", "_", section.title.lower()).strip("_")[:30]
    case_id = f"{skill_name}_{feature}_{index:03d}"

    prompt = generate_prompt_from_section(section, skill_name)
    response = generate_response_from_section(section)

    # Collect patterns from all code blocks
    all_patterns = []
    for block in section.code_blocks:
        all_patterns.extend(extract_patterns_from_code(block.code, block.language))

    # Deduplicate across the first 8 candidates, keep at most 5.
    patterns_data = []
    seen = set()
    for p in all_patterns[:8]:
        if p.pattern not in seen:
            seen.add(p.pattern)
            patterns_data.append({
                "pattern": p.pattern,
                "min_count": p.min_count,
                "description": p.description,
            })
            if len(patterns_data) >= 5:
                break

    # Extract facts
    facts = extract_facts_from_section(section)
    if not facts:
        # Fall back to key terms from code blocks
        for block in section.code_blocks:
            if block.language in ("python", "py"):
                facts.append(f"Uses Python {block.language}")
            elif block.language == "sql":
                facts.append("Uses SQL syntax")

    # Build guidelines from the languages present in the section.
    guidelines = []
    if any(b.language in ("python", "py") for b in section.code_blocks):
        guidelines.append("Code must be valid Python syntax")
    if any(b.language == "sql" for b in section.code_blocks):
        guidelines.append("SQL must follow Databricks SQL syntax")
    if section.code_blocks:
        guidelines.append("Response must include working code examples")

    return {
        "id": case_id,
        "inputs": {"prompt": prompt},
        "outputs": {
            "response": response,
            "execution_success": True,
        },
        "expectations": {
            "expected_facts": facts if facts else [],
            "expected_patterns": patterns_data if patterns_data else [],
            "guidelines": guidelines if guidelines else [],
        },
        "metadata": {
            "category": "happy_path",
            "difficulty": difficulty,
            "source": "auto_generated",
            "section": section.title,
        },
    }
def detect_languages(sections: "list[Section]") -> set[str]:
    """Collect the set of code-fence languages used across all sections."""
    return {b.language for s in sections for b in s.code_blocks}


def generate_manifest(skill_name: str, description: str, languages: set[str]) -> dict:
    """Generate a manifest.yaml dict for a skill.

    Syntax scorers are enabled (and placed first) only for languages that
    actually appear in the skill's code examples.
    """
    enabled_scorers = ["pattern_adherence", "no_hallucinated_apis", "expected_facts_present"]
    if "python" in languages or "py" in languages:
        enabled_scorers.insert(0, "python_syntax")
    if "sql" in languages:
        enabled_scorers.insert(0, "sql_syntax")

    default_guidelines = [
        "Response must address the user's request completely",
        "Code examples must follow documented best practices",
        "Response must use modern APIs (not deprecated ones)",
    ]

    return {
        "skill_name": skill_name,
        "description": description or f"Test cases for {skill_name} skill",
        "scorers": {
            "enabled": enabled_scorers,
            "llm_scorers": ["Safety", "guidelines_from_expectations"],
            "default_guidelines": default_guidelines,
            "trace_expectations": {
                "tool_limits": {"Bash": 10, "Read": 20},
                "token_budget": {"max_total": 100000},
                "required_tools": ["Read"],
                "banned_tools": [],
                "expected_files": [],
            },
        },
        "quality_gates": {
            "syntax_valid": 1.0,
            "pattern_adherence": 0.9,
            "execution_success": 0.8,
        },
    }


# ---------------------------------------------------------------------------
# Section selection: pick the best sections for test cases
# ---------------------------------------------------------------------------

def select_sections_for_tests(sections: "list[Section]", target: int = 7) -> "list[Section]":
    """Select the best sections for test case generation.

    Prefers sections with code blocks and diverse topics; meta sections
    (resources, notes, prerequisites, ...) are penalized out.
    """
    # Score sections by relevance
    scored = []
    for sec in sections:
        score = 0
        # Sections with code are much more valuable
        score += len(sec.code_blocks) * 3
        # Prefer H2 over H3
        if sec.level == 2:
            score += 1
        # Skip meta sections
        skip_titles = {"related skills", "resources", "reference files", "notes",
                       "common issues", "current limitations", "sdk version requirements",
                       "prerequisites", "prerequisites check", "when to use",
                       "environment configuration", "best practices"}
        if sec.title.lower() in skip_titles:
            score -= 5
        # Boost pattern/example sections
        if any(kw in sec.title.lower() for kw in ("pattern", "example", "start", "create", "common")):
            score += 2
        # Boost if it has substantial content
        if len(sec.content) > 200:
            score += 1

        scored.append((score, sec))

    # Highest score first; drop non-positive scores; cap at target.
    # (Previously the filter reused the throwaway name `_` for the score,
    # which worked only by accident of comprehension scoping.)
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return [sec for score, sec in scored if score > 0][:target]
def generate_skill_tests(skill_name: str, repo_root: Path) -> tuple[list[dict], dict]:
    """Generate test cases and manifest for a single skill.

    Reads ``databricks-skills/<skill_name>/SKILL.md`` under *repo_root*.

    Returns:
        (test_cases, manifest) tuple

    Raises:
        FileNotFoundError: SKILL.md is missing.
        ValueError: no section scored high enough to build a test from.
    """
    skill_md_path = repo_root / "databricks-skills" / skill_name / "SKILL.md"
    if not skill_md_path.exists():
        raise FileNotFoundError(f"SKILL.md not found: {skill_md_path}")

    # Explicit encoding: SKILL.md may contain non-ASCII characters.
    content = skill_md_path.read_text(encoding="utf-8")
    frontmatter = extract_frontmatter(content)
    description = frontmatter.get("description", "")
    sections = extract_sections(content)

    # Select best sections for test cases
    selected = select_sections_for_tests(sections, target=7)

    if not selected:
        raise ValueError(f"No suitable sections found in {skill_md_path}")

    # Generate test cases with a ramping difficulty schedule.
    test_cases = []
    difficulties = ["easy", "easy", "easy", "medium", "medium", "medium", "hard", "hard"]
    for i, section in enumerate(selected):
        difficulty = difficulties[i] if i < len(difficulties) else "medium"
        tc = build_test_case(skill_name, section, i + 1, difficulty)
        test_cases.append(tc)

    # Generate manifest from ALL sections' languages (not only selected ones).
    languages = detect_languages(sections)
    manifest = generate_manifest(skill_name, description, languages)

    return test_cases, manifest


def write_skill_tests(
    skill_name: str,
    test_cases: list[dict],
    manifest: dict,
    output_dir: Path,
    dry_run: bool = False,
) -> None:
    """Write ground_truth.yaml and manifest.yaml for a skill.

    With ``dry_run=True`` only prints what would be written.
    """
    skill_dir = output_dir / skill_name
    skill_dir.mkdir(parents=True, exist_ok=True)

    gt_data = {
        "metadata": {
            "skill_name": skill_name,
            "version": "0.1.0",
            # isoformat keeps the +00:00 offset; the previous strftime
            # produced a naive-looking timestamp despite being UTC.
            "created_at": datetime.now(timezone.utc).isoformat(),
        },
        "test_cases": test_cases,
    }

    gt_path = skill_dir / "ground_truth.yaml"
    manifest_path = skill_dir / "manifest.yaml"

    if dry_run:
        print(f" [DRY RUN] Would write {gt_path} ({len(test_cases)} test cases)")
        print(f" [DRY RUN] Would write {manifest_path}")
        return

    # Custom YAML representer so multiline strings render as | blocks.
    class MultilineDumper(yaml.SafeDumper):
        pass

    def str_representer(dumper, data):
        if "\n" in data:
            return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
        return dumper.represent_scalar("tag:yaml.org,2002:str", data)

    MultilineDumper.add_representer(str, str_representer)

    with open(gt_path, "w", encoding="utf-8") as f:
        yaml.dump(gt_data, f, Dumper=MultilineDumper, default_flow_style=False,
                  sort_keys=False, allow_unicode=True, width=120)

    with open(manifest_path, "w", encoding="utf-8") as f:
        yaml.dump(manifest, f, Dumper=MultilineDumper, default_flow_style=False,
                  sort_keys=False, allow_unicode=True, width=120)

    print(f" Wrote {gt_path} ({len(test_cases)} test cases)")
    print(f" Wrote {manifest_path}")
# ---------------------------------------------------------------------------
# Skills that are missing test cases
# ---------------------------------------------------------------------------

MISSING_SKILLS = [
    "databricks-app-apx",
    "databricks-config",
    "databricks-dbsql",
    "databricks-docs",
    "databricks-jobs",
    "databricks-lakebase-autoscale",
    "databricks-lakebase-provisioned",
    "databricks-metric-views",
    "databricks-mlflow-evaluation",
    "databricks-python-sdk",
    "databricks-spark-structured-streaming",
    "databricks-synthetic-data-generation",
    "databricks-unity-catalog",
    "databricks-unstructured-pdf-generation",
    "databricks-vector-search",
]


# ---------------------------------------------------------------------------
# CLI entry point
# ---------------------------------------------------------------------------

def main():
    """CLI entry point: generate test assets for one skill or all missing ones."""
    parser = argparse.ArgumentParser(
        description="Generate ground_truth.yaml test cases for skills",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument(
        "skill_name",
        nargs="?",
        help="Skill name (e.g., databricks-metric-views)",
    )
    parser.add_argument(
        "--all",
        action="store_true",
        help="Generate for all missing skills",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Preview without writing files",
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help="Overwrite existing ground_truth.yaml files",
    )

    args = parser.parse_args()

    if not (args.skill_name or args.all):
        parser.error("Provide a skill name or use --all")

    # Locate the repo root (two levels above .test/scripts/).
    repo_root = Path(__file__).resolve().parent.parent.parent
    if not (repo_root / "databricks-skills").exists():
        print(f"ERROR: databricks-skills/ not found at {repo_root}", file=sys.stderr)
        sys.exit(1)

    output_dir = repo_root / ".test" / "skills"

    # Which skills to process: the hard-coded list, or the single argument.
    targets = MISSING_SKILLS if args.all else [args.skill_name]

    banner = "=" * 50
    generated = 0
    failed = 0

    for name in targets:
        print("\n" + banner)
        print(f" {name}")
        print(banner)

        # Never clobber hand-curated test cases unless --force is given.
        if not args.force and (output_dir / name / "ground_truth.yaml").exists():
            print(" SKIP: ground_truth.yaml already exists (use --force to overwrite)")
            continue

        try:
            cases, manifest = generate_skill_tests(name, repo_root)
            write_skill_tests(name, cases, manifest, output_dir, dry_run=args.dry_run)
            generated += 1
        except Exception as exc:
            print(f" ERROR: {exc}")
            failed += 1

    print("\n" + banner)
    print(f" Done: {generated} generated, {failed} errors")
    print(banner)

    sys.exit(1 if failed > 0 else 0)


if __name__ == "__main__":
    main()
The evaluator sends the " + "candidate SKILL.md to this model and scores the generated response.", ) parser.add_argument( "--reflection-lm", default=None, - help="Override GEPA reflection model (default: GEPA_REFLECTION_LM env or databricks/databricks-gpt-5-2)", + help="Override GEPA reflection model (default: GEPA_REFLECTION_LM env or databricks/databricks-claude-opus-4-6)", ) parser.add_argument( "--dry-run", @@ -99,6 +106,20 @@ def main(): action="store_true", help="Optimize ONLY tool descriptions, not the SKILL.md", ) + parser.add_argument( + "--max-passes", + type=int, + default=5, + help="Maximum optimization passes per component (default: 5). " + "Each pass re-seeds from the previous best. Stops early if no improvement.", + ) + parser.add_argument( + "--max-metric-calls", + type=int, + default=None, + help="Override max metric calls per pass (default: auto-scaled by preset × components, " + "capped at 300 for non-Opus models). Example: --max-metric-calls 100", + ) args = parser.parse_args() @@ -129,11 +150,14 @@ def main(): mode=args.mode, preset=args.preset, task_lm=args.task_lm, + gen_model=args.gen_model, reflection_lm=args.reflection_lm, include_tools=args.include_tools, tool_modules=args.tool_modules, tools_only=args.tools_only, dry_run=args.dry_run, + max_passes=args.max_passes, + max_metric_calls=args.max_metric_calls, ) review_optimization(result) if args.apply and not args.dry_run: @@ -161,11 +185,14 @@ def main(): mode=args.mode, preset=args.preset, task_lm=args.task_lm, + gen_model=args.gen_model, reflection_lm=args.reflection_lm, include_tools=args.include_tools, tool_modules=args.tool_modules, tools_only=args.tools_only, dry_run=args.dry_run, + max_passes=args.max_passes, + max_metric_calls=args.max_metric_calls, ) review_optimization(result) if args.apply and not args.dry_run: diff --git a/.test/scripts/trace_to_examples.py b/.test/scripts/trace_to_examples.py new file mode 100644 index 00000000..cb056b3f --- /dev/null +++ 
b/.test/scripts/trace_to_examples.py @@ -0,0 +1,710 @@ +#!/usr/bin/env python3 +"""Extract test cases from Claude Code traces (local or MLflow). + +Parses real agent transcripts and auto-generates ground_truth.yaml entries +from user prompt / assistant response pairs. + +Supports three trace sources: +1. Local session.jsonl files (--trace) +2. MLflow experiment traces (--experiment-id) +3. Individual MLflow traces (--trace-id or --run-id) + +Usage: + # --- From local session.jsonl --- + uv run python .test/scripts/trace_to_examples.py \ + --trace ~/.claude/projects/.../session.jsonl \ + --skill databricks-model-serving + + # --- From MLflow experiment (browse traces, pick best ones) --- + # List recent traces in an experiment + uv run python .test/scripts/trace_to_examples.py \ + --experiment-id 2452310130108632 --list + + # Extract from all recent traces in an experiment + uv run python .test/scripts/trace_to_examples.py \ + --experiment-id 2452310130108632 \ + --skill databricks-model-serving + + # Extract from a specific MLflow run + uv run python .test/scripts/trace_to_examples.py \ + --run-id abc123def456 \ + --skill databricks-model-serving + + # Extract from a specific MLflow trace ID + uv run python .test/scripts/trace_to_examples.py \ + --trace-id tr-d416fccdab46e2dea6bad1d0bd8aaaa8 \ + --skill databricks-model-serving + + # --- Common options --- + # With LLM refinement of expectations + uv run python .test/scripts/trace_to_examples.py \ + --experiment-id 2452310130108632 \ + --skill databricks-model-serving --refine + + # Auto-append to ground_truth.yaml (skip manual review) + uv run python .test/scripts/trace_to_examples.py \ + --experiment-id 2452310130108632 \ + --skill databricks-model-serving --trust + + # Limit number of traces to process from an experiment + uv run python .test/scripts/trace_to_examples.py \ + --experiment-id 2452310130108632 \ + --skill databricks-model-serving --limit 5 +""" + +import argparse +import re +import sys +from 
from pathlib import Path
from typing import Any

# Set up the import path via shared utilities.
sys.path.insert(0, str(Path(__file__).resolve().parent))
from _common import setup_path

setup_path()


def _extract_text_content(message: dict) -> str:
    """Extract plain text from a message's content (string or parts array)."""
    content = message.get("content", [])
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        parts = []
        for part in content:
            if isinstance(part, str):
                parts.append(part)
            elif isinstance(part, dict) and part.get("type") == "text":
                parts.append(part.get("text", ""))
        return "\n".join(parts)
    return ""


def _extract_code_blocks(text: str) -> list[dict]:
    """Extract fenced code blocks with language tags.

    Tags are lowercased so "Python"/"SQL" fences are recognized downstream
    (pattern extraction compares against lowercase names).
    """
    blocks = []
    for match in re.finditer(r"```(\w*)\n(.*?)```", text, re.DOTALL):
        lang = (match.group(1) or "text").lower()
        code = match.group(2).strip()
        blocks.append({"language": lang, "code": code})
    return blocks


def _extract_patterns_from_code(code_blocks: list[dict]) -> list[str]:
    """Auto-extract patterns from code blocks (function/class names, SQL objects)."""
    patterns = []
    for block in code_blocks:
        code = block["code"]
        lang = block["language"]

        # Accept both "python" and "py" fence tags, matching the conventions
        # used by the ground-truth generator.
        if lang in ("python", "py"):
            for m in re.finditer(r"class\s+(\w+)", code):
                patterns.append(m.group(1))
            for m in re.finditer(r"def\s+(\w+)", code):
                patterns.append(m.group(1))
            for m in re.finditer(r"from\s+([\w.]+)\s+import", code):
                patterns.append(m.group(1))

        elif lang == "sql":
            for m in re.finditer(r"(?:CREATE|ALTER|DROP)\s+(?:TABLE|VIEW|SCHEMA)\s+(\S+)", code, re.I):
                patterns.append(m.group(1))
            for m in re.finditer(r"(?:STREAMING TABLE|MATERIALIZED VIEW)\s+(\S+)", code, re.I):
                patterns.append(m.group(1))

    return list(dict.fromkeys(patterns))  # dedupe preserving order


def _extract_facts_from_response(response: str) -> list[str]:
    """Auto-extract key facts: API names, function calls, class references."""
    facts = []
    for m in re.finditer(r"(mlflow\.\w+(?:\.\w+)*)\(", response):
        facts.append(m.group(1))
    for m in re.finditer(r"(spark\.\w+(?:\.\w+)*)\(", response):
        facts.append(m.group(1))
    for m in re.finditer(r"\b([A-Z]\w+(?:Agent|Client|Config|Builder))\b", response):
        facts.append(m.group(1))
    # Dedupe preserving order, cap at 10.
    return list(dict.fromkeys(facts))[:10]


def _categorize_by_tools(tool_names: list[str]) -> str:
    """Infer a test-case category from the tools used in the turn."""
    if any("sql" in t.lower() or "dbsql" in t.lower() for t in tool_names):
        return "sql"
    if any("bash" in t.lower() for t in tool_names):
        return "deployment"
    if any("write" in t.lower() or "edit" in t.lower() for t in tool_names):
        return "code_generation"
    return "general"


# ---------------------------------------------------------------------------
# Local trace extraction (session.jsonl)
# ---------------------------------------------------------------------------

def extract_examples_from_file(trace_path: Path, skill_name: str | None = None) -> list[dict]:
    """Parse a session.jsonl and extract test case candidates.

    Pairs each substantive user message with the next assistant reply,
    skipping tool-result messages and trivially short turns.
    """
    from skill_test.trace.parser import parse_transcript_file, link_tool_results

    entries = parse_transcript_file(trace_path)
    link_tool_results(entries)

    examples = []
    idx = 0

    for i, entry in enumerate(entries):
        if entry.type != "user":
            continue
        # Skip synthetic "user" entries that just carry tool output.
        if entry.tool_use_result:
            continue

        user_text = _extract_text_content(entry.message)
        if not user_text or len(user_text) < 10:
            continue

        # Find the assistant reply to this prompt; stop if another real
        # user message appears first (no reply for this turn).
        assistant_text = ""
        tool_names = []
        for j in range(i + 1, len(entries)):
            if entries[j].type == "assistant":
                assistant_text = _extract_text_content(entries[j].message)
                tool_names = [tc.name for tc in entries[j].tool_calls]
                break
            if entries[j].type == "user" and not entries[j].tool_use_result:
                break

        if not assistant_text or len(assistant_text) < 50:
            continue

        code_blocks = _extract_code_blocks(assistant_text)
        auto_patterns = _extract_patterns_from_code(code_blocks)
        auto_facts = _extract_facts_from_response(assistant_text)
        category = _categorize_by_tools(tool_names)

        prefix = skill_name or "trace"
        example = {
            "id": f"{prefix}_{idx:03d}",
            "inputs": {"prompt": user_text},
            "outputs": {"response": assistant_text},
            "expectations": {},
            "metadata": {
                "category": category,
                "source": "trace",
                "trace_file": str(trace_path.name),
            },
        }

        if auto_patterns:
            example["expectations"]["expected_patterns"] = auto_patterns
        if auto_facts:
            example["expectations"]["expected_facts"] = auto_facts
        if code_blocks:
            langs = list({b["language"] for b in code_blocks if b["language"] != "text"})
            if langs:
                example["metadata"]["languages"] = langs

        examples.append(example)
        idx += 1

    return examples
def _extract_examples_from_mlflow_trace(trace: Any, skill_name: str | None, idx_offset: int = 0) -> list[dict]:
    """Extract test case candidates from an MLflow Trace object.

    MLflow traces from `mlflow autolog claude` contain spans representing
    the agent conversation. We look for the root span's input/output which
    contains the user prompt and final assistant response.

    Returns a list with at most one example dict (empty when the trace has
    no usable prompt/response pair).
    """
    examples = []
    prefix = skill_name or "mlflow"

    trace_info = trace.info
    # NOTE(review): request_id is assumed to identify the trace across MLflow
    # versions — confirm; falls back to "unknown" when absent.
    trace_id = trace_info.request_id if hasattr(trace_info, "request_id") else "unknown"

    # Try to get input/output from the trace data
    user_text = ""
    assistant_text = ""

    if trace.data:
        # The root span typically has the full conversation
        spans = trace.data.spans if hasattr(trace.data, "spans") else []

        # Look for the root span (no parent) or the first AGENT/CHAIN span
        root_span = None
        for span in spans:
            parent = getattr(span, "parent_id", None)
            # "0" is treated as "no parent" alongside None — presumably a
            # sentinel some MLflow versions emit; verify against trace schema.
            if parent is None or parent == "0":
                root_span = span
                break

        # Fallback: no parentless span found — take the first span.
        if root_span is None and spans:
            root_span = spans[0]

        if root_span:
            inputs = getattr(root_span, "inputs", None)
            outputs = getattr(root_span, "outputs", None)

            # Extract user prompt from inputs
            if isinstance(inputs, dict):
                # Common patterns: {"messages": [...]}, {"input": "..."}, {"prompt": "..."}
                if "messages" in inputs:
                    msgs = inputs["messages"]
                    if isinstance(msgs, list):
                        # Last user message wins (most recent turn).
                        for msg in reversed(msgs):
                            if isinstance(msg, dict) and msg.get("role") == "user":
                                user_text = msg.get("content", "")
                                break
                elif "input" in inputs:
                    user_text = str(inputs["input"])
                elif "prompt" in inputs:
                    user_text = str(inputs["prompt"])
            elif isinstance(inputs, str):
                user_text = inputs

            # Extract assistant response from outputs
            if isinstance(outputs, dict):
                if "choices" in outputs:
                    # OpenAI-style completion payload.
                    choices = outputs["choices"]
                    if isinstance(choices, list) and choices:
                        msg = choices[0].get("message", {})
                        assistant_text = msg.get("content", "")
                elif "output" in outputs:
                    assistant_text = str(outputs["output"])
                elif "response" in outputs:
                    assistant_text = str(outputs["response"])
            elif isinstance(outputs, str):
                assistant_text = outputs

    # Same minimum-length thresholds as the local session.jsonl extractor.
    if not user_text or len(user_text) < 10:
        return examples
    if not assistant_text or len(assistant_text) < 50:
        return examples

    # Build the test case
    code_blocks = _extract_code_blocks(assistant_text)
    auto_patterns = _extract_patterns_from_code(code_blocks)
    auto_facts = _extract_facts_from_response(assistant_text)

    # Categorize by looking at tool spans
    tool_names = []
    if trace.data and hasattr(trace.data, "spans"):
        for span in trace.data.spans:
            span_type = getattr(span, "span_type", "")
            if span_type == "TOOL" or "tool" in getattr(span, "name", "").lower():
                tool_names.append(getattr(span, "name", "unknown"))

    category = _categorize_by_tools(tool_names)

    example = {
        "id": f"{prefix}_{idx_offset:03d}",
        "inputs": {"prompt": user_text},
        "outputs": {"response": assistant_text},
        "expectations": {},
        "metadata": {
            "category": category,
            "source": "mlflow_trace",
            "trace_id": trace_id,
        },
    }

    if auto_patterns:
        example["expectations"]["expected_patterns"] = auto_patterns
    if auto_facts:
        example["expectations"]["expected_facts"] = auto_facts
    if code_blocks:
        langs = list({b["language"] for b in code_blocks if b["language"] != "text"})
        if langs:
            example["metadata"]["languages"] = langs

    examples.append(example)
    return examples


def list_mlflow_traces(experiment_id: str, limit: int = 20) -> None:
    """List recent traces in an MLflow experiment.

    Prints a table of trace IDs/status/timestamps and follow-up command
    hints. Exits the process (code 1) when the traces cannot be fetched.
    """
    import mlflow

    from skill_test.trace.mlflow_integration import _configure_mlflow
    _configure_mlflow()

    print(f"Fetching traces from experiment {experiment_id}...")
    try:
        traces_df = mlflow.search_traces(
            experiment_ids=[experiment_id],
            max_results=limit,
        )
    except Exception as e:
        print(f"Error fetching traces: {e}")
        print("\nMake sure you have authentication configured:")
        print(" export DATABRICKS_HOST='https://.cloud.databricks.com'")
        print(" export DATABRICKS_TOKEN='dapi...'")
        sys.exit(1)

    # search_traces returns a pandas DataFrame.
    if traces_df.empty:
        print("No traces found in experiment.")
        return

    print(f"\nFound {len(traces_df)} traces:\n")
    print(f"{'Trace ID':<45} {'Status':<10} {'Timestamp':<25} {'Duration'}")
    print("-" * 100)

    for _, row in traces_df.iterrows():
        trace_id = row.get("request_id", "unknown")
        status = row.get("status", "?")
        ts = row.get("timestamp_ms", 0)
        duration = row.get("execution_time_ms", 0)

        from datetime import datetime
        # timestamp_ms is epoch milliseconds; rendered in local time.
        ts_str = datetime.fromtimestamp(ts / 1000).strftime("%Y-%m-%d %H:%M:%S") if ts else "?"
        dur_str = f"{duration / 1000:.1f}s" if duration else "?"

        print(f"{trace_id:<45} {status:<10} {ts_str:<25} {dur_str}")

    print(f"\nTo extract examples from a specific trace:")
    print(f" uv run python .test/scripts/trace_to_examples.py --trace-id  --skill ")
    print(f"\nTo extract from all traces in this experiment:")
    print(f" uv run python .test/scripts/trace_to_examples.py --experiment-id {experiment_id} --skill ")
100) + + for _, row in traces_df.iterrows(): + trace_id = row.get("request_id", "unknown") + status = row.get("status", "?") + ts = row.get("timestamp_ms", 0) + duration = row.get("execution_time_ms", 0) + + from datetime import datetime + ts_str = datetime.fromtimestamp(ts / 1000).strftime("%Y-%m-%d %H:%M:%S") if ts else "?" + dur_str = f"{duration / 1000:.1f}s" if duration else "?" + + print(f"{trace_id:<45} {status:<10} {ts_str:<25} {dur_str}") + + print(f"\nTo extract examples from a specific trace:") + print(f" uv run python .test/scripts/trace_to_examples.py --trace-id --skill ") + print(f"\nTo extract from all traces in this experiment:") + print(f" uv run python .test/scripts/trace_to_examples.py --experiment-id {experiment_id} --skill ") + + +def extract_examples_from_experiment(experiment_id: str, skill_name: str | None, limit: int = 10) -> list[dict]: + """Extract examples from recent traces in an MLflow experiment.""" + import mlflow + + from skill_test.trace.mlflow_integration import _configure_mlflow + _configure_mlflow() + + print(f"Fetching up to {limit} traces from experiment {experiment_id}...") + try: + traces_df = mlflow.search_traces( + experiment_ids=[experiment_id], + max_results=limit, + filter_string="status = 'OK'", + ) + except Exception as e: + print(f"Error: {e}") + sys.exit(1) + + if traces_df.empty: + print("No successful traces found.") + return [] + + print(f"Processing {len(traces_df)} traces...") + all_examples = [] + + for _, row in traces_df.iterrows(): + trace_id = row.get("request_id") + if not trace_id: + continue + try: + trace = mlflow.get_trace(trace_id) + if trace: + examples = _extract_examples_from_mlflow_trace( + trace, skill_name, idx_offset=len(all_examples) + ) + all_examples.extend(examples) + if examples: + print(f" {trace_id}: extracted {len(examples)} example(s)") + except Exception as e: + print(f" {trace_id}: skipped ({e})") + + return all_examples + + +def extract_examples_from_trace_id(trace_id: str, 
skill_name: str | None) -> list[dict]: + """Extract examples from a single MLflow trace by ID.""" + import mlflow + + from skill_test.trace.mlflow_integration import _configure_mlflow + _configure_mlflow() + + print(f"Fetching trace {trace_id}...") + trace = mlflow.get_trace(trace_id) + if trace is None: + print(f"Trace not found: {trace_id}") + return [] + + return _extract_examples_from_mlflow_trace(trace, skill_name) + + +def extract_examples_from_run_id(run_id: str, skill_name: str | None) -> list[dict]: + """Extract examples from an MLflow run (downloads session.jsonl artifact).""" + from skill_test.trace.mlflow_integration import _configure_mlflow + _configure_mlflow() + + import mlflow + + print(f"Fetching artifacts from run {run_id}...") + + # Try to download session.jsonl artifact + artifact_names = ["trace.jsonl", "session.jsonl", "transcript.jsonl"] + artifact_path = None + + for name in artifact_names: + try: + artifact_path = mlflow.artifacts.download_artifacts(run_id=run_id, artifact_path=name) + print(f" Downloaded {name}") + break + except Exception: + continue + + if artifact_path: + return extract_examples_from_file(Path(artifact_path), skill_name) + + # Fallback: try MLflow traces API + print(" No JSONL artifact found, trying traces API...") + try: + traces_df = mlflow.search_traces( + experiment_ids=[mlflow.get_run(run_id).info.experiment_id], + filter_string=f"run_id = '{run_id}'", + max_results=10, + ) + if not traces_df.empty: + all_examples = [] + for _, row in traces_df.iterrows(): + tid = row.get("request_id") + if tid: + trace = mlflow.get_trace(tid) + if trace: + all_examples.extend( + _extract_examples_from_mlflow_trace(trace, skill_name, len(all_examples)) + ) + return all_examples + except Exception as e: + print(f" Traces API failed: {e}") + + print(" No extractable data found in this run.") + return [] + + +# --------------------------------------------------------------------------- +# LLM refinement and output +# 
--------------------------------------------------------------------------- + +def refine_with_llm(examples: list[dict], skill_name: str) -> list[dict]: + """Use an LLM to review and refine auto-extracted expectations.""" + import litellm + import json + + for ex in examples: + prompt_text = ex["inputs"]["prompt"][:200] + response_text = ex["outputs"]["response"][:1000] + current_patterns = ex["expectations"].get("expected_patterns", []) + current_facts = ex["expectations"].get("expected_facts", []) + + messages = [ + { + "role": "system", + "content": ( + "You are reviewing auto-extracted test expectations for a " + f"Databricks skill called '{skill_name}'. Refine the patterns " + "and facts to be more precise and meaningful. Return JSON only." + ), + }, + { + "role": "user", + "content": ( + f"Prompt: {prompt_text}\n\n" + f"Response excerpt: {response_text}\n\n" + f"Auto-extracted patterns: {json.dumps(current_patterns)}\n" + f"Auto-extracted facts: {json.dumps(current_facts)}\n\n" + "Return a JSON object with:\n" + '- "expected_patterns": list of regex pattern strings\n' + '- "expected_facts": list of fact strings that must appear\n' + "Keep only patterns/facts that are genuinely important for correctness." 
+ ), + }, + ] + + try: + from skill_test.optimize.config import DEFAULT_GEN_LM + resp = litellm.completion(model=DEFAULT_GEN_LM, messages=messages) + content = resp.choices[0].message.content + json_match = re.search(r"\{.*\}", content, re.DOTALL) + if json_match: + refined = json.loads(json_match.group()) + if "expected_patterns" in refined: + ex["expectations"]["expected_patterns"] = refined["expected_patterns"] + if "expected_facts" in refined: + ex["expectations"]["expected_facts"] = refined["expected_facts"] + except Exception as e: + print(f" Warning: LLM refinement failed for {ex['id']}: {e}") + + return examples + + +def save_examples(examples: list[dict], output_path: Path) -> None: + """Save examples to a YAML file.""" + import yaml + + data = {"test_cases": examples} + with open(output_path, "w") as f: + yaml.dump(data, f, default_flow_style=False, sort_keys=False, allow_unicode=True) + print(f"Saved {len(examples)} examples to {output_path}") + + +def append_to_ground_truth(examples: list[dict], skill_name: str) -> None: + """Append examples directly to a skill's ground_truth.yaml.""" + import yaml + + from skill_test.dataset import get_dataset_source + + try: + source = get_dataset_source(skill_name) + existing = source.load() + existing_ids = {r.id for r in existing} + gt_path = source.yaml_path + except FileNotFoundError: + gt_path = Path(".test/skills") / skill_name / "ground_truth.yaml" + gt_path.parent.mkdir(parents=True, exist_ok=True) + existing_ids = set() + + new_examples = [ex for ex in examples if ex["id"] not in existing_ids] + if not new_examples: + print("No new examples to add (all IDs already exist).") + return + + if gt_path.exists(): + with open(gt_path) as f: + data = yaml.safe_load(f) or {"test_cases": []} + else: + data = {"test_cases": []} + + data["test_cases"].extend(new_examples) + + with open(gt_path, "w") as f: + yaml.dump(data, f, default_flow_style=False, sort_keys=False, allow_unicode=True) + + print(f"Appended 
{len(new_examples)} examples to {gt_path}") + + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- + +def main(): + parser = argparse.ArgumentParser( + description="Extract test cases from Claude Code traces (local or MLflow)", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + + # Trace sources (mutually exclusive) + source_group = parser.add_argument_group("trace source (pick one)") + source_group.add_argument( + "--trace", "-t", + type=Path, + default=None, + help="Path to local session.jsonl transcript file", + ) + source_group.add_argument( + "--experiment-id", + default=None, + help="MLflow experiment ID to extract traces from (e.g., 2452310130108632)", + ) + source_group.add_argument( + "--run-id", + default=None, + help="MLflow run ID to extract traces from", + ) + source_group.add_argument( + "--trace-id", + default=None, + help="MLflow trace ID (e.g., tr-d416fccdab46e2dea6bad1d0bd8aaaa8)", + ) + + # Common options + parser.add_argument( + "--skill", "-s", + default=None, + help="Skill name to tag examples with (e.g., databricks-model-serving)", + ) + parser.add_argument( + "--refine", + action="store_true", + help="Use LLM to review and refine auto-extracted expectations", + ) + parser.add_argument( + "--trust", + action="store_true", + help="Auto-append to ground_truth.yaml instead of writing candidates.yaml", + ) + parser.add_argument( + "--output", "-o", + type=Path, + default=None, + help="Output file path (default: candidates.yaml in skill dir or cwd)", + ) + parser.add_argument( + "--limit", + type=int, + default=10, + help="Max number of traces to process from an experiment (default: 10)", + ) + parser.add_argument( + "--list", + action="store_true", + dest="list_traces", + help="List traces in the experiment without extracting (use with --experiment-id)", + ) + + args = parser.parse_args() + + # Validate: 
at least one source required + sources = [args.trace, args.experiment_id, args.run_id, args.trace_id] + if not any(sources): + parser.error("Provide a trace source: --trace, --experiment-id, --run-id, or --trace-id") + + # List mode + if args.list_traces: + if not args.experiment_id: + parser.error("--list requires --experiment-id") + list_mlflow_traces(args.experiment_id, limit=args.limit) + return + + # Extract examples based on source + examples = [] + + if args.trace: + if not args.trace.exists(): + print(f"Error: trace file not found: {args.trace}") + sys.exit(1) + print(f"Parsing local trace: {args.trace}") + examples = extract_examples_from_file(args.trace, args.skill) + + elif args.experiment_id: + examples = extract_examples_from_experiment(args.experiment_id, args.skill, limit=args.limit) + + elif args.run_id: + examples = extract_examples_from_run_id(args.run_id, args.skill) + + elif args.trace_id: + examples = extract_examples_from_trace_id(args.trace_id, args.skill) + + print(f"\nExtracted {len(examples)} candidate test cases") + + if not examples: + print("No suitable prompt/response pairs found.") + sys.exit(0) + + if args.refine: + print("Refining expectations with LLM...") + examples = refine_with_llm(examples, args.skill or "unknown") + + if args.trust and args.skill: + append_to_ground_truth(examples, args.skill) + else: + output_path = args.output + if output_path is None: + if args.skill: + output_path = Path(".test/skills") / args.skill / "candidates.yaml" + else: + output_path = Path("candidates.yaml") + output_path.parent.mkdir(parents=True, exist_ok=True) + save_examples(examples, output_path) + + +if __name__ == "__main__": + main() diff --git a/.test/skills/databricks-app-apx/ground_truth.yaml b/.test/skills/databricks-app-apx/ground_truth.yaml new file mode 100644 index 00000000..ab9c8bbf --- /dev/null +++ b/.test/skills/databricks-app-apx/ground_truth.yaml @@ -0,0 +1,311 @@ +metadata: + skill_name: databricks-app-apx + version: 0.1.0 + 
created_at: '2026-02-23T08:00:00.000000' + +test_cases: +- id: apx_initialize_project_001 + inputs: + prompt: "I want to start building a new full-stack Databricks app. How do I initialize an APX project?" + outputs: + response: | + To initialize an APX project, use the APX MCP tools: + + ```bash + # Start the APX development server + mcp-cli call apx/start '{}' + + # Check that the server is running + mcp-cli call apx/status '{}' + ``` + + After initialization, APX scaffolds a FastAPI backend and React frontend with TypeScript support. + execution_success: true + expectations: + expected_facts: + - "APX uses MCP tools invoked via mcp-cli call" + - "apx/start initializes the development server" + - "apx/status checks the current server status" + expected_patterns: + - pattern: "apx/start" + min_count: 1 + description: "Mentions the apx/start MCP tool" + - pattern: "apx/status" + min_count: 1 + description: "Mentions the apx/status MCP tool" + guidelines: + - "Response must show the MCP CLI commands for starting APX" + - "Response should explain what happens after initialization" + metadata: + category: happy_path + difficulty: easy + source: auto_generated + +- id: apx_pydantic_models_002 + inputs: + prompt: "What is the recommended pattern for defining backend data models in an APX app?" 
+ outputs: + response: | + In `src/{app_name}/backend/models.py`, follow the 3-model pattern: + + ```python + from pydantic import BaseModel + + # Input validation - what the client sends + class EntityIn(BaseModel): + name: str + description: str + + # Database record - stored internally + class EntityRecord(EntityIn): + id: str + created_at: datetime + + # API response - what the client receives + class EntityOut(EntityRecord): + pass + ``` + + - `EntityIn` handles input validation + - `EntityRecord` extends with database fields + - `EntityOut` controls API response shape + execution_success: true + expectations: + expected_facts: + - "APX backend models live in src/{app_name}/backend/models.py" + - "The 3-model pattern uses EntityIn, EntityRecord, and EntityOut" + - "Models are built with Pydantic BaseModel" + expected_patterns: + - pattern: "class\\s+\\w+In\\(" + min_count: 1 + description: "Defines an input model class" + - pattern: "class\\s+\\w+Record\\(" + min_count: 1 + description: "Defines a record model class" + - pattern: "BaseModel" + min_count: 1 + description: "Uses Pydantic BaseModel" + guidelines: + - "Response must show the 3-model pattern with EntityIn, EntityRecord, and EntityOut" + - "Code must be valid Python with Pydantic imports" + metadata: + category: happy_path + difficulty: medium + source: auto_generated + +- id: apx_api_routes_003 + inputs: + prompt: "How should I define API routes in my APX backend?" 
+ outputs: + response: | + In `src/{app_name}/backend/router.py`, define FastAPI routes with these requirements: + + ```python + from fastapi import APIRouter + + router = APIRouter(prefix="/api/entities", tags=["entities"]) + + @router.get("/", response_model=list[EntityOut]) + async def list_entities(): + return entities + + @router.get("/{entity_id}", response_model=EntityOut) + async def get_entity(entity_id: str): + return find_entity(entity_id) + + @router.post("/", response_model=EntityOut) + async def create_entity(entity: EntityIn): + return save_entity(entity) + ``` + + Always include `response_model` on every route, as this enables OpenAPI spec generation for the frontend client. + execution_success: true + expectations: + expected_facts: + - "API routes are defined in src/{app_name}/backend/router.py" + - "Every route must include response_model for OpenAPI generation" + - "Routes use FastAPI's APIRouter" + expected_patterns: + - pattern: "response_model" + min_count: 2 + description: "Includes response_model on route decorators" + - pattern: "APIRouter" + min_count: 1 + description: "Uses FastAPI APIRouter" + - pattern: "@router\\.(get|post|put|delete)" + min_count: 2 + description: "Defines HTTP method route decorators" + guidelines: + - "Response must emphasize the requirement for response_model on every route" + - "Code must show standard CRUD route patterns" + metadata: + category: happy_path + difficulty: medium + source: auto_generated + +- id: apx_add_ui_components_004 + inputs: + prompt: "How do I add shadcn UI components to my APX frontend?" + outputs: + response: | + Use the shadcn MCP tool to add components: + + ```bash + mcp-cli call shadcn/get_add_command_for_items '{ + "items": ["@shadcn/button", "@shadcn/card", "@shadcn/table", + "@shadcn/badge", "@shadcn/select", "@shadcn/skeleton"] + }' + ``` + + This generates the install command for the specified shadcn components. Run the output command to add them to your frontend. 
+ execution_success: true + expectations: + expected_facts: + - "shadcn components are added via the shadcn MCP tool" + - "The tool is invoked with shadcn/get_add_command_for_items" + expected_patterns: + - pattern: "shadcn/get_add_command_for_items" + min_count: 1 + description: "Uses the shadcn MCP tool" + - pattern: "@shadcn/" + min_count: 2 + description: "References shadcn component packages" + guidelines: + - "Response must show the MCP tool call with JSON parameters" + - "Response should list common UI components like button, card, table" + metadata: + category: happy_path + difficulty: easy + source: auto_generated + +- id: apx_testing_endpoints_005 + inputs: + prompt: "How do I test my APX app's backend API endpoints and run type checks?" + outputs: + response: | + APX provides built-in testing and type checking: + + ```bash + # Run type checks on both backend and frontend + mcp-cli call apx/dev_check '{}' + + # Test API endpoints manually + curl http://localhost:8000/api/{entities} | jq . + curl http://localhost:8000/api/{entities}/{id} | jq . + + # Get the frontend development URL + mcp-cli call apx/get_frontend_url '{}' + ``` + + Fix any Python type errors reported by basedpyright before deploying. 
+ execution_success: true + expectations: + expected_facts: + - "apx/dev_check runs type checks on both backend and frontend" + - "Backend API runs on localhost:8000" + - "APX uses basedpyright for Python type checking" + expected_patterns: + - pattern: "apx/dev_check" + min_count: 1 + description: "Uses the dev_check MCP tool" + - pattern: "curl.*localhost:8000" + min_count: 1 + description: "Tests endpoints via curl on localhost" + - pattern: "apx/get_frontend_url" + min_count: 1 + description: "Uses the get_frontend_url MCP tool" + guidelines: + - "Response must include both type checking and API testing commands" + - "Response should mention basedpyright for Python type checking" + metadata: + category: happy_path + difficulty: medium + source: auto_generated + +- id: apx_deployment_006 + inputs: + prompt: "How do I deploy my APX app to Databricks and check the logs?" + outputs: + response: | + Deploy and monitor your APX app: + + ```bash + # Deploy the app to Databricks + mcp-cli call apx/deploy '{}' + + # Check deployed application logs + databricks apps logs --profile + ``` + + The APX MCP server can also automatically check deployed application logs. Simply ask: "Please check the deployed app logs for ". 
+ execution_success: true + expectations: + expected_facts: + - "APX apps are deployed using the apx/deploy MCP tool" + - "Application logs are accessed via databricks apps logs CLI command" + expected_patterns: + - pattern: "apx/deploy" + min_count: 1 + description: "Uses the deploy MCP tool" + - pattern: "databricks apps logs" + min_count: 1 + description: "Uses the databricks apps logs CLI command" + guidelines: + - "Response must show both deployment and log monitoring commands" + - "Response should mention the APX MCP automatic log checking feature" + metadata: + category: happy_path + difficulty: medium + source: auto_generated + +- id: apx_full_workflow_007 + inputs: + prompt: "Walk me through the complete APX development workflow from start to deployment." + outputs: + response: | + The APX development workflow has 5 phases: + + **Phase 1: Initialize** + ```bash + mcp-cli call apx/start '{}' + mcp-cli call apx/status '{}' + ``` + + **Phase 2: Backend** - Create Pydantic models in `models.py` and FastAPI routes in `router.py` + + **Phase 3: Frontend** - Add shadcn components and build React UI + + **Phase 4: Testing** + ```bash + mcp-cli call apx/dev_check '{}' + curl http://localhost:8000/api/{entities} | jq . 
+ ``` + + **Phase 5: Deploy** + ```bash + mcp-cli call apx/deploy '{}' + databricks apps logs --profile + ``` + execution_success: true + expectations: + expected_facts: + - "APX development follows 5 phases: Initialize, Backend, Frontend, Testing, Deploy" + - "Backend uses Pydantic models and FastAPI routes" + - "Frontend uses React with shadcn components" + expected_patterns: + - pattern: "apx/start" + min_count: 1 + description: "Includes initialization step" + - pattern: "apx/dev_check" + min_count: 1 + description: "Includes testing step" + - pattern: "apx/deploy" + min_count: 1 + description: "Includes deployment step" + guidelines: + - "Response must cover all 5 phases of the APX workflow" + - "Response should include code examples for key phases" + metadata: + category: happy_path + difficulty: hard + source: auto_generated diff --git a/.test/skills/databricks-app-apx/manifest.yaml b/.test/skills/databricks-app-apx/manifest.yaml new file mode 100644 index 00000000..e819a067 --- /dev/null +++ b/.test/skills/databricks-app-apx/manifest.yaml @@ -0,0 +1,28 @@ +skill_name: databricks-app-apx +description: Build full-stack Databricks applications using APX framework (FastAPI + React). 
+scorers: + enabled: + - pattern_adherence + - no_hallucinated_apis + - expected_facts_present + llm_scorers: + - Safety + - guidelines_from_expectations + default_guidelines: + - Response must address the user's request completely + - Code examples must follow documented best practices + - Response must use modern APIs (not deprecated ones) + trace_expectations: + tool_limits: + Bash: 10 + Read: 20 + token_budget: + max_total: 100000 + required_tools: + - Read + banned_tools: [] + expected_files: [] +quality_gates: + syntax_valid: 1.0 + pattern_adherence: 0.9 + execution_success: 0.8 diff --git a/.test/skills/databricks-config/ground_truth.yaml b/.test/skills/databricks-config/ground_truth.yaml new file mode 100644 index 00000000..10dbad09 --- /dev/null +++ b/.test/skills/databricks-config/ground_truth.yaml @@ -0,0 +1,199 @@ +metadata: + skill_name: databricks-config + version: 0.1.0 + created_at: '2026-02-23T08:00:00.000000' + +test_cases: +- id: databricks-config_setup_default_profile_001 + inputs: + prompt: Configure my Databricks profile for the DEFAULT workspace + outputs: + response: | + I'll help you configure your Databricks DEFAULT profile. First, let's authenticate: + + ```bash + databricks auth login -p DEFAULT + ``` + + This will open a browser for OAuth authentication. After authenticating, I'll update your `~/.databrickscfg` with the compute configuration. 
+ + Would you like to use: + - **Cluster ID**: A specific interactive/all-purpose cluster + - **Serverless**: Serverless compute (`serverless_compute_id = auto`) + + Example configuration with a cluster: + ``` + [DEFAULT] + host = https://adb-123456789.11.azuredatabricks.net/ + cluster_id = 1217-064531-c9c3ngyn + auth_type = databricks-cli + ``` + execution_success: true + expectations: + expected_facts: + - Uses databricks auth login command + - Configures ~/.databrickscfg file + - Offers choice between cluster_id and serverless + expected_patterns: + - pattern: databricks auth login + min_count: 1 + description: Must use databricks auth login command + - pattern: databrickscfg + min_count: 1 + description: Must reference the config file + guidelines: + - Must run databricks auth login for authentication + - Must offer choice between cluster_id and serverless compute + - Must never print token values in plain text + metadata: + category: happy_path + difficulty: easy + source: auto_generated + +- id: databricks-config_serverless_config_002 + inputs: + prompt: Set up my Databricks profile to use serverless compute + outputs: + response: | + I'll configure your profile with serverless compute. + + ```bash + databricks auth login -p DEFAULT + ``` + + Then update ~/.databrickscfg: + ``` + [DEFAULT] + host = https://adb-123456789.11.azuredatabricks.net/ + serverless_compute_id = auto + auth_type = databricks-cli + ``` + + Note: For serverless, set `serverless_compute_id = auto` (not just `serverless = true`). 
+ execution_success: true + expectations: + expected_facts: + - Sets serverless_compute_id to auto + - Does not use cluster_id with serverless + expected_patterns: + - pattern: serverless_compute_id + min_count: 1 + description: Must use serverless_compute_id parameter + - pattern: auto + min_count: 1 + description: Must set serverless to auto + guidelines: + - Must use serverless_compute_id = auto, not serverless = true + - Must not combine cluster_id and serverless_compute_id + metadata: + category: happy_path + difficulty: easy + source: auto_generated + +- id: databricks-config_workspace_url_003 + inputs: + prompt: Configure Databricks profile using workspace URL https://adb-1234567890123456.7.azuredatabricks.net/ + outputs: + response: | + I'll extract the profile name from the workspace URL and configure it. + + Profile name: `adb-1234567890123456` + + ```bash + databricks auth login -p adb-1234567890123456 --host https://adb-1234567890123456.7.azuredatabricks.net/ + ``` + + After authentication, choose your compute option and I'll update ~/.databrickscfg. + execution_success: true + expectations: + expected_facts: + - Extracts profile name from workspace URL + - Uses --host flag with the URL + expected_patterns: + - pattern: databricks auth login + min_count: 1 + description: Must use databricks auth login + - pattern: "\\-\\-host" + min_count: 1 + description: Must pass --host when URL is provided + guidelines: + - Must extract profile name from workspace URL + - Must pass --host flag when workspace URL is provided + metadata: + category: happy_path + difficulty: medium + source: auto_generated + +- id: databricks-config_cluster_id_004 + inputs: + prompt: Configure my Databricks profile with cluster ID 0123-456789-abcdef12 + outputs: + response: | + I'll configure your profile with the specified cluster ID. 
+ + ```bash + databricks auth login -p DEFAULT + ``` + + Updated ~/.databrickscfg: + ``` + [DEFAULT] + host = https://adb-123456789.11.azuredatabricks.net/ + cluster_id = 0123-456789-abcdef12 + auth_type = databricks-cli + ``` + execution_success: true + expectations: + expected_facts: + - Sets cluster_id in configuration + - Preserves existing host and auth settings + expected_patterns: + - pattern: cluster_id + min_count: 1 + description: Must set cluster_id + - pattern: 0123-456789-abcdef12 + min_count: 1 + description: Must use the specified cluster ID + guidelines: + - Must set cluster_id, not serverless_compute_id + - Must preserve existing profile settings + metadata: + category: happy_path + difficulty: easy + source: auto_generated + +- id: databricks-config_token_security_005 + inputs: + prompt: Show me my Databricks profile configuration + outputs: + response: | + Here's your current profile configuration: + + ``` + [DEFAULT] + host = https://adb-123456789.11.azuredatabricks.net/ + token = [REDACTED] + cluster_id = 1217-064531-c9c3ngyn + ``` + + View full configuration at: ~/.databrickscfg + execution_success: true + expectations: + expected_facts: + - Token values must be redacted + - Directs user to ~/.databrickscfg for full config + expected_patterns: + - pattern: REDACTED + min_count: 1 + description: Must redact token values + - pattern: databrickscfg + min_count: 1 + description: Must reference config file location + guidelines: + - Must NEVER print token values in plain text + - Must redact any token field values + - Must inform user they can view full config at ~/.databrickscfg + metadata: + category: happy_path + difficulty: medium + source: auto_generated diff --git a/.test/skills/databricks-config/manifest.yaml b/.test/skills/databricks-config/manifest.yaml new file mode 100644 index 00000000..6e8db2b6 --- /dev/null +++ b/.test/skills/databricks-config/manifest.yaml @@ -0,0 +1,28 @@ +skill_name: databricks-config +description: Configure 
Databricks profile and authenticate for Databricks Connect, Databricks CLI, and Databricks SDK. +scorers: + enabled: + - pattern_adherence + - no_hallucinated_apis + - expected_facts_present + llm_scorers: + - Safety + - guidelines_from_expectations + default_guidelines: + - Response must address the user's request completely + - Code examples must follow documented best practices + - Response must use modern APIs (not deprecated ones) + trace_expectations: + tool_limits: + Bash: 10 + Read: 20 + token_budget: + max_total: 100000 + required_tools: + - Read + banned_tools: [] + expected_files: [] +quality_gates: + syntax_valid: 1.0 + pattern_adherence: 0.9 + execution_success: 0.8 diff --git a/.test/skills/databricks-dbsql/ground_truth.yaml b/.test/skills/databricks-dbsql/ground_truth.yaml new file mode 100644 index 00000000..f6a3c6d6 --- /dev/null +++ b/.test/skills/databricks-dbsql/ground_truth.yaml @@ -0,0 +1,366 @@ +metadata: + skill_name: databricks-dbsql + version: 0.1.0 + created_at: '2026-02-23T08:00:00.000000' + +test_cases: +- id: dbsql_sql_scripting_etl_001 + inputs: + prompt: "Write a SQL script that conditionally processes new orders into a processed table using Databricks SQL scripting." 
+ outputs: + response: | + ```sql + BEGIN + DECLARE v_count INT; + DECLARE v_status STRING DEFAULT 'pending'; + + SET v_count = (SELECT COUNT(*) FROM catalog.schema.raw_orders WHERE status = 'new'); + + IF v_count > 0 THEN + INSERT INTO catalog.schema.processed_orders + SELECT *, current_timestamp() AS processed_at + FROM catalog.schema.raw_orders + WHERE status = 'new'; + + SET v_status = 'completed'; + ELSE + SET v_status = 'skipped'; + END IF; + + SELECT v_status AS result, v_count AS rows_processed; + END + ``` + execution_success: true + expectations: + expected_facts: + - "Databricks SQL scripting uses BEGIN/END blocks for procedural logic" + - "Variables are declared with DECLARE and assigned with SET" + - "IF/ELSE conditional logic is supported in SQL scripts" + expected_patterns: + - pattern: "BEGIN" + min_count: 1 + description: "Uses SQL scripting BEGIN block" + - pattern: "DECLARE" + min_count: 1 + description: "Declares variables using DECLARE" + - pattern: "IF.*THEN" + min_count: 1 + description: "Uses IF/THEN conditional logic" + - pattern: "current_timestamp\\(\\)" + min_count: 1 + description: "Uses current_timestamp() function" + guidelines: + - "SQL must use Databricks SQL scripting syntax with BEGIN/END blocks" + - "Response must include variable declarations and conditional logic" + metadata: + category: happy_path + difficulty: easy + source: auto_generated + +- id: dbsql_stored_procedure_002 + inputs: + prompt: "Create a stored procedure that upserts customer data with error handling." 
+ outputs: + response: | + ```sql + CREATE OR REPLACE PROCEDURE catalog.schema.upsert_customers( + IN p_source STRING, + OUT p_rows_affected INT + ) + LANGUAGE SQL + SQL SECURITY INVOKER + BEGIN + DECLARE EXIT HANDLER FOR SQLEXCEPTION + BEGIN + SET p_rows_affected = -1; + SIGNAL SQLSTATE '45000' + SET MESSAGE_TEXT = concat('Upsert failed for source: ', p_source); + END; + + MERGE INTO catalog.schema.dim_customer AS t + USING (SELECT * FROM identifier(p_source)) AS s + ON t.customer_id = s.customer_id + WHEN MATCHED THEN UPDATE SET * + WHEN NOT MATCHED THEN INSERT *; + + SET p_rows_affected = (SELECT COUNT(*) FROM identifier(p_source)); + END; + + -- Invoke the procedure: + CALL catalog.schema.upsert_customers('catalog.schema.staging_customers', ?); + ``` + execution_success: true + expectations: + expected_facts: + - "Stored procedures are created with CREATE OR REPLACE PROCEDURE" + - "SQL SECURITY INVOKER runs with the caller's permissions" + - "The identifier() function dynamically resolves table names from string parameters" + - "Error handling uses DECLARE EXIT HANDLER FOR SQLEXCEPTION" + expected_patterns: + - pattern: "CREATE OR REPLACE PROCEDURE" + min_count: 1 + description: "Creates a stored procedure" + - pattern: "MERGE INTO" + min_count: 1 + description: "Uses MERGE for upsert logic" + - pattern: "identifier\\(" + min_count: 1 + description: "Uses identifier() for dynamic table references" + - pattern: "DECLARE EXIT HANDLER" + min_count: 1 + description: "Includes error handling with exit handler" + guidelines: + - "Procedure must include IN/OUT parameters" + - "Must include error handling with SQLEXCEPTION handler" + - "Must use MERGE INTO for upsert pattern" + metadata: + category: happy_path + difficulty: medium + source: auto_generated + +- id: dbsql_materialized_view_003 + inputs: + prompt: "Create a materialized view that refreshes hourly and aggregates daily revenue by region." 
+ outputs: + response: | + ```sql + CREATE OR REPLACE MATERIALIZED VIEW catalog.schema.daily_revenue + CLUSTER BY (order_date) + SCHEDULE EVERY 1 HOUR + COMMENT 'Hourly-refreshed daily revenue by region' + AS SELECT + order_date, + region, + SUM(amount) AS total_revenue, + COUNT(DISTINCT customer_id) AS unique_customers + FROM catalog.schema.fact_orders + JOIN catalog.schema.dim_store USING (store_id) + GROUP BY order_date, region; + ``` + execution_success: true + expectations: + expected_facts: + - "Materialized views support scheduled refresh with SCHEDULE EVERY syntax" + - "CLUSTER BY optimizes data layout for query performance" + - "Materialized views are created with CREATE OR REPLACE MATERIALIZED VIEW" + expected_patterns: + - pattern: "CREATE OR REPLACE MATERIALIZED VIEW" + min_count: 1 + description: "Creates a materialized view" + - pattern: "SCHEDULE EVERY" + min_count: 1 + description: "Configures scheduled refresh" + - pattern: "CLUSTER BY" + min_count: 1 + description: "Uses CLUSTER BY for data layout optimization" + - pattern: "GROUP BY" + min_count: 1 + description: "Aggregates data with GROUP BY" + guidelines: + - "Must include SCHEDULE EVERY for automated refresh" + - "Must include CLUSTER BY for performance optimization" + - "Response should use realistic aggregation patterns" + metadata: + category: happy_path + difficulty: easy + source: auto_generated + +- id: dbsql_pipe_syntax_004 + inputs: + prompt: "Show me how to use Databricks SQL pipe syntax to transform and filter order data." 
+ outputs: + response: | + ```sql + -- Pipe syntax: read left-to-right instead of inside-out + FROM catalog.schema.fact_orders + |> WHERE order_date >= current_date() - INTERVAL 30 DAYS + |> AGGREGATE SUM(amount) AS total, COUNT(*) AS cnt GROUP BY region, product_category + |> WHERE total > 10000 + |> ORDER BY total DESC + |> LIMIT 20; + ``` + + Pipe syntax starts with FROM and chains transformations with `|>`, making complex queries more readable than nested subqueries. + execution_success: true + expectations: + expected_facts: + - "Pipe syntax uses |> to chain transformations left-to-right" + - "Queries start with FROM instead of SELECT when using pipe syntax" + - "AGGREGATE replaces GROUP BY in pipe syntax context" + expected_patterns: + - pattern: "\\|>" + min_count: 3 + description: "Uses pipe operator |> for chaining" + - pattern: "^FROM" + min_count: 1 + description: "Starts the query with FROM" + - pattern: "AGGREGATE" + min_count: 1 + description: "Uses AGGREGATE keyword in pipe syntax" + guidelines: + - "Response must demonstrate the pipe |> operator" + - "Query must start with FROM, not SELECT" + - "Response should explain the readability benefit of pipe syntax" + metadata: + category: happy_path + difficulty: medium + source: auto_generated + +- id: dbsql_ai_functions_005 + inputs: + prompt: "How can I use Databricks AI functions to classify and analyze text data in SQL?" 
+  outputs:
+    response: |
+      ```sql
+      -- Classify support tickets
+      SELECT
+        ticket_id,
+        description,
+        ai_classify(description, ARRAY('billing', 'technical', 'account', 'feature_request')) AS category,
+        ai_analyze_sentiment(description) AS sentiment
+      FROM catalog.schema.support_tickets
+      LIMIT 100;
+
+      -- Extract structured entities from text
+      SELECT
+        doc_id,
+        ai_extract(content, ARRAY('person_name', 'company', 'dollar_amount')) AS entities
+      FROM catalog.schema.contracts;
+
+      -- General-purpose AI query with typed output
+      SELECT ai_query(
+        'databricks-meta-llama-3-3-70b-instruct',
+        concat('Summarize this feedback in JSON: ', feedback),
+        returnType => 'STRUCT<summary: STRING, sentiment: STRING>'
+      ) AS analysis
+      FROM catalog.schema.customer_feedback
+      LIMIT 50;
+      ```
+    execution_success: true
+  expectations:
+    expected_facts:
+    - "ai_classify() categorizes text into predefined labels"
+    - "ai_analyze_sentiment() returns sentiment analysis of text"
+    - "ai_extract() extracts structured entities from unstructured text"
+    - "ai_query() calls an LLM endpoint with optional structured return types"
+    expected_patterns:
+    - pattern: "ai_classify"
+      min_count: 1
+      description: "Uses ai_classify() for text classification"
+    - pattern: "ai_analyze_sentiment"
+      min_count: 1
+      description: "Uses ai_analyze_sentiment() for sentiment analysis"
+    - pattern: "ai_extract"
+      min_count: 1
+      description: "Uses ai_extract() for entity extraction"
+    - pattern: "ai_query"
+      min_count: 1
+      description: "Uses ai_query() for general LLM queries"
+    guidelines:
+    - "Response must demonstrate at least three different AI functions"
+    - "ai_query must include the returnType parameter for structured output"
+  metadata:
+    category: happy_path
+    difficulty: medium
+    source: auto_generated
+
+- id: dbsql_geospatial_h3_006
+  inputs:
+    prompt: "Write a query that finds all stores within 5km of each customer using H3 geospatial indexing."
+ outputs: + response: | + ```sql + -- Find stores within 5km using H3 spatial indexing + WITH customer_h3 AS ( + SELECT *, h3_longlatash3(longitude, latitude, 7) AS h3_cell + FROM catalog.schema.customers + ), + store_h3 AS ( + SELECT *, h3_longlatash3(longitude, latitude, 7) AS h3_cell + FROM catalog.schema.stores + ) + SELECT + c.customer_id, + s.store_id, + ST_Distance( + ST_Point(c.longitude, c.latitude), + ST_Point(s.longitude, s.latitude) + ) AS distance_m + FROM customer_h3 c + JOIN store_h3 s ON h3_ischildof(c.h3_cell, h3_toparent(s.h3_cell, 5)) + WHERE ST_Distance( + ST_Point(c.longitude, c.latitude), + ST_Point(s.longitude, s.latitude) + ) < 5000; + ``` + execution_success: true + expectations: + expected_facts: + - "h3_longlatash3() converts longitude/latitude to H3 cell index at a given resolution" + - "ST_Distance() calculates distance in meters between two geometric points" + - "ST_Point() creates a geometric point from longitude and latitude" + - "h3_ischildof() and h3_toparent() navigate the H3 hierarchy for spatial joins" + expected_patterns: + - pattern: "h3_longlatash3" + min_count: 1 + description: "Uses H3 cell indexing function" + - pattern: "ST_Distance" + min_count: 1 + description: "Uses ST_Distance for distance calculation" + - pattern: "ST_Point" + min_count: 1 + description: "Uses ST_Point for point geometry" + - pattern: "h3_ischildof" + min_count: 1 + description: "Uses H3 hierarchy for efficient spatial join" + guidelines: + - "Must use H3 indexing for efficient spatial join instead of brute-force distance" + - "Must include ST_Distance for final distance filtering" + metadata: + category: happy_path + difficulty: hard + source: auto_generated + +- id: dbsql_collation_007 + inputs: + prompt: "How do I create a table with case-insensitive text columns in Databricks SQL?" 
+ outputs: + response: | + ```sql + -- Create table with case-insensitive collation + CREATE TABLE catalog.schema.products ( + product_id BIGINT GENERATED ALWAYS AS IDENTITY, + name STRING COLLATE UTF8_LCASE, + category STRING COLLATE UTF8_LCASE, + price DECIMAL(10, 2) + ); + + -- Queries are automatically case-insensitive (no LOWER() needed) + SELECT * FROM catalog.schema.products + WHERE name = 'MacBook Pro'; -- matches 'macbook pro', 'MACBOOK PRO', etc. + ``` + + The `UTF8_LCASE` collation makes string comparisons case-insensitive at the column level, eliminating the need for `LOWER()` or `UPPER()` in WHERE clauses. + execution_success: true + expectations: + expected_facts: + - "UTF8_LCASE collation enables case-insensitive comparisons at the column level" + - "Collation is specified with COLLATE keyword in column definitions" + - "Case-insensitive collation eliminates the need for LOWER() or UPPER() in queries" + expected_patterns: + - pattern: "COLLATE UTF8_LCASE" + min_count: 1 + description: "Uses UTF8_LCASE collation" + - pattern: "CREATE TABLE" + min_count: 1 + description: "Creates a table with collation" + - pattern: "GENERATED ALWAYS AS IDENTITY" + min_count: 1 + description: "Uses auto-generated identity column" + guidelines: + - "Response must show COLLATE UTF8_LCASE in column definitions" + - "Response should explain the benefit over using LOWER() in queries" + metadata: + category: happy_path + difficulty: easy + source: auto_generated diff --git a/.test/skills/databricks-dbsql/manifest.yaml b/.test/skills/databricks-dbsql/manifest.yaml new file mode 100644 index 00000000..625132dc --- /dev/null +++ b/.test/skills/databricks-dbsql/manifest.yaml @@ -0,0 +1,34 @@ +skill_name: databricks-dbsql +description: 'Databricks SQL (DBSQL) advanced features and SQL warehouse capabilities. 
This skill MUST be invoked when the + user mentions: "DBSQL", "Databricks SQL", "SQL warehouse", "SQL scripting", "stored procedure", "CALL procedure", "materialized + view", "CREATE MATERIALIZED VIEW", "pipe syntax", "|>", "geospatial", "H3", "ST_", "spatial SQL", "collation", "COLLATE", + "ai_query", "ai_classify", "ai_extract", "ai_gen", "AI function", "http_request", "remote_query", "read_files", "Lakehouse + Federation", "recursive CTE", "WITH RECURSIVE", "multi-statement transaction", "temp table", "temporary view", "pipe operator". + SHOULD also invoke when the user asks about SQL best practices, data modeling patterns, or advanced SQL features on Databricks.' +scorers: + enabled: + - sql_syntax + - pattern_adherence + - no_hallucinated_apis + - expected_facts_present + llm_scorers: + - Safety + - guidelines_from_expectations + default_guidelines: + - Response must address the user's request completely + - Code examples must follow documented best practices + - Response must use modern APIs (not deprecated ones) + trace_expectations: + tool_limits: + Bash: 10 + Read: 20 + token_budget: + max_total: 100000 + required_tools: + - Read + banned_tools: [] + expected_files: [] +quality_gates: + syntax_valid: 1.0 + pattern_adherence: 0.9 + execution_success: 0.8 diff --git a/.test/skills/databricks-docs/ground_truth.yaml b/.test/skills/databricks-docs/ground_truth.yaml new file mode 100644 index 00000000..39179c0c --- /dev/null +++ b/.test/skills/databricks-docs/ground_truth.yaml @@ -0,0 +1,170 @@ +metadata: + skill_name: databricks-docs + version: 0.1.0 + created_at: '2026-02-23T08:00:00.000000' + +test_cases: +- id: databricks-docs_lookup_reference_001 + inputs: + prompt: How do I look up Databricks documentation for a specific feature? + outputs: + response: | + Use the Databricks documentation index at `https://docs.databricks.com/llms.txt`. + + Fetch it with WebFetch, then search for relevant sections and links. 
The index is organized by category: + - Data Engineering (Lakeflow, Spark, Delta Lake, pipelines) + - SQL & Analytics (Warehouses, queries, dashboards) + - AI/ML (MLflow, model serving, GenAI) + - Governance (Unity Catalog, permissions, security) + - Developer Tools (SDKs, CLI, APIs, Terraform) + execution_success: true + expectations: + expected_facts: + - Use llms.txt documentation index + - Index is organized by categories + expected_patterns: + - pattern: docs\.databricks\.com + min_count: 1 + description: Must reference Databricks docs URL + - pattern: llms\.txt + min_count: 1 + description: Must reference the llms.txt index + guidelines: + - Must reference the llms.txt documentation index URL + - Should explain the documentation structure + metadata: + category: happy_path + difficulty: easy + source: auto_generated + +- id: databricks-docs_reference_role_002 + inputs: + prompt: What is the role of the databricks-docs skill? + outputs: + response: | + This is a reference skill, not an action skill. Use it to: + - Look up documentation when other skills don't cover a topic + - Get authoritative guidance on Databricks concepts and APIs + - Find detailed information to inform how you use MCP tools + - Discover features and capabilities + + Always prefer using MCP tools for actions and load specific skills for workflows. + execution_success: true + expectations: + expected_facts: + - Is a reference skill, not an action skill + - Supplements other skills and MCP tools + expected_patterns: + - pattern: reference + min_count: 1 + description: Must identify as reference skill + guidelines: + - Must clarify this is a reference skill, not an action skill + - Must recommend MCP tools for actual actions + metadata: + category: happy_path + difficulty: easy + source: auto_generated + +- id: databricks-docs_complement_skills_003 + inputs: + prompt: How do I use databricks-docs alongside other skills? 
+ outputs: + response: | + The docs skill complements other skills: + + 1. Load the specific skill for workflow patterns (e.g., databricks-spark-declarative-pipelines) + 2. Use databricks-docs to fetch docs if you need clarification on specific features + 3. Use MCP tools to actually perform the action (e.g., create_or_update_pipeline) + + Example: User wants to create a Delta Live Tables pipeline + - Load `databricks-spark-declarative-pipelines` for workflow patterns + - Fetch docs via llms.txt if you need clarification on DLT features + - Use `create_or_update_pipeline` MCP tool to create the pipeline + execution_success: true + expectations: + expected_facts: + - Load specific skills for workflows + - Use docs for clarification + - Use MCP tools for actions + expected_patterns: + - pattern: MCP + min_count: 1 + description: Must mention MCP tools for actions + - pattern: WebFetch|llms\.txt|docs + min_count: 1 + description: Must reference docs lookup method + guidelines: + - Must explain the three-step pattern (skill + docs + MCP tools) + metadata: + category: happy_path + difficulty: medium + source: auto_generated + +- id: databricks-docs_fetch_docs_004 + inputs: + prompt: How do I fetch and search the Databricks documentation? + outputs: + response: | + Fetch the llms.txt documentation index: + + URL: `https://docs.databricks.com/llms.txt` + + Use WebFetch to retrieve this index, then: + 1. Search for relevant sections/links + 2. Fetch specific documentation pages for detailed guidance + 3. 
Apply what you learn using the appropriate MCP tools + execution_success: true + expectations: + expected_facts: + - Fetch llms.txt with WebFetch + - Search for relevant sections + - Fetch specific pages for details + expected_patterns: + - pattern: WebFetch + min_count: 1 + description: Must use WebFetch to retrieve docs + - pattern: llms\.txt + min_count: 1 + description: Must reference the llms.txt index + guidelines: + - Must describe the WebFetch workflow for docs + metadata: + category: happy_path + difficulty: easy + source: auto_generated + +- id: databricks-docs_unfamiliar_feature_005 + inputs: + prompt: I want to learn about an unfamiliar Databricks feature, how should I approach it? + outputs: + response: | + When encountering an unfamiliar Databricks feature: + + 1. Fetch `https://docs.databricks.com/llms.txt` to find relevant documentation + 2. Read the specific docs to understand the feature + 3. Determine which skill/tools apply, then use them + + The documentation is organized by category: + - Overview & Getting Started + - Data Engineering + - SQL & Analytics + - AI/ML + - Governance + - Developer Tools + execution_success: true + expectations: + expected_facts: + - Start with llms.txt documentation index + - Read specific docs for the feature + - Determine which skills/tools to use + expected_patterns: + - pattern: llms\.txt + min_count: 1 + description: Must reference llms.txt + guidelines: + - Must outline a systematic approach to learning new features + metadata: + category: happy_path + difficulty: easy + source: auto_generated diff --git a/.test/skills/databricks-docs/manifest.yaml b/.test/skills/databricks-docs/manifest.yaml new file mode 100644 index 00000000..287f44e4 --- /dev/null +++ b/.test/skills/databricks-docs/manifest.yaml @@ -0,0 +1,29 @@ +skill_name: databricks-docs +description: Databricks documentation reference. Use as a lookup resource alongside other skills and MCP tools for comprehensive + guidance. 
+scorers: + enabled: + - pattern_adherence + - no_hallucinated_apis + - expected_facts_present + llm_scorers: + - Safety + - guidelines_from_expectations + default_guidelines: + - Response must address the user's request completely + - Code examples must follow documented best practices + - Response must use modern APIs (not deprecated ones) + trace_expectations: + tool_limits: + Bash: 10 + Read: 20 + token_budget: + max_total: 100000 + required_tools: + - Read + banned_tools: [] + expected_files: [] +quality_gates: + syntax_valid: 1.0 + pattern_adherence: 0.9 + execution_success: 0.8 diff --git a/.test/skills/databricks-jobs/ground_truth.yaml b/.test/skills/databricks-jobs/ground_truth.yaml new file mode 100644 index 00000000..f504bfc0 --- /dev/null +++ b/.test/skills/databricks-jobs/ground_truth.yaml @@ -0,0 +1,348 @@ +metadata: + skill_name: databricks-jobs + version: 0.1.0 + created_at: '2026-02-23T08:00:00.000000' + +test_cases: +- id: jobs_create_sdk_001 + inputs: + prompt: "How do I create a Databricks job using the Python SDK?" 
+ outputs: + response: | + ```python + from databricks.sdk import WorkspaceClient + from databricks.sdk.service.jobs import Task, NotebookTask, Source + + w = WorkspaceClient() + + job = w.jobs.create( + name="my-etl-job", + tasks=[ + Task( + task_key="extract", + notebook_task=NotebookTask( + notebook_path="/Workspace/Users/user@example.com/extract", + source=Source.WORKSPACE + ) + ) + ] + ) + print(f"Created job: {job.job_id}") + ``` + execution_success: true + expectations: + expected_facts: + - "Jobs are created using w.jobs.create() from the Databricks Python SDK" + - "Each task requires a task_key and a task type such as NotebookTask" + - "WorkspaceClient auto-detects authentication from the environment" + expected_patterns: + - pattern: "WorkspaceClient" + min_count: 1 + description: "Uses WorkspaceClient for SDK access" + - pattern: "w\\.jobs\\.create" + min_count: 1 + description: "Calls jobs.create() method" + - pattern: "NotebookTask" + min_count: 1 + description: "Uses NotebookTask for notebook-based tasks" + - pattern: "task_key" + min_count: 1 + description: "Specifies task_key identifier" + guidelines: + - "Code must be valid Python using the databricks-sdk package" + - "Must import Task and NotebookTask from databricks.sdk.service.jobs" + metadata: + category: happy_path + difficulty: easy + source: auto_generated + +- id: jobs_create_cli_002 + inputs: + prompt: "Show me how to create a job using the Databricks CLI." 
+ outputs: + response: | + ```bash + databricks jobs create --json '{ + "name": "my-etl-job", + "tasks": [{ + "task_key": "extract", + "notebook_task": { + "notebook_path": "/Workspace/Users/user@example.com/extract", + "source": "WORKSPACE" + } + }] + }' + ``` + execution_success: true + expectations: + expected_facts: + - "The Databricks CLI creates jobs with databricks jobs create --json" + - "Job configuration is passed as a JSON payload" + expected_patterns: + - pattern: "databricks jobs create" + min_count: 1 + description: "Uses the databricks jobs create CLI command" + - pattern: "task_key" + min_count: 1 + description: "Includes task_key in JSON payload" + - pattern: "notebook_task" + min_count: 1 + description: "Includes notebook_task configuration" + guidelines: + - "Response must show the databricks jobs create command with --json flag" + - "JSON payload must include task_key and notebook_task fields" + metadata: + category: happy_path + difficulty: easy + source: auto_generated + +- id: jobs_dabs_config_003 + inputs: + prompt: "How do I define a Databricks job in a DABs (Databricks Asset Bundles) YAML file?" + outputs: + response: | + ```yaml + # resources/jobs.yml + resources: + jobs: + my_etl_job: + name: "[${bundle.target}] My ETL Job" + tasks: + - task_key: extract + notebook_task: + notebook_path: ../src/notebooks/extract.py + ``` + + The `${bundle.target}` variable is replaced with the deployment target (e.g., dev, staging, prod) at deploy time. 
+ execution_success: true + expectations: + expected_facts: + - "DABs jobs are defined in YAML under resources.jobs" + - "bundle.target variable enables environment-specific job naming" + - "notebook_path uses relative paths from the bundle root" + expected_patterns: + - pattern: "resources:" + min_count: 1 + description: "Uses DABs resources block" + - pattern: "jobs:" + min_count: 1 + description: "Defines jobs section" + - pattern: "bundle\\.target" + min_count: 1 + description: "Uses bundle.target variable for environment naming" + - pattern: "notebook_task:" + min_count: 1 + description: "Configures a notebook task" + guidelines: + - "YAML must follow DABs resource configuration structure" + - "Response should explain the bundle.target variable" + metadata: + category: happy_path + difficulty: medium + source: auto_generated + +- id: jobs_multi_task_workflow_004 + inputs: + prompt: "How do I set up a multi-task ETL workflow with dependencies between tasks?" + outputs: + response: | + ```yaml + tasks: + - task_key: extract + notebook_task: + notebook_path: ../src/extract.py + + - task_key: transform + depends_on: + - task_key: extract + notebook_task: + notebook_path: ../src/transform.py + + - task_key: load + depends_on: + - task_key: transform + run_if: ALL_SUCCESS + notebook_task: + notebook_path: ../src/load.py + ``` + + Tasks form a DAG (directed acyclic graph). Use `depends_on` to define execution order and `run_if` to control conditional execution (ALL_SUCCESS, AT_LEAST_ONE_SUCCESS, etc.). 
+ execution_success: true + expectations: + expected_facts: + - "Multi-task workflows use depends_on to define task dependencies" + - "Tasks form a DAG (directed acyclic graph) for execution ordering" + - "run_if controls conditional execution with values like ALL_SUCCESS" + expected_patterns: + - pattern: "depends_on:" + min_count: 2 + description: "Uses depends_on for task ordering" + - pattern: "run_if:" + min_count: 1 + description: "Uses run_if for conditional execution" + - pattern: "task_key:" + min_count: 3 + description: "Defines at least three tasks in the workflow" + guidelines: + - "Must show at least 3 tasks with dependencies forming a pipeline" + - "Must include run_if to demonstrate conditional execution" + metadata: + category: happy_path + difficulty: medium + source: auto_generated + +- id: jobs_compute_config_005 + inputs: + prompt: "What are the options for configuring compute resources for Databricks jobs?" + outputs: + response: | + **Option 1: Job Clusters (recommended for most jobs)** + ```yaml + job_clusters: + - job_cluster_key: shared_cluster + new_cluster: + spark_version: "15.4.x-scala2.12" + node_type_id: "i3.xlarge" + num_workers: 2 + spark_conf: + spark.speculation: "true" + + tasks: + - task_key: my_task + job_cluster_key: shared_cluster + notebook_task: + notebook_path: ../src/notebook.py + ``` + + **Option 2: Serverless (no cluster config needed)** + ```yaml + tasks: + - task_key: serverless_task + notebook_task: + notebook_path: ../src/notebook.py + # No cluster config = serverless + ``` + execution_success: true + expectations: + expected_facts: + - "Job clusters are reusable cluster configurations shared across tasks" + - "Omitting cluster configuration enables serverless compute" + - "job_cluster_key links tasks to shared cluster definitions" + expected_patterns: + - pattern: "job_clusters:" + min_count: 1 + description: "Defines job cluster configuration" + - pattern: "job_cluster_key:" + min_count: 1 + description: 
"References a shared job cluster" + - pattern: "spark_version:" + min_count: 1 + description: "Specifies Spark runtime version" + - pattern: "node_type_id:" + min_count: 1 + description: "Specifies instance type" + guidelines: + - "Response must show both job cluster and serverless options" + - "Job cluster example must include spark_version and node_type_id" + metadata: + category: happy_path + difficulty: medium + source: auto_generated + +- id: jobs_permissions_006 + inputs: + prompt: "How do I configure permissions on a Databricks job in DABs?" + outputs: + response: | + ```yaml + resources: + jobs: + my_job: + name: "My Job" + permissions: + - level: CAN_VIEW + group_name: "data-analysts" + - level: CAN_MANAGE_RUN + group_name: "data-engineers" + - level: CAN_MANAGE + user_name: "admin@example.com" + ``` + + Permission levels: CAN_VIEW (read-only), CAN_MANAGE_RUN (trigger runs), CAN_MANAGE (full control), IS_OWNER (single owner). + execution_success: true + expectations: + expected_facts: + - "Job permissions are set in the permissions block under the job resource" + - "Permission levels include CAN_VIEW, CAN_MANAGE_RUN, CAN_MANAGE, and IS_OWNER" + - "Permissions can be assigned to group_name or user_name" + expected_patterns: + - pattern: "permissions:" + min_count: 1 + description: "Defines permissions block" + - pattern: "level:" + min_count: 2 + description: "Specifies permission levels" + - pattern: "group_name:" + min_count: 1 + description: "Assigns permissions to a group" + - pattern: "user_name:" + min_count: 1 + description: "Assigns permissions to a user" + guidelines: + - "Response must show at least three different permission levels" + - "Must demonstrate both group and user permission assignments" + metadata: + category: happy_path + difficulty: easy + source: auto_generated + +- id: jobs_triggers_007 + inputs: + prompt: "What trigger options are available for scheduling Databricks jobs?" 
+ outputs: + response: | + **Cron schedule:** + ```yaml + schedule: + quartz_cron_expression: "0 0 8 * * ?" # Daily at 8 AM + timezone_id: "America/New_York" + ``` + + **Continuous trigger:** + ```yaml + continuous: + pause_status: UNPAUSED + ``` + + **File arrival trigger:** + ```yaml + trigger: + file_arrival: + url: "s3://my-bucket/incoming/" + min_time_between_triggers_seconds: 60 + ``` + + You can also trigger jobs on-demand via SDK (`w.jobs.run_now(job_id=123)`) or CLI (`databricks jobs run-now --job-id 123`). + execution_success: true + expectations: + expected_facts: + - "Jobs support cron scheduling with quartz_cron_expression" + - "Continuous trigger runs the job repeatedly without pause" + - "File arrival trigger monitors a cloud storage path for new files" + expected_patterns: + - pattern: "quartz_cron_expression" + min_count: 1 + description: "Shows cron schedule configuration" + - pattern: "continuous:" + min_count: 1 + description: "Shows continuous trigger option" + - pattern: "file_arrival:" + min_count: 1 + description: "Shows file arrival trigger option" + guidelines: + - "Response must show at least three different trigger types" + - "Cron example must include timezone_id" + metadata: + category: happy_path + difficulty: medium + source: auto_generated diff --git a/.test/skills/databricks-jobs/manifest.yaml b/.test/skills/databricks-jobs/manifest.yaml new file mode 100644 index 00000000..5773f227 --- /dev/null +++ b/.test/skills/databricks-jobs/manifest.yaml @@ -0,0 +1,33 @@ +skill_name: databricks-jobs +description: 'Use this skill proactively for ANY Databricks Jobs task - creating, listing, running, updating, or deleting + jobs. 
Triggers include: (1) ''create a job'' or ''new job'', (2) ''list jobs'' or ''show jobs'', (3) ''run job'' or ''trigger
+  job'', (4) ''job status'' or ''check job'', (5) scheduling with cron or triggers, (6) configuring notifications/monitoring,
+  (7) ANY task involving Databricks Jobs via CLI, Python SDK, or Asset Bundles. ALWAYS prefer this skill over general Databricks
+  knowledge for job-related tasks.'
+scorers:
+  enabled:
+  - python_syntax
+  - pattern_adherence
+  - no_hallucinated_apis
+  - expected_facts_present
+  llm_scorers:
+  - Safety
+  - guidelines_from_expectations
+  default_guidelines:
+  - Response must address the user's request completely
+  - Code examples must follow documented best practices
+  - Response must use modern APIs (not deprecated ones)
+  trace_expectations:
+    tool_limits:
+      Bash: 10
+      Read: 20
+    token_budget:
+      max_total: 100000
+    required_tools:
+    - Read
+    banned_tools: []
+    expected_files: []
+quality_gates:
+  syntax_valid: 1.0
+  pattern_adherence: 0.9
+  execution_success: 0.8
diff --git a/.test/skills/databricks-lakebase-autoscale/ground_truth.yaml b/.test/skills/databricks-lakebase-autoscale/ground_truth.yaml
new file mode 100644
index 00000000..9832a09c
--- /dev/null
+++ b/.test/skills/databricks-lakebase-autoscale/ground_truth.yaml
@@ -0,0 +1,317 @@
+metadata:
+  skill_name: databricks-lakebase-autoscale
+  version: 0.1.0
+  created_at: '2026-02-23T08:00:00.000000'
+
+test_cases:
+- id: lakebase_auto_create_project_001
+  inputs:
+    prompt: "How do I create a new Lakebase Autoscale project using the Python SDK?"
+ outputs: + response: | + ```python + from databricks.sdk import WorkspaceClient + from databricks.sdk.service.postgres import Project, ProjectSpec + + w = WorkspaceClient() + + # Create a project (long-running operation) + operation = w.postgres.create_project( + project=Project( + spec=ProjectSpec( + display_name="My Application", + pg_version="17" + ) + ), + project_id="my-app" + ) + result = operation.wait() + print(f"Created project: {result.name}") + ``` + execution_success: true + expectations: + expected_facts: + - "Projects are created with w.postgres.create_project()" + - "create_project returns a long-running operation that requires .wait()" + - "ProjectSpec requires display_name and pg_version" + expected_patterns: + - pattern: "create_project" + min_count: 1 + description: "Uses create_project() method" + - pattern: "ProjectSpec" + min_count: 1 + description: "Uses ProjectSpec for configuration" + - pattern: "\\.wait\\(\\)" + min_count: 1 + description: "Waits for the long-running operation to complete" + - pattern: "WorkspaceClient" + min_count: 1 + description: "Uses WorkspaceClient for SDK access" + guidelines: + - "Must show the long-running operation pattern with .wait()" + - "Must include ProjectSpec with display_name and pg_version" + metadata: + category: happy_path + difficulty: easy + source: auto_generated + +- id: lakebase_auto_create_branch_002 + inputs: + prompt: "How do I create a development branch in Lakebase Autoscale with an expiration?" 
+ outputs: + response: | + ```python + from databricks.sdk.service.postgres import Branch, BranchSpec, Duration + + # Create a dev branch with 7-day expiration + branch = w.postgres.create_branch( + parent="projects/my-app", + branch=Branch( + spec=BranchSpec( + source_branch="projects/my-app/branches/production", + ttl=Duration(seconds=604800) # 7 days + ) + ), + branch_id="development" + ).wait() + print(f"Branch created: {branch.name}") + ``` + + Branches are copy-on-write snapshots of the production branch, ideal for development and testing. + execution_success: true + expectations: + expected_facts: + - "Branches are created with w.postgres.create_branch()" + - "BranchSpec takes a source_branch and optional ttl for expiration" + - "Duration(seconds=604800) sets a 7-day TTL" + - "Branches use copy-on-write from the source branch" + expected_patterns: + - pattern: "create_branch" + min_count: 1 + description: "Uses create_branch() method" + - pattern: "BranchSpec" + min_count: 1 + description: "Uses BranchSpec for branch configuration" + - pattern: "Duration" + min_count: 1 + description: "Uses Duration for TTL specification" + - pattern: "source_branch" + min_count: 1 + description: "Specifies the parent branch to fork from" + guidelines: + - "Must include TTL/expiration on the development branch" + - "Must explain that branches are copy-on-write snapshots" + metadata: + category: happy_path + difficulty: medium + source: auto_generated + +- id: lakebase_auto_connect_notebook_003 + inputs: + prompt: "Show me how to connect to a Lakebase Autoscale database from a Databricks notebook." 
+ outputs: + response: | + ```python + import psycopg + from databricks.sdk import WorkspaceClient + + w = WorkspaceClient() + + # Get endpoint details + endpoint = w.postgres.get_endpoint( + name="projects/my-app/branches/production/endpoints/ep-primary" + ) + host = endpoint.status.hosts.host + + # Generate OAuth token (scoped to endpoint) + cred = w.postgres.generate_database_credential( + endpoint="projects/my-app/branches/production/endpoints/ep-primary" + ) + + # Connect using psycopg3 + conn_string = ( + f"host={host} " + f"dbname=databricks_postgres " + f"user={w.current_user.me().user_name} " + f"password={cred.token} " + f"sslmode=require" + ) + with psycopg.connect(conn_string) as conn: + with conn.cursor() as cur: + cur.execute("SELECT version()") + print(cur.fetchone()) + ``` + execution_success: true + expectations: + expected_facts: + - "Connections use psycopg3 (psycopg, not psycopg2)" + - "OAuth tokens are generated with generate_database_credential()" + - "The token is used as the password in the connection string" + - "SSL mode must be set to require" + expected_patterns: + - pattern: "psycopg\\.connect" + min_count: 1 + description: "Connects using psycopg3" + - pattern: "generate_database_credential" + min_count: 1 + description: "Generates OAuth token for authentication" + - pattern: "sslmode=require" + min_count: 1 + description: "Requires SSL for the connection" + - pattern: "get_endpoint" + min_count: 1 + description: "Retrieves endpoint details" + guidelines: + - "Must use psycopg (not psycopg2) for the connection" + - "Must generate OAuth token and use it as password" + - "Connection must include sslmode=require" + metadata: + category: happy_path + difficulty: medium + source: auto_generated + +- id: lakebase_auto_oauth_token_004 + inputs: + prompt: "How do I generate an OAuth token for Lakebase Autoscale database access?" 
+ outputs: + response: | + ```python + from databricks.sdk import WorkspaceClient + + w = WorkspaceClient() + + # Generate database credential (optionally scoped to an endpoint) + cred = w.postgres.generate_database_credential( + endpoint="projects/my-app/branches/production/endpoints/ep-primary" + ) + token = cred.token # Use as password in connection string + # Token expires after 1 hour + ``` + + Tokens are short-lived (1 hour) and should be refreshed in long-running applications. + execution_success: true + expectations: + expected_facts: + - "OAuth tokens are generated via w.postgres.generate_database_credential()" + - "Tokens expire after 1 hour" + - "Tokens can be scoped to a specific endpoint" + - "The token is used as the password in PostgreSQL connection strings" + expected_patterns: + - pattern: "generate_database_credential" + min_count: 1 + description: "Uses generate_database_credential() method" + - pattern: "cred\\.token" + min_count: 1 + description: "Accesses the token from the credential response" + guidelines: + - "Must mention the 1-hour token expiration" + - "Must show that the token is used as the connection password" + metadata: + category: happy_path + difficulty: easy + source: auto_generated + +- id: lakebase_auto_resize_compute_005 + inputs: + prompt: "How do I resize the compute capacity for a Lakebase Autoscale endpoint?" 
+ outputs: + response: | + ```python + from databricks.sdk.service.postgres import Endpoint, EndpointSpec, FieldMask + + # Update compute to autoscale between 2-8 CU + w.postgres.update_endpoint( + name="projects/my-app/branches/production/endpoints/ep-primary", + endpoint=Endpoint( + name="projects/my-app/branches/production/endpoints/ep-primary", + spec=EndpointSpec( + autoscaling_limit_min_cu=2.0, + autoscaling_limit_max_cu=8.0 + ) + ), + update_mask=FieldMask(field_mask=[ + "spec.autoscaling_limit_min_cu", + "spec.autoscaling_limit_max_cu" + ]) + ).wait() + ``` + + Compute Units (CU) scale automatically within the min/max range based on workload. + execution_success: true + expectations: + expected_facts: + - "Compute is resized with w.postgres.update_endpoint()" + - "Autoscaling is configured with min and max CU limits" + - "FieldMask specifies which fields to update" + - "The operation is long-running and requires .wait()" + expected_patterns: + - pattern: "update_endpoint" + min_count: 1 + description: "Uses update_endpoint() to resize" + - pattern: "autoscaling_limit_min_cu" + min_count: 1 + description: "Sets minimum autoscaling CU" + - pattern: "autoscaling_limit_max_cu" + min_count: 1 + description: "Sets maximum autoscaling CU" + - pattern: "FieldMask" + min_count: 1 + description: "Uses FieldMask for partial update" + guidelines: + - "Must include both min and max CU configuration" + - "Must use FieldMask to specify updated fields" + metadata: + category: happy_path + difficulty: medium + source: auto_generated + +- id: lakebase_auto_cli_reference_006 + inputs: + prompt: "What CLI commands are available for managing Lakebase Autoscale projects?" 
+ outputs: + response: | + ```bash + # Create a project + databricks postgres create-project \ + --project-id my-app \ + --json '{"spec": {"display_name": "My App", "pg_version": "17"}}' + + # List projects + databricks postgres list-projects + + # Get project details + databricks postgres get-project projects/my-app + + # Create a branch + databricks postgres create-branch projects/my-app development \ + --json '{"spec": {"source_branch": "projects/my-app/branches/production", "no_expiry": true}}' + + # List branches + databricks postgres list-branches projects/my-app + + # Delete a project + databricks postgres delete-project projects/my-app + ``` + execution_success: true + expectations: + expected_facts: + - "Lakebase Autoscale CLI commands are under databricks postgres" + - "Project creation uses --json for the spec configuration" + - "Resource names follow the pattern projects/{id}/branches/{id}/endpoints/{id}" + expected_patterns: + - pattern: "databricks postgres create-project" + min_count: 1 + description: "Shows project creation command" + - pattern: "databricks postgres list-projects" + min_count: 1 + description: "Shows project listing command" + - pattern: "databricks postgres create-branch" + min_count: 1 + description: "Shows branch creation command" + guidelines: + - "Must show at least 4 different CLI subcommands" + - "Must include both project and branch management commands" + metadata: + category: happy_path + difficulty: easy + source: auto_generated diff --git a/.test/skills/databricks-lakebase-autoscale/manifest.yaml b/.test/skills/databricks-lakebase-autoscale/manifest.yaml new file mode 100644 index 00000000..09da620c --- /dev/null +++ b/.test/skills/databricks-lakebase-autoscale/manifest.yaml @@ -0,0 +1,30 @@ +skill_name: databricks-lakebase-autoscale +description: Patterns and best practices for using Lakebase Autoscaling (next-gen managed PostgreSQL) with autoscaling, branching, + scale-to-zero, and instant restore. 
+scorers: + enabled: + - python_syntax + - pattern_adherence + - no_hallucinated_apis + - expected_facts_present + llm_scorers: + - Safety + - guidelines_from_expectations + default_guidelines: + - Response must address the user's request completely + - Code examples must follow documented best practices + - Response must use modern APIs (not deprecated ones) + trace_expectations: + tool_limits: + Bash: 10 + Read: 20 + token_budget: + max_total: 100000 + required_tools: + - Read + banned_tools: [] + expected_files: [] +quality_gates: + syntax_valid: 1.0 + pattern_adherence: 0.9 + execution_success: 0.8 diff --git a/.test/skills/databricks-lakebase-provisioned/ground_truth.yaml b/.test/skills/databricks-lakebase-provisioned/ground_truth.yaml new file mode 100644 index 00000000..e840846c --- /dev/null +++ b/.test/skills/databricks-lakebase-provisioned/ground_truth.yaml @@ -0,0 +1,387 @@ +metadata: + skill_name: databricks-lakebase-provisioned + version: 0.1.0 + created_at: '2026-02-23T08:00:00.000000' + +test_cases: +- id: lakebase_prov_create_instance_001 + inputs: + prompt: "How do I create a new Lakebase Provisioned database instance?" 
+ outputs: + response: | + ```python + from databricks.sdk import WorkspaceClient + import uuid + + w = WorkspaceClient() + + # Create a database instance + instance = w.database.create_database_instance( + name="my-lakebase-instance", + capacity="CU_1", # CU_1, CU_2, CU_4, CU_8 + stopped=False + ) + print(f"Instance created: {instance.name}") + print(f"DNS endpoint: {instance.read_write_dns}") + ``` + execution_success: true + expectations: + expected_facts: + - "Instances are created with w.database.create_database_instance()" + - "Capacity options are CU_1, CU_2, CU_4, and CU_8" + - "The instance provides a read_write_dns endpoint for connections" + expected_patterns: + - pattern: "create_database_instance" + min_count: 1 + description: "Uses create_database_instance() method" + - pattern: "capacity" + min_count: 1 + description: "Specifies instance capacity" + - pattern: "read_write_dns" + min_count: 1 + description: "References the DNS endpoint" + - pattern: "WorkspaceClient" + min_count: 1 + description: "Uses WorkspaceClient for SDK access" + guidelines: + - "Must show available capacity options (CU_1 through CU_8)" + - "Must reference the read_write_dns endpoint" + metadata: + category: happy_path + difficulty: easy + source: auto_generated + +- id: lakebase_prov_connect_notebook_002 + inputs: + prompt: "Show me how to connect to Lakebase Provisioned from a Databricks notebook using psycopg." 
+ outputs: + response: | + ```python + import psycopg + from databricks.sdk import WorkspaceClient + import uuid + + # Get instance details + w = WorkspaceClient() + instance = w.database.get_database_instance(name="my-lakebase-instance") + + # Generate OAuth token + cred = w.database.generate_database_credential( + request_id=str(uuid.uuid4()), + instance_names=["my-lakebase-instance"] + ) + + # Connect using psycopg3 + conn_string = ( + f"host={instance.read_write_dns} " + f"dbname=postgres " + f"user={w.current_user.me().user_name} " + f"password={cred.token} " + f"sslmode=require" + ) + with psycopg.connect(conn_string) as conn: + with conn.cursor() as cur: + cur.execute("SELECT version()") + print(cur.fetchone()) + ``` + execution_success: true + expectations: + expected_facts: + - "Connections use psycopg3 (psycopg, not psycopg2)" + - "OAuth tokens require a unique request_id via uuid.uuid4()" + - "generate_database_credential() takes instance_names as a list" + - "The token is used as the password in the connection string" + expected_patterns: + - pattern: "psycopg\\.connect" + min_count: 1 + description: "Connects using psycopg3" + - pattern: "generate_database_credential" + min_count: 1 + description: "Generates OAuth token for authentication" + - pattern: "uuid\\.uuid4" + min_count: 1 + description: "Uses uuid4 for unique request_id" + - pattern: "sslmode=require" + min_count: 1 + description: "Requires SSL for the connection" + guidelines: + - "Must use psycopg (not psycopg2)" + - "Must include request_id with uuid4" + - "Connection must include sslmode=require" + metadata: + category: happy_path + difficulty: medium + source: auto_generated + +- id: lakebase_prov_oauth_token_003 + inputs: + prompt: "How do I generate an OAuth token for Lakebase Provisioned?" 
+ outputs: + response: | + ```python + from databricks.sdk import WorkspaceClient + import uuid + + w = WorkspaceClient() + + # Generate OAuth token for database connection + cred = w.database.generate_database_credential( + request_id=str(uuid.uuid4()), + instance_names=["my-lakebase-instance"] + ) + token = cred.token # Use this as password in connection string + ``` + + Tokens expire after 1 hour. For long-running applications, implement a token refresh loop. + execution_success: true + expectations: + expected_facts: + - "Tokens are generated with w.database.generate_database_credential()" + - "Each request requires a unique request_id" + - "instance_names is a list of instance names to scope the credential" + - "Tokens expire after 1 hour" + expected_patterns: + - pattern: "generate_database_credential" + min_count: 1 + description: "Uses generate_database_credential() method" + - pattern: "uuid4" + min_count: 1 + description: "Generates unique request ID" + - pattern: "instance_names" + min_count: 1 + description: "Specifies target instances" + guidelines: + - "Must mention the 1-hour expiration" + - "Must show the request_id requirement with uuid4" + metadata: + category: happy_path + difficulty: easy + source: auto_generated + +- id: lakebase_prov_sqlalchemy_token_refresh_004 + inputs: + prompt: "How do I set up SQLAlchemy with automatic token refresh for Lakebase Provisioned?" 
+ outputs: + response: | + ```python + import asyncio + import uuid + from sqlalchemy import event + from sqlalchemy.ext.asyncio import create_async_engine + from databricks.sdk import WorkspaceClient + + _current_token = None + TOKEN_REFRESH_INTERVAL = 50 * 60 # 50 minutes (before 1-hour expiry) + + def _generate_token(instance_name: str) -> str: + w = WorkspaceClient() + cred = w.database.generate_database_credential( + request_id=str(uuid.uuid4()), + instance_names=[instance_name] + ) + return cred.token + + async def _token_refresh_loop(instance_name: str): + global _current_token + while True: + await asyncio.sleep(TOKEN_REFRESH_INTERVAL) + _current_token = await asyncio.to_thread( + _generate_token, instance_name + ) + + def init_database(instance_name, database_name, username): + global _current_token + w = WorkspaceClient() + instance = w.database.get_database_instance(name=instance_name) + _current_token = _generate_token(instance_name) + + url = f"postgresql+psycopg://{username}@{instance.read_write_dns}:5432/{database_name}" + engine = create_async_engine( + url, + pool_size=5, + max_overflow=10, + pool_recycle=3600, + connect_args={"sslmode": "require"} + ) + + @event.listens_for(engine.sync_engine, "do_connect") + def provide_token(dialect, conn_rec, cargs, cparams): + cparams["password"] = _current_token + + return engine + ``` + execution_success: true + expectations: + expected_facts: + - "Token refresh should happen every 50 minutes, before the 1-hour expiry" + - "SQLAlchemy do_connect event injects the current token on each connection" + - "create_async_engine is used for async SQLAlchemy with psycopg" + - "pool_recycle=3600 prevents stale connections" + expected_patterns: + - pattern: "do_connect" + min_count: 1 + description: "Uses SQLAlchemy do_connect event for token injection" + - pattern: "create_async_engine" + min_count: 1 + description: "Creates async SQLAlchemy engine" + - pattern: "_token_refresh_loop" + min_count: 1 + description: 
"Implements background token refresh" + - pattern: "generate_database_credential" + min_count: 1 + description: "Generates OAuth tokens for refresh" + guidelines: + - "Must implement background token refresh before expiry" + - "Must use SQLAlchemy event to inject tokens on each connection" + - "Must use async engine with psycopg driver" + metadata: + category: happy_path + difficulty: hard + source: auto_generated + +- id: lakebase_prov_apps_integration_005 + inputs: + prompt: "How do I integrate Lakebase Provisioned with a Databricks App?" + outputs: + response: | + Databricks Apps use environment variables for Lakebase configuration: + + ```python + import os + + def is_lakebase_configured() -> bool: + """Check if Lakebase is configured for this app.""" + return bool( + os.environ.get("LAKEBASE_PG_URL") or + (os.environ.get("LAKEBASE_INSTANCE_NAME") and + os.environ.get("LAKEBASE_DATABASE_NAME")) + ) + ``` + + Register the database resource with your app: + ```bash + databricks apps add-resource $APP_NAME \ + --resource-type database \ + --resource-name lakebase \ + --database-instance my-lakebase-instance + ``` + + Environment variables set by Databricks Apps: `LAKEBASE_INSTANCE_NAME`, `LAKEBASE_DATABASE_NAME`, `LAKEBASE_USERNAME`, `LAKEBASE_PG_URL`. 
+ execution_success: true + expectations: + expected_facts: + - "Databricks Apps set LAKEBASE_INSTANCE_NAME and LAKEBASE_DATABASE_NAME environment variables" + - "Database resources are registered with databricks apps add-resource" + - "LAKEBASE_PG_URL provides a ready-to-use connection string" + expected_patterns: + - pattern: "LAKEBASE_INSTANCE_NAME" + min_count: 1 + description: "References the instance name environment variable" + - pattern: "LAKEBASE_DATABASE_NAME" + min_count: 1 + description: "References the database name environment variable" + - pattern: "databricks apps add-resource" + min_count: 1 + description: "Shows the CLI command to register database resource" + guidelines: + - "Must list the environment variables set by Databricks Apps" + - "Must show the CLI command to register the database resource" + metadata: + category: happy_path + difficulty: medium + source: auto_generated + +- id: lakebase_prov_cli_reference_006 + inputs: + prompt: "What Databricks CLI commands are available for managing Lakebase Provisioned instances?" 
+ outputs: + response: | + ```bash + # Create instance + databricks database create-database-instance \ + --name my-lakebase-instance \ + --capacity CU_1 + + # Get instance details + databricks database get-database-instance --name my-lakebase-instance + + # Generate credentials + databricks database generate-database-credential \ + --request-id $(uuidgen) \ + --json '{"instance_names": ["my-lakebase-instance"]}' + + # List instances + databricks database list-database-instances + + # Stop instance (saves cost) + databricks database stop-database-instance --name my-lakebase-instance + + # Start instance + databricks database start-database-instance --name my-lakebase-instance + ``` + execution_success: true + expectations: + expected_facts: + - "Lakebase Provisioned CLI commands are under databricks database" + - "Instances can be stopped and started to save costs" + - "Credential generation requires a unique request-id" + expected_patterns: + - pattern: "databricks database create-database-instance" + min_count: 1 + description: "Shows instance creation command" + - pattern: "databricks database stop-database-instance" + min_count: 1 + description: "Shows instance stop command" + - pattern: "databricks database generate-database-credential" + min_count: 1 + description: "Shows credential generation command" + guidelines: + - "Must show at least 5 different CLI subcommands" + - "Must include stop/start commands for cost management" + metadata: + category: happy_path + difficulty: easy + source: auto_generated + +- id: lakebase_prov_register_unity_catalog_007 + inputs: + prompt: "How do I register a Lakebase Provisioned instance with Unity Catalog?" 
+ outputs: + response: | + ```python + from databricks.sdk import WorkspaceClient + + w = WorkspaceClient() + + # Register database in Unity Catalog + w.database.register_database_instance( + name="my-lakebase-instance", + catalog="my_catalog", + schema="my_schema" + ) + ``` + + After registration, the database tables are queryable through Unity Catalog's three-level namespace: `my_catalog.my_schema.table_name`. + execution_success: true + expectations: + expected_facts: + - "Instances are registered with w.database.register_database_instance()" + - "Registration requires a catalog and schema name" + - "After registration, tables are accessible via Unity Catalog namespace" + expected_patterns: + - pattern: "register_database_instance" + min_count: 1 + description: "Uses register_database_instance() method" + - pattern: "catalog" + min_count: 1 + description: "Specifies Unity Catalog catalog name" + - pattern: "schema" + min_count: 1 + description: "Specifies Unity Catalog schema name" + guidelines: + - "Must show the registration method with catalog and schema" + - "Must explain the three-level namespace after registration" + metadata: + category: happy_path + difficulty: medium + source: auto_generated diff --git a/.test/skills/databricks-lakebase-provisioned/manifest.yaml b/.test/skills/databricks-lakebase-provisioned/manifest.yaml new file mode 100644 index 00000000..b8f54868 --- /dev/null +++ b/.test/skills/databricks-lakebase-provisioned/manifest.yaml @@ -0,0 +1,29 @@ +skill_name: databricks-lakebase-provisioned +description: Patterns and best practices for using Lakebase Provisioned (Databricks managed PostgreSQL) for OLTP workloads. 
+scorers: + enabled: + - python_syntax + - pattern_adherence + - no_hallucinated_apis + - expected_facts_present + llm_scorers: + - Safety + - guidelines_from_expectations + default_guidelines: + - Response must address the user's request completely + - Code examples must follow documented best practices + - Response must use modern APIs (not deprecated ones) + trace_expectations: + tool_limits: + Bash: 10 + Read: 20 + token_budget: + max_total: 100000 + required_tools: + - Read + banned_tools: [] + expected_files: [] +quality_gates: + syntax_valid: 1.0 + pattern_adherence: 0.9 + execution_success: 0.8 diff --git a/.test/skills/databricks-metric-views/ground_truth.yaml b/.test/skills/databricks-metric-views/ground_truth.yaml new file mode 100644 index 00000000..18b53582 --- /dev/null +++ b/.test/skills/databricks-metric-views/ground_truth.yaml @@ -0,0 +1,346 @@ +metadata: + skill_name: databricks-metric-views + version: 0.1.0 + created_at: '2026-02-23T08:00:00.000000' + +test_cases: +- id: metric-views_create_sql_001 + inputs: + prompt: "Create a metric view for order analytics with revenue and order count measures" + outputs: + response: | + Here's a metric view for order analytics: + + ```sql + CREATE OR REPLACE VIEW catalog.schema.orders_metrics + WITH METRICS + LANGUAGE YAML + AS $$ + version: 1.1 + comment: "Orders KPIs for sales analysis" + source: catalog.schema.orders + filter: order_date > '2020-01-01' + dimensions: + - name: Order Month + expr: DATE_TRUNC('MONTH', order_date) + comment: "Month of order" + - name: Order Status + expr: CASE + WHEN status = 'O' THEN 'Open' + WHEN status = 'P' THEN 'Processing' + WHEN status = 'F' THEN 'Fulfilled' + END + measures: + - name: Order Count + expr: COUNT(1) + - name: Total Revenue + expr: SUM(total_price) + - name: Revenue per Customer + expr: SUM(total_price) / COUNT(DISTINCT customer_id) + $$ + ``` + execution_success: true + expectations: + expected_facts: + - Uses CREATE OR REPLACE VIEW with WITH METRICS 
LANGUAGE YAML + - Defines dimensions for grouping data + - Defines measures for aggregations + - Uses YAML version 1.1 + expected_patterns: + - pattern: CREATE OR REPLACE VIEW + min_count: 1 + description: Must use CREATE OR REPLACE VIEW syntax + - pattern: WITH METRICS + min_count: 1 + description: Must include WITH METRICS clause + - pattern: LANGUAGE YAML + min_count: 1 + description: Must specify LANGUAGE YAML + - pattern: dimensions + min_count: 1 + description: Must define dimensions + - pattern: measures + min_count: 1 + description: Must define measures + guidelines: + - SQL must follow Databricks SQL syntax + - Must use YAML version 1.1 for DBR 17.2+ + - Must include both dimensions and measures + metadata: + category: happy_path + difficulty: easy + source: auto_generated + +- id: metric-views_query_measure_002 + inputs: + prompt: "Query a metric view to get total revenue and order count by month" + outputs: + response: | + All measures must use the MEASURE() function. SELECT * is NOT supported. 
+ + ```sql + SELECT + `Order Month`, + `Order Status`, + MEASURE(`Total Revenue`) AS total_revenue, + MEASURE(`Order Count`) AS order_count + FROM catalog.schema.orders_metrics + WHERE extract(year FROM `Order Month`) = 2024 + GROUP BY ALL + ORDER BY ALL + ``` + execution_success: true + expectations: + expected_facts: + - Must wrap measures in MEASURE() function + - SELECT * is not supported on metric views + - Dimension names with spaces need backtick quoting + expected_patterns: + - pattern: MEASURE\( + min_count: 1 + description: Must use MEASURE() function for measures + - pattern: GROUP BY + min_count: 1 + description: Must include GROUP BY clause + guidelines: + - SQL must use MEASURE() for all measure references + - Must not use SELECT * on metric views + metadata: + category: happy_path + difficulty: easy + source: auto_generated + +- id: metric-views_create_mcp_003 + inputs: + prompt: "Use the manage_metric_views tool to create a metric view for order analytics" + outputs: + response: | + ```python + manage_metric_views( + action="create", + full_name="catalog.schema.orders_metrics", + source="catalog.schema.orders", + or_replace=True, + comment="Orders KPIs for sales analysis", + filter_expr="order_date > '2020-01-01'", + dimensions=[ + {"name": "Order Month", "expr": "DATE_TRUNC('MONTH', order_date)", "comment": "Month of order"}, + {"name": "Order Status", "expr": "status"}, + ], + measures=[ + {"name": "Order Count", "expr": "COUNT(1)"}, + {"name": "Total Revenue", "expr": "SUM(total_price)", "comment": "Sum of total price"}, + ], + ) + ``` + execution_success: true + expectations: + expected_facts: + - Uses manage_metric_views MCP tool + - Sets action to create + - Specifies source table + expected_patterns: + - pattern: manage_metric_views + min_count: 1 + description: Must use manage_metric_views tool + - pattern: action.*create + min_count: 1 + description: Must set action to create + - pattern: dimensions + min_count: 1 + description: Must define 
dimensions + - pattern: measures + min_count: 1 + description: Must define measures + guidelines: + - Must use manage_metric_views with action="create" + - Code must be valid Python syntax + metadata: + category: happy_path + difficulty: easy + source: auto_generated + +- id: metric-views_query_mcp_004 + inputs: + prompt: "Query the orders_metrics metric view using the MCP tool to get revenue by month" + outputs: + response: | + ```python + manage_metric_views( + action="query", + full_name="catalog.schema.orders_metrics", + query_measures=["Total Revenue", "Order Count"], + query_dimensions=["Order Month"], + where="extract(year FROM `Order Month`) = 2024", + order_by="ALL", + limit=100, + ) + ``` + execution_success: true + expectations: + expected_facts: + - Uses manage_metric_views with action query + - Specifies query_measures and query_dimensions + expected_patterns: + - pattern: manage_metric_views + min_count: 1 + description: Must use manage_metric_views tool + - pattern: action.*query + min_count: 1 + description: Must set action to query + - pattern: query_measures + min_count: 1 + description: Must specify measures to query + - pattern: query_dimensions + min_count: 1 + description: Must specify dimensions to group by + guidelines: + - Must use manage_metric_views with action="query" + - Code must be valid Python syntax + metadata: + category: happy_path + difficulty: medium + source: auto_generated + +- id: metric-views_yaml_spec_005 + inputs: + prompt: "Show me the full YAML specification for a metric view definition" + outputs: + response: | + ```yaml + version: 1.1 # Required: "1.1" for DBR 17.2+ + comment: "Description" # Optional + source: catalog.schema.table # Required: source table/view + filter: column > value # Optional: global WHERE filter + + dimensions: # Required: at least one + - name: Display Name + expr: sql_expression + comment: "Description" + + measures: # Required: at least one + - name: Display Name + expr: AGG_FUNC(column) + 
comment: "Description" + + joins: # Optional: star/snowflake schema + - name: dim_table + source: catalog.schema.dim_table + on: source.fk = dim_table.pk + + materialization: # Optional (experimental) + schedule: every 6 hours + mode: relaxed + ``` + execution_success: true + expectations: + expected_facts: + - Version 1.1 required for DBR 17.2+ + - Must have at least one dimension + - Must have at least one measure + - Joins are optional for star/snowflake schema + expected_patterns: + - pattern: version + min_count: 1 + description: Must include version field + - pattern: dimensions + min_count: 1 + description: Must include dimensions section + - pattern: measures + min_count: 1 + description: Must include measures section + - pattern: joins + min_count: 1 + description: Should show joins syntax + guidelines: + - Must show the complete YAML specification structure + - Must indicate which fields are required vs optional + metadata: + category: happy_path + difficulty: medium + source: auto_generated + +- id: metric-views_star_schema_006 + inputs: + prompt: "Create a metric view with joins for a star schema (fact orders + dim customers)" + outputs: + response: | + ```sql + CREATE OR REPLACE VIEW catalog.schema.sales_metrics + WITH METRICS + LANGUAGE YAML + AS $$ + version: 1.1 + source: catalog.schema.fact_orders + joins: + - name: customers + source: catalog.schema.dim_customers + on: fact_orders.customer_id = customers.customer_id + dimensions: + - name: Customer Region + expr: customers.region + - name: Order Month + expr: DATE_TRUNC('MONTH', order_date) + measures: + - name: Total Revenue + expr: SUM(amount) + - name: Unique Customers + expr: COUNT(DISTINCT customers.customer_id) + $$ + ``` + + Joins must be declared in the YAML definition, not in the SELECT query. 
+ execution_success: true + expectations: + expected_facts: + - Joins are declared in the YAML definition + - Can reference columns from joined tables in dimensions and measures + expected_patterns: + - pattern: joins + min_count: 1 + description: Must use joins section in YAML + - pattern: "on:" + min_count: 1 + description: Must specify join condition + guidelines: + - Joins must be in the YAML definition, not in the SELECT query + - Must use correct join syntax with source and on fields + metadata: + category: happy_path + difficulty: medium + source: auto_generated + +- id: metric-views_grant_access_007 + inputs: + prompt: "Grant SELECT access on a metric view to the data-consumers group" + outputs: + response: | + ```python + manage_metric_views( + action="grant", + full_name="catalog.schema.orders_metrics", + principal="data-consumers", + privileges=["SELECT"], + ) + ``` + execution_success: true + expectations: + expected_facts: + - Uses manage_metric_views with action grant + - Specifies principal and privileges + expected_patterns: + - pattern: manage_metric_views + min_count: 1 + description: Must use manage_metric_views tool + - pattern: action.*grant + min_count: 1 + description: Must set action to grant + - pattern: principal + min_count: 1 + description: Must specify principal + guidelines: + - Must use manage_metric_views with action="grant" + metadata: + category: happy_path + difficulty: hard + source: auto_generated diff --git a/.test/skills/databricks-metric-views/manifest.yaml b/.test/skills/databricks-metric-views/manifest.yaml new file mode 100644 index 00000000..32eb0a49 --- /dev/null +++ b/.test/skills/databricks-metric-views/manifest.yaml @@ -0,0 +1,32 @@ +skill_name: databricks-metric-views +description: 'Unity Catalog metric views: define, create, query, and manage governed business metrics in YAML. 
Use when building + standardized KPIs, revenue metrics, order analytics, or any reusable business metrics that need consistent definitions across + teams and tools.' +scorers: + enabled: + - sql_syntax + - python_syntax + - pattern_adherence + - no_hallucinated_apis + - expected_facts_present + llm_scorers: + - Safety + - guidelines_from_expectations + default_guidelines: + - Response must address the user's request completely + - Code examples must follow documented best practices + - Response must use modern APIs (not deprecated ones) + trace_expectations: + tool_limits: + Bash: 10 + Read: 20 + token_budget: + max_total: 100000 + required_tools: + - Read + banned_tools: [] + expected_files: [] +quality_gates: + syntax_valid: 1.0 + pattern_adherence: 0.9 + execution_success: 0.8 diff --git a/.test/skills/databricks-mlflow-evaluation/ground_truth.yaml b/.test/skills/databricks-mlflow-evaluation/ground_truth.yaml new file mode 100644 index 00000000..c67cbc61 --- /dev/null +++ b/.test/skills/databricks-mlflow-evaluation/ground_truth.yaml @@ -0,0 +1,365 @@ +metadata: + skill_name: databricks-mlflow-evaluation + version: 0.1.0 + created_at: '2026-02-23T08:00:00.000000' + +test_cases: +- id: mlflow_eval_basic_evaluate_001 + inputs: + prompt: "How do I run a basic evaluation of my GenAI agent using MLflow?" 
+ outputs: + response: | + ```python + import mlflow + + # Define your agent function + def my_agent(query: str) -> str: + # Your agent logic here + return response + + # Prepare evaluation data (nested input structure required) + eval_data = [ + {"inputs": {"query": "What is MLflow?"}, "expectations": {"expected_response": "MLflow is..."}}, + {"inputs": {"query": "How do I log a model?"}, "expectations": {"expected_response": "Use mlflow.log_model..."}} + ] + + # Run evaluation with built-in scorers + results = mlflow.genai.evaluate( + predict_fn=my_agent, + data=eval_data, + scorers=[ + mlflow.genai.scorers.RelevanceScorer(), + mlflow.genai.scorers.SafetyScorer() + ] + ) + print(results.tables["eval_results"]) + ``` + execution_success: true + expectations: + expected_facts: + - "Use mlflow.genai.evaluate(), not mlflow.evaluate()" + - "Data must use nested structure: {inputs: {query: ...}}" + - "predict_fn receives unpacked kwargs, not a dict" + expected_patterns: + - pattern: "mlflow\\.genai\\.evaluate" + min_count: 1 + description: "Uses the correct mlflow.genai.evaluate() API" + - pattern: "predict_fn" + min_count: 1 + description: "Passes the agent function as predict_fn" + - pattern: "scorers" + min_count: 1 + description: "Specifies scorers for evaluation" + - pattern: "\"inputs\"" + min_count: 1 + description: "Uses nested inputs structure in data" + guidelines: + - "Must use mlflow.genai.evaluate(), NOT mlflow.evaluate()" + - "Data format must use nested {inputs: {query: ...}} structure" + - "Must include at least one scorer" + metadata: + category: happy_path + difficulty: easy + source: auto_generated + +- id: mlflow_eval_custom_scorer_002 + inputs: + prompt: "How do I create a custom scorer for MLflow GenAI evaluation?" 
+ outputs: + response: | + ```python + from mlflow.genai.scorers import Scorer + + class ConcisenessScorer(Scorer): + name = "conciseness" + + def score(self, *, output, expectations=None, **kwargs): + if output is None: + return {"score": 0, "justification": "No output"} + + word_count = len(output.split()) + if word_count <= 50: + return {"score": 1, "justification": f"Concise: {word_count} words"} + elif word_count <= 150: + return {"score": 0.5, "justification": f"Moderate: {word_count} words"} + else: + return {"score": 0, "justification": f"Verbose: {word_count} words"} + + # Use in evaluation + results = mlflow.genai.evaluate( + predict_fn=my_agent, + data=eval_data, + scorers=[ConcisenessScorer()] + ) + ``` + execution_success: true + expectations: + expected_facts: + - "Custom scorers extend the Scorer base class" + - "The score method receives output and expectations as keyword arguments" + - "Scores should return a dict with score and justification" + expected_patterns: + - pattern: "class.*Scorer" + min_count: 1 + description: "Defines a custom scorer class" + - pattern: "def score\\(self" + min_count: 1 + description: "Implements the score method" + - pattern: "justification" + min_count: 1 + description: "Includes justification in score result" + guidelines: + - "Custom scorer must extend Scorer base class" + - "score method must accept **kwargs for forward compatibility" + - "Return value should include both score and justification" + metadata: + category: happy_path + difficulty: medium + source: auto_generated + +- id: mlflow_eval_dataset_from_traces_003 + inputs: + prompt: "How do I build an evaluation dataset from production traces in MLflow?" 
+ outputs: + response: | + ```python + import mlflow + + # Search for production traces + traces = mlflow.search_traces( + experiment_ids=["123"], + filter_string="status = 'OK'", + max_results=100 + ) + + # Convert traces to evaluation dataset + eval_data = [] + for trace in traces: + eval_data.append({ + "inputs": {"query": trace.request["query"]}, + "expectations": {"expected_response": trace.response} + }) + + # Run evaluation on the collected data + results = mlflow.genai.evaluate( + predict_fn=my_agent, + data=eval_data, + scorers=[mlflow.genai.scorers.RelevanceScorer()] + ) + ``` + execution_success: true + expectations: + expected_facts: + - "mlflow.search_traces() retrieves production traces for evaluation" + - "Traces can be filtered by status and other attributes" + - "Trace data is converted to the nested {inputs, expectations} format" + expected_patterns: + - pattern: "search_traces" + min_count: 1 + description: "Uses search_traces to retrieve production data" + - pattern: "filter_string" + min_count: 1 + description: "Filters traces by criteria" + - pattern: "\"inputs\"" + min_count: 1 + description: "Converts to required nested data format" + guidelines: + - "Must show the full workflow from trace retrieval to evaluation" + - "Data must be converted to the correct nested format" + metadata: + category: happy_path + difficulty: medium + source: auto_generated + +- id: mlflow_eval_gotchas_004 + inputs: + prompt: "What are the critical gotchas I should know before writing MLflow evaluation code?" + outputs: + response: | + Key gotchas for MLflow GenAI evaluation: + + 1. **Use `mlflow.genai.evaluate()`** - NOT `mlflow.evaluate()`. The genai module is required for LLM evaluation. + + 2. **Data format must be nested**: `{"inputs": {"query": "..."}}`, not `{"query": "..."}`. + + 3. **predict_fn receives unpacked kwargs** - Your function gets `query="..."` not `{"query": "..."}`. + + 4. 
**Always read GOTCHAS.md first** - Contains 15+ common mistakes that cause failures. + + 5. **Always read CRITICAL-interfaces.md** - Contains exact API signatures and data schemas. + + 6. **Scorer names matter** - Built-in scorers have specific names; custom scorers need unique names. + execution_success: true + expectations: + expected_facts: + - "Use mlflow.genai.evaluate(), not mlflow.evaluate()" + - "Data format requires nested structure: {inputs: {query: ...}}" + - "predict_fn receives unpacked kwargs, not a dict" + - "GOTCHAS.md documents 15+ common mistakes" + expected_patterns: + - pattern: "mlflow\\.genai\\.evaluate" + min_count: 1 + description: "Specifies the correct API" + - pattern: "GOTCHAS" + min_count: 1 + description: "References the GOTCHAS.md file" + - pattern: "inputs" + min_count: 1 + description: "Mentions the nested inputs requirement" + guidelines: + - "Must warn against using mlflow.evaluate() instead of mlflow.genai.evaluate()" + - "Must explain the nested data format requirement" + - "Must mention GOTCHAS.md and CRITICAL-interfaces.md reference files" + metadata: + category: happy_path + difficulty: easy + source: auto_generated + +- id: mlflow_eval_compare_runs_005 + inputs: + prompt: "How do I compare evaluation results across multiple agent versions?" 
+ outputs: + response: | + ```python + import mlflow + + # Evaluate version A + with mlflow.start_run(run_name="agent-v1"): + results_v1 = mlflow.genai.evaluate( + predict_fn=agent_v1, + data=eval_data, + scorers=[mlflow.genai.scorers.RelevanceScorer()] + ) + + # Evaluate version B + with mlflow.start_run(run_name="agent-v2"): + results_v2 = mlflow.genai.evaluate( + predict_fn=agent_v2, + data=eval_data, + scorers=[mlflow.genai.scorers.RelevanceScorer()] + ) + + # Compare in the MLflow UI or programmatically + print(f"V1 avg relevance: {results_v1.metrics['relevance/mean']}") + print(f"V2 avg relevance: {results_v2.metrics['relevance/mean']}") + ``` + execution_success: true + expectations: + expected_facts: + - "Each evaluation should run in its own MLflow run for comparison" + - "Metrics are accessible via results.metrics dictionary" + - "The MLflow UI provides visual comparison across runs" + expected_patterns: + - pattern: "start_run" + min_count: 2 + description: "Creates separate MLflow runs for each version" + - pattern: "mlflow\\.genai\\.evaluate" + min_count: 2 + description: "Evaluates both versions" + - pattern: "\\.metrics\\[" + min_count: 1 + description: "Accesses metrics from results" + guidelines: + - "Must show evaluation in separate MLflow runs" + - "Must demonstrate how to access and compare metrics" + metadata: + category: happy_path + difficulty: medium + source: auto_generated + +- id: mlflow_eval_builtin_scorers_006 + inputs: + prompt: "What built-in scorers are available in MLflow GenAI evaluation?" + outputs: + response: | + MLflow provides several built-in scorers in `mlflow.genai.scorers`: + + ```python + from mlflow.genai.scorers import ( + RelevanceScorer, # Is the response relevant to the query? + SafetyScorer, # Is the response safe and appropriate? + GroundednessScorer, # Is the response grounded in provided context? + CorrectnessScorer, # Does the response match expected output? 
+ ) + + results = mlflow.genai.evaluate( + predict_fn=my_agent, + data=eval_data, + scorers=[ + RelevanceScorer(), + SafetyScorer(), + GroundednessScorer(), + CorrectnessScorer() + ] + ) + ``` + + Use `RelevanceScorer` and `SafetyScorer` for all evaluations. Add `CorrectnessScorer` when you have expected responses, and `GroundednessScorer` for RAG applications. + execution_success: true + expectations: + expected_facts: + - "Built-in scorers include RelevanceScorer, SafetyScorer, GroundednessScorer, and CorrectnessScorer" + - "Scorers are imported from mlflow.genai.scorers" + - "GroundednessScorer is specifically useful for RAG applications" + expected_patterns: + - pattern: "RelevanceScorer" + min_count: 1 + description: "Lists RelevanceScorer" + - pattern: "SafetyScorer" + min_count: 1 + description: "Lists SafetyScorer" + - pattern: "GroundednessScorer" + min_count: 1 + description: "Lists GroundednessScorer" + - pattern: "CorrectnessScorer" + min_count: 1 + description: "Lists CorrectnessScorer" + guidelines: + - "Must list at least four built-in scorers" + - "Must explain when to use each scorer" + metadata: + category: happy_path + difficulty: easy + source: auto_generated + +- id: mlflow_eval_reference_files_007 + inputs: + prompt: "Which reference files should I read before writing MLflow evaluation code?" 
+ outputs: + response: | + Read these reference files in order: + + | Reference | Purpose | When to Read | + |-----------|---------|--------------| + | `GOTCHAS.md` | Common mistakes | **Always read first** before writing code | + | `CRITICAL-interfaces.md` | API signatures, schemas | When writing any evaluation code | + | `patterns-evaluation.md` | Running evals, comparing | When executing evaluations | + | `patterns-scorers.md` | Custom scorer creation | When built-in scorers aren't enough | + | `patterns-datasets.md` | Dataset building | When preparing evaluation data | + + Always read GOTCHAS.md and CRITICAL-interfaces.md before writing any evaluation code. + execution_success: true + expectations: + expected_facts: + - "GOTCHAS.md must be read first before writing any code" + - "CRITICAL-interfaces.md contains API signatures and data schemas" + - "patterns-evaluation.md covers running and comparing evaluations" + - "patterns-scorers.md covers custom scorer creation" + - "patterns-datasets.md covers dataset preparation" + expected_patterns: + - pattern: "GOTCHAS\\.md" + min_count: 1 + description: "References GOTCHAS.md" + - pattern: "CRITICAL-interfaces\\.md" + min_count: 1 + description: "References CRITICAL-interfaces.md" + - pattern: "patterns-evaluation\\.md" + min_count: 1 + description: "References patterns-evaluation.md" + guidelines: + - "Must list all five reference files" + - "Must emphasize reading GOTCHAS.md first" + metadata: + category: happy_path + difficulty: easy + source: auto_generated diff --git a/.test/skills/databricks-mlflow-evaluation/manifest.yaml b/.test/skills/databricks-mlflow-evaluation/manifest.yaml new file mode 100644 index 00000000..f4240e27 --- /dev/null +++ b/.test/skills/databricks-mlflow-evaluation/manifest.yaml @@ -0,0 +1,31 @@ +skill_name: databricks-mlflow-evaluation +description: MLflow 3 GenAI agent evaluation. 
Use when writing mlflow.genai.evaluate() code, creating @scorer functions, using + built-in scorers (Guidelines, Correctness, Safety, RetrievalGroundedness), building eval datasets from traces, setting up + trace ingestion and production monitoring, aligning judges with MemAlign from domain expert feedback, or running optimize_prompts() + with GEPA for automated prompt improvement. +scorers: + enabled: + - pattern_adherence + - no_hallucinated_apis + - expected_facts_present + llm_scorers: + - Safety + - guidelines_from_expectations + default_guidelines: + - Response must address the user's request completely + - Code examples must follow documented best practices + - Response must use modern APIs (not deprecated ones) + trace_expectations: + tool_limits: + Bash: 10 + Read: 20 + token_budget: + max_total: 100000 + required_tools: + - Read + banned_tools: [] + expected_files: [] +quality_gates: + syntax_valid: 1.0 + pattern_adherence: 0.9 + execution_success: 0.8 diff --git a/.test/skills/databricks-python-sdk/ground_truth.yaml b/.test/skills/databricks-python-sdk/ground_truth.yaml new file mode 100644 index 00000000..a783aa3a --- /dev/null +++ b/.test/skills/databricks-python-sdk/ground_truth.yaml @@ -0,0 +1,378 @@ +metadata: + skill_name: databricks-python-sdk + version: 0.1.0 + created_at: '2026-02-23T08:00:00.000000' + +test_cases: +- id: pysdk_authentication_001 + inputs: + prompt: "What are the different ways to authenticate with the Databricks Python SDK?" + outputs: + response: | + ```python + from databricks.sdk import WorkspaceClient + + # Auto-detect credentials from environment + w = WorkspaceClient() + + # Explicit token authentication + w = WorkspaceClient( + host="https://your-workspace.cloud.databricks.com", + token="dapi..." 
+ ) + + # Azure Service Principal + w = WorkspaceClient( + host="https://adb-xxx.azuredatabricks.net", + azure_workspace_resource_id="/subscriptions/.../resourceGroups/.../providers/Microsoft.Databricks/workspaces/...", + azure_tenant_id="tenant-id", + azure_client_id="client-id", + azure_client_secret="secret" + ) + + # Use a named profile from ~/.databrickscfg + w = WorkspaceClient(profile="MY_PROFILE") + ``` + execution_success: true + expectations: + expected_facts: + - "WorkspaceClient auto-detects credentials from environment by default" + - "Explicit token auth requires host and token parameters" + - "Azure Service Principal auth uses azure_tenant_id, azure_client_id, and azure_client_secret" + - "Named profiles are loaded from ~/.databrickscfg" + expected_patterns: + - pattern: "WorkspaceClient\\(\\)" + min_count: 1 + description: "Shows auto-detect authentication" + - pattern: "token=" + min_count: 1 + description: "Shows explicit token authentication" + - pattern: "azure_client_id" + min_count: 1 + description: "Shows Azure Service Principal authentication" + - pattern: "profile=" + min_count: 1 + description: "Shows named profile authentication" + guidelines: + - "Must show at least three different authentication methods" + - "Must include auto-detect, explicit token, and profile-based auth" + metadata: + category: happy_path + difficulty: easy + source: auto_generated + +- id: pysdk_clusters_api_002 + inputs: + prompt: "How do I list, create, and manage clusters using the Databricks Python SDK?" 
+ outputs: + response: | + ```python + from databricks.sdk import WorkspaceClient + from datetime import timedelta + + w = WorkspaceClient() + + # List all clusters + for cluster in w.clusters.list(): + print(f"{cluster.cluster_name}: {cluster.state}") + + # Get cluster details + cluster = w.clusters.get(cluster_id="0123-456789-abcdef") + + # Create a cluster (returns Wait object) + wait = w.clusters.create( + cluster_name="my-cluster", + spark_version=w.clusters.select_spark_version(latest=True), + node_type_id=w.clusters.select_node_type(local_disk=True), + num_workers=2 + ) + cluster = wait.result() # Wait for cluster to be running + + # Start/stop/delete + w.clusters.start(cluster_id="...").result() + w.clusters.stop(cluster_id="...") + w.clusters.delete(cluster_id="...") + ``` + execution_success: true + expectations: + expected_facts: + - "w.clusters.list() returns an iterator of cluster objects" + - "w.clusters.create() returns a Wait object for async cluster creation" + - "select_spark_version() and select_node_type() are helper methods for discovery" + - "start() returns a Wait object while stop() and delete() do not" + expected_patterns: + - pattern: "w\\.clusters\\.list" + min_count: 1 + description: "Lists clusters" + - pattern: "w\\.clusters\\.create" + min_count: 1 + description: "Creates a cluster" + - pattern: "select_spark_version" + min_count: 1 + description: "Uses spark version selector" + - pattern: "select_node_type" + min_count: 1 + description: "Uses node type selector" + guidelines: + - "Must show list, create, start, stop, and delete operations" + - "Must use select_spark_version and select_node_type helpers" + - "Must demonstrate the Wait object pattern for async operations" + metadata: + category: happy_path + difficulty: medium + source: auto_generated + +- id: pysdk_jobs_api_003 + inputs: + prompt: "Show me how to create and run a Databricks job using the Python SDK." 
+ outputs: + response: | + ```python + from databricks.sdk import WorkspaceClient + from databricks.sdk.service.jobs import Task, NotebookTask + + w = WorkspaceClient() + + # List jobs + for job in w.jobs.list(): + print(f"{job.job_id}: {job.settings.name}") + + # Create a job + created = w.jobs.create( + name="my-job", + tasks=[ + Task( + task_key="main", + notebook_task=NotebookTask(notebook_path="/Users/me/notebook"), + existing_cluster_id="0123-456789-abcdef" + ) + ] + ) + + # Run a job and wait for completion + run = w.jobs.run_now_and_wait(job_id=created.job_id) + print(f"Run completed: {run.state.result_state}") + + # Get run output + output = w.jobs.get_run_output(run_id=run.run_id) + ``` + execution_success: true + expectations: + expected_facts: + - "Jobs are created with w.jobs.create() returning a job with job_id" + - "run_now_and_wait() triggers a job and blocks until completion" + - "get_run_output() retrieves the output of a completed run" + - "Task requires task_key and a task type like NotebookTask" + expected_patterns: + - pattern: "w\\.jobs\\.create" + min_count: 1 + description: "Creates a job" + - pattern: "run_now_and_wait" + min_count: 1 + description: "Runs job and waits for completion" + - pattern: "get_run_output" + min_count: 1 + description: "Gets run output" + - pattern: "NotebookTask" + min_count: 1 + description: "Uses NotebookTask for task configuration" + guidelines: + - "Must show the full job lifecycle: create, run, get output" + - "Must use run_now_and_wait for blocking execution" + metadata: + category: happy_path + difficulty: medium + source: auto_generated + +- id: pysdk_sql_execution_004 + inputs: + prompt: "How do I execute SQL queries against a Databricks SQL warehouse using the Python SDK?" 
+ outputs: + response: | + ```python + from databricks.sdk import WorkspaceClient + from databricks.sdk.service.sql import StatementState + + w = WorkspaceClient() + + # Execute SQL query + response = w.statement_execution.execute_statement( + warehouse_id="abc123", + statement="SELECT * FROM catalog.schema.table LIMIT 10", + wait_timeout="30s" + ) + + # Check status and get results + if response.status.state == StatementState.SUCCEEDED: + for row in response.result.data_array: + print(row) + + # For large results, fetch chunks + chunk = w.statement_execution.get_statement_result_chunk_n( + statement_id=response.statement_id, + chunk_index=0 + ) + ``` + execution_success: true + expectations: + expected_facts: + - "SQL is executed with w.statement_execution.execute_statement()" + - "warehouse_id specifies which SQL warehouse to use" + - "StatementState.SUCCEEDED indicates a successful query" + - "Large results are fetched in chunks with get_statement_result_chunk_n()" + expected_patterns: + - pattern: "execute_statement" + min_count: 1 + description: "Executes SQL statement" + - pattern: "warehouse_id" + min_count: 1 + description: "Specifies SQL warehouse" + - pattern: "StatementState" + min_count: 1 + description: "Checks statement execution status" + - pattern: "get_statement_result_chunk_n" + min_count: 1 + description: "Fetches result chunks for large queries" + guidelines: + - "Must show execute, status check, and result retrieval" + - "Must demonstrate chunked result fetching for large datasets" + metadata: + category: happy_path + difficulty: medium + source: auto_generated + +- id: pysdk_databricks_connect_005 + inputs: + prompt: "How do I use Databricks Connect to run Spark code locally?" 
+ outputs: + response: | + ```python + from databricks.connect import DatabricksSession + + # Auto-detects 'DEFAULT' profile from ~/.databrickscfg + spark = DatabricksSession.builder.getOrCreate() + + # With explicit profile + spark = DatabricksSession.builder.profile("MY_PROFILE").getOrCreate() + + # Use spark as normal + df = spark.sql("SELECT * FROM catalog.schema.table") + df.show() + ``` + + Databricks Connect requires `databricks-connect` package and a running cluster to execute against. + execution_success: true + expectations: + expected_facts: + - "DatabricksSession replaces SparkSession for remote Spark execution" + - "Databricks Connect auto-detects credentials from ~/.databrickscfg" + - "The databricks-connect package must be installed separately" + expected_patterns: + - pattern: "DatabricksSession" + min_count: 1 + description: "Uses DatabricksSession for Spark" + - pattern: "getOrCreate" + min_count: 1 + description: "Builds session with getOrCreate()" + - pattern: "profile" + min_count: 1 + description: "Shows profile-based configuration" + guidelines: + - "Must show DatabricksSession, not SparkSession" + - "Must demonstrate both auto-detect and explicit profile usage" + metadata: + category: happy_path + difficulty: easy + source: auto_generated + +- id: pysdk_rest_api_006 + inputs: + prompt: "How can I make direct REST API calls using the Databricks Python SDK?" + outputs: + response: | + ```python + from databricks.sdk import WorkspaceClient + + w = WorkspaceClient() + + # Direct API call using authenticated client + response = w.api_client.do( + method="GET", + path="/api/2.0/clusters/list" + ) + + # POST with body + response = w.api_client.do( + method="POST", + path="/api/2.0/jobs/run-now", + body={"job_id": 123} + ) + ``` + + Use direct REST API calls for operations not yet available in the SDK or for advanced use cases. 
+ execution_success: true + expectations: + expected_facts: + - "w.api_client.do() makes authenticated REST API calls" + - "The method parameter accepts GET, POST, PUT, DELETE" + - "The body parameter accepts a dictionary for POST/PUT requests" + expected_patterns: + - pattern: "api_client\\.do" + min_count: 2 + description: "Uses api_client.do() for direct REST calls" + - pattern: "method=" + min_count: 2 + description: "Specifies HTTP method" + - pattern: "path=" + min_count: 2 + description: "Specifies API endpoint path" + guidelines: + - "Must show both GET and POST examples" + - "Must explain when to use direct REST over SDK methods" + metadata: + category: happy_path + difficulty: easy + source: auto_generated + +- id: pysdk_cli_commands_007 + inputs: + prompt: "What are the most common Databricks CLI commands?" + outputs: + response: | + ```bash + # Check version (should be >= 0.278.0) + databricks --version + + # Use specific profile + databricks --profile MY_PROFILE clusters list + + # Common commands + databricks clusters list + databricks jobs list + databricks workspace ls /Users/me + ``` + + The CLI uses the same authentication as the Python SDK (`~/.databrickscfg` profiles, environment variables, or explicit flags). 
+ execution_success: true + expectations: + expected_facts: + - "The Databricks CLI shares authentication with the Python SDK" + - "Profiles are specified with the --profile flag" + - "Common subcommands include clusters, jobs, and workspace" + expected_patterns: + - pattern: "databricks --version" + min_count: 1 + description: "Shows version check command" + - pattern: "databricks clusters list" + min_count: 1 + description: "Shows cluster listing command" + - pattern: "databricks jobs list" + min_count: 1 + description: "Shows job listing command" + guidelines: + - "Must show at least three common CLI commands" + - "Must mention the --profile flag for profile selection" + metadata: + category: happy_path + difficulty: easy + source: auto_generated diff --git a/.test/skills/databricks-python-sdk/manifest.yaml b/.test/skills/databricks-python-sdk/manifest.yaml new file mode 100644 index 00000000..62377187 --- /dev/null +++ b/.test/skills/databricks-python-sdk/manifest.yaml @@ -0,0 +1,30 @@ +skill_name: databricks-python-sdk +description: Databricks development guidance including Python SDK, Databricks Connect, CLI, and REST API. Use when working + with databricks-sdk, databricks-connect, or Databricks APIs. 
+scorers: + enabled: + - python_syntax + - pattern_adherence + - no_hallucinated_apis + - expected_facts_present + llm_scorers: + - Safety + - guidelines_from_expectations + default_guidelines: + - Response must address the user's request completely + - Code examples must follow documented best practices + - Response must use modern APIs (not deprecated ones) + trace_expectations: + tool_limits: + Bash: 10 + Read: 20 + token_budget: + max_total: 100000 + required_tools: + - Read + banned_tools: [] + expected_files: [] +quality_gates: + syntax_valid: 1.0 + pattern_adherence: 0.9 + execution_success: 0.8 diff --git a/.test/skills/databricks-spark-structured-streaming/ground_truth.yaml b/.test/skills/databricks-spark-structured-streaming/ground_truth.yaml new file mode 100644 index 00000000..6f642133 --- /dev/null +++ b/.test/skills/databricks-spark-structured-streaming/ground_truth.yaml @@ -0,0 +1,227 @@ +metadata: + skill_name: databricks-spark-structured-streaming + version: 0.1.0 + created_at: '2026-02-23T08:00:00.000000' + +test_cases: +- id: spark-streaming_kafka_to_delta_001 + inputs: + prompt: Create a streaming pipeline that reads from Kafka and writes to a Delta table + outputs: + response: | + Here's a basic Kafka to Delta streaming pipeline: + + ```python + from pyspark.sql.functions import col, from_json + + # Read from Kafka + df = (spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", "broker:9092") + .option("subscribe", "topic") + .load() + .select(from_json(col("value").cast("string"), schema).alias("data")) + .select("data.*") + ) + + # Write to Delta + df.writeStream \ + .format("delta") \ + .outputMode("append") \ + .option("checkpointLocation", "/Volumes/catalog/checkpoints/stream") \ + .trigger(processingTime="30 seconds") \ + .start("/delta/target_table") + ``` + + Key points: + - Use a persistent checkpoint location (UC volumes, not DBFS) + - Each stream needs a unique checkpoint path + execution_success: true + 
expectations: + expected_facts: + - Reads from Kafka using readStream + - Writes to Delta table format + - Includes checkpoint location + expected_patterns: + - pattern: readStream + min_count: 1 + description: Must use readStream for streaming source + - pattern: writeStream + min_count: 1 + description: Must use writeStream for streaming sink + - pattern: checkpointLocation + min_count: 1 + description: Must specify checkpoint location + - pattern: format.*kafka + min_count: 1 + description: Must specify Kafka format + guidelines: + - Code must be valid Python syntax + - Must include checkpoint location for fault tolerance + - Checkpoint should use UC volumes, not DBFS + metadata: + category: happy_path + difficulty: easy + source: auto_generated + +- id: spark-streaming_trigger_modes_002 + inputs: + prompt: What are the different trigger modes for Spark Structured Streaming? + outputs: + response: | + Spark Structured Streaming supports several trigger modes: + + ```python + # Fixed interval processing + .trigger(processingTime="30 seconds") + + # Process all available data once + .trigger(availableNow=True) + + # Continuous processing (low latency) + .trigger(continuous="1 second") + ``` + + - **processingTime**: Fixed interval micro-batches. Best for most workloads. + - **availableNow**: Processes all available data then stops. Good for batch-like streaming. + - **continuous**: Low-latency mode (~1ms). Limited operator support. 
+ execution_success: true + expectations: + expected_facts: + - processingTime trigger for fixed interval + - availableNow trigger processes all data then stops + - continuous trigger for low latency + expected_patterns: + - pattern: processingTime + min_count: 1 + description: Must mention processingTime trigger + - pattern: trigger + min_count: 1 + description: Must use trigger configuration + guidelines: + - Must explain multiple trigger modes + - Code must be valid Python syntax + metadata: + category: happy_path + difficulty: easy + source: auto_generated + +- id: spark-streaming_checkpoint_practices_003 + inputs: + prompt: What are best practices for Spark Structured Streaming checkpoints? + outputs: + response: | + Checkpoint best practices: + + 1. **Use persistent storage**: UC volumes, not DBFS + 2. **Unique per stream**: Each stream needs its own checkpoint path + 3. **Never delete**: Checkpoints track stream state and offsets + 4. **Use volumes**: `/Volumes/catalog/schema/checkpoints/stream_name` + + ```python + df.writeStream \ + .format("delta") \ + .outputMode("append") \ + .option("checkpointLocation", "/Volumes/catalog/schema/checkpoints/my_stream") \ + .start("catalog.schema.target_table") + ``` + execution_success: true + expectations: + expected_facts: + - Checkpoint location should be persistent (UC volumes) + - Each stream needs a unique checkpoint + expected_patterns: + - pattern: checkpointLocation + min_count: 1 + description: Must reference checkpointLocation + - pattern: /Volumes/ + min_count: 1 + description: Should use UC volumes path + guidelines: + - Must recommend persistent storage for checkpoints + - Must warn about unique checkpoints per stream + metadata: + category: happy_path + difficulty: easy + source: auto_generated + +- id: spark-streaming_production_checklist_004 + inputs: + prompt: What is the production checklist for Spark Structured Streaming? 
+ outputs: + response: | + Production checklist for Spark Structured Streaming: + + - Checkpoint location is persistent (UC volumes, not DBFS) + - Unique checkpoint per stream + - Fixed-size cluster (no autoscaling for streaming) + - Monitoring configured (input rate, lag, batch duration) + - Exactly-once verified (txnVersion/txnAppId) + - Watermark configured for stateful operations + - Left joins for stream-static (not inner) + execution_success: true + expectations: + expected_facts: + - Use persistent checkpoint location + - Fixed-size cluster, no autoscaling + - Configure monitoring for streaming + expected_patterns: + - pattern: checkpoint + min_count: 1 + description: Must mention checkpoint requirements + - pattern: autoscal + min_count: 1 + description: Must mention autoscaling restriction + guidelines: + - Must cover checkpoint, cluster, and monitoring requirements + metadata: + category: happy_path + difficulty: medium + source: auto_generated + +- id: spark-streaming_stream_joins_005 + inputs: + prompt: How do I join a streaming DataFrame with a static table? 
+ outputs: + response: | + For stream-static joins, use a left join (not inner) to avoid dropping streaming rows when the static side has no match: + + ```python + # Read streaming source + stream_df = spark.readStream.format("delta").table("catalog.schema.events") + + # Read static dimension table + dim_df = spark.read.table("catalog.schema.dimensions") + + # Left join: stream on left, static on right + joined = stream_df.join(dim_df, "join_key", "left") + + joined.writeStream \ + .format("delta") \ + .outputMode("append") \ + .option("checkpointLocation", "/Volumes/catalog/schema/checkpoints/joined_stream") \ + .start("catalog.schema.enriched_events") + ``` + execution_success: true + expectations: + expected_facts: + - Use left join for stream-static joins + - Static table is on the right side + expected_patterns: + - pattern: readStream + min_count: 1 + description: Must use streaming read + - pattern: join + min_count: 1 + description: Must demonstrate join operation + - pattern: left + min_count: 1 + description: Must use left join + guidelines: + - Must use left join for stream-static joins, not inner + - Code must be valid Python syntax + metadata: + category: happy_path + difficulty: medium + source: auto_generated diff --git a/.test/skills/databricks-spark-structured-streaming/manifest.yaml b/.test/skills/databricks-spark-structured-streaming/manifest.yaml new file mode 100644 index 00000000..7e6e9521 --- /dev/null +++ b/.test/skills/databricks-spark-structured-streaming/manifest.yaml @@ -0,0 +1,30 @@ +skill_name: databricks-spark-structured-streaming +description: Comprehensive guide to Spark Structured Streaming for production workloads. Use when building streaming pipelines, + implementing real-time data processing, handling stateful operations, or optimizing streaming performance. 
+scorers: + enabled: + - python_syntax + - pattern_adherence + - no_hallucinated_apis + - expected_facts_present + llm_scorers: + - Safety + - guidelines_from_expectations + default_guidelines: + - Response must address the user's request completely + - Code examples must follow documented best practices + - Response must use modern APIs (not deprecated ones) + trace_expectations: + tool_limits: + Bash: 10 + Read: 20 + token_budget: + max_total: 100000 + required_tools: + - Read + banned_tools: [] + expected_files: [] +quality_gates: + syntax_valid: 1.0 + pattern_adherence: 0.9 + execution_success: 0.8 diff --git a/.test/skills/databricks-synthetic-data-generation/ground_truth.yaml b/.test/skills/databricks-synthetic-data-generation/ground_truth.yaml new file mode 100644 index 00000000..b0e1b0f6 --- /dev/null +++ b/.test/skills/databricks-synthetic-data-generation/ground_truth.yaml @@ -0,0 +1,392 @@ +metadata: + skill_name: databricks-synthetic-data-generation + version: 0.1.0 + created_at: '2026-02-23T08:00:00.000000' + +test_cases: +- id: synth_data_script_structure_001 + inputs: + prompt: "What is the recommended structure for a synthetic data generation script on Databricks?" 
+ outputs: + response: | + ```python + """Generate synthetic data for [use case].""" + import numpy as np + import pandas as pd + from datetime import datetime, timedelta + from faker import Faker + import holidays + from pyspark.sql import SparkSession + + # ============================================================================= + # CONFIGURATION - Edit these values + # ============================================================================= + CATALOG = "my_catalog" + SCHEMA = "my_schema" + VOLUME_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/raw_data" + + N_CUSTOMERS = 2500 + N_ORDERS = 25000 + N_TICKETS = 8000 + + END_DATE = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0) + START_DATE = END_DATE - timedelta(days=180) + + SEED = 42 + + # ============================================================================= + # SETUP + # ============================================================================= + np.random.seed(SEED) + Faker.seed(SEED) + fake = Faker() + spark = SparkSession.builder.getOrCreate() + ``` + execution_success: true + expectations: + expected_facts: + - "Configuration variables go at the top of the script" + - "Seeds must be set for both numpy and Faker for reproducibility" + - "Volume path follows /Volumes/{catalog}/{schema}/{volume} pattern" + - "SparkSession is used for saving data to Delta/Parquet" + expected_patterns: + - pattern: "CATALOG\\s*=" + min_count: 1 + description: "Defines catalog configuration variable" + - pattern: "np\\.random\\.seed" + min_count: 1 + description: "Sets numpy random seed" + - pattern: "Faker\\.seed" + min_count: 1 + description: "Sets Faker random seed" + - pattern: "SparkSession" + min_count: 1 + description: "Uses SparkSession for data saving" + guidelines: + - "Configuration must be at the top of the script" + - "Must set seeds for both numpy and Faker for reproducibility" + - "Must use SparkSession for saving to Databricks storage" + metadata: + category: happy_path + difficulty: 
easy + source: auto_generated + +- id: synth_data_infrastructure_002 + inputs: + prompt: "Should I create the catalog, schema, and volume separately or inside the generation script?" + outputs: + response: | + Always create infrastructure inside the Python script using `spark.sql()`: + + ```python + # ============================================================================= + # CREATE INFRASTRUCTURE (inside the Python script) + # ============================================================================= + spark.sql(f"CREATE CATALOG IF NOT EXISTS {CATALOG}") + spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{SCHEMA}") + spark.sql(f"CREATE VOLUME IF NOT EXISTS {CATALOG}.{SCHEMA}.raw_data") + ``` + + Do NOT make separate MCP SQL calls - it's much slower. The `spark` variable is available by default on Databricks clusters. + execution_success: true + expectations: + expected_facts: + - "Infrastructure creation belongs inside the Python script, not as separate SQL calls" + - "spark.sql() is used for CREATE CATALOG, CREATE SCHEMA, and CREATE VOLUME" + - "IF NOT EXISTS makes the script idempotent" + expected_patterns: + - pattern: "CREATE CATALOG IF NOT EXISTS" + min_count: 1 + description: "Creates catalog idempotently" + - pattern: "CREATE SCHEMA IF NOT EXISTS" + min_count: 1 + description: "Creates schema idempotently" + - pattern: "CREATE VOLUME IF NOT EXISTS" + min_count: 1 + description: "Creates volume idempotently" + guidelines: + - "Must use spark.sql() inside the script, not separate SQL calls" + - "All CREATE statements must use IF NOT EXISTS" + metadata: + category: happy_path + difficulty: easy + source: auto_generated + +- id: synth_data_distributions_003 + inputs: + prompt: "How do I generate synthetic data with realistic statistical distributions?" 
+ outputs: + response: | + Use numpy distributions with weighted probabilities: + + ```python + import numpy as np + from faker import Faker + + fake = Faker() + + # Categorical with weighted distribution + customers_pdf["tier"] = np.random.choice( + ['Free', 'Pro', 'Enterprise'], N_CUSTOMERS, p=[0.6, 0.3, 0.1] + ) + + # Log-normal for financial data (right-skewed, no negatives) + customers_pdf["arr"] = customers_pdf["tier"].apply( + lambda t: round(np.random.lognormal(11, 0.5), 2) if t == 'Enterprise' + else round(np.random.lognormal(8, 0.6), 2) if t == 'Pro' else 0 + ) + + # Correlated fields: order amount varies by tier + amount = np.random.lognormal( + 7 if tier == 'Enterprise' else 5 if tier == 'Pro' else 3.5, + 0.7 + ) + ``` + + Key distributions: use log-normal for financial amounts, weighted choice for categories, and exponential for time durations. + execution_success: true + expectations: + expected_facts: + - "Log-normal distribution is ideal for financial data (right-skewed, non-negative)" + - "np.random.choice with p parameter creates weighted categorical distributions" + - "Field values should correlate with related fields (e.g., order amount varies by tier)" + expected_patterns: + - pattern: "np\\.random\\.choice" + min_count: 1 + description: "Uses weighted random choice for categories" + - pattern: "np\\.random\\.lognormal" + min_count: 1 + description: "Uses log-normal for financial data" + - pattern: "p=\\[" + min_count: 1 + description: "Specifies probability weights" + guidelines: + - "Must show weighted distributions, not uniform random" + - "Financial values must use log-normal, not normal distribution" + - "Must demonstrate correlation between related fields" + metadata: + category: happy_path + difficulty: medium + source: auto_generated + +- id: synth_data_time_patterns_004 + inputs: + prompt: "How do I add realistic time-based patterns like weekday/weekend effects and event spikes?" 
+ outputs: + response: | + ```python + import holidays + + US_HOLIDAYS = holidays.US(years=[START_DATE.year, END_DATE.year]) + + def get_daily_multiplier(date): + """Calculate volume multiplier for a given date.""" + multiplier = 1.0 + + # Weekend drop + if date.weekday() >= 5: + multiplier *= 0.6 + + # Holiday drop (even lower than weekends) + if date in US_HOLIDAYS: + multiplier *= 0.3 + + # Q4 seasonality (higher in Oct-Dec) + multiplier *= 1 + 0.15 * (date.month - 6) / 6 + + # Incident spike + if INCIDENT_START <= date <= INCIDENT_END: + multiplier *= 3.0 + + # Random noise + multiplier *= np.random.normal(1, 0.1) + + return max(0.1, multiplier) + + # Apply to generate daily volumes + date_range = pd.date_range(START_DATE, END_DATE, freq='D') + daily_volumes = [int(BASE_DAILY * get_daily_multiplier(d)) for d in date_range] + ``` + execution_success: true + expectations: + expected_facts: + - "The holidays library provides country-specific holiday calendars" + - "Weekend volumes should drop to 60% of weekday volumes" + - "Holiday volumes should drop further to 30%" + - "Event spikes multiply baseline volume by 3x or more" + expected_patterns: + - pattern: "holidays\\.US" + min_count: 1 + description: "Uses US holiday calendar" + - pattern: "weekday\\(\\)" + min_count: 1 + description: "Checks for weekday/weekend" + - pattern: "INCIDENT_START" + min_count: 1 + description: "Implements event spike period" + - pattern: "np\\.random\\.normal" + min_count: 1 + description: "Adds random noise to volumes" + guidelines: + - "Must include weekday/weekend effects" + - "Must include holiday effects using the holidays library" + - "Must include at least one event spike period" + - "Must add random noise to avoid unrealistic regularity" + metadata: + category: happy_path + difficulty: medium + source: auto_generated + +- id: synth_data_save_volume_005 + inputs: + prompt: "How should I save generated synthetic data to Databricks?" 
+ outputs: + response: | + Always save data to a Unity Catalog Volume as parquet files: + + ```python + VOLUME_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/raw_data" + + # Save as parquet files (raw data for downstream pipelines) + spark.createDataFrame(customers_pdf).write.mode("overwrite").parquet(f"{VOLUME_PATH}/customers") + spark.createDataFrame(orders_pdf).write.mode("overwrite").parquet(f"{VOLUME_PATH}/orders") + spark.createDataFrame(tickets_pdf).write.mode("overwrite").parquet(f"{VOLUME_PATH}/tickets") + ``` + + Save to Volumes as parquet, not directly to tables. This serves as raw input for a downstream Spark Declarative Pipeline that handles bronze/silver/gold layers. + execution_success: true + expectations: + expected_facts: + - "Data should be saved to Volumes as parquet, not directly to tables" + - "Volume path pattern is /Volumes/{catalog}/{schema}/{volume}" + - "spark.createDataFrame() converts pandas DataFrames to Spark for saving" + - "Parquet files serve as input for downstream pipeline processing" + expected_patterns: + - pattern: "createDataFrame" + min_count: 1 + description: "Converts pandas to Spark DataFrame" + - pattern: "\\.parquet\\(" + min_count: 1 + description: "Saves as parquet format" + - pattern: "mode\\(\"overwrite\"\\)" + min_count: 1 + description: "Uses overwrite mode" + - pattern: "/Volumes/" + min_count: 1 + description: "Saves to Unity Catalog Volume path" + guidelines: + - "Must save to Volumes, not directly to tables" + - "Must use parquet format" + - "Must explain that this is raw data for downstream pipeline" + metadata: + category: happy_path + difficulty: easy + source: auto_generated + +- id: synth_data_raw_only_006 + inputs: + prompt: "Should I include pre-aggregated fields like total_orders or avg_revenue in my synthetic data?" + outputs: + response: | + No. By default, generate only raw transactional data. One row equals one event. 
+ + ```python + # CORRECT - Raw transactional data + customers_data.append({ + "customer_id": cid, + "name": fake.company(), + "tier": "Enterprise", + "region": "North", + }) + + orders_data.append({ + "order_id": f"ORD-{i:06d}", + "customer_id": cid, + "amount": 150.00, + "order_date": "2024-10-15", + }) + + # WRONG - Do not add pre-aggregated fields + # customers_data.append({ + # "total_orders": 47, # aggregation + # "total_revenue": 12500.00, # sum + # "avg_order_value": 265.95, # average + # }) + ``` + + Aggregations belong in downstream pipeline transformations, not in raw data. + execution_success: true + expectations: + expected_facts: + - "Raw data should have one row per event/transaction/record" + - "Pre-aggregated fields like total_orders, sum_revenue, avg_csat should not be included" + - "Aggregations belong in downstream pipeline transformations" + expected_patterns: + - pattern: "customer_id" + min_count: 1 + description: "Shows raw transactional customer data" + - pattern: "order_id" + min_count: 1 + description: "Shows raw transactional order data" + - pattern: "WRONG|BAD|Do not" + min_count: 1 + description: "Shows anti-pattern example" + guidelines: + - "Must clearly show the correct raw data pattern" + - "Must show the anti-pattern with pre-aggregated fields" + - "Must explain that aggregations belong downstream" + metadata: + category: happy_path + difficulty: easy + source: auto_generated + +- id: synth_data_common_libraries_007 + inputs: + prompt: "What Python libraries should I use for generating synthetic data?" 
+ outputs: + response: | + Key libraries for synthetic data generation: + + - **faker**: Generates realistic names, addresses, emails, companies, and dates + - **numpy**: Statistical distributions (log-normal, exponential, weighted choice) + - **pandas**: DataFrame construction and manipulation + - **holidays**: Country-specific holiday calendars for realistic date patterns + - **pyspark**: SparkSession for saving data to Volumes as parquet + + ```python + import numpy as np + import pandas as pd + from datetime import datetime, timedelta + from faker import Faker + import holidays + from pyspark.sql import SparkSession + ``` + + All of these are pre-installed on Databricks clusters. + execution_success: true + expectations: + expected_facts: + - "Faker generates realistic personal and business data" + - "Numpy provides statistical distributions for realistic value generation" + - "The holidays library provides holiday calendars for date patterns" + - "All listed libraries are pre-installed on Databricks clusters" + expected_patterns: + - pattern: "faker" + min_count: 1 + description: "Mentions Faker library" + - pattern: "numpy" + min_count: 1 + description: "Mentions numpy library" + - pattern: "holidays" + min_count: 1 + description: "Mentions holidays library" + - pattern: "pyspark" + min_count: 1 + description: "Mentions PySpark library" + guidelines: + - "Must list at least four key libraries with their purposes" + - "Must mention that libraries are pre-installed on Databricks" + metadata: + category: happy_path + difficulty: easy + source: auto_generated diff --git a/.test/skills/databricks-synthetic-data-generation/manifest.yaml b/.test/skills/databricks-synthetic-data-generation/manifest.yaml new file mode 100644 index 00000000..9216d106 --- /dev/null +++ b/.test/skills/databricks-synthetic-data-generation/manifest.yaml @@ -0,0 +1,30 @@ +skill_name: databricks-synthetic-data-generation +description: Generate realistic synthetic data using Faker and Spark, 
+ with non-linear distributions and integrity constraints, + then save to Databricks. Use when creating test data, demo datasets, or synthetic tables. +scorers: + enabled: + - python_syntax + - pattern_adherence + - no_hallucinated_apis + - expected_facts_present + llm_scorers: + - Safety + - guidelines_from_expectations + default_guidelines: + - Response must address the user's request completely + - Code examples must follow documented best practices + - Response must use modern APIs (not deprecated ones) + trace_expectations: + tool_limits: + Bash: 10 + Read: 20 + token_budget: + max_total: 100000 + required_tools: + - Read + banned_tools: [] + expected_files: [] +quality_gates: + syntax_valid: 1.0 + pattern_adherence: 0.9 + execution_success: 0.8 diff --git a/.test/skills/databricks-unity-catalog/ground_truth.yaml b/.test/skills/databricks-unity-catalog/ground_truth.yaml new file mode 100644 index 00000000..487ae52e --- /dev/null +++ b/.test/skills/databricks-unity-catalog/ground_truth.yaml @@ -0,0 +1,255 @@ +metadata: + skill_name: databricks-unity-catalog + version: 0.1.0 + created_at: '2026-02-23T08:00:00.000000' + +test_cases: +- id: uc_volume_operations_001 + inputs: + prompt: "How do I upload, download, and list files in a Unity Catalog Volume?"
+ outputs: + response: | + Use MCP tools for Volume file operations: + + ```python + # List files in a volume + list_volume_files(volume_path="/Volumes/catalog/schema/volume/folder/") + + # Upload file to volume + upload_to_volume( + local_path="/tmp/data.csv", + volume_path="/Volumes/catalog/schema/volume/data.csv" + ) + + # Download file from volume + download_from_volume( + volume_path="/Volumes/catalog/schema/volume/data.csv", + local_path="/tmp/downloaded.csv" + ) + + # Create directory + create_volume_directory(volume_path="/Volumes/catalog/schema/volume/new_folder") + ``` + execution_success: true + expectations: + expected_facts: + - "Volume paths follow the pattern /Volumes/{catalog}/{schema}/{volume}" + - "list_volume_files, upload_to_volume, download_from_volume, and create_volume_directory are available MCP tools" + expected_patterns: + - pattern: "list_volume_files" + min_count: 1 + description: "Uses list_volume_files MCP tool" + - pattern: "upload_to_volume" + min_count: 1 + description: "Uses upload_to_volume MCP tool" + - pattern: "download_from_volume" + min_count: 1 + description: "Uses download_from_volume MCP tool" + - pattern: "/Volumes/" + min_count: 3 + description: "Uses Unity Catalog Volume paths" + guidelines: + - "Must show at least list, upload, and download operations" + - "Volume paths must follow the /Volumes/{catalog}/{schema}/{volume} pattern" + metadata: + category: happy_path + difficulty: easy + source: auto_generated + +- id: uc_table_lineage_002 + inputs: + prompt: "How do I query table lineage to find which tables feed into my target table?" + outputs: + response: | + ```sql + -- Table lineage: What tables feed into this table? + SELECT source_table_full_name, source_column_name + FROM system.access.table_lineage + WHERE target_table_full_name = 'catalog.schema.table' + AND event_date >= current_date() - 7; + ``` + + System table lineage is captured automatically by Unity Catalog. 
Query `system.access.table_lineage` for table-level dependencies and `system.access.column_lineage` for column-level lineage. + execution_success: true + expectations: + expected_facts: + - "Table lineage is stored in system.access.table_lineage" + - "Column lineage is available in system.access.column_lineage" + - "Lineage is captured automatically by Unity Catalog" + expected_patterns: + - pattern: "system\\.access\\.table_lineage" + min_count: 1 + description: "Queries the table lineage system table" + - pattern: "target_table_full_name" + min_count: 1 + description: "Filters by target table name" + - pattern: "source_table_full_name" + min_count: 1 + description: "Returns source table names" + guidelines: + - "Must query system.access.table_lineage" + - "Must show filtering by target table name" + - "Should mention that lineage is captured automatically" + metadata: + category: happy_path + difficulty: easy + source: auto_generated + +- id: uc_audit_logs_003 + inputs: + prompt: "How do I check recent permission changes using Unity Catalog audit logs?" + outputs: + response: | + ```sql + -- Audit: Recent permission changes + SELECT event_time, user_identity.email, action_name, request_params + FROM system.access.audit + WHERE action_name LIKE '%GRANT%' OR action_name LIKE '%REVOKE%' + ORDER BY event_time DESC + LIMIT 100; + ``` + + The `system.access.audit` table records all access control changes. Filter by action_name to find GRANT, REVOKE, or other permission-related events. 
+ execution_success: true + expectations: + expected_facts: + - "Audit logs are stored in system.access.audit" + - "Permission changes include GRANT and REVOKE actions" + - "user_identity.email identifies who made the change" + expected_patterns: + - pattern: "system\\.access\\.audit" + min_count: 1 + description: "Queries the audit log system table" + - pattern: "action_name" + min_count: 1 + description: "Filters by action type" + - pattern: "GRANT" + min_count: 1 + description: "Checks for GRANT operations" + - pattern: "REVOKE" + min_count: 1 + description: "Checks for REVOKE operations" + guidelines: + - "Must query system.access.audit" + - "Must filter for permission-related actions" + - "Must include user identity in the output" + metadata: + category: happy_path + difficulty: medium + source: auto_generated + +- id: uc_billing_usage_004 + inputs: + prompt: "How do I query DBU usage and billing data across workspaces?" + outputs: + response: | + ```sql + -- Billing: DBU usage by workspace + SELECT workspace_id, sku_name, SUM(usage_quantity) AS total_dbus + FROM system.billing.usage + WHERE usage_date >= current_date() - 30 + GROUP BY workspace_id, sku_name; + ``` + + The `system.billing.usage` table provides DBU consumption data. Use it to track costs by workspace, SKU type, or time period. 
+ execution_success: true + expectations: + expected_facts: + - "Billing data is stored in system.billing.usage" + - "usage_quantity contains the DBU consumption" + - "Data can be filtered by usage_date for time-based analysis" + expected_patterns: + - pattern: "system\\.billing\\.usage" + min_count: 1 + description: "Queries the billing system table" + - pattern: "SUM\\(usage_quantity\\)" + min_count: 1 + description: "Aggregates DBU usage" + - pattern: "sku_name" + min_count: 1 + description: "Groups by SKU type" + guidelines: + - "Must query system.billing.usage" + - "Must aggregate by meaningful dimensions like workspace or SKU" + metadata: + category: happy_path + difficulty: medium + source: auto_generated + +- id: uc_grant_system_access_005 + inputs: + prompt: "How do I grant a group access to Unity Catalog system tables?" + outputs: + response: | + ```sql + -- Grant access to system tables + GRANT USE CATALOG ON CATALOG system TO `data_engineers`; + GRANT USE SCHEMA ON SCHEMA system.access TO `data_engineers`; + GRANT SELECT ON SCHEMA system.access TO `data_engineers`; + ``` + + Three grants are needed: USE CATALOG on the `system` catalog, USE SCHEMA on the specific schema (e.g., `system.access`), and SELECT for read access. 
+ execution_success: true + expectations: + expected_facts: + - "System table access requires three levels of grants: USE CATALOG, USE SCHEMA, and SELECT" + - "The system catalog contains built-in system tables" + - "system.access schema contains lineage and audit tables" + expected_patterns: + - pattern: "GRANT USE CATALOG" + min_count: 1 + description: "Grants USE CATALOG permission" + - pattern: "GRANT USE SCHEMA" + min_count: 1 + description: "Grants USE SCHEMA permission" + - pattern: "GRANT SELECT" + min_count: 1 + description: "Grants SELECT permission" + guidelines: + - "Must show all three required GRANT statements" + - "Must explain why three separate grants are needed" + metadata: + category: happy_path + difficulty: medium + source: auto_generated + +- id: uc_mcp_execute_sql_006 + inputs: + prompt: "How do I query system tables using the MCP SQL execution tool?" + outputs: + response: | + ```python + # Query lineage via MCP + mcp__databricks__execute_sql( + sql_query=""" + SELECT source_table_full_name, target_table_full_name + FROM system.access.table_lineage + WHERE event_date >= current_date() - 7 + """, + catalog="system" + ) + ``` + + Use `mcp__databricks__execute_sql` to execute system table queries. Set the `catalog` parameter to "system" for system table access. 
+ execution_success: true + expectations: + expected_facts: + - "mcp__databricks__execute_sql is the MCP tool for running SQL queries" + - "The catalog parameter should be set to 'system' for system table queries" + expected_patterns: + - pattern: "mcp__databricks__execute_sql" + min_count: 1 + description: "Uses the MCP SQL execution tool" + - pattern: "catalog=\"system\"" + min_count: 1 + description: "Sets catalog to system" + - pattern: "system\\.access" + min_count: 1 + description: "Queries system access tables" + guidelines: + - "Must show the mcp__databricks__execute_sql tool call" + - "Must set catalog parameter to 'system'" + metadata: + category: happy_path + difficulty: easy + source: auto_generated diff --git a/.test/skills/databricks-unity-catalog/manifest.yaml b/.test/skills/databricks-unity-catalog/manifest.yaml new file mode 100644 index 00000000..f679684b --- /dev/null +++ b/.test/skills/databricks-unity-catalog/manifest.yaml @@ -0,0 +1,31 @@ +skill_name: databricks-unity-catalog +description: Unity Catalog system tables and volumes. Use when querying system tables (audit, lineage, billing) or working + with volume file operations (upload, download, list files in /Volumes/). 
+scorers: + enabled: + - sql_syntax + - python_syntax + - pattern_adherence + - no_hallucinated_apis + - expected_facts_present + llm_scorers: + - Safety + - guidelines_from_expectations + default_guidelines: + - Response must address the user's request completely + - Code examples must follow documented best practices + - Response must use modern APIs (not deprecated ones) + trace_expectations: + tool_limits: + Bash: 10 + Read: 20 + token_budget: + max_total: 100000 + required_tools: + - Read + banned_tools: [] + expected_files: [] +quality_gates: + syntax_valid: 1.0 + pattern_adherence: 0.9 + execution_success: 0.8 diff --git a/.test/skills/databricks-unstructured-pdf-generation/ground_truth.yaml b/.test/skills/databricks-unstructured-pdf-generation/ground_truth.yaml new file mode 100644 index 00000000..6560e7c1 --- /dev/null +++ b/.test/skills/databricks-unstructured-pdf-generation/ground_truth.yaml @@ -0,0 +1,271 @@ +metadata: + skill_name: databricks-unstructured-pdf-generation + version: 0.1.0 + created_at: '2026-02-23T08:00:00.000000' + +test_cases: +- id: pdf_gen_quick_start_001 + inputs: + prompt: "How do I generate synthetic PDF documents for my RAG pipeline?" + outputs: + response: | + Use the `generate_pdf_documents` MCP tool: + + ``` + generate_pdf_documents( + catalog="my_catalog", + schema="my_schema", + volume="my_volume", + folder="hr_policies", + topic="HR policy documents for a mid-size technology company", + num_documents=15, + complexity="medium" + ) + ``` + + This generates PDF files and corresponding JSON metadata files with questions and evaluation guidelines for RAG testing. 
+ execution_success: true + expectations: + expected_facts: + - "generate_pdf_documents is an MCP tool for creating synthetic PDFs" + - "Documents are saved to a Unity Catalog Volume" + - "Both PDF files and JSON metadata files are generated" + expected_patterns: + - pattern: "generate_pdf_documents" + min_count: 1 + description: "Uses the generate_pdf_documents MCP tool" + - pattern: "catalog" + min_count: 1 + description: "Specifies target catalog" + - pattern: "schema" + min_count: 1 + description: "Specifies target schema" + - pattern: "num_documents" + min_count: 1 + description: "Specifies number of documents to generate" + guidelines: + - "Must show the generate_pdf_documents tool with key parameters" + - "Must mention that both PDFs and JSON metadata are generated" + metadata: + category: happy_path + difficulty: easy + source: auto_generated + +- id: pdf_gen_json_structure_002 + inputs: + prompt: "What is the structure of the JSON metadata files generated alongside the PDFs?" + outputs: + response: | + Each PDF gets a corresponding JSON file with evaluation metadata: + + ```json + { + "title": "API Authentication Guide", + "category": "Technical", + "pdf_path": "/Volumes/catalog/schema/volume/folder/doc_001.pdf", + "question": "What authentication methods are supported by the API?", + "guideline": "Answer should mention OAuth 2.0, API keys, and JWT tokens with their use cases." 
+ } + ``` + + - `title`: Document title + - `category`: Document category + - `pdf_path`: Full Volume path to the generated PDF + - `question`: A test question whose answer is in the PDF + - `guideline`: Evaluation criteria for judging RAG responses + execution_success: true + expectations: + expected_facts: + - "Each PDF has a corresponding JSON metadata file" + - "JSON includes title, category, pdf_path, question, and guideline fields" + - "The question field contains a query answerable from the PDF content" + - "The guideline field provides evaluation criteria for RAG responses" + expected_patterns: + - pattern: "\"title\"" + min_count: 1 + description: "JSON includes title field" + - pattern: "\"question\"" + min_count: 1 + description: "JSON includes question field" + - pattern: "\"guideline\"" + min_count: 1 + description: "JSON includes guideline field" + - pattern: "\"pdf_path\"" + min_count: 1 + description: "JSON includes pdf_path field" + guidelines: + - "Must show the complete JSON structure with all fields" + - "Must explain the purpose of each field" + metadata: + category: happy_path + difficulty: easy + source: auto_generated + +- id: pdf_gen_rag_integration_003 + inputs: + prompt: "How do I use the generated PDFs and JSON files to test my RAG pipeline?" + outputs: + response: | + ```python + # Load questions from JSON files + questions = load_json_files(f"/Volumes/{catalog}/{schema}/{volume}/{folder}/*.json") + + for q in questions: + # 1. Ingest the PDF into your vector database + ingest_document(q["pdf_path"]) + + # 2. Query RAG system with the test question + response = rag_system.query(q["question"]) + + # 3. Evaluate using the guideline + is_correct = evaluate_response(response, q["guideline"]) + ``` + + The workflow is: ingest PDFs as source documents, use `question` fields to query your RAG system, and use `guideline` fields to evaluate response quality. 
+ execution_success: true + expectations: + expected_facts: + - "PDFs serve as source documents for the vector database" + - "Questions from JSON test retrieval accuracy" + - "Guidelines from JSON evaluate response quality" + expected_patterns: + - pattern: "question" + min_count: 2 + description: "Uses question field for RAG queries" + - pattern: "guideline" + min_count: 2 + description: "Uses guideline field for evaluation" + - pattern: "pdf_path" + min_count: 1 + description: "References PDF path for ingestion" + guidelines: + - "Must show the three-step workflow: ingest, query, evaluate" + - "Must use both question and guideline fields from the JSON" + metadata: + category: happy_path + difficulty: medium + source: auto_generated + +- id: pdf_gen_hr_documents_004 + inputs: + prompt: "Generate HR policy documents for testing a company knowledge base RAG system." + outputs: + response: | + ``` + generate_pdf_documents( + catalog="ai_dev_kit", + schema="hr_demo", + volume="documents", + folder="hr_policies", + topic="HR policy documents for a mid-size technology company including PTO, benefits, code of conduct, and remote work policies", + num_documents=20, + complexity="medium" + ) + ``` + + This generates 20 HR policy PDFs covering common topics like PTO, benefits enrollment, code of conduct, and remote work guidelines, along with JSON evaluation files. 
+ execution_success: true + expectations: + expected_facts: + - "The topic parameter guides the content theme of generated documents" + - "num_documents controls how many PDFs are generated" + - "complexity affects the depth and length of generated content" + expected_patterns: + - pattern: "generate_pdf_documents" + min_count: 1 + description: "Uses the generate_pdf_documents MCP tool" + - pattern: "topic=" + min_count: 1 + description: "Specifies document topic" + - pattern: "num_documents=" + min_count: 1 + description: "Specifies document count" + guidelines: + - "Must specify a descriptive topic parameter" + - "Must use realistic catalog/schema/volume names" + metadata: + category: happy_path + difficulty: easy + source: auto_generated + +- id: pdf_gen_tech_docs_005 + inputs: + prompt: "Generate technical documentation PDFs for testing a developer documentation search system." + outputs: + response: | + ``` + generate_pdf_documents( + catalog="ai_dev_kit", + schema="tech_docs", + volume="documents", + folder="api_docs", + topic="Technical API documentation including REST endpoints, authentication, SDK usage, error handling, and rate limiting", + num_documents=25, + complexity="high" + ) + ``` + + Set complexity to "high" for technical documentation to generate detailed content with code examples, tables, and structured sections. 
+ execution_success: true + expectations: + expected_facts: + - "complexity='high' generates more detailed content with code examples" + - "The topic parameter should describe the specific technical domain" + expected_patterns: + - pattern: "generate_pdf_documents" + min_count: 1 + description: "Uses the generate_pdf_documents MCP tool" + - pattern: "complexity=\"high\"" + min_count: 1 + description: "Uses high complexity for technical docs" + - pattern: "topic=" + min_count: 1 + description: "Specifies technical topic" + guidelines: + - "Must use high complexity for technical documentation" + - "Topic must describe specific technical domains" + metadata: + category: happy_path + difficulty: medium + source: auto_generated + +- id: pdf_gen_financial_reports_006 + inputs: + prompt: "Generate synthetic financial report PDFs for testing a compliance document search system." + outputs: + response: | + ``` + generate_pdf_documents( + catalog="ai_dev_kit", + schema="finance_demo", + volume="documents", + folder="financial_reports", + topic="Corporate financial reports including quarterly earnings, annual reports, budget summaries, and audit findings", + num_documents=15, + complexity="high" + ) + ``` + + Financial documents benefit from high complexity to include tables, figures, and structured financial data that tests RAG parsing capabilities. 
+ execution_success: true + expectations: + expected_facts: + - "Financial documents should use high complexity for realistic structure" + - "Generated PDFs include tables and structured data" + expected_patterns: + - pattern: "generate_pdf_documents" + min_count: 1 + description: "Uses the generate_pdf_documents MCP tool" + - pattern: "financial" + min_count: 1 + description: "Topic relates to financial documents" + - pattern: "complexity=\"high\"" + min_count: 1 + description: "Uses high complexity" + guidelines: + - "Must use high complexity for financial documents" + - "Must specify financial-specific topics" + metadata: + category: happy_path + difficulty: medium + source: auto_generated diff --git a/.test/skills/databricks-unstructured-pdf-generation/manifest.yaml b/.test/skills/databricks-unstructured-pdf-generation/manifest.yaml new file mode 100644 index 00000000..27c09770 --- /dev/null +++ b/.test/skills/databricks-unstructured-pdf-generation/manifest.yaml @@ -0,0 +1,30 @@ +skill_name: databricks-unstructured-pdf-generation +description: Generate synthetic PDF documents for RAG and unstructured data use cases. Use when creating test PDFs, demo documents, + or evaluation datasets for retrieval systems. 
+scorers: + enabled: + - python_syntax + - pattern_adherence + - no_hallucinated_apis + - expected_facts_present + llm_scorers: + - Safety + - guidelines_from_expectations + default_guidelines: + - Response must address the user's request completely + - Code examples must follow documented best practices + - Response must use modern APIs (not deprecated ones) + trace_expectations: + tool_limits: + Bash: 10 + Read: 20 + token_budget: + max_total: 100000 + required_tools: + - Read + banned_tools: [] + expected_files: [] +quality_gates: + syntax_valid: 1.0 + pattern_adherence: 0.9 + execution_success: 0.8 diff --git a/.test/skills/databricks-vector-search/ground_truth.yaml b/.test/skills/databricks-vector-search/ground_truth.yaml new file mode 100644 index 00000000..7e8acd2b --- /dev/null +++ b/.test/skills/databricks-vector-search/ground_truth.yaml @@ -0,0 +1,345 @@ +metadata: + skill_name: databricks-vector-search + version: 0.1.0 + created_at: '2026-02-23T08:00:00.000000' + +test_cases: +- id: vs_create_endpoint_001 + inputs: + prompt: "How do I create a vector search endpoint in Databricks?" 
+ outputs: + response: | + ```python + from databricks.sdk import WorkspaceClient + + w = WorkspaceClient() + + # Create a standard endpoint + endpoint = w.vector_search_endpoints.create_endpoint( + name="my-vs-endpoint", + endpoint_type="STANDARD" # or "STORAGE_OPTIMIZED" + ) + # Endpoint creation is asynchronous; check status with get_endpoint() + ``` + + Two endpoint types are available: + - **STANDARD**: Best for low-latency queries + - **STORAGE_OPTIMIZED**: Best for large-scale, cost-effective deployments + execution_success: true + expectations: + expected_facts: + - "Endpoints are created with w.vector_search_endpoints.create_endpoint()" + - "Two endpoint types: STANDARD and STORAGE_OPTIMIZED" + - "Endpoint creation is asynchronous" + expected_patterns: + - pattern: "create_endpoint" + min_count: 1 + description: "Uses create_endpoint() method" + - pattern: "STANDARD" + min_count: 1 + description: "Shows STANDARD endpoint type" + - pattern: "STORAGE_OPTIMIZED" + min_count: 1 + description: "Shows STORAGE_OPTIMIZED endpoint type" + guidelines: + - "Must show both STANDARD and STORAGE_OPTIMIZED endpoint types" + - "Must mention that endpoint creation is asynchronous" + metadata: + category: happy_path + difficulty: easy + source: auto_generated + +- id: vs_delta_sync_managed_002 + inputs: + prompt: "How do I create a Delta Sync index with managed embeddings?" + outputs: + response: | + ```python + # Source table must have: primary key column + text column + index = w.vector_search_indexes.create_index( + name="catalog.schema.my_index", + endpoint_name="my-vs-endpoint", + primary_key="id", + index_type="DELTA_SYNC", + delta_sync_index_spec={ + "source_table": "catalog.schema.documents", + "embedding_source_columns": [ + { + "name": "content", + "embedding_model_endpoint_name": "databricks-gte-large-en" + } + ], + "pipeline_type": "TRIGGERED" # or "CONTINUOUS" + } + ) + ``` + + Databricks automatically embeds the text column using the specified model. 
Pipeline types: TRIGGERED (manual sync) or CONTINUOUS (auto-sync on source changes). + execution_success: true + expectations: + expected_facts: + - "Delta Sync index automatically syncs from a source Delta table" + - "Managed embeddings use embedding_source_columns with a model endpoint" + - "TRIGGERED pipeline syncs on demand; CONTINUOUS syncs automatically" + - "databricks-gte-large-en is a built-in embedding model" + expected_patterns: + - pattern: "create_index" + min_count: 1 + description: "Uses create_index() method" + - pattern: "DELTA_SYNC" + min_count: 1 + description: "Specifies Delta Sync index type" + - pattern: "embedding_source_columns" + min_count: 1 + description: "Configures managed embedding columns" + - pattern: "databricks-gte-large-en" + min_count: 1 + description: "Uses built-in embedding model" + guidelines: + - "Must show the full create_index call with delta_sync_index_spec" + - "Must explain TRIGGERED vs CONTINUOUS pipeline types" + - "Must specify a Databricks embedding model endpoint" + metadata: + category: happy_path + difficulty: medium + source: auto_generated + +- id: vs_delta_sync_self_managed_003 + inputs: + prompt: "How do I create a Delta Sync index with pre-computed embeddings?" + outputs: + response: | + ```python + # Source table must have: primary key + embedding vector column + index = w.vector_search_indexes.create_index( + name="catalog.schema.my_index", + endpoint_name="my-vs-endpoint", + primary_key="id", + index_type="DELTA_SYNC", + delta_sync_index_spec={ + "source_table": "catalog.schema.documents", + "embedding_vector_columns": [ + { + "name": "embedding", + "embedding_dimension": 768 + } + ], + "pipeline_type": "TRIGGERED" + } + ) + ``` + + Use self-managed embeddings when you need a specific embedding model not available as a Databricks endpoint, or when you pre-compute embeddings in a pipeline. 
+ execution_success: true + expectations: + expected_facts: + - "Self-managed embeddings use embedding_vector_columns instead of embedding_source_columns" + - "embedding_dimension must match the dimension of your pre-computed vectors" + - "The source table must already contain an embedding column" + expected_patterns: + - pattern: "embedding_vector_columns" + min_count: 1 + description: "Uses self-managed embedding columns" + - pattern: "embedding_dimension" + min_count: 1 + description: "Specifies embedding dimension" + - pattern: "DELTA_SYNC" + min_count: 1 + description: "Uses Delta Sync index type" + guidelines: + - "Must use embedding_vector_columns (not embedding_source_columns)" + - "Must include embedding_dimension parameter" + - "Must explain when to use self-managed vs managed embeddings" + metadata: + category: happy_path + difficulty: medium + source: auto_generated + +- id: vs_query_index_004 + inputs: + prompt: "How do I query a vector search index and get similarity results?" + outputs: + response: | + ```python + results = w.vector_search_indexes.query_index( + index_name="catalog.schema.my_index", + columns=["id", "content", "metadata"], + query_text="What is machine learning?", + num_results=5 + ) + + for doc in results.result.data_array: + score = doc[-1] # Similarity score is last column + print(f"Score: {score}, Content: {doc[1][:100]}...") + ``` + + For managed embedding indexes, use `query_text`. For self-managed, use `query_vector` with a pre-computed embedding. 
+ execution_success: true + expectations: + expected_facts: + - "query_index() searches for similar documents by text or vector" + - "query_text is used for managed embedding indexes" + - "query_vector is used for self-managed embedding indexes" + - "Similarity score is the last column in the result array" + expected_patterns: + - pattern: "query_index" + min_count: 1 + description: "Uses query_index() method" + - pattern: "query_text" + min_count: 1 + description: "Uses text-based query" + - pattern: "num_results" + min_count: 1 + description: "Specifies number of results" + - pattern: "data_array" + min_count: 1 + description: "Accesses result data array" + guidelines: + - "Must show query_text for managed embeddings" + - "Must mention query_vector as alternative for self-managed" + - "Must show how to access similarity scores" + metadata: + category: happy_path + difficulty: easy + source: auto_generated + +- id: vs_query_with_filters_005 + inputs: + prompt: "How do I apply metadata filters when querying a vector search index?" + outputs: + response: | + ```python + results = w.vector_search_indexes.query_index( + index_name="catalog.schema.my_index", + columns=["id", "content", "category"], + query_text="machine learning best practices", + num_results=10, + filters_json='{"category": "technical", "status": "published"}' + ) + ``` + + The `filters_json` parameter accepts a JSON string of key-value pairs that filter results by metadata columns before similarity ranking. Only documents matching all filters are returned. 
+ execution_success: true + expectations: + expected_facts: + - "filters_json accepts a JSON string for metadata filtering" + - "Filters are applied before similarity ranking" + - "All filter conditions must be satisfied (AND logic)" + expected_patterns: + - pattern: "filters_json" + min_count: 1 + description: "Uses filters_json parameter" + - pattern: "query_index" + min_count: 1 + description: "Uses query_index() method" + - pattern: "query_text" + min_count: 1 + description: "Combines text query with filters" + guidelines: + - "Must show filters_json parameter with a JSON string" + - "Must explain that filters are applied before similarity ranking" + metadata: + category: happy_path + difficulty: medium + source: auto_generated + +- id: vs_cli_reference_006 + inputs: + prompt: "What CLI commands are available for managing Databricks vector search?" + outputs: + response: | + ```bash + # List endpoints + databricks vector-search endpoints list + + # Create endpoint + databricks vector-search endpoints create \ + --name my-endpoint \ + --endpoint-type STANDARD + + # List indexes on endpoint + databricks vector-search indexes list-indexes \ + --endpoint-name my-endpoint + + # Get index status + databricks vector-search indexes get-index \ + --index-name catalog.schema.my_index + + # Sync index (for TRIGGERED pipeline) + databricks vector-search indexes sync-index \ + --index-name catalog.schema.my_index + + # Delete index + databricks vector-search indexes delete-index \ + --index-name catalog.schema.my_index + ``` + execution_success: true + expectations: + expected_facts: + - "Vector search CLI commands are under databricks vector-search" + - "sync-index triggers a manual sync for TRIGGERED pipeline indexes" + - "Endpoints and indexes are managed as separate subcommands" + expected_patterns: + - pattern: "databricks vector-search endpoints" + min_count: 1 + description: "Shows endpoint management commands" + - pattern: "databricks vector-search indexes" + 
min_count: 1 + description: "Shows index management commands" + - pattern: "sync-index" + min_count: 1 + description: "Shows manual sync command" + guidelines: + - "Must show both endpoint and index management commands" + - "Must include sync-index for triggered pipelines" + metadata: + category: happy_path + difficulty: easy + source: auto_generated + +- id: vs_embedding_models_007 + inputs: + prompt: "What embedding models are available for vector search indexes?" + outputs: + response: | + Databricks provides built-in embedding models for managed indexes: + + | Model | Dimensions | + |-------|-----------| + | `databricks-gte-large-en` | 1024 | + | `databricks-bge-large-en` | 1024 | + + ```python + # Use with managed embeddings in index creation + embedding_source_columns=[ + { + "name": "content", + "embedding_model_endpoint_name": "databricks-gte-large-en" + } + ] + ``` + + You can also use custom embedding models deployed as serving endpoints. + execution_success: true + expectations: + expected_facts: + - "databricks-gte-large-en produces 1024-dimensional embeddings" + - "databricks-bge-large-en produces 1024-dimensional embeddings" + - "Custom embedding models can also be used via serving endpoints" + expected_patterns: + - pattern: "databricks-gte-large-en" + min_count: 1 + description: "Lists GTE embedding model" + - pattern: "databricks-bge-large-en" + min_count: 1 + description: "Lists BGE embedding model" + - pattern: "1024" + min_count: 1 + description: "Specifies embedding dimensions" + guidelines: + - "Must list at least two built-in embedding models with dimensions" + - "Must mention that custom models can also be used" + metadata: + category: happy_path + difficulty: easy + source: auto_generated diff --git a/.test/skills/databricks-vector-search/manifest.yaml b/.test/skills/databricks-vector-search/manifest.yaml new file mode 100644 index 00000000..a053e80a --- /dev/null +++ b/.test/skills/databricks-vector-search/manifest.yaml @@ -0,0 +1,31 @@ 
+skill_name: databricks-vector-search +description: 'Patterns for Databricks Vector Search: create endpoints and indexes, query with filters, manage embeddings. + Use when building RAG applications, semantic search, or similarity matching. Covers both storage-optimized and standard + endpoints.' +scorers: + enabled: + - python_syntax + - pattern_adherence + - no_hallucinated_apis + - expected_facts_present + llm_scorers: + - Safety + - guidelines_from_expectations + default_guidelines: + - Response must address the user's request completely + - Code examples must follow documented best practices + - Response must use modern APIs (not deprecated ones) + trace_expectations: + tool_limits: + Bash: 10 + Read: 20 + token_budget: + max_total: 100000 + required_tools: + - Read + banned_tools: [] + expected_files: [] +quality_gates: + syntax_valid: 1.0 + pattern_adherence: 0.9 + execution_success: 0.8 diff --git a/.test/src/skill_test/optimize/asi.py b/.test/src/skill_test/optimize/asi.py index d8ff506d..95020035 100644 --- a/.test/src/skill_test/optimize/asi.py +++ b/.test/src/skill_test/optimize/asi.py @@ -1,12 +1,12 @@ """ASI diagnostics: convert MLflow Feedback to optimize_anything SideInfo. -Routes failure diagnostics through oa.log() so GEPA's reflection LM gets -actionable context about what went wrong with each scorer. +Builds an Actionable Side Information dict from scorer feedback so GEPA's +reflection LM gets structured context about what went wrong with each scorer. +Failure details are surfaced via the ``_failures`` key in the returned dict. """ from typing import Any -import gepa.optimize_anything as oa from mlflow.entities import Feedback @@ -37,8 +37,9 @@ def feedback_to_asi(feedbacks: list[Feedback]) -> tuple[float, dict[str, Any]]: """Convert MLflow Feedback objects to optimize_anything (score, SideInfo). Computes the mean score across non-skipped feedbacks and builds a - SideInfo dict. 
Logs failures via oa.log() so GEPA's reflection LM - sees actionable failure context. + SideInfo dict. Failure diagnostics are collected in the ``_failures`` + key so GEPA's reflection LM sees actionable context directly in the + side_info dict (no ``oa.log()`` needed). Args: feedbacks: List of MLflow Feedback objects from running scorers @@ -48,6 +49,7 @@ def feedback_to_asi(feedbacks: list[Feedback]) -> tuple[float, dict[str, Any]]: """ scores = [] side_info: dict[str, Any] = {} + failures: list[str] = [] for fb in feedbacks: score = feedback_to_score(fb) @@ -70,9 +72,11 @@ def feedback_to_asi(feedbacks: list[Feedback]) -> tuple[float, dict[str, Any]]: "status": "pass" if score >= 0.5 else "fail", } - # Route failures through oa.log() for GEPA reflection + # Collect failure diagnostics for GEPA reflection if score < 1.0: - oa.log(f"Scorer '{name}' returned {fb.value}: {fb.rationale or 'no rationale'}") + failures.append( + f"Scorer '{name}' returned {fb.value}: {fb.rationale or 'no rationale'}" + ) composite = sum(scores) / len(scores) if scores else 0.0 @@ -85,4 +89,71 @@ def feedback_to_asi(feedbacks: list[Feedback]) -> tuple[float, dict[str, Any]]: "failed": sum(1 for s in scores if s < 0.5), } + if failures: + side_info["_failures"] = "\n".join(failures) + + return composite, side_info + + +def build_rich_asi( + feedbacks: list[Feedback], + *, + generated_response: str | None = None, + skill_coverage: dict[str, Any] | None = None, + task_prompt: str | None = None, + per_dimension_scores: dict[str, float] | None = None, +) -> tuple[float, dict[str, Any]]: + """Build enriched ASI with categorized diagnostics for GEPA reflection. + + Extends ``feedback_to_asi()`` with additional context that helps GEPA's + reflection LM understand *why* scores changed and make better edits. 
+ + Args: + feedbacks: MLflow Feedback objects from all scoring layers + generated_response: Truncated LLM output (so reflection sees what skill produced) + skill_coverage: Which patterns/facts found vs missing in SKILL.md + task_prompt: The test prompt (so reflection understands context) + per_dimension_scores: Per-dimension scores dict for Pareto-frontier selection + + Returns: + Tuple of (composite_score, enriched_side_info_dict) + """ + composite, side_info = feedback_to_asi(feedbacks) + + # Categorize feedbacks by layer + categories: dict[str, list[str]] = { + "skill_content": [], + "generated_response": [], + "reference": [], + "structure": [], + } + for fb in feedbacks: + name = fb.name or "" + score = feedback_to_score(fb) + if score is None: + continue + entry = f"{name}: {'pass' if score >= 0.5 else 'FAIL'} ({fb.rationale or ''})" + if name.startswith("skill_content_"): + categories["skill_content"].append(entry) + elif name.startswith("skill_"): + categories["structure"].append(entry) + else: + categories["generated_response"].append(entry) + + side_info["_diagnostics_by_layer"] = { + k: v for k, v in categories.items() if v + } + + if generated_response is not None: + side_info["_generated_response"] = generated_response[:2000] + + if skill_coverage: + side_info["_skill_coverage"] = skill_coverage + + if task_prompt: + side_info["_task_prompt"] = task_prompt[:500] + + if per_dimension_scores: + side_info["scores"] = per_dimension_scores + return composite, side_info diff --git a/.test/src/skill_test/optimize/config.py b/.test/src/skill_test/optimize/config.py index c0ed12c6..efa4dd1a 100644 --- a/.test/src/skill_test/optimize/config.py +++ b/.test/src/skill_test/optimize/config.py @@ -4,52 +4,295 @@ """ import os +import warnings -from gepa.optimize_anything import GEPAConfig, EngineConfig, ReflectionConfig +from gepa.optimize_anything import GEPAConfig, EngineConfig, ReflectionConfig, RefinerConfig DEFAULT_REFLECTION_LM = os.environ.get( - 
"GEPA_REFLECTION_LM", "databricks/databricks-gpt-5-2" + "GEPA_REFLECTION_LM", "databricks/databricks-claude-opus-4-6" ) +DEFAULT_GEN_LM = os.environ.get( + "GEPA_GEN_LM", "databricks/databricks-claude-sonnet-4-6" +) + +# --------------------------------------------------------------------------- +# Register Databricks models with litellm so it knows their true context +# windows. Without this, litellm may fuzzy-match to a similar model with +# different limits, or worse, the Databricks serving endpoint may reject +# requests that exceed a vLLM-configured max_model_len. +# +# NOTE: This does NOT override the endpoint's own max_model_len setting. +# If the Databricks endpoint itself is configured with a low limit (e.g. +# 8192), you must either reconfigure the endpoint or use a different +# provider (openai/, anthropic/) whose endpoints support larger contexts. +# --------------------------------------------------------------------------- +def _configure_litellm_retries() -> None: + """Configure litellm to retry on transient errors (429, 529, 500). + + GEPA calls litellm.completion() without passing num_retries, so we + set it globally. This handles Anthropic 529 "Overloaded" errors, + rate limits, and other transient failures with exponential backoff. 
+ """ + try: + import litellm + litellm.num_retries = 5 + litellm.request_timeout = 120 # seconds per attempt + # Drop log noise from retries + litellm.suppress_debug_info = True + except ImportError: + pass + + +def _register_litellm_models() -> None: + """Register Databricks model context windows with litellm.""" + try: + import litellm + _models = { + "databricks/databricks-claude-opus-4-6": { + "max_tokens": 32_000, + "max_input_tokens": 200_000, + "max_output_tokens": 32_000, + "litellm_provider": "databricks", + "mode": "chat", + "input_cost_per_token": 0, + "output_cost_per_token": 0, + }, + "databricks/databricks-claude-sonnet-4-6": { + "max_tokens": 16_000, + "max_input_tokens": 200_000, + "max_output_tokens": 16_000, + "litellm_provider": "databricks", + "mode": "chat", + "input_cost_per_token": 0, + "output_cost_per_token": 0, + }, + "databricks/databricks-gpt-5-2": { + "max_tokens": 128_000, + "max_input_tokens": 272_000, + "max_output_tokens": 128_000, + "litellm_provider": "databricks", + "mode": "chat", + "input_cost_per_token": 0, + "output_cost_per_token": 0, + }, + } + for model_name, model_info in _models.items(): + litellm.model_cost[model_name] = model_info + except ImportError: + pass + + +_register_litellm_models() +_configure_litellm_retries() + + +# Overhead multiplier: the reflection prompt is roughly this many times +# the raw candidate tokens (includes background, ASI, framing). 
+_REFLECTION_OVERHEAD_MULTIPLIER = 3 + PRESETS: dict[str, GEPAConfig] = { "quick": GEPAConfig( engine=EngineConfig(max_metric_calls=15, parallel=True), reflection=ReflectionConfig(reflection_lm=DEFAULT_REFLECTION_LM), + refiner=RefinerConfig(max_refinements=1), ), "standard": GEPAConfig( engine=EngineConfig(max_metric_calls=50, parallel=True), reflection=ReflectionConfig(reflection_lm=DEFAULT_REFLECTION_LM), + refiner=RefinerConfig(max_refinements=1), ), "thorough": GEPAConfig( engine=EngineConfig(max_metric_calls=150, parallel=True), reflection=ReflectionConfig(reflection_lm=DEFAULT_REFLECTION_LM), + refiner=RefinerConfig(max_refinements=1), ), } +# Base max_metric_calls per preset (used to scale by component count) +PRESET_BASE_CALLS: dict[str, int] = { + "quick": 15, + "standard": 50, + "thorough": 150, +} + +# Maximum total metric calls per pass to avoid runaway runtimes. +# With many components, uncapped scaling (e.g., 50 * 17 = 850) can cause +# multi-hour hangs with slower reflection models like Sonnet. +MAX_METRIC_CALLS_PER_PASS = 300 + +# Models known to be fast enough for large multi-component optimization. +# Other models get the metric-call cap applied. +_FAST_REFLECTION_MODELS = { + "databricks/databricks-claude-opus-4-6", + "databricks/databricks-gpt-5-2", + "openai/gpt-4o", + "anthropic/claude-opus-4-6", +} + + +def validate_databricks_env() -> None: + """Check that DATABRICKS_API_BASE is set correctly for litellm. + + litellm's Databricks provider requires: + DATABRICKS_API_BASE=https://.cloud.databricks.com/serving-endpoints + + A common mistake is omitting /serving-endpoints, which causes 404 errors. + """ + api_base = os.environ.get("DATABRICKS_API_BASE", "") + if api_base and not api_base.rstrip("/").endswith("/serving-endpoints"): + fixed = api_base.rstrip("/") + "/serving-endpoints" + warnings.warn( + f"DATABRICKS_API_BASE={api_base!r} is missing '/serving-endpoints' suffix. " + f"litellm will get 404 errors. 
Automatically fixing to: {fixed}",
+            stacklevel=2,
+        )
+        os.environ["DATABRICKS_API_BASE"] = fixed
+
+
+def validate_reflection_context(
+    reflection_lm: str,
+    total_candidate_tokens: int,
+) -> None:
+    """Raise ValueError if the candidate is likely too large for the reflection model.
 
-def get_preset(name: str, reflection_lm: str | None = None) -> GEPAConfig:
-    """Get a GEPA config preset by name, optionally overriding the reflection LM.
+    Queries litellm's model registry for the model's max_input_tokens and
+    compares against the estimated reflection prompt size.
+
+    Note: this checks litellm's *client-side* knowledge of the model. The
+    Databricks serving endpoint may have a *different* (lower) limit set via
+    vLLM's ``max_model_len``. If you see ``BadRequestError`` with
+    ``max_model_len`` in the message, the endpoint itself is the bottleneck --
+    switch to a provider whose endpoint supports your context needs (e.g.
+    ``openai/gpt-4o`` or ``anthropic/claude-sonnet-4-5-20250514``).
+    """
+    try:
+        import litellm
+        info = litellm.get_model_info(reflection_lm)
+        limit = info.get("max_input_tokens") or info.get("max_tokens") or 0
+    except Exception:
+        return  # can't determine limit -- skip check
+
+    if limit <= 0:
+        return
+
+    estimated_prompt = total_candidate_tokens * _REFLECTION_OVERHEAD_MULTIPLIER
+    if estimated_prompt > limit:
+        raise ValueError(
+            f"\nReflection model '{reflection_lm}' has a {limit:,}-token input limit "
+            f"(per litellm), but the estimated reflection prompt is ~{estimated_prompt:,} "
+            f"tokens ({total_candidate_tokens:,} candidate tokens x "
+            f"{_REFLECTION_OVERHEAD_MULTIPLIER} overhead).\n\n"
+            f"Fix: use a model with a larger context window:\n"
+            f"  --reflection-lm 'databricks/databricks-claude-opus-4-6' (200K)\n"
+            f"  --reflection-lm 'openai/gpt-4o' (128K)\n"
+            f"  --reflection-lm 'anthropic/claude-sonnet-4-5-20250514' (200K)\n\n"
+            f"Or set the environment variable:\n"
+            f"  export GEPA_REFLECTION_LM='databricks/databricks-claude-opus-4-6'\n\n"
f"If you already use a large-context model and still see 'max_model_len'\n"
+            f"errors, the Databricks serving endpoint itself has a low context limit.\n"
+            f"Switch to a non-Databricks provider (openai/ or anthropic/) instead.\n\n"
+            f"  Current GEPA_REFLECTION_LM={os.environ.get('GEPA_REFLECTION_LM', '(not set)')}"
+        )
+
+
+def estimate_pass_duration(
+    num_metric_calls: int,
+    reflection_lm: str,
+    total_candidate_tokens: int,
+    num_dataset_examples: int = 7,
+) -> float | None:
+    """Estimate wall-clock seconds for one optimization pass.
+
+    Metric calls are mostly fast local evaluations. The slow part is
+    reflection LLM calls, which happen roughly once per iteration
+    (num_metric_calls / num_dataset_examples iterations).
+
+    Returns the estimated seconds (the ``None`` return case is currently unused).
+    """
+    # Rough per-reflection latency (seconds) based on model class
+    if reflection_lm in _FAST_REFLECTION_MODELS:
+        secs_per_reflection = 5.0
+    elif "sonnet" in reflection_lm.lower():
+        secs_per_reflection = 20.0
+    elif "haiku" in reflection_lm.lower():
+        secs_per_reflection = 8.0
+    else:
+        secs_per_reflection = 15.0
+
+    # Scale by candidate size (larger candidates → slower)
+    size_factor = min(max(1.0, total_candidate_tokens / 10_000), 2.5)
+    adjusted = secs_per_reflection * size_factor
+
+    # Approximate iterations (each iteration evaluates all dataset examples)
+    num_iterations = max(1, num_metric_calls // max(num_dataset_examples, 1))
+
+    return num_iterations * adjusted
+
+
+def get_preset(
+    name: str,
+    reflection_lm: str | None = None,
+    num_components: int = 1,
+    max_metric_calls_override: int | None = None,
+) -> GEPAConfig:
+    """Get a GEPA config preset by name, scaled by component count.
+
+    When optimizing multiple components (skill + tool modules), GEPA's
+    round-robin selector divides the budget across all of them. We scale
+    ``max_metric_calls`` so that *each component* receives the preset's
+    base budget rather than splitting it.
+ + For slower reflection models (non-Opus/GPT-4o), the total metric calls + are capped at ``MAX_METRIC_CALLS_PER_PASS`` to avoid multi-hour hangs. Args: name: One of "quick", "standard", "thorough" reflection_lm: Override reflection LM model string + num_components: Number of GEPA components (used to scale budget) + max_metric_calls_override: Explicit cap on metric calls per pass Returns: GEPAConfig instance """ if name not in PRESETS: raise KeyError(f"Unknown preset '{name}'. Choose from: {list(PRESETS.keys())}") - config = PRESETS[name] - if reflection_lm: - config = GEPAConfig( - engine=config.engine, - reflection=ReflectionConfig( - reflection_lm=reflection_lm, - reflection_minibatch_size=config.reflection.reflection_minibatch_size, - skip_perfect_score=config.reflection.skip_perfect_score, - ), - merge=config.merge, - refiner=config.refiner, - tracking=config.tracking, + + # Validate Databricks env if using databricks/ prefix + effective_lm = reflection_lm or DEFAULT_REFLECTION_LM + if isinstance(effective_lm, str) and effective_lm.startswith("databricks/"): + validate_databricks_env() + + base_calls = PRESET_BASE_CALLS[name] + scaled_calls = base_calls * max(num_components, 1) + + # Apply explicit override if provided + if max_metric_calls_override is not None: + scaled_calls = max_metric_calls_override + # Cap for slower models to avoid multi-hour hangs + elif effective_lm not in _FAST_REFLECTION_MODELS and scaled_calls > MAX_METRIC_CALLS_PER_PASS: + warnings.warn( + f"Capping metric calls from {scaled_calls} to {MAX_METRIC_CALLS_PER_PASS} " + f"for reflection model '{effective_lm}'. 
" + f"Use --max-metric-calls to override, or use a faster model " + f"(e.g., databricks/databricks-claude-opus-4-6).", + stacklevel=2, ) + scaled_calls = MAX_METRIC_CALLS_PER_PASS + + config = PRESETS[name] + config = GEPAConfig( + engine=EngineConfig( + max_metric_calls=scaled_calls, + parallel=config.engine.parallel, + ), + reflection=ReflectionConfig( + reflection_lm=reflection_lm or config.reflection.reflection_lm, + reflection_minibatch_size=config.reflection.reflection_minibatch_size, + skip_perfect_score=config.reflection.skip_perfect_score, + ), + merge=config.merge, + refiner=config.refiner, + tracking=config.tracking, + ) return config diff --git a/.test/src/skill_test/optimize/evaluator.py b/.test/src/skill_test/optimize/evaluator.py index 288bc368..1e7564e9 100644 --- a/.test/src/skill_test/optimize/evaluator.py +++ b/.test/src/skill_test/optimize/evaluator.py @@ -140,6 +140,40 @@ def _validate_skill_structure(candidate_text: str) -> list[Feedback]: return feedbacks +def _score_skill_content(candidate_text: str, expectations: dict[str, Any]) -> list[Feedback]: + """Score the SKILL.md candidate itself for pattern/fact coverage. + + Runs pattern_adherence and expected_facts_present against the skill text + (not the response). This gives GEPA immediate dynamic signal: if a key + pattern is removed from SKILL.md, the score drops. + + Feedback names are prefixed with ``skill_content_`` to distinguish from + response-level scores. 
+ """ + outputs = {"response": candidate_text} + feedbacks = [] + + # Pattern adherence on skill content + pa_results = _run_scorer(pattern_adherence, outputs, expectations, {}) + for fb in pa_results: + feedbacks.append(Feedback( + name=f"skill_content_{fb.name}", + value=fb.value, + rationale=f"(skill content) {fb.rationale or ''}", + )) + + # Expected facts on skill content + ef_results = _run_scorer(expected_facts_present, outputs, expectations, {}) + for fb in ef_results: + feedbacks.append(Feedback( + name=f"skill_content_{fb.name}", + value=fb.value, + rationale=f"(skill content) {fb.rationale or ''}", + )) + + return feedbacks + + # --------------------------------------------------------------------------- # Evaluator factory (optimize_anything compatible) # --------------------------------------------------------------------------- @@ -151,6 +185,7 @@ def create_skill_evaluator( skill_name: str, mode: Literal["static", "generative"] = "static", task_lm: str | None = None, + gen_model: str | None = None, original_token_counts: dict[str, int] | None = None, ) -> Callable: """Create an optimize_anything-compatible evaluator for a skill. @@ -159,8 +194,23 @@ def create_skill_evaluator( The candidate is dict[str, str] (may have "skill_md" + "tools_*" keys). The example is a task dict from the dataset. + + Evaluation layers: + 1. Skill-content scoring: pattern/fact presence in SKILL.md itself + 2. Generative evaluation: LLM generates response from skill, scored + 3. Reference response check: fixed ground truth scoring (sanity) + 4. Structure validation: syntax, no hallucinated APIs on SKILL.md + 5. 
Token efficiency: conciseness vs original + + Args: + skill_name: Name of the skill being evaluated + mode: "static" uses ground truth response, "generative" generates fresh + task_lm: LLM for generative mode (deprecated, use gen_model) + gen_model: LLM model for generative evaluation + original_token_counts: Token counts of original artifacts """ scorer_config = load_scorer_config(skill_name) + effective_gen_model = gen_model or task_lm # Compute original token count for efficiency scoring if original_token_counts is None: @@ -170,6 +220,10 @@ def create_skill_evaluator( } total_original_tokens = sum(original_token_counts.values()) + # Mutable closure state: per-task baseline scorer scores for comparison. + # Populated via evaluator.set_baseline() after evaluating the seed. + _baseline: dict[str, dict[str, float | None]] = {} + def evaluator(candidate: dict[str, str], example: dict) -> tuple[float, dict]: """Evaluate a candidate against a single task example. @@ -181,7 +235,6 @@ def evaluator(candidate: dict[str, str], example: dict) -> tuple[float, dict]: (score, side_info) tuple for optimize_anything """ candidate_text = candidate.get(SKILL_KEY, "") - all_feedbacks: list[Feedback] = [] # Decode expectations from additional_context expectations = {} @@ -193,32 +246,78 @@ def evaluator(candidate: dict[str, str], example: dict) -> tuple[float, dict]: except (json.JSONDecodeError, TypeError): pass - response = example.get("answer", "") + # ------------------------------------------------------------------ + # Layer 1: Skill-content scoring (pattern/fact presence in SKILL.md) + # ------------------------------------------------------------------ + skill_content_feedbacks: list[Feedback] = [] + if candidate_text and expectations: + skill_content_feedbacks = _score_skill_content(candidate_text, expectations) - if mode == "generative" and task_lm: + skill_content_composite, skill_content_si = feedback_to_asi(skill_content_feedbacks) + + # 
------------------------------------------------------------------ + # Layer 2: Generative evaluation (LLM generates from skill, score that) + # ------------------------------------------------------------------ + generated_response = None + gen_feedbacks: list[Feedback] = [] + gen_composite = 0.0 + + if effective_gen_model and candidate_text and example.get("input"): import litellm messages = [ - {"role": "system", "content": f"Skill documentation:\n\n{candidate_text}\n\nAnswer the user's question."}, + { + "role": "system", + "content": ( + "Use ONLY the following skill documentation to answer " + "the user's question. Do not use any other knowledge.\n\n" + f"{candidate_text}" + ), + }, {"role": "user", "content": example.get("input", "")}, ] - resp = litellm.completion(model=task_lm, messages=messages) - response = resp.choices[0].message.content - - # 1. Score the response against test expectations - response_feedbacks = _run_deterministic_scorers( - response, expectations, example.get("input", ""), scorer_config - ) - all_feedbacks.extend(response_feedbacks) - - # 2. 
Validate skill structure + try: + resp = litellm.completion(model=effective_gen_model, messages=messages) + generated_response = resp.choices[0].message.content + except Exception as e: + generated_response = None + gen_feedbacks.append(Feedback( + name="generation_error", + value="no", + rationale=f"LLM generation failed: {e}", + )) + + if generated_response: + gen_feedbacks = _run_deterministic_scorers( + generated_response, expectations, example.get("input", ""), scorer_config + ) + + gen_composite, gen_si = feedback_to_asi(gen_feedbacks) + + # ------------------------------------------------------------------ + # Layer 3: Reference response check (ground truth — sanity baseline) + # ------------------------------------------------------------------ + reference_response = example.get("answer", "") + ref_feedbacks: list[Feedback] = [] + ref_composite = 0.0 + + if reference_response: + ref_feedbacks = _run_deterministic_scorers( + reference_response, expectations, example.get("input", ""), scorer_config + ) + ref_composite, _ = feedback_to_asi(ref_feedbacks) + + # ------------------------------------------------------------------ + # Layer 4: Validate skill structure + # ------------------------------------------------------------------ + structure_feedbacks: list[Feedback] = [] if candidate_text: structure_feedbacks = _validate_skill_structure(candidate_text) - all_feedbacks.extend(structure_feedbacks) - # 3. Convert to score + side_info (with oa.log() for failures) - composite, side_info = feedback_to_asi(all_feedbacks) + structure_composite, _ = feedback_to_asi(structure_feedbacks) - # 4. 
Token efficiency across ALL components + # ------------------------------------------------------------------ + # Layer 5: Token efficiency across ALL components + # ------------------------------------------------------------------ total_candidate_tokens = sum(count_tokens(v) for v in candidate.values()) if total_original_tokens > 0: ratio = total_candidate_tokens / total_original_tokens @@ -226,20 +325,100 @@ def evaluator(candidate: dict[str, str], example: dict) -> tuple[float, dict]: else: efficiency = 1.0 - # Weighted composite: 80% quality, 20% token efficiency - final_score = 0.8 * composite + 0.2 * efficiency + # ------------------------------------------------------------------ + # Weighted final score + # ------------------------------------------------------------------ + # When generative eval is available, it gets the dominant weight. + # When no gen_model, fall back to reference-heavy weighting. + if effective_gen_model and generated_response is not None: + # Full layered evaluation + final_score = ( + 0.40 * gen_composite # Generated response quality + + 0.25 * skill_content_composite # Skill content coverage + + 0.05 * ref_composite # Reference response (sanity) + + 0.10 * structure_composite # Structure validation + + 0.20 * efficiency # Token efficiency + ) + else: + # Fallback: no generative eval, emphasize skill content + reference + final_score = ( + 0.35 * skill_content_composite # Skill content coverage + + 0.35 * ref_composite # Reference response + + 0.10 * structure_composite # Structure validation + + 0.20 * efficiency # Token efficiency + ) + + # ------------------------------------------------------------------ + # Build unified side_info for GEPA reflection + # ------------------------------------------------------------------ + # Merge all feedbacks for the side_info dict + all_feedbacks = skill_content_feedbacks + gen_feedbacks + ref_feedbacks + structure_feedbacks + _, side_info = feedback_to_asi(all_feedbacks) side_info["scores"] 
= { - "quality": composite, + "generated_response_quality": gen_composite, + "skill_content_coverage": skill_content_composite, + "reference_response_check": ref_composite, + "structure_validation": structure_composite, "token_efficiency": efficiency, + "final": final_score, } side_info["token_counts"] = { "candidate_total": total_candidate_tokens, "original_total": total_original_tokens, } + # Enrich ASI for GEPA reflection (Step 4 from plan) + if generated_response is not None: + side_info["_generated_response"] = generated_response[:2000] + side_info["_task_prompt"] = example.get("input", "")[:500] + + # Skill coverage summary + if skill_content_feedbacks: + found = [fb.name for fb in skill_content_feedbacks if fb.value == "yes"] + missing = [fb.name for fb in skill_content_feedbacks if fb.value == "no"] + side_info["_skill_coverage"] = { + "found": found, + "missing": missing, + "coverage_ratio": len(found) / max(len(found) + len(missing), 1), + } + + # Baseline comparison -- show GEPA's reflection LM what improved/regressed + task_key = example.get("input", "") + if task_key and task_key in _baseline: + comparisons = [] + for scorer_name, baseline_val in _baseline[task_key].items(): + current_val = side_info.get(scorer_name, {}).get("score") + if current_val is None or baseline_val is None: + continue + if current_val > baseline_val + 0.01: + comparisons.append( + f"Improved on {scorer_name} ({baseline_val:.2f} -> {current_val:.2f})" + ) + elif current_val < baseline_val - 0.01: + comparisons.append( + f"Regressed on {scorer_name} ({baseline_val:.2f} -> {current_val:.2f})" + ) + if comparisons: + side_info["_baseline_comparison"] = "; ".join(comparisons) + return final_score, side_info + def set_baseline(per_task_side_info: dict[str, dict]) -> None: + """Cache per-task per-scorer scores from the seed evaluation. + + Args: + per_task_side_info: {task_input_text: side_info_dict} from seed eval. 
+ """ + for task_key, info in per_task_side_info.items(): + _baseline[task_key] = { + name: data.get("score") + for name, data in info.items() + if isinstance(data, dict) and "score" in data + and not name.startswith("_") + } + + evaluator.set_baseline = set_baseline # type: ignore[attr-defined] return evaluator @@ -247,8 +426,18 @@ def build_optimization_background( skill_name: str, original_token_count: int, component_names: list[str] | None = None, + baseline_scores: dict[str, float] | None = None, + baseline_side_info: dict[str, dict] | None = None, ) -> str: - """Build the background context string for GEPA's reflection LM.""" + """Build the background context string for GEPA's reflection LM. + + Args: + skill_name: Name of the skill being optimized. + original_token_count: Total token count of the original artifacts. + component_names: Names of the candidate components (e.g. "skill_md", "tools_*"). + baseline_scores: Per-task overall scores from evaluating the seed candidate. + baseline_side_info: Per-task side_info dicts from evaluating the seed candidate. 
+ """ components_desc = "" if component_names and any(c.startswith("tools_") for c in component_names): tool_modules = [c.replace("tools_", "") for c in component_names if c.startswith("tools_")] @@ -259,20 +448,65 @@ def build_optimization_background( "accurate, concise, and action-oriented.\n" ) + # Build baseline performance summary + baseline_desc = "" + if baseline_scores: + mean_score = sum(baseline_scores.values()) / len(baseline_scores) + perfect = [tid for tid, s in baseline_scores.items() if s >= 0.99] + weak = sorted( + [(tid, s) for tid, s in baseline_scores.items() if s < 0.99], + key=lambda x: x[1], + ) + + baseline_desc = ( + f"\n\nBASELINE PERFORMANCE (seed candidate):\n" + f" Mean score: {mean_score:.3f} across {len(baseline_scores)} test cases.\n" + ) + if perfect: + baseline_desc += f" Perfect/near-perfect ({len(perfect)}): {', '.join(perfect)}\n" + if weak: + baseline_desc += " Needs improvement:\n" + for tid, score in weak: + baseline_desc += f" - {tid}: {score:.3f}" + # Add per-scorer detail if available + if baseline_side_info and tid in baseline_side_info: + info = baseline_side_info[tid] + failing = [ + name for name, data in info.items() + if isinstance(data, dict) and data.get("status") == "fail" + and not name.startswith("_") + ] + if failing: + baseline_desc += f" (failing: {', '.join(failing)})" + baseline_desc += "\n" + + baseline_desc += ( + "\n PRIORITY: Focus optimization effort on the weak test cases above. " + "Do NOT break test cases that already score well.\n" + ) + return ( - f"You are optimizing a SKILL.md file for the '{skill_name}' Databricks skill. " + f"You are REFINING an existing, working SKILL.md file for the '{skill_name}' " + "Databricks skill. The seed candidate is a production skill that already works -- " + "preserve what already works and improve what doesn't.\n\n" "SKILL.md files teach AI agents (like Claude Code) how to use specific Databricks features. 
" "They contain patterns, code examples, API references, and best practices.\n\n" - "The skill is evaluated against test cases that check:\n" - "- Python/SQL code syntax validity\n" - "- Adherence to expected patterns (regex matches)\n" - "- Absence of hallucinated/deprecated APIs\n" - "- Presence of expected factual information\n" - "- Overall structural quality of the skill document\n\n" + "EVALUATION: The skill is evaluated by having a small LLM generate responses from it. " + "Better skill documentation produces more correct responses. Scores come from:\n" + "- Generated response quality (40%): An LLM reads ONLY the skill and answers a test prompt. " + "Its response is scored against expected patterns and facts.\n" + "- Skill content coverage (25%): Does the SKILL.md itself contain the patterns and facts " + "needed to answer test prompts? Removing key content directly drops this score.\n" + "- Reference response check (5%): Sanity check against a known-good response.\n" + "- Structure validation (10%): Python/SQL syntax, no hallucinated APIs.\n" + "- Token efficiency (20%): Conciseness vs original -- smaller is better.\n\n" + "KEY INSIGHT: If the skill is missing a pattern or fact, the LLM cannot generate it. " + "The most impactful changes add missing patterns/facts and remove incorrect ones.\n\n" f"IMPORTANT: The current artifacts total {original_token_count:,} tokens. " "Optimized versions should be MORE CONCISE, not larger. " "Remove redundant examples, consolidate similar patterns, " "and eliminate verbose explanations that don't add value. " "Every token consumed is agent context window budget -- keep skills lean and focused." 
+ f"{baseline_desc}" f"{components_desc}" ) diff --git a/.test/src/skill_test/optimize/runner.py b/.test/src/skill_test/optimize/runner.py index e364ca7b..1feb54bd 100644 --- a/.test/src/skill_test/optimize/runner.py +++ b/.test/src/skill_test/optimize/runner.py @@ -3,6 +3,7 @@ Uses optimize_anything API: evaluator function + GEPAConfig. """ +import copy import difflib import re from dataclasses import dataclass @@ -13,7 +14,7 @@ from ..config import SkillTestConfig from ..runners.evaluate import setup_mlflow -from .config import get_preset +from .config import get_preset, validate_reflection_context, estimate_pass_duration, DEFAULT_GEN_LM from .evaluator import ( SKILL_KEY, create_skill_evaluator, @@ -84,14 +85,26 @@ def _compute_diff_summary(original: str, optimized: str) -> str: def _evaluate_on_tasks(evaluator, candidate, tasks): - """Run evaluator on tasks and return mean score + per-task scores.""" + """Run evaluator on tasks and return mean score, per-task scores, and per-task side_info. 
+ + Returns: + (mean_score, per_task_scores, side_info_by_id, side_info_by_input) + - per_task_scores: {task_id: float} + - side_info_by_id: {task_id: side_info_dict} -- for background builder + - side_info_by_input: {input_text: side_info_dict} -- for evaluator baseline + """ gepa_instances = to_gepa_instances(tasks) per_task = {} + side_info_by_id = {} + side_info_by_input = {} for i, inst in enumerate(gepa_instances): - score, _ = evaluator(candidate, inst) - per_task[tasks[i].get("id", f"task_{i}")] = score + score, side_info = evaluator(candidate, inst) + task_id = tasks[i].get("id", f"task_{i}") + per_task[task_id] = score + side_info_by_id[task_id] = side_info + side_info_by_input[inst.get("input", f"task_{i}")] = side_info mean = sum(per_task.values()) / len(per_task) if per_task else 0.0 - return mean, per_task + return mean, per_task, side_info_by_id, side_info_by_input def optimize_skill( @@ -99,26 +112,35 @@ def optimize_skill( mode: Literal["static", "generative"] = "static", preset: Literal["quick", "standard", "thorough"] = "standard", task_lm: str | None = None, + gen_model: str | None = None, reflection_lm: str | None = None, include_tools: bool = False, tool_modules: list[str] | None = None, tools_only: bool = False, dry_run: bool = False, + max_passes: int = 5, + max_metric_calls: int | None = None, ) -> OptimizationResult: """Run end-to-end GEPA optimization on a skill and/or tools. Uses optimize_anything API with a simple evaluator function. + Runs up to ``max_passes`` optimization passes per component, feeding + each pass's best candidate as the seed for the next. Stops early + when a pass produces no score improvement. 
Args: skill_name: Name of the skill to optimize mode: "static" or "generative" preset: "quick" (15), "standard" (50), "thorough" (150) - task_lm: LLM for generative mode + task_lm: LLM for generative mode (deprecated, use gen_model) + gen_model: LLM for generative evaluation (default: GEPA_GEN_LM env) reflection_lm: Override reflection LM include_tools: Include MCP tool descriptions as additional components tool_modules: Specific tool modules (None = all) tools_only: Optimize ONLY tool descriptions dry_run: Show config without running + max_passes: Maximum optimization passes (default 5). Each pass + re-seeds from the previous best and runs a full GEPA cycle. """ # 1. Load SKILL.md skill_path = _find_skill_md(skill_name) @@ -163,13 +185,31 @@ def optimize_skill( print(f"No test cases found. Using {len(train)} auto-generated tasks.") # 4. Build evaluator + # Resolve gen_model: explicit arg > task_lm (deprecated) > env default + effective_gen_model = gen_model or task_lm or DEFAULT_GEN_LM + if effective_gen_model: + print(f"Generation model: {effective_gen_model}") + evaluator = create_skill_evaluator( skill_name, mode=mode, task_lm=task_lm, + gen_model=effective_gen_model, original_token_counts=original_token_counts, ) - # 5. Get config - config = get_preset(preset, reflection_lm=reflection_lm) + # 5. Get config (scaled by component count) + num_components = len(seed_candidate) + config = get_preset( + preset, + reflection_lm=reflection_lm, + num_components=num_components, + max_metric_calls_override=max_metric_calls, + ) + print(f"Reflection model: {config.reflection.reflection_lm}") + + # 5b. 
Validate reflection model context window + validate_reflection_context( + config.reflection.reflection_lm, total_original_tokens, + ) # Dry run if dry_run: @@ -183,12 +223,29 @@ def optimize_skill( print(f"Train tasks: {len(train)}") print(f"Val tasks: {len(val) if val else 'None (single-task mode)'}") print(f"Mode: {mode}") - print(f"Preset: {preset} (max_metric_calls={config.engine.max_metric_calls})") + print(f"Generation model: {effective_gen_model or 'None (static only)'}") + print(f"Preset: {preset} (max_metric_calls={config.engine.max_metric_calls}, " + f"scaled for {num_components} component(s))") + print(f"Max passes: {max_passes}") print(f"Reflection LM: {config.reflection.reflection_lm}") - original_score, _ = _evaluate_on_tasks(evaluator, seed_candidate, train) + original_score, original_per_task, si_by_id, _ = _evaluate_on_tasks( + evaluator, seed_candidate, train + ) print(f"Current score: {original_score:.3f}") + # Show per-task baseline in dry-run output + for task_id, score in original_per_task.items(): + print(f" {task_id}: {score:.3f}") + + background = build_optimization_background( + skill_name, total_original_tokens, + component_names=list(seed_candidate.keys()), + baseline_scores=original_per_task, + baseline_side_info=si_by_id, + ) + print(f"\nBackground preview:\n{background[:500]}...") + return OptimizationResult( skill_name=skill_name, original_score=original_score, @@ -208,16 +265,26 @@ def optimize_skill( tool_map=tool_map, ) - # Evaluate original - original_score, _ = _evaluate_on_tasks(evaluator, seed_candidate, train) + # Evaluate original and capture per-task detail for baseline context + original_score, original_per_task, si_by_id, si_by_input = _evaluate_on_tasks( + evaluator, seed_candidate, train + ) + + # Set baseline on evaluator so future calls include regression/improvement info + if hasattr(evaluator, "set_baseline"): + evaluator.set_baseline(si_by_input) - # 6. Build background and objective + # 6. 
Build background (with baseline scores) and objective background = build_optimization_background( skill_name, total_original_tokens, component_names=list(seed_candidate.keys()), + baseline_scores=original_per_task, + baseline_side_info=si_by_id, ) objective = ( - f"Optimize the '{skill_name}' skill for maximum quality and minimum token count. " + f"Refine and improve the existing '{skill_name}' skill. " + "Preserve patterns and examples that already score well. " + "Focus on fixing scorer failures and reducing token count without sacrificing correctness. " "Higher quality scores and fewer tokens are both better." ) @@ -225,27 +292,84 @@ def optimize_skill( trainset = to_gepa_instances(train) valset = to_gepa_instances(val) if val else None - # 8. Run optimize_anything - result = optimize_anything( - seed_candidate=seed_candidate, - evaluator=evaluator, - dataset=trainset, - valset=valset, - objective=objective, - background=background, - config=config, + # 8. Multi-pass optimization loop + # Each pass feeds the previous best as the new seed. + # Stops early when a pass produces no score improvement. 
+ current_seed = dict(seed_candidate) + best = dict(seed_candidate) + best_score = original_score + last_result = None + total_metric_calls = 0 + improvement_threshold = 0.001 # minimum improvement to continue + + print(f"\n Starting multi-pass optimization (up to {max_passes} passes, " + f"{num_components} component(s), {config.engine.max_metric_calls} metric calls/pass)") + + # Print estimated time per pass + est_secs = estimate_pass_duration( + config.engine.max_metric_calls, + config.reflection.reflection_lm, + total_original_tokens, + num_dataset_examples=len(train), ) + if est_secs is not None: + est_mins = est_secs / 60 + if est_mins > 5: + print(f" Estimated ~{est_mins:.0f} min/pass ({est_mins * max_passes:.0f} min total for {max_passes} passes)") + + for pass_num in range(1, max_passes + 1): + print(f"\n --- Pass {pass_num}/{max_passes} (best score so far: {best_score:.4f}) ---") + + # Deep-copy config: optimize_anything mutates + # config.reflection.reflection_prompt_template when objective/background + # are provided, which would cause a "mutually exclusive" error on pass 2+. 
+ pass_config = copy.deepcopy(config) + + result = optimize_anything( + seed_candidate=current_seed, + evaluator=evaluator, + dataset=trainset, + valset=valset, + objective=objective, + background=background, + config=pass_config, + ) + total_metric_calls += result.total_metric_calls or 0 + + # Evaluate this pass's best candidate + candidate = result.best_candidate + pass_score, _, _, _ = _evaluate_on_tasks(evaluator, candidate, train) + improvement = pass_score - best_score + + print(f" Pass {pass_num} score: {pass_score:.4f} " + f"(delta: {'+' if improvement >= 0 else ''}{improvement:.4f})") + + if pass_score > best_score + improvement_threshold: + best = dict(candidate) + best_score = pass_score + last_result = result + # Use the improved candidate as seed for next pass + current_seed = dict(candidate) + else: + print(f" No significant improvement in pass {pass_num} -- stopping early.") + if last_result is None: + last_result = result + break + else: + print(f" Completed all {max_passes} passes.") + + if last_result is None: + last_result = result # 9. 
Extract results - best = result.best_candidate optimized_content = best.get(SKILL_KEY, original_content) optimized_token_count = sum(count_tokens(v) for v in best.values()) - optimized_score, _ = _evaluate_on_tasks(evaluator, best, train) + optimized_score = best_score val_scores: dict[str, float] = {} if val: - _, val_scores = _evaluate_on_tasks(evaluator, best, val) + _, val_scores, _, _ = _evaluate_on_tasks(evaluator, best, val) token_reduction_pct = ( (total_original_tokens - optimized_token_count) / total_original_tokens * 100 @@ -269,7 +393,7 @@ def optimize_skill( "original_tokens": float(total_original_tokens), "optimized_tokens": float(optimized_token_count), "token_reduction_pct": token_reduction_pct, - "total_metric_calls": float(result.total_metric_calls or 0), + "total_metric_calls": float(total_metric_calls), }) mlflow_run_id = mlflow.active_run().info.run_id except Exception: @@ -288,7 +412,7 @@ def optimize_skill( diff_summary=diff_summary, val_scores=val_scores, mlflow_run_id=mlflow_run_id, - gepa_result=result, + gepa_result=last_result, components=dict(best), original_components=dict(seed_candidate), tool_map=tool_map, diff --git a/.test/src/skill_test/optimize/splitter.py b/.test/src/skill_test/optimize/splitter.py index 52bbb87b..98f6c871 100644 --- a/.test/src/skill_test/optimize/splitter.py +++ b/.test/src/skill_test/optimize/splitter.py @@ -30,6 +30,35 @@ class SkillTask(TypedDict, total=False): metadata: dict[str, Any] # Category, difficulty, etc. +def _summarize_expectations(expectations: dict[str, Any]) -> str: + """Produce a human-readable summary of what a task tests. + + Included in additional_context so GEPA's reflection LM understands + what each test case is checking without parsing JSON. 
+ """ + parts = [] + + patterns = expectations.get("expected_patterns", []) + if patterns: + descs = [] + for p in patterns: + if isinstance(p, str): + descs.append(p[:40]) + elif isinstance(p, dict): + descs.append(p.get("description", p.get("pattern", "")[:40])) + parts.append(f"Patterns: {', '.join(descs)}") + + facts = expectations.get("expected_facts", []) + if facts: + parts.append(f"Facts: {', '.join(str(f) for f in facts)}") + + guidelines = expectations.get("guidelines", []) + if guidelines: + parts.append(f"Guidelines: {'; '.join(str(g) for g in guidelines[:3])}") + + return " | ".join(parts) if parts else "No specific expectations" + + def _record_to_task(record: EvalRecord) -> SkillTask: """Convert an EvalRecord to our internal task format.""" task: SkillTask = { @@ -45,6 +74,10 @@ def _record_to_task(record: EvalRecord) -> SkillTask: task["expectations"] = record.expectations # Also encode expectations into additional_context for GEPA reflection task["additional_context"]["expectations"] = json.dumps(record.expectations) + # Human-readable summary for GEPA's reflection LM + task["additional_context"]["evaluation_criteria"] = _summarize_expectations( + record.expectations + ) return task diff --git a/.test/uv.lock b/.test/uv.lock index 0a1e71f8..67d8cfe0 100644 --- a/.test/uv.lock +++ b/.test/uv.lock @@ -838,7 +838,7 @@ name = "exceptiongroup" version = "1.3.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "typing-extensions", marker = "python_full_version < '3.13'" }, + { name = "typing-extensions", marker = "python_full_version < '3.11'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/50/79/66800aadf48771f6b62f7eb014e352e5d06856655206165d775e675a02c9/exceptiongroup-1.3.1.tar.gz", hash = "sha256:8b412432c6055b0b7d14c310000ae93352ed6754f70fa8f7c34141f91c4e3219", size = 30371, upload-time = "2025-11-21T23:01:54.787Z" } wheels = [ @@ -1149,6 +1149,15 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/01/c9/97cc5aae1648dcb851958a3ddf73ccd7dbe5650d95203ecb4d7720b4cdbf/fsspec-2026.1.0-py3-none-any.whl", hash = "sha256:cb76aa913c2285a3b49bdd5fc55b1d7c708d7208126b60f2eb8194fe1b4cbdcc", size = 201838, upload-time = "2026-01-09T15:21:34.041Z" }, ] +[[package]] +name = "gepa" +version = "0.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f5/30/511e52916956508f56eca721260fcd524cfffd580e57782dd471be925f7e/gepa-0.1.0.tar.gz", hash = "sha256:f8b3d7918d4cdcf8593f39ef1cc757c4ba1a4e6793e3ffb622e6c0bc60a1efd9", size = 226064, upload-time = "2026-02-19T19:43:08.272Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1f/32/fe8afb3d2a6605a6bcbc8f119f0a2adae96e9e5d57ebed055490219956a8/gepa-0.1.0-py3-none-any.whl", hash = "sha256:4e3f8fe8ca20169e60518b2e9d416e8c4a579459848adffdcad12223fbf9643e", size = 191392, upload-time = "2026-02-19T19:43:07.065Z" }, +] + [[package]] name = "gitdb" version = "4.0.12" @@ -3919,8 +3928,10 @@ dependencies = [ [package.optional-dependencies] all = [ { name = "databricks-sdk" }, + { name = "gepa" }, { name = "pytest" }, { name = "pytest-asyncio" }, + { name = "tiktoken" }, ] databricks = [ { name = "databricks-sdk" }, @@ -3929,18 +3940,24 @@ dev = [ { name = "pytest" }, { name = "pytest-asyncio" }, ] +optimize = [ + { name = "gepa" }, + { name = "tiktoken" }, +] [package.metadata] requires-dist = [ { name = "databricks-sdk", marker = "extra == 'databricks'", specifier = ">=0.20.0" }, + { name = "gepa", marker = "extra == 'optimize'", specifier = ">=0.1.0" }, { name = "mlflow", extras = ["databricks"], specifier = ">=3.6.1" }, { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0" }, { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.23" }, { name = "python-dotenv", specifier = ">=1.0.0" }, { name = "pyyaml", specifier = ">=6.0" }, - { name = "skill-test", extras = ["databricks", "dev"], marker = 
"extra == 'all'" }, + { name = "skill-test", extras = ["databricks", "dev", "optimize"], marker = "extra == 'all'" }, + { name = "tiktoken", marker = "extra == 'optimize'", specifier = ">=0.7.0" }, ] -provides-extras = ["databricks", "dev", "all"] +provides-extras = ["databricks", "dev", "optimize", "all"] [[package]] name = "skops" From d3757df9ee1f72045e5c9026b4475af6aad42a90 Mon Sep 17 00:00:00 2001 From: CAholder Date: Tue, 24 Feb 2026 10:33:24 -0800 Subject: [PATCH 07/20] readme updates --- .test/README.md | 330 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 234 insertions(+), 96 deletions(-) diff --git a/.test/README.md b/.test/README.md index 4750874f..f40bc4ad 100644 --- a/.test/README.md +++ b/.test/README.md @@ -142,6 +142,240 @@ Optional overrides: `GEPA_REFLECTION_LM` (reflection model), `GEPA_GEN_LM` (gene --- +## How Evaluation Works + +The evaluation system answers a single question: **does this SKILL.md teach an AI agent the right things?** A skill that scores well means an agent reading it will produce correct code with the right APIs and patterns. A skill that scores poorly means the agent will hallucinate, use deprecated APIs, or miss important patterns. + +Understanding evaluation is important because it drives everything else — GEPA uses scores to decide which skill mutations to keep, and you use scores to know if your skill is good enough to ship. + +### Why These Files Exist + +Each skill under `.test/skills//` has two key files: + +**`ground_truth.yaml`** — The test cases. Each entry is a prompt ("Create a ResponsesAgent") paired with the expected response and expectations (patterns, facts, guidelines). These define *what the skill should teach*. Without test cases, the evaluator has nothing to score against and GEPA has no signal to optimize toward. + +**`manifest.yaml`** — The scorer configuration. Controls *which scorers run* and *what thresholds apply*. 
Think of it as the grading rubric: which checks are enabled, what guidelines the LLM judge enforces, and what trace expectations exist. If you don't provide one, the system uses sensible defaults (syntax + patterns + facts + hallucination checks). Each skill under `.test/skills/<skill-name>/` has two key files as described above.
Same size or smaller = 1.0, linear decay to 0.0 at 2x original size. | `evaluator.py` → token counting | + +**Why this works:** The key insight is that Layer 1 (generated response) creates a causal chain — if the SKILL.md is missing a pattern, the generation model cannot produce it, so the pattern scorer fails, so the score drops. This gives GEPA immediate, dynamic signal when content changes, unlike the old approach where ~80% of the score came from an immutable ground truth string. + +**Fallback mode:** When no generation model is available (no `GEPA_GEN_LM`), the weights shift to 35% skill content + 35% reference + 10% structure + 20% efficiency. + +### Built-in Scorers + +The system ships with four tiers of scorers: + +**Tier 1: Deterministic (fast, reliable, ~$0/eval)** + +| Scorer | What it checks | Configured via | +|--------|---------------|----------------| +| `python_syntax` | Python code blocks parse with `ast.parse()` | `manifest.yaml` → `scorers.enabled` | +| `sql_syntax` | SQL blocks have valid structure (balanced parens, recognizable statements) | `manifest.yaml` → `scorers.enabled` | +| `pattern_adherence` | Required regex patterns appear in response (e.g., `ResponsesAgent`, `CLUSTER BY`) | `ground_truth.yaml` → `expectations.expected_patterns` | +| `no_hallucinated_apis` | No deprecated/invented APIs (`@dlt.table`, `dlt.read`, `PARTITION BY`, old `mlflow.evaluate`) | `manifest.yaml` → `scorers.enabled` | +| `expected_facts_present` | Required facts mentioned in response (case-insensitive substring match) | `ground_truth.yaml` → `expectations.expected_facts` | + +**Tier 2: Trace-based (for session evaluation)** + +| Scorer | What it checks | +|--------|---------------| +| `tool_count` | Tool usage within limits (e.g., max 5 Bash calls) | +| `token_budget` | Token usage within budget | +| `required_tools` | Required tools were called | +| `banned_tools` | Banned tools were NOT called | +| `file_existence` | Expected files were created | +| 
`tool_sequence` | Tools used in expected order | +| `category_limits` | Tool category limits (bash, file_ops, mcp) | + +These are configured in `manifest.yaml` under `trace_expectations`. + +**Tier 3: LLM judges (expensive, nuanced, ~$0.01/eval)** + +| Scorer | What it checks | +|--------|---------------| +| `Safety` | MLflow's built-in safety scorer | +| `Guidelines` | LLM judges response against `default_guidelines` from manifest | +| `guidelines_from_expectations` | Per-test-case guidelines from `expectations.guidelines` in ground_truth.yaml | + +### Adding a Custom Scorer + +There are three ways to add custom evaluation, from easiest to most flexible: + +#### Option 1: Per-test-case guidelines (no code required) + +Add `guidelines` to any test case in `ground_truth.yaml`. An LLM judge evaluates the response against these: + +```yaml +test_cases: + - id: my_test_001 + inputs: + prompt: "Deploy a model to serving" + expectations: + guidelines: + - "Must use Unity Catalog three-level namespace" + - "Must recommend job-based deployment over synchronous" + - "Should warn about cold start latency" + expected_facts: + - "ResponsesAgent" +``` + +Then enable the scorer in `manifest.yaml`: + +```yaml +scorers: + enabled: + - python_syntax + - pattern_adherence + - expected_facts_present + llm_scorers: + - guidelines_from_expectations +``` + +#### Option 2: Skill-wide guidelines (no code required) + +Set `default_guidelines` in `manifest.yaml` to apply rules to ALL test cases for a skill: + +```yaml +scorers: + enabled: + - python_syntax + - pattern_adherence + - no_hallucinated_apis + - expected_facts_present + llm_scorers: + - Guidelines + default_guidelines: + - "Must use ResponsesAgent pattern, not ChatAgent" + - "Must use self.create_text_output_item() for output" + - "Code must be deployable to Databricks Model Serving" +``` + +You can also create multiple named guideline sets: + +```yaml + llm_scorers: + - Guidelines:api_correctness + - 
Guidelines:deployment_quality + default_guidelines: + - "Your guidelines here" +``` + +#### Option 3: Custom Python scorer (full flexibility) + +Create a new scorer function in `.test/src/skill_test/scorers/` and register it. Scorers use the MLflow `@scorer` decorator and return `Feedback` objects: + +```python +# .test/src/skill_test/scorers/my_custom.py +from mlflow.genai.scorers import scorer +from mlflow.entities import Feedback +from typing import Dict, Any + +@scorer +def my_custom_check(outputs: Dict[str, Any], expectations: Dict[str, Any]) -> Feedback: + """Check for something specific to my use case.""" + response = outputs.get("response", "") + + # Your custom logic here + issues = [] + if "spark.sql(" in response and "spark.read.table(" not in response: + issues.append("Should prefer spark.read.table() over spark.sql() for reads") + + if issues: + return Feedback( + name="my_custom_check", + value="no", + rationale=f"Issues: {'; '.join(issues)}", + ) + + return Feedback(name="my_custom_check", value="yes", rationale="All custom checks passed") +``` + +Then register it in `runners/evaluate.py` → `build_scorers()`: + +```python +SCORER_MAP = { + # ... existing scorers ... + "my_custom_check": my_custom_check, +} +``` + +And enable it in your skill's `manifest.yaml`: + +```yaml +scorers: + enabled: + - python_syntax + - pattern_adherence + - my_custom_check # your new scorer +``` + +**Scorer function signatures:** The system auto-detects which parameters your scorer accepts: +- `outputs: Dict[str, Any]` — always available, contains `{"response": "..."}` +- `expectations: Dict[str, Any]` — from ground_truth.yaml `expectations` field +- `inputs: Dict[str, Any]` — contains `{"prompt": "..."}` + +Return either a single `Feedback` or a `list[Feedback]` (for scorers that produce multiple checks like `pattern_adherence`). 
+ +### Manifest Configuration Examples + +Here are manifest patterns for different skill types: + +**Python SDK skill** — emphasizes syntax and API correctness: +```yaml +scorers: + enabled: [python_syntax, pattern_adherence, no_hallucinated_apis, expected_facts_present] + llm_scorers: [guidelines_from_expectations] + default_guidelines: + - "Must use ResponsesAgent pattern for GenAI agents" +quality_gates: + syntax_valid: 1.0 + pattern_adherence: 0.9 +``` + +**SQL-heavy skill** — adds SQL validation: +```yaml +scorers: + enabled: [python_syntax, sql_syntax, pattern_adherence, no_hallucinated_apis, expected_facts_present] + default_guidelines: + - "Must use SDP syntax (CREATE OR REFRESH STREAMING TABLE)" +``` + +**Skill with trace expectations** — limits tool usage during session evaluation: +```yaml +scorers: + enabled: [python_syntax, pattern_adherence, no_hallucinated_apis, expected_facts_present] + default_guidelines: + - "Must use correct MCP tools (manage_ka, manage_mas)" + trace_expectations: + tool_limits: + manage_ka: 10 + manage_mas: 10 + required_tools: [Read] + banned_tools: [] +``` + +--- + ## Best Practices for Optimization These practices are derived from the [optimize_anything API guide](https://gepa-ai.github.io/gepa/blog/2026/02/18/introducing-optimize-anything/) and help you get the most out of GEPA-powered optimization. 
@@ -485,102 +719,6 @@ The simplest possible test case -- just a prompt and expected facts: difficulty: easy ``` -### How Test Cases Map to GEPA - -The optimizer converts each test case into a GEPA dataset instance: - -| ground_truth.yaml field | GEPA field | Used by | -|------------------------|------------|---------| -| `inputs.prompt` | `input` | Reflection LM sees the task | -| `outputs.response` | `answer` | Deterministic scorers compare against this | -| `expectations.*` | `additional_context` | Encoded as JSON; scorers extract patterns and facts | -| `metadata.category` | (stratification) | Ensures balanced train/val split | - -**Scoring pipeline per test case (layered evaluation):** - -| Layer | Weight | What it does | -|-------|--------|-------------| -| Generated response quality | 40% | An LLM reads ONLY the SKILL.md and answers the test prompt. Its response is scored against expected patterns/facts. | -| Skill content coverage | 25% | Checks if the SKILL.md itself contains the patterns and facts needed to answer test prompts. | -| Reference response check | 5% | Scores the ground truth response as a sanity baseline. | -| Structure validation | 10% | Python/SQL syntax in code blocks, no hallucinated APIs. | -| Token efficiency | 20% | Penalizes bloated skill content (smaller is better). | - -The key insight: if a pattern or fact is missing from the SKILL.md, the generation model cannot produce it. This gives GEPA immediate, dynamic signal when content changes. - ---- - -## Manifest Configuration Examples - -The `manifest.yaml` controls which scorers run and what trace expectations apply. 
Here are patterns for different skill types: - -### Python SDK Skill - -```yaml -skill_name: databricks-model-serving -description: Deploy and query Databricks Model Serving endpoints - -scorers: - enabled: - - python_syntax - - pattern_adherence - - no_hallucinated_apis - - expected_facts_present - default_guidelines: - - "Must use ResponsesAgent pattern for GenAI agents" - - "Must use self.create_text_output_item() for ChatAgent responses" - - "Job-based deployment preferred over synchronous" - -quality_gates: - syntax_valid: 1.0 - pattern_adherence: 0.9 -``` - -### SQL-Heavy Skill - -```yaml -skill_name: databricks-spark-declarative-pipelines -description: Create SDP/LDP pipelines with streaming tables and materialized views - -scorers: - enabled: - - python_syntax - - sql_syntax - - pattern_adherence - - no_hallucinated_apis - - expected_facts_present - default_guidelines: - - "Must use SDP syntax (CREATE OR REFRESH STREAMING TABLE)" - -quality_gates: - syntax_valid: 1.0 - pattern_adherence: 0.9 - execution_success: 0.8 -``` - -### Skill with Trace Expectations - -```yaml -skill_name: databricks-agent-bricks -description: Create Agent Bricks (Knowledge Assistants, Genie, Multi-Agent) - -scorers: - enabled: - - python_syntax - - pattern_adherence - - no_hallucinated_apis - - expected_facts_present - default_guidelines: - - "Must use correct MCP tools (manage_ka, manage_mas)" - - "Must use ka_tile_id not endpoint_name for Knowledge Assistants" - - "Must use find_by_name helper for entity lookups" - trace_expectations: - tool_limits: - manage_ka: 10 - manage_mas: 10 - create_or_update_genie: 5 -``` - --- ## End-to-End Walkthrough From ede86aaaa21470de56c8fa621cdfbffb9ff80e76 Mon Sep 17 00:00:00 2001 From: CAholder Date: Fri, 27 Feb 2026 11:00:42 -0800 Subject: [PATCH 08/20] readme updates and skillbench inclusion --- .test/README.md | 78 +- .test/scripts/generate_examples.py | 428 ++++++ .test/scripts/optimize.py | 115 +- 
.../databricks-metric-views/ground_truth.yaml | 1304 +++++++++++++++-- .test/src/skill_test/optimize/asi.py | 112 +- .test/src/skill_test/optimize/assertions.py | 190 +++ .test/src/skill_test/optimize/config.py | 26 +- .test/src/skill_test/optimize/evaluator.py | 129 +- .test/src/skill_test/optimize/review.py | 284 +++- .test/src/skill_test/optimize/runner.py | 136 +- .../optimize/skillbench_evaluator.py | 697 +++++++++ .test/src/skill_test/scorers/universal.py | 22 +- .../databricks_mcp_server/tools/manifest.py | 6 +- .../databricks_mcp_server/tools/pipelines.py | 477 +++--- .../databricks_mcp_server/tools/sql.py | 10 +- 15 files changed, 3524 insertions(+), 490 deletions(-) create mode 100644 .test/scripts/generate_examples.py create mode 100644 .test/src/skill_test/optimize/assertions.py create mode 100644 .test/src/skill_test/optimize/skillbench_evaluator.py diff --git a/.test/README.md b/.test/README.md index f40bc4ad..82b0de43 100644 --- a/.test/README.md +++ b/.test/README.md @@ -36,6 +36,9 @@ export GEPA_GEN_LM="openai/gpt-4o" # Optional: override generation model (default: databricks/databricks-claude-sonnet-4-6) # export GEPA_GEN_LM="databricks/databricks-claude-sonnet-4-6" + +# Optional: set a global token budget ceiling for optimization +# export GEPA_TOKEN_BUDGET=50000 ``` --- @@ -57,7 +60,10 @@ uv run python .test/scripts/optimize.py --preset thorough # Dry run: see scores and config without calling GEPA uv run python .test/scripts/optimize.py --dry-run -# Optimize and apply the result +# Apply the last saved result (no re-run!) 
+uv run python .test/scripts/optimize.py --apply-last + +# Run optimization and immediately apply uv run python .test/scripts/optimize.py --apply # Use a specific generation model for evaluation @@ -66,10 +72,15 @@ uv run python .test/scripts/optimize.py --gen-model "openai/gpt-4o" # Control iteration depth (default: 5 passes) uv run python .test/scripts/optimize.py --max-passes 3 +# Set a token budget ceiling (candidates exceeding this are penalized) +uv run python .test/scripts/optimize.py --token-budget 50000 + # Optimize all skills that have test cases uv run python .test/scripts/optimize.py --all --preset quick ``` +After each run, the optimized result is automatically saved to `.test/skills//optimized_SKILL.md`. You can review it, diff it against the original, and apply when ready with `--apply-last` — no need to re-run the optimization. + ### Optimize MCP Tool Descriptions GEPA can also optimize the `@mcp.tool` docstrings in `databricks-mcp-server/`. Tool descriptions are what the AI agent sees when deciding which tool to call -- concise, accurate descriptions lead to better tool selection. @@ -93,6 +104,8 @@ uv run python .test/scripts/optimize.py databricks-model-serving --include-tools When `--include-tools` is used, GEPA creates one component per tool module (e.g., `tools_sql`, `tools_serving`) and round-robins through them alongside `skill_md`. The `--apply` flag writes optimized docstrings back to the MCP server source files. +**Note:** The SkillBench evaluator (default) automatically includes tools even without `--include-tools`, since tool descriptions are the primary token consumer (~17K tokens across 88 tools). Use `--tools-only` to optimize only tool descriptions without the SKILL.md. + The iteration budget (`max_metric_calls`) is automatically scaled by the number of components so each one gets the preset's full budget. 
Additionally, the optimizer runs **up to 5 passes** (configurable with `--max-passes`), re-seeding from the previous best each time. It stops early if a pass produces no improvement. Available tool modules (88 tools across 16 modules): @@ -138,7 +151,7 @@ Model strings use [litellm provider prefixes](https://docs.litellm.ai/docs/provi | OpenAI | `OPENAI_API_KEY` | | Anthropic | `ANTHROPIC_API_KEY` | -Optional overrides: `GEPA_REFLECTION_LM` (reflection model), `GEPA_GEN_LM` (generation model for evaluation). +Optional overrides: `GEPA_REFLECTION_LM` (reflection model), `GEPA_GEN_LM` (generation model for evaluation), `GEPA_TOKEN_BUDGET` (token ceiling for optimization). --- @@ -171,15 +184,38 @@ Rather than scoring a single static response, the evaluator runs five layers tha | Layer | Weight | What it does | Source | |-------|--------|-------------|--------| -| **Generated response quality** | 40% | An LLM reads ONLY the SKILL.md and answers the test prompt. Its response is scored for patterns/facts. | `evaluator.py` → litellm generation | -| **Skill content coverage** | 25% | Checks if the SKILL.md itself contains the patterns and facts needed. If a pattern is missing from the skill, this drops immediately. | `evaluator.py` → `_score_skill_content()` | +| **Generated response quality** | 20% | An LLM reads ONLY the SKILL.md and answers the test prompt. Its response is scored for patterns/facts. | `evaluator.py` → litellm generation | +| **Skill content coverage** | 35% | Checks if the SKILL.md itself contains the patterns and facts needed. If a pattern is missing from the skill, this drops immediately. | `evaluator.py` → `_score_skill_content()` | | **Reference response check** | 5% | Scores the ground truth response as a sanity baseline. This is mostly static — it ensures the test case itself is valid. 
| `evaluator.py` → `_run_deterministic_scorers()` | | **Structure validation** | 10% | Validates Python/SQL syntax in code blocks and checks for hallucinated APIs (deprecated `@dlt.table`, old `mlflow.evaluate`, etc). | `evaluator.py` → `_validate_skill_structure()` | -| **Token efficiency** | 20% | Penalizes bloated skill content. Same size or smaller = 1.0, linear decay to 0.0 at 2x original size. | `evaluator.py` → token counting | +| **Token efficiency** | 30% | Rewards concise skill content. Shrinking below original size earns a bonus (up to 1.15x), same size = 1.0, linear penalty to 0.0 at 2x original. | `evaluator.py` → token counting | **Why this works:** The key insight is that Layer 1 (generated response) creates a causal chain — if the SKILL.md is missing a pattern, the generation model cannot produce it, so the pattern scorer fails, so the score drops. This gives GEPA immediate, dynamic signal when content changes, unlike the old approach where ~80% of the score came from an immutable ground truth string. -**Fallback mode:** When no generation model is available (no `GEPA_GEN_LM`), the weights shift to 35% skill content + 35% reference + 10% structure + 20% efficiency. +**Fallback mode:** When no generation model is available (no `GEPA_GEN_LM`), the weights shift to 40% skill content + 20% reference + 10% structure + 30% efficiency. + +### SkillBench Evaluator (Default) + +The default evaluator (`--evaluator skillbench`) measures **skill effectiveness**: how much does the skill help an agent answer correctly? It runs each test case twice — once WITH the skill and once WITHOUT — then scores the delta. + +| Weight | Dimension | What it measures | +|--------|-----------|-----------------| +| **45%** | Skill Effectiveness | `pass_rate_with - pass_rate_without` — the delta. Only rewards content the agent doesn't already know. | +| **25%** | Absolute Quality | `pass_rate_with` — overall correctness with the skill present. 
| +| **5%** | Structure | Syntax validity (Python/SQL) and no hallucinated APIs. | +| **25%** | Token Efficiency | Smaller candidates score higher. Linear penalty for growth (0.0 at 2x original). Bonus for reduction (up to 1.15 at 0% of original). | + +**Key difference from the legacy evaluator:** SkillBench uses binary pass/fail assertions (from `expectations` in `ground_truth.yaml`) rather than fuzzy scorer scores. Assertions are classified as: +- **NEEDS_SKILL** — fails both with and without the skill (the skill must teach this) +- **REGRESSION** — passes without, fails with (the skill confuses the agent — simplify or remove) +- **POSITIVE** — fails without, passes with (the skill is helping — keep it) +- **NEUTRAL** — same result either way (the agent already knows this — adding it wastes tokens) + +The reflection LM sees these labels in the `Error` field of each example's side info, guiding it to add NEEDS_SKILL content and remove REGRESSION content. + +**Token budget:** Use `--token-budget N` to set a hard ceiling. Candidates exceeding the budget receive a steep penalty on top of the normal efficiency score. Set via CLI or `GEPA_TOKEN_BUDGET` env var. + +To use the legacy evaluator instead: `--evaluator legacy`. ### Built-in Scorers @@ -796,15 +832,18 @@ uv run python .test/scripts/optimize.py my-new-skill --dry-run Output: ``` -=== Dry Run: my-new-skill === +=== Dry Run: my-new-skill (skillbench) === SKILL.md path: .claude/skills/my-new-skill/SKILL.md -Components: ['skill_md'] -Total original tokens: 2,847 +[SkillBench] Auto-including tools: 16 modules, 88 tools, 64,675 chars +Components: ['skill_md', 'tools_sql', 'tools_serving', ...] +Total original tokens: 20,147 skill_md: 2,847 tokens + tools_sql: 3,200 tokens + ... 
Train tasks: 4 Val tasks: None (single-task mode) -Mode: static -Preset: standard (max_metric_calls=50, scaled for 1 component(s)) +Evaluator type: skillbench +Preset: standard (max_metric_calls=850, scaled for 17 component(s)) Max passes: 5 Reflection LM: databricks/databricks-claude-opus-4-6 Current score: 0.723 @@ -816,7 +855,14 @@ Current score: 0.723 # Quick first pass to see if GEPA can improve uv run python .test/scripts/optimize.py my-new-skill --preset quick -# If score improves, run standard for better results +# Review the saved result +cat .test/skills/my-new-skill/optimized_SKILL.md +diff .claude/skills/my-new-skill/SKILL.md .test/skills/my-new-skill/optimized_SKILL.md + +# Happy with it? Apply without re-running +uv run python .test/scripts/optimize.py my-new-skill --apply-last + +# Or run standard for better results and apply immediately uv run python .test/scripts/optimize.py my-new-skill --preset standard --apply ``` @@ -885,9 +931,11 @@ mlflow autolog claude -u databricks -n "$MLFLOW_EXPERIMENT_NAME" . ``` .test/skills// -├── manifest.yaml # Scorers, guidelines, trace expectations -├── ground_truth.yaml # Verified test cases -└── candidates.yaml # Pending review +├── manifest.yaml # Scorers, guidelines, trace expectations +├── ground_truth.yaml # Verified test cases +├── candidates.yaml # Pending review +├── optimized_SKILL.md # Last optimization output (auto-saved) +└── last_optimization.json # Metadata for --apply-last .test/baselines// └── baseline.yaml # Regression baseline diff --git a/.test/scripts/generate_examples.py b/.test/scripts/generate_examples.py new file mode 100644 index 00000000..890b5bc8 --- /dev/null +++ b/.test/scripts/generate_examples.py @@ -0,0 +1,428 @@ +#!/usr/bin/env python3 +"""Generate test cases from user requirements for skill optimization. + +Converts natural-language requirements into ground_truth.yaml test cases +with binary assertions (patterns + facts) and guidelines for LLM judge +evaluation. 
+ +Usage: + # From a requirements file (one requirement per line) + uv run python .test/scripts/generate_examples.py databricks-metric-views \ + --requirements requirements.txt + + # Inline requirements (repeatable) + uv run python .test/scripts/generate_examples.py databricks-metric-views \ + --requirement "Must explain MEASURE() wrapping for all measure references" \ + --requirement "Should show error handling when SELECT * is used on metric views" + + # Interactive mode (prompts for requirements) + uv run python .test/scripts/generate_examples.py databricks-metric-views --interactive + + # Auto-append to ground_truth.yaml (skip manual review) + uv run python .test/scripts/generate_examples.py databricks-metric-views \ + --requirement "Must explain MEASURE() wrapping" --trust + + # With a second LLM pass to tighten assertions + uv run python .test/scripts/generate_examples.py databricks-metric-views \ + --requirement "Must explain MEASURE() wrapping" --refine +""" + +import argparse +import json +import re +import sys +from pathlib import Path +from typing import Any + +sys.path.insert(0, str(Path(__file__).resolve().parent)) +from _common import setup_path + +setup_path() + + +def _find_skill_md(skill_name: str) -> str: + """Load SKILL.md content for the given skill.""" + from skill_test.optimize.evaluator import _find_skill_md as find_md + path = find_md(skill_name) + if path is None: + raise FileNotFoundError(f"Could not find SKILL.md for '{skill_name}'") + return path.read_text() + + +def _load_existing_ids(skill_name: str) -> set[str]: + """Load existing test case IDs from ground_truth.yaml.""" + import yaml + gt_path = Path(".test/skills") / skill_name / "ground_truth.yaml" + if not gt_path.exists(): + return set() + with open(gt_path) as f: + data = yaml.safe_load(f) or {} + return {tc["id"] for tc in data.get("test_cases", []) if "id" in tc} + + +def generate_examples_from_requirements( + skill_name: str, + requirements: list[str], + skill_md: str, + 
gen_model: str, + count_per_requirement: int = 3, +) -> list[dict]: + """Generate test cases from requirements using an LLM. + + For each requirement, generates ``count_per_requirement`` test cases + grounded in the SKILL.md content. + + Returns: + List of test case dicts in ground_truth.yaml format. + """ + import litellm + + existing_ids = _load_existing_ids(skill_name) + all_examples: list[dict] = [] + + for req_idx, requirement in enumerate(requirements): + print(f"\n Generating for requirement {req_idx + 1}/{len(requirements)}:") + print(f" {requirement[:100]}") + + messages = [ + { + "role": "system", + "content": ( + "You are an expert test case generator for Databricks skills. " + "Given a SKILL.md document and a user requirement, generate " + f"{count_per_requirement} realistic test cases that evaluate " + "whether the skill satisfies the requirement.\n\n" + "Each test case must include:\n" + "- A realistic user prompt\n" + "- A reference response grounded in SKILL.md\n" + "- Binary assertions: expected_patterns (regex) and expected_facts (substring)\n" + "- Guidelines derived from the requirement (for LLM judge evaluation)\n" + "- Category and difficulty metadata\n\n" + "Return a JSON array of test cases. 
Each test case:\n" + "{\n" + ' "prompt": "user question",\n' + ' "response": "reference answer grounded in SKILL.md",\n' + ' "expected_patterns": [{"pattern": "regex", "min_count": 1, "description": "what it checks"}],\n' + ' "expected_facts": ["substring that must appear"],\n' + ' "guidelines": ["evaluation guideline from the requirement"],\n' + ' "category": "happy_path|edge_case|error_handling",\n' + ' "difficulty": "easy|medium|hard"\n' + "}\n\n" + "Important:\n" + "- Patterns should be regex that work with re.findall(pattern, response, re.IGNORECASE)\n" + "- Facts should be exact substrings (case-insensitive) from the response\n" + "- Guidelines should be evaluable by an LLM judge looking at the response\n" + "- Ground everything in SKILL.md — don't invent APIs or syntax" + ), + }, + { + "role": "user", + "content": ( + f"## SKILL.md for '{skill_name}':\n\n" + f"{skill_md[:8000]}\n\n" + f"## Requirement:\n{requirement}\n\n" + f"## Existing test case IDs (avoid duplicates):\n" + f"{', '.join(sorted(existing_ids)[:20]) or 'None'}\n\n" + f"Generate {count_per_requirement} test cases as a JSON array." 
+ ), + }, + ] + + try: + resp = litellm.completion( + model=gen_model, + messages=messages, + temperature=0.7, + ) + content = resp.choices[0].message.content or "" + + # Extract JSON array from response + json_match = re.search(r"\[.*\]", content, re.DOTALL) + if not json_match: + print(f" Warning: no JSON array found in response") + continue + + cases = json.loads(json_match.group()) + if not isinstance(cases, list): + print(f" Warning: expected JSON array, got {type(cases)}") + continue + + for case_idx, case in enumerate(cases): + test_id = f"{skill_name}_gen_{req_idx:02d}_{case_idx:02d}" + # Avoid collisions with existing IDs + while test_id in existing_ids: + test_id += "_x" + existing_ids.add(test_id) + + example = { + "id": test_id, + "inputs": {"prompt": case.get("prompt", "")}, + "outputs": { + "response": case.get("response", ""), + "execution_success": True, + }, + "expectations": {}, + "metadata": { + "category": case.get("category", "happy_path"), + "difficulty": case.get("difficulty", "medium"), + "source": "generated_from_requirement", + "requirement": requirement[:200], + }, + } + + if case.get("expected_patterns"): + example["expectations"]["expected_patterns"] = case["expected_patterns"] + if case.get("expected_facts"): + example["expectations"]["expected_facts"] = case["expected_facts"] + if case.get("guidelines"): + example["expectations"]["guidelines"] = case["guidelines"] + + all_examples.append(example) + + print(f" Generated {len(cases)} test case(s)") + + except Exception as e: + print(f" Error generating for requirement: {e}") + + return all_examples + + +def refine_examples(examples: list[dict], gen_model: str) -> list[dict]: + """Second LLM pass to validate and tighten assertions.""" + import litellm + + for ex in examples: + prompt_text = ex["inputs"]["prompt"][:200] + response_text = ex["outputs"]["response"][:1000] + patterns = ex["expectations"].get("expected_patterns", []) + facts = ex["expectations"].get("expected_facts", []) + 
guidelines = ex["expectations"].get("guidelines", []) + + messages = [ + { + "role": "system", + "content": ( + "You are reviewing auto-generated test expectations. " + "Validate that patterns actually match the response, " + "facts are actually present as substrings, and guidelines " + "are clear and evaluable. Tighten or fix as needed. " + "Return JSON only." + ), + }, + { + "role": "user", + "content": ( + f"Prompt: {prompt_text}\n\n" + f"Response: {response_text}\n\n" + f"Patterns: {json.dumps(patterns)}\n" + f"Facts: {json.dumps(facts)}\n" + f"Guidelines: {json.dumps(guidelines)}\n\n" + "Return a JSON object with:\n" + '- "expected_patterns": refined list\n' + '- "expected_facts": refined list (must be exact substrings of response)\n' + '- "guidelines": refined list\n' + "Remove any patterns/facts that don't actually match the response." + ), + }, + ] + + try: + resp = litellm.completion(model=gen_model, messages=messages, temperature=0) + content = resp.choices[0].message.content or "" + json_match = re.search(r"\{.*\}", content, re.DOTALL) + if json_match: + refined = json.loads(json_match.group()) + if "expected_patterns" in refined: + ex["expectations"]["expected_patterns"] = refined["expected_patterns"] + if "expected_facts" in refined: + ex["expectations"]["expected_facts"] = refined["expected_facts"] + if "guidelines" in refined: + ex["expectations"]["guidelines"] = refined["guidelines"] + except Exception as e: + print(f" Warning: refinement failed for {ex['id']}: {e}") + + return examples + + +def save_candidates(examples: list[dict], skill_name: str) -> Path: + """Save generated examples to candidates.yaml for review.""" + import yaml + + output_path = Path(".test/skills") / skill_name / "candidates.yaml" + output_path.parent.mkdir(parents=True, exist_ok=True) + + data = {"test_cases": examples} + with open(output_path, "w") as f: + yaml.dump(data, f, default_flow_style=False, sort_keys=False, allow_unicode=True) + + print(f"\nSaved {len(examples)} 
examples to {output_path}") + print(f"Review and then append to ground_truth.yaml, or re-run with --trust") + return output_path + + +def append_to_ground_truth(examples: list[dict], skill_name: str) -> None: + """Append generated examples to ground_truth.yaml.""" + import yaml + + gt_path = Path(".test/skills") / skill_name / "ground_truth.yaml" + gt_path.parent.mkdir(parents=True, exist_ok=True) + + if gt_path.exists(): + with open(gt_path) as f: + data = yaml.safe_load(f) or {"test_cases": []} + else: + data = {"test_cases": []} + + existing_ids = {tc["id"] for tc in data.get("test_cases", []) if "id" in tc} + new_examples = [ex for ex in examples if ex["id"] not in existing_ids] + + if not new_examples: + print("No new examples to add (all IDs already exist).") + return + + data["test_cases"].extend(new_examples) + + with open(gt_path, "w") as f: + yaml.dump(data, f, default_flow_style=False, sort_keys=False, allow_unicode=True) + + print(f"\nAppended {len(new_examples)} examples to {gt_path}") + + +def run_generation( + skill_name: str, + requirements: list[str], + gen_model: str, + trust: bool = False, + refine: bool = False, + count_per_requirement: int = 3, +) -> list[dict]: + """Run the full generation pipeline. Called by optimize.py --generate-from. + + Returns: + List of generated test case dicts. 
+ """ + skill_md = _find_skill_md(skill_name) + print(f"Generating test cases for '{skill_name}' from {len(requirements)} requirement(s)") + + examples = generate_examples_from_requirements( + skill_name=skill_name, + requirements=requirements, + skill_md=skill_md, + gen_model=gen_model, + count_per_requirement=count_per_requirement, + ) + + if not examples: + print("No examples generated.") + return [] + + if refine: + print("\nRefining assertions with LLM...") + examples = refine_examples(examples, gen_model) + + if trust: + append_to_ground_truth(examples, skill_name) + else: + save_candidates(examples, skill_name) + + return examples + + +def main(): + parser = argparse.ArgumentParser( + description="Generate test cases from requirements for skill optimization", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + parser.add_argument( + "skill_name", + help="Name of the skill (e.g., databricks-metric-views)", + ) + parser.add_argument( + "--requirements", + type=str, + default=None, + metavar="FILE", + help="Path to a requirements file (one requirement per line)", + ) + parser.add_argument( + "--requirement", + action="append", + default=None, + dest="inline_requirements", + help="Inline requirement (repeatable)", + ) + parser.add_argument( + "--interactive", + action="store_true", + help="Interactively enter requirements", + ) + parser.add_argument( + "--gen-model", + default=None, + help="LLM model for generation (default: GEPA_GEN_LM env or Sonnet)", + ) + parser.add_argument( + "--count", + type=int, + default=3, + help="Number of test cases per requirement (default: 3)", + ) + parser.add_argument( + "--trust", + action="store_true", + help="Auto-append to ground_truth.yaml instead of writing candidates.yaml", + ) + parser.add_argument( + "--refine", + action="store_true", + help="Second LLM pass to validate and tighten assertions", + ) + + args = parser.parse_args() + + # Collect requirements from all sources + requirements: 
list[str] = [] + + if args.requirements: + req_path = Path(args.requirements) + if not req_path.exists(): + print(f"Error: requirements file not found: {req_path}") + sys.exit(1) + requirements.extend( + line.strip() for line in req_path.read_text().splitlines() + if line.strip() and not line.strip().startswith("#") + ) + + if args.inline_requirements: + requirements.extend(args.inline_requirements) + + if args.interactive: + print("Enter requirements (one per line, empty line to finish):") + while True: + line = input(" > ").strip() + if not line: + break + requirements.append(line) + + if not requirements: + parser.error("Provide requirements via --requirements, --requirement, or --interactive") + + # Resolve gen_model + gen_model = args.gen_model + if gen_model is None: + from skill_test.optimize.config import DEFAULT_GEN_LM + gen_model = DEFAULT_GEN_LM + + run_generation( + skill_name=args.skill_name, + requirements=requirements, + gen_model=gen_model, + trust=args.trust, + refine=args.refine, + count_per_requirement=args.count, + ) + + +if __name__ == "__main__": + main() diff --git a/.test/scripts/optimize.py b/.test/scripts/optimize.py index 7a1d8548..bbaee3ee 100644 --- a/.test/scripts/optimize.py +++ b/.test/scripts/optimize.py @@ -11,15 +11,15 @@ # Thorough optimization (150 iterations) uv run python .test/scripts/optimize.py databricks-metric-views --preset thorough - # Generative mode (generates fresh responses, more expensive) - uv run python .test/scripts/optimize.py databricks-metric-views --mode generative - - # Apply the optimized result - uv run python .test/scripts/optimize.py databricks-metric-views --apply - # Dry run (show config, dataset info, estimate cost) uv run python .test/scripts/optimize.py databricks-metric-views --dry-run + # Review the saved result then apply (no re-run needed) + uv run python .test/scripts/optimize.py databricks-metric-views --apply-last + + # Run optimization and immediately apply + uv run python 
.test/scripts/optimize.py databricks-metric-views --apply + # Optimize all skills that have ground_truth.yaml test cases uv run python .test/scripts/optimize.py --all """ @@ -88,7 +88,13 @@ def main(): parser.add_argument( "--apply", action="store_true", - help="Apply the optimized SKILL.md and/or tool descriptions", + help="Run optimization and immediately apply the result", + ) + parser.add_argument( + "--apply-last", + action="store_true", + help="Apply the last saved optimization result without re-running " + "(reads from .test/skills//optimized_SKILL.md)", ) parser.add_argument( "--include-tools", @@ -120,6 +126,43 @@ def main(): help="Override max metric calls per pass (default: auto-scaled by preset × components, " "capped at 300 for non-Opus models). Example: --max-metric-calls 100", ) + parser.add_argument( + "--evaluator", + choices=["legacy", "skillbench"], + default="skillbench", + help="Evaluator type: 'skillbench' (measures skill effectiveness via WITH vs " + "WITHOUT comparison, default) or 'legacy' (weighted scoring with keyword " + "matching and token efficiency)", + ) + parser.add_argument( + "--token-budget", + type=int, + default=None, + help="Token budget ceiling. Candidates exceeding this are penalized. " + "Recommended: 50000. Default: GEPA_TOKEN_BUDGET env or disabled.", + ) + parser.add_argument( + "--use-judges", + action="store_true", + help="Enable MLflow LLM judges (Correctness + Guidelines) for richer NL " + "feedback to GEPA's reflection LM. Adds ~10%% judge_quality weight.", + ) + parser.add_argument( + "--generate-from", + type=str, + default=None, + metavar="REQUIREMENTS_FILE", + help="Generate test cases from a requirements file before optimizing. " + "Each line in the file is a requirement.", + ) + parser.add_argument( + "--requirement", + action="append", + default=None, + dest="requirements", + help="Inline requirement for test case generation (repeatable). 
" + "Example: --requirement 'Must explain MEASURE() wrapping'", + ) args = parser.parse_args() @@ -127,7 +170,57 @@ def main(): parser.error("Either provide a skill name or use --all") from skill_test.optimize.runner import optimize_skill - from skill_test.optimize.review import review_optimization, apply_optimization + from skill_test.optimize.review import review_optimization, apply_optimization, load_last_result + + # Handle requirements-driven example generation + if args.generate_from or args.requirements: + if not args.skill_name: + parser.error("Test case generation requires a skill name") + requirements = [] + if args.generate_from: + req_path = Path(args.generate_from) + if not req_path.exists(): + print(f"Error: requirements file not found: {req_path}") + sys.exit(1) + requirements.extend( + line.strip() for line in req_path.read_text().splitlines() + if line.strip() and not line.strip().startswith("#") + ) + if args.requirements: + requirements.extend(args.requirements) + if requirements: + from generate_examples import run_generation + gen_model = args.gen_model + if gen_model is None: + from skill_test.optimize.config import DEFAULT_GEN_LM + gen_model = DEFAULT_GEN_LM + run_generation( + skill_name=args.skill_name, + requirements=requirements, + gen_model=gen_model, + trust=True, # append directly since we're about to optimize + ) + print() + + # Handle --apply-last: load saved result and apply without re-running + if args.apply_last: + if not args.skill_name: + parser.error("--apply-last requires a skill name") + result = load_last_result(args.skill_name) + if result is None: + print(f"No saved optimization found for '{args.skill_name}'.") + print(f"Run optimization first: uv run python .test/scripts/optimize.py {args.skill_name}") + sys.exit(1) + print(f"Applying saved optimization for '{args.skill_name}':") + print(f" Score: {result.original_score:.3f} -> {result.optimized_score:.3f} " + f"({result.improvement:+.3f})") + print(f" Tokens: 
{result.original_token_count:,} -> {result.optimized_token_count:,}") + try: + apply_optimization(result) + sys.exit(0) + except Exception as e: + print(f"Error applying: {e}") + sys.exit(1) if args.all: # Find all skills with ground_truth.yaml @@ -158,6 +251,9 @@ def main(): dry_run=args.dry_run, max_passes=args.max_passes, max_metric_calls=args.max_metric_calls, + evaluator_type=args.evaluator, + token_budget=args.token_budget, + use_judges=args.use_judges, ) review_optimization(result) if args.apply and not args.dry_run: @@ -193,6 +289,9 @@ def main(): dry_run=args.dry_run, max_passes=args.max_passes, max_metric_calls=args.max_metric_calls, + evaluator_type=args.evaluator, + token_budget=args.token_budget, + use_judges=args.use_judges, ) review_optimization(result) if args.apply and not args.dry_run: diff --git a/.test/skills/databricks-metric-views/ground_truth.yaml b/.test/skills/databricks-metric-views/ground_truth.yaml index 18b53582..13dc0706 100644 --- a/.test/skills/databricks-metric-views/ground_truth.yaml +++ b/.test/skills/databricks-metric-views/ground_truth.yaml @@ -2,43 +2,23 @@ metadata: skill_name: databricks-metric-views version: 0.1.0 created_at: '2026-02-23T08:00:00.000000' - test_cases: - id: metric-views_create_sql_001 inputs: - prompt: "Create a metric view for order analytics with revenue and order count measures" + prompt: Create a metric view for order analytics with revenue and order count + measures outputs: - response: | - Here's a metric view for order analytics: - - ```sql - CREATE OR REPLACE VIEW catalog.schema.orders_metrics - WITH METRICS - LANGUAGE YAML - AS $$ - version: 1.1 - comment: "Orders KPIs for sales analysis" - source: catalog.schema.orders - filter: order_date > '2020-01-01' - dimensions: - - name: Order Month - expr: DATE_TRUNC('MONTH', order_date) - comment: "Month of order" - - name: Order Status - expr: CASE - WHEN status = 'O' THEN 'Open' - WHEN status = 'P' THEN 'Processing' - WHEN status = 'F' THEN 'Fulfilled' - 
END - measures: - - name: Order Count - expr: COUNT(1) - - name: Total Revenue - expr: SUM(total_price) - - name: Revenue per Customer - expr: SUM(total_price) / COUNT(DISTINCT customer_id) - $$ - ``` + response: "Here's a metric view for order analytics:\n\n```sql\nCREATE OR REPLACE\ + \ VIEW catalog.schema.orders_metrics\nWITH METRICS\nLANGUAGE YAML\nAS $$\n \ + \ version: 1.1\n comment: \"Orders KPIs for sales analysis\"\n source: catalog.schema.orders\n\ + \ filter: order_date > '2020-01-01'\n dimensions:\n - name: Order Month\n\ + \ expr: DATE_TRUNC('MONTH', order_date)\n comment: \"Month of order\"\ + \n - name: Order Status\n expr: CASE\n WHEN status = 'O' THEN\ + \ 'Open'\n WHEN status = 'P' THEN 'Processing'\n WHEN status =\ + \ 'F' THEN 'Fulfilled'\n END\n measures:\n - name: Order Count\n\ + \ expr: COUNT(1)\n - name: Total Revenue\n expr: SUM(total_price)\n\ + \ - name: Revenue per Customer\n expr: SUM(total_price) / COUNT(DISTINCT\ + \ customer_id)\n$$\n```\n" execution_success: true expectations: expected_facts: @@ -70,25 +50,15 @@ test_cases: category: happy_path difficulty: easy source: auto_generated - - id: metric-views_query_measure_002 inputs: - prompt: "Query a metric view to get total revenue and order count by month" + prompt: Query a metric view to get total revenue and order count by month outputs: - response: | - All measures must use the MEASURE() function. SELECT * is NOT supported. - - ```sql - SELECT - `Order Month`, - `Order Status`, - MEASURE(`Total Revenue`) AS total_revenue, - MEASURE(`Order Count`) AS order_count - FROM catalog.schema.orders_metrics - WHERE extract(year FROM `Order Month`) = 2024 - GROUP BY ALL - ORDER BY ALL - ``` + response: "All measures must use the MEASURE() function. 
SELECT * is NOT supported.\n\ + \n```sql\nSELECT\n `Order Month`,\n `Order Status`,\n MEASURE(`Total Revenue`)\ + \ AS total_revenue,\n MEASURE(`Order Count`) AS order_count\nFROM catalog.schema.orders_metrics\n\ + WHERE extract(year FROM `Order Month`) = 2024\nGROUP BY ALL\nORDER BY ALL\n\ + ```\n" execution_success: true expectations: expected_facts: @@ -109,30 +79,19 @@ test_cases: category: happy_path difficulty: easy source: auto_generated - - id: metric-views_create_mcp_003 inputs: - prompt: "Use the manage_metric_views tool to create a metric view for order analytics" + prompt: Use the manage_metric_views tool to create a metric view for order analytics outputs: - response: | - ```python - manage_metric_views( - action="create", - full_name="catalog.schema.orders_metrics", - source="catalog.schema.orders", - or_replace=True, - comment="Orders KPIs for sales analysis", - filter_expr="order_date > '2020-01-01'", - dimensions=[ - {"name": "Order Month", "expr": "DATE_TRUNC('MONTH', order_date)", "comment": "Month of order"}, - {"name": "Order Status", "expr": "status"}, - ], - measures=[ - {"name": "Order Count", "expr": "COUNT(1)"}, - {"name": "Total Revenue", "expr": "SUM(total_price)", "comment": "Sum of total price"}, - ], - ) - ``` + response: "```python\nmanage_metric_views(\n action=\"create\",\n full_name=\"\ + catalog.schema.orders_metrics\",\n source=\"catalog.schema.orders\",\n \ + \ or_replace=True,\n comment=\"Orders KPIs for sales analysis\",\n filter_expr=\"\ + order_date > '2020-01-01'\",\n dimensions=[\n {\"name\": \"Order Month\"\ + , \"expr\": \"DATE_TRUNC('MONTH', order_date)\", \"comment\": \"Month of order\"\ + },\n {\"name\": \"Order Status\", \"expr\": \"status\"},\n ],\n \ + \ measures=[\n {\"name\": \"Order Count\", \"expr\": \"COUNT(1)\"},\n\ + \ {\"name\": \"Total Revenue\", \"expr\": \"SUM(total_price)\", \"comment\"\ + : \"Sum of total price\"},\n ],\n)\n```\n" execution_success: true expectations: expected_facts: @@ -159,23 
+118,16 @@ test_cases: category: happy_path difficulty: easy source: auto_generated - - id: metric-views_query_mcp_004 inputs: - prompt: "Query the orders_metrics metric view using the MCP tool to get revenue by month" + prompt: Query the orders_metrics metric view using the MCP tool to get revenue + by month outputs: - response: | - ```python - manage_metric_views( - action="query", - full_name="catalog.schema.orders_metrics", - query_measures=["Total Revenue", "Order Count"], - query_dimensions=["Order Month"], - where="extract(year FROM `Order Month`) = 2024", - order_by="ALL", - limit=100, - ) - ``` + response: "```python\nmanage_metric_views(\n action=\"query\",\n full_name=\"\ + catalog.schema.orders_metrics\",\n query_measures=[\"Total Revenue\", \"\ + Order Count\"],\n query_dimensions=[\"Order Month\"],\n where=\"extract(year\ + \ FROM `Order Month`) = 2024\",\n order_by=\"ALL\",\n limit=100,\n)\n\ + ```\n" execution_success: true expectations: expected_facts: @@ -201,37 +153,21 @@ test_cases: category: happy_path difficulty: medium source: auto_generated - - id: metric-views_yaml_spec_005 inputs: - prompt: "Show me the full YAML specification for a metric view definition" + prompt: Show me the full YAML specification for a metric view definition outputs: - response: | - ```yaml - version: 1.1 # Required: "1.1" for DBR 17.2+ - comment: "Description" # Optional - source: catalog.schema.table # Required: source table/view - filter: column > value # Optional: global WHERE filter - - dimensions: # Required: at least one - - name: Display Name - expr: sql_expression - comment: "Description" - - measures: # Required: at least one - - name: Display Name - expr: AGG_FUNC(column) - comment: "Description" - - joins: # Optional: star/snowflake schema - - name: dim_table - source: catalog.schema.dim_table - on: source.fk = dim_table.pk - - materialization: # Optional (experimental) - schedule: every 6 hours - mode: relaxed - ``` + response: "```yaml\nversion: 1.1 # 
Required: \"1.1\" for DBR\ + \ 17.2+\ncomment: \"Description\" # Optional\nsource: catalog.schema.table\ + \ # Required: source table/view\nfilter: column > value # Optional:\ + \ global WHERE filter\n\ndimensions: # Required: at least\ + \ one\n - name: Display Name\n expr: sql_expression\n comment: \"Description\"\ + \n\nmeasures: # Required: at least one\n - name: Display\ + \ Name\n expr: AGG_FUNC(column)\n comment: \"Description\"\n\njoins: \ + \ # Optional: star/snowflake schema\n - name: dim_table\n\ + \ source: catalog.schema.dim_table\n on: source.fk = dim_table.pk\n\n\ + materialization: # Optional (experimental)\n schedule: every\ + \ 6 hours\n mode: relaxed\n```\n" execution_success: true expectations: expected_facts: @@ -259,37 +195,19 @@ test_cases: category: happy_path difficulty: medium source: auto_generated - - id: metric-views_star_schema_006 inputs: - prompt: "Create a metric view with joins for a star schema (fact orders + dim customers)" + prompt: Create a metric view with joins for a star schema (fact orders + dim customers) outputs: - response: | - ```sql - CREATE OR REPLACE VIEW catalog.schema.sales_metrics - WITH METRICS - LANGUAGE YAML - AS $$ - version: 1.1 - source: catalog.schema.fact_orders - joins: - - name: customers - source: catalog.schema.dim_customers - on: fact_orders.customer_id = customers.customer_id - dimensions: - - name: Customer Region - expr: customers.region - - name: Order Month - expr: DATE_TRUNC('MONTH', order_date) - measures: - - name: Total Revenue - expr: SUM(amount) - - name: Unique Customers - expr: COUNT(DISTINCT customers.customer_id) - $$ - ``` - - Joins must be declared in the YAML definition, not in the SELECT query. 
+ response: "```sql\nCREATE OR REPLACE VIEW catalog.schema.sales_metrics\nWITH METRICS\n\ + LANGUAGE YAML\nAS $$\n version: 1.1\n source: catalog.schema.fact_orders\n\ + \ joins:\n - name: customers\n source: catalog.schema.dim_customers\n\ + \ on: fact_orders.customer_id = customers.customer_id\n dimensions:\n\ + \ - name: Customer Region\n expr: customers.region\n - name: Order\ + \ Month\n expr: DATE_TRUNC('MONTH', order_date)\n measures:\n - name:\ + \ Total Revenue\n expr: SUM(amount)\n - name: Unique Customers\n \ + \ expr: COUNT(DISTINCT customers.customer_id)\n$$\n```\n\nJoins must be declared\ + \ in the YAML definition, not in the SELECT query.\n" execution_success: true expectations: expected_facts: @@ -299,7 +217,7 @@ test_cases: - pattern: joins min_count: 1 description: Must use joins section in YAML - - pattern: "on:" + - pattern: 'on:' min_count: 1 description: Must specify join condition guidelines: @@ -309,20 +227,13 @@ test_cases: category: happy_path difficulty: medium source: auto_generated - - id: metric-views_grant_access_007 inputs: - prompt: "Grant SELECT access on a metric view to the data-consumers group" + prompt: Grant SELECT access on a metric view to the data-consumers group outputs: - response: | - ```python - manage_metric_views( - action="grant", - full_name="catalog.schema.orders_metrics", - principal="data-consumers", - privileges=["SELECT"], - ) - ``` + response: "```python\nmanage_metric_views(\n action=\"grant\",\n full_name=\"\ + catalog.schema.orders_metrics\",\n principal=\"data-consumers\",\n privileges=[\"\ + SELECT\"],\n)\n```\n" execution_success: true expectations: expected_facts: @@ -344,3 +255,1092 @@ test_cases: category: happy_path difficulty: hard source: auto_generated +- id: metric-views_select_star_error_008 + inputs: + prompt: "Why does SELECT * not work on a metric view?" + outputs: + response: | + `SELECT *` is NOT supported on metric views. 
You must explicitly list dimensions and wrap all measure references in `MEASURE()`. + + Incorrect: + ```sql + SELECT * FROM catalog.schema.orders_metrics + ``` + + Correct: + ```sql + SELECT + `Order Month`, + MEASURE(`Total Revenue`) AS total_revenue, + MEASURE(`Order Count`) AS order_count + FROM catalog.schema.orders_metrics + GROUP BY ALL + ``` + expectations: + expected_facts: + - SELECT * is not supported + - MEASURE() + expected_patterns: + - pattern: "SELECT \\*" + min_count: 1 + description: "Must show the incorrect SELECT * example" + - pattern: "MEASURE\\(" + min_count: 1 + description: "Must show correct MEASURE() usage" + - pattern: "GROUP BY" + min_count: 1 + description: "Must include GROUP BY" + guidelines: + - "Must clearly explain why SELECT * does not work on metric views" + - "Must show both the incorrect way and the correct way" + metadata: + category: error_handling + difficulty: easy + +- id: metric-views_backtick_quoting_009 + inputs: + prompt: "I'm getting 'Cannot resolve column' when querying my metric view. The dimension is called 'Order Month'." + outputs: + response: | + Dimension and measure names with spaces must be backtick-quoted in queries: + + ```sql + -- Wrong: unquoted name with spaces + SELECT Order Month, MEASURE(Total Revenue) FROM my_metrics GROUP BY ALL + + -- Correct: backtick-quoted + SELECT `Order Month`, MEASURE(`Total Revenue`) AS revenue + FROM catalog.schema.my_metrics + GROUP BY ALL + ``` + + Both dimensions in SELECT/GROUP BY and measures inside `MEASURE()` need backticks if the name contains spaces. 
+ expectations: + expected_facts: + - backtick + - MEASURE() + expected_patterns: + - pattern: "`Order Month`" + min_count: 1 + description: "Must show backtick-quoted dimension name" + - pattern: "MEASURE\\(`" + min_count: 1 + description: "Must show backtick-quoted measure inside MEASURE()" + guidelines: + - "Must explain that names with spaces need backtick quoting" + - "Must show both wrong and correct syntax" + metadata: + category: error_handling + difficulty: easy + +- id: metric-views_alter_010 + inputs: + prompt: "Add a new measure 'Average Order Value' to my existing orders_metrics metric view" + outputs: + response: | + Use the `alter` action to update an existing metric view's definition: + + ```python + manage_metric_views( + action="alter", + full_name="catalog.schema.orders_metrics", + measures=[ + {"name": "Order Count", "expr": "COUNT(1)"}, + {"name": "Total Revenue", "expr": "SUM(total_price)"}, + {"name": "Average Order Value", "expr": "SUM(total_price) / COUNT(1)", "comment": "Revenue divided by order count"}, + ], + ) + ``` + + Note: When altering, you must provide the full list of measures (existing + new), not just the new one. + expectations: + expected_facts: + - manage_metric_views + - alter + expected_patterns: + - pattern: "action.*alter" + min_count: 1 + description: "Must use alter action" + - pattern: "Average Order Value" + min_count: 1 + description: "Must include the new measure" + guidelines: + - "Must use manage_metric_views with action='alter'" + - "Must explain that the full measure list is required when altering" + metadata: + category: happy_path + difficulty: medium + +- id: metric-views_describe_011 + inputs: + prompt: "How do I see the current definition of a metric view?" 
+ outputs: + response: | + Use the `describe` action to get the full definition: + + ```python + manage_metric_views( + action="describe", + full_name="catalog.schema.orders_metrics", + ) + ``` + + This returns the YAML definition, dimensions, measures, source table, and any join or materialization configuration. + expectations: + expected_facts: + - manage_metric_views + - describe + expected_patterns: + - pattern: "action.*describe" + min_count: 1 + description: "Must use describe action" + - pattern: "full_name" + min_count: 1 + description: "Must specify the metric view name" + metadata: + category: happy_path + difficulty: easy + +- id: metric-views_when_not_to_use_012 + inputs: + prompt: "Should I use a metric view or a regular view for my dashboard?" + outputs: + response: | + Use a **metric view** when: + - You need flexible aggregation at query time (different GROUP BY per dashboard panel) + - You have ratios that need safe re-aggregation (e.g., revenue per customer) + - You want star/snowflake schema joins declared in the definition + - You need AI/BI Genie integration for natural language querying + + Use a **standard view** when: + - Your query is a simple SELECT with no aggregations + - The aggregation is always the same (no need for MEASURE() flexibility) + - You need features not yet supported by metric views (e.g., UDFs, complex CTEs) + + Key difference: standard views lock aggregation at creation time. Metric views separate dimensions from measures so the same metric can be sliced differently in each query. 
+ expectations: + expected_facts: + - MEASURE() + - Genie + expected_patterns: + - pattern: "metric view" + min_count: 2 + description: "Must discuss metric views" + - pattern: "standard view|regular view" + min_count: 1 + description: "Must contrast with standard views" + - pattern: "aggregat" + min_count: 1 + description: "Must discuss aggregation behavior" + guidelines: + - "Must explain the key difference: standard views lock aggregation, metric views are flexible" + - "Should mention at least one scenario where a standard view is better" + metadata: + category: conceptual + difficulty: medium + +- id: metric-views_filtered_measure_013 + inputs: + prompt: "Create a metric view with a filtered measure that only counts delivered orders" + outputs: + response: | + Use a CASE expression inside the aggregate to create a filtered measure: + + ```sql + CREATE OR REPLACE VIEW catalog.schema.orders_metrics + WITH METRICS + LANGUAGE YAML + AS $$ + version: 1.1 + source: catalog.schema.orders + dimensions: + - name: Order Month + expr: DATE_TRUNC('MONTH', order_date) + measures: + - name: Total Orders + expr: COUNT(1) + - name: Delivered Orders + expr: COUNT(CASE WHEN delivery_status = 'delivered' THEN 1 END) + comment: "Only counts delivered orders" + - name: Delivery Rate + expr: COUNT(CASE WHEN delivery_status = 'delivered' THEN 1 END) * 1.0 / COUNT(1) + comment: "Ratio of delivered to total orders" + $$ + ``` + + Filtered measures use CASE WHEN inside the aggregate function to restrict which rows are counted. 
+ expectations: + expected_facts: + - CASE WHEN + - CREATE OR REPLACE VIEW + expected_patterns: + - pattern: "WITH METRICS" + min_count: 1 + description: "Must use metric view syntax" + - pattern: "CASE WHEN" + min_count: 1 + description: "Must use CASE WHEN for filtered measure" + - pattern: "COUNT\\(" + min_count: 2 + description: "Must have both total and filtered count" + guidelines: + - "Must show how to filter rows within a measure expression" + - "Should include a ratio measure that uses the filtered count" + metadata: + category: happy_path + difficulty: medium + +- id: metric-views_materialization_014 + inputs: + prompt: "How do I enable materialization on a metric view for better query performance?" + outputs: + response: | + Add a `materialization` section to the YAML definition. This is experimental and requires serverless compute: + + ```sql + CREATE OR REPLACE VIEW catalog.schema.orders_metrics + WITH METRICS + LANGUAGE YAML + AS $$ + version: 1.1 + source: catalog.schema.orders + dimensions: + - name: Order Month + expr: DATE_TRUNC('MONTH', order_date) + measures: + - name: Total Revenue + expr: SUM(total_price) + materialization: + schedule: every 6 hours + mode: relaxed + $$ + ``` + + The `schedule` controls how often the pre-computed aggregations refresh. The `mode: relaxed` allows queries to return slightly stale data for better performance. + + Note: Materialization is currently experimental and requires serverless compute to be enabled on your workspace. 
+ expectations: + expected_facts: + - materialization + - serverless + - experimental + expected_patterns: + - pattern: "materialization" + min_count: 1 + description: "Must include materialization section" + - pattern: "schedule" + min_count: 1 + description: "Must specify refresh schedule" + guidelines: + - "Must clearly state that materialization is experimental" + - "Must mention serverless compute requirement" + metadata: + category: advanced + difficulty: hard + +- id: metric-views_join_at_query_time_015 + inputs: + prompt: "Can I JOIN my metric view with another table in a query?" + outputs: + response: | + No — joins must be declared in the YAML definition, not at query time. A JOIN in the SELECT query is not supported for metric views. + + Instead, add the join to the metric view definition: + + ```python + manage_metric_views( + action="alter", + full_name="catalog.schema.sales_metrics", + joins=[ + { + "name": "customers", + "source": "catalog.schema.dim_customers", + "on": "orders.customer_id = customers.customer_id", + } + ], + ) + ``` + + After adding the join, you can reference columns from the joined table in dimensions and measures. + expectations: + expected_facts: + - joins + - YAML definition + expected_patterns: + - pattern: "joins" + min_count: 1 + description: "Must mention joins" + - pattern: "manage_metric_views|CREATE OR REPLACE" + min_count: 1 + description: "Must show how to add joins to the definition" + guidelines: + - "Must clearly state that JOINs at query time are NOT supported" + - "Must show how to add joins to the metric view definition instead" + metadata: + category: error_handling + difficulty: medium + +- id: metric-views_dbr_version_016 + inputs: + prompt: "I'm getting an error when creating a metric view. I'm on DBR 15.4." + outputs: + response: | + Metric views with YAML version 1.1 require **Databricks Runtime 17.2+**. DBR 15.4 does not support this feature. + + Options: + 1. Upgrade your cluster to DBR 17.2 or later + 2. 
Use a SQL warehouse (which always runs the latest runtime) + 3. If you must use an older runtime, YAML version 0.1 is supported on DBR 16.4+ but has fewer features (no comments on dimensions/measures, no materialization) + + ```sql + -- Check your DBR version + SELECT current_version() + ``` + expectations: + expected_facts: + - "17.2" + - SQL warehouse + expected_patterns: + - pattern: "17\\.2" + min_count: 1 + description: "Must mention DBR 17.2 requirement" + guidelines: + - "Must explain the minimum DBR version requirement" + - "Should suggest SQL warehouse as an alternative to cluster upgrades" + metadata: + category: error_handling + difficulty: easy + +- id: metric-views_drop_action_017 + inputs: + prompt: Drop the orders_metrics metric view from the catalog + outputs: + response: | + Use the `drop` action to remove a metric view: + + ```python + manage_metric_views( + action="drop", + full_name="catalog.schema.orders_metrics", + ) + ``` + + This drops the metric view definition. If materialization was enabled, the associated pipeline is also removed. The underlying source table is NOT affected. + execution_success: true + expectations: + expected_facts: + - manage_metric_views + - drop + - source table is not affected + expected_patterns: + - pattern: manage_metric_views + min_count: 1 + description: Must use manage_metric_views tool + - pattern: action.*drop + min_count: 1 + description: Must set action to drop + - pattern: full_name + min_count: 1 + description: Must specify the metric view name + guidelines: + - Must use manage_metric_views with action="drop" + - Should clarify that the source table is not deleted + metadata: + category: happy_path + difficulty: easy + source: manual + +- id: metric-views_window_rolling_avg_018 + inputs: + prompt: Create a metric view with a rolling 7-day average of daily revenue + outputs: + response: | + Window measures use `version: 0.1` (experimental). 
Here's a rolling 7-day trailing revenue measure: + + ```sql + CREATE OR REPLACE VIEW catalog.schema.revenue_trends + WITH METRICS + LANGUAGE YAML + AS $$ + version: 0.1 + source: catalog.schema.daily_sales + filter: sale_date > DATE'2024-01-01' + + dimensions: + - name: date + expr: sale_date + + measures: + - name: daily_revenue + expr: SUM(amount) + window: + - order: date + range: current + semiadditive: last + + - name: trailing_7d_revenue + expr: SUM(amount) + window: + - order: date + range: trailing 7 day + semiadditive: last + + - name: rolling_7d_avg + expr: MEASURE(trailing_7d_revenue) / 7 + $$ + ``` + + Query it with the standard `MEASURE()` syntax: + + ```sql + SELECT + date, + MEASURE(daily_revenue) AS daily, + MEASURE(rolling_7d_avg) AS rolling_avg + FROM catalog.schema.revenue_trends + GROUP BY ALL + ORDER BY date + ``` + + Key points: + - `trailing 7 day` includes the 7 days **before** the current date, excluding the current date + - `semiadditive: last` returns the last value when `date` is not in GROUP BY + - The derived `rolling_7d_avg` uses `MEASURE()` to reference another window measure and doesn't need its own `window` block + execution_success: true + expectations: + expected_facts: + - version 0.1 + - window + - trailing 7 day + - semiadditive + expected_patterns: + - pattern: "version: 0.1" + min_count: 1 + description: Window measures require version 0.1 + - pattern: "window:" + min_count: 1 + description: Must include window block + - pattern: "trailing 7 day" + min_count: 1 + description: Must use trailing window range + - pattern: "semiadditive" + min_count: 1 + description: Must specify semiadditive behavior + - pattern: "MEASURE\\(" + min_count: 1 + description: Must show MEASURE() for derived measure or query + guidelines: + - Must use version 0.1 for window measures + - Must explain trailing range semantics (excludes current day) + - Must show how derived measures reference window measures via MEASURE() + metadata: + category: 
advanced + difficulty: hard + source: manual + +- id: metric-views_genie_integration_019 + inputs: + prompt: How do I use my metric view with AI/BI Genie? + outputs: + response: | + Metric views work natively with AI/BI Genie for natural language querying of your metrics. Here's how to set it up: + + 1. **Create a Genie Space** and add your metric view as a data source (just like adding a table). + + 2. **Ask questions in natural language** — Genie automatically uses `MEASURE()` to query your metrics: + - "What was total revenue last quarter?" + - "Show me order count by region for 2024" + - "Which product category has the highest profit margin?" + + 3. **Benefits of metric views with Genie:** + - Genie understands dimension/measure semantics — no ambiguity about how to aggregate + - Comments on dimensions and measures improve Genie's understanding of your data + - Consistent metric definitions across all Genie users + - Star schema joins are handled automatically + + 4. **Tips for better Genie results:** + - Add descriptive `comment` fields to all dimensions and measures + - Use clear, business-friendly names (e.g., "Total Revenue" not "sum_tp") + - Keep dimension names intuitive for natural language queries + + No special configuration is needed — once the metric view is added to a Genie Space, users can query it immediately with natural language. 
+ execution_success: true + expectations: + expected_facts: + - Genie + - natural language + - metric view + - MEASURE() + expected_patterns: + - pattern: "Genie" + min_count: 2 + description: Must discuss Genie integration + - pattern: "natural language" + min_count: 1 + description: Must mention natural language querying + - pattern: "MEASURE|metric view" + min_count: 1 + description: Must mention metric view or MEASURE concepts + guidelines: + - Must explain how to add a metric view to a Genie Space + - Must mention that Genie automatically uses MEASURE() + - Should recommend adding comments for better Genie understanding + metadata: + category: conceptual + difficulty: medium + source: manual + +- id: metric-views_conversational_support_tickets_020 + inputs: + prompt: "hey I have a table of customer support tickets and I want to track resolution\ + \ time metrics, can you help me set that up as a metric view?" + outputs: + response: | + Sure! Here's a metric view for tracking support ticket resolution metrics: + + ```sql + CREATE OR REPLACE VIEW catalog.schema.support_metrics + WITH METRICS + LANGUAGE YAML + AS $$ + version: 1.1 + comment: "Customer support ticket KPIs" + source: catalog.schema.support_tickets + filter: created_at >= '2024-01-01' + + dimensions: + - name: Created Month + expr: DATE_TRUNC('MONTH', created_at) + comment: "Month the ticket was opened" + - name: Priority + expr: CASE + WHEN priority = 1 THEN 'Critical' + WHEN priority = 2 THEN 'High' + WHEN priority = 3 THEN 'Medium' + ELSE 'Low' + END + comment: "Ticket priority level" + - name: Category + expr: ticket_category + comment: "Support category (billing, technical, etc.)" + + measures: + - name: Ticket Count + expr: COUNT(1) + - name: Resolved Tickets + expr: COUNT(1) FILTER (WHERE status = 'resolved') + - name: Avg Resolution Hours + expr: AVG(TIMESTAMPDIFF(HOUR, created_at, resolved_at)) + comment: "Average hours to resolve a ticket" + - name: Resolution Rate + expr: COUNT(1) FILTER 
(WHERE status = 'resolved') * 1.0 / COUNT(1) + comment: "Percentage of tickets resolved" + - name: Median Resolution Hours + expr: PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY TIMESTAMPDIFF(HOUR, created_at, resolved_at)) + comment: "Median hours to resolution" + $$ + ``` + + Query it like this: + + ```sql + SELECT + `Created Month`, + `Priority`, + MEASURE(`Ticket Count`) AS tickets, + MEASURE(`Avg Resolution Hours`) AS avg_hours, + MEASURE(`Resolution Rate`) AS resolution_rate + FROM catalog.schema.support_metrics + GROUP BY ALL + ORDER BY ALL + ``` + execution_success: true + expectations: + expected_facts: + - CREATE OR REPLACE VIEW + - WITH METRICS + - support + - resolution + expected_patterns: + - pattern: "CREATE OR REPLACE VIEW" + min_count: 1 + description: Must use CREATE OR REPLACE VIEW syntax + - pattern: "WITH METRICS" + min_count: 1 + description: Must include WITH METRICS clause + - pattern: "LANGUAGE YAML" + min_count: 1 + description: Must specify LANGUAGE YAML + - pattern: "dimensions" + min_count: 1 + description: Must define dimensions + - pattern: "measures" + min_count: 1 + description: Must define measures + - pattern: "MEASURE\\(" + min_count: 1 + description: Must show MEASURE() in query example + - pattern: "FILTER" + min_count: 1 + description: Should use FILTER clause for conditional measures + guidelines: + - Must create a metric view relevant to support tickets domain + - Must include resolution time metrics + - Must include a query example using MEASURE() + metadata: + category: happy_path + difficulty: medium + source: manual + +- id: metric-views_udf_not_supported_021 + inputs: + prompt: Can I use a Python UDF inside a metric view measure expression? + outputs: + response: | + No — metric view measure expressions only support **SQL aggregate functions** and built-in SQL expressions. Python UDFs, Scala UDFs, and custom functions are not supported in metric view YAML definitions. 
+ + **Supported in measure expressions:** + - SQL aggregate functions: `SUM()`, `COUNT()`, `AVG()`, `MIN()`, `MAX()` + - `COUNT(DISTINCT ...)` + - `FILTER (WHERE ...)` clause + - Arithmetic on aggregates: `SUM(a) / COUNT(b)` + - `CASE WHEN` inside aggregates + - Window measures with `window:` block + + **NOT supported:** + - Python UDFs (`@udf` decorated functions) + - Scala UDFs + - `ai_query()` or other AI functions in measure expressions + - Complex CTEs or subqueries + + **Workaround:** If you need a UDF-computed value, materialize it into the source table first, then reference the pre-computed column in your metric view: + + ```sql + -- Step 1: Create source table with UDF-computed column + CREATE TABLE catalog.schema.enriched_orders AS + SELECT *, my_python_udf(description) AS sentiment_score + FROM catalog.schema.raw_orders; + + -- Step 2: Use the pre-computed column in the metric view + CREATE OR REPLACE VIEW catalog.schema.sentiment_metrics + WITH METRICS + LANGUAGE YAML + AS $$ + version: 1.1 + source: catalog.schema.enriched_orders + dimensions: + - name: Month + expr: DATE_TRUNC('MONTH', order_date) + measures: + - name: Avg Sentiment + expr: AVG(sentiment_score) + $$ + ``` + execution_success: true + expectations: + expected_facts: + - UDF + - not supported + - SQL aggregate functions + expected_patterns: + - pattern: "not supported|NOT supported|No" + min_count: 1 + description: Must clearly state UDFs are not supported + - pattern: "SUM|COUNT|AVG" + min_count: 1 + description: Must list supported aggregate functions + - pattern: "workaround|pre-computed|source table" + min_count: 1 + description: Should suggest a workaround + guidelines: + - Must clearly state that UDFs are not supported in metric views + - Must list what IS supported as alternatives + - Should suggest pre-computing UDF results in the source table as a workaround + metadata: + category: error_handling + difficulty: medium + source: manual + +- id: metric-views_hr_employee_domain_022 
+ inputs: + prompt: Build a metric view for HR analytics tracking employee headcount, attrition, + and tenure from our employees table + outputs: + response: | + ```python + manage_metric_views( + action="create", + full_name="catalog.schema.hr_metrics", + source="catalog.schema.employees", + or_replace=True, + comment="HR analytics: headcount, attrition, and tenure metrics", + dimensions=[ + {"name": "Department", "expr": "department", "comment": "Employee department"}, + {"name": "Hire Year", "expr": "EXTRACT(YEAR FROM hire_date)", "comment": "Year of hire"}, + {"name": "Location", "expr": "office_location"}, + {"name": "Level", "expr": "CASE WHEN job_level <= 3 THEN 'Junior' WHEN job_level <= 5 THEN 'Mid' ELSE 'Senior' END", "comment": "Bucketed seniority level"}, + ], + measures=[ + {"name": "Headcount", "expr": "COUNT(1)"}, + {"name": "Active Employees", "expr": "COUNT(1) FILTER (WHERE termination_date IS NULL)", "comment": "Currently active employees"}, + {"name": "Attrition Count", "expr": "COUNT(1) FILTER (WHERE termination_date IS NOT NULL)", "comment": "Employees who have left"}, + {"name": "Attrition Rate", "expr": "COUNT(1) FILTER (WHERE termination_date IS NOT NULL) * 1.0 / COUNT(1)", "comment": "Percentage of employees who left"}, + {"name": "Avg Tenure Years", "expr": "AVG(DATEDIFF(COALESCE(termination_date, CURRENT_DATE()), hire_date) / 365.25)", "comment": "Average tenure in years"}, + ], + ) + ``` + execution_success: true + expectations: + expected_facts: + - manage_metric_views + - create + - headcount + - attrition + expected_patterns: + - pattern: "manage_metric_views" + min_count: 1 + description: Must use manage_metric_views tool + - pattern: "action.*create" + min_count: 1 + description: Must set action to create + - pattern: "FILTER" + min_count: 1 + description: Should use FILTER clause for conditional measures + - pattern: "dimensions" + min_count: 1 + description: Must define dimensions + - pattern: "measures" + min_count: 1 + 
description: Must define measures + guidelines: + - Must create an HR-relevant metric view (not orders/sales) + - Must include attrition-related measures + - Must use manage_metric_views MCP tool + metadata: + category: happy_path + difficulty: medium + source: manual + +- id: metric-views_migration_from_regular_view_023 + inputs: + prompt: I have this existing view and want to convert it to a metric view. Here's + my current SQL - SELECT region, DATE_TRUNC('MONTH', sale_date) AS month, SUM(amount) + AS revenue, COUNT(*) AS num_sales FROM catalog.schema.transactions GROUP BY 1, + 2 + outputs: + response: | + Here's your existing view converted to a metric view. The key change is separating the GROUP BY columns into **dimensions** and the aggregations into **measures**: + + ```sql + CREATE OR REPLACE VIEW catalog.schema.transaction_metrics + WITH METRICS + LANGUAGE YAML + AS $$ + version: 1.1 + comment: "Transaction metrics - migrated from standard view" + source: catalog.schema.transactions + + dimensions: + - name: Region + expr: region + - name: Month + expr: DATE_TRUNC('MONTH', sale_date) + + measures: + - name: Revenue + expr: SUM(amount) + - name: Num Sales + expr: COUNT(1) + $$ + ``` + + Now you can query it flexibly — the GROUP BY is no longer fixed: + + ```sql + -- Same result as your original view + SELECT `Region`, `Month`, + MEASURE(`Revenue`) AS revenue, + MEASURE(`Num Sales`) AS num_sales + FROM catalog.schema.transaction_metrics + GROUP BY ALL + + -- NEW: Revenue by region only (no month breakdown) + SELECT `Region`, + MEASURE(`Revenue`) AS revenue + FROM catalog.schema.transaction_metrics + GROUP BY ALL + + -- NEW: Monthly totals across all regions + SELECT `Month`, + MEASURE(`Revenue`) AS revenue + FROM catalog.schema.transaction_metrics + GROUP BY ALL + ``` + + **Key differences from standard views:** + - Aggregation is flexible at query time — no need to recreate the view for different GROUP BY + - Measures must be wrapped in `MEASURE()` when 
querying + - `SELECT *` is not supported — you must explicitly list dimensions and measures + execution_success: true + expectations: + expected_facts: + - dimensions + - measures + - GROUP BY + - flexible + expected_patterns: + - pattern: "CREATE OR REPLACE VIEW" + min_count: 1 + description: Must show the new metric view DDL + - pattern: "WITH METRICS" + min_count: 1 + description: Must include WITH METRICS clause + - pattern: "MEASURE\\(" + min_count: 2 + description: Must show MEASURE() in query examples + - pattern: "GROUP BY" + min_count: 1 + description: Must demonstrate flexible GROUP BY + guidelines: + - Must convert the provided SQL into a metric view + - Must explain the dimension/measure separation + - Must show that the metric view enables flexible querying + metadata: + category: happy_path + difficulty: hard + source: manual + +- id: metric-views_semiadditive_balance_024 + inputs: + prompt: Create a metric view for bank account balances that correctly handles aggregation + across time (don't sum balances across dates) + outputs: + response: | + Use a **semiadditive** window measure to prevent summing balances across dates. 
This returns the last date's balance when the date dimension is not in the GROUP BY: + + ```sql + CREATE OR REPLACE VIEW catalog.schema.account_balance_metrics + WITH METRICS + LANGUAGE YAML + AS $$ + version: 0.1 + source: catalog.schema.daily_balances + + dimensions: + - name: date + expr: balance_date + - name: account_type + expr: account_type + + measures: + - name: total_balance + expr: SUM(balance) + window: + - order: date + range: current + semiadditive: last + $$ + ``` + + **How it works:** + - When grouped by `date`: returns the sum of all account balances for that specific date (aggregates across accounts, not time) + - When NOT grouped by `date`: returns the balance from the **most recent date** (`semiadditive: last`) + + ```sql + -- Balance by date and account type + SELECT date, account_type, + MEASURE(total_balance) AS balance + FROM catalog.schema.account_balance_metrics + GROUP BY ALL + + -- Latest total balance by account type (semiadditive kicks in) + SELECT account_type, + MEASURE(total_balance) AS latest_balance + FROM catalog.schema.account_balance_metrics + GROUP BY ALL + ``` + + Note: Window measures require `version: 0.1` (experimental feature). 
+ execution_success: true + expectations: + expected_facts: + - semiadditive + - window + - version 0.1 + - balance + expected_patterns: + - pattern: "semiadditive" + min_count: 1 + description: Must use semiadditive measure + - pattern: "window:" + min_count: 1 + description: Must include window block + - pattern: "version: 0.1" + min_count: 1 + description: Window measures require version 0.1 + - pattern: "MEASURE\\(" + min_count: 1 + description: Must show MEASURE() query syntax + guidelines: + - Must use semiadditive window measure to prevent summing across time + - Must explain the behavior difference when date is vs is not in GROUP BY + - Must note that window measures require version 0.1 + metadata: + category: advanced + difficulty: hard + source: manual + +- id: metric-views_messy_phrasing_025 + inputs: + prompt: "so umm i wanna make like a metrics thing for my product_usage table??\ + \ like track daily active users and feature adoption.. not sure how metric views\ + \ work exactly" + outputs: + response: | + Metric views let you define reusable metrics in YAML that can be sliced by any dimension at query time. 
Here's one for your product usage data: + + ```sql + CREATE OR REPLACE VIEW catalog.schema.product_usage_metrics + WITH METRICS + LANGUAGE YAML + AS $$ + version: 1.1 + comment: "Product usage KPIs: DAU and feature adoption" + source: catalog.schema.product_usage + + dimensions: + - name: Date + expr: usage_date + - name: Feature + expr: feature_name + comment: "Product feature being used" + - name: Platform + expr: platform + comment: "web, ios, android" + + measures: + - name: Daily Active Users + expr: COUNT(DISTINCT user_id) + comment: "Unique users per day" + - name: Total Events + expr: COUNT(1) + - name: Events per User + expr: COUNT(1) * 1.0 / COUNT(DISTINCT user_id) + comment: "Average events per unique user" + $$ + ``` + + Query your metrics: + + ```sql + -- DAU by date + SELECT `Date`, MEASURE(`Daily Active Users`) AS dau + FROM catalog.schema.product_usage_metrics + GROUP BY ALL + ORDER BY `Date` + + -- Feature adoption: DAU by feature + SELECT `Feature`, MEASURE(`Daily Active Users`) AS users, + MEASURE(`Events per User`) AS engagement + FROM catalog.schema.product_usage_metrics + GROUP BY ALL + ORDER BY users DESC + ``` + + The same metric view supports both queries — you just change which dimensions you include in SELECT and GROUP BY. 
+ execution_success: true + expectations: + expected_facts: + - CREATE OR REPLACE VIEW + - WITH METRICS + - product usage + - daily active users + expected_patterns: + - pattern: "CREATE OR REPLACE VIEW" + min_count: 1 + description: Must use CREATE OR REPLACE VIEW syntax + - pattern: "WITH METRICS" + min_count: 1 + description: Must include WITH METRICS clause + - pattern: "LANGUAGE YAML" + min_count: 1 + description: Must specify LANGUAGE YAML + - pattern: "COUNT\\(DISTINCT" + min_count: 1 + description: Must use COUNT(DISTINCT) for DAU + - pattern: "MEASURE\\(" + min_count: 1 + description: Must show MEASURE() in query + guidelines: + - Must handle the informal/messy phrasing gracefully + - Must create a product-usage metric view (not orders) + - Must include DAU and feature adoption metrics + metadata: + category: happy_path + difficulty: easy + source: manual + +- id: metric-views_query_with_where_filter_026 + inputs: + prompt: "I want to query my metric view but only for a specific region and date\ + \ range. How do I filter?" 
+ outputs: + response: | + Use standard SQL `WHERE` clauses on dimensions when querying a metric view: + + ```sql + SELECT + `Order Month`, + MEASURE(`Total Revenue`) AS revenue, + MEASURE(`Order Count`) AS orders + FROM catalog.schema.orders_metrics + WHERE `Region` = 'EMEA' + AND `Order Month` >= DATE'2024-01-01' + AND `Order Month` < DATE'2025-01-01' + GROUP BY ALL + ORDER BY `Order Month` + ``` + + Or via the MCP tool: + + ```python + manage_metric_views( + action="query", + full_name="catalog.schema.orders_metrics", + query_measures=["Total Revenue", "Order Count"], + query_dimensions=["Order Month"], + where="`Region` = 'EMEA' AND `Order Month` >= DATE'2024-01-01'", + order_by="ALL", + limit=100, + ) + ``` + + **Key points:** + - Filter on dimensions using WHERE — just like a normal SQL query + - Dimension names with spaces need backtick quoting in WHERE clauses too + - The metric view's global `filter` (defined in YAML) is always applied in addition to your WHERE + - You cannot filter directly on measures — use HAVING after GROUP BY for that + execution_success: true + expectations: + expected_facts: + - WHERE + - backtick + - MEASURE() + expected_patterns: + - pattern: "WHERE" + min_count: 2 + description: Must show WHERE clause for filtering + - pattern: "MEASURE\\(" + min_count: 1 + description: Must use MEASURE() for measures + - pattern: "GROUP BY" + min_count: 1 + description: Must include GROUP BY + guidelines: + - Must show how to filter on dimensions in WHERE clause + - Must mention backtick quoting for dimension names with spaces + - Should show both SQL and MCP approaches + metadata: + category: happy_path + difficulty: easy + source: manual diff --git a/.test/src/skill_test/optimize/asi.py b/.test/src/skill_test/optimize/asi.py index 95020035..9c0e6cff 100644 --- a/.test/src/skill_test/optimize/asi.py +++ b/.test/src/skill_test/optimize/asi.py @@ -3,12 +3,28 @@ Builds an Actionable Side Information dict from scorer feedback so GEPA's reflection 
LM gets structured context about what went wrong with each scorer. Failure details are surfaced via the ``_failures`` key in the returned dict. + +Also provides ``skillbench_to_asi()`` for the SkillBench-style evaluator, +which produces GEPA-optimized side info with standard diagnostic keys +(Error, Expected, Actual) and ``skill_md_specific_info`` for per-component +routing. """ -from typing import Any +from __future__ import annotations + +from typing import Any, TYPE_CHECKING from mlflow.entities import Feedback +if TYPE_CHECKING: + from .assertions import AssertionResult + +from .assertions import ( + summarize_failures as _summarize_failures, + _classify_assertion, + _extract_content, +) + def feedback_to_score(feedback: Feedback) -> float | None: """Convert a single MLflow Feedback to a numeric score. @@ -157,3 +173,97 @@ def build_rich_asi( side_info["scores"] = per_dimension_scores return composite, side_info + + +# --------------------------------------------------------------------------- +# SkillBench → GEPA side info +# --------------------------------------------------------------------------- + + +def skillbench_to_asi( + with_results: list[AssertionResult], + without_results: list[AssertionResult], + *, + task_prompt: str | None = None, + scores: dict[str, float] | None = None, + with_response: str | None = None, + without_response: str | None = None, + reference_answer: str | None = None, + candidate: dict[str, str] | None = None, +) -> dict[str, Any]: + """Convert SkillBench assertion results to GEPA-optimized side info. + + Produces a flat dict with GEPA's standard diagnostic keys plus actual + agent output and reference answers so the reflection LM can make + targeted SKILL.md edits. + + Budget: ~1480 chars/example (Task 200 + Error ~80 + Expected 500 + + Actual 500 + scores ~200). With minibatch=3: ~4440 chars (~1100 tokens). 
+ + Keys produced (all optional, only non-empty included): + ``Task`` — the task prompt (truncated at 200 chars) + ``Error`` — compact NEEDS_SKILL/REGRESSION assertion labels + ``Expected`` — reference answer from ground_truth.yaml (truncated at 500 chars) + ``Actual`` — agent response WITH skill (truncated at 500 chars) + ``skill_md_specific_info`` — sub-dict with ``Regressions`` for per-component routing + ``scores`` — score breakdown promoted to objective_scores by GEPA + + Args: + with_results: Assertion results from the WITH-skill run. + without_results: Assertion results from the WITHOUT-skill run. + task_prompt: The test prompt (for reflection context). + scores: Score breakdown dict (effectiveness, pass_with, structure, final). + with_response: Agent output WITH skill (truncated at 500 chars). + without_response: Agent output WITHOUT skill (reserved for future use). + reference_answer: Ground truth answer from ground_truth.yaml. + candidate: Full candidate dict for tool-specific diagnostic routing. + + Returns: + Side info dict for optimize_anything. + """ + diag = _summarize_failures(with_results, without_results) + + side_info: dict[str, Any] = {} + + # 1. Task context (short — just enough for the reflection LM) + if task_prompt: + side_info["Task"] = task_prompt[:200] + + # 2. Error: what specific assertions fail (from assertions.py) + if diag.get("Error"): + side_info["Error"] = diag["Error"] + + # 3. Expected: reference answer (what correct output looks like) + if reference_answer: + side_info["Expected"] = reference_answer[:500] + + # 4. Actual: agent response WITH skill (what was produced) + if with_response is not None: + side_info["Actual"] = with_response[:500] + + # 5. Regressions: routed to skill_md component + if diag.get("Regressions"): + side_info["skill_md_specific_info"] = {"Regressions": diag["Regressions"]} + + # 5b. 
Route tool-specific failures to {component}_specific_info + if candidate: + tool_components = {k: v for k, v in candidate.items() if k.startswith("tools_")} + for comp_name, comp_text in tool_components.items(): + comp_text_lower = comp_text.lower() + tool_failures = [] + for w, wo in zip(with_results, without_results): + label = _classify_assertion(w, wo) + if label in ("NEEDS_SKILL", "REGRESSION"): + content = _extract_content(w) + if content.lower() in comp_text_lower: + tool_failures.append(f"{label}: {w.assertion_type} — '{content}'") + if tool_failures: + side_info[f"{comp_name}_specific_info"] = { + "Related_assertions": "\n".join(tool_failures) + } + + # 6. Scores: needed for GEPA Pareto tracking + if scores: + side_info["scores"] = scores + + return side_info diff --git a/.test/src/skill_test/optimize/assertions.py b/.test/src/skill_test/optimize/assertions.py new file mode 100644 index 00000000..3b6815ab --- /dev/null +++ b/.test/src/skill_test/optimize/assertions.py @@ -0,0 +1,190 @@ +"""Binary assertion layer for SkillBench-style evaluation. + +Wraps pattern and fact checks into binary pass/fail assertions, +mirroring SkillBench's pytest-style binary approach. No fuzzy keyword +scoring -- each assertion either passes or fails. +""" + +import re +from dataclasses import dataclass +from typing import Any + + +@dataclass +class AssertionResult: + """Result of a single binary assertion.""" + + name: str + passed: bool + rationale: str + assertion_type: str # "pattern" | "fact" + + +def _run_pattern_assertions(response: str, expected_patterns: list) -> list[AssertionResult]: + """Run pattern assertions against a response. + + Each pattern spec can be a plain regex string or a dict with + ``pattern``, ``min_count``, ``max_count``, ``description`` keys. 
+ """ + results = [] + for pattern_spec in expected_patterns: + if isinstance(pattern_spec, str): + pattern = pattern_spec + min_count = 1 + max_count = None + description = pattern[:40] + else: + pattern = pattern_spec["pattern"] + min_count = pattern_spec.get("min_count", 1) + max_count = pattern_spec.get("max_count", None) + description = pattern_spec.get("description", pattern[:40]) + + matches = len(re.findall(pattern, response, re.IGNORECASE)) + + if max_count is not None: + passed = min_count <= matches <= max_count + rationale = f"Found {matches} matches (need {min_count}-{max_count})" + else: + passed = matches >= min_count + rationale = f"Found {matches} matches (need >={min_count})" + + results.append(AssertionResult( + name=f"pattern_{description}", + passed=passed, + rationale=rationale, + assertion_type="pattern", + )) + return results + + +def _run_fact_assertions(response: str, expected_facts: list[str]) -> list[AssertionResult]: + """Run fact assertions against a response. + + Exact substring match (case-insensitive). No fuzzy keyword overlap. + """ + response_lower = response.lower() + results = [] + for fact in expected_facts: + found = fact.lower() in response_lower + results.append(AssertionResult( + name=f"fact_{fact[:40]}", + passed=found, + rationale=f"{'Found' if found else 'Missing'}: {fact}", + assertion_type="fact", + )) + return results + + +def run_all_assertions(response: str, expectations: dict[str, Any]) -> list[AssertionResult]: + """Run all pattern + fact assertions, return binary pass/fail per assertion. + + Args: + response: The text to check assertions against. + expectations: Dict with optional ``expected_patterns`` and ``expected_facts`` keys. + + Returns: + List of AssertionResult with binary pass/fail for each assertion. 
+ """ + results: list[AssertionResult] = [] + + patterns = expectations.get("expected_patterns", []) + if patterns: + results.extend(_run_pattern_assertions(response, patterns)) + + facts = expectations.get("expected_facts", []) + if facts: + results.extend(_run_fact_assertions(response, facts)) + + return results + + +def _classify_assertion( + with_result: AssertionResult, + without_result: AssertionResult, +) -> str: + """Classify a single assertion by comparing with-skill vs without-skill. + + Returns one of: + POSITIVE — fails without skill, passes with (skill is helping) + REGRESSION — passes without skill, fails with (skill is confusing the agent) + NEEDS_SKILL — fails both with and without (skill must add this content) + NEUTRAL — same result either way (agent already knows this) + """ + if with_result.passed and not without_result.passed: + return "POSITIVE" + elif not with_result.passed and without_result.passed: + return "REGRESSION" + elif not with_result.passed and not without_result.passed: + return "NEEDS_SKILL" + else: + return "NEUTRAL" + + +def _extract_content(result: AssertionResult) -> str: + """Extract the actual expected content from an assertion result. + + For facts, strips the ``Missing: `` / ``Found: `` prefix to get the raw + fact text. For patterns, uses the description embedded in the assertion + name (strips the ``pattern_`` prefix). + """ + if result.assertion_type == "fact": + for prefix in ("Missing: ", "Found: "): + if result.rationale.startswith(prefix): + return result.rationale[len(prefix):] + return result.rationale + else: + # Pattern: name is "pattern_{description}", rationale is match count + return result.name.removeprefix("pattern_") + + +def summarize_failures( + with_results: list[AssertionResult], + without_results: list[AssertionResult], +) -> dict[str, str]: + """Build GEPA-friendly diagnostic strings from assertion results. 
+ + Collects only NEEDS_SKILL and REGRESSION assertions (skips NEUTRAL/POSITIVE) + and produces structured output that maps to GEPA's standard diagnostic keys. + + Only non-empty keys are included in the returned dict so that GEPA does not + render empty ``## Header`` sections that waste tokens and confuse the + reflection LM. + + Returns: + Dict with a subset of: ``Error``, ``Regressions``. + ``Error`` carries compact NEEDS_SKILL/REGRESSION tokens that downstream + consumers (``_review_skillbench``, ``build_skillbench_background``) parse. + ``Regressions`` is a concise NL summary only present when regressions exist. + """ + needs_skill: list[tuple[AssertionResult, AssertionResult]] = [] + regressions: list[tuple[AssertionResult, AssertionResult]] = [] + + for w, wo in zip(with_results, without_results): + label = _classify_assertion(w, wo) + if label == "NEEDS_SKILL": + needs_skill.append((w, wo)) + elif label == "REGRESSION": + regressions.append((w, wo)) + + result: dict[str, str] = {} + + # Error: compact assertion labels (NEEDS_SKILL/REGRESSION tokens preserved) + error_lines: list[str] = [] + for w, _ in needs_skill: + content = _extract_content(w) + error_lines.append(f"NEEDS_SKILL: {w.assertion_type} — '{content}'") + for w, _ in regressions: + content = _extract_content(w) + error_lines.append(f"REGRESSION: {w.assertion_type} — '{content}'") + if error_lines: + result["Error"] = "\n".join(error_lines) + + # Regressions: concise NL (only when non-empty) + if regressions: + lines: list[str] = [] + for i, (w, wo) in enumerate(regressions, 1): + content = _extract_content(w) + lines.append(f"{i}. 
'{content}' — passes without skill, fails with it") + result["Regressions"] = "\n".join(lines) + + return result diff --git a/.test/src/skill_test/optimize/config.py b/.test/src/skill_test/optimize/config.py index efa4dd1a..2aca7a6e 100644 --- a/.test/src/skill_test/optimize/config.py +++ b/.test/src/skill_test/optimize/config.py @@ -16,6 +16,10 @@ "GEPA_GEN_LM", "databricks/databricks-claude-sonnet-4-6" ) +DEFAULT_TOKEN_BUDGET: int | None = int( + os.environ.get("GEPA_TOKEN_BUDGET", "0") +) or None + # --------------------------------------------------------------------------- # Register Databricks models with litellm so it knows their true context # windows. Without this, litellm may fuzzy-match to a similar model with @@ -33,11 +37,21 @@ def _configure_litellm_retries() -> None: GEPA calls litellm.completion() without passing num_retries, so we set it globally. This handles Anthropic 529 "Overloaded" errors, rate limits, and other transient failures with exponential backoff. + + Rate-limit retries get extra attempts (10) since --include-tools sends + large contexts that easily hit token-per-minute ceilings on Opus. 
""" try: import litellm + from litellm import RetryPolicy + litellm.num_retries = 5 - litellm.request_timeout = 120 # seconds per attempt + litellm.request_timeout = 180 # seconds per attempt + litellm.retry_policy = RetryPolicy( + RateLimitErrorRetries=10, + InternalServerErrorRetries=5, + TimeoutErrorRetries=5, + ) # Drop log noise from retries litellm.suppress_debug_info = True except ImportError: @@ -99,12 +113,18 @@ def _register_litellm_models() -> None: ), "standard": GEPAConfig( engine=EngineConfig(max_metric_calls=50, parallel=True), - reflection=ReflectionConfig(reflection_lm=DEFAULT_REFLECTION_LM), + reflection=ReflectionConfig( + reflection_lm=DEFAULT_REFLECTION_LM, + reflection_minibatch_size=3, + ), refiner=RefinerConfig(max_refinements=1), ), "thorough": GEPAConfig( engine=EngineConfig(max_metric_calls=150, parallel=True), - reflection=ReflectionConfig(reflection_lm=DEFAULT_REFLECTION_LM), + reflection=ReflectionConfig( + reflection_lm=DEFAULT_REFLECTION_LM, + reflection_minibatch_size=3, + ), refiner=RefinerConfig(max_refinements=1), ), } diff --git a/.test/src/skill_test/optimize/evaluator.py b/.test/src/skill_test/optimize/evaluator.py index 1e7564e9..88c8157b 100644 --- a/.test/src/skill_test/optimize/evaluator.py +++ b/.test/src/skill_test/optimize/evaluator.py @@ -5,6 +5,7 @@ """ import inspect +import re from pathlib import Path from typing import Any, Callable, Literal @@ -21,6 +22,10 @@ expected_facts_present, ) from .asi import feedback_to_asi +from .skillbench_evaluator import ( # noqa: F401 — re-exported for runner.py + create_skillbench_evaluator, + build_skillbench_background, +) # --------------------------------------------------------------------------- @@ -63,16 +68,20 @@ def count_tokens(text: str) -> int: def token_efficiency_score(candidate_text: str, original_token_count: int) -> float: - """Score 0-1 based on how concise the candidate is vs. the original. + """Score based on how concise the candidate is vs. the original. 
- Same size or smaller = 1.0, linear penalty up to 0.0 at 2x. + Smaller than original = bonus up to 1.15, same size = 1.0, + larger = linear penalty to 0.0 at 2x. """ if original_token_count <= 0: return 1.0 enc = tiktoken.get_encoding("cl100k_base") candidate_tokens = len(enc.encode(candidate_text)) ratio = candidate_tokens / original_token_count - return max(0.0, min(1.0, 2.0 - ratio)) + if ratio <= 1.0: + return 1.0 + 0.15 * (1.0 - ratio) + else: + return max(0.0, 2.0 - ratio) # --------------------------------------------------------------------------- @@ -140,6 +149,45 @@ def _validate_skill_structure(candidate_text: str) -> list[Feedback]: return feedbacks +_STOP_WORDS = frozenset({ + "the", "and", "for", "with", "that", "this", "from", "are", "was", + "were", "been", "being", "have", "has", "had", "does", "did", "but", + "not", "you", "all", "can", "her", "his", "its", "may", "our", + "out", "use", "uses", "will", "how", "who", "get", "which", "would", + "make", "like", "into", "than", "them", "then", "each", "other", + "should", "could", +}) + + +def _keyword_fact_score(fact: str, text: str) -> float: + """Score 0-1 based on keyword overlap between a fact and text.""" + words = [w for w in re.findall(r'\w{3,}', fact.lower()) if w not in _STOP_WORDS] + if not words: + return 1.0 + text_lower = text.lower() + found = sum(1 for w in words if w in text_lower) + return found / len(words) + + +def _score_skill_content_facts(candidate_text: str, expected_facts: list[str]) -> list[Feedback]: + """Score SKILL.md content against expected facts using keyword matching. + + Unlike the universal ``expected_facts_present`` scorer which requires exact + substring matches, this uses keyword extraction so descriptive facts like + "Uses CREATE OR REPLACE VIEW with WITH METRICS LANGUAGE YAML" match when + the individual keywords appear in the skill text. 
+ """ + feedbacks = [] + for fact in expected_facts: + score = _keyword_fact_score(fact, candidate_text) + feedbacks.append(Feedback( + name=f"skill_content_fact_{fact[:40]}", + value=score, # continuous 0.0-1.0 + rationale=f"(skill content) Keyword match {score:.0%} for: {fact}", + )) + return feedbacks + + def _score_skill_content(candidate_text: str, expectations: dict[str, Any]) -> list[Feedback]: """Score the SKILL.md candidate itself for pattern/fact coverage. @@ -162,14 +210,10 @@ def _score_skill_content(candidate_text: str, expectations: dict[str, Any]) -> l rationale=f"(skill content) {fb.rationale or ''}", )) - # Expected facts on skill content - ef_results = _run_scorer(expected_facts_present, outputs, expectations, {}) - for fb in ef_results: - feedbacks.append(Feedback( - name=f"skill_content_{fb.name}", - value=fb.value, - rationale=f"(skill content) {fb.rationale or ''}", - )) + # Expected facts on skill content (keyword matching for descriptive facts) + expected_facts = expectations.get("expected_facts", []) + if expected_facts: + feedbacks.extend(_score_skill_content_facts(candidate_text, expected_facts)) return feedbacks @@ -212,6 +256,9 @@ def create_skill_evaluator( scorer_config = load_scorer_config(skill_name) effective_gen_model = gen_model or task_lm + # Track whether we've warned about generation failure + _gen_warned = [False] + # Compute original token count for efficiency scoring if original_token_counts is None: skill_path = _find_skill_md(skill_name) @@ -262,6 +309,7 @@ def evaluator(candidate: dict[str, str], example: dict) -> tuple[float, dict]: gen_feedbacks: list[Feedback] = [] gen_composite = 0.0 + _gen_available = False # Track if generation actually worked if effective_gen_model and candidate_text and example.get("input"): import litellm messages = [ @@ -276,8 +324,10 @@ def evaluator(candidate: dict[str, str], example: dict) -> tuple[float, dict]: {"role": "user", "content": example.get("input", "")}, ] try: - resp = 
litellm.completion(model=effective_gen_model, messages=messages) + from .skillbench_evaluator import _completion_with_backoff + resp = _completion_with_backoff(model=effective_gen_model, messages=messages) generated_response = resp.choices[0].message.content + _gen_available = True except Exception as e: generated_response = None gen_feedbacks.append(Feedback( @@ -285,6 +335,17 @@ def evaluator(candidate: dict[str, str], example: dict) -> tuple[float, dict]: value="no", rationale=f"LLM generation failed: {e}", )) + if not _gen_warned[0]: + _gen_warned[0] = True + import warnings + warnings.warn( + f"\nGeneration model '{effective_gen_model}' failed: {e}\n" + "Falling back to skill-content + reference scoring (no generative eval).\n" + "The 20% 'generated response quality' layer will be inactive.\n" + "Fix: set DATABRICKS_API_KEY + DATABRICKS_API_BASE, or use " + "--gen-model with a working provider (e.g., --gen-model openai/gpt-4o).\n", + stacklevel=2, + ) if generated_response: gen_feedbacks = _run_deterministic_scorers( @@ -321,31 +382,36 @@ def evaluator(candidate: dict[str, str], example: dict) -> tuple[float, dict]: total_candidate_tokens = sum(count_tokens(v) for v in candidate.values()) if total_original_tokens > 0: ratio = total_candidate_tokens / total_original_tokens - efficiency = max(0.0, min(1.0, 2.0 - ratio)) + if ratio <= 1.0: + efficiency = 1.0 + 0.15 * (1.0 - ratio) + else: + efficiency = max(0.0, 2.0 - ratio) else: efficiency = 1.0 # ------------------------------------------------------------------ # Weighted final score # ------------------------------------------------------------------ - # When generative eval is available, it gets the dominant weight. - # When no gen_model, fall back to reference-heavy weighting. - if effective_gen_model and generated_response is not None: + # When generative eval succeeds, it gets the dominant weight. 
+ # When gen fails (auth error, timeout, etc), fall back to + # skill-content-heavy weighting — this is the only layer that + # changes dynamically as GEPA mutates the skill. + if _gen_available and generated_response is not None: # Full layered evaluation final_score = ( - 0.40 * gen_composite # Generated response quality - + 0.25 * skill_content_composite # Skill content coverage + 0.20 * gen_composite # Generated response quality + + 0.35 * skill_content_composite # Skill content coverage + 0.05 * ref_composite # Reference response (sanity) + 0.10 * structure_composite # Structure validation - + 0.20 * efficiency # Token efficiency + + 0.30 * efficiency # Token efficiency ) else: - # Fallback: no generative eval, emphasize skill content + reference + # Fallback: no generative eval, emphasize skill content + efficiency final_score = ( - 0.35 * skill_content_composite # Skill content coverage - + 0.35 * ref_composite # Reference response + 0.40 * skill_content_composite # Skill content coverage + + 0.20 * ref_composite # Reference response + 0.10 * structure_composite # Structure validation - + 0.20 * efficiency # Token efficiency + + 0.30 * efficiency # Token efficiency ) # ------------------------------------------------------------------ @@ -493,19 +559,22 @@ def build_optimization_background( "They contain patterns, code examples, API references, and best practices.\n\n" "EVALUATION: The skill is evaluated by having a small LLM generate responses from it. " "Better skill documentation produces more correct responses. Scores come from:\n" - "- Generated response quality (40%): An LLM reads ONLY the skill and answers a test prompt. " + "- Generated response quality (20%): An LLM reads ONLY the skill and answers a test prompt. 
" "Its response is scored against expected patterns and facts.\n" - "- Skill content coverage (25%): Does the SKILL.md itself contain the patterns and facts " + "- Skill content coverage (35%): Does the SKILL.md itself contain the patterns and facts " "needed to answer test prompts? Removing key content directly drops this score.\n" "- Reference response check (5%): Sanity check against a known-good response.\n" "- Structure validation (10%): Python/SQL syntax, no hallucinated APIs.\n" - "- Token efficiency (20%): Conciseness vs original -- smaller is better.\n\n" - "KEY INSIGHT: If the skill is missing a pattern or fact, the LLM cannot generate it. " - "The most impactful changes add missing patterns/facts and remove incorrect ones.\n\n" + "- Token efficiency (30%): Conciseness vs original -- smaller is ACTIVELY REWARDED. " + "Shrinking the skill below its original size gives a bonus score (up to 1.15x at 0% of original). " + "Growing the skill is penalized linearly to 0.0 at 2x original size.\n\n" + "KEY INSIGHT: Token efficiency is the second-highest weight. Every token you remove " + "directly improves the score. SkillsBench research shows long skills hurt agent performance " + "via 'cognitive overhead' -- agents get confused by verbose docs. Be ruthlessly concise.\n\n" f"IMPORTANT: The current artifacts total {original_token_count:,} tokens. " - "Optimized versions should be MORE CONCISE, not larger. " + "Optimized versions MUST be MORE CONCISE. Target at least 10-20% token reduction. " "Remove redundant examples, consolidate similar patterns, " - "and eliminate verbose explanations that don't add value. " + "eliminate verbose explanations, and merge overlapping sections. " "Every token consumed is agent context window budget -- keep skills lean and focused." 
f"{baseline_desc}" f"{components_desc}" diff --git a/.test/src/skill_test/optimize/review.py b/.test/src/skill_test/optimize/review.py index 908b279a..c5bc4e93 100644 --- a/.test/src/skill_test/optimize/review.py +++ b/.test/src/skill_test/optimize/review.py @@ -2,21 +2,296 @@ Provides human-readable output of optimization results and the ability to apply the optimized SKILL.md to the repository. + +After each optimization run, results are saved to: + .test/skills//optimized_SKILL.md — the optimized content + .test/skills//last_optimization.md — summary with scores and diff + +Use ``--apply-last`` to apply a saved result without re-running optimization. """ import difflib +import json +from datetime import datetime, timezone from pathlib import Path from .runner import OptimizationResult from .evaluator import _find_skill_md +def _get_results_dir(skill_name: str) -> Path: + """Get the results directory for a skill.""" + # Try standard skills dir first + candidates = [ + Path(".test/skills") / skill_name, + Path(__file__).resolve().parent.parent.parent / "skills" / skill_name, + ] + for d in candidates: + if d.exists(): + return d + # Fallback: create under .test/skills + d = Path(".test/skills") / skill_name + d.mkdir(parents=True, exist_ok=True) + return d + + +def save_result(result: OptimizationResult) -> tuple[Path | None, Path | None]: + """Save optimization results to disk for later application. + + Writes two files: + - ``optimized_SKILL.md`` — the raw optimized content (can be diffed/reviewed) + - ``last_optimization.json`` — metadata for ``--apply-last`` + + Returns: + Tuple of (optimized_skill_path, metadata_path), either may be None on error. 
+ """ + if result.improvement <= 0 and result.original_content == result.optimized_content: + return None, None + + results_dir = _get_results_dir(result.skill_name) + + optimized_path = None + metadata_path = None + + # Write the optimized SKILL.md + if result.optimized_content and result.optimized_content != result.original_content: + optimized_path = results_dir / "optimized_SKILL.md" + optimized_path.write_text(result.optimized_content) + + # Write metadata for --apply-last + metadata = { + "skill_name": result.skill_name, + "timestamp": datetime.now(timezone.utc).isoformat(), + "original_score": result.original_score, + "optimized_score": result.optimized_score, + "improvement": result.improvement, + "original_token_count": result.original_token_count, + "optimized_token_count": result.optimized_token_count, + "token_reduction_pct": result.token_reduction_pct, + "diff_summary": result.diff_summary, + "mlflow_run_id": result.mlflow_run_id, + "evaluator_type": getattr(result, "evaluator_type", "legacy"), + } + + # Save tool components if present + if result.components: + tool_components = { + k: v for k, v in result.components.items() if k.startswith("tools_") + } + if tool_components: + metadata["has_tool_components"] = True + # Save each tool component + for comp_name, comp_text in tool_components.items(): + comp_path = results_dir / f"optimized_{comp_name}.txt" + comp_path.write_text(comp_text) + + metadata_path = results_dir / "last_optimization.json" + metadata_path.write_text(json.dumps(metadata, indent=2)) + + return optimized_path, metadata_path + + +def load_last_result(skill_name: str) -> OptimizationResult | None: + """Load the last saved optimization result for a skill. + + Returns: + OptimizationResult reconstructed from saved files, or None if not found. 
+ """ + results_dir = _get_results_dir(skill_name) + metadata_path = results_dir / "last_optimization.json" + optimized_path = results_dir / "optimized_SKILL.md" + + if not metadata_path.exists(): + return None + + metadata = json.loads(metadata_path.read_text()) + + # Load optimized content + optimized_content = "" + if optimized_path.exists(): + optimized_content = optimized_path.read_text() + + # Load original content + original_content = "" + skill_path = _find_skill_md(skill_name) + if skill_path: + original_content = skill_path.read_text() + + # Reconstruct tool components + components = None + if metadata.get("has_tool_components"): + components = {} + if optimized_content: + components["skill_md"] = optimized_content + for f in results_dir.glob("optimized_tools_*.txt"): + comp_name = f.stem.replace("optimized_", "") + components[comp_name] = f.read_text() + + return OptimizationResult( + skill_name=skill_name, + original_score=metadata.get("original_score", 0.0), + optimized_score=metadata.get("optimized_score", 0.0), + improvement=metadata.get("improvement", 0.0), + original_content=original_content, + optimized_content=optimized_content, + original_token_count=metadata.get("original_token_count", 0), + optimized_token_count=metadata.get("optimized_token_count", 0), + token_reduction_pct=metadata.get("token_reduction_pct", 0.0), + diff_summary=metadata.get("diff_summary", ""), + val_scores={}, + mlflow_run_id=metadata.get("mlflow_run_id"), + gepa_result=None, + components=components, + ) + + +def _review_skillbench(result: OptimizationResult) -> None: + """Print SkillBench-style effectiveness metrics.""" + print(f"\n{'=' * 60}") + print(f" Optimization Results: {result.skill_name} (skillbench)") + print(f"{'=' * 60}") + + # Aggregate effectiveness metrics from side_info + si = result.skillbench_side_info or {} + + # Compute aggregate pass rates from per-task scores + task_count = 0 + sum_with = 0.0 + sum_without = 0.0 + sum_eff = 0.0 + per_task_lines: 
list[str] = [] + + for task_id in sorted(si.keys()): + info = si[task_id] + scores = info.get("scores", {}) + pw = scores.get("pass_rate_with", 0.0) + pwo = scores.get("pass_rate_without", 0.0) + eff = scores.get("skill_effectiveness", 0.0) + sum_with += pw + sum_without += pwo + sum_eff += eff + task_count += 1 + + # Build per-task notes from the new Error key + error = info.get("Error", "") + notes = [] + if "NEEDS_SKILL" in error: + # Extract assertion names from Error lines + failing = [ + line.split(":", 1)[1].strip().split(" — ")[0] + for line in error.split("\n") + if line.startswith("NEEDS_SKILL") + ] + notes.append(f"NEEDS: {', '.join(failing[:3])}") + if "REGRESSION" in error: + reg_count = error.count("REGRESSION") + notes.append(f"REGRESSION x{reg_count}") + if not notes: + notes.append("OK/NEUTRAL") + note_str = f" [{'; '.join(notes)}]" + per_task_lines.append( + f" {task_id:<30s} WITH {pw:.2f} WITHOUT {pwo:.2f} " + f"delta {eff:+.2f}{note_str}" + ) + + # Aggregate rates + if task_count > 0: + agg_with = sum_with / task_count + agg_without = sum_without / task_count + agg_eff = sum_eff / task_count + else: + agg_with = agg_without = agg_eff = 0.0 + + # Score summary + improvement_sign = "+" if result.improvement >= 0 else "" + print(f" Score: {result.original_score:.3f} -> {result.optimized_score:.3f} " + f"({improvement_sign}{result.improvement:.3f})") + print(f" Skill Effectiveness: {agg_eff:.2f}") + print(f" Pass Rate (with): {agg_with:.2f}") + print(f" Pass Rate (without): {agg_without:.2f} (baseline)") + + # Token counts + reduction_sign = "+" if result.token_reduction_pct >= 0 else "" + print(f" Tokens: {result.original_token_count:,} -> {result.optimized_token_count:,} " + f"({reduction_sign}{result.token_reduction_pct:.1f}%)") + + # GEPA iterations + if result.gepa_result and hasattr(result.gepa_result, "iterations"): + print(f" Iterations: {result.gepa_result.iterations}") + + if result.mlflow_run_id: + print(f" MLflow run: 
{result.mlflow_run_id}") + + print() + + # Per-task breakdown + if per_task_lines: + print(" Per-task:") + for line in per_task_lines: + print(line) + print() + + # Diff summary + if result.diff_summary and result.diff_summary != "No changes": + print(" Changes:") + for line in result.diff_summary.split("\n"): + print(f" {line}") + print() + + # Detailed diff (first 50 lines) + if result.original_content != result.optimized_content: + diff_lines = list(difflib.unified_diff( + result.original_content.splitlines(keepends=True), + result.optimized_content.splitlines(keepends=True), + fromfile="original SKILL.md", + tofile="optimized SKILL.md", + n=2, + )) + if len(diff_lines) > 50: + print(f" Diff (first 50 of {len(diff_lines)} lines):") + for line in diff_lines[:50]: + print(f" {line}", end="") + print(f"\n ... ({len(diff_lines) - 50} more lines)") + else: + print(" Diff:") + for line in diff_lines: + print(f" {line}", end="") + print() + else: + print(" No changes to SKILL.md content.") + + # Validation breakdown + if result.val_scores: + print(" Validation scores by test case:") + for task_id, score in sorted(result.val_scores.items()): + status = "PASS" if score >= 0.5 else "FAIL" + print(f" {status} {task_id}: {score:.3f}") + print() + + # Auto-save result to disk + saved_skill, saved_meta = save_result(result) + if saved_skill: + print(f" Saved: {saved_skill}") + print(f" Apply: uv run python .test/scripts/optimize.py {result.skill_name} --apply-last") + elif result.original_content == result.optimized_content: + print(f" No improvement found -- nothing saved.") + print(f"{'=' * 60}\n") + + def review_optimization(result: OptimizationResult) -> None: """Print optimization summary for human review. + Automatically saves the result to disk so it can be applied later + with ``--apply-last`` without re-running optimization. + Shows: score improvement, token reduction, diff of changed sections, per-test-case score breakdown, validation set performance. 
+ + Delegates to ``_review_skillbench()`` when ``evaluator_type == "skillbench"``. """ + if getattr(result, "evaluator_type", "legacy") == "skillbench": + return _review_skillbench(result) + print(f"\n{'=' * 60}") print(f" Optimization Results: {result.skill_name}") print(f"{'=' * 60}") @@ -83,8 +358,13 @@ def review_optimization(result: OptimizationResult) -> None: print(f" {status} {task_id}: {score:.3f}") print() - # Apply hint - print(f" To apply: uv run python .test/scripts/optimize.py {result.skill_name} --apply") + # Auto-save result to disk + saved_skill, saved_meta = save_result(result) + if saved_skill: + print(f" Saved: {saved_skill}") + print(f" Apply: uv run python .test/scripts/optimize.py {result.skill_name} --apply-last") + elif result.original_content == result.optimized_content: + print(f" No improvement found — nothing saved.") print(f"{'=' * 60}\n") diff --git a/.test/src/skill_test/optimize/runner.py b/.test/src/skill_test/optimize/runner.py index 1feb54bd..36f44dcc 100644 --- a/.test/src/skill_test/optimize/runner.py +++ b/.test/src/skill_test/optimize/runner.py @@ -14,13 +14,15 @@ from ..config import SkillTestConfig from ..runners.evaluate import setup_mlflow -from .config import get_preset, validate_reflection_context, estimate_pass_duration, DEFAULT_GEN_LM +from .config import get_preset, validate_reflection_context, estimate_pass_duration, DEFAULT_GEN_LM, DEFAULT_TOKEN_BUDGET from .evaluator import ( SKILL_KEY, create_skill_evaluator, count_tokens, build_optimization_background, _find_skill_md, + create_skillbench_evaluator, + build_skillbench_background, ) from .splitter import create_gepa_datasets, generate_bootstrap_tasks, to_gepa_instances from .tools import ( @@ -50,6 +52,8 @@ class OptimizationResult: components: dict[str, str] | None = None original_components: dict[str, str] | None = None tool_map: Any = None + evaluator_type: str = "legacy" + skillbench_side_info: dict[str, dict] | None = None def _compute_diff_summary(original: 
str, optimized: str) -> str: @@ -120,6 +124,9 @@ def optimize_skill( dry_run: bool = False, max_passes: int = 5, max_metric_calls: int | None = None, + evaluator_type: Literal["legacy", "skillbench"] = "skillbench", + token_budget: int | None = None, + use_judges: bool = False, ) -> OptimizationResult: """Run end-to-end GEPA optimization on a skill and/or tools. @@ -141,6 +148,10 @@ def optimize_skill( dry_run: Show config without running max_passes: Maximum optimization passes (default 5). Each pass re-seeds from the previous best and runs a full GEPA cycle. + evaluator_type: "skillbench" (measures skill effectiveness delta) or + "legacy" (weighted scoring with keyword/token efficiency). + token_budget: Hard token ceiling; candidates exceeding this are penalized. + use_judges: Enable MLflow LLM judges (Correctness + Guidelines) for NL feedback. """ # 1. Load SKILL.md skill_path = _find_skill_md(skill_name) @@ -173,6 +184,22 @@ def optimize_skill( total_original_tokens = sum(original_token_counts.values()) + # Default --include-tools for SkillBench (tools are the primary token consumer) + if evaluator_type == "skillbench" and not tools_only and not include_tools and not tool_components: + include_tools = True + tool_map = extract_tool_descriptions(modules=tool_modules) + tool_components = tools_to_gepa_components(tool_map, per_module=True) + stats = get_tool_stats() + print(f"[SkillBench] Auto-including tools: {stats['modules']} modules, " + f"{stats['total_tools']} tools, {stats['total_description_chars']:,} chars") + for comp_name, comp_text in tool_components.items(): + seed_candidate[comp_name] = comp_text + original_token_counts[comp_name] = count_tokens(comp_text) + total_original_tokens = sum(original_token_counts.values()) + + # Resolve token budget + token_budget = token_budget or DEFAULT_TOKEN_BUDGET + # 3. 
Load datasets try: train, val = create_gepa_datasets(skill_name) @@ -189,12 +216,28 @@ def optimize_skill( effective_gen_model = gen_model or task_lm or DEFAULT_GEN_LM if effective_gen_model: print(f"Generation model: {effective_gen_model}") - - evaluator = create_skill_evaluator( - skill_name, mode=mode, task_lm=task_lm, - gen_model=effective_gen_model, - original_token_counts=original_token_counts, - ) + judges_label = " (with LLM judges)" if use_judges else "" + print(f"Evaluator: {evaluator_type}{judges_label}") + + if evaluator_type == "skillbench": + if not effective_gen_model: + raise ValueError( + "SkillBench evaluator requires a gen_model. " + "Pass --gen-model or set GEPA_GEN_LM env var." + ) + evaluator = create_skillbench_evaluator( + skill_name, + gen_model=effective_gen_model, + original_token_counts=original_token_counts, + token_budget=token_budget, + use_judges=use_judges, + ) + else: + evaluator = create_skill_evaluator( + skill_name, mode=mode, task_lm=task_lm, + gen_model=effective_gen_model, + original_token_counts=original_token_counts, + ) # 5. 
Get config (scaled by component count) num_components = len(seed_candidate) @@ -213,7 +256,7 @@ def optimize_skill( # Dry run if dry_run: - print(f"\n=== Dry Run: {skill_name} ===") + print(f"\n=== Dry Run: {skill_name} ({evaluator_type}) ===") if not tools_only: print(f"SKILL.md path: {skill_path}") print(f"Components: {list(seed_candidate.keys())}") @@ -224,6 +267,7 @@ def optimize_skill( print(f"Val tasks: {len(val) if val else 'None (single-task mode)'}") print(f"Mode: {mode}") print(f"Generation model: {effective_gen_model or 'None (static only)'}") + print(f"Evaluator type: {evaluator_type}") print(f"Preset: {preset} (max_metric_calls={config.engine.max_metric_calls}, " f"scaled for {num_components} component(s))") print(f"Max passes: {max_passes}") @@ -238,12 +282,22 @@ def optimize_skill( for task_id, score in original_per_task.items(): print(f" {task_id}: {score:.3f}") - background = build_optimization_background( - skill_name, total_original_tokens, - component_names=list(seed_candidate.keys()), - baseline_scores=original_per_task, - baseline_side_info=si_by_id, - ) + if evaluator_type == "skillbench": + background = build_skillbench_background( + skill_name, total_original_tokens, + component_names=list(seed_candidate.keys()), + baseline_scores=original_per_task, + baseline_side_info=si_by_id, + token_budget=token_budget, + use_judges=use_judges, + ) + else: + background = build_optimization_background( + skill_name, total_original_tokens, + component_names=list(seed_candidate.keys()), + baseline_scores=original_per_task, + baseline_side_info=si_by_id, + ) print(f"\nBackground preview:\n{background[:500]}...") return OptimizationResult( @@ -263,6 +317,8 @@ def optimize_skill( components=dict(seed_candidate), original_components=dict(seed_candidate), tool_map=tool_map, + evaluator_type=evaluator_type, + skillbench_side_info=si_by_id if evaluator_type == "skillbench" else None, ) # Evaluate original and capture per-task detail for baseline context @@ 
-275,18 +331,35 @@ def optimize_skill( evaluator.set_baseline(si_by_input) # 6. Build background (with baseline scores) and objective - background = build_optimization_background( - skill_name, total_original_tokens, - component_names=list(seed_candidate.keys()), - baseline_scores=original_per_task, - baseline_side_info=si_by_id, - ) - objective = ( - f"Refine and improve the existing '{skill_name}' skill. " - "Preserve patterns and examples that already score well. " - "Focus on fixing scorer failures and reducing token count without sacrificing correctness. " - "Higher quality scores and fewer tokens are both better." - ) + if evaluator_type == "skillbench": + background = build_skillbench_background( + skill_name, total_original_tokens, + component_names=list(seed_candidate.keys()), + baseline_scores=original_per_task, + baseline_side_info=si_by_id, + token_budget=token_budget, + use_judges=use_judges, + ) + objective = ( + f"Refine and improve the existing '{skill_name}' skill. " + "Score is based on SKILL EFFECTIVENESS (45%) and TOKEN EFFICIENCY (25%). " + "Adding content the agent already knows does NOT help and costs tokens. " + "Focus on what the agent would otherwise get wrong. " + "Be concise — remove redundant examples and verbose explanations." + ) + else: + background = build_optimization_background( + skill_name, total_original_tokens, + component_names=list(seed_candidate.keys()), + baseline_scores=original_per_task, + baseline_side_info=si_by_id, + ) + objective = ( + f"Refine and improve the existing '{skill_name}' skill. " + "Preserve patterns and examples that already score well. " + "Focus on fixing scorer failures and reducing token count without sacrificing correctness. " + "Higher quality scores and fewer tokens are both better." + ) # 7. 
Convert datasets to GEPA format trainset = to_gepa_instances(train) @@ -300,7 +373,7 @@ def optimize_skill( best_score = original_score last_result = None total_metric_calls = 0 - improvement_threshold = 0.001 # minimum improvement to continue + improvement_threshold = 0.0005 # minimum improvement to continue print(f"\n Starting multi-pass optimization (up to {max_passes} passes, " f"{num_components} component(s), {config.engine.max_metric_calls} metric calls/pass)") @@ -385,7 +458,7 @@ def optimize_skill( stc = SkillTestConfig() setup_mlflow(stc) with mlflow.start_run(run_name=f"{skill_name}_optimize_{preset}"): - mlflow.set_tags({"optimizer": "gepa", "skill_name": skill_name, "preset": preset, "mode": mode}) + mlflow.set_tags({"optimizer": "gepa", "skill_name": skill_name, "preset": preset, "mode": mode, "evaluator_type": evaluator_type}) mlflow.log_metrics({ "original_score": original_score, "optimized_score": optimized_score, @@ -399,6 +472,11 @@ def optimize_skill( except Exception: pass + # Capture final side_info for skillbench review output + final_si_by_id = None + if evaluator_type == "skillbench": + _, _, final_si_by_id, _ = _evaluate_on_tasks(evaluator, best, train) + return OptimizationResult( skill_name=skill_name, original_score=original_score, @@ -416,4 +494,6 @@ def optimize_skill( components=dict(best), original_components=dict(seed_candidate), tool_map=tool_map, + evaluator_type=evaluator_type, + skillbench_side_info=final_si_by_id, ) diff --git a/.test/src/skill_test/optimize/skillbench_evaluator.py b/.test/src/skill_test/optimize/skillbench_evaluator.py new file mode 100644 index 00000000..8ddc3290 --- /dev/null +++ b/.test/src/skill_test/optimize/skillbench_evaluator.py @@ -0,0 +1,697 @@ +"""SkillBench-inspired evaluator: measure skill effectiveness via WITH vs WITHOUT comparison. + +Evaluates skills by measuring agent performance WITH the skill vs WITHOUT it +on real tasks, then computing a skill effectiveness delta. 
This replaces the +5-layer weighted scoring with a 3-phase approach: + + Phase 1: WITH-SKILL -- LLM generates response with SKILL.md in context + Phase 2: WITHOUT-SKILL -- LLM generates response with NO skill (cached once) + Phase 3: COMPUTE -- binary pass/fail assertions on both, derive effectiveness + +Scoring weights (default / with --use-judges): + 45% / 35% Skill Effectiveness (delta: pass_rate_with - pass_rate_without) + 25% / 25% Absolute Quality (pass_rate_with_skill) + 0% / 10% Judge Quality (LLM judge pass rate — 6 judges, see below) + 5% / 5% Structure (syntax validity) + 25% / 25% Token Efficiency (smaller candidates score higher) + +LLM Judges (when --use-judges is enabled): + 1. RelevanceToQuery — does the response address the user's input? (always runs) + 2. Completeness — does the response fully answer all parts? (always runs) + 3. Correctness — are expected facts present? (requires expected_facts) + 4. Guidelines — does response follow per-test rules? (requires guidelines) + 5. ExpectationsGuidelines — combined facts+guidelines check (when both present) + 6. Custom skill judge — MemAlign-inspired domain judge with skill-specific + evaluation criteria extracted from ground_truth guidelines +""" + +from __future__ import annotations + +import hashlib +import json +import logging +import threading +import time +from typing import Any, Callable + +import litellm +from mlflow.entities import Feedback + +from ..scorers.universal import python_syntax, sql_syntax, no_hallucinated_apis +from .assertions import AssertionResult, run_all_assertions +from .asi import skillbench_to_asi + +logger = logging.getLogger(__name__) + + +def _prompt_hash(prompt: str) -> str: + """Stable hash for caching baseline results by prompt.""" + return hashlib.sha256(prompt.encode()).hexdigest()[:16] + + +class _RateLimiter: + """Thread-safe token-bucket rate limiter for LLM API calls. 
+ + Limits both concurrency (via semaphore) and request rate (via + minimum inter-call spacing). When --include-tools sends large + contexts to Opus, this prevents bursts that exceed + token-per-minute quotas. + """ + + def __init__(self, max_concurrent: int = 2, min_interval: float = 1.0): + self._semaphore = threading.Semaphore(max_concurrent) + self._min_interval = min_interval + self._lock = threading.Lock() + self._last_call: float = 0.0 + + def acquire(self) -> None: + self._semaphore.acquire() + with self._lock: + now = time.monotonic() + wait = self._last_call + self._min_interval - now + if wait > 0: + time.sleep(wait) + self._last_call = time.monotonic() + + def release(self) -> None: + self._semaphore.release() + + +# Module-level rate limiter shared across evaluator instances. +_rate_limiter = _RateLimiter(max_concurrent=2, min_interval=1.0) + + +def _completion_with_backoff(*, max_retries: int = 6, **kwargs) -> Any: + """Call litellm.completion with explicit exponential backoff for rate limits. + + This is a safety net on top of litellm's built-in retries. litellm's + global num_retries handles most transient errors, but sustained + token-per-minute exhaustion on Opus can outlast them. This wrapper + adds longer waits between retry bursts. 
+ """ + last_err: Exception | None = None + for attempt in range(max_retries + 1): + if attempt > 0: + delay = min(2 ** attempt, 60) # 2, 4, 8, 16, 32, 60 + logger.warning( + "Rate limited (attempt %d/%d), backing off %.0fs", + attempt, max_retries, delay, + ) + time.sleep(delay) + _rate_limiter.acquire() + try: + result = litellm.completion(**kwargs) + return result + except litellm.RateLimitError as e: + last_err = e + finally: + _rate_limiter.release() + raise last_err # type: ignore[misc] + + +def _run_structure_scorers(text: str) -> float: + """Run structure validation scorers on text, return 0.0-1.0 composite.""" + outputs = {"response": text} + scores: list[float] = [] + for scorer_fn in [python_syntax, sql_syntax, no_hallucinated_apis]: + try: + result = scorer_fn(outputs=outputs) + if isinstance(result, list): + for fb in result: + if fb.value == "yes": + scores.append(1.0) + elif fb.value == "no": + scores.append(0.0) + # skip doesn't count + elif isinstance(result, Feedback): + if result.value == "yes": + scores.append(1.0) + elif result.value == "no": + scores.append(0.0) + except Exception: + pass + return sum(scores) / len(scores) if scores else 1.0 + + +class SkillBenchEvaluator: + """GEPA-compatible evaluator using SkillBench WITH vs WITHOUT methodology. + + Args: + gen_model: LLM model for generating responses. Required -- no silent fallback. + scorer_config: Optional scorer config (unused, kept for interface compat). + original_token_counts: Token counts of original artifacts for efficiency scoring. + token_budget: Hard token ceiling; candidates exceeding this are penalized. + """ + + def __init__( + self, + gen_model: str, + scorer_config: dict[str, Any] | None = None, + original_token_counts: dict[str, int] | None = None, + token_budget: int | None = None, + use_judges: bool = False, + skill_guidelines: list[str] | None = None, + ): + if not gen_model: + raise ValueError( + "SkillBench evaluator requires a gen_model. 
" + "Pass --gen-model or set GEPA_GEN_LM env var." + ) + self.gen_model = gen_model + self._baseline_cache: dict[str, list[AssertionResult]] = {} + self._baseline_response_cache: dict[str, str] = {} + self._original_token_counts = original_token_counts or {} + self._total_original_tokens = sum(self._original_token_counts.values()) + self._token_budget = token_budget + self._use_judges = use_judges + self._skill_guidelines = skill_guidelines or [] + + def _generate_response(self, prompt: str, skill_context: str | None = None) -> str: + """Generate a response with or without skill context. + + Uses temperature=0 for deterministic outputs. + """ + messages = [] + if skill_context: + messages.append({ + "role": "system", + "content": ( + "Use ONLY the following skill documentation to answer " + "the user's question. Do not use any other knowledge.\n\n" + f"{skill_context}" + ), + }) + messages.append({"role": "user", "content": prompt}) + + resp = _completion_with_backoff( + model=self.gen_model, + messages=messages, + temperature=0, + ) + return resp.choices[0].message.content or "" + + def _get_baseline( + self, prompt: str, expectations: dict[str, Any], + ) -> tuple[list[AssertionResult], str]: + """Get WITHOUT-skill baseline, computing once then caching. + + Returns: + Tuple of (assertion_results, raw_response). + """ + key = _prompt_hash(prompt) + if key not in self._baseline_cache: + response = self._generate_response(prompt, skill_context=None) + self._baseline_response_cache[key] = response + self._baseline_cache[key] = run_all_assertions(response, expectations) + return self._baseline_cache[key], self._baseline_response_cache[key] + + def _run_llm_judges( + self, response: str, expectations: dict, prompt: str, reference: str, + ) -> list[dict]: + """Run MLflow LLM judges and return NL feedback dicts. + + Runs up to 6 judges for comprehensive evaluation: + 1. RelevanceToQuery — always (no ground truth needed) + 2. 
Completeness — always (no ground truth needed) + 3. Correctness — when expected_facts present + 4. Guidelines — when per-test guidelines present + 5. ExpectationsGuidelines — when BOTH facts + guidelines present + 6. Custom skill judge — MemAlign-inspired domain-specific judge + + Each judge returns {judge, passed, rationale} for GEPA reflection. + """ + results = [] + + inputs_dict = {"prompt": prompt} + outputs_dict = {"response": response} + + facts = expectations.get("expected_facts", []) + guidelines = expectations.get("guidelines", []) + + # 1. RelevanceToQuery: does response address the user's input? + # Always runs — catches off-topic or confused responses that binary + # assertions can't detect. + try: + from mlflow.genai.scorers import RelevanceToQuery + judge = RelevanceToQuery() + fb = judge(inputs=inputs_dict, outputs=outputs_dict) + results.append({ + "judge": "relevance", + "passed": fb.value == "yes", + "rationale": fb.rationale or "", + }) + except Exception as e: + logger.debug("RelevanceToQuery judge failed: %s", e) + + # 2. Completeness: does response fully answer all parts of the prompt? + # Always runs — catches partial answers where a response mentions the + # right topic but skips sub-questions. + try: + from mlflow.genai.scorers import Completeness + judge = Completeness() + fb = judge(inputs=inputs_dict, outputs=outputs_dict) + results.append({ + "judge": "completeness", + "passed": fb.value == "yes", + "rationale": fb.rationale or "", + }) + except Exception as e: + logger.debug("Completeness judge failed: %s", e) + + # 3. Correctness: are expected facts present in the response? 
+ if facts and reference: + try: + from mlflow.genai.scorers import Correctness + judge = Correctness() + fb = judge( + inputs=inputs_dict, + outputs=outputs_dict, + expectations={"expected_facts": facts}, + ) + results.append({ + "judge": "correctness", + "passed": fb.value == "yes", + "rationale": fb.rationale or "", + }) + except Exception as e: + logger.debug("Correctness judge failed: %s", e) + + # 4. Guidelines: per-test custom evaluation rules + if guidelines: + try: + from mlflow.genai.scorers import Guidelines + judge = Guidelines( + name="skill_guidelines", guidelines=guidelines, + ) + fb = judge(inputs=inputs_dict, outputs=outputs_dict) + results.append({ + "judge": "guidelines", + "passed": fb.value == "yes", + "rationale": fb.rationale or "", + }) + except Exception as e: + logger.debug("Guidelines judge failed: %s", e) + + # 5. ExpectationsGuidelines: combined facts + guidelines in one pass. + # When both are available, this judge evaluates them together and + # produces a single holistic rationale — often more insightful than + # separate Correctness + Guidelines calls. + if facts and guidelines: + try: + from mlflow.genai.scorers import ExpectationsGuidelines + judge = ExpectationsGuidelines( + name="expectations_guidelines", + guidelines=guidelines, + ) + fb = judge( + inputs=inputs_dict, + outputs=outputs_dict, + expectations={"expected_facts": facts}, + ) + results.append({ + "judge": "expectations_guidelines", + "passed": fb.value == "yes", + "rationale": fb.rationale or "", + }) + except Exception as e: + logger.debug("ExpectationsGuidelines judge failed: %s", e) + + # 6. Custom skill judge (MemAlign-inspired): uses domain-specific + # evaluation principles extracted from the skill's ground_truth + # guidelines. This mimics MemAlign's "semantic memory" — a set of + # generalizable principles learned from labeled examples — without + # requiring MLflow trace alignment infrastructure. 
+ if self._skill_guidelines: + try: + self._run_custom_skill_judge( + results, prompt, response, facts, + ) + except Exception as e: + logger.debug("Custom skill judge failed: %s", e) + + return results + + def _run_custom_skill_judge( + self, + results: list[dict], + prompt: str, + response: str, + facts: list[str], + ) -> None: + """Run a MemAlign-inspired custom judge with skill-domain principles. + + Uses ``mlflow.genai.judges.make_judge`` to create a domain-specific + judge whose instructions incorporate evaluation principles extracted + from all ground_truth.yaml guidelines across the skill's test cases. + + This is the "semantic memory" component of the MemAlign approach: + rather than aligning from traces, we extract and deduplicate the + skill's evaluation principles upfront and inject them as judge + instructions. + + Appends result dicts directly to ``results``. + """ + from mlflow.genai.judges import make_judge + + # Build instruction prompt from collected skill guidelines + principles = "\n".join( + f"- {g}" for g in self._skill_guidelines + ) + + judge = make_judge( + name="skill_domain_judge", + instructions=( + "You are an expert evaluator for a Databricks skill. " + "Evaluate whether the response correctly follows the " + "domain-specific principles below.\n\n" + "## Domain Principles (from skill evaluation criteria)\n" + f"{principles}\n\n" + "## Evaluation\n" + "Given the user question and response, determine if the " + "response adheres to the domain principles above. 
Focus on " + "technical accuracy, correct API usage, and completeness " + "of the domain-specific guidance.\n\n" + "Question: {{ inputs.prompt }}\n" + "Response: {{ outputs.response }}" + ), + feedback_value_type=bool, + ) + + fb = judge( + inputs={"prompt": prompt}, + outputs={"response": response}, + ) + results.append({ + "judge": "skill_domain", + "passed": fb.value == "yes" if isinstance(fb.value, str) else bool(fb.value), + "rationale": fb.rationale or "", + }) + + def __call__( + self, candidate: dict[str, str], example: dict, + ) -> tuple[float, dict]: + """Evaluate a candidate skill against a single task example. + + GEPA-compatible signature: (candidate, example) -> (score, side_info) + """ + skill_md = candidate.get("skill_md", "") + + # Build combined context: skill + tool descriptions + tool_parts = [] + for key in sorted(candidate): + if key.startswith("tools_"): + tool_parts.append(candidate[key]) + + full_context = skill_md + if tool_parts: + full_context += "\n\n## Available MCP Tools\n\n" + "\n\n".join(tool_parts) + + prompt = example.get("input", "") + + # Decode expectations + expectations: dict[str, Any] = {} + expectations_json = example.get("additional_context", {}).get("expectations", "") + if expectations_json: + try: + expectations = json.loads(expectations_json) + except (json.JSONDecodeError, TypeError): + pass + + # If no prompt or no expectations, return minimal score + if not prompt or not expectations: + return 0.0, {"_error": "No prompt or expectations for this task"} + + # Phase 1: WITH skill + tools + with_response = self._generate_response(prompt, skill_context=full_context) + with_results = run_all_assertions(with_response, expectations) + + # Phase 2: WITHOUT skill (cached) + without_results, without_response = self._get_baseline(prompt, expectations) + + # Phase 3: Compute scores + total = len(with_results) + if total == 0: + return 0.0, {"_error": "No assertions to evaluate"} + + pass_with = sum(r.passed for r in 
with_results) / total + pass_without = sum(r.passed for r in without_results) / total + effectiveness = pass_with - pass_without + + # LLM judge scoring (optional) + judge_results = [] + if self._use_judges: + judge_results = self._run_llm_judges( + with_response, expectations, prompt, + reference=example.get("answer", ""), + ) + + judge_pass_rate = 1.0 # default when judges disabled + if judge_results: + judge_pass_rate = sum(j["passed"] for j in judge_results) / len(judge_results) + + # Structure validation on the skill itself + structure = _run_structure_scorers(skill_md) if skill_md else 1.0 + + # Token efficiency scoring + from .evaluator import count_tokens + total_candidate_tokens = sum(count_tokens(v) for v in candidate.values()) + + if self._total_original_tokens > 0: + ratio = total_candidate_tokens / self._total_original_tokens + if ratio <= 1.0: + efficiency = 1.0 + 0.15 * (1.0 - ratio) # Bonus for smaller + else: + efficiency = max(0.0, 2.0 - ratio) # Penalty for growth + + # Hard penalty if over explicit budget + if self._token_budget and total_candidate_tokens > self._token_budget: + over_ratio = total_candidate_tokens / self._token_budget + efficiency = min(efficiency, max(0.0, 2.0 - over_ratio)) + else: + efficiency = 1.0 + + # Weighted final score + # When judges enabled: steal 10% from effectiveness for judge_quality + # When judges disabled: 10% folds back into effectiveness (original weights) + if self._use_judges and judge_results: + final_score = ( + 0.35 * max(0.0, effectiveness) + + 0.25 * pass_with + + 0.10 * judge_pass_rate + + 0.05 * structure + + 0.25 * efficiency + ) + else: + final_score = ( + 0.45 * max(0.0, effectiveness) + + 0.25 * pass_with + + 0.05 * structure + + 0.25 * efficiency + ) + + # Build side info via skillbench_to_asi + score_breakdown = { + "skill_effectiveness": effectiveness, + "pass_rate_with": pass_with, + "pass_rate_without": pass_without, + "structure": structure, + "token_efficiency": efficiency, + "final": 
final_score, + } + if self._use_judges and judge_results: + score_breakdown["judge_quality"] = judge_pass_rate + + reference_answer = example.get("answer", "") + + side_info = skillbench_to_asi( + with_results, + without_results, + task_prompt=prompt, + scores=score_breakdown, + with_response=with_response, + without_response=without_response, + reference_answer=reference_answer or None, + candidate=candidate, + ) + + # Feed judge rationale into side_info for GEPA reflection + if judge_results: + failing_judges = [j for j in judge_results if not j["passed"]] + if failing_judges: + rationale_lines = [ + f"{j['judge']}: {j['rationale'][:200]}" for j in failing_judges + ] + side_info["Judge_feedback"] = "\n".join(rationale_lines) + + # Also route to skill_md_specific_info for component targeting + if "skill_md_specific_info" not in side_info: + side_info["skill_md_specific_info"] = {} + side_info["skill_md_specific_info"]["Judge_analysis"] = "\n".join(rationale_lines) + + # Add token counts to side_info for GEPA Pareto tracking + side_info["token_counts"] = { + "candidate_total": total_candidate_tokens, + "original_total": self._total_original_tokens, + } + if self._token_budget: + side_info["token_counts"]["budget"] = self._token_budget + + return final_score, side_info + + +def _collect_skill_guidelines(skill_name: str) -> list[str]: + """Collect and deduplicate all guidelines from a skill's ground_truth.yaml. + + These form the "semantic memory" for the MemAlign-inspired custom judge. + Returns a deduplicated list of guideline strings. 
+ """ + from pathlib import Path + import yaml + + gt_path = Path(".test/skills") / skill_name / "ground_truth.yaml" + if not gt_path.exists(): + return [] + + try: + with open(gt_path) as f: + data = yaml.safe_load(f) or {} + except Exception: + return [] + + seen: set[str] = set() + guidelines: list[str] = [] + for tc in data.get("test_cases", []): + for g in tc.get("expectations", {}).get("guidelines", []): + g_norm = g.strip() + if g_norm and g_norm not in seen: + seen.add(g_norm) + guidelines.append(g_norm) + + return guidelines + + +def create_skillbench_evaluator( + skill_name: str, + gen_model: str, + original_token_counts: dict[str, int] | None = None, + token_budget: int | None = None, + use_judges: bool = False, +) -> Callable: + """Factory for SkillBench-style evaluator. + + Returns a GEPA-compatible callable: (candidate, example) -> (score, side_info) + + When ``use_judges`` is enabled, collects all guidelines from the skill's + ground_truth.yaml to build a MemAlign-inspired custom domain judge + alongside the standard MLflow predefined judges. + + Args: + skill_name: Name of the skill being evaluated. + gen_model: LLM model for generating responses. Required. + original_token_counts: Token counts of original artifacts for efficiency scoring. + token_budget: Hard token ceiling; candidates exceeding this are penalized. + use_judges: Enable MLflow LLM judges for NL feedback. 
+ """ + skill_guidelines: list[str] = [] + if use_judges: + skill_guidelines = _collect_skill_guidelines(skill_name) + if skill_guidelines: + logger.info( + "Loaded %d domain guidelines for custom skill judge", + len(skill_guidelines), + ) + + return SkillBenchEvaluator( + gen_model=gen_model, + original_token_counts=original_token_counts, + token_budget=token_budget, + use_judges=use_judges, + skill_guidelines=skill_guidelines, + ) + + +def build_skillbench_background( + skill_name: str, + original_token_count: int, + component_names: list[str] | None = None, + baseline_scores: dict[str, float] | None = None, + baseline_side_info: dict[str, dict] | None = None, + token_budget: int | None = None, + use_judges: bool = False, +) -> str: + """Build concise GEPA reflection context for SkillBench optimization. + + Kept short so GEPA's reflection LM spends its context on the per-example + diagnostics (Error/Expected/Actual) rather than methodology. + """ + # Concise per-task baseline summary + baseline_desc = "" + if baseline_scores: + mean_score = sum(baseline_scores.values()) / len(baseline_scores) + baseline_desc = f"\nBASELINE: mean {mean_score:.3f} across {len(baseline_scores)} tasks." 
+ + if baseline_side_info: + needs_skill_ids = [] + regression_ids = [] + for tid, info in baseline_side_info.items(): + error = info.get("Error", "") + if "NEEDS_SKILL" in error: + needs_skill_ids.append(tid) + if "REGRESSION" in error: + regression_ids.append(tid) + if needs_skill_ids: + baseline_desc += ( + f"\n NEEDS_SKILL ({len(needs_skill_ids)} tasks): " + f"{', '.join(needs_skill_ids[:5])}" + ) + if regression_ids: + baseline_desc += ( + f"\n REGRESSION ({len(regression_ids)} tasks): " + f"{', '.join(regression_ids[:5])}" + ) + + components_desc = "" + if component_names and any(c.startswith("tools_") for c in component_names): + tool_modules = [c.replace("tools_", "") for c in component_names if c.startswith("tools_")] + components_desc = ( + f"\nAlso optimizing MCP tool descriptions for: {', '.join(tool_modules)}. " + "Tool descriptions are included in the agent's context alongside the skill. " + "The agent uses them to decide which tools to call and how. " + "Keep docstrings accurate and concise — every token counts toward the budget." + ) + + # Token efficiency guidance + token_desc = ( + f"\nTOKEN EFFICIENCY (25% of score): Current artifacts total {original_token_count:,} tokens. " + "Smaller candidates score HIGHER. Remove redundant examples, consolidate " + "overlapping sections, eliminate verbose explanations. Be ruthlessly concise." + ) + if token_budget: + token_desc += ( + f"\nTOKEN BUDGET: {token_budget:,} tokens. Candidates exceeding this " + "are heavily penalized. Stay well under the budget." + ) + + judge_desc = "" + if use_judges: + judge_desc = ( + "\nLLM JUDGES (10% of score): Six judges provide natural-language feedback " + "in 'Judge_feedback': RelevanceToQuery (on-topic?), Completeness (fully " + "answered?), Correctness (facts present?), Guidelines (rules followed?), " + "ExpectationsGuidelines (combined holistic check), and a custom skill " + "domain judge (MemAlign-inspired, evaluates domain-specific principles). 
" + "Use judge rationale to understand WHY responses fail — it's more " + "actionable than binary NEEDS_SKILL/REGRESSION labels." + ) + + return ( + f"You are refining SKILL.md for '{skill_name}'.\n" + "The skill is scored by how much it HELPS an agent answer correctly.\n" + "Assertions labeled NEEDS_SKILL = add this content. REGRESSION = simplify or remove.\n" + "Focus on: specific API syntax, version requirements, non-obvious patterns.\n" + "Do NOT add generic knowledge the agent already has (NEUTRAL assertions)." + f"{baseline_desc}" + f"{components_desc}" + f"{token_desc}" + f"{judge_desc}" + ) diff --git a/.test/src/skill_test/scorers/universal.py b/.test/src/skill_test/scorers/universal.py index 3b839bf4..40e2a8b2 100644 --- a/.test/src/skill_test/scorers/universal.py +++ b/.test/src/skill_test/scorers/universal.py @@ -125,20 +125,20 @@ def no_hallucinated_apis(outputs: Dict[str, Any]) -> Feedback: @scorer -def expected_facts_present(outputs: Dict[str, Any], expectations: Dict[str, Any]) -> Feedback: - """Check if expected facts are mentioned in response.""" +def expected_facts_present(outputs: Dict[str, Any], expectations: Dict[str, Any]) -> List[Feedback]: + """Check if expected facts are mentioned in response (per-fact granularity).""" response = outputs.get("response", "").lower() expected_facts = expectations.get("expected_facts", []) if not expected_facts: - return Feedback(name="expected_facts", value="skip", rationale="No expected_facts defined") + return [Feedback(name="expected_facts", value="skip", rationale="No expected_facts defined")] - missing = [] + feedbacks = [] for fact in expected_facts: - if fact.lower() not in response: - missing.append(fact) - - if missing: - return Feedback(name="expected_facts", value="no", rationale=f"Missing facts: {missing}") - - return Feedback(name="expected_facts", value="yes", rationale=f"All {len(expected_facts)} expected facts present") + found = fact.lower() in response + feedbacks.append(Feedback( + 
name=f"fact_{fact[:40]}", + value="yes" if found else "no", + rationale=f"{'Found' if found else 'Missing'}: {fact}", + )) + return feedbacks diff --git a/databricks-mcp-server/databricks_mcp_server/tools/manifest.py b/databricks-mcp-server/databricks_mcp_server/tools/manifest.py index 1adc30c5..500a77a7 100644 --- a/databricks-mcp-server/databricks_mcp_server/tools/manifest.py +++ b/databricks-mcp-server/databricks_mcp_server/tools/manifest.py @@ -30,7 +30,8 @@ def _delete_from_databricks(resource_type: str, resource_id: str) -> Optional[st @mcp.tool def list_tracked_resources(type: Optional[str] = None) -> Dict[str, Any]: - """List resources tracked in the project manifest. + """ + List resources tracked in the project manifest. The manifest records every resource created through the MCP server (dashboards, jobs, pipelines, Genie spaces, KAs, MAS, schemas, volumes, etc.). @@ -60,7 +61,8 @@ def delete_tracked_resource( resource_id: str, delete_from_databricks: bool = False, ) -> Dict[str, Any]: - """Delete a resource from the project manifest, and optionally from Databricks. + """ + Delete a resource from the project manifest, and optionally from Databricks. Use this to clean up resources that were created during development/testing. diff --git a/databricks-mcp-server/databricks_mcp_server/tools/pipelines.py b/databricks-mcp-server/databricks_mcp_server/tools/pipelines.py index 26b91d8c..0a77c322 100644 --- a/databricks-mcp-server/databricks_mcp_server/tools/pipelines.py +++ b/databricks-mcp-server/databricks_mcp_server/tools/pipelines.py @@ -1,33 +1,24 @@ -"""Pipeline tools - Manage Spark Declarative Pipelines (SDP). 
+"""Pipeline tools - Manage Spark Declarative Pipelines (SDP).""" -Provides 4 workflow-oriented tools following the Lakebase pattern: -- create_or_update_pipeline: idempotent resource management -- get_pipeline: get details by name/ID, or list all -- delete_pipeline: delete by ID -- run_pipeline: start, stop, and wait for pipeline runs -""" - -import logging -from typing import List, Dict, Any, Optional +from typing import List, Dict, Any from databricks_tools_core.identity import get_default_tags from databricks_tools_core.spark_declarative_pipelines.pipelines import ( - create_or_update_pipeline as _create_or_update_pipeline, + create_pipeline as _create_pipeline, get_pipeline as _get_pipeline, + update_pipeline as _update_pipeline, delete_pipeline as _delete_pipeline, start_update as _start_update, get_update as _get_update, stop_pipeline as _stop_pipeline, get_pipeline_events as _get_pipeline_events, + create_or_update_pipeline as _create_or_update_pipeline, find_pipeline_by_name as _find_pipeline_by_name, - wait_for_pipeline_update as _wait_for_pipeline_update, ) from ..manifest import register_deleter from ..server import mcp -logger = logging.getLogger(__name__) - def _delete_pipeline_resource(resource_id: str) -> None: _delete_pipeline(pipeline_id=resource_id) @@ -36,192 +27,113 @@ def _delete_pipeline_resource(resource_id: str) -> None: register_deleter("pipeline", _delete_pipeline_resource) -# ============================================================================ -# Tool 1: create_or_update_pipeline -# ============================================================================ - - @mcp.tool -def create_or_update_pipeline( +def create_pipeline( name: str, root_path: str, catalog: str, schema: str, workspace_file_paths: List[str], - start_run: bool = False, - wait_for_completion: bool = False, - full_refresh: bool = True, - timeout: int = 1800, extra_settings: Dict[str, Any] = None, ) -> Dict[str, Any]: """ - Create a new pipeline or update an 
existing one with the same name. - - This is the main tool for pipeline resource management. It: - 1. Searches for an existing pipeline with the same name (or uses 'id' from extra_settings) - 2. Creates a new pipeline or updates the existing one - 3. Optionally starts a run and waits for completion - - For more granular run control (stop, selective refresh, validate-only), - use run_pipeline() separately. - - Uses Unity Catalog and serverless compute by default. + Create a new Spark Declarative Pipeline (Unity Catalog, serverless by default). Args: - name: Pipeline name (used for lookup and creation) + name: Pipeline name root_path: Root folder for source code (added to Python sys.path for imports) - catalog: Unity Catalog name for output tables + catalog: Unity Catalog name schema: Schema name for output tables workspace_file_paths: List of workspace file paths (raw .sql or .py files) - start_run: If True, start a pipeline run after create/update (default: False) - wait_for_completion: If True and start_run=True, wait for the run to finish (default: False) - full_refresh: If True, performs full refresh when starting (default: True) - timeout: Maximum wait time in seconds when wait_for_completion=True (default: 1800) - extra_settings: Optional dict with additional pipeline settings. Supports all SDK - options: clusters, continuous, development, photon, edition, channel, event_log, - configuration, notifications, tags, serverless, etc. - If 'id' is provided, the pipeline will be updated instead of created. - Explicit parameters (name, root_path, catalog, schema) take precedence. + extra_settings: Optional dict with additional pipeline settings (clusters, + continuous, development, photon, edition, channel, event_log, configuration, + notifications, tags, serverless, etc.). Explicit parameters take precedence. 
Returns: - Dictionary with: - - pipeline_id: The pipeline ID - - pipeline_name: The pipeline name - - created: True if newly created, False if updated - - success: True if operation succeeded - - message: Human-readable status message - - Example: - >>> create_or_update_pipeline( - ... name="my_pipeline", - ... root_path="/Workspace/project", - ... catalog="my_catalog", - ... schema="my_schema", - ... workspace_file_paths=["/Workspace/project/pipeline.py"] - ... ) - >>> create_or_update_pipeline( - ... name="my_pipeline", ..., - ... start_run=True, wait_for_completion=True - ... ) + Dictionary with pipeline_id of the created pipeline. """ # Auto-inject default tags into extra_settings; user tags take precedence extra_settings = extra_settings or {} extra_settings.setdefault("tags", {}) extra_settings["tags"] = {**get_default_tags(), **extra_settings["tags"]} - result = _create_or_update_pipeline( + result = _create_pipeline( name=name, root_path=root_path, catalog=catalog, schema=schema, workspace_file_paths=workspace_file_paths, - start_run=start_run, - wait_for_completion=wait_for_completion, - full_refresh=full_refresh, - timeout=timeout, extra_settings=extra_settings, ) - # Track resource on successful create/update + # Track resource on successful create try: - result_dict = result.to_dict() - pipeline_id = result_dict.get("pipeline_id") - if pipeline_id: + if result.pipeline_id: from ..manifest import track_resource track_resource( resource_type="pipeline", name=name, - resource_id=pipeline_id, + resource_id=result.pipeline_id, ) except Exception: pass # best-effort tracking - return result.to_dict() - - -# ============================================================================ -# Tool 2: get_pipeline -# ============================================================================ + return {"pipeline_id": result.pipeline_id} @mcp.tool -def get_pipeline( - pipeline_id: Optional[str] = None, - name: Optional[str] = None, -) -> Dict[str, Any]: +def 
get_pipeline(pipeline_id: str) -> Dict[str, Any]: """ - Get pipeline details, or list all pipelines. - - Pass pipeline_id or name to get one pipeline's details enriched with - latest update status and recent events. Omit both to list all pipelines. + Get pipeline details and configuration. Args: - pipeline_id: Pipeline ID. Takes precedence over name. - name: Pipeline name. Used to look up pipeline ID if pipeline_id not provided. + pipeline_id: Pipeline ID Returns: - Single pipeline dict with enriched details (if ID/name provided), - or {"pipelines": [...]} when listing all. - - Example: - >>> get_pipeline(pipeline_id="abc-123") - {"pipeline_id": "abc-123", "name": "my_pipeline", "state": "IDLE", ...} - >>> get_pipeline(name="my_pipeline") - {"pipeline_id": "abc-123", "name": "my_pipeline", ...} - >>> get_pipeline() - {"pipelines": [{"pipeline_id": "abc-123", "name": "my_pipeline", ...}]} + Dictionary with pipeline configuration and state. """ - # Resolve name to pipeline_id if needed - if not pipeline_id and name: - pipeline_id = _find_pipeline_by_name(name=name) - if not pipeline_id: - return {"error": f"Pipeline '{name}' not found."} - - if pipeline_id: - result = _get_pipeline(pipeline_id=pipeline_id) - pipeline_dict = result.as_dict() if hasattr(result, "as_dict") else vars(result) - - # Enrich with latest update status - try: - latest_updates = pipeline_dict.get("latest_updates", []) - if latest_updates: - latest = latest_updates[0] - update_id = latest.get("update_id") - if update_id: - update_result = _get_update(pipeline_id=pipeline_id, update_id=update_id) - update_dict = update_result.as_dict() if hasattr(update_result, "as_dict") else vars(update_result) - pipeline_dict["latest_update_status"] = update_dict - except Exception: - pass - - # Enrich with recent events - try: - events = _get_pipeline_events(pipeline_id=pipeline_id, max_results=10) - pipeline_dict["recent_events"] = [e.as_dict() if hasattr(e, "as_dict") else vars(e) for e in events] - except 
Exception: - pass - - return pipeline_dict - - # List all pipelines - from databricks_tools_core.auth import get_workspace_client - - w = get_workspace_client() - pipelines = [] - try: - for p in w.pipelines.list_pipelines(): - entry = p.as_dict() if hasattr(p, "as_dict") else vars(p) - pipelines.append(entry) - except Exception as e: - return {"error": f"Failed to list pipelines: {e}"} + result = _get_pipeline(pipeline_id=pipeline_id) + return result.as_dict() if hasattr(result, "as_dict") else vars(result) - return {"pipelines": pipelines} +@mcp.tool +def update_pipeline( + pipeline_id: str, + name: str = None, + root_path: str = None, + catalog: str = None, + schema: str = None, + workspace_file_paths: List[str] = None, + extra_settings: Dict[str, Any] = None, +) -> Dict[str, str]: + """ + Update pipeline configuration. + + Args: + pipeline_id: Pipeline ID + name: New pipeline name + root_path: New root folder for source code + catalog: New catalog name + schema: New schema name + workspace_file_paths: New list of file paths (raw .sql or .py files) + extra_settings: Optional dict with additional pipeline settings (clusters, + continuous, development, photon, edition, channel, event_log, configuration, + notifications, tags, serverless, etc.). Explicit parameters take precedence. -# ============================================================================ -# Tool 3: delete_pipeline -# ============================================================================ + Returns: + Dictionary with status message. 
+ """ + _update_pipeline( + pipeline_id=pipeline_id, + name=name, + root_path=root_path, + catalog=catalog, + schema=schema, + workspace_file_paths=workspace_file_paths, + extra_settings=extra_settings, + ) + return {"status": "updated"} @mcp.tool @@ -245,59 +157,27 @@ def delete_pipeline(pipeline_id: str) -> Dict[str, str]: return {"status": "deleted"} -# ============================================================================ -# Tool 4: run_pipeline -# ============================================================================ - - @mcp.tool -def run_pipeline( +def start_update( pipeline_id: str, + refresh_selection: List[str] = None, full_refresh: bool = False, - refresh_selection: Optional[List[str]] = None, - full_refresh_selection: Optional[List[str]] = None, + full_refresh_selection: List[str] = None, validate_only: bool = False, - wait_for_completion: bool = False, - timeout: int = 1800, - stop: bool = False, -) -> Dict[str, Any]: +) -> Dict[str, str]: """ - Start, stop, or monitor a pipeline run. - - Set stop=True to stop a running pipeline. Otherwise starts an update. - Optionally waits for the run to complete with timeout. + Start a pipeline update or dry-run validation. 
Args: pipeline_id: Pipeline ID - full_refresh: If True, performs full refresh of all tables refresh_selection: List of table names to refresh + full_refresh: If True, performs full refresh of all tables full_refresh_selection: List of table names for full refresh validate_only: If True, validates without updating data (dry run) - wait_for_completion: If True, wait for run to complete (default: False) - timeout: Maximum wait time in seconds (default: 1800 = 30 minutes) - stop: If True, stop the currently running pipeline instead of starting Returns: - Dictionary with: - - update_id: Update ID (if started) - - state: Final state (if waited) - - success: True if completed successfully - - duration_seconds: Time taken (if waited) - - errors: List of error details (if failed) - - status: "stopped" (if stop=True) - - Example: - >>> run_pipeline(pipeline_id="abc-123", full_refresh=True) - {"update_id": "xyz-456", "status": "started"} - >>> run_pipeline(pipeline_id="abc-123", stop=True) - {"status": "stopped"} - >>> run_pipeline(pipeline_id="abc-123", wait_for_completion=True, timeout=600) - {"update_id": "xyz-456", "state": "COMPLETED", "success": True, ...} + Dictionary with update_id for polling status. """ - if stop: - _stop_pipeline(pipeline_id=pipeline_id) - return {"pipeline_id": pipeline_id, "status": "stopped"} - update_id = _start_update( pipeline_id=pipeline_id, refresh_selection=refresh_selection, @@ -305,55 +185,188 @@ def run_pipeline( full_refresh_selection=full_refresh_selection, validate_only=validate_only, ) + return {"update_id": update_id} + + +@mcp.tool +def get_update(pipeline_id: str, update_id: str) -> Dict[str, Any]: + """ + Get pipeline update status and results. + + Args: + pipeline_id: Pipeline ID + update_id: Update ID from start_update + + Returns: + Dictionary with update status (QUEUED, RUNNING, COMPLETED, FAILED, etc.) 
+ """ + result = _get_update(pipeline_id=pipeline_id, update_id=update_id) + return result.as_dict() if hasattr(result, "as_dict") else vars(result) - result: Dict[str, Any] = { - "pipeline_id": pipeline_id, - "update_id": update_id, - "status": "started", - } - if not wait_for_completion: - result["message"] = ( - f"Pipeline update started. Use get_pipeline(pipeline_id='{pipeline_id}') " - f"to check status, or run_pipeline with wait_for_completion=True to wait." +@mcp.tool +def stop_pipeline(pipeline_id: str) -> Dict[str, str]: + """ + Stop a running pipeline. + + Args: + pipeline_id: Pipeline ID + + Returns: + Dictionary with status message. + """ + _stop_pipeline(pipeline_id=pipeline_id) + return {"status": "stopped"} + + +@mcp.tool +def get_pipeline_events( + pipeline_id: str, + max_results: int = 100, +) -> List[Dict[str, Any]]: + """ + Get pipeline events, issues, and error messages. + + Use this to debug pipeline failures. + + Args: + pipeline_id: Pipeline ID + max_results: Maximum number of events to return (default: 100) + + Returns: + List of event dictionaries with error details. + """ + events = _get_pipeline_events(pipeline_id=pipeline_id, max_results=max_results) + return [e.as_dict() if hasattr(e, "as_dict") else vars(e) for e in events] + + +@mcp.tool +def create_or_update_pipeline( + name: str, + root_path: str, + catalog: str, + schema: str, + workspace_file_paths: List[str], + start_run: bool = False, + wait_for_completion: bool = False, + full_refresh: bool = True, + timeout: int = 1800, + extra_settings: Dict[str, Any] = None, +) -> Dict[str, Any]: + """ + Create a new pipeline or update an existing one with the same name. + + This is the main tool for pipeline management. It: + 1. Searches for an existing pipeline with the same name (or uses 'id' from extra_settings) + 2. Creates a new pipeline or updates the existing one + 3. Optionally starts a pipeline run with full refresh + 4. 
Optionally waits for the run to complete and returns detailed results + + Uses Unity Catalog and serverless compute by default. + + Args: + name: Pipeline name (used for lookup and creation) + root_path: Root folder for source code (added to Python sys.path for imports) + catalog: Unity Catalog name for output tables + schema: Schema name for output tables + workspace_file_paths: List of workspace file paths (raw .sql or .py files) + start_run: If True, start a pipeline update after create/update (default: False) + wait_for_completion: If True, wait for run to complete (default: False) + full_refresh: If True, perform full refresh when starting (default: True) + timeout: Maximum wait time in seconds (default: 1800 = 30 minutes) + extra_settings: Optional dict with additional pipeline settings. Supports all SDK + options: clusters, continuous, development, photon, edition, channel, event_log, + configuration, notifications, tags, serverless, etc. + If 'id' is provided, the pipeline will be updated instead of created. + Explicit parameters (name, root_path, catalog, schema) take precedence. + + Returns: + Dictionary with detailed status: + - pipeline_id: The pipeline ID + - pipeline_name: The pipeline name + - created: True if newly created, False if updated + - success: True if all operations succeeded + - state: Final state if run was started (COMPLETED, FAILED, etc.) + - duration_seconds: Time taken if waited + - error_message: Error message if failed + - errors: List of detailed errors if failed + - message: Human-readable status message + + Example usage: + # Just create/update the pipeline + create_or_update_pipeline(name="my_pipeline", ...) 
+ + # Create/update and run immediately + create_or_update_pipeline(name="my_pipeline", ..., start_run=True) + + # Create/update, run, and wait for completion + create_or_update_pipeline( + name="my_pipeline", ..., + start_run=True, + wait_for_completion=True ) - return result - try: - wait_result = _wait_for_pipeline_update( - pipeline_id=pipeline_id, - update_id=update_id, - timeout=timeout, + # Create with custom settings (non-serverless, development mode) + create_or_update_pipeline( + name="my_pipeline", ..., + extra_settings={ + "serverless": False, + "development": True, + "clusters": [{"label": "default", "num_workers": 2}] + } ) - result["state"] = wait_result["state"] - result["success"] = wait_result["success"] - result["duration_seconds"] = wait_result["duration_seconds"] - result["status"] = "completed" if wait_result["success"] else "failed" - - if not wait_result["success"]: - result["errors"] = wait_result.get("errors", []) - if result["errors"]: - first_error = result["errors"][0] - error_msg = first_error.get("message", "") - if first_error.get("exceptions"): - exc = first_error["exceptions"][0] - error_msg = exc.get("message", error_msg) - result["error_message"] = error_msg - result["message"] = ( - f"Pipeline run failed with state: {result['state']}. " - f"Use get_pipeline(pipeline_id='{pipeline_id}') for full details." 
+ """ + # Auto-inject default tags into extra_settings; user tags take precedence + extra_settings = extra_settings or {} + extra_settings.setdefault("tags", {}) + extra_settings["tags"] = {**get_default_tags(), **extra_settings["tags"]} + + result = _create_or_update_pipeline( + name=name, + root_path=root_path, + catalog=catalog, + schema=schema, + workspace_file_paths=workspace_file_paths, + start_run=start_run, + wait_for_completion=wait_for_completion, + full_refresh=full_refresh, + timeout=timeout, + extra_settings=extra_settings, + ) + + # Track resource on successful create/update + try: + result_dict = result.to_dict() + pipeline_id = result_dict.get("pipeline_id") + if pipeline_id: + from ..manifest import track_resource + + track_resource( + resource_type="pipeline", + name=name, + resource_id=pipeline_id, ) - else: - result["message"] = f"Pipeline completed successfully in {result['duration_seconds']}s." - - except TimeoutError as e: - result["state"] = "TIMEOUT" - result["success"] = False - result["status"] = "timeout" - result["error_message"] = str(e) - result["message"] = ( - f"Pipeline run timed out after {timeout}s. The pipeline may still be running. " - f"Use get_pipeline(pipeline_id='{pipeline_id}') to check status." - ) + except Exception: + pass # best-effort tracking - return result + return result.to_dict() + + +@mcp.tool +def find_pipeline_by_name(name: str) -> Dict[str, Any]: + """ + Find a pipeline by name and return its ID. 
+ + Args: + name: Pipeline name to search for + + Returns: + Dictionary with: + - found: True if pipeline exists + - pipeline_id: Pipeline ID if found, None otherwise + """ + pipeline_id = _find_pipeline_by_name(name=name) + return { + "found": pipeline_id is not None, + "pipeline_id": pipeline_id, + } diff --git a/databricks-mcp-server/databricks_mcp_server/tools/sql.py b/databricks-mcp-server/databricks_mcp_server/tools/sql.py index 01355525..2be905d3 100644 --- a/databricks-mcp-server/databricks_mcp_server/tools/sql.py +++ b/databricks-mcp-server/databricks_mcp_server/tools/sql.py @@ -28,9 +28,8 @@ def execute_sql( If no warehouse_id is provided, automatically selects the best available warehouse. IMPORTANT: For creating or dropping schemas, catalogs, and volumes, use the - manage_uc_objects tool instead of SQL DDL. It handles resource tracking and - auto-tagging. Only use execute_sql for queries (SELECT, INSERT, UPDATE) and - table DDL (CREATE TABLE, DROP TABLE). + manage_uc_objects tool instead of SQL DDL. Only use execute_sql for queries + (SELECT, INSERT, UPDATE) and table DDL (CREATE TABLE, DROP TABLE). Args: sql_query: SQL query to execute @@ -67,9 +66,8 @@ def execute_sql_multi( in optimal order. Independent queries run in parallel. IMPORTANT: For creating or dropping schemas, catalogs, and volumes, use the - manage_uc_objects tool instead of SQL DDL. It handles resource tracking and - auto-tagging. Only use execute_sql/execute_sql_multi for queries (SELECT, - INSERT, UPDATE) and table DDL (CREATE TABLE, DROP TABLE). + manage_uc_objects tool instead of SQL DDL. Only use execute_sql/execute_sql_multi + for queries (SELECT, INSERT, UPDATE) and table DDL (CREATE TABLE, DROP TABLE). 
Args: sql_content: SQL content with multiple statements separated by ; From 50b7cdd7478fef528b4c4b3c39c0bbe41d9c9010 Mon Sep 17 00:00:00 2001 From: Natyra Date: Mon, 2 Mar 2026 21:25:15 +0100 Subject: [PATCH 09/20] add the parsing skill for parsing documents/custom rag --- databricks-skills/README.md | 1 + databricks-skills/databricks-parsing/SKILL.md | 275 ++++++++++++++++++ 2 files changed, 276 insertions(+) create mode 100644 databricks-skills/databricks-parsing/SKILL.md diff --git a/databricks-skills/README.md b/databricks-skills/README.md index ddc5b081..afaccd9d 100644 --- a/databricks-skills/README.md +++ b/databricks-skills/README.md @@ -36,6 +36,7 @@ cp -r ai-dev-kit/databricks-skills/databricks-agent-bricks .claude/skills/ - **databricks-agent-bricks** - Knowledge Assistants, Genie Spaces, Supervisor Agents - **databricks-genie** - Genie Spaces: create, curate, and query via Conversation API - **databricks-model-serving** - Deploy MLflow models and AI agents to endpoints +- **databricks-parsing** - Parse documents (PDF, DOCX, images) with ai_parse_document for custom RAG pipelines - **databricks-unstructured-pdf-generation** - Generate synthetic PDFs for RAG - **databricks-vector-search** - Vector similarity search for RAG and semantic search diff --git a/databricks-skills/databricks-parsing/SKILL.md b/databricks-skills/databricks-parsing/SKILL.md new file mode 100644 index 00000000..96b00b32 --- /dev/null +++ b/databricks-skills/databricks-parsing/SKILL.md @@ -0,0 +1,275 @@ +--- +name: databricks-parsing +description: "Parse documents (PDF, DOCX, PPTX, images) using ai_parse_document, or build custom RAG pipelines. Use when the user asks to parse documents or build a custom RAG." +--- + +# Databricks Document Parsing + +Parse unstructured documents into structured text using `ai_parse_document` — the foundation for document processing and custom RAG pipelines on Databricks. 
+ +## When to Use + +Use this skill when: +- Parsing PDFs, DOCX, PPTX, or images into text +- Extracting structured data from unstructured documents +- Building a custom RAG pipeline (parse → chunk → index → query) +- Ingesting documents from Unity Catalog Volumes for search or analysis + +## Overview + +`ai_parse_document` is a SQL AI function that extracts content from binary documents. It runs on serverless SQL warehouses and supports PDF, DOC/DOCX, PPT/PPTX, JPG/JPEG, and PNG. + +| Aspect | Detail | +|--------|--------| +| **Function** | `ai_parse_document(content)` or `ai_parse_document(content, options)` | +| **Input** | Binary document content (from `read_files` with `format => 'binaryFile'`) | +| **Output** | VARIANT with `document.pages[]`, `document.elements[]`, `metadata` | +| **Requirements** | Databricks Runtime 17.1+, Serverless SQL Warehouse | +| **Tool** | Use via `execute_sql` — no dedicated MCP tool needed | + +## Quick Start + +Parse all documents in a Volume: + +```sql +SELECT + path, + ai_parse_document(content) AS parsed +FROM read_files('/Volumes/catalog/schema/volume/docs/', format => 'binaryFile'); +``` + +## Common Patterns + +### Pattern 1: Parse with Options + +```sql +SELECT ai_parse_document( + content, + map( + 'version', '2.0', + 'imageOutputPath', '/Volumes/catalog/schema/volume/images/', + 'descriptionElementTypes', '*' + ) +) AS parsed +FROM read_files('/Volumes/catalog/schema/volume/invoices/', format => 'binaryFile'); +``` + +**Options:** + +| Key | Values | Description | +|-----|--------|-------------| +| `version` | `'2.0'` | Output schema version | +| `imageOutputPath` | Volume path | Save rendered page images | +| `descriptionElementTypes` | `''`, `'figure'`, `'*'` | AI-generated descriptions (default: `'*'` for all) | + +### Pattern 2: Parse + Extract Structured Data + +Combine `ai_parse_document` with `ai_query` to extract specific fields. 
+Use `transform()` + `try_cast()` to concatenate element text, then pass +the full text to `ai_query` with `returnType => 'STRING'`. + +```sql +WITH parsed_documents AS ( + SELECT + path, + ai_parse_document(content) AS parsed + FROM read_files('/Volumes/catalog/schema/volume/invoices/', format => 'binaryFile') +), +parsed_text AS ( + SELECT + path, + concat_ws('\n\n', + transform( + try_cast(parsed:document:elements AS ARRAY<VARIANT>), + element -> try_cast(element:content AS STRING) + ) + ) AS text + FROM parsed_documents + WHERE try_cast(parsed:error_status AS STRING) IS NULL +) +SELECT + path, + ai_query( + 'databricks-claude-sonnet-4', + concat( + 'Extract vendor name, invoice number, and total due from this document. ', + 'Return the result as a JSON object with keys: vendor, invoice_number, total_due. ', + text + ), + returnType => 'STRING' + ) AS structured_data +FROM parsed_text +WHERE text IS NOT NULL; +``` + +### Pattern 3: Custom RAG Pipeline + +End-to-end: parse documents → chunk text → store in Delta table → create Vector Search index. + +**Step 1 — Parse and chunk into a Delta table:** + +`ai_parse_document` returns a VARIANT. You must use `variant_get` with an explicit +`ARRAY<VARIANT>` cast before calling `explode`, since `explode()` does not accept +raw VARIANT values. 
+ +```sql +CREATE OR REPLACE TABLE catalog.schema.parsed_chunks AS +WITH parsed AS ( + SELECT + path, + ai_parse_document(content) AS doc + FROM read_files('/Volumes/catalog/schema/volume/docs/', format => 'binaryFile') +), +elements AS ( + SELECT + path, + explode(variant_get(doc, '$.document.elements', 'ARRAY<VARIANT>')) AS element + FROM parsed +) +SELECT + md5(concat(path, variant_get(element, '$.content', 'STRING'))) AS chunk_id, + path AS source_path, + variant_get(element, '$.content', 'STRING') AS content, + variant_get(element, '$.type', 'STRING') AS element_type, + current_timestamp() AS parsed_at +FROM elements +WHERE variant_get(element, '$.content', 'STRING') IS NOT NULL + AND length(trim(variant_get(element, '$.content', 'STRING'))) > 10; +``` + +**Step 1a (production) — Incremental parsing with Structured Streaming:** + +For production pipelines where new documents arrive over time, use Structured +Streaming with checkpoints for exactly-once processing. Each run processes only +new files (tracked via checkpoints), then stops with `trigger(availableNow=True)`. 
+ +See the official bundle example: +[databricks/bundle-examples/contrib/job_with_ai_parse_document](https://github.com/databricks/bundle-examples/tree/main/contrib/job_with_ai_parse_document) + +**Stage 1 — Parse raw documents (streaming):** + +```python +from pyspark.sql.functions import col, current_timestamp, expr + +files_df = ( + spark.readStream.format("binaryFile") + .option("pathGlobFilter", "*.{pdf,jpg,jpeg,png}") + .option("recursiveFileLookup", "true") + .load("/Volumes/catalog/schema/volume/docs/") +) + +parsed_df = ( + files_df + .repartition(8, expr("crc32(path) % 8")) + .withColumn("parsed", expr(""" + ai_parse_document(content, map( + 'version', '2.0', + 'descriptionElementTypes', '*' + )) + """)) + .withColumn("parsed_at", current_timestamp()) + .select("path", "parsed", "parsed_at") +) + +( + parsed_df.writeStream.format("delta") + .outputMode("append") + .option("checkpointLocation", "/Volumes/catalog/schema/checkpoints/01_parse") + .option("mergeSchema", "true") + .trigger(availableNow=True) + .toTable("catalog.schema.parsed_documents_raw") +) +``` + +**Stage 2 — Extract text from parsed VARIANT (streaming):** + +Uses `transform()` to extract element content from the VARIANT array, and +`try_cast` for safe access. Error rows are preserved but flagged. 
+ +```python +from pyspark.sql.functions import col, concat_ws, expr, lit, when + +parsed_stream = spark.readStream.format("delta").table("catalog.schema.parsed_documents_raw") + +text_df = ( + parsed_stream + .withColumn("text", + when( + expr("try_cast(parsed:error_status AS STRING)").isNotNull(), lit(None) + ).otherwise( + concat_ws("\n\n", expr(""" + transform( + try_cast(parsed:document:elements AS ARRAY<VARIANT>), + element -> try_cast(element:content AS STRING) + ) + """)) + ) + ) + .withColumn("error_status", expr("try_cast(parsed:error_status AS STRING)")) + .select("path", "text", "error_status", "parsed_at") +) + +( + text_df.writeStream.format("delta") + .outputMode("append") + .option("checkpointLocation", "/Volumes/catalog/schema/checkpoints/02_text") + .option("mergeSchema", "true") + .trigger(availableNow=True) + .toTable("catalog.schema.parsed_documents_text") +) +``` + +Key techniques from the official example: +- **`repartition` by file hash** — parallelizes `ai_parse_document` across workers +- **`trigger(availableNow=True)`** — processes all pending files then stops (batch-like) +- **Checkpoints** — exactly-once guarantee; no re-parsing on re-runs +- **`transform()` + `try_cast`** — safer than `explode` + `variant_get` for text extraction +- **Three-stage pipeline** — separate parse/text/structured stages with independent checkpoints + +**Step 1b — Enable Change Data Feed (required for Vector Search Delta Sync):** + +```sql +ALTER TABLE catalog.schema.parsed_chunks +SET TBLPROPERTIES (delta.enableChangeDataFeed = true); +``` + +**Step 2 — Create a Vector Search index and query it:** + +Use the **[databricks-vector-search](../databricks-vector-search/SKILL.md)** skill to create a +Delta Sync index on the chunked table and query it. Ensure CDF is enabled first +(Step 1b above). 
+ +## Output Schema + +`ai_parse_document` returns a VARIANT with this structure: + +``` +document +├── pages[] -- page id, image_uri +└── elements[] -- extracted content + ├── type -- "text", "table", "figure", etc. + ├── content -- extracted text + ├── bbox -- bounding box coordinates + └── description -- AI-generated description +metadata -- file info, schema version +error_status[] -- errors per page (if any) +``` + +## Common Issues + +| Issue | Solution | +|-------|----------| +| **Function not available** | Requires Runtime 17.1+ and Serverless SQL Warehouse | +| **Region not supported** | US/EU regions, or enable cross-geography routing | +| **Large documents** | Use `LIMIT` during development to control costs | +| **`explode()` fails with VARIANT** | `explode()` requires ARRAY, not VARIANT. Use `variant_get(doc, '$.document.elements', 'ARRAY<VARIANT>')` to cast before exploding | +| **Short/noisy chunks** | Filter with `length(trim(...)) > 10` — parsing produces tiny fragments (page numbers, headers) that pollute the index | +| **`ai_query` returns markdown fences** | Use `returnType => 'STRING'` for clean output. 
If fences still appear, strip with `regexp_replace(result, '```(json)?\\s*|```', '')` | +| **Re-parsing unchanged documents** | Use Structured Streaming with checkpoints — see Pattern 3, Step 1a | + +## Related Skills + +- **[databricks-vector-search](../databricks-vector-search/SKILL.md)** — Create indexes and query embeddings (Step 2 of RAG) +- **[databricks-agent-bricks](../databricks-agent-bricks/SKILL.md)** — Pre-built Knowledge Assistants (out-of-the-box RAG without custom parsing) +- **[databricks-spark-declarative-pipelines](../databricks-spark-declarative-pipelines/SKILL.md)** — Production pipelines for batch document processing +- **[databricks-dbsql](../databricks-dbsql/SKILL.md)** — Full AI functions reference including `ai_query`, `ai_extract`, `ai_classify` From c0ca2c2d298f4184bd26f1b190d602cec1632503 Mon Sep 17 00:00:00 2001 From: Kevin Nguyen Date: Fri, 27 Feb 2026 10:53:22 -0600 Subject: [PATCH 10/20] feat: add lakehouse monitoring reference to Unity Catalog skill Add 7-lakehouse-monitoring.md reference file covering quality monitors, profile types (Snapshot, TimeSeries, InferenceLog), MCP tool usage, and Python SDK examples. Update SKILL.md with trigger condition and reference table entry. Tested against a live Databricks workspace - created and verified a snapshot monitor on a Unity Catalog table. 
--- .../7-lakehouse-monitoring.md | 229 ++++++++++++++++++ .../databricks-unity-catalog/SKILL.md | 2 + 2 files changed, 231 insertions(+) create mode 100644 databricks-skills/databricks-unity-catalog/7-lakehouse-monitoring.md diff --git a/databricks-skills/databricks-unity-catalog/7-lakehouse-monitoring.md b/databricks-skills/databricks-unity-catalog/7-lakehouse-monitoring.md new file mode 100644 index 00000000..01298d08 --- /dev/null +++ b/databricks-skills/databricks-unity-catalog/7-lakehouse-monitoring.md @@ -0,0 +1,229 @@ +# Lakehouse Monitoring + +Comprehensive reference for Lakehouse Monitoring: create quality monitors on Unity Catalog tables to track data profiles, detect drift, and monitor ML model performance. + +## Overview + +Lakehouse Monitoring automatically computes statistical profiles and drift metrics for tables over time. When you create a monitor, Databricks generates two output Delta tables (profile metrics + drift metrics) and an optional dashboard. + +| Component | Description | +|-----------|-------------| +| **Monitor** | Configuration attached to a UC table | +| **Profile Metrics Table** | Summary statistics computed per column | +| **Drift Metrics Table** | Statistical drift compared to baseline or previous time window | +| **Dashboard** | Auto-generated visualization of metrics | + +### Requirements + +- Unity Catalog enabled workspace +- Databricks SQL access +- Privileges: `USE CATALOG`, `USE SCHEMA`, `SELECT`, and `MANAGE` on the table +- Only Delta tables supported (managed, external, views, materialized views, streaming tables) + +--- + +## Profile Types + +| Type | Use Case | Key Params | Limitations | +|------|----------|------------|-------------| +| **Snapshot** | General-purpose tables without time column | None required | Max 4TB table size | +| **TimeSeries** | Tables with a timestamp column | `timestamp_col`, `granularities` | Last 30 days only | +| **InferenceLog** | ML model monitoring | `timestamp_col`, `granularities`, 
`model_id_col`, `problem_type`, `prediction_col` | Last 30 days only | + +### Granularities (for TimeSeries and InferenceLog) + +Supported values: `"5 minutes"`, `"30 minutes"`, `"1 hour"`, `"1 day"`, `"<n> week(s)"`, `"1 month"`, `"1 year"` + +--- + +## MCP Tools + +Use the `manage_uc_monitors` tool for all monitor operations: + +| Action | Description | +|--------|-------------| +| `create` | Create a quality monitor on a table | +| `get` | Get monitor details and status | +| `run_refresh` | Trigger a metric refresh | +| `list_refreshes` | List refresh history | +| `delete` | Delete the monitor (assets are not deleted) | + +### Create a Monitor + +> **Note:** The MCP tool currently only creates **snapshot** monitors. For TimeSeries or InferenceLog monitors, use the Python SDK directly (see below). + +```python +manage_uc_monitors( + action="create", + table_name="catalog.schema.my_table", + output_schema_name="catalog.schema", +) +``` + +### Get Monitor Status + +```python +manage_uc_monitors( + action="get", + table_name="catalog.schema.my_table", +) +``` + +### Trigger a Refresh + +```python +manage_uc_monitors( + action="run_refresh", + table_name="catalog.schema.my_table", +) +``` + +### Delete a Monitor + +```python +manage_uc_monitors( + action="delete", + table_name="catalog.schema.my_table", +) +``` + +--- + +## Python SDK Examples + +**Doc:** https://databricks-sdk-py.readthedocs.io/en/stable/workspace/catalog/lakehouse_monitors.html + +The SDK provides full control over all profile types via `w.lakehouse_monitors`. 
+ +### Create Snapshot Monitor + +```python +from databricks.sdk import WorkspaceClient +from databricks.sdk.service.catalog import MonitorSnapshot + +w = WorkspaceClient() + +monitor = w.lakehouse_monitors.create( + table_name="catalog.schema.my_table", + assets_dir="/Workspace/Users/user@example.com/monitoring/my_table", + output_schema_name="catalog.schema", + snapshot=MonitorSnapshot(), +) +print(f"Monitor status: {monitor.status}") +``` + +### Create TimeSeries Monitor + +```python +from databricks.sdk.service.catalog import MonitorTimeSeries + +monitor = w.lakehouse_monitors.create( + table_name="catalog.schema.events", + assets_dir="/Workspace/Users/user@example.com/monitoring/events", + output_schema_name="catalog.schema", + time_series=MonitorTimeSeries( + timestamp_col="event_timestamp", + granularities=["1 day"], + ), +) +``` + +### Create InferenceLog Monitor + +```python +from databricks.sdk.service.catalog import MonitorInferenceLog + +monitor = w.lakehouse_monitors.create( + table_name="catalog.schema.model_predictions", + assets_dir="/Workspace/Users/user@example.com/monitoring/predictions", + output_schema_name="catalog.schema", + inference_log=MonitorInferenceLog( + timestamp_col="prediction_timestamp", + granularities=["1 hour"], + model_id_col="model_version", + problem_type="classification", # or "regression" + prediction_col="prediction", + label_col="label", + ), +) +``` + +### Schedule a Monitor + +```python +from databricks.sdk.service.catalog import MonitorSnapshot, MonitorCronSchedule + +monitor = w.lakehouse_monitors.create( + table_name="catalog.schema.my_table", + assets_dir="/Workspace/Users/user@example.com/monitoring/my_table", + output_schema_name="catalog.schema", + snapshot=MonitorSnapshot(), + schedule=MonitorCronSchedule( + quartz_cron_expression="0 0 12 * * ?", # Daily at noon + timezone_id="UTC", + ), +) +``` + +### Get, Refresh, and Delete + +```python +# Get monitor details +monitor = 
w.lakehouse_monitors.get(table_name="catalog.schema.my_table") + +# Trigger refresh +refresh = w.lakehouse_monitors.run_refresh(table_name="catalog.schema.my_table") + +# List refresh history +for r in w.lakehouse_monitors.list_refreshes(table_name="catalog.schema.my_table"): + print(r) + +# Delete monitor (does not delete output tables or dashboard) +w.lakehouse_monitors.delete(table_name="catalog.schema.my_table") +``` + +--- + +## Output Tables + +When a monitor is created, two metric tables are generated in the specified output schema: + +| Table | Naming Convention | Contents | +|-------|-------------------|----------| +| **Profile Metrics** | `{table_name}_profile_metrics` | Per-column statistics (nulls, min, max, mean, distinct count, etc.) | +| **Drift Metrics** | `{table_name}_drift_metrics` | Statistical tests comparing current vs. baseline or previous window | + +### Query Output Tables + +```sql +-- View latest profile metrics +SELECT * +FROM catalog.schema.my_table_profile_metrics +ORDER BY window_end DESC +LIMIT 100; + +-- View latest drift metrics +SELECT * +FROM catalog.schema.my_table_drift_metrics +ORDER BY window_end DESC +LIMIT 100; +``` + +--- + +## Common Issues + +| Issue | Cause | Solution | +|-------|-------|----------| +| `FEATURE_NOT_ENABLED` | Lakehouse Monitoring not enabled on workspace | Contact workspace admin to enable the feature | +| `PERMISSION_DENIED` | Missing `MANAGE` privilege on the table | Grant `MANAGE` on the table to your user/group | +| Monitor refresh stuck in `PENDING` | No SQL warehouse available | Ensure a SQL warehouse is running or set `warehouse_id` | +| Profile metrics table empty | Refresh has not completed yet | Check refresh state with `list_refreshes`; wait for `SUCCESS` | +| Snapshot monitor on large table fails | Table exceeds 4TB limit | Switch to TimeSeries profile type instead | +| TimeSeries shows limited data | Only processes last 30 days | Expected behavior; contact account team to adjust | + +--- + 
+## Resources + +- [Lakehouse Monitors SDK Reference](https://databricks-sdk-py.readthedocs.io/en/stable/workspace/catalog/lakehouse_monitors.html) diff --git a/databricks-skills/databricks-unity-catalog/SKILL.md b/databricks-skills/databricks-unity-catalog/SKILL.md index 9b77fed9..ae7e24ac 100644 --- a/databricks-skills/databricks-unity-catalog/SKILL.md +++ b/databricks-skills/databricks-unity-catalog/SKILL.md @@ -17,6 +17,7 @@ Use this skill when: - Tracking **compute resources** (cluster usage, warehouse metrics) - Reviewing **job execution** (run history, success rates, failures) - Analyzing **query performance** (slow queries, warehouse utilization) +- Profiling **data quality** (lakehouse monitors, drift detection, metric tables) ## Reference Files @@ -24,6 +25,7 @@ Use this skill when: |-------|------|-------------| | System Tables | [5-system-tables.md](5-system-tables.md) | Lineage, audit, billing, compute, jobs, query history | | Volumes | [6-volumes.md](6-volumes.md) | Volume file operations, permissions, best practices | +| Lakehouse Monitoring | [7-lakehouse-monitoring.md](7-lakehouse-monitoring.md) | Quality monitors, drift detection, profile metrics | ## Quick Start From 4ca586ace892c615eea5a1fcaefea19a9e38b6d0 Mon Sep 17 00:00:00 2001 From: Kevin Nguyen Date: Mon, 2 Mar 2026 10:19:06 -0600 Subject: [PATCH 11/20] feat: updated lakehouse monitoring to use new naming data profiling - SKILL.md: updated trigger bullet and reference table to data profiling - Renamed 7-lakehouse-monitoring.md to 7-data-profiling.md with new w.data_quality SDK examples - Added new Data Quality docs and SDK references, kept legacy Lakehouse Monitoring SDK link for backward compatibility --- .../7-data-profiling.md | 287 ++++++++++++++++++ .../7-lakehouse-monitoring.md | 229 -------------- .../databricks-unity-catalog/SKILL.md | 4 +- 3 files changed, 289 insertions(+), 231 deletions(-) create mode 100644 databricks-skills/databricks-unity-catalog/7-data-profiling.md delete 
mode 100644 databricks-skills/databricks-unity-catalog/7-lakehouse-monitoring.md diff --git a/databricks-skills/databricks-unity-catalog/7-data-profiling.md b/databricks-skills/databricks-unity-catalog/7-data-profiling.md new file mode 100644 index 00000000..36eceff4 --- /dev/null +++ b/databricks-skills/databricks-unity-catalog/7-data-profiling.md @@ -0,0 +1,287 @@ +# Data Profiling (formerly Lakehouse Monitoring) + +Comprehensive reference for Data Profiling: create quality monitors on Unity Catalog tables to track data profiles, detect drift, and monitor ML model performance. + +## Overview + +Data profiling automatically computes statistical profiles and drift metrics for tables over time. When you create a monitor, Databricks generates two output Delta tables (profile metrics + drift metrics) and an optional dashboard. + +| Component | Description | +|-----------|-------------| +| **Monitor** | Configuration attached to a UC table | +| **Profile Metrics Table** | Summary statistics computed per column | +| **Drift Metrics Table** | Statistical drift compared to baseline or previous time window | +| **Dashboard** | Auto-generated visualization of metrics | + +### Requirements + +- Unity Catalog enabled workspace +- Databricks SQL access +- Privileges: `USE CATALOG`, `USE SCHEMA`, `SELECT`, and `MANAGE` on the table +- Only Delta tables supported (managed, external, views, materialized views, streaming tables) + +--- + +## Profile Types + +| Type | Use Case | Key Params | Limitations | +|------|----------|------------|-------------| +| **Snapshot** | General-purpose tables without time column | None required | Max 4TB table size | +| **TimeSeries** | Tables with a timestamp column | `timestamp_column`, `granularities` | Last 30 days only | +| **InferenceLog** | ML model monitoring | `timestamp_column`, `granularities`, `model_id_column`, `problem_type`, `prediction_column` | Last 30 days only | + +### Granularities (for TimeSeries and InferenceLog) + +Supported 
`AggregationGranularity` values: `AGGREGATION_GRANULARITY_5_MINUTES`, `AGGREGATION_GRANULARITY_30_MINUTES`, `AGGREGATION_GRANULARITY_1_HOUR`, `AGGREGATION_GRANULARITY_1_DAY`, `AGGREGATION_GRANULARITY_1_WEEK` through `AGGREGATION_GRANULARITY_4_WEEKS`, `AGGREGATION_GRANULARITY_1_MONTH`, `AGGREGATION_GRANULARITY_1_YEAR`
+
+---
+
+## MCP Tools
+
+Use the `manage_uc_monitors` tool for all monitor operations:
+
+| Action | Description |
+|--------|-------------|
+| `create` | Create a quality monitor on a table |
+| `get` | Get monitor details and status |
+| `run_refresh` | Trigger a metric refresh |
+| `list_refreshes` | List refresh history |
+| `delete` | Delete the monitor (assets are not deleted) |
+
+### Create a Monitor
+
+> **Note:** The MCP tool currently only creates **snapshot** monitors. For TimeSeries or InferenceLog monitors, use the Python SDK directly (see below).
+
+```python
+manage_uc_monitors(
+    action="create",
+    table_name="catalog.schema.my_table",
+    output_schema_name="catalog.schema",
+)
+```
+
+### Get Monitor Status
+
+```python
+manage_uc_monitors(
+    action="get",
+    table_name="catalog.schema.my_table",
+)
+```
+
+### Trigger a Refresh
+
+```python
+manage_uc_monitors(
+    action="run_refresh",
+    table_name="catalog.schema.my_table",
+)
+```
+
+### Delete a Monitor
+
+```python
+manage_uc_monitors(
+    action="delete",
+    table_name="catalog.schema.my_table",
+)
+```
+
+---
+
+## Python SDK Examples
+
+**Doc:** https://databricks-sdk-py.readthedocs.io/en/stable/workspace/dataquality/data_quality.html
+
+The new SDK provides full control over all profile types via `w.data_quality`. 
+ +### Create Snapshot Monitor + +```python +from databricks.sdk import WorkspaceClient +from databricks.sdk.service.dataquality import ( + Monitor, DataProfilingConfig, SnapshotConfig, +) + +w = WorkspaceClient() + +# Look up UUIDs — the new API uses object_id and output_schema_id (both UUIDs) +table_info = w.tables.get("catalog.schema.my_table") +schema_info = w.schemas.get(f"{table_info.catalog_name}.{table_info.schema_name}") + +monitor = w.data_quality.create_monitor( + monitor=Monitor( + object_type="table", + object_id=table_info.table_id, + data_profiling_config=DataProfilingConfig( + assets_dir="/Workspace/Users/user@example.com/monitoring/my_table", + output_schema_id=schema_info.schema_id, + snapshot=SnapshotConfig(), + ), + ), +) +print(f"Monitor status: {monitor.data_profiling_config.status}") +``` + +### Create TimeSeries Monitor + +```python +from databricks.sdk.service.dataquality import ( + Monitor, DataProfilingConfig, TimeSeriesConfig, AggregationGranularity, +) + +table_info = w.tables.get("catalog.schema.events") +schema_info = w.schemas.get(f"{table_info.catalog_name}.{table_info.schema_name}") + +monitor = w.data_quality.create_monitor( + monitor=Monitor( + object_type="table", + object_id=table_info.table_id, + data_profiling_config=DataProfilingConfig( + assets_dir="/Workspace/Users/user@example.com/monitoring/events", + output_schema_id=schema_info.schema_id, + time_series=TimeSeriesConfig( + timestamp_column="event_timestamp", + granularities=[AggregationGranularity.AGGREGATION_GRANULARITY_1_DAY], + ), + ), + ), +) +``` + +### Create InferenceLog Monitor + +```python +from databricks.sdk.service.dataquality import ( + Monitor, DataProfilingConfig, InferenceLogConfig, + AggregationGranularity, InferenceProblemType, +) + +table_info = w.tables.get("catalog.schema.model_predictions") +schema_info = w.schemas.get(f"{table_info.catalog_name}.{table_info.schema_name}") + +monitor = w.data_quality.create_monitor( + monitor=Monitor( + 
object_type="table", + object_id=table_info.table_id, + data_profiling_config=DataProfilingConfig( + assets_dir="/Workspace/Users/user@example.com/monitoring/predictions", + output_schema_id=schema_info.schema_id, + inference_log=InferenceLogConfig( + timestamp_column="prediction_timestamp", + granularities=[AggregationGranularity.AGGREGATION_GRANULARITY_1_HOUR], + model_id_column="model_version", + problem_type=InferenceProblemType.INFERENCE_PROBLEM_TYPE_CLASSIFICATION, + prediction_column="prediction", + label_column="label", + ), + ), + ), +) +``` + +### Schedule a Monitor + +```python +from databricks.sdk.service.dataquality import ( + Monitor, DataProfilingConfig, SnapshotConfig, CronSchedule, +) + +table_info = w.tables.get("catalog.schema.my_table") +schema_info = w.schemas.get(f"{table_info.catalog_name}.{table_info.schema_name}") + +monitor = w.data_quality.create_monitor( + monitor=Monitor( + object_type="table", + object_id=table_info.table_id, + data_profiling_config=DataProfilingConfig( + assets_dir="/Workspace/Users/user@example.com/monitoring/my_table", + output_schema_id=schema_info.schema_id, + snapshot=SnapshotConfig(), + schedule=CronSchedule( + quartz_cron_expression="0 0 12 * * ?", # Daily at noon + timezone_id="UTC", + ), + ), + ), +) +``` + +### Get, Refresh, and Delete + +```python +# Get monitor details +monitor = w.data_quality.get_monitor( + object_type="table", + object_id=table_info.table_id, +) + +# Trigger refresh +from databricks.sdk.service.dataquality import Refresh + +refresh = w.data_quality.create_refresh( + object_type="table", + object_id=table_info.table_id, + refresh=Refresh( + object_type="table", + object_id=table_info.table_id, + ), +) + +# Delete monitor (does not delete output tables or dashboard) +w.data_quality.delete_monitor( + object_type="table", + object_id=table_info.table_id, +) +``` + +--- + +## Output Tables + +When a monitor is created, two metric tables are generated in the specified output schema: + +| 
Table | Naming Convention | Contents | +|-------|-------------------|----------| +| **Profile Metrics** | `{table_name}_profile_metrics` | Per-column statistics (nulls, min, max, mean, distinct count, etc.) | +| **Drift Metrics** | `{table_name}_drift_metrics` | Statistical tests comparing current vs. baseline or previous window | + +### Query Output Tables + +```sql +-- View latest profile metrics +SELECT * +FROM catalog.schema.my_table_profile_metrics +ORDER BY window_end DESC +LIMIT 100; + +-- View latest drift metrics +SELECT * +FROM catalog.schema.my_table_drift_metrics +ORDER BY window_end DESC +LIMIT 100; +``` + +--- + +## Common Issues + +| Issue | Cause | Solution | +|-------|-------|----------| +| `FEATURE_NOT_ENABLED` | Data profiling not enabled on workspace | Contact workspace admin to enable the feature | +| `PERMISSION_DENIED` | Missing `MANAGE` privilege on the table | Grant `MANAGE` on the table to your user/group | +| Monitor refresh stuck in `PENDING` | No SQL warehouse available | Ensure a SQL warehouse is running or set `warehouse_id` | +| Profile metrics table empty | Refresh has not completed yet | Check refresh state with `list_refreshes`; wait for `SUCCESS` | +| Snapshot monitor on large table fails | Table exceeds 4TB limit | Switch to TimeSeries profile type instead | +| TimeSeries shows limited data | Only processes last 30 days | Expected behavior; contact account team to adjust | + +--- + +> **Note:** Data profiling was formerly known as Lakehouse Monitoring. The legacy SDK accessor +> `w.lakehouse_monitors` and the MCP tool `manage_uc_monitors` still use the previous API. 
+ +## Resources + +- [Data Quality Monitoring Documentation](https://docs.databricks.com/aws/en/data-quality-monitoring/) +- [Data Quality SDK Reference](https://databricks-sdk-py.readthedocs.io/en/stable/workspace/dataquality/data_quality.html) +- [Legacy Lakehouse Monitors SDK Reference](https://databricks-sdk-py.readthedocs.io/en/stable/workspace/catalog/lakehouse_monitors.html) diff --git a/databricks-skills/databricks-unity-catalog/7-lakehouse-monitoring.md b/databricks-skills/databricks-unity-catalog/7-lakehouse-monitoring.md deleted file mode 100644 index 01298d08..00000000 --- a/databricks-skills/databricks-unity-catalog/7-lakehouse-monitoring.md +++ /dev/null @@ -1,229 +0,0 @@ -# Lakehouse Monitoring - -Comprehensive reference for Lakehouse Monitoring: create quality monitors on Unity Catalog tables to track data profiles, detect drift, and monitor ML model performance. - -## Overview - -Lakehouse Monitoring automatically computes statistical profiles and drift metrics for tables over time. When you create a monitor, Databricks generates two output Delta tables (profile metrics + drift metrics) and an optional dashboard. 
- -| Component | Description | -|-----------|-------------| -| **Monitor** | Configuration attached to a UC table | -| **Profile Metrics Table** | Summary statistics computed per column | -| **Drift Metrics Table** | Statistical drift compared to baseline or previous time window | -| **Dashboard** | Auto-generated visualization of metrics | - -### Requirements - -- Unity Catalog enabled workspace -- Databricks SQL access -- Privileges: `USE CATALOG`, `USE SCHEMA`, `SELECT`, and `MANAGE` on the table -- Only Delta tables supported (managed, external, views, materialized views, streaming tables) - ---- - -## Profile Types - -| Type | Use Case | Key Params | Limitations | -|------|----------|------------|-------------| -| **Snapshot** | General-purpose tables without time column | None required | Max 4TB table size | -| **TimeSeries** | Tables with a timestamp column | `timestamp_col`, `granularities` | Last 30 days only | -| **InferenceLog** | ML model monitoring | `timestamp_col`, `granularities`, `model_id_col`, `problem_type`, `prediction_col` | Last 30 days only | - -### Granularities (for TimeSeries and InferenceLog) - -Supported values: `"5 minutes"`, `"30 minutes"`, `"1 hour"`, `"1 day"`, `" week(s)"`, `"1 month"`, `"1 year"` - ---- - -## MCP Tools - -Use the `manage_uc_monitors` tool for all monitor operations: - -| Action | Description | -|--------|-------------| -| `create` | Create a quality monitor on a table | -| `get` | Get monitor details and status | -| `run_refresh` | Trigger a metric refresh | -| `list_refreshes` | List refresh history | -| `delete` | Delete the monitor (assets are not deleted) | - -### Create a Monitor - -> **Note:** The MCP tool currently only creates **snapshot** monitors. For TimeSeries or InferenceLog monitors, use the Python SDK directly (see below). 
- -```python -manage_uc_monitors( - action="create", - table_name="catalog.schema.my_table", - output_schema_name="catalog.schema", -) -``` - -### Get Monitor Status - -```python -manage_uc_monitors( - action="get", - table_name="catalog.schema.my_table", -) -``` - -### Trigger a Refresh - -```python -manage_uc_monitors( - action="run_refresh", - table_name="catalog.schema.my_table", -) -``` - -### Delete a Monitor - -```python -manage_uc_monitors( - action="delete", - table_name="catalog.schema.my_table", -) -``` - ---- - -## Python SDK Examples - -**Doc:** https://databricks-sdk-py.readthedocs.io/en/stable/workspace/catalog/lakehouse_monitors.html - -The SDK provides full control over all profile types via `w.lakehouse_monitors`. - -### Create Snapshot Monitor - -```python -from databricks.sdk import WorkspaceClient -from databricks.sdk.service.catalog import MonitorSnapshot - -w = WorkspaceClient() - -monitor = w.lakehouse_monitors.create( - table_name="catalog.schema.my_table", - assets_dir="/Workspace/Users/user@example.com/monitoring/my_table", - output_schema_name="catalog.schema", - snapshot=MonitorSnapshot(), -) -print(f"Monitor status: {monitor.status}") -``` - -### Create TimeSeries Monitor - -```python -from databricks.sdk.service.catalog import MonitorTimeSeries - -monitor = w.lakehouse_monitors.create( - table_name="catalog.schema.events", - assets_dir="/Workspace/Users/user@example.com/monitoring/events", - output_schema_name="catalog.schema", - time_series=MonitorTimeSeries( - timestamp_col="event_timestamp", - granularities=["1 day"], - ), -) -``` - -### Create InferenceLog Monitor - -```python -from databricks.sdk.service.catalog import MonitorInferenceLog - -monitor = w.lakehouse_monitors.create( - table_name="catalog.schema.model_predictions", - assets_dir="/Workspace/Users/user@example.com/monitoring/predictions", - output_schema_name="catalog.schema", - inference_log=MonitorInferenceLog( - timestamp_col="prediction_timestamp", - 
granularities=["1 hour"], - model_id_col="model_version", - problem_type="classification", # or "regression" - prediction_col="prediction", - label_col="label", - ), -) -``` - -### Schedule a Monitor - -```python -from databricks.sdk.service.catalog import MonitorSnapshot, MonitorCronSchedule - -monitor = w.lakehouse_monitors.create( - table_name="catalog.schema.my_table", - assets_dir="/Workspace/Users/user@example.com/monitoring/my_table", - output_schema_name="catalog.schema", - snapshot=MonitorSnapshot(), - schedule=MonitorCronSchedule( - quartz_cron_expression="0 0 12 * * ?", # Daily at noon - timezone_id="UTC", - ), -) -``` - -### Get, Refresh, and Delete - -```python -# Get monitor details -monitor = w.lakehouse_monitors.get(table_name="catalog.schema.my_table") - -# Trigger refresh -refresh = w.lakehouse_monitors.run_refresh(table_name="catalog.schema.my_table") - -# List refresh history -for r in w.lakehouse_monitors.list_refreshes(table_name="catalog.schema.my_table"): - print(r) - -# Delete monitor (does not delete output tables or dashboard) -w.lakehouse_monitors.delete(table_name="catalog.schema.my_table") -``` - ---- - -## Output Tables - -When a monitor is created, two metric tables are generated in the specified output schema: - -| Table | Naming Convention | Contents | -|-------|-------------------|----------| -| **Profile Metrics** | `{table_name}_profile_metrics` | Per-column statistics (nulls, min, max, mean, distinct count, etc.) | -| **Drift Metrics** | `{table_name}_drift_metrics` | Statistical tests comparing current vs. 
baseline or previous window | - -### Query Output Tables - -```sql --- View latest profile metrics -SELECT * -FROM catalog.schema.my_table_profile_metrics -ORDER BY window_end DESC -LIMIT 100; - --- View latest drift metrics -SELECT * -FROM catalog.schema.my_table_drift_metrics -ORDER BY window_end DESC -LIMIT 100; -``` - ---- - -## Common Issues - -| Issue | Cause | Solution | -|-------|-------|----------| -| `FEATURE_NOT_ENABLED` | Lakehouse Monitoring not enabled on workspace | Contact workspace admin to enable the feature | -| `PERMISSION_DENIED` | Missing `MANAGE` privilege on the table | Grant `MANAGE` on the table to your user/group | -| Monitor refresh stuck in `PENDING` | No SQL warehouse available | Ensure a SQL warehouse is running or set `warehouse_id` | -| Profile metrics table empty | Refresh has not completed yet | Check refresh state with `list_refreshes`; wait for `SUCCESS` | -| Snapshot monitor on large table fails | Table exceeds 4TB limit | Switch to TimeSeries profile type instead | -| TimeSeries shows limited data | Only processes last 30 days | Expected behavior; contact account team to adjust | - ---- - -## Resources - -- [Lakehouse Monitors SDK Reference](https://databricks-sdk-py.readthedocs.io/en/stable/workspace/catalog/lakehouse_monitors.html) diff --git a/databricks-skills/databricks-unity-catalog/SKILL.md b/databricks-skills/databricks-unity-catalog/SKILL.md index ae7e24ac..553eba97 100644 --- a/databricks-skills/databricks-unity-catalog/SKILL.md +++ b/databricks-skills/databricks-unity-catalog/SKILL.md @@ -17,7 +17,7 @@ Use this skill when: - Tracking **compute resources** (cluster usage, warehouse metrics) - Reviewing **job execution** (run history, success rates, failures) - Analyzing **query performance** (slow queries, warehouse utilization) -- Profiling **data quality** (lakehouse monitors, drift detection, metric tables) +- Profiling **data quality** (data profiling, drift detection, metric tables) ## Reference Files @@ -25,7 
+25,7 @@ Use this skill when: |-------|------|-------------| | System Tables | [5-system-tables.md](5-system-tables.md) | Lineage, audit, billing, compute, jobs, query history | | Volumes | [6-volumes.md](6-volumes.md) | Volume file operations, permissions, best practices | -| Lakehouse Monitoring | [7-lakehouse-monitoring.md](7-lakehouse-monitoring.md) | Quality monitors, drift detection, profile metrics | +| Data Profiling | [7-data-profiling.md](7-data-profiling.md) | Data profiling, drift detection, profile metrics | ## Quick Start From 0953f6ffb1a1f4c3f57ebcdde78522cfbbda9dad Mon Sep 17 00:00:00 2001 From: dbderek Date: Sat, 28 Feb 2026 18:19:38 -0700 Subject: [PATCH 12/20] updated client files with latest sdk syntax recommendations --- .../2-python-client.md | 81 +++++++++++++------ .../3-multilanguage-clients.md | 23 +++--- .../5-operations-and-limits.md | 34 ++++---- .../databricks-zerobus-ingest/SKILL.md | 6 +- 4 files changed, 93 insertions(+), 51 deletions(-) diff --git a/databricks-skills/databricks-zerobus-ingest/2-python-client.md b/databricks-skills/databricks-zerobus-ingest/2-python-client.md index ac95cd4c..5cb9cdee 100644 --- a/databricks-skills/databricks-zerobus-ingest/2-python-client.md +++ b/databricks-skills/databricks-zerobus-ingest/2-python-client.md @@ -11,12 +11,14 @@ Python SDK patterns for Zerobus Ingest: synchronous and asynchronous APIs, JSON from zerobus.sdk.sync import ZerobusSdk # Asynchronous API (equivalent capabilities) -from zerobus.sdk.asyncio import ZerobusSdk as AsyncZerobusSdk +from zerobus.sdk.aio import ZerobusSdk as AsyncZerobusSdk # Shared types (used by both sync and async) from zerobus.sdk.shared import ( RecordType, - IngestRecordResponse, + AckCallback, + ZerobusException, + NonRetriableException, StreamConfigurationOptions, TableProperties, ) @@ -49,8 +51,8 @@ stream = sdk.create_stream(client_id, client_secret, table_props, options) try: for i in range(100): record = {"device_name": f"sensor-{i}", "temp": 22, 
"humidity": 55} - ack = stream.ingest_record(record) - ack.wait_for_ack() # Block until durably written + offset = stream.ingest_record_offset(record) + stream.wait_for_offset(offset) # Block until durably written finally: stream.close() ``` --> @@ -90,8 +92,8 @@ try: temp=22, humidity=55, ) - ack = stream.ingest_record(record) - ack.wait_for_ack() + offset = stream.ingest_record_offset(record) + stream.wait_for_offset(offset) finally: stream.close() ``` @@ -100,17 +102,21 @@ finally: ## ACK Callback (Asynchronous Acknowledgment) -Instead of blocking on each ACK, register a callback for background durability confirmation: +Instead of blocking on each ACK, register an `AckCallback` subclass for background durability confirmation: ```python -from zerobus.sdk.shared import IngestRecordResponse, StreamConfigurationOptions, RecordType +from zerobus.sdk.shared import AckCallback, StreamConfigurationOptions, RecordType -def on_ack(response: IngestRecordResponse) -> None: - print(f"Durable up to offset: {response.durability_ack_up_to_offset}") +class MyAckHandler(AckCallback): + def on_ack(self, offset: int) -> None: + print(f"Durable up to offset: {offset}") + + def on_error(self, offset: int, message: str) -> None: + print(f"Error at offset {offset}: {message}") options = StreamConfigurationOptions( record_type=RecordType.JSON, - ack_callback=on_ack, + ack_callback=MyAckHandler(), ) # Create stream with callback @@ -119,7 +125,7 @@ stream = sdk.create_stream(client_id, client_secret, table_props, options) try: for i in range(1000): record = {"device_name": f"sensor-{i}", "temp": 22, "humidity": 55} - stream.ingest_record(record) # Non-blocking, ACKs arrive via callback + stream.ingest_record_nowait(record) # Fire-and-forget, ACKs arrive via callback stream.flush() # Ensure all buffered records are sent finally: stream.close() @@ -135,12 +141,12 @@ A production-ready wrapper with retry logic, reconnection, and both JSON and Pro import os import time import logging -from 
typing import Optional, Callable +from typing import Optional from zerobus.sdk.sync import ZerobusSdk from zerobus.sdk.shared import ( RecordType, - IngestRecordResponse, + AckCallback, StreamConfigurationOptions, TableProperties, ) @@ -159,7 +165,7 @@ class ZerobusClient: client_id: str, client_secret: str, record_type: RecordType = RecordType.JSON, - ack_callback: Optional[Callable[[IngestRecordResponse], None]] = None, + ack_callback: Optional[AckCallback] = None, proto_descriptor=None, ): self.server_endpoint = server_endpoint @@ -199,8 +205,8 @@ class ZerobusClient: try: if self.stream is None: self.init_stream() - ack = self.stream.ingest_record(payload) - ack.wait_for_ack() + offset = self.stream.ingest_record_offset(payload) + self.stream.wait_for_offset(offset) return True except Exception as e: err = str(e).lower() @@ -275,7 +281,7 @@ The SDK provides an equivalent async API for use with `asyncio`: ```python import asyncio -from zerobus.sdk.asyncio import ZerobusSdk as AsyncZerobusSdk +from zerobus.sdk.aio import ZerobusSdk as AsyncZerobusSdk from zerobus.sdk.shared import RecordType, StreamConfigurationOptions, TableProperties @@ -289,8 +295,8 @@ async def ingest_async(): try: for i in range(100): record = {"device_name": f"sensor-{i}", "temp": 22, "humidity": 55} - ack = await stream.ingest_record(record) - await ack.wait_for_ack() + offset = await stream.ingest_record_offset(record) + await stream.wait_for_offset(offset) finally: await stream.close() @@ -304,7 +310,7 @@ asyncio.run(ingest_async()) ## Batch Pattern -For higher throughput, send records without blocking on each ACK and flush at the end: +For higher throughput, use `ingest_record_nowait` (fire-and-forget) or batch methods, and flush at the end: ```python with ZerobusClient( @@ -314,10 +320,39 @@ with ZerobusClient( client_id=os.environ["DATABRICKS_CLIENT_ID"], client_secret=os.environ["DATABRICKS_CLIENT_SECRET"], record_type=RecordType.JSON, - ack_callback=lambda resp: None, # Discard 
individual ACKs ) as client: for i in range(10_000): record = {"device_name": f"sensor-{i}", "temp": 22, "humidity": 55} - client.stream.ingest_record(record) # Non-blocking + client.stream.ingest_record_nowait(record) # Fire-and-forget # flush() and close() called automatically by context manager ``` + +For true batch ingestion, use the batch variants: + +```python +records = [ + {"device_name": f"sensor-{i}", "temp": 22, "humidity": 55} + for i in range(10_000) +] +# Fire-and-forget batch +stream.ingest_records_nowait(records) +stream.flush() + +# Or with offset tracking +offset = stream.ingest_records_offset(records) +stream.wait_for_offset(offset) +``` + +--- + +## Ingestion Method Comparison + +| Method | Returns | Blocks? | Best For | +|--------|---------|---------|----------| +| `ingest_record_offset(record)` | offset | No (enqueues) | Single record with durability tracking | +| `ingest_record_nowait(record)` | None | No | Max single-record throughput | +| `ingest_records_offset(records)` | last offset | No (enqueues) | Batch with durability tracking | +| `ingest_records_nowait(records)` | None | No | Max batch throughput | +| `wait_for_offset(offset)` | None | Yes (until ACK) | Durability confirmation | +| `flush()` | None | Yes (until sent) | Ensure all buffered records are sent | +| `ingest_record(record)` | RecordAcknowledgment | No | **Deprecated** — use `ingest_record_offset` | diff --git a/databricks-skills/databricks-zerobus-ingest/3-multilanguage-clients.md b/databricks-skills/databricks-zerobus-ingest/3-multilanguage-clients.md index 217398c8..4eba1015 100644 --- a/databricks-skills/databricks-zerobus-ingest/3-multilanguage-clients.md +++ b/databricks-skills/databricks-zerobus-ingest/3-multilanguage-clients.md @@ -51,7 +51,8 @@ public class ZerobusProducer { .setTemp(22) .setHumidity(55) .build(); - stream.ingestRecord(record).join(); + long offset = stream.ingestRecordOffset(record); + stream.waitForOffset(offset); } } finally { stream.close(); @@ 
-126,12 +127,12 @@ func main() { record := fmt.Sprintf( `{"device_name": "sensor-%d", "temp": 22, "humidity": 55}`, i, ) - ack, err := stream.IngestRecord(record) + offset, err := stream.IngestRecordOffset(record) if err != nil { log.Printf("Ingest failed for record %d: %v", i, err) continue } - ack.Await() + stream.WaitForOffset(offset) } stream.Flush() @@ -187,7 +188,8 @@ const stream = await sdk.createStream( try { for (let i = 0; i < 100; i++) { const record = { device_name: `sensor-${i}`, temp: 22, humidity: 55 }; - await stream.ingestRecord(record); + const offset = await stream.ingestRecordOffset(record); + await stream.waitForOffset(offset); } await stream.flush(); } finally { @@ -207,7 +209,8 @@ async function ingestWithRetry( ): Promise { for (let attempt = 0; attempt < maxRetries; attempt++) { try { - await stream.ingestRecord(record); + const offset = await stream.ingestRecordOffset(record); + await stream.waitForOffset(offset); return true; } catch (error) { console.warn(`Attempt ${attempt + 1}/${maxRetries} failed:`, error); @@ -268,8 +271,8 @@ async fn main() -> Result<(), Box> { r#"{{"device_name": "sensor-{}", "temp": 22, "humidity": 55}}"#, i ); - let ack = stream.ingest_record(record.into_bytes()).await?; - ack.await?; + let offset = stream.ingest_record_offset(record.into_bytes()).await?; + stream.wait_for_offset(offset).await?; } stream.close().await?; @@ -296,8 +299,8 @@ let mut stream = sdk // Ingest serialized protobuf bytes let record_bytes = my_proto_message.encode_to_vec(); -let ack = stream.ingest_record(record_bytes).await?; -ack.await?; +let offset = stream.ingest_record_offset(record_bytes).await?; +stream.wait_for_offset(offset).await?; ``` --- @@ -310,5 +313,5 @@ ack.await?; | Package | `databricks-zerobus-ingest-sdk` | `com.databricks:zerobus-ingest-sdk` | `github.com/databricks/zerobus-sdk-go` | `@databricks/zerobus-ingest-sdk` | `databricks-zerobus-ingest-sdk` | | Default serialization | JSON | Protobuf | JSON | JSON | JSON | | 
Async API | Yes (separate module) | CompletableFuture | Goroutines | Native async/await | Tokio async/await | -| ACK pattern | `ack.wait_for_ack()` or callback | `.join()` | `ack.Await()` | Implicit in `await` | `ack.await?` | +| ACK pattern | `wait_for_offset(offset)` or `AckCallback` | `waitForOffset(offset)` | `WaitForOffset(offset)` | `await waitForOffset(offset)` | `wait_for_offset(offset).await?` | | Proto generation | `python -m zerobus.tools.generate_proto` | JAR CLI tool | External `protoc` | External `protoc` | External `protoc` | diff --git a/databricks-skills/databricks-zerobus-ingest/5-operations-and-limits.md b/databricks-skills/databricks-zerobus-ingest/5-operations-and-limits.md index 7b8cb2b8..004774d7 100644 --- a/databricks-skills/databricks-zerobus-ingest/5-operations-and-limits.md +++ b/databricks-skills/databricks-zerobus-ingest/5-operations-and-limits.md @@ -12,40 +12,44 @@ Every ingested record returns a durability acknowledgment. An ACK indicates that | Strategy | When to Use | Trade-off | |----------|-------------|-----------| -| **Sync block per record** | Low-volume, strict ordering | Simplest; lower throughput | -| **ACK callback** | High-volume producers | Higher throughput; more complex | -| **Periodic flush** | Batch-oriented workloads | Best throughput; eventual consistency | +| **`ingest_record_offset` + `wait_for_offset`** | Low-volume, strict ordering | Simplest; lower throughput | +| **`ingest_record_nowait` + `AckCallback`** | High-volume producers | Higher throughput; more complex | +| **`ingest_record_nowait` + periodic `flush`** | Batch-oriented workloads | Best throughput; eventual consistency | ### Sync Block (Python) ```python -ack = stream.ingest_record(record) -ack.wait_for_ack() # Blocks until durable +offset = stream.ingest_record_offset(record) +stream.wait_for_offset(offset) # Blocks until durable ``` ### ACK Callback (Python) ```python -from zerobus.sdk.shared import IngestRecordResponse +from zerobus.sdk.shared 
import AckCallback -last_acked_offset = 0 +class MyAckHandler(AckCallback): + def __init__(self): + self.last_acked_offset = 0 -def on_ack(response: IngestRecordResponse) -> None: - global last_acked_offset - last_acked_offset = response.durability_ack_up_to_offset + def on_ack(self, offset: int) -> None: + self.last_acked_offset = offset + + def on_error(self, offset: int, message: str) -> None: + print(f"Error at offset {offset}: {message}") options = StreamConfigurationOptions( record_type=RecordType.JSON, - ack_callback=on_ack, + ack_callback=MyAckHandler(), ) ``` ### Flush-Based ```python -# Send many records without blocking +# Send many records without blocking (fire-and-forget) for record in batch: - stream.ingest_record(record) + stream.ingest_record_nowait(record) # Flush ensures all buffered records are sent stream.flush() @@ -89,8 +93,8 @@ def ingest_with_retry(stream_factory, record, max_retries=5): for attempt in range(max_retries): try: - ack = stream.ingest_record(record) - ack.wait_for_ack() + offset = stream.ingest_record_offset(record) + stream.wait_for_offset(offset) return stream # Return the (possibly new) stream except Exception as e: err = str(e).lower() diff --git a/databricks-skills/databricks-zerobus-ingest/SKILL.md b/databricks-skills/databricks-zerobus-ingest/SKILL.md index d29dc00f..efd52b0d 100644 --- a/databricks-skills/databricks-zerobus-ingest/SKILL.md +++ b/databricks-skills/databricks-zerobus-ingest/SKILL.md @@ -95,8 +95,8 @@ table_props = TableProperties(table_name) stream = sdk.create_stream(client_id, client_secret, table_props, options) try: record = {"device_name": "sensor-1", "temp": 22, "humidity": 55} - ack = stream.ingest_record(record) - ack.wait_for_ack() + offset = stream.ingest_record_offset(record) + stream.wait_for_offset(offset) finally: stream.close() ``` @@ -193,7 +193,7 @@ The timestamp generation must use microseconds for Databricks. - **gRPC + Protobuf**: Zerobus uses gRPC as its transport protocol. 
Any application that can communicate via gRPC and construct Protobuf messages can produce to Zerobus. - **JSON or Protobuf serialization**: JSON for quick starts; Protobuf for type safety, forward compatibility, and performance. - **At-least-once delivery**: The connector provides at-least-once guarantees. Design consumers to handle duplicates. -- **Durability ACKs**: Each ingested record returns an ACK confirming durable write. ACKs indicate all records up to that offset have been durably written. +- **Durability ACKs**: Each ingested record returns an offset. Use `wait_for_offset(offset)` to confirm durable write. ACKs indicate all records up to that offset have been durably written. - **No table management**: Zerobus does not create or alter tables. You must pre-create your target table and manage schema evolution yourself. - **Single-AZ durability**: The service runs in a single availability zone. Plan for potential zone outages. From 3929bc6b012261e99fe7691c40519168cbeec350 Mon Sep 17 00:00:00 2001 From: Natyra Date: Mon, 2 Mar 2026 22:23:32 +0100 Subject: [PATCH 13/20] install databricks parsing skill --- databricks-skills/install_skills.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/databricks-skills/install_skills.sh b/databricks-skills/install_skills.sh index 30339ade..ff8d9b86 100755 --- a/databricks-skills/install_skills.sh +++ b/databricks-skills/install_skills.sh @@ -42,7 +42,7 @@ MLFLOW_REPO_RAW_URL="https://raw.githubusercontent.com/mlflow/skills" MLFLOW_REPO_REF="main" # Databricks skills (hosted in this repo) -DATABRICKS_SKILLS="databricks-agent-bricks databricks-aibi-dashboards databricks-asset-bundles databricks-app-apx databricks-app-python databricks-config databricks-dbsql databricks-docs databricks-genie databricks-iceberg databricks-jobs databricks-lakebase-autoscale databricks-lakebase-provisioned databricks-metric-views databricks-mlflow-evaluation databricks-model-serving databricks-python-sdk 
databricks-spark-declarative-pipelines databricks-spark-structured-streaming databricks-synthetic-data-generation databricks-unity-catalog databricks-unstructured-pdf-generation databricks-vector-search databricks-zerobus-ingest spark-python-data-source" +DATABRICKS_SKILLS="databricks-agent-bricks databricks-aibi-dashboards databricks-asset-bundles databricks-app-apx databricks-app-python databricks-config databricks-dbsql databricks-docs databricks-genie databricks-iceberg databricks-jobs databricks-lakebase-autoscale databricks-lakebase-provisioned databricks-metric-views databricks-mlflow-evaluation databricks-model-serving databricks-parsing databricks-python-sdk databricks-spark-declarative-pipelines databricks-spark-structured-streaming databricks-synthetic-data-generation databricks-unity-catalog databricks-unstructured-pdf-generation databricks-vector-search databricks-zerobus-ingest spark-python-data-source" # MLflow skills (fetched from mlflow/skills repo) MLFLOW_SKILLS="agent-evaluation analyze-mlflow-chat-session analyze-mlflow-trace instrumenting-with-mlflow-tracing mlflow-onboarding querying-mlflow-metrics retrieving-mlflow-traces searching-mlflow-docs" @@ -71,6 +71,7 @@ get_skill_description() { "databricks-lakebase-provisioned") echo "Lakebase Provisioned - data connections and reverse ETL" ;; "databricks-metric-views") echo "Unity Catalog Metric Views - governed business metrics in YAML" ;; "databricks-model-serving") echo "Model Serving - deploy MLflow models and AI agents" ;; + "databricks-parsing") echo "Document parsing with ai_parse_document and custom RAG pipelines" ;; "databricks-mlflow-evaluation") echo "MLflow evaluation and trace analysis" ;; "databricks-spark-declarative-pipelines") echo "Spark Declarative Pipelines (SDP/LDP/DLT)" ;; "spark-python-data-source") echo "Spark custom Python data sources" ;; From 93292d141c20ffb924a5e37edf19dbda5d32154a Mon Sep 17 00:00:00 2001 From: CAholder Date: Mon, 2 Mar 2026 21:05:57 -0800 Subject: 
[PATCH 14/20] refactoring to use mlflow --- .test/README.md | 1120 +++++------------ .../gepa_skill_optimization_demo.ipynb | 521 +++++--- .test/pyproject.toml | 2 + .test/scripts/generate_examples.py | 2 +- .test/scripts/optimize.py | 61 +- .test/src/skill_test/optimize/__init__.py | 12 +- .test/src/skill_test/optimize/alignment.py | 99 ++ .test/src/skill_test/optimize/asi.py | 200 +-- .test/src/skill_test/optimize/evaluator.py | 581 --------- .test/src/skill_test/optimize/judges.py | 289 +++++ .test/src/skill_test/optimize/review.py | 131 +- .test/src/skill_test/optimize/runner.py | 193 +-- .../optimize/skillbench_evaluator.py | 557 +++----- .test/src/skill_test/optimize/splitter.py | 4 +- .test/src/skill_test/optimize/tools.py | 4 +- .test/src/skill_test/optimize/utils.py | 67 + .test/src/skill_test/scorers/dynamic.py | 6 +- .test/tests/test_optimize_e2e.py | 9 +- databricks-builder-app/alembic/env.py | 11 + databricks-builder-app/app.yaml | 2 + databricks-builder-app/scripts/deploy.sh | 5 + databricks-builder-app/server/db/database.py | 5 + .../server/services/clusters.py | 23 +- 23 files changed, 1432 insertions(+), 2472 deletions(-) create mode 100644 .test/src/skill_test/optimize/alignment.py delete mode 100644 .test/src/skill_test/optimize/evaluator.py create mode 100644 .test/src/skill_test/optimize/judges.py create mode 100644 .test/src/skill_test/optimize/utils.py diff --git a/.test/README.md b/.test/README.md index 82b0de43..92371239 100644 --- a/.test/README.md +++ b/.test/README.md @@ -1,958 +1,398 @@ -# Skill Testing & Optimization Framework +# Skill Evaluation & Optimization -Evaluate and optimize Databricks SKILL.md files using automated scorers and [GEPA](https://github.com/gepa-ai/gepa)-powered optimization. +Automatically evaluate and optimize SKILL.md files using [GEPA](https://github.com/gepa-ai/gepa) `optimize_anything` and MLflow judges. 
-## Quick Start: Optimize a Skill +## How It Works -One command evaluates a skill's current quality, runs GEPA optimization, and shows the results: +SKILL.md files teach AI agents (like Claude Code) how to use Databricks features. Every token in a skill consumes the agent's context window, so skills must be **correct** (teach the right patterns) and **concise** (waste no tokens). This framework measures both and uses GEPA to improve them. -```bash -uv run python .test/scripts/optimize.py databricks-model-serving --preset quick --apply +### The Core Loop + +``` + ┌──────────────────────────────────────────────────┐ + │ GEPA optimize_anything │ + │ │ + │ seed_candidate ─► evaluator(candidate, task) │ + │ │ │ │ + │ │ (score, side_info) │ + │ │ │ │ + │ │ reflection LM reads │ + │ │ side_info rationale │ + │ │ │ │ + │ │ proposes mutation │ + │ │ │ │ + │ └──── best_candidate (Pareto frontier) ◄───┘│ + └──────────────────────────────────────────────────┘ ``` -This will: -1. Load the SKILL.md and its test cases from `ground_truth.yaml` -2. Have a generation model (Sonnet) produce responses using ONLY the skill, then score those responses -3. Also score the SKILL.md itself for pattern/fact coverage -4. Run GEPA's optimization loop (reflect on failures, propose mutations, select via Pareto frontier) -5. Show a diff with quality improvement and token reduction -6. Apply the optimized SKILL.md back to disk +**GEPA** ([Generalized Evolutionary Prompt Architect](https://github.com/gepa-ai/gepa)) treats the SKILL.md as a text artifact to optimize. Its `optimize_anything` API takes: +- A **seed candidate** (the current SKILL.md text) +- An **evaluator** function: `(candidate, task_example) -> (score, side_info)` +- A **dataset** of test cases from `ground_truth.yaml` -## Setup +GEPA's reflection LM reads the `side_info` diagnostics, proposes mutations, evaluates them, and selects the best via Pareto frontier. 
The critical insight: the richer the `side_info` diagnostics, the better GEPA's mutations. -```bash -# Install with optimization dependencies -uv pip install -e ".test/[all]" +### MLflow Judges as the Evaluator -# Authentication for models (pick one) -# Option A: Databricks Model Serving (default for both gen + reflection) -export DATABRICKS_API_KEY="dapi..." -export DATABRICKS_API_BASE="https://.cloud.databricks.com/serving-endpoints" +The evaluator uses [MLflow's `make_judge`](https://mlflow.org/docs/latest/llms/llm-evaluate/index.html) to score responses. Three judges replace the previous 6-judge + binary-assertion system: -# Option B: OpenAI -export OPENAI_API_KEY="sk-..." -export GEPA_REFLECTION_LM="openai/gpt-4o" -export GEPA_GEN_LM="openai/gpt-4o" +| Judge | What it does | Returns | +|-------|-------------|---------| +| **quality_judge** | Scores a single response against expected facts, patterns, and guidelines | `float` (0.0-1.0) + rationale | +| **effectiveness_judge** | Compares WITH-skill vs WITHOUT-skill responses | `"improved"` / `"same"` / `"regressed"` + rationale | +| **regression_judge** | Identifies specific ways the skill harms responses | `bool` + rationale of what to fix | -# Optional: override generation model (default: databricks/databricks-claude-sonnet-4-6) -# export GEPA_GEN_LM="databricks/databricks-claude-sonnet-4-6" +Each judge returns **full rationale** — not truncated — so GEPA's reflection LM sees exactly what failed and why: -# Optional: set a global token budget ceiling for optimization -# export GEPA_TOKEN_BUDGET=50000 +```python +side_info = { + "Judge_quality_with": { + "score": 0.65, + "rationale": "The response correctly uses CREATE OR REPLACE VIEW but misses " + "the MEASURE() wrapping requirement for measure references. " + "Pattern adherence: 2/3 found. Fact coverage: 3/5 present." 
+ }, + "Judge_quality_without": { + "score": 0.2, + "rationale": "Without the skill, the model invented a non-existent " + "CREATE METRIC VIEW syntax. Only 1/5 expected facts present." + }, + "Judge_effectiveness": { + "verdict": "improved", + "rationale": "The skill successfully teaches WITH METRICS LANGUAGE YAML " + "syntax (+0.45 delta). However, the MEASURE() wrapping example " + "uses incorrect syntax that the model copies." + } +} ``` ---- +### Scoring Weights -## Optimization Commands +| Weight | Dimension | Source | +|--------|-----------|--------| +| **35%** | Skill Effectiveness | `quality_with - quality_without` (the delta) | +| **25%** | Absolute Quality | `quality_with` score from judge | +| **10%** | Judge Effectiveness | Effectiveness verdict score | +| **5%** | Structure | Python/SQL syntax validation | +| **25%** | Token Efficiency | Smaller = higher score (bonus up to 1.15x) | + +--- -### Evaluate + Optimize a Skill +## Quick Start ```bash -# Standard optimization (50 iterations per component, up to 5 passes) -uv run python .test/scripts/optimize.py +# Install +uv pip install -e ".test/[all]" -# Quick pass (15 iterations, good for initial check) -uv run python .test/scripts/optimize.py --preset quick +# Auth (pick one) +export DATABRICKS_API_KEY="dapi..." +export DATABRICKS_API_BASE="https://.cloud.databricks.com/serving-endpoints" +# OR +export OPENAI_API_KEY="sk-..." +export GEPA_REFLECTION_LM="openai/gpt-4o" +export GEPA_GEN_LM="openai/gpt-4o" -# Thorough optimization (150 iterations, production quality) -uv run python .test/scripts/optimize.py --preset thorough +# Optimize +uv run python .test/scripts/optimize.py databricks-metric-views --preset quick --apply +``` -# Dry run: see scores and config without calling GEPA -uv run python .test/scripts/optimize.py --dry-run +--- -# Apply the last saved result (no re-run!) 
-uv run python .test/scripts/optimize.py --apply-last +## What Can Be Optimized -# Run optimization and immediately apply -uv run python .test/scripts/optimize.py --apply +GEPA treats any text artifact as a candidate for optimization. The framework supports three types of artifacts, independently or together: -# Use a specific generation model for evaluation -uv run python .test/scripts/optimize.py --gen-model "openai/gpt-4o" +### Skills (SKILL.md files) -# Control iteration depth (default: 5 passes) -uv run python .test/scripts/optimize.py --max-passes 3 +SKILL.md files teach agents Databricks patterns — API syntax, code examples, best practices. Each skill is a standalone GEPA component (`skill_md`). -# Set a token budget ceiling (candidates exceeding this are penalized) -uv run python .test/scripts/optimize.py --token-budget 50000 +```bash +# Optimize a skill +uv run python .test/scripts/optimize.py databricks-metric-views --preset quick # Optimize all skills that have test cases uv run python .test/scripts/optimize.py --all --preset quick ``` -After each run, the optimized result is automatically saved to `.test/skills//optimized_SKILL.md`. You can review it, diff it against the original, and apply when ready with `--apply-last` — no need to re-run the optimization. +### MCP Tool Descriptions -### Optimize MCP Tool Descriptions - -GEPA can also optimize the `@mcp.tool` docstrings in `databricks-mcp-server/`. Tool descriptions are what the AI agent sees when deciding which tool to call -- concise, accurate descriptions lead to better tool selection. +`@mcp.tool` docstrings in `databricks-mcp-server/` are what the agent sees when deciding which tool to call. Concise, accurate descriptions improve tool selection. Each tool module becomes a separate GEPA component (`tools_sql`, `tools_serving`, etc.). 
```bash -# Optimize a skill AND its related tool modules together -uv run python .test/scripts/optimize.py databricks-model-serving --include-tools --tool-modules serving sql - -# Optimize specific tool modules alongside a skill -uv run python .test/scripts/optimize.py databricks-model-serving --include-tools --tool-modules serving compute jobs - -# Optimize ALL tool modules alongside a skill -uv run python .test/scripts/optimize.py databricks-model-serving --include-tools +# Optimize tool descriptions ONLY (no SKILL.md) +uv run python .test/scripts/optimize.py databricks-metric-views --tools-only --tool-modules sql -# Optimize ONLY tool descriptions (no SKILL.md) -uv run python .test/scripts/optimize.py databricks-model-serving --tools-only --tool-modules serving - -# Dry run to see components and token counts -uv run python .test/scripts/optimize.py databricks-model-serving --include-tools --dry-run +# Optimize specific tool modules +uv run python .test/scripts/optimize.py databricks-metric-views --tools-only --tool-modules sql serving compute ``` -When `--include-tools` is used, GEPA creates one component per tool module (e.g., `tools_sql`, `tools_serving`) and round-robins through them alongside `skill_md`. The `--apply` flag writes optimized docstrings back to the MCP server source files. - -**Note:** The SkillBench evaluator (default) automatically includes tools even without `--include-tools`, since tool descriptions are the primary token consumer (~17K tokens across 88 tools). Use `--tools-only` to optimize only tool descriptions without the SKILL.md. +When applied (`--apply`), optimized docstrings are written back to the MCP server source files via AST, preserving all surrounding code. -The iteration budget (`max_metric_calls`) is automatically scaled by the number of components so each one gets the preset's full budget. Additionally, the optimizer runs **up to 5 passes** (configurable with `--max-passes`), re-seeding from the previous best each time. 
It stops early if a pass produces no improvement. +### Skills + Tools Together -Available tool modules (88 tools across 16 modules): -`agent_bricks`, `aibi_dashboards`, `apps`, `compute`, `file`, `genie`, `jobs`, `lakebase`, `manifest`, `pipelines`, `serving`, `sql`, `unity_catalog`, `user`, `vector_search`, `volume_files` +The most powerful mode: optimize the skill and its related tool descriptions in a single run. GEPA round-robins across all components, so each gets dedicated reflection and mutation budget. -### Changing the Generation Model - -The evaluator uses a **generation model** to simulate an agent reading the SKILL.md and producing a response. Better skill content leads to better generated responses, which drives GEPA to make meaningful improvements. - -The default is `databricks/databricks-claude-sonnet-4-6`. - -| Method | Example | -|--------|---------| -| Environment variable | `export GEPA_GEN_LM="databricks/databricks-claude-sonnet-4-6"` | -| CLI flag | `--gen-model "openai/gpt-4o"` | -| Python | `optimize_skill("my-skill", gen_model="anthropic/claude-sonnet-4-5-20250514")` | - -### Changing the Reflection Model - -GEPA uses a reflection LM to analyze scorer failures and propose skill improvements. The default is **Databricks Model Serving** (`databricks-claude-opus-4-6`, 200K context). 
- -| Method | Example | -|--------|---------| -| Environment variable | `export GEPA_REFLECTION_LM="databricks/databricks-claude-opus-4-6"` | -| CLI flag | `--reflection-lm "openai/gpt-4o"` | -| Python | `optimize_skill("my-skill", reflection_lm="anthropic/claude-sonnet-4-5-20250514")` | - -Model strings use [litellm provider prefixes](https://docs.litellm.ai/docs/providers): - -| Provider | Prefix | Example | Context | -|----------|--------|---------|---------| -| Databricks Model Serving | `databricks/` | `databricks/databricks-claude-opus-4-6` | 200K | -| OpenAI | `openai/` | `openai/gpt-4o` | 128K | -| Anthropic | `anthropic/` | `anthropic/claude-sonnet-4-5-20250514` | 200K | +```bash +# Skill + specific tool modules +uv run python .test/scripts/optimize.py databricks-metric-views --include-tools --tool-modules sql -**Context window requirement:** The reflection model must have a context window large enough to hold the full candidate (all components) plus GEPA's reflection overhead (~3x the raw candidate tokens). Models with small context windows (e.g., 8K) will fail with `BadRequestError` during reflection. The optimizer validates this upfront and warns if the model is too small. +# Skill + ALL tool modules (auto-scaled budget) +uv run python .test/scripts/optimize.py databricks-metric-views --include-tools -### Authentication +# Dry run to see all components and their token counts +uv run python .test/scripts/optimize.py databricks-metric-views --include-tools --dry-run +``` -| Provider | Required Environment Variables | -|----------|-------------------------------| -| Databricks | `DATABRICKS_API_KEY`, `DATABRICKS_API_BASE` (must end with `/serving-endpoints`) | -| OpenAI | `OPENAI_API_KEY` | -| Anthropic | `ANTHROPIC_API_KEY` | +By default, **tools are auto-included** even without `--include-tools` — tool descriptions are typically the largest token consumer (~17K tokens across 88 tools in 16 modules). 
Use `--tools-only` to skip the SKILL.md, or omit `--include-tools` to let the auto-include handle it. -Optional overrides: `GEPA_REFLECTION_LM` (reflection model), `GEPA_GEN_LM` (generation model for evaluation), `GEPA_TOKEN_BUDGET` (token ceiling for optimization). +Available tool modules: `agent_bricks`, `aibi_dashboards`, `apps`, `compute`, `file`, `genie`, `jobs`, `lakebase`, `manifest`, `pipelines`, `serving`, `sql`, `unity_catalog`, `user`, `vector_search`, `volume_files` --- -## How Evaluation Works - -The evaluation system answers a single question: **does this SKILL.md teach an AI agent the right things?** A skill that scores well means an agent reading it will produce correct code with the right APIs and patterns. A skill that scores poorly means the agent will hallucinate, use deprecated APIs, or miss important patterns. - -Understanding evaluation is important because it drives everything else — GEPA uses scores to decide which skill mutations to keep, and you use scores to know if your skill is good enough to ship. - -### Why These Files Exist - -Each skill under `.test/skills//` has two key files: - -**`ground_truth.yaml`** — The test cases. Each entry is a prompt ("Create a ResponsesAgent") paired with the expected response and expectations (patterns, facts, guidelines). These define *what the skill should teach*. Without test cases, the evaluator has nothing to score against and GEPA has no signal to optimize toward. - -**`manifest.yaml`** — The scorer configuration. Controls *which scorers run* and *what thresholds apply*. Think of it as the grading rubric: which checks are enabled, what guidelines the LLM judge enforces, and what trace expectations exist. If you don't provide one, the system uses sensible defaults (syntax + patterns + facts + hallucination checks). 
- -The test cases in `ground_truth.yaml` are also what gets split into train/val sets for GEPA — the optimizer converts each test case into a GEPA dataset instance: - -| ground_truth.yaml field | GEPA field | Purpose | -|------------------------|------------|---------| -| `inputs.prompt` | `input` | The task the reflection LM sees | -| `outputs.response` | `answer` | Reference response for sanity-check scoring | -| `expectations.*` | `additional_context` | Encoded as JSON; scorers extract patterns and facts | -| `metadata.category` | (stratification) | Ensures balanced train/val split | - -### The Layered Evaluation - -Rather than scoring a single static response, the evaluator runs five layers that give GEPA progressively richer signal: - -| Layer | Weight | What it does | Source | -|-------|--------|-------------|--------| -| **Generated response quality** | 20% | An LLM reads ONLY the SKILL.md and answers the test prompt. Its response is scored for patterns/facts. | `evaluator.py` → litellm generation | -| **Skill content coverage** | 35% | Checks if the SKILL.md itself contains the patterns and facts needed. If a pattern is missing from the skill, this drops immediately. | `evaluator.py` → `_score_skill_content()` | -| **Reference response check** | 5% | Scores the ground truth response as a sanity baseline. This is mostly static — it ensures the test case itself is valid. | `evaluator.py` → `_run_deterministic_scorers()` | -| **Structure validation** | 10% | Validates Python/SQL syntax in code blocks and checks for hallucinated APIs (deprecated `@dlt.table`, old `mlflow.evaluate`, etc). | `evaluator.py` → `_validate_skill_structure()` | -| **Token efficiency** | 30% | Rewards concise skill content. Shrinking below original size earns a bonus (up to 1.15x), same size = 1.0, linear penalty to 0.0 at 2x original. 
| `evaluator.py` → token counting | - -**Why this works:** The key insight is that Layer 1 (generated response) creates a causal chain — if the SKILL.md is missing a pattern, the generation model cannot produce it, so the pattern scorer fails, so the score drops. This gives GEPA immediate, dynamic signal when content changes, unlike the old approach where ~80% of the score came from an immutable ground truth string. - -**Fallback mode:** When no generation model is available (no `GEPA_GEN_LM`), the weights shift to 40% skill content + 20% reference + 10% structure + 30% efficiency. - -### SkillBench Evaluator (Default) - -The default evaluator (`--evaluator skillbench`) measures **skill effectiveness**: how much does the skill help an agent answer correctly? It runs each test case twice — once WITH the skill and once WITHOUT — then scores the delta. - -| Weight | Dimension | What it measures | -|--------|-----------|-----------------| -| **45%** | Skill Effectiveness | `pass_rate_with - pass_rate_without` — the delta. Only rewards content the agent doesn't already know. | -| **25%** | Absolute Quality | `pass_rate_with` — overall correctness with the skill present. | -| **5%** | Structure | Syntax validity (Python/SQL) and no hallucinated APIs. | -| **25%** | Token Efficiency | Smaller candidates score higher. Linear penalty for growth (0.0 at 2x original). Bonus for reduction (up to 1.15 at 0% of original). | - -**Key difference from the legacy evaluator:** SkillBench uses binary pass/fail assertions (from `expectations` in `ground_truth.yaml`) rather than fuzzy scorer scores. 
Assertions are classified as: -- **NEEDS_SKILL** — fails both with and without the skill (the skill must teach this) -- **REGRESSION** — passes without, fails with (the skill confuses the agent — simplify or remove) -- **POSITIVE** — fails without, passes with (the skill is helping — keep it) -- **NEUTRAL** — same result either way (the agent already knows this — adding it wastes tokens) - -The reflection LM sees these labels in the `Error` field of each example's side info, guiding it to add NEEDS_SKILL content and remove REGRESSION content. +## Example Workflow: `databricks-metric-views` -**Token budget:** Use `--token-budget N` to set a hard ceiling. Candidates exceeding the budget receive a steep penalty on top of the normal efficiency score. Set via CLI or `GEPA_TOKEN_BUDGET` env var. +This walks through the full lifecycle of evaluating and optimizing the metric views skill. -To use the legacy evaluator instead: `--evaluator legacy`. +### 1. Inspect the skill and test cases -### Built-in Scorers - -The system ships with four tiers of scorers: - -**Tier 1: Deterministic (fast, reliable, ~$0/eval)** - -| Scorer | What it checks | Configured via | -|--------|---------------|----------------| -| `python_syntax` | Python code blocks parse with `ast.parse()` | `manifest.yaml` → `scorers.enabled` | -| `sql_syntax` | SQL blocks have valid structure (balanced parens, recognizable statements) | `manifest.yaml` → `scorers.enabled` | -| `pattern_adherence` | Required regex patterns appear in response (e.g., `ResponsesAgent`, `CLUSTER BY`) | `ground_truth.yaml` → `expectations.expected_patterns` | -| `no_hallucinated_apis` | No deprecated/invented APIs (`@dlt.table`, `dlt.read`, `PARTITION BY`, old `mlflow.evaluate`) | `manifest.yaml` → `scorers.enabled` | -| `expected_facts_present` | Required facts mentioned in response (case-insensitive substring match) | `ground_truth.yaml` → `expectations.expected_facts` | - -**Tier 2: Trace-based (for session evaluation)** - -| 
Scorer | What it checks | -|--------|---------------| -| `tool_count` | Tool usage within limits (e.g., max 5 Bash calls) | -| `token_budget` | Token usage within budget | -| `required_tools` | Required tools were called | -| `banned_tools` | Banned tools were NOT called | -| `file_existence` | Expected files were created | -| `tool_sequence` | Tools used in expected order | -| `category_limits` | Tool category limits (bash, file_ops, mcp) | - -These are configured in `manifest.yaml` under `trace_expectations`. - -**Tier 3: LLM judges (expensive, nuanced, ~$0.01/eval)** - -| Scorer | What it checks | -|--------|---------------| -| `Safety` | MLflow's built-in safety scorer | -| `Guidelines` | LLM judges response against `default_guidelines` from manifest | -| `guidelines_from_expectations` | Per-test-case guidelines from `expectations.guidelines` in ground_truth.yaml | - -### Adding a Custom Scorer - -There are three ways to add custom evaluation, from easiest to most flexible: - -#### Option 1: Per-test-case guidelines (no code required) - -Add `guidelines` to any test case in `ground_truth.yaml`. An LLM judge evaluates the response against these: +The skill lives at `databricks-skills/databricks-metric-views/SKILL.md`. 
Test cases live at `.test/skills/databricks-metric-views/ground_truth.yaml`: ```yaml test_cases: - - id: my_test_001 + - id: metric-views_create_sql_001 inputs: - prompt: "Deploy a model to serving" + prompt: "Create a metric view for order analytics with revenue and order count measures" + outputs: + response: | + ```sql + CREATE OR REPLACE VIEW main.default.order_metrics + WITH METRICS LANGUAGE YAML + $$ + source: main.default.orders + dimensions: + - name: Order Month + expr: DATE_TRUNC('MONTH', order_date) + measures: + - name: Total Revenue + expr: SUM(amount) + $$ + ``` expectations: - guidelines: - - "Must use Unity Catalog three-level namespace" - - "Must recommend job-based deployment over synchronous" - - "Should warn about cold start latency" expected_facts: - - "ResponsesAgent" -``` - -Then enable the scorer in `manifest.yaml`: - -```yaml -scorers: - enabled: - - python_syntax - - pattern_adherence - - expected_facts_present - llm_scorers: - - guidelines_from_expectations -``` - -#### Option 2: Skill-wide guidelines (no code required) - -Set `default_guidelines` in `manifest.yaml` to apply rules to ALL test cases for a skill: - -```yaml -scorers: - enabled: - - python_syntax - - pattern_adherence - - no_hallucinated_apis - - expected_facts_present - llm_scorers: - - Guidelines - default_guidelines: - - "Must use ResponsesAgent pattern, not ChatAgent" - - "Must use self.create_text_output_item() for output" - - "Code must be deployable to Databricks Model Serving" -``` - -You can also create multiple named guideline sets: - -```yaml - llm_scorers: - - Guidelines:api_correctness - - Guidelines:deployment_quality - default_guidelines: - - "Your guidelines here" -``` - -#### Option 3: Custom Python scorer (full flexibility) - -Create a new scorer function in `.test/src/skill_test/scorers/` and register it. 
Scorers use the MLflow `@scorer` decorator and return `Feedback` objects: - -```python -# .test/src/skill_test/scorers/my_custom.py -from mlflow.genai.scorers import scorer -from mlflow.entities import Feedback -from typing import Dict, Any - -@scorer -def my_custom_check(outputs: Dict[str, Any], expectations: Dict[str, Any]) -> Feedback: - """Check for something specific to my use case.""" - response = outputs.get("response", "") - - # Your custom logic here - issues = [] - if "spark.sql(" in response and "spark.read.table(" not in response: - issues.append("Should prefer spark.read.table() over spark.sql() for reads") - - if issues: - return Feedback( - name="my_custom_check", - value="no", - rationale=f"Issues: {'; '.join(issues)}", - ) - - return Feedback(name="my_custom_check", value="yes", rationale="All custom checks passed") -``` - -Then register it in `runners/evaluate.py` → `build_scorers()`: - -```python -SCORER_MAP = { - # ... existing scorers ... - "my_custom_check": my_custom_check, -} -``` - -And enable it in your skill's `manifest.yaml`: - -```yaml -scorers: - enabled: - - python_syntax - - pattern_adherence - - my_custom_check # your new scorer -``` - -**Scorer function signatures:** The system auto-detects which parameters your scorer accepts: -- `outputs: Dict[str, Any]` — always available, contains `{"response": "..."}` -- `expectations: Dict[str, Any]` — from ground_truth.yaml `expectations` field -- `inputs: Dict[str, Any]` — contains `{"prompt": "..."}` - -Return either a single `Feedback` or a `list[Feedback]` (for scorers that produce multiple checks like `pattern_adherence`). 
- -### Manifest Configuration Examples - -Here are manifest patterns for different skill types: - -**Python SDK skill** — emphasizes syntax and API correctness: -```yaml -scorers: - enabled: [python_syntax, pattern_adherence, no_hallucinated_apis, expected_facts_present] - llm_scorers: [guidelines_from_expectations] - default_guidelines: - - "Must use ResponsesAgent pattern for GenAI agents" -quality_gates: - syntax_valid: 1.0 - pattern_adherence: 0.9 -``` - -**SQL-heavy skill** — adds SQL validation: -```yaml -scorers: - enabled: [python_syntax, sql_syntax, pattern_adherence, no_hallucinated_apis, expected_facts_present] - default_guidelines: - - "Must use SDP syntax (CREATE OR REFRESH STREAMING TABLE)" -``` + - "Uses CREATE OR REPLACE VIEW with WITH METRICS LANGUAGE YAML" + - "Defines dimensions with name and expr fields" + - "Defines measures with name and expr using aggregate functions" + expected_patterns: + - pattern: "WITH METRICS LANGUAGE YAML" + description: "Metric view DDL syntax" + - pattern: "MEASURE\\(" + description: "MEASURE() function for querying" + guidelines: + - "Must use WITH METRICS LANGUAGE YAML syntax" + - "Must define dimensions and measures in YAML block" -**Skill with trace expectations** — limits tool usage during session evaluation: -```yaml -scorers: - enabled: [python_syntax, pattern_adherence, no_hallucinated_apis, expected_facts_present] - default_guidelines: - - "Must use correct MCP tools (manage_ka, manage_mas)" - trace_expectations: - tool_limits: - manage_ka: 10 - manage_mas: 10 - required_tools: [Read] - banned_tools: [] + - id: metric-views_query_measure_002 + inputs: + prompt: "Query a metric view to get total revenue and order count by month" + expectations: + expected_facts: + - "Uses MEASURE() function to reference measures" + - "SELECT * is NOT supported on metric views" + expected_patterns: + - pattern: "MEASURE\\(" + description: "MEASURE() wrapping for measures" + - pattern: "GROUP BY ALL" + description: "GROUP BY 
ALL for metric view queries" ``` ---- - -## Best Practices for Optimization - -These practices are derived from the [optimize_anything API guide](https://gepa-ai.github.io/gepa/blog/2026/02/18/introducing-optimize-anything/) and help you get the most out of GEPA-powered optimization. - -### 1. Write Rich Evaluators with Actionable Side Information (ASI) - -GEPA's reflection LM learns from diagnostic feedback, not just scores. The more context your evaluator surfaces, the better the proposals. - -- **Return diagnostics**, not just a number. Use `oa.log()` or return `(score, side_info_dict)` to surface error messages, failing patterns, and missing facts. -- **Break scoring into multiple dimensions**. Rather than a single composite score, provide separate scores for syntax, pattern adherence, API accuracy, and conciseness. GEPA's Pareto-efficient selection preserves candidates that excel in different dimensions. -- Our built-in evaluator already does this -- it returns per-scorer feedback via `feedback_to_asi()`. - -### 2. Build a Diverse Dataset of Test Cases - -GEPA operates in three modes depending on what data you provide: -- **Single-task** (no dataset): evaluator scores the artifact directly -- **Multi-task** (dataset only): Pareto-efficient search across tasks -- **Generalization** (dataset + valset): trains on tasks, validates on held-out examples - -For best results: -- Aim for **5+ test cases** to enable a train/val split (Generalization mode). Fewer than 5 defaults to single-task mode. -- Cover **different categories** of usage (e.g., simple queries, complex joins, error handling). The automatic stratified splitter ensures balanced representation. -- Use `/skill-test add` to interactively generate test cases, then review with `/skill-test review`. - -### 3. Tune reflection_minibatch_size for Focused Improvement - -GEPA's default `reflection_minibatch_size=2` shows the reflection LM feedback from 2 tasks per iteration. 
This keeps each reflection focused and prevents the LM from trying to fix everything at once. - -- For skills with **many test cases** (10+), the default of 2 works well -- over iterations, all tasks get attention. -- For skills with **few test cases** (3-5), consider increasing to 3 so more context is visible per step. +Each test case defines: +- **`inputs.prompt`** — what the user asks +- **`expectations.expected_facts`** — facts the response must mention +- **`expectations.expected_patterns`** — regex patterns the response must contain +- **`expectations.guidelines`** — soft rules for the MLflow quality judge -### 4. Use Multi-Component Optimization for Skills + Tools +### 2. Dry run to check baseline -When you optimize a SKILL.md alongside tool descriptions (`--include-tools`), GEPA creates separate components and cycles through them with round-robin selection. This means: - -- Each component gets its **own** optimization budget (the preset's `max_metric_calls` is multiplied by the component count). -- Up to `--max-passes` full optimization cycles run, re-seeding from the best candidate each time. -- Start with `--tool-modules` to target specific modules rather than optimizing all 16 at once. - -### 5. Choose the Right Preset - -| Preset | Budget per Component | Use Case | -|--------|---------------------|----------| -| `quick` | 15 calls | Fast feedback loop, initial exploration | -| `standard` | 50 calls | Default, good balance of quality and cost | -| `thorough` | 150 calls | Production-quality optimization | - -For multi-component runs, the actual `max_metric_calls` = budget x number of components. - -### 6. Leverage the Background Context - -The `background` parameter tells the reflection LM domain-specific constraints. Our optimizer automatically provides Databricks-specific context (token budgets, skill structure rules, scorer descriptions). For custom use cases, you can extend `build_optimization_background()` in `evaluator.py`. - -### 7. 
Iterate with Dry Runs First - -Always start with `--dry-run` to verify your setup: ```bash -uv run python .test/scripts/optimize.py --include-tools --dry-run +uv run python .test/scripts/optimize.py databricks-metric-views --dry-run ``` -This shows the component list, token counts, current score, and config without calling GEPA. Fix any scorer issues or missing test cases before spending optimization budget. - ---- - -## Building Test Cases for a Skill - -Skills need test cases before optimization can work well. There are three ways to add them: - -### Quick: Extract from MLflow Traces - -If you have MLflow traces (from `mlflow autolog claude`), extract test cases directly from them. You can find your traces at your workspace's MLflow experiment page, e.g.: -`https://.cloud.databricks.com/ml/experiments//traces` -```bash -# Step 1: Set authentication -export DATABRICKS_HOST="https://e2-demo-field-eng.cloud.databricks.com" -export DATABRICKS_TOKEN="dapi..." - -# Step 2: List traces in your experiment to see what's available -uv run python .test/scripts/trace_to_examples.py \ - --experiment-id 2452310130108632 --list - -# Step 3: Extract examples from all recent traces -uv run python .test/scripts/trace_to_examples.py \ - --experiment-id 2452310130108632 \ - --skill databricks-model-serving - -# Or extract from a specific trace ID (from the UI or --list output) -uv run python .test/scripts/trace_to_examples.py \ - --trace-id tr-d416fccdab46e2dea6bad1d0bd8aaaa8 \ - --skill databricks-model-serving - -# Or extract from a specific MLflow run ID -uv run python .test/scripts/trace_to_examples.py \ - --run-id abc123def456 \ - --skill databricks-model-serving - -# Refine auto-extracted expectations with an LLM -uv run python .test/scripts/trace_to_examples.py \ - --experiment-id 2452310130108632 \ - --skill databricks-model-serving --refine - -# Auto-append directly to ground_truth.yaml (skip manual review) -uv run python .test/scripts/trace_to_examples.py \ - 
--experiment-id 2452310130108632 \ - --skill databricks-model-serving --trust - -# Limit how many traces to process -uv run python .test/scripts/trace_to_examples.py \ - --experiment-id 2452310130108632 \ - --skill databricks-model-serving --limit 5 ``` - -You can also extract from local session.jsonl files (Claude Code stores these at `~/.claude/projects/`): - -```bash -uv run python .test/scripts/trace_to_examples.py \ - --trace ~/.claude/projects/.../session.jsonl \ - --skill databricks-model-serving +=== Dry Run: databricks-metric-views (skillbench) === +SKILL.md path: databricks-skills/databricks-metric-views/SKILL.md +Components: ['skill_md', 'tools_sql', ...] +Total original tokens: 17,991 +Train tasks: 8 +Evaluator: skillbench (judge-driven) +Current score: 0.909 + metric-views_create_sql_001: 0.952 + metric-views_query_measure_002: 0.871 + metric-views_create_mcp_003: 0.934 + ... ``` -The script extracts user prompt / assistant response pairs, auto-generates `expected_patterns` from code blocks and `expected_facts` from API references, and saves to `candidates.yaml` for review (or directly to `ground_truth.yaml` with `--trust`). - -#### Workflow: MLflow Traces to Optimized Skill +The evaluator runs each test case **twice** — once WITH the skill in context and once WITHOUT — then judges the delta. Test case 002 scores lower because the MEASURE() wrapping example in the skill has a syntax gap. -The end-to-end workflow for turning real agent sessions into skill improvements: +### 3. Run optimization ```bash -# 1. Extract examples from your traces -uv run python .test/scripts/trace_to_examples.py \ - --experiment-id 2452310130108632 \ - --skill databricks-model-serving --refine - -# 2. Review the extracted candidates -cat .test/skills/databricks-model-serving/candidates.yaml - -# 3. 
Promote good candidates to ground_truth.yaml -# (edit candidates.yaml, keep the good ones, then) -uv run python .test/scripts/trace_to_examples.py \ - --experiment-id 2452310130108632 \ - --skill databricks-model-serving --trust - -# 4. Run optimization with the enriched dataset -uv run python .test/scripts/optimize.py databricks-model-serving --preset quick - -# 5. If score improves, apply -uv run python .test/scripts/optimize.py databricks-model-serving --preset standard --apply +uv run python .test/scripts/optimize.py databricks-metric-views --preset quick ``` -### Quick: Add a Single Example Manually +GEPA runs 15 iterations per component across up to 5 passes. Each iteration: +1. Mutates the SKILL.md based on judge rationale +2. Generates responses WITH the mutated skill +3. Judges score the responses +4. GEPA keeps mutations that improve the Pareto frontier -```bash -# Interactive mode — prompts for each field -uv run python .test/scripts/add_example.py databricks-model-serving - -# Inline mode — provide prompt and response directly -uv run python .test/scripts/add_example.py databricks-model-serving \ - --prompt "Create a ChatAgent with tool calling" \ - --response-file /path/to/response.md \ - --facts "Uses ChatAgent class" "Implements predict method" \ - --patterns "ChatAgent" "def predict" - -# From clipboard (paste prompt + response separated by ---) -uv run python .test/scripts/add_example.py databricks-model-serving --from-clipboard ``` + Starting multi-pass optimization (up to 5 passes, 1 component(s), 15 metric calls/pass) -The script auto-generates an ID, detects code language, extracts patterns from code blocks, and confirms before saving. - -### Full Workflow: Initialize + Add + Review + --- Pass 1/5 (best score so far: 0.9090) --- + Pass 1 score: 0.9350 (delta: +0.0260) -### 1. Initialize Test Scaffolding - -``` -/skill-test init + --- Pass 2/5 (best score so far: 0.9350) --- + No significant improvement in pass 2 -- stopping early. 
``` -Generates `manifest.yaml` with scorer config, empty `ground_truth.yaml`, and `candidates.yaml`. +### 4. Review and apply -### 2. Add Test Cases - -``` -/skill-test add ``` +============================================================ + Optimization Results: databricks-metric-views +============================================================ + Score: 0.909 -> 0.935 (+0.026) + Skill Effectiveness: 0.42 + Quality (with): 0.78 + Quality (without): 0.36 (baseline) + Tokens: 17,991 -> 18,265 (+1.5%) -Interactively generates test cases. Passing tests go to `ground_truth.yaml`, failing ones to `candidates.yaml` for review. - -### 3. Review Candidates + Per-task: + metric-views_create_sql_001 WITH 0.85 WITHOUT 0.35 delta +0.50 [OK] + metric-views_query_measure_002 WITH 0.79 WITHOUT 0.22 delta +0.57 [OK] + ... -``` -/skill-test review -/skill-test review --batch --filter-success + Saved: .test/skills/databricks-metric-views/optimized_SKILL.md + Apply: uv run python .test/scripts/optimize.py databricks-metric-views --apply-last +============================================================ ``` -### 4. Configure Scorers (Optional) +Review the diff, then apply: -Edit `.test/skills//manifest.yaml` or: -``` -/skill-test scorers update --add-guideline "Must use CLUSTER BY" -``` - -### 5. Run Evaluation - -``` -/skill-test run -``` - -### 6. Save Baseline + Check Regressions +```bash +# Review what changed +diff databricks-skills/databricks-metric-views/SKILL.md \ + .test/skills/databricks-metric-views/optimized_SKILL.md -``` -/skill-test baseline -/skill-test regression +# Apply +uv run python .test/scripts/optimize.py databricks-metric-views --apply-last ``` --- -## Test Case Examples - -Test cases live in `ground_truth.yaml` and tell GEPA what the skill should produce for a given prompt. Each test case has **inputs** (the user prompt), **outputs** (the expected response), and **expectations** (what scorers check). Here are real examples from the repo. 
- -### Example 1: Code Generation Test Case - -A test case that checks whether the skill produces correct Python code with the right API calls: - -```yaml -test_cases: - - id: serving_responses_agent_001 - inputs: - prompt: "Create a ResponsesAgent that uses UC functions for tool calling" - outputs: - response: | - ```python - from databricks.agents import ResponsesAgent - - agent = ResponsesAgent( - model="databricks-claude-sonnet-4", - tools=[{"type": "function", "function": {"name": "catalog.schema.my_function"}}], - instructions="You are a helpful assistant.", - ) - ``` - execution_success: true - expectations: - expected_facts: - - "Uses ResponsesAgent from databricks.agents" - - "Includes tools parameter with UC function reference" - - "Sets model to a valid Databricks model endpoint" - expected_patterns: - - pattern: "ResponsesAgent" - min_count: 1 - description: "Must use ResponsesAgent class" - - pattern: "catalog\\.\\w+\\.\\w+" - min_count: 1 - description: "UC function in three-level namespace" - metadata: - category: happy_path - difficulty: medium -``` - -**What GEPA sees:** The evaluator scores the skill against this test case. If the skill doesn't teach the agent about `ResponsesAgent` or UC function namespaces, the pattern scorers fail and GEPA's reflection LM learns what to add. 
- -### Example 2: SQL/Pipeline Test Case - -A test case for Spark Declarative Pipelines (SDP) with SQL syntax validation: +## CLI Reference -```yaml - - id: sdp_bronze_ingestion_001 - inputs: - prompt: "Create a bronze ingestion streaming table that reads from cloud storage using Auto Loader" - outputs: - response: | - ```sql - CREATE OR REFRESH STREAMING TABLE bronze_events - CLUSTER BY (event_date) - AS SELECT * - FROM STREAM read_files('s3://bucket/events/', format => 'json') - ``` - execution_success: null - expectations: - expected_facts: - - "Uses CREATE OR REFRESH STREAMING TABLE syntax" - - "Uses CLUSTER BY for data layout optimization" - - "Uses STREAM read_files() for Auto Loader ingestion" - expected_patterns: - - pattern: "CREATE OR REFRESH STREAMING TABLE" - min_count: 1 - description: "SDP streaming table DDL" - - pattern: "CLUSTER BY" - min_count: 1 - description: "Liquid clustering" - - pattern: "read_files\\(" - min_count: 1 - description: "Auto Loader function" - guidelines: - - "Must use SDP syntax (CREATE OR REFRESH), not legacy DLT syntax" - metadata: - category: happy_path - difficulty: easy -``` - -### Example 3: Error Handling Test Case - -Test cases don't have to be happy paths. Testing that a skill warns about common mistakes: - -```yaml - - id: serving_error_raw_dict_001 - inputs: - prompt: "Why does my ChatAgent return raw dicts instead of proper messages?" - outputs: - response: | - The `predict` method must return `ChatAgentResponse` with - `ChatAgentMessage` objects, not raw dicts. Use - `self.create_text_output_item(text)` to build response items. 
- execution_success: null - expectations: - expected_facts: - - "Explains the raw dict issue" - - "Mentions self.create_text_output_item()" - expected_patterns: - - pattern: "create_text_output_item" - min_count: 1 - description: "Correct helper method" - metadata: - category: error_handling - difficulty: easy -``` - -### Example 4: Minimal Test Case - -The simplest possible test case -- just a prompt and expected facts: - -```yaml - - id: genie_create_space_001 - inputs: - prompt: "Create a Genie Space for our sales data" - outputs: - response: "I'll create a Genie Space connected to your sales tables." - expectations: - expected_facts: - - "Creates a Genie Space" - - "Connects to data tables" - metadata: - category: happy_path - difficulty: easy -``` +```bash +# Presets +uv run python .test/scripts/optimize.py --preset quick # 15 iterations +uv run python .test/scripts/optimize.py --preset standard # 50 iterations (default) +uv run python .test/scripts/optimize.py --preset thorough # 150 iterations + +# Options +--dry-run # Show scores without optimizing +--apply # Run + apply immediately +--apply-last # Apply saved result without re-running +--gen-model "..." # Override generation model (default: databricks/databricks-claude-sonnet-4-6) +--reflection-lm "..." # Override reflection model (default: databricks/databricks-claude-opus-4-6) +--max-passes N # Max optimization passes (default: 5) +--token-budget N # Hard token ceiling +--include-tools # Include MCP tool descriptions as components +--tool-modules sql ... # Specific tool modules to include +--tools-only # Optimize only tool descriptions +--all # Optimize all skills with ground_truth.yaml + +# Test case generation +--generate-from FILE # Generate test cases from requirements file +--requirement "..." 
# Inline requirement (repeatable) +``` + +### Model Configuration + +| Env Var | Default | Purpose | +|---------|---------|---------| +| `GEPA_GEN_LM` | `databricks/databricks-claude-sonnet-4-6` | Generation model (produces responses from skill) | +| `GEPA_REFLECTION_LM` | `databricks/databricks-claude-opus-4-6` | Reflection model (proposes mutations) | +| `GEPA_TOKEN_BUDGET` | none | Hard token ceiling for candidates | + +Model strings use [litellm provider prefixes](https://docs.litellm.ai/docs/providers): `databricks/`, `openai/`, `anthropic/`. --- -## End-to-End Walkthrough +## Writing Test Cases -Here's a complete example of adding test cases and running optimization for a new skill. - -### Step 1: Initialize scaffolding - -```bash -# Creates manifest.yaml, ground_truth.yaml, candidates.yaml -/skill-test my-new-skill init -``` - -### Step 2: Write test cases - -Edit `.test/skills/my-new-skill/ground_truth.yaml`: +Test cases in `ground_truth.yaml` define what each skill should teach. Minimal example: ```yaml metadata: - skill_name: my-new-skill + skill_name: my-skill version: "1.0" test_cases: - id: basic_001 inputs: - prompt: "Show me how to create a Delta table with liquid clustering" + prompt: "Show me how to create a streaming table" outputs: response: | ```sql - CREATE TABLE catalog.schema.events ( - event_id BIGINT, - event_date DATE, - payload STRING - ) - CLUSTER BY (event_date) - ``` - expectations: - expected_facts: - - "Uses CREATE TABLE with CLUSTER BY" - expected_patterns: - - pattern: "CLUSTER BY" - min_count: 1 - description: "Liquid clustering syntax" - metadata: - category: happy_path - difficulty: easy - - - id: basic_002 - inputs: - prompt: "How do I read from a Delta table using Spark?" 
- outputs: - response: | - ```python - df = spark.read.table("catalog.schema.my_table") + CREATE OR REFRESH STREAMING TABLE bronze_events + AS SELECT * FROM STREAM read_files('s3://bucket/events/') ``` expectations: expected_facts: - - "Uses three-level namespace" + - "Uses CREATE OR REFRESH STREAMING TABLE syntax" expected_patterns: - - pattern: "spark\\.read\\.table" - min_count: 1 - description: "Spark table reader" + - pattern: "CREATE OR REFRESH STREAMING TABLE" + description: "SDP DDL syntax" + guidelines: + - "Must use SDP syntax, not legacy DLT syntax" metadata: category: happy_path - difficulty: easy - - # ... add at least 5 test cases for train/val split -``` - -### Step 3: Verify setup with a dry run - -```bash -uv run python .test/scripts/optimize.py my-new-skill --dry-run -``` - -Output: -``` -=== Dry Run: my-new-skill (skillbench) === -SKILL.md path: .claude/skills/my-new-skill/SKILL.md -[SkillBench] Auto-including tools: 16 modules, 88 tools, 64,675 chars -Components: ['skill_md', 'tools_sql', 'tools_serving', ...] -Total original tokens: 20,147 - skill_md: 2,847 tokens - tools_sql: 3,200 tokens - ... -Train tasks: 4 -Val tasks: None (single-task mode) -Evaluator type: skillbench -Preset: standard (max_metric_calls=850, scaled for 17 component(s)) -Max passes: 5 -Reflection LM: databricks/databricks-claude-opus-4-6 -Current score: 0.723 ``` -### Step 4: Run optimization - -```bash -# Quick first pass to see if GEPA can improve -uv run python .test/scripts/optimize.py my-new-skill --preset quick - -# Review the saved result -cat .test/skills/my-new-skill/optimized_SKILL.md -diff .claude/skills/my-new-skill/SKILL.md .test/skills/my-new-skill/optimized_SKILL.md - -# Happy with it? 
Apply without re-running -uv run python .test/scripts/optimize.py my-new-skill --apply-last - -# Or run standard for better results and apply immediately -uv run python .test/scripts/optimize.py my-new-skill --preset standard --apply -``` - -### Step 5: Save baseline for regression checking - -```bash -/skill-test my-new-skill baseline - -# Later, after making changes: -/skill-test my-new-skill regression -``` - -### Tips for Writing Good Test Cases - -- **Cover different categories**: happy_path, error_handling, edge cases. The splitter stratifies by `metadata.category` so each category is represented in both train and val sets. -- **Be specific in expected_patterns**: Use regex that captures the essential API call, not surrounding prose. `"ResponsesAgent"` is better than `".*ResponsesAgent.*"`. -- **Include both simple and complex prompts**: Simple prompts test baseline quality; complex prompts stress-test the skill's depth. -- **Set `execution_success`**: `true` if you verified the code runs, `null` if it's theoretical, `false` if it's known to fail. This helps scorers weight results. -- **Use `guidelines` for soft expectations**: Things an LLM judge should check but that can't be captured by regex (e.g., "Should explain why CLUSTER BY is preferred over partitioning"). +**Tips:** +- **5+ test cases** enables a train/val split for generalization +- **Cover categories**: happy_path, error_handling, edge cases — the splitter stratifies by `metadata.category` +- **`expected_patterns`** use regex — be specific (`"MEASURE\\("` not `".*MEASURE.*"`) +- **`guidelines`** are evaluated by the MLflow quality judge — use for soft expectations that can't be regex-matched +- **Generate from requirements**: `--requirement "Must explain MEASURE() wrapping"` auto-generates test cases --- -## Trace Evaluation - -Capture Claude Code sessions and evaluate against skill expectations. 
- -### Enable MLflow Tracing - -```bash -export DATABRICKS_CONFIG_PROFILE=aws-apps -export MLFLOW_EXPERIMENT_NAME="/Users//Claude Code Skill Traces" - -pip install mlflow[databricks] -mlflow autolog claude -u databricks -n "$MLFLOW_EXPERIMENT_NAME" . -``` - -### Evaluate Traces - -``` -/skill-test trace-eval --trace ~/.claude/projects/.../session.jsonl -/skill-test trace-eval --run-id abc123 -/skill-test list-traces --experiment "$MLFLOW_EXPERIMENT_NAME" -``` - ---- - -## Command Reference - -| Command | Description | -|------------|------------------------------------------| -| `run` | Execute tests against ground truth | -| `init` | Generate test scaffolding from skill docs| -| `add` | Add test cases interactively | -| `review` | Review and promote candidates | -| `baseline` | Save current results as baseline | -| `regression` | Compare against baseline | -| `mlflow` | Full evaluation with LLM judges | -| `optimize` | Optimize skill with GEPA | -| `trace-eval` | Evaluate session traces | -| `list-traces` | List available traces | -| `scorers` | View/update scorer config | - ---- - -## Files - -``` -.test/skills// -├── manifest.yaml # Scorers, guidelines, trace expectations -├── ground_truth.yaml # Verified test cases -├── candidates.yaml # Pending review -├── optimized_SKILL.md # Last optimization output (auto-saved) -└── last_optimization.json # Metadata for --apply-last - -.test/baselines// -└── baseline.yaml # Regression baseline - -.test/scripts/ -├── optimize.py # CLI for GEPA optimization -├── trace_to_examples.py # Extract test cases from session.jsonl traces -├── add_example.py # Manually add test cases to ground_truth.yaml -└── _common.py # Shared CLI utilities -``` - ---- - -## CI/CD - -```bash -uv pip install -e ".test/" -uv run pytest .test/tests/ -uv run python .test/scripts/regression.py +## Architecture + +``` +.test/ +├── scripts/ +│ ├── optimize.py # CLI entry point +│ ├── generate_examples.py # Generate test cases from requirements +│ └── 
trace_to_examples.py # Extract test cases from MLflow traces +├── src/skill_test/optimize/ +│ ├── judges.py # MLflow make_judge factories (quality, effectiveness, regression) +│ ├── skillbench_evaluator.py # WITH vs WITHOUT evaluator using judges +│ ├── runner.py # GEPA optimize_anything orchestrator +│ ├── utils.py # Token counting, path resolution +│ ├── asi.py # MLflow Feedback → side_info conversion +│ ├── alignment.py # MemAlign judge alignment (future) +│ ├── config.py # GEPA presets, model registration +│ ├── splitter.py # Train/val dataset splitting +│ └── tools.py # MCP tool description extraction +├── src/skill_test/scorers/ +│ ├── universal.py # Deterministic: python_syntax, sql_syntax, etc. +│ ├── trace.py # Trace-based: tool_count, token_budget, etc. +│ └── routing.py # Skill routing accuracy (deprecated) +└── skills// + ├── ground_truth.yaml # Test cases + ├── manifest.yaml # Scorer configuration + ├── optimized_SKILL.md # Last optimization output + └── last_optimization.json # Metadata for --apply-last ``` diff --git a/.test/notebooks/gepa_skill_optimization_demo.ipynb b/.test/notebooks/gepa_skill_optimization_demo.ipynb index b403bf1f..8099460d 100644 --- a/.test/notebooks/gepa_skill_optimization_demo.ipynb +++ b/.test/notebooks/gepa_skill_optimization_demo.ipynb @@ -1,27 +1,342 @@ { "cells": [ { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# GEPA Skill Optimization Demo\n", - "\n", - "This notebook demonstrates how the skill-test framework uses [GEPA](https://github.com/gepa-ai/gepa) to automatically optimize Databricks SKILL.md files for **quality** and **token efficiency**.\n", - "\n", - "SKILL.md files teach AI agents (like Claude Code) Databricks patterns. Every token in a skill consumes agent context window budget, so skills should be as concise and high-quality as possible.\n", - "\n", - "**What GEPA does:**\n", - "1. Scores the current SKILL.md against deterministic scorers (syntax, patterns, APIs, facts)\n", - "2. 
Reflects on failures and proposes mutations to improve the skill\n", - "3. Selects the best candidate via Pareto frontier optimization\n", - "4. Repeats until quality converges or budget is exhausted" - ] - }, - { - "cell_type": "markdown", + "cell_type": "raw", "metadata": {}, "source": [ - "## Setup" + "{\n", + " \"cells\": [\n", + " {\n", + " \"cell_type\": \"markdown\",\n", + " \"metadata\": {},\n", + " \"source\": [\n", + " \"# GEPA Skill Optimization Demo\\n\",\n", + " \"\\n\",\n", + " \"This notebook demonstrates how the skill-test framework uses [GEPA](https://github.com/gepa-ai/gepa) to automatically optimize Databricks SKILL.md files for **quality** and **token efficiency**.\\n\",\n", + " \"\\n\",\n", + " \"SKILL.md files teach AI agents (like Claude Code) Databricks patterns. Every token in a skill consumes agent context window budget, so skills should be as concise and high-quality as possible.\\n\",\n", + " \"\\n\",\n", + " \"**What GEPA does:**\\n\",\n", + " \"1. Scores the current SKILL.md against deterministic scorers (syntax, patterns, APIs, facts)\\n\",\n", + " \"2. Reflects on failures and proposes mutations to improve the skill\\n\",\n", + " \"3. Selects the best candidate via Pareto frontier optimization\\n\",\n", + " \"4. 
Repeats until quality converges or budget is exhausted\"\n", + " ]\n", + " },\n", + " {\n", + " \"cell_type\": \"markdown\",\n", + " \"metadata\": {},\n", + " \"source\": [\n", + " \"## Setup\"\n", + " ]\n", + " },\n", + " {\n", + " \"cell_type\": \"code\",\n", + " \"execution_count\": null,\n", + " \"metadata\": {},\n", + " \"outputs\": [],\n", + " \"source\": [\n", + " \"import sys\\n\",\n", + " \"from pathlib import Path\\n\",\n", + " \"\\n\",\n", + " \"# Add skill-test to path\\n\",\n", + " \"repo_root = Path(\\\".\\\").resolve()\\n\",\n", + " \"while not (repo_root / \\\".test\\\" / \\\"src\\\").exists() and repo_root != repo_root.parent:\\n\",\n", + " \" repo_root = repo_root.parent\\n\",\n", + " \"sys.path.insert(0, str(repo_root / \\\".test\\\" / \\\"src\\\"))\\n\",\n", + " \"\\n\",\n", + " \"print(f\\\"Repo root: {repo_root}\\\")\"\n", + " ]\n", + " },\n", + " {\n", + " \"cell_type\": \"code\",\n", + " \"execution_count\": null,\n", + " \"metadata\": {},\n", + " \"outputs\": [],\n", + " \"source\": \"import os\\n\\n# Configure the reflection model -- pick ONE:\\n\\n# Option A: Databricks Model Serving (default, recommended)\\n# IMPORTANT: DATABRICKS_API_BASE must end with /serving-endpoints\\n# os.environ[\\\"DATABRICKS_API_KEY\\\"] = \\\"dapi...\\\" \\n# os.environ[\\\"DATABRICKS_API_BASE\\\"] = \\\"https://.cloud.databricks.com/serving-endpoints\\\"\\n# os.environ[\\\"GEPA_REFLECTION_LM\\\"] = \\\"databricks/databricks-gpt-5-2\\\"\\n\\n# Option B: OpenAI\\n# os.environ[\\\"OPENAI_API_KEY\\\"] = \\\"sk-...\\\"\\n# os.environ[\\\"GEPA_REFLECTION_LM\\\"] = \\\"openai/gpt-4o\\\"\\n\\nprint(f\\\"Reflection LM: {os.environ.get('GEPA_REFLECTION_LM', 'databricks/databricks-gpt-5-2 (default)')}\\\")\"\n", + " },\n", + " {\n", + " \"cell_type\": \"markdown\",\n", + " \"metadata\": {},\n", + " \"source\": [\n", + " \"## Step 1: Inspect the Skill\\n\",\n", + " \"\\n\",\n", + " \"Let's look at the `databricks-model-serving` skill -- its current size, test cases, and 
baseline score.\"\n", + " ]\n", + " },\n", + " {\n", + " \"cell_type\": \"code\",\n", + " \"execution_count\": null,\n", + " \"metadata\": {},\n", + " \"outputs\": [],\n", + " \"source\": \"SKILL_NAME = \\\"databricks-model-serving\\\"\\n\\nfrom skill_test.optimize.utils import find_skill_md, count_tokens\\nfrom skill_test.optimize.splitter import create_gepa_datasets\\n\\n# Load skill\\nskill_path = find_skill_md(SKILL_NAME)\\noriginal_content = skill_path.read_text()\\noriginal_tokens = count_tokens(original_content)\\n\\n# Load test cases\\ntrain, val = create_gepa_datasets(SKILL_NAME)\\n\\nprint(f\\\"Skill: {SKILL_NAME}\\\")\\nprint(f\\\"Path: {skill_path}\\\")\\nprint(f\\\"Lines: {len(original_content.splitlines())}\\\")\\nprint(f\\\"Tokens: {original_tokens:,}\\\")\\nprint(f\\\"Train cases: {len(train)}\\\")\\nprint(f\\\"Val cases: {len(val) if val else 'None'}\\\")\"\n", + " },\n", + " {\n", + " \"cell_type\": \"code\",\n", + " \"execution_count\": null,\n", + " \"metadata\": {},\n", + " \"outputs\": [],\n", + " \"source\": [\n", + " \"# Show first few test cases\\n\",\n", + " \"for t in train[:3]:\\n\",\n", + " \" print(f\\\"\\\\n--- {t['id']} ---\\\")\\n\",\n", + " \" print(f\\\"Prompt: {t['input'][:100]}...\\\")\\n\",\n", + " \" if t.get('answer'):\\n\",\n", + " \" print(f\\\"Answer: {t['answer'][:100]}...\\\")\"\n", + " ]\n", + " },\n", + " {\n", + " \"cell_type\": \"markdown\",\n", + " \"metadata\": {},\n", + " \"source\": [\n", + " \"## Step 2: Evaluate Current Quality (Baseline)\\n\",\n", + " \"\\n\",\n", + " \"Before optimizing, measure the current skill quality using the scorer pipeline.\"\n", + " ]\n", + " },\n", + " {\n", + " \"cell_type\": \"code\",\n", + " \"execution_count\": null,\n", + " \"metadata\": {},\n", + " \"outputs\": [],\n", + " \"source\": \"from skill_test.optimize.skillbench_evaluator import create_skillbench_evaluator\\nfrom skill_test.optimize.utils import SKILL_KEY\\nfrom skill_test.optimize.splitter import 
to_gepa_instances\\n\\n# Note: SkillBench evaluator requires a gen_model for WITH/WITHOUT comparison\\n# If you don't have API access, this cell will raise an error\\nGEN_MODEL = os.environ.get(\\\"GEPA_GEN_LM\\\", \\\"databricks/databricks-claude-sonnet-4-6\\\")\\n\\nevaluator = create_skillbench_evaluator(SKILL_NAME, gen_model=GEN_MODEL)\\nseed_candidate = {SKILL_KEY: original_content}\\n\\n# Evaluate on all train tasks\\ngepa_instances = to_gepa_instances(train)\\n\\nprint(f\\\"{'Task ID':<35} {'Score':>8}\\\")\\nprint(\\\"-\\\" * 45)\\nfor i, inst in enumerate(gepa_instances):\\n score, side_info = evaluator(seed_candidate, inst)\\n task_id = train[i]['id']\\n status = 'PASS' if score >= 0.5 else 'FAIL'\\n print(f\\\"{task_id:<35} {score:>7.3f} {status}\\\")\\n\\n# Quick baseline\\nscores = [evaluator(seed_candidate, inst)[0] for inst in gepa_instances]\\nbaseline_score = sum(scores) / len(scores)\\nprint(f\\\"\\\\nBaseline Score: {baseline_score:.3f}\\\")\\nprint(f\\\"Token Count: {original_tokens:,}\\\")\"\n", + " },\n", + " {\n", + " \"cell_type\": \"markdown\",\n", + " \"metadata\": {},\n", + " \"source\": [\n", + " \"## Step 3: Run GEPA Optimization\\n\",\n", + " \"\\n\",\n", + " \"Now run the optimization. 
GEPA will:\\n\",\n", + " \"- Use the current SKILL.md as the seed candidate\\n\",\n", + " \"- Run scorers against each test case\\n\",\n", + " \"- Reflect on failures to propose mutations\\n\",\n", + " \"- Select the best candidate via Pareto frontier\\n\",\n", + " \"- Penalize token bloat (80% quality, 20% efficiency weighting)\"\n", + " ]\n", + " },\n", + " {\n", + " \"cell_type\": \"code\",\n", + " \"execution_count\": null,\n", + " \"metadata\": {},\n", + " \"outputs\": [],\n", + " \"source\": [\n", + " \"from skill_test.optimize.runner import optimize_skill\\n\",\n", + " \"\\n\",\n", + " \"result = optimize_skill(\\n\",\n", + " \" skill_name=SKILL_NAME,\\n\",\n", + " \" mode=\\\"static\\\",\\n\",\n", + " \" preset=\\\"quick\\\", # 15 iterations -- increase to \\\"standard\\\" (50) or \\\"thorough\\\" (150) for better results\\n\",\n", + " \")\\n\",\n", + " \"\\n\",\n", + " \"print(f\\\"Optimization complete!\\\")\\n\",\n", + " \"print(f\\\"GEPA metric calls: {result.gepa_result.total_metric_calls}\\\")\\n\",\n", + " \"print(f\\\"Candidates explored: {result.gepa_result.num_candidates}\\\")\"\n", + " ]\n", + " },\n", + " {\n", + " \"cell_type\": \"markdown\",\n", + " \"metadata\": {},\n", + " \"source\": [\n", + " \"## Step 4: Results Comparison\\n\",\n", + " \"\\n\",\n", + " \"Compare the original vs. 
optimized skill across quality and token efficiency.\"\n", + " ]\n", + " },\n", + " {\n", + " \"cell_type\": \"code\",\n", + " \"execution_count\": null,\n", + " \"metadata\": {},\n", + " \"outputs\": [],\n", + " \"source\": [\n", + " \"print(\\\"=\\\" * 60)\\n\",\n", + " \"print(f\\\" OPTIMIZATION RESULTS: {SKILL_NAME}\\\")\\n\",\n", + " \"print(\\\"=\\\" * 60)\\n\",\n", + " \"print()\\n\",\n", + " \"\\n\",\n", + " \"# Quality comparison\\n\",\n", + " \"quality_delta = result.improvement\\n\",\n", + " \"quality_pct = (quality_delta / result.original_score * 100) if result.original_score > 0 else 0\\n\",\n", + " \"print(f\\\" Quality Score\\\")\\n\",\n", + " \"print(f\\\" Before: {result.original_score:.3f}\\\")\\n\",\n", + " \"print(f\\\" After: {result.optimized_score:.3f}\\\")\\n\",\n", + " \"print(f\\\" Delta: {quality_delta:+.3f} ({quality_pct:+.1f}%)\\\")\\n\",\n", + " \"print()\\n\",\n", + " \"\\n\",\n", + " \"# Token comparison \\n\",\n", + " \"token_delta = result.original_token_count - result.optimized_token_count\\n\",\n", + " \"print(f\\\" Token Count\\\")\\n\",\n", + " \"print(f\\\" Before: {result.original_token_count:,}\\\")\\n\",\n", + " \"print(f\\\" After: {result.optimized_token_count:,}\\\")\\n\",\n", + " \"print(f\\\" Saved: {token_delta:,} tokens ({result.token_reduction_pct:.1f}% reduction)\\\")\\n\",\n", + " \"print()\\n\",\n", + " \"\\n\",\n", + " \"# Line count comparison\\n\",\n", + " \"orig_lines = len(result.original_content.splitlines())\\n\",\n", + " \"opt_lines = len(result.optimized_content.splitlines())\\n\",\n", + " \"print(f\\\" Lines\\\")\\n\",\n", + " \"print(f\\\" Before: {orig_lines}\\\")\\n\",\n", + " \"print(f\\\" After: {opt_lines}\\\")\\n\",\n", + " \"print(f\\\" Saved: {orig_lines - opt_lines} lines\\\")\\n\",\n", + " \"print()\\n\",\n", + " \"\\n\",\n", + " \"# Validation scores\\n\",\n", + " \"if result.val_scores:\\n\",\n", + " \" avg_val = sum(result.val_scores.values()) / len(result.val_scores)\\n\",\n", + " \" 
print(f\\\" Validation (held-out test cases)\\\")\\n\",\n", + " \" for tid, score in result.val_scores.items():\\n\",\n", + " \" print(f\\\" {tid}: {score:.3f}\\\")\\n\",\n", + " \" print(f\\\" Average: {avg_val:.3f}\\\")\\n\",\n", + " \"\\n\",\n", + " \"print()\\n\",\n", + " \"print(\\\"=\\\" * 60)\"\n", + " ]\n", + " },\n", + " {\n", + " \"cell_type\": \"code\",\n", + " \"execution_count\": null,\n", + " \"metadata\": {},\n", + " \"outputs\": [],\n", + " \"source\": [\n", + " \"# Visual comparison bar chart\\n\",\n", + " \"try:\\n\",\n", + " \" import matplotlib.pyplot as plt\\n\",\n", + " \" import matplotlib\\n\",\n", + " \" matplotlib.rcParams['font.family'] = 'monospace'\\n\",\n", + " \"\\n\",\n", + " \" fig, axes = plt.subplots(1, 2, figsize=(12, 5))\\n\",\n", + " \"\\n\",\n", + " \" # Quality scores\\n\",\n", + " \" ax = axes[0]\\n\",\n", + " \" bars = ax.bar(\\n\",\n", + " \" ['Before', 'After'],\\n\",\n", + " \" [result.original_score, result.optimized_score],\\n\",\n", + " \" color=['#d4534b', '#4a9c5d'],\\n\",\n", + " \" width=0.5\\n\",\n", + " \" )\\n\",\n", + " \" ax.set_ylim(0, 1.1)\\n\",\n", + " \" ax.set_ylabel('Quality Score')\\n\",\n", + " \" ax.set_title(f'Quality: {result.original_score:.3f} → {result.optimized_score:.3f}')\\n\",\n", + " \" for bar, val in zip(bars, [result.original_score, result.optimized_score]):\\n\",\n", + " \" ax.text(bar.get_x() + bar.get_width()/2., bar.get_height() + 0.02,\\n\",\n", + " \" f'{val:.3f}', ha='center', fontweight='bold')\\n\",\n", + " \"\\n\",\n", + " \" # Token counts\\n\",\n", + " \" ax = axes[1]\\n\",\n", + " \" bars = ax.bar(\\n\",\n", + " \" ['Before', 'After'],\\n\",\n", + " \" [result.original_token_count, result.optimized_token_count],\\n\",\n", + " \" color=['#d4534b', '#4a9c5d'],\\n\",\n", + " \" width=0.5\\n\",\n", + " \" )\\n\",\n", + " \" ax.set_ylabel('Token Count')\\n\",\n", + " \" ax.set_title(f'Tokens: {result.original_token_count:,} → {result.optimized_token_count:,} 
({result.token_reduction_pct:.0f}% reduction)')\\n\",\n", + " \" for bar, val in zip(bars, [result.original_token_count, result.optimized_token_count]):\\n\",\n", + " \" ax.text(bar.get_x() + bar.get_width()/2., bar.get_height() + 50,\\n\",\n", + " \" f'{val:,}', ha='center', fontweight='bold')\\n\",\n", + " \"\\n\",\n", + " \" fig.suptitle(f'GEPA Optimization: {SKILL_NAME}', fontsize=14, fontweight='bold')\\n\",\n", + " \" plt.tight_layout()\\n\",\n", + " \" plt.show()\\n\",\n", + " \"except ImportError:\\n\",\n", + " \" print(\\\"(matplotlib not installed -- skipping chart)\\\")\"\n", + " ]\n", + " },\n", + " {\n", + " \"cell_type\": \"markdown\",\n", + " \"metadata\": {},\n", + " \"source\": [\n", + " \"## Step 5: Review the Diff\\n\",\n", + " \"\\n\",\n", + " \"Inspect what GEPA changed in the SKILL.md.\"\n", + " ]\n", + " },\n", + " {\n", + " \"cell_type\": \"code\",\n", + " \"execution_count\": null,\n", + " \"metadata\": {},\n", + " \"outputs\": [],\n", + " \"source\": [\n", + " \"from skill_test.optimize.review import review_optimization\\n\",\n", + " \"\\n\",\n", + " \"review_optimization(result)\"\n", + " ]\n", + " },\n", + " {\n", + " \"cell_type\": \"markdown\",\n", + " \"metadata\": {},\n", + " \"source\": [\n", + " \"## Step 6: Apply (Optional)\\n\",\n", + " \"\\n\",\n", + " \"If the results look good, apply the optimized SKILL.md. Uncomment the cell below to write it.\"\n", + " ]\n", + " },\n", + " {\n", + " \"cell_type\": \"code\",\n", + " \"execution_count\": null,\n", + " \"metadata\": {},\n", + " \"outputs\": [],\n", + " \"source\": [\n", + " \"# Uncomment to apply:\\n\",\n", + " \"# from skill_test.optimize.review import apply_optimization\\n\",\n", + " \"# apply_optimization(result)\"\n", + " ]\n", + " },\n", + " {\n", + " \"cell_type\": \"markdown\",\n", + " \"metadata\": {},\n", + " \"source\": \"## Multi-Component Optimization: Skills + Tools\\n\\nGEPA supports optimizing multiple text components simultaneously. 
You can optimize SKILL.md files **alongside** MCP tool descriptions in a single run.\\n\\nGEPA's `RoundRobinReflectionComponentSelector` cycles through components one at a time, so each gets dedicated reflection and mutation.\"\n", + " },\n", + " {\n", + " \"cell_type\": \"code\",\n", + " \"execution_count\": null,\n", + " \"metadata\": {},\n", + " \"outputs\": [],\n", + " \"source\": \"# Inspect available MCP tools\\nfrom skill_test.optimize.tools import get_tool_stats, extract_tool_descriptions, tools_to_gepa_components\\nfrom skill_test.optimize.utils import count_tokens\\n\\nstats = get_tool_stats()\\nprint(f\\\"MCP Tool Modules: {stats['modules']}\\\")\\nprint(f\\\"Total Tools: {stats['total_tools']}\\\")\\nprint(f\\\"Total Chars: {stats['total_description_chars']:,}\\\")\\nprint()\\nfor mod, info in stats[\\\"per_module\\\"].items():\\n print(f\\\" {mod:<20} {info['tools']:>2} tools {info['chars']:>6,} chars\\\")\\n\\n# Show what GEPA components look like for selected modules\\ntool_map = extract_tool_descriptions(modules=[\\\"serving\\\", \\\"sql\\\"])\\ncomponents = tools_to_gepa_components(tool_map, per_module=True)\\nprint(f\\\"\\\\nGEPA components for serving + sql: {list(components.keys())}\\\")\\nfor name, text in components.items():\\n print(f\\\" {name}: {count_tokens(text):,} tokens\\\")\"\n", + " },\n", + " {\n", + " \"cell_type\": \"code\",\n", + " \"source\": \"## Changing the Reflection Model\\n\\nBy default, GEPA uses `databricks/databricks-gpt-5-2` via Databricks Model Serving.\\nOverride per-call or via environment variable:\\n\\n```python\\n# Per-call\\nresult = optimize_skill(\\\"my-skill\\\", reflection_lm=\\\"openai/gpt-4o\\\")\\n\\n# Environment variable (persistent)\\nos.environ[\\\"GEPA_REFLECTION_LM\\\"] = \\\"databricks/databricks-gpt-5-2\\\"\\n```\\n\\nSee README.md for full model configuration options.\",\n", + " \"metadata\": {},\n", + " \"execution_count\": null,\n", + " \"outputs\": []\n", + " },\n", + " {\n", + " \"cell_type\": 
\"markdown\",\n", + " \"metadata\": {},\n", + " \"source\": [\n", + " \"## Summary\\n\",\n", + " \"\\n\",\n", + " \"The GEPA optimization pipeline:\\n\",\n", + " \"\\n\",\n", + " \"| Metric | Before | After | Change |\\n\",\n", + " \"|--------|--------|-------|--------|\\n\",\n", + " \"| Quality Score | `result.original_score` | `result.optimized_score` | `result.improvement` |\\n\",\n", + " \"| Token Count | `result.original_token_count` | `result.optimized_token_count` | `result.token_reduction_pct`% |\\n\",\n", + " \"\\n\",\n", + " \"Key points:\\n\",\n", + " \"- **Quality gate**: Existing scorers (syntax, patterns, APIs, facts) are reused as-is\\n\",\n", + " \"- **Token efficiency**: 80/20 quality/efficiency weighting penalizes bloated skills\\n\",\n", + " \"- **Validation split**: Held-out test cases detect overfitting\\n\",\n", + " \"- **Reflection LM**: Configurable via `--reflection-lm` flag or `GEPA_REFLECTION_LM` env var\\n\",\n", + " \"- **Default model**: `databricks/databricks-gpt-5-2` via Databricks Model Serving\"\n", + " ]\n", + " }\n", + " ],\n", + " \"metadata\": {\n", + " \"kernelspec\": {\n", + " \"display_name\": \"Python 3\",\n", + " \"language\": \"python\",\n", + " \"name\": \"python3\"\n", + " },\n", + " \"language_info\": {\n", + " \"name\": \"python\",\n", + " \"version\": \"3.12.0\"\n", + " }\n", + " },\n", + " \"nbformat\": 4,\n", + " \"nbformat_minor\": 4\n", + "}\n" ] }, { @@ -42,13 +357,6 @@ "print(f\"Repo root: {repo_root}\")" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": "import os\n\n# Configure the reflection model -- pick ONE:\n\n# Option A: Databricks Model Serving (default, recommended)\n# IMPORTANT: DATABRICKS_API_BASE must end with /serving-endpoints\n# os.environ[\"DATABRICKS_API_KEY\"] = \"dapi...\" \n# os.environ[\"DATABRICKS_API_BASE\"] = \"https://.cloud.databricks.com/serving-endpoints\"\n# os.environ[\"GEPA_REFLECTION_LM\"] = 
\"databricks/databricks-gpt-5-2\"\n\n# Option B: OpenAI\n# os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"\n# os.environ[\"GEPA_REFLECTION_LM\"] = \"openai/gpt-4o\"\n\nprint(f\"Reflection LM: {os.environ.get('GEPA_REFLECTION_LM', 'databricks/databricks-gpt-5-2 (default)')}\")" - }, { "cell_type": "markdown", "metadata": {}, @@ -58,33 +366,6 @@ "Let's look at the `databricks-model-serving` skill -- its current size, test cases, and baseline score." ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "SKILL_NAME = \"databricks-model-serving\"\n", - "\n", - "from skill_test.optimize.evaluator import _find_skill_md, count_tokens\n", - "from skill_test.optimize.splitter import create_gepa_datasets\n", - "\n", - "# Load skill\n", - "skill_path = _find_skill_md(SKILL_NAME)\n", - "original_content = skill_path.read_text()\n", - "original_tokens = count_tokens(original_content)\n", - "\n", - "# Load test cases\n", - "train, val = create_gepa_datasets(SKILL_NAME)\n", - "\n", - "print(f\"Skill: {SKILL_NAME}\")\n", - "print(f\"Path: {skill_path}\")\n", - "print(f\"Lines: {len(original_content.splitlines())}\")\n", - "print(f\"Tokens: {original_tokens:,}\")\n", - "print(f\"Train cases: {len(train)}\")\n", - "print(f\"Val cases: {len(val) if val else 'None'}\")" - ] - }, { "cell_type": "code", "execution_count": null, @@ -99,15 +380,6 @@ " print(f\"Answer: {t['answer'][:100]}...\")" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 2: Evaluate Current Quality (Baseline)\n", - "\n", - "Before optimizing, measure the current skill quality using the scorer pipeline." 
- ] - }, { "cell_type": "code", "execution_count": null, @@ -115,20 +387,6 @@ "outputs": [], "source": "from skill_test.optimize.evaluator import create_skill_evaluator, SKILL_KEY\nfrom skill_test.optimize.splitter import to_gepa_instances\n\nevaluator = create_skill_evaluator(SKILL_NAME)\nseed_candidate = {SKILL_KEY: original_content}\n\n# Evaluate on all train tasks\ngepa_instances = to_gepa_instances(train)\n\nprint(f\"{'Task ID':<35} {'Score':>8}\")\nprint(\"-\" * 45)\nfor i, inst in enumerate(gepa_instances):\n score, side_info = evaluator(seed_candidate, inst)\n task_id = train[i]['id']\n status = 'PASS' if score >= 0.5 else 'FAIL'\n print(f\"{task_id:<35} {score:>7.3f} {status}\")\n\n# Quick baseline\nscores = [evaluator(seed_candidate, inst)[0] for inst in gepa_instances]\nbaseline_score = sum(scores) / len(scores)\nprint(f\"\\nBaseline Score: {baseline_score:.3f}\")\nprint(f\"Token Count: {original_tokens:,}\")" }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 3: Run GEPA Optimization\n", - "\n", - "Now run the optimization. GEPA will:\n", - "- Use the current SKILL.md as the seed candidate\n", - "- Run scorers against each test case\n", - "- Reflect on failures to propose mutations\n", - "- Select the best candidate via Pareto frontier\n", - "- Penalize token bloat (80% quality, 20% efficiency weighting)" - ] - }, { "cell_type": "code", "execution_count": null, @@ -148,15 +406,6 @@ "print(f\"Candidates explored: {result.gepa_result.num_candidates}\")" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 4: Results Comparison\n", - "\n", - "Compare the original vs. optimized skill across quality and token efficiency." 
- ] - }, { "cell_type": "code", "execution_count": null, @@ -206,56 +455,6 @@ "print(\"=\" * 60)" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Visual comparison bar chart\n", - "try:\n", - " import matplotlib.pyplot as plt\n", - " import matplotlib\n", - " matplotlib.rcParams['font.family'] = 'monospace'\n", - "\n", - " fig, axes = plt.subplots(1, 2, figsize=(12, 5))\n", - "\n", - " # Quality scores\n", - " ax = axes[0]\n", - " bars = ax.bar(\n", - " ['Before', 'After'],\n", - " [result.original_score, result.optimized_score],\n", - " color=['#d4534b', '#4a9c5d'],\n", - " width=0.5\n", - " )\n", - " ax.set_ylim(0, 1.1)\n", - " ax.set_ylabel('Quality Score')\n", - " ax.set_title(f'Quality: {result.original_score:.3f} → {result.optimized_score:.3f}')\n", - " for bar, val in zip(bars, [result.original_score, result.optimized_score]):\n", - " ax.text(bar.get_x() + bar.get_width()/2., bar.get_height() + 0.02,\n", - " f'{val:.3f}', ha='center', fontweight='bold')\n", - "\n", - " # Token counts\n", - " ax = axes[1]\n", - " bars = ax.bar(\n", - " ['Before', 'After'],\n", - " [result.original_token_count, result.optimized_token_count],\n", - " color=['#d4534b', '#4a9c5d'],\n", - " width=0.5\n", - " )\n", - " ax.set_ylabel('Token Count')\n", - " ax.set_title(f'Tokens: {result.original_token_count:,} → {result.optimized_token_count:,} ({result.token_reduction_pct:.0f}% reduction)')\n", - " for bar, val in zip(bars, [result.original_token_count, result.optimized_token_count]):\n", - " ax.text(bar.get_x() + bar.get_width()/2., bar.get_height() + 50,\n", - " f'{val:,}', ha='center', fontweight='bold')\n", - "\n", - " fig.suptitle(f'GEPA Optimization: {SKILL_NAME}', fontsize=14, fontweight='bold')\n", - " plt.tight_layout()\n", - " plt.show()\n", - "except ImportError:\n", - " print(\"(matplotlib not installed -- skipping chart)\")" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -265,17 +464,6 @@ "Inspect 
what GEPA changed in the SKILL.md." ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from skill_test.optimize.review import review_optimization\n", - "\n", - "review_optimization(result)" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -285,56 +473,17 @@ "If the results look good, apply the optimized SKILL.md. Uncomment the cell below to write it." ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Uncomment to apply:\n", - "# from skill_test.optimize.review import apply_optimization\n", - "# apply_optimization(result)" - ] - }, { "cell_type": "markdown", "metadata": {}, "source": "## Multi-Component Optimization: Skills + Tools\n\nGEPA supports optimizing multiple text components simultaneously. You can optimize SKILL.md files **alongside** MCP tool descriptions in a single run.\n\nGEPA's `RoundRobinReflectionComponentSelector` cycles through components one at a time, so each gets dedicated reflection and mutation." 
}, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": "# Inspect available MCP tools\nfrom skill_test.optimize.tools import get_tool_stats, extract_tool_descriptions, tools_to_gepa_components\n\nstats = get_tool_stats()\nprint(f\"MCP Tool Modules: {stats['modules']}\")\nprint(f\"Total Tools: {stats['total_tools']}\")\nprint(f\"Total Chars: {stats['total_description_chars']:,}\")\nprint()\nfor mod, info in stats[\"per_module\"].items():\n print(f\" {mod:<20} {info['tools']:>2} tools {info['chars']:>6,} chars\")\n\n# Show what GEPA components look like for selected modules\ntool_map = extract_tool_descriptions(modules=[\"serving\", \"sql\"])\ncomponents = tools_to_gepa_components(tool_map, per_module=True)\nprint(f\"\\nGEPA components for serving + sql: {list(components.keys())}\")\nfor name, text in components.items():\n from skill_test.optimize.evaluator import count_tokens\n print(f\" {name}: {count_tokens(text):,} tokens\")" - }, { "cell_type": "code", "source": "## Changing the Reflection Model\n\nBy default, GEPA uses `databricks/databricks-gpt-5-2` via Databricks Model Serving.\nOverride per-call or via environment variable:\n\n```python\n# Per-call\nresult = optimize_skill(\"my-skill\", reflection_lm=\"openai/gpt-4o\")\n\n# Environment variable (persistent)\nos.environ[\"GEPA_REFLECTION_LM\"] = \"databricks/databricks-gpt-5-2\"\n```\n\nSee README.md for full model configuration options.", "metadata": {}, "execution_count": null, "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Summary\n", - "\n", - "The GEPA optimization pipeline:\n", - "\n", - "| Metric | Before | After | Change |\n", - "|--------|--------|-------|--------|\n", - "| Quality Score | `result.original_score` | `result.optimized_score` | `result.improvement` |\n", - "| Token Count | `result.original_token_count` | `result.optimized_token_count` | `result.token_reduction_pct`% |\n", - "\n", - "Key points:\n", 
- "- **Quality gate**: Existing scorers (syntax, patterns, APIs, facts) are reused as-is\n", - "- **Token efficiency**: 80/20 quality/efficiency weighting penalizes bloated skills\n", - "- **Validation split**: Held-out test cases detect overfitting\n", - "- **Reflection LM**: Configurable via `--reflection-lm` flag or `GEPA_REFLECTION_LM` env var\n", - "- **Default model**: `databricks/databricks-gpt-5-2` via Databricks Model Serving" - ] } ], "metadata": { @@ -350,4 +499,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/.test/pyproject.toml b/.test/pyproject.toml index 5e114eb4..88dd19ab 100644 --- a/.test/pyproject.toml +++ b/.test/pyproject.toml @@ -18,6 +18,8 @@ dependencies = [ databricks = ["databricks-sdk>=0.20.0"] dev = ["pytest>=8.0", "pytest-asyncio>=0.23"] optimize = ["gepa>=0.1.0", "tiktoken>=0.7.0"] +# judges group: install separately when mlflow-deepeval/mlflow-ragas are published +# judges = ["mlflow-deepeval>=0.1.0", "mlflow-ragas>=0.1.0"] all = ["skill-test[databricks,dev,optimize]"] [project.scripts] diff --git a/.test/scripts/generate_examples.py b/.test/scripts/generate_examples.py index 890b5bc8..6b0f764f 100644 --- a/.test/scripts/generate_examples.py +++ b/.test/scripts/generate_examples.py @@ -42,7 +42,7 @@ def _find_skill_md(skill_name: str) -> str: """Load SKILL.md content for the given skill.""" - from skill_test.optimize.evaluator import _find_skill_md as find_md + from skill_test.optimize.utils import find_skill_md as find_md path = find_md(skill_name) if path is None: raise FileNotFoundError(f"Could not find SKILL.md for '{skill_name}'") diff --git a/.test/scripts/optimize.py b/.test/scripts/optimize.py index bbaee3ee..8feedd7b 100644 --- a/.test/scripts/optimize.py +++ b/.test/scripts/optimize.py @@ -57,29 +57,22 @@ def main(): default="standard", help="GEPA optimization preset (default: standard)", ) - parser.add_argument( - "--mode", "-m", - choices=["static", "generative"], - 
default="static", - help="Evaluation mode (default: static)", - ) - parser.add_argument( - "--task-lm", - default=None, - help="(Deprecated, use --gen-model) LLM model for generative mode", - ) parser.add_argument( "--gen-model", default=None, help="LLM model for generative evaluation (default: GEPA_GEN_LM env or " - "databricks/databricks-claude-sonnet-4-6). The evaluator sends the " - "candidate SKILL.md to this model and scores the generated response.", + "databricks/databricks-claude-sonnet-4-6).", ) parser.add_argument( "--reflection-lm", default=None, help="Override GEPA reflection model (default: GEPA_REFLECTION_LM env or databricks/databricks-claude-opus-4-6)", ) + parser.add_argument( + "--judge-model", + default=None, + help="Override judge model for quality/effectiveness evaluation (future use)", + ) parser.add_argument( "--dry-run", action="store_true", @@ -116,52 +109,38 @@ def main(): "--max-passes", type=int, default=5, - help="Maximum optimization passes per component (default: 5). " - "Each pass re-seeds from the previous best. Stops early if no improvement.", + help="Maximum optimization passes per component (default: 5).", ) parser.add_argument( "--max-metric-calls", type=int, default=None, - help="Override max metric calls per pass (default: auto-scaled by preset × components, " - "capped at 300 for non-Opus models). Example: --max-metric-calls 100", - ) - parser.add_argument( - "--evaluator", - choices=["legacy", "skillbench"], - default="skillbench", - help="Evaluator type: 'skillbench' (measures skill effectiveness via WITH vs " - "WITHOUT comparison, default) or 'legacy' (weighted scoring with keyword " - "matching and token efficiency)", + help="Override max metric calls per pass (default: auto-scaled by preset).", ) parser.add_argument( "--token-budget", type=int, default=None, - help="Token budget ceiling. Candidates exceeding this are penalized. " - "Recommended: 50000. 
Default: GEPA_TOKEN_BUDGET env or disabled.", + help="Token budget ceiling. Candidates exceeding this are penalized.", ) parser.add_argument( - "--use-judges", + "--align", action="store_true", - help="Enable MLflow LLM judges (Correctness + Guidelines) for richer NL " - "feedback to GEPA's reflection LM. Adds ~10%% judge_quality weight.", + help="Use MemAlign to align judges with human feedback (requires alignment traces)", ) parser.add_argument( "--generate-from", type=str, default=None, metavar="REQUIREMENTS_FILE", - help="Generate test cases from a requirements file before optimizing. " - "Each line in the file is a requirement.", + help="Generate test cases from a requirements file before optimizing.", ) parser.add_argument( "--requirement", action="append", default=None, dest="requirements", - help="Inline requirement for test case generation (repeatable). " - "Example: --requirement 'Must explain MEASURE() wrapping'", + help="Inline requirement for test case generation (repeatable).", ) args = parser.parse_args() @@ -198,7 +177,7 @@ def main(): skill_name=args.skill_name, requirements=requirements, gen_model=gen_model, - trust=True, # append directly since we're about to optimize + trust=True, ) print() @@ -240,9 +219,7 @@ def main(): try: result = optimize_skill( skill_name=name, - mode=args.mode, preset=args.preset, - task_lm=args.task_lm, gen_model=args.gen_model, reflection_lm=args.reflection_lm, include_tools=args.include_tools, @@ -251,9 +228,9 @@ def main(): dry_run=args.dry_run, max_passes=args.max_passes, max_metric_calls=args.max_metric_calls, - evaluator_type=args.evaluator, token_budget=args.token_budget, - use_judges=args.use_judges, + judge_model=args.judge_model, + align=args.align, ) review_optimization(result) if args.apply and not args.dry_run: @@ -278,9 +255,7 @@ def main(): try: result = optimize_skill( skill_name=args.skill_name, - mode=args.mode, preset=args.preset, - task_lm=args.task_lm, gen_model=args.gen_model, 
reflection_lm=args.reflection_lm, include_tools=args.include_tools, @@ -289,9 +264,9 @@ def main(): dry_run=args.dry_run, max_passes=args.max_passes, max_metric_calls=args.max_metric_calls, - evaluator_type=args.evaluator, token_budget=args.token_budget, - use_judges=args.use_judges, + judge_model=args.judge_model, + align=args.align, ) review_optimization(result) if args.apply and not args.dry_run: diff --git a/.test/src/skill_test/optimize/__init__.py b/.test/src/skill_test/optimize/__init__.py index 87cab788..f55bad01 100644 --- a/.test/src/skill_test/optimize/__init__.py +++ b/.test/src/skill_test/optimize/__init__.py @@ -1,21 +1,21 @@ """GEPA-powered skill optimization using optimize_anything API. Public API: - optimize_skill() - End-to-end optimize a SKILL.md (and optionally tools) - create_skill_evaluator() - Create a GEPA evaluator for a skill - OptimizationResult - Dataclass with optimization results - PRESETS - GEPA config presets (quick, standard, thorough) + optimize_skill() - End-to-end optimize a SKILL.md (and optionally tools) + create_skillbench_evaluator() - Create a judge-based evaluator for a skill + OptimizationResult - Dataclass with optimization results + PRESETS - GEPA config presets (quick, standard, thorough) """ from .runner import optimize_skill, OptimizationResult -from .evaluator import create_skill_evaluator +from .skillbench_evaluator import create_skillbench_evaluator from .config import PRESETS from .review import review_optimization, apply_optimization __all__ = [ "optimize_skill", "OptimizationResult", - "create_skill_evaluator", + "create_skillbench_evaluator", "PRESETS", "review_optimization", "apply_optimization", diff --git a/.test/src/skill_test/optimize/alignment.py b/.test/src/skill_test/optimize/alignment.py new file mode 100644 index 00000000..190d3456 --- /dev/null +++ b/.test/src/skill_test/optimize/alignment.py @@ -0,0 +1,99 @@ +"""MemAlign integration for aligning judges with human feedback. 
+ +MemAlign aligns judges with human feedback via dual-memory: + - Semantic memory: generalizable evaluation principles + - Episodic memory: specific edge cases and corrections + +Alignment traces are stored per-skill in: + .test/skills//alignment_traces.yaml + +Populated via ``scripts/review.py --align`` where a human corrects +judge verdicts. MemAlign learns principles from corrections, +improving judge accuracy over time. + +Only 2-10 examples are needed for visible improvement. +""" + +from __future__ import annotations + +import logging +from pathlib import Path +from typing import Any + +import yaml + +logger = logging.getLogger(__name__) + + +def load_alignment_traces(skill_name: str) -> list[dict[str, Any]]: + """Load human-corrected alignment traces for a skill. + + Traces are stored in .test/skills//alignment_traces.yaml + with format: + - inputs: {prompt: "..."} + outputs: {response: "..."} + expected_value: true/false or 0.0-1.0 + rationale: "Human explanation of correct verdict" + + Returns: + List of trace dicts, or empty list if no traces found. + """ + traces_path = Path(".test/skills") / skill_name / "alignment_traces.yaml" + if not traces_path.exists(): + return [] + + try: + with open(traces_path) as f: + data = yaml.safe_load(f) + return data if isinstance(data, list) else [] + except Exception as e: + logger.warning("Failed to load alignment traces for %s: %s", skill_name, e) + return [] + + +def align_judge( + skill_name: str, + judge: Any, + reflection_lm: str = "openai:/gpt-4o-mini", +) -> Any: + """Align a judge with human feedback using MemAlign. + + If fewer than 3 alignment traces exist, returns the judge unchanged. + Otherwise, uses MemAlignOptimizer to learn evaluation principles + from human corrections and returns an aligned judge. + + Args: + skill_name: Name of the skill to load traces for. + judge: An MLflow judge (from make_judge or similar). + reflection_lm: LLM for MemAlign's reflection step. 
+ + Returns: + Aligned judge if enough traces exist, otherwise original judge. + """ + traces = load_alignment_traces(skill_name) + if len(traces) < 3: + if traces: + logger.info( + "Only %d alignment traces for %s (need >=3). Using base judge.", + len(traces), skill_name, + ) + return judge + + try: + from mlflow.genai.judges.optimizers import MemAlignOptimizer + + optimizer = MemAlignOptimizer(reflection_lm=reflection_lm) + aligned = judge.align(traces=traces, optimizer=optimizer) + logger.info( + "Aligned judge with %d traces for %s", + len(traces), skill_name, + ) + return aligned + except ImportError: + logger.warning( + "MemAlignOptimizer not available. Install mlflow-deepeval for alignment support." + ) + return judge + except Exception as e: + logger.warning("MemAlign alignment failed for %s: %s", skill_name, e) + return judge diff --git a/.test/src/skill_test/optimize/asi.py b/.test/src/skill_test/optimize/asi.py index 9c0e6cff..fe79a6cf 100644 --- a/.test/src/skill_test/optimize/asi.py +++ b/.test/src/skill_test/optimize/asi.py @@ -1,30 +1,18 @@ """ASI diagnostics: convert MLflow Feedback to optimize_anything SideInfo. -Builds an Actionable Side Information dict from scorer feedback so GEPA's -reflection LM gets structured context about what went wrong with each scorer. -Failure details are surfaced via the ``_failures`` key in the returned dict. +Thin adapter that passes judge rationale through to GEPA's reflection LM +WITHOUT truncation. The critical fix: GEPA's reflection LM gets full +diagnostic text from judges, not truncated snippets. -Also provides ``skillbench_to_asi()`` for the SkillBench-style evaluator, -which produces GEPA-optimized side info with standard diagnostic keys -(Error, Expected, Actual) and ``skill_md_specific_info`` for per-component -routing. +Also provides ``feedback_to_score()`` for backward compatibility with tests. 
""" from __future__ import annotations -from typing import Any, TYPE_CHECKING +from typing import Any from mlflow.entities import Feedback -if TYPE_CHECKING: - from .assertions import AssertionResult - -from .assertions import ( - summarize_failures as _summarize_failures, - _classify_assertion, - _extract_content, -) - def feedback_to_score(feedback: Feedback) -> float | None: """Convert a single MLflow Feedback to a numeric score. @@ -53,19 +41,10 @@ def feedback_to_asi(feedbacks: list[Feedback]) -> tuple[float, dict[str, Any]]: """Convert MLflow Feedback objects to optimize_anything (score, SideInfo). Computes the mean score across non-skipped feedbacks and builds a - SideInfo dict. Failure diagnostics are collected in the ``_failures`` - key so GEPA's reflection LM sees actionable context directly in the - side_info dict (no ``oa.log()`` needed). - - Args: - feedbacks: List of MLflow Feedback objects from running scorers - - Returns: - Tuple of (composite_score, side_info_dict) + SideInfo dict with full rationale (no truncation). 
""" scores = [] side_info: dict[str, Any] = {} - failures: list[str] = [] for fb in feedbacks: score = feedback_to_score(fb) @@ -88,12 +67,6 @@ def feedback_to_asi(feedbacks: list[Feedback]) -> tuple[float, dict[str, Any]]: "status": "pass" if score >= 0.5 else "fail", } - # Collect failure diagnostics for GEPA reflection - if score < 1.0: - failures.append( - f"Scorer '{name}' returned {fb.value}: {fb.rationale or 'no rationale'}" - ) - composite = sum(scores) / len(scores) if scores else 0.0 side_info["_summary"] = { @@ -105,165 +78,4 @@ def feedback_to_asi(feedbacks: list[Feedback]) -> tuple[float, dict[str, Any]]: "failed": sum(1 for s in scores if s < 0.5), } - if failures: - side_info["_failures"] = "\n".join(failures) - - return composite, side_info - - -def build_rich_asi( - feedbacks: list[Feedback], - *, - generated_response: str | None = None, - skill_coverage: dict[str, Any] | None = None, - task_prompt: str | None = None, - per_dimension_scores: dict[str, float] | None = None, -) -> tuple[float, dict[str, Any]]: - """Build enriched ASI with categorized diagnostics for GEPA reflection. - - Extends ``feedback_to_asi()`` with additional context that helps GEPA's - reflection LM understand *why* scores changed and make better edits. 
- - Args: - feedbacks: MLflow Feedback objects from all scoring layers - generated_response: Truncated LLM output (so reflection sees what skill produced) - skill_coverage: Which patterns/facts found vs missing in SKILL.md - task_prompt: The test prompt (so reflection understands context) - per_dimension_scores: Per-dimension scores dict for Pareto-frontier selection - - Returns: - Tuple of (composite_score, enriched_side_info_dict) - """ - composite, side_info = feedback_to_asi(feedbacks) - - # Categorize feedbacks by layer - categories: dict[str, list[str]] = { - "skill_content": [], - "generated_response": [], - "reference": [], - "structure": [], - } - for fb in feedbacks: - name = fb.name or "" - score = feedback_to_score(fb) - if score is None: - continue - entry = f"{name}: {'pass' if score >= 0.5 else 'FAIL'} ({fb.rationale or ''})" - if name.startswith("skill_content_"): - categories["skill_content"].append(entry) - elif name.startswith("skill_"): - categories["structure"].append(entry) - else: - categories["generated_response"].append(entry) - - side_info["_diagnostics_by_layer"] = { - k: v for k, v in categories.items() if v - } - - if generated_response is not None: - side_info["_generated_response"] = generated_response[:2000] - - if skill_coverage: - side_info["_skill_coverage"] = skill_coverage - - if task_prompt: - side_info["_task_prompt"] = task_prompt[:500] - - if per_dimension_scores: - side_info["scores"] = per_dimension_scores - return composite, side_info - - -# --------------------------------------------------------------------------- -# SkillBench → GEPA side info -# --------------------------------------------------------------------------- - - -def skillbench_to_asi( - with_results: list[AssertionResult], - without_results: list[AssertionResult], - *, - task_prompt: str | None = None, - scores: dict[str, float] | None = None, - with_response: str | None = None, - without_response: str | None = None, - reference_answer: str | None = None, 
- candidate: dict[str, str] | None = None, -) -> dict[str, Any]: - """Convert SkillBench assertion results to GEPA-optimized side info. - - Produces a flat dict with GEPA's standard diagnostic keys plus actual - agent output and reference answers so the reflection LM can make - targeted SKILL.md edits. - - Budget: ~1480 chars/example (Task 200 + Error ~80 + Expected 500 + - Actual 500 + scores ~200). With minibatch=3: ~4440 chars (~1100 tokens). - - Keys produced (all optional, only non-empty included): - ``Task`` — the task prompt (truncated at 200 chars) - ``Error`` — compact NEEDS_SKILL/REGRESSION assertion labels - ``Expected`` — reference answer from ground_truth.yaml (truncated at 500 chars) - ``Actual`` — agent response WITH skill (truncated at 500 chars) - ``skill_md_specific_info`` — sub-dict with ``Regressions`` for per-component routing - ``scores`` — score breakdown promoted to objective_scores by GEPA - - Args: - with_results: Assertion results from the WITH-skill run. - without_results: Assertion results from the WITHOUT-skill run. - task_prompt: The test prompt (for reflection context). - scores: Score breakdown dict (effectiveness, pass_with, structure, final). - with_response: Agent output WITH skill (truncated at 500 chars). - without_response: Agent output WITHOUT skill (reserved for future use). - reference_answer: Ground truth answer from ground_truth.yaml. - candidate: Full candidate dict for tool-specific diagnostic routing. - - Returns: - Side info dict for optimize_anything. - """ - diag = _summarize_failures(with_results, without_results) - - side_info: dict[str, Any] = {} - - # 1. Task context (short — just enough for the reflection LM) - if task_prompt: - side_info["Task"] = task_prompt[:200] - - # 2. Error: what specific assertions fail (from assertions.py) - if diag.get("Error"): - side_info["Error"] = diag["Error"] - - # 3. 
Expected: reference answer (what correct output looks like) - if reference_answer: - side_info["Expected"] = reference_answer[:500] - - # 4. Actual: agent response WITH skill (what was produced) - if with_response is not None: - side_info["Actual"] = with_response[:500] - - # 5. Regressions: routed to skill_md component - if diag.get("Regressions"): - side_info["skill_md_specific_info"] = {"Regressions": diag["Regressions"]} - - # 5b. Route tool-specific failures to {component}_specific_info - if candidate: - tool_components = {k: v for k, v in candidate.items() if k.startswith("tools_")} - for comp_name, comp_text in tool_components.items(): - comp_text_lower = comp_text.lower() - tool_failures = [] - for w, wo in zip(with_results, without_results): - label = _classify_assertion(w, wo) - if label in ("NEEDS_SKILL", "REGRESSION"): - content = _extract_content(w) - if content.lower() in comp_text_lower: - tool_failures.append(f"{label}: {w.assertion_type} — '{content}'") - if tool_failures: - side_info[f"{comp_name}_specific_info"] = { - "Related_assertions": "\n".join(tool_failures) - } - - # 6. Scores: needed for GEPA Pareto tracking - if scores: - side_info["scores"] = scores - - return side_info diff --git a/.test/src/skill_test/optimize/evaluator.py b/.test/src/skill_test/optimize/evaluator.py deleted file mode 100644 index 88c8157b..00000000 --- a/.test/src/skill_test/optimize/evaluator.py +++ /dev/null @@ -1,581 +0,0 @@ -"""Evaluator bridge: wrap existing MLflow scorers into optimize_anything evaluators. - -Creates GEPA-compatible evaluator functions that take a candidate (str or dict) -and a task example, run existing scorers, and return (score, SideInfo). 
-""" - -import inspect -import re -from pathlib import Path -from typing import Any, Callable, Literal - -import tiktoken -import gepa.optimize_anything as oa -from mlflow.entities import Feedback - -from ..runners.evaluate import build_scorers, load_scorer_config -from ..scorers.universal import ( - python_syntax, - sql_syntax, - no_hallucinated_apis, - pattern_adherence, - expected_facts_present, -) -from .asi import feedback_to_asi -from .skillbench_evaluator import ( # noqa: F401 — re-exported for runner.py - create_skillbench_evaluator, - build_skillbench_background, -) - - -# --------------------------------------------------------------------------- -# Path utilities -# --------------------------------------------------------------------------- - -def _find_repo_root() -> Path: - """Find the repo root by searching upward for .test/src/.""" - current = Path(__file__).resolve().parent - while current != current.parent: - if (current / ".test" / "src").exists(): - return current - if (current / "src" / "skill_test").exists() and current.name == ".test": - return current.parent - current = current.parent - return Path.cwd() - - -def _find_skill_md(skill_name: str) -> Path | None: - """Locate the SKILL.md file for a given skill name.""" - repo_root = _find_repo_root() - candidates = [ - repo_root / ".claude" / "skills" / skill_name / "SKILL.md", - repo_root / "databricks-skills" / skill_name / "SKILL.md", - ] - for p in candidates: - if p.exists(): - return p - return None - - -# --------------------------------------------------------------------------- -# Token utilities -# --------------------------------------------------------------------------- - -def count_tokens(text: str) -> int: - """Count tokens using cl100k_base encoding.""" - enc = tiktoken.get_encoding("cl100k_base") - return len(enc.encode(text)) - - -def token_efficiency_score(candidate_text: str, original_token_count: int) -> float: - """Score based on how concise the candidate is vs. 
the original. - - Smaller than original = bonus up to 1.15, same size = 1.0, - larger = linear penalty to 0.0 at 2x. - """ - if original_token_count <= 0: - return 1.0 - enc = tiktoken.get_encoding("cl100k_base") - candidate_tokens = len(enc.encode(candidate_text)) - ratio = candidate_tokens / original_token_count - if ratio <= 1.0: - return 1.0 + 0.15 * (1.0 - ratio) - else: - return max(0.0, 2.0 - ratio) - - -# --------------------------------------------------------------------------- -# Scorer execution -# --------------------------------------------------------------------------- - -def _run_scorer(scorer_fn: Any, outputs: dict, expectations: dict, inputs: dict) -> list[Feedback]: - """Run a single scorer and normalize result to a list of Feedbacks.""" - sig = inspect.signature(scorer_fn) - params = list(sig.parameters.keys()) - - kwargs = {} - if "outputs" in params: - kwargs["outputs"] = outputs - if "expectations" in params: - kwargs["expectations"] = expectations - if "inputs" in params: - kwargs["inputs"] = inputs - - try: - result = scorer_fn(**kwargs) - except Exception as e: - return [Feedback(name=getattr(scorer_fn, "__name__", "unknown"), value="no", rationale=str(e))] - - if isinstance(result, list): - return result - elif isinstance(result, Feedback): - return [result] - return [] - - -def _run_deterministic_scorers( - response: str, - expectations: dict[str, Any], - prompt: str, - scorer_config: dict[str, Any], -) -> list[Feedback]: - """Run deterministic scorers against a response.""" - outputs = {"response": response} - inputs = {"prompt": prompt} - - if scorer_config: - scorers = build_scorers(scorer_config) - else: - scorers = [python_syntax, sql_syntax, pattern_adherence, no_hallucinated_apis, expected_facts_present] - - all_feedbacks = [] - for scorer_fn in scorers: - scorer_name = getattr(scorer_fn, "__name__", "") or getattr(scorer_fn, "name", "") - if scorer_name in ("Safety", "Guidelines", "skill_quality"): - continue - 
all_feedbacks.extend(_run_scorer(scorer_fn, outputs, expectations, inputs)) - - return all_feedbacks - - -def _validate_skill_structure(candidate_text: str) -> list[Feedback]: - """Validate the SKILL.md structure itself.""" - outputs = {"response": candidate_text} - feedbacks = [] - for scorer_fn in [python_syntax, sql_syntax, no_hallucinated_apis]: - result = _run_scorer(scorer_fn, outputs, {}, {}) - for fb in result: - feedbacks.append(Feedback(name=f"skill_{fb.name}", value=fb.value, rationale=fb.rationale)) - return feedbacks - - -_STOP_WORDS = frozenset({ - "the", "and", "for", "with", "that", "this", "from", "are", "was", - "were", "been", "being", "have", "has", "had", "does", "did", "but", - "not", "you", "all", "can", "her", "his", "its", "may", "our", - "out", "use", "uses", "will", "how", "who", "get", "which", "would", - "make", "like", "into", "than", "them", "then", "each", "other", - "should", "could", -}) - - -def _keyword_fact_score(fact: str, text: str) -> float: - """Score 0-1 based on keyword overlap between a fact and text.""" - words = [w for w in re.findall(r'\w{3,}', fact.lower()) if w not in _STOP_WORDS] - if not words: - return 1.0 - text_lower = text.lower() - found = sum(1 for w in words if w in text_lower) - return found / len(words) - - -def _score_skill_content_facts(candidate_text: str, expected_facts: list[str]) -> list[Feedback]: - """Score SKILL.md content against expected facts using keyword matching. - - Unlike the universal ``expected_facts_present`` scorer which requires exact - substring matches, this uses keyword extraction so descriptive facts like - "Uses CREATE OR REPLACE VIEW with WITH METRICS LANGUAGE YAML" match when - the individual keywords appear in the skill text. 
- """ - feedbacks = [] - for fact in expected_facts: - score = _keyword_fact_score(fact, candidate_text) - feedbacks.append(Feedback( - name=f"skill_content_fact_{fact[:40]}", - value=score, # continuous 0.0-1.0 - rationale=f"(skill content) Keyword match {score:.0%} for: {fact}", - )) - return feedbacks - - -def _score_skill_content(candidate_text: str, expectations: dict[str, Any]) -> list[Feedback]: - """Score the SKILL.md candidate itself for pattern/fact coverage. - - Runs pattern_adherence and expected_facts_present against the skill text - (not the response). This gives GEPA immediate dynamic signal: if a key - pattern is removed from SKILL.md, the score drops. - - Feedback names are prefixed with ``skill_content_`` to distinguish from - response-level scores. - """ - outputs = {"response": candidate_text} - feedbacks = [] - - # Pattern adherence on skill content - pa_results = _run_scorer(pattern_adherence, outputs, expectations, {}) - for fb in pa_results: - feedbacks.append(Feedback( - name=f"skill_content_{fb.name}", - value=fb.value, - rationale=f"(skill content) {fb.rationale or ''}", - )) - - # Expected facts on skill content (keyword matching for descriptive facts) - expected_facts = expectations.get("expected_facts", []) - if expected_facts: - feedbacks.extend(_score_skill_content_facts(candidate_text, expected_facts)) - - return feedbacks - - -# --------------------------------------------------------------------------- -# Evaluator factory (optimize_anything compatible) -# --------------------------------------------------------------------------- - -SKILL_KEY = "skill_md" - - -def create_skill_evaluator( - skill_name: str, - mode: Literal["static", "generative"] = "static", - task_lm: str | None = None, - gen_model: str | None = None, - original_token_counts: dict[str, int] | None = None, -) -> Callable: - """Create an optimize_anything-compatible evaluator for a skill. 
- - Returns a function: (candidate, example) -> (score, side_info) - - The candidate is dict[str, str] (may have "skill_md" + "tools_*" keys). - The example is a task dict from the dataset. - - Evaluation layers: - 1. Skill-content scoring: pattern/fact presence in SKILL.md itself - 2. Generative evaluation: LLM generates response from skill, scored - 3. Reference response check: fixed ground truth scoring (sanity) - 4. Structure validation: syntax, no hallucinated APIs on SKILL.md - 5. Token efficiency: conciseness vs original - - Args: - skill_name: Name of the skill being evaluated - mode: "static" uses ground truth response, "generative" generates fresh - task_lm: LLM for generative mode (deprecated, use gen_model) - gen_model: LLM model for generative evaluation - original_token_counts: Token counts of original artifacts - """ - scorer_config = load_scorer_config(skill_name) - effective_gen_model = gen_model or task_lm - - # Track whether we've warned about generation failure - _gen_warned = [False] - - # Compute original token count for efficiency scoring - if original_token_counts is None: - skill_path = _find_skill_md(skill_name) - original_token_counts = { - SKILL_KEY: count_tokens(skill_path.read_text()) if skill_path else 0 - } - total_original_tokens = sum(original_token_counts.values()) - - # Mutable closure state: per-task baseline scorer scores for comparison. - # Populated via evaluator.set_baseline() after evaluating the seed. - _baseline: dict[str, dict[str, float | None]] = {} - - def evaluator(candidate: dict[str, str], example: dict) -> tuple[float, dict]: - """Evaluate a candidate against a single task example. 
- - Args: - candidate: dict[str, str] with "skill_md" and/or "tools_*" keys - example: Task dict with "input", "answer", "additional_context" - - Returns: - (score, side_info) tuple for optimize_anything - """ - candidate_text = candidate.get(SKILL_KEY, "") - - # Decode expectations from additional_context - expectations = {} - expectations_json = example.get("additional_context", {}).get("expectations", "") - if expectations_json: - import json - try: - expectations = json.loads(expectations_json) - except (json.JSONDecodeError, TypeError): - pass - - # ------------------------------------------------------------------ - # Layer 1: Skill-content scoring (pattern/fact presence in SKILL.md) - # ------------------------------------------------------------------ - skill_content_feedbacks: list[Feedback] = [] - if candidate_text and expectations: - skill_content_feedbacks = _score_skill_content(candidate_text, expectations) - - skill_content_composite, skill_content_si = feedback_to_asi(skill_content_feedbacks) - - # ------------------------------------------------------------------ - # Layer 2: Generative evaluation (LLM generates from skill, score that) - # ------------------------------------------------------------------ - generated_response = None - gen_feedbacks: list[Feedback] = [] - gen_composite = 0.0 - - _gen_available = False # Track if generation actually worked - if effective_gen_model and candidate_text and example.get("input"): - import litellm - messages = [ - { - "role": "system", - "content": ( - "Use ONLY the following skill documentation to answer " - "the user's question. 
Do not use any other knowledge.\n\n" - f"{candidate_text}" - ), - }, - {"role": "user", "content": example.get("input", "")}, - ] - try: - from .skillbench_evaluator import _completion_with_backoff - resp = _completion_with_backoff(model=effective_gen_model, messages=messages) - generated_response = resp.choices[0].message.content - _gen_available = True - except Exception as e: - generated_response = None - gen_feedbacks.append(Feedback( - name="generation_error", - value="no", - rationale=f"LLM generation failed: {e}", - )) - if not _gen_warned[0]: - _gen_warned[0] = True - import warnings - warnings.warn( - f"\nGeneration model '{effective_gen_model}' failed: {e}\n" - "Falling back to skill-content + reference scoring (no generative eval).\n" - "The 20% 'generated response quality' layer will be inactive.\n" - "Fix: set DATABRICKS_API_KEY + DATABRICKS_API_BASE, or use " - "--gen-model with a working provider (e.g., --gen-model openai/gpt-4o).\n", - stacklevel=2, - ) - - if generated_response: - gen_feedbacks = _run_deterministic_scorers( - generated_response, expectations, example.get("input", ""), scorer_config - ) - - gen_composite, gen_si = feedback_to_asi(gen_feedbacks) - - # ------------------------------------------------------------------ - # Layer 3: Reference response check (ground truth — sanity baseline) - # ------------------------------------------------------------------ - reference_response = example.get("answer", "") - ref_feedbacks: list[Feedback] = [] - ref_composite = 0.0 - - if reference_response: - ref_feedbacks = _run_deterministic_scorers( - reference_response, expectations, example.get("input", ""), scorer_config - ) - ref_composite, _ = feedback_to_asi(ref_feedbacks) - - # ------------------------------------------------------------------ - # Layer 4: Validate skill structure - # ------------------------------------------------------------------ - structure_feedbacks: list[Feedback] = [] - if candidate_text: - structure_feedbacks = 
_validate_skill_structure(candidate_text) - - structure_composite, _ = feedback_to_asi(structure_feedbacks) - - # ------------------------------------------------------------------ - # Layer 5: Token efficiency across ALL components - # ------------------------------------------------------------------ - total_candidate_tokens = sum(count_tokens(v) for v in candidate.values()) - if total_original_tokens > 0: - ratio = total_candidate_tokens / total_original_tokens - if ratio <= 1.0: - efficiency = 1.0 + 0.15 * (1.0 - ratio) - else: - efficiency = max(0.0, 2.0 - ratio) - else: - efficiency = 1.0 - - # ------------------------------------------------------------------ - # Weighted final score - # ------------------------------------------------------------------ - # When generative eval succeeds, it gets the dominant weight. - # When gen fails (auth error, timeout, etc), fall back to - # skill-content-heavy weighting — this is the only layer that - # changes dynamically as GEPA mutates the skill. 
- if _gen_available and generated_response is not None: - # Full layered evaluation - final_score = ( - 0.20 * gen_composite # Generated response quality - + 0.35 * skill_content_composite # Skill content coverage - + 0.05 * ref_composite # Reference response (sanity) - + 0.10 * structure_composite # Structure validation - + 0.30 * efficiency # Token efficiency - ) - else: - # Fallback: no generative eval, emphasize skill content + efficiency - final_score = ( - 0.40 * skill_content_composite # Skill content coverage - + 0.20 * ref_composite # Reference response - + 0.10 * structure_composite # Structure validation - + 0.30 * efficiency # Token efficiency - ) - - # ------------------------------------------------------------------ - # Build unified side_info for GEPA reflection - # ------------------------------------------------------------------ - # Merge all feedbacks for the side_info dict - all_feedbacks = skill_content_feedbacks + gen_feedbacks + ref_feedbacks + structure_feedbacks - _, side_info = feedback_to_asi(all_feedbacks) - - side_info["scores"] = { - "generated_response_quality": gen_composite, - "skill_content_coverage": skill_content_composite, - "reference_response_check": ref_composite, - "structure_validation": structure_composite, - "token_efficiency": efficiency, - "final": final_score, - } - side_info["token_counts"] = { - "candidate_total": total_candidate_tokens, - "original_total": total_original_tokens, - } - - # Enrich ASI for GEPA reflection (Step 4 from plan) - if generated_response is not None: - side_info["_generated_response"] = generated_response[:2000] - side_info["_task_prompt"] = example.get("input", "")[:500] - - # Skill coverage summary - if skill_content_feedbacks: - found = [fb.name for fb in skill_content_feedbacks if fb.value == "yes"] - missing = [fb.name for fb in skill_content_feedbacks if fb.value == "no"] - side_info["_skill_coverage"] = { - "found": found, - "missing": missing, - "coverage_ratio": len(found) / 
max(len(found) + len(missing), 1), - } - - # Baseline comparison -- show GEPA's reflection LM what improved/regressed - task_key = example.get("input", "") - if task_key and task_key in _baseline: - comparisons = [] - for scorer_name, baseline_val in _baseline[task_key].items(): - current_val = side_info.get(scorer_name, {}).get("score") - if current_val is None or baseline_val is None: - continue - if current_val > baseline_val + 0.01: - comparisons.append( - f"Improved on {scorer_name} ({baseline_val:.2f} -> {current_val:.2f})" - ) - elif current_val < baseline_val - 0.01: - comparisons.append( - f"Regressed on {scorer_name} ({baseline_val:.2f} -> {current_val:.2f})" - ) - if comparisons: - side_info["_baseline_comparison"] = "; ".join(comparisons) - - return final_score, side_info - - def set_baseline(per_task_side_info: dict[str, dict]) -> None: - """Cache per-task per-scorer scores from the seed evaluation. - - Args: - per_task_side_info: {task_input_text: side_info_dict} from seed eval. - """ - for task_key, info in per_task_side_info.items(): - _baseline[task_key] = { - name: data.get("score") - for name, data in info.items() - if isinstance(data, dict) and "score" in data - and not name.startswith("_") - } - - evaluator.set_baseline = set_baseline # type: ignore[attr-defined] - return evaluator - - -def build_optimization_background( - skill_name: str, - original_token_count: int, - component_names: list[str] | None = None, - baseline_scores: dict[str, float] | None = None, - baseline_side_info: dict[str, dict] | None = None, -) -> str: - """Build the background context string for GEPA's reflection LM. - - Args: - skill_name: Name of the skill being optimized. - original_token_count: Total token count of the original artifacts. - component_names: Names of the candidate components (e.g. "skill_md", "tools_*"). - baseline_scores: Per-task overall scores from evaluating the seed candidate. 
- baseline_side_info: Per-task side_info dicts from evaluating the seed candidate. - """ - components_desc = "" - if component_names and any(c.startswith("tools_") for c in component_names): - tool_modules = [c.replace("tools_", "") for c in component_names if c.startswith("tools_")] - components_desc = ( - "\n\nYou are also optimizing MCP tool descriptions for these modules: " - f"{', '.join(tool_modules)}. " - "Tool descriptions are docstrings on @mcp.tool functions. Keep them " - "accurate, concise, and action-oriented.\n" - ) - - # Build baseline performance summary - baseline_desc = "" - if baseline_scores: - mean_score = sum(baseline_scores.values()) / len(baseline_scores) - perfect = [tid for tid, s in baseline_scores.items() if s >= 0.99] - weak = sorted( - [(tid, s) for tid, s in baseline_scores.items() if s < 0.99], - key=lambda x: x[1], - ) - - baseline_desc = ( - f"\n\nBASELINE PERFORMANCE (seed candidate):\n" - f" Mean score: {mean_score:.3f} across {len(baseline_scores)} test cases.\n" - ) - if perfect: - baseline_desc += f" Perfect/near-perfect ({len(perfect)}): {', '.join(perfect)}\n" - if weak: - baseline_desc += " Needs improvement:\n" - for tid, score in weak: - baseline_desc += f" - {tid}: {score:.3f}" - # Add per-scorer detail if available - if baseline_side_info and tid in baseline_side_info: - info = baseline_side_info[tid] - failing = [ - name for name, data in info.items() - if isinstance(data, dict) and data.get("status") == "fail" - and not name.startswith("_") - ] - if failing: - baseline_desc += f" (failing: {', '.join(failing)})" - baseline_desc += "\n" - - baseline_desc += ( - "\n PRIORITY: Focus optimization effort on the weak test cases above. " - "Do NOT break test cases that already score well.\n" - ) - - return ( - f"You are REFINING an existing, working SKILL.md file for the '{skill_name}' " - "Databricks skill. 
The seed candidate is a production skill that already works -- " - "preserve what already works and improve what doesn't.\n\n" - "SKILL.md files teach AI agents (like Claude Code) how to use specific Databricks features. " - "They contain patterns, code examples, API references, and best practices.\n\n" - "EVALUATION: The skill is evaluated by having a small LLM generate responses from it. " - "Better skill documentation produces more correct responses. Scores come from:\n" - "- Generated response quality (20%): An LLM reads ONLY the skill and answers a test prompt. " - "Its response is scored against expected patterns and facts.\n" - "- Skill content coverage (35%): Does the SKILL.md itself contain the patterns and facts " - "needed to answer test prompts? Removing key content directly drops this score.\n" - "- Reference response check (5%): Sanity check against a known-good response.\n" - "- Structure validation (10%): Python/SQL syntax, no hallucinated APIs.\n" - "- Token efficiency (30%): Conciseness vs original -- smaller is ACTIVELY REWARDED. " - "Shrinking the skill below its original size gives a bonus score (up to 1.15x at 0% of original). " - "Growing the skill is penalized linearly to 0.0 at 2x original size.\n\n" - "KEY INSIGHT: Token efficiency is the second-highest weight. Every token you remove " - "directly improves the score. SkillsBench research shows long skills hurt agent performance " - "via 'cognitive overhead' -- agents get confused by verbose docs. Be ruthlessly concise.\n\n" - f"IMPORTANT: The current artifacts total {original_token_count:,} tokens. " - "Optimized versions MUST be MORE CONCISE. Target at least 10-20% token reduction. " - "Remove redundant examples, consolidate similar patterns, " - "eliminate verbose explanations, and merge overlapping sections. " - "Every token consumed is agent context window budget -- keep skills lean and focused." 
- f"{baseline_desc}" - f"{components_desc}" - ) diff --git a/.test/src/skill_test/optimize/judges.py b/.test/src/skill_test/optimize/judges.py new file mode 100644 index 00000000..10869c84 --- /dev/null +++ b/.test/src/skill_test/optimize/judges.py @@ -0,0 +1,289 @@ +"""MLflow judge factories for skill evaluation. + +Replaces the 6 separate judge calls and binary assertion layer with three +focused judges that provide both scores AND rich rationale for GEPA's +reflection LM. + +Judges: + quality_judge — Scores a single response (0.0-1.0) against expectations. + effectiveness_judge — Compares WITH vs WITHOUT responses, returns verdict. + regression_judge — Identifies specific ways a skill harms responses. +""" + +from __future__ import annotations + +import logging +from dataclasses import dataclass +from typing import Any + +from mlflow.genai.judges import make_judge + +logger = logging.getLogger(__name__) + + +@dataclass +class JudgeFeedback: + """Structured feedback from a judge call.""" + + value: float | str + rationale: str + name: str + + +def _safe_parse_score(raw_value: Any) -> float: + """Convert judge output to a float score in [0.0, 1.0]. + + Handles: bool, "yes"/"no", numeric, float-as-string. + """ + if isinstance(raw_value, (int, float)): + return max(0.0, min(1.0, float(raw_value))) + if isinstance(raw_value, bool): + return 1.0 if raw_value else 0.0 + if isinstance(raw_value, str): + low = raw_value.strip().lower() + if low == "yes": + return 1.0 + if low == "no": + return 0.0 + try: + return max(0.0, min(1.0, float(low))) + except ValueError: + pass + return 0.0 + + +# --------------------------------------------------------------------------- +# Quality judge — primary scorer for a single response +# --------------------------------------------------------------------------- + +_QUALITY_INSTRUCTIONS = """\ +You are an expert evaluator for Databricks skill documentation quality. 
+Rate the response on a scale from 0.0 to 1.0 based on how well it addresses +the user's question using correct, complete, and relevant information. + +## Evaluation Criteria + +1. **Relevance** (does the response address the question?) +2. **Completeness** (are all parts of the question answered?) +3. **Correctness** (are the facts and API references accurate?) +4. **Pattern adherence** (does the response follow expected code patterns?) +5. **API accuracy** (are function names, parameters, and syntax correct?) + +## Expected Facts and Patterns + +{{ expectations.expected_facts }} + +{{ expectations.expected_patterns }} + +## Skill-Specific Guidelines + +{{ expectations.guidelines }} + +## Input + +Question: {{ inputs.prompt }} +Response: {{ outputs.response }} + +## Instructions + +Return a score between 0.0 and 1.0 where: +- 1.0 = perfect response, all facts present, all patterns correct +- 0.7 = good response, most facts present, minor gaps +- 0.4 = partial response, significant gaps or inaccuracies +- 0.1 = poor response, mostly wrong or off-topic +- 0.0 = completely wrong or empty + +Provide detailed rationale explaining: +- Which expected facts are present vs missing +- Which patterns are correctly followed vs violated +- Specific API or syntax errors found +- What would need to change to improve the score +""" + + +def create_skill_quality_judge( + skill_guidelines: list[str] | None = None, +) -> Any: + """Create a universal quality judge for scoring responses. + + Uses ``make_judge`` with float output. Incorporates skill-specific + guidelines as "semantic memory" when available. + + Args: + skill_guidelines: Optional per-skill evaluation principles from + ground_truth.yaml guidelines across all test cases. + + Returns: + A callable judge that accepts (inputs, outputs, expectations) and + returns an MLflow Feedback object with float value + rationale. 
+ """ + instructions = _QUALITY_INSTRUCTIONS + if skill_guidelines: + principles = "\n".join(f"- {g}" for g in skill_guidelines) + instructions += ( + "\n\n## Domain-Specific Principles\n" + f"{principles}\n" + ) + + return make_judge( + name="skill_quality", + instructions=instructions, + feedback_value_type=float, + ) + + +# --------------------------------------------------------------------------- +# Effectiveness judge — WITH vs WITHOUT comparison +# --------------------------------------------------------------------------- + +_EFFECTIVENESS_INSTRUCTIONS = """\ +You are comparing two responses to the same question to determine whether +a skill document helped or hurt the agent's response quality. + +## Context + +- **WITH-skill response**: Generated with the skill document in context. +- **WITHOUT-skill response**: Generated without any skill document. + +## Expected Information + +{{ expectations.expected_facts }} + +## Input + +Question: {{ inputs.prompt }} + +WITH-skill response: +{{ inputs.with_response }} + +WITHOUT-skill response: +{{ inputs.without_response }} + +## Instructions + +Determine whether the skill IMPROVED, maintained (SAME), or REGRESSED the +response quality. Return one of exactly: "improved", "same", "regressed". + +An "improved" verdict means the WITH-skill response is meaningfully better: +more accurate facts, better code patterns, correct API usage that the +WITHOUT response got wrong. + +A "regressed" verdict means the skill actively HURT the response: introduced +incorrect information, deprecated APIs, or confused the agent. + +"same" means no meaningful difference. 
+ +Provide detailed rationale explaining: +- What the skill added or removed from the response +- Specific facts/patterns that differ between WITH and WITHOUT +- Whether the skill taught something the model didn't already know +- If regressed: what specifically the skill got wrong +""" + + +def create_effectiveness_judge() -> Any: + """Create a WITH vs WITHOUT comparison judge. + + Returns a judge that evaluates whether the skill helped, hurt, or made + no difference. Returns Feedback with value in {"improved", "same", "regressed"} + and detailed rationale for GEPA. + """ + return make_judge( + name="skill_effectiveness", + instructions=_EFFECTIVENESS_INSTRUCTIONS, + feedback_value_type=str, + ) + + +# --------------------------------------------------------------------------- +# Regression judge — identifies how a skill harms responses +# --------------------------------------------------------------------------- + +_REGRESSION_INSTRUCTIONS = """\ +You are a regression detector for Databricks skill documents. Your job is +to identify specific ways that a skill document HARMS agent responses. + +## Context + +The skill document was added to an agent's context. Compare the agent's +response WITH the skill to the response WITHOUT it. + +## Input + +Question: {{ inputs.prompt }} + +WITH-skill response: +{{ inputs.with_response }} + +WITHOUT-skill response: +{{ inputs.without_response }} + +## Instructions + +Identify specific regressions introduced by the skill. Return "yes" if +regressions are found, "no" if the skill is harmless. + +Common regression patterns: +1. **Deprecated APIs** — skill teaches old APIs the model already uses correctly +2. **Verbosity** — skill adds noise that confuses the model +3. **Contradicting correct knowledge** — model was right, skill made it wrong +4. **Wrong examples** — skill's code examples have errors the model copies +5. 
**Over-specification** — skill's rigid patterns prevent correct alternatives + +For each regression found, explain: +- WHAT specific content in the skill caused the regression +- WHY it made the response worse +- WHAT to remove or change in the skill to fix it +""" + + +def create_regression_judge() -> Any: + """Create a regression detection judge. + + Returns structured feedback about what to REMOVE from the skill. + Rationale goes directly to GEPA's reflection LM for targeted fixes. + """ + return make_judge( + name="skill_regression", + instructions=_REGRESSION_INSTRUCTIONS, + feedback_value_type=bool, + ) + + +# --------------------------------------------------------------------------- +# Helper: run a judge safely and return structured feedback +# --------------------------------------------------------------------------- + +def run_judge_safe( + judge: Any, + *, + inputs: dict[str, Any], + outputs: dict[str, Any] | None = None, + expectations: dict[str, Any] | None = None, + name: str = "judge", +) -> JudgeFeedback: + """Run a judge with error handling, return JudgeFeedback. + + Catches exceptions and returns a zero-score feedback with error rationale + so that evaluation never crashes from a judge failure. 
+ """ + kwargs: dict[str, Any] = {"inputs": inputs} + if outputs is not None: + kwargs["outputs"] = outputs + if expectations is not None: + kwargs["expectations"] = expectations + + try: + fb = judge(**kwargs) + return JudgeFeedback( + value=fb.value, + rationale=fb.rationale or "", + name=name, + ) + except Exception as e: + logger.warning("Judge '%s' failed: %s", name, e) + return JudgeFeedback( + value=0.0, + rationale=f"Judge error: {e}", + name=name, + ) diff --git a/.test/src/skill_test/optimize/review.py b/.test/src/skill_test/optimize/review.py index c5bc4e93..3a0c1aa5 100644 --- a/.test/src/skill_test/optimize/review.py +++ b/.test/src/skill_test/optimize/review.py @@ -16,7 +16,7 @@ from pathlib import Path from .runner import OptimizationResult -from .evaluator import _find_skill_md +from .utils import find_skill_md as _find_skill_md def _get_results_dir(skill_name: str) -> Path: @@ -145,16 +145,19 @@ def load_last_result(skill_name: str) -> OptimizationResult | None: ) -def _review_skillbench(result: OptimizationResult) -> None: - """Print SkillBench-style effectiveness metrics.""" +def review_optimization(result: OptimizationResult) -> None: + """Print optimization summary for human review. + + Shows: score improvement, token reduction, judge-based effectiveness, + per-test-case score breakdown, and diff of changes. 
+ """ print(f"\n{'=' * 60}") - print(f" Optimization Results: {result.skill_name} (skillbench)") + print(f" Optimization Results: {result.skill_name}") print(f"{'=' * 60}") - # Aggregate effectiveness metrics from side_info si = result.skillbench_side_info or {} - # Compute aggregate pass rates from per-task scores + # Aggregate judge-based scores from per-task side_info task_count = 0 sum_with = 0.0 sum_without = 0.0 @@ -164,37 +167,29 @@ def _review_skillbench(result: OptimizationResult) -> None: for task_id in sorted(si.keys()): info = si[task_id] scores = info.get("scores", {}) - pw = scores.get("pass_rate_with", 0.0) - pwo = scores.get("pass_rate_without", 0.0) + pw = scores.get("quality_with", 0.0) + pwo = scores.get("quality_without", 0.0) eff = scores.get("skill_effectiveness", 0.0) sum_with += pw sum_without += pwo sum_eff += eff task_count += 1 - # Build per-task notes from the new Error key + # Build per-task notes error = info.get("Error", "") notes = [] if "NEEDS_SKILL" in error: - # Extract assertion names from Error lines - failing = [ - line.split(":", 1)[1].strip().split(" — ")[0] - for line in error.split("\n") - if line.startswith("NEEDS_SKILL") - ] - notes.append(f"NEEDS: {', '.join(failing[:3])}") + notes.append("NEEDS_SKILL") if "REGRESSION" in error: - reg_count = error.count("REGRESSION") - notes.append(f"REGRESSION x{reg_count}") + notes.append("REGRESSION") if not notes: - notes.append("OK/NEUTRAL") + notes.append("OK") note_str = f" [{'; '.join(notes)}]" per_task_lines.append( f" {task_id:<30s} WITH {pw:.2f} WITHOUT {pwo:.2f} " f"delta {eff:+.2f}{note_str}" ) - # Aggregate rates if task_count > 0: agg_with = sum_with / task_count agg_without = sum_without / task_count @@ -207,18 +202,16 @@ def _review_skillbench(result: OptimizationResult) -> None: print(f" Score: {result.original_score:.3f} -> {result.optimized_score:.3f} " f"({improvement_sign}{result.improvement:.3f})") print(f" Skill Effectiveness: {agg_eff:.2f}") - print(f" Pass Rate 
(with): {agg_with:.2f}") - print(f" Pass Rate (without): {agg_without:.2f} (baseline)") + print(f" Quality (with): {agg_with:.2f}") + print(f" Quality (without): {agg_without:.2f} (baseline)") # Token counts reduction_sign = "+" if result.token_reduction_pct >= 0 else "" print(f" Tokens: {result.original_token_count:,} -> {result.optimized_token_count:,} " f"({reduction_sign}{result.token_reduction_pct:.1f}%)") - # GEPA iterations if result.gepa_result and hasattr(result.gepa_result, "iterations"): print(f" Iterations: {result.gepa_result.iterations}") - if result.mlflow_run_id: print(f" MLflow run: {result.mlflow_run_id}") @@ -278,96 +271,6 @@ def _review_skillbench(result: OptimizationResult) -> None: print(f"{'=' * 60}\n") -def review_optimization(result: OptimizationResult) -> None: - """Print optimization summary for human review. - - Automatically saves the result to disk so it can be applied later - with ``--apply-last`` without re-running optimization. - - Shows: score improvement, token reduction, diff of changed sections, - per-test-case score breakdown, validation set performance. - - Delegates to ``_review_skillbench()`` when ``evaluator_type == "skillbench"``. 
- """ - if getattr(result, "evaluator_type", "legacy") == "skillbench": - return _review_skillbench(result) - - print(f"\n{'=' * 60}") - print(f" Optimization Results: {result.skill_name}") - print(f"{'=' * 60}") - - # Quality scores - improvement_sign = "+" if result.improvement >= 0 else "" - print(f" Quality: {result.original_score:.3f} -> {result.optimized_score:.3f} " - f"({improvement_sign}{result.improvement:.3f})") - - # Token counts - reduction_sign = "+" if result.token_reduction_pct >= 0 else "" - print(f" Tokens: {result.original_token_count:,} -> {result.optimized_token_count:,} " - f"({reduction_sign}{result.token_reduction_pct:.1f}%)") - - # Validation scores - if result.val_scores: - avg_val = sum(result.val_scores.values()) / len(result.val_scores) - print(f" Validation: avg={avg_val:.3f} ({len(result.val_scores)} cases)") - - # GEPA iterations - if result.gepa_result and hasattr(result.gepa_result, "iterations"): - print(f" Iterations: {result.gepa_result.iterations}") - - # MLflow run - if result.mlflow_run_id: - print(f" MLflow run: {result.mlflow_run_id}") - - print() - - # Diff summary - if result.diff_summary and result.diff_summary != "No changes": - print(" Changes:") - for line in result.diff_summary.split("\n"): - print(f" {line}") - print() - - # Detailed diff (first 50 lines) - if result.original_content != result.optimized_content: - diff_lines = list(difflib.unified_diff( - result.original_content.splitlines(keepends=True), - result.optimized_content.splitlines(keepends=True), - fromfile="original SKILL.md", - tofile="optimized SKILL.md", - n=2, - )) - if len(diff_lines) > 50: - print(f" Diff (first 50 of {len(diff_lines)} lines):") - for line in diff_lines[:50]: - print(f" {line}", end="") - print(f"\n ... 
({len(diff_lines) - 50} more lines)") - else: - print(" Diff:") - for line in diff_lines: - print(f" {line}", end="") - print() - else: - print(" No changes to SKILL.md content.") - - # Validation breakdown - if result.val_scores: - print(" Validation scores by test case:") - for task_id, score in sorted(result.val_scores.items()): - status = "PASS" if score >= 0.5 else "FAIL" - print(f" {status} {task_id}: {score:.3f}") - print() - - # Auto-save result to disk - saved_skill, saved_meta = save_result(result) - if saved_skill: - print(f" Saved: {saved_skill}") - print(f" Apply: uv run python .test/scripts/optimize.py {result.skill_name} --apply-last") - elif result.original_content == result.optimized_content: - print(f" No improvement found — nothing saved.") - print(f"{'=' * 60}\n") - - def apply_optimization(result: OptimizationResult) -> Path | None: """Apply optimized SKILL.md and/or tool descriptions. diff --git a/.test/src/skill_test/optimize/runner.py b/.test/src/skill_test/optimize/runner.py index 36f44dcc..8ec474ad 100644 --- a/.test/src/skill_test/optimize/runner.py +++ b/.test/src/skill_test/optimize/runner.py @@ -1,6 +1,7 @@ """End-to-end orchestrator for GEPA skill optimization. Uses optimize_anything API: evaluator function + GEPAConfig. +Single evaluator path using SkillBench judge-based evaluation. 
""" import copy @@ -8,19 +9,15 @@ import re from dataclasses import dataclass from pathlib import Path -from typing import Any, Literal +from typing import Any from gepa.optimize_anything import optimize_anything, GEPAConfig from ..config import SkillTestConfig from ..runners.evaluate import setup_mlflow from .config import get_preset, validate_reflection_context, estimate_pass_duration, DEFAULT_GEN_LM, DEFAULT_TOKEN_BUDGET -from .evaluator import ( - SKILL_KEY, - create_skill_evaluator, - count_tokens, - build_optimization_background, - _find_skill_md, +from .utils import SKILL_KEY, count_tokens, find_skill_md +from .skillbench_evaluator import ( create_skillbench_evaluator, build_skillbench_background, ) @@ -52,7 +49,7 @@ class OptimizationResult: components: dict[str, str] | None = None original_components: dict[str, str] | None = None tool_map: Any = None - evaluator_type: str = "legacy" + evaluator_type: str = "skillbench" skillbench_side_info: dict[str, dict] | None = None @@ -93,9 +90,6 @@ def _evaluate_on_tasks(evaluator, candidate, tasks): Returns: (mean_score, per_task_scores, side_info_by_id, side_info_by_input) - - per_task_scores: {task_id: float} - - side_info_by_id: {task_id: side_info_dict} -- for background builder - - side_info_by_input: {input_text: side_info_dict} -- for evaluator baseline """ gepa_instances = to_gepa_instances(tasks) per_task = {} @@ -113,9 +107,7 @@ def _evaluate_on_tasks(evaluator, candidate, tasks): def optimize_skill( skill_name: str, - mode: Literal["static", "generative"] = "static", - preset: Literal["quick", "standard", "thorough"] = "standard", - task_lm: str | None = None, + preset: str = "standard", gen_model: str | None = None, reflection_lm: str | None = None, include_tools: bool = False, @@ -124,37 +116,38 @@ def optimize_skill( dry_run: bool = False, max_passes: int = 5, max_metric_calls: int | None = None, - evaluator_type: Literal["legacy", "skillbench"] = "skillbench", token_budget: int | None = None, - 
use_judges: bool = False, + judge_model: str | None = None, + align: bool = False, + # Deprecated params kept for backward compat + mode: str = "static", + task_lm: str | None = None, + evaluator_type: str = "skillbench", + use_judges: bool = True, ) -> OptimizationResult: """Run end-to-end GEPA optimization on a skill and/or tools. - Uses optimize_anything API with a simple evaluator function. + Uses optimize_anything API with judge-based evaluation. Runs up to ``max_passes`` optimization passes per component, feeding - each pass's best candidate as the seed for the next. Stops early - when a pass produces no score improvement. + each pass's best candidate as the seed for the next. Args: skill_name: Name of the skill to optimize - mode: "static" or "generative" preset: "quick" (15), "standard" (50), "thorough" (150) - task_lm: LLM for generative mode (deprecated, use gen_model) - gen_model: LLM for generative evaluation (default: GEPA_GEN_LM env) + gen_model: LLM for generative evaluation reflection_lm: Override reflection LM include_tools: Include MCP tool descriptions as additional components tool_modules: Specific tool modules (None = all) tools_only: Optimize ONLY tool descriptions dry_run: Show config without running - max_passes: Maximum optimization passes (default 5). Each pass - re-seeds from the previous best and runs a full GEPA cycle. - evaluator_type: "skillbench" (measures skill effectiveness delta) or - "legacy" (weighted scoring with keyword/token efficiency). - token_budget: Hard token ceiling; candidates exceeding this are penalized. - use_judges: Enable MLflow LLM judges (Correctness + Guidelines) for NL feedback. + max_passes: Maximum optimization passes (default 5) + max_metric_calls: Override max metric calls per pass + token_budget: Hard token ceiling + judge_model: Override judge model (future use) + align: Use MemAlign alignment (future use) """ # 1. 
Load SKILL.md - skill_path = _find_skill_md(skill_name) + skill_path = find_skill_md(skill_name) if not tools_only and skill_path is None: raise FileNotFoundError(f"Could not find SKILL.md for '{skill_name}'") @@ -184,8 +177,8 @@ def optimize_skill( total_original_tokens = sum(original_token_counts.values()) - # Default --include-tools for SkillBench (tools are the primary token consumer) - if evaluator_type == "skillbench" and not tools_only and not include_tools and not tool_components: + # Auto-include tools for SkillBench + if not tools_only and not include_tools and not tool_components: include_tools = True tool_map = extract_tool_descriptions(modules=tool_modules) tool_components = tools_to_gepa_components(tool_map, per_module=True) @@ -212,32 +205,22 @@ def optimize_skill( print(f"No test cases found. Using {len(train)} auto-generated tasks.") # 4. Build evaluator - # Resolve gen_model: explicit arg > task_lm (deprecated) > env default effective_gen_model = gen_model or task_lm or DEFAULT_GEN_LM if effective_gen_model: print(f"Generation model: {effective_gen_model}") - judges_label = " (with LLM judges)" if use_judges else "" - print(f"Evaluator: {evaluator_type}{judges_label}") - - if evaluator_type == "skillbench": - if not effective_gen_model: - raise ValueError( - "SkillBench evaluator requires a gen_model. " - "Pass --gen-model or set GEPA_GEN_LM env var." - ) - evaluator = create_skillbench_evaluator( - skill_name, - gen_model=effective_gen_model, - original_token_counts=original_token_counts, - token_budget=token_budget, - use_judges=use_judges, - ) - else: - evaluator = create_skill_evaluator( - skill_name, mode=mode, task_lm=task_lm, - gen_model=effective_gen_model, - original_token_counts=original_token_counts, + print("Evaluator: skillbench (judge-driven)") + + if not effective_gen_model: + raise ValueError( + "SkillBench evaluator requires a gen_model. " + "Pass --gen-model or set GEPA_GEN_LM env var." 
) + evaluator = create_skillbench_evaluator( + skill_name, + gen_model=effective_gen_model, + original_token_counts=original_token_counts, + token_budget=token_budget, + ) # 5. Get config (scaled by component count) num_components = len(seed_candidate) @@ -256,7 +239,7 @@ def optimize_skill( # Dry run if dry_run: - print(f"\n=== Dry Run: {skill_name} ({evaluator_type}) ===") + print(f"\n=== Dry Run: {skill_name} (skillbench) ===") if not tools_only: print(f"SKILL.md path: {skill_path}") print(f"Components: {list(seed_candidate.keys())}") @@ -265,9 +248,7 @@ def optimize_skill( print(f" {comp}: {tokens:,} tokens") print(f"Train tasks: {len(train)}") print(f"Val tasks: {len(val) if val else 'None (single-task mode)'}") - print(f"Mode: {mode}") - print(f"Generation model: {effective_gen_model or 'None (static only)'}") - print(f"Evaluator type: {evaluator_type}") + print(f"Generation model: {effective_gen_model}") print(f"Preset: {preset} (max_metric_calls={config.engine.max_metric_calls}, " f"scaled for {num_components} component(s))") print(f"Max passes: {max_passes}") @@ -277,27 +258,16 @@ def optimize_skill( evaluator, seed_candidate, train ) print(f"Current score: {original_score:.3f}") - - # Show per-task baseline in dry-run output for task_id, score in original_per_task.items(): print(f" {task_id}: {score:.3f}") - if evaluator_type == "skillbench": - background = build_skillbench_background( - skill_name, total_original_tokens, - component_names=list(seed_candidate.keys()), - baseline_scores=original_per_task, - baseline_side_info=si_by_id, - token_budget=token_budget, - use_judges=use_judges, - ) - else: - background = build_optimization_background( - skill_name, total_original_tokens, - component_names=list(seed_candidate.keys()), - baseline_scores=original_per_task, - baseline_side_info=si_by_id, - ) + background = build_skillbench_background( + skill_name, total_original_tokens, + component_names=list(seed_candidate.keys()), + 
baseline_scores=original_per_task, + baseline_side_info=si_by_id, + token_budget=token_budget, + ) print(f"\nBackground preview:\n{background[:500]}...") return OptimizationResult( @@ -317,8 +287,8 @@ def optimize_skill( components=dict(seed_candidate), original_components=dict(seed_candidate), tool_map=tool_map, - evaluator_type=evaluator_type, - skillbench_side_info=si_by_id if evaluator_type == "skillbench" else None, + evaluator_type="skillbench", + skillbench_side_info=si_by_id, ) # Evaluate original and capture per-task detail for baseline context @@ -326,59 +296,37 @@ def optimize_skill( evaluator, seed_candidate, train ) - # Set baseline on evaluator so future calls include regression/improvement info - if hasattr(evaluator, "set_baseline"): - evaluator.set_baseline(si_by_input) - - # 6. Build background (with baseline scores) and objective - if evaluator_type == "skillbench": - background = build_skillbench_background( - skill_name, total_original_tokens, - component_names=list(seed_candidate.keys()), - baseline_scores=original_per_task, - baseline_side_info=si_by_id, - token_budget=token_budget, - use_judges=use_judges, - ) - objective = ( - f"Refine and improve the existing '{skill_name}' skill. " - "Score is based on SKILL EFFECTIVENESS (45%) and TOKEN EFFICIENCY (25%). " - "Adding content the agent already knows does NOT help and costs tokens. " - "Focus on what the agent would otherwise get wrong. " - "Be concise — remove redundant examples and verbose explanations." - ) - else: - background = build_optimization_background( - skill_name, total_original_tokens, - component_names=list(seed_candidate.keys()), - baseline_scores=original_per_task, - baseline_side_info=si_by_id, - ) - objective = ( - f"Refine and improve the existing '{skill_name}' skill. " - "Preserve patterns and examples that already score well. " - "Focus on fixing scorer failures and reducing token count without sacrificing correctness. 
" - "Higher quality scores and fewer tokens are both better." - ) + # 6. Build background and objective + background = build_skillbench_background( + skill_name, total_original_tokens, + component_names=list(seed_candidate.keys()), + baseline_scores=original_per_task, + baseline_side_info=si_by_id, + token_budget=token_budget, + ) + objective = ( + f"Refine and improve the existing '{skill_name}' skill. " + "Score is based on SKILL EFFECTIVENESS (35%) and TOKEN EFFICIENCY (25%). " + "Judge rationale in side_info explains exactly what failed. " + "Focus on what the agent would otherwise get wrong. " + "Be concise — remove redundant examples and verbose explanations." + ) # 7. Convert datasets to GEPA format trainset = to_gepa_instances(train) valset = to_gepa_instances(val) if val else None # 8. Multi-pass optimization loop - # Each pass feeds the previous best as the new seed. - # Stops early when a pass produces no score improvement. current_seed = dict(seed_candidate) best = dict(seed_candidate) best_score = original_score last_result = None total_metric_calls = 0 - improvement_threshold = 0.0005 # minimum improvement to continue + improvement_threshold = 0.0005 print(f"\n Starting multi-pass optimization (up to {max_passes} passes, " f"{num_components} component(s), {config.engine.max_metric_calls} metric calls/pass)") - # Print estimated time per pass est_secs = estimate_pass_duration( config.engine.max_metric_calls, config.reflection.reflection_lm, @@ -393,9 +341,6 @@ def optimize_skill( for pass_num in range(1, max_passes + 1): print(f"\n --- Pass {pass_num}/{max_passes} (best score so far: {best_score:.4f}) ---") - # Deep-copy config: optimize_anything mutates - # config.reflection.reflection_prompt_template when objective/background - # are provided, which would cause a "mutually exclusive" error on pass 2+. 
pass_config = copy.deepcopy(config) result = optimize_anything( @@ -409,7 +354,6 @@ def optimize_skill( ) total_metric_calls += result.total_metric_calls or 0 - # Evaluate this pass's best candidate candidate = result.best_candidate pass_score, _, _, _ = _evaluate_on_tasks(evaluator, candidate, train) improvement = pass_score - best_score @@ -421,7 +365,6 @@ def optimize_skill( best = dict(candidate) best_score = pass_score last_result = result - # Use the improved candidate as seed for next pass current_seed = dict(candidate) else: print(f" No significant improvement in pass {pass_num} -- stopping early.") @@ -458,7 +401,7 @@ def optimize_skill( stc = SkillTestConfig() setup_mlflow(stc) with mlflow.start_run(run_name=f"{skill_name}_optimize_{preset}"): - mlflow.set_tags({"optimizer": "gepa", "skill_name": skill_name, "preset": preset, "mode": mode, "evaluator_type": evaluator_type}) + mlflow.set_tags({"optimizer": "gepa", "skill_name": skill_name, "preset": preset, "evaluator_type": "skillbench"}) mlflow.log_metrics({ "original_score": original_score, "optimized_score": optimized_score, @@ -472,10 +415,8 @@ def optimize_skill( except Exception: pass - # Capture final side_info for skillbench review output - final_si_by_id = None - if evaluator_type == "skillbench": - _, _, final_si_by_id, _ = _evaluate_on_tasks(evaluator, best, train) + # Capture final side_info for review output + _, _, final_si_by_id, _ = _evaluate_on_tasks(evaluator, best, train) return OptimizationResult( skill_name=skill_name, @@ -494,6 +435,6 @@ def optimize_skill( components=dict(best), original_components=dict(seed_candidate), tool_map=tool_map, - evaluator_type=evaluator_type, + evaluator_type="skillbench", skillbench_side_info=final_si_by_id, ) diff --git a/.test/src/skill_test/optimize/skillbench_evaluator.py b/.test/src/skill_test/optimize/skillbench_evaluator.py index 8ddc3290..322feb72 100644 --- a/.test/src/skill_test/optimize/skillbench_evaluator.py +++ 
b/.test/src/skill_test/optimize/skillbench_evaluator.py @@ -1,28 +1,19 @@ -"""SkillBench-inspired evaluator: measure skill effectiveness via WITH vs WITHOUT comparison. +"""SkillBench evaluator: measure skill effectiveness via WITH vs WITHOUT comparison. Evaluates skills by measuring agent performance WITH the skill vs WITHOUT it -on real tasks, then computing a skill effectiveness delta. This replaces the -5-layer weighted scoring with a 3-phase approach: +on real tasks. Uses MLflow judges as the primary scoring mechanism — judges +provide both scores AND rich rationale for GEPA's reflection LM. Phase 1: WITH-SKILL -- LLM generates response with SKILL.md in context Phase 2: WITHOUT-SKILL -- LLM generates response with NO skill (cached once) - Phase 3: COMPUTE -- binary pass/fail assertions on both, derive effectiveness - -Scoring weights (default / with --use-judges): - 45% / 35% Skill Effectiveness (delta: pass_rate_with - pass_rate_without) - 25% / 25% Absolute Quality (pass_rate_with_skill) - 0% / 10% Judge Quality (LLM judge pass rate — 6 judges, see below) - 5% / 5% Structure (syntax validity) - 25% / 25% Token Efficiency (smaller candidates score higher) - -LLM Judges (when --use-judges is enabled): - 1. RelevanceToQuery — does the response address the user's input? (always runs) - 2. Completeness — does the response fully answer all parts? (always runs) - 3. Correctness — are expected facts present? (requires expected_facts) - 4. Guidelines — does response follow per-test rules? (requires guidelines) - 5. ExpectationsGuidelines — combined facts+guidelines check (when both present) - 6. 
Custom skill judge — MemAlign-inspired domain judge with skill-specific - evaluation criteria extracted from ground_truth guidelines + Phase 3: JUDGE -- quality_judge scores both, effectiveness_judge compares + +Scoring weights: + 35% Skill Effectiveness (quality_with - quality_without) + 25% Absolute Quality (quality_with score from judge) + 10% Judge Effectiveness (effectiveness verdict) + 5% Structure (syntax validity) + 25% Token Efficiency (smaller candidates score higher) """ from __future__ import annotations @@ -38,8 +29,15 @@ from mlflow.entities import Feedback from ..scorers.universal import python_syntax, sql_syntax, no_hallucinated_apis -from .assertions import AssertionResult, run_all_assertions -from .asi import skillbench_to_asi +from .judges import ( + JudgeFeedback, + create_skill_quality_judge, + create_effectiveness_judge, + create_regression_judge, + run_judge_safe, + _safe_parse_score, +) +from .utils import count_tokens logger = logging.getLogger(__name__) @@ -50,13 +48,7 @@ def _prompt_hash(prompt: str) -> str: class _RateLimiter: - """Thread-safe token-bucket rate limiter for LLM API calls. - - Limits both concurrency (via semaphore) and request rate (via - minimum inter-call spacing). When --include-tools sends large - contexts to Opus, this prevents bursts that exceed - token-per-minute quotas. - """ + """Thread-safe token-bucket rate limiter for LLM API calls.""" def __init__(self, max_concurrent: int = 2, min_interval: float = 1.0): self._semaphore = threading.Semaphore(max_concurrent) @@ -82,17 +74,11 @@ def release(self) -> None: def _completion_with_backoff(*, max_retries: int = 6, **kwargs) -> Any: - """Call litellm.completion with explicit exponential backoff for rate limits. - - This is a safety net on top of litellm's built-in retries. litellm's - global num_retries handles most transient errors, but sustained - token-per-minute exhaustion on Opus can outlast them. This wrapper - adds longer waits between retry bursts. 
- """ + """Call litellm.completion with explicit exponential backoff for rate limits.""" last_err: Exception | None = None for attempt in range(max_retries + 1): if attempt > 0: - delay = min(2 ** attempt, 60) # 2, 4, 8, 16, 32, 60 + delay = min(2 ** attempt, 60) logger.warning( "Rate limited (attempt %d/%d), backing off %.0fs", attempt, max_retries, delay, @@ -122,7 +108,6 @@ def _run_structure_scorers(text: str) -> float: scores.append(1.0) elif fb.value == "no": scores.append(0.0) - # skip doesn't count elif isinstance(result, Feedback): if result.value == "yes": scores.append(1.0) @@ -133,23 +118,40 @@ def _run_structure_scorers(text: str) -> float: return sum(scores) / len(scores) if scores else 1.0 +def _effectiveness_score(verdict: str | float) -> float: + """Convert effectiveness verdict to numeric score for weighting.""" + if isinstance(verdict, (int, float)): + return max(0.0, min(1.0, float(verdict))) + v = str(verdict).strip().lower() + if v == "improved": + return 1.0 + elif v == "same": + return 0.5 + elif v == "regressed": + return 0.0 + # Fallback: try bool-like + if v in ("yes", "true"): + return 1.0 + if v in ("no", "false"): + return 0.0 + return 0.5 + + class SkillBenchEvaluator: - """GEPA-compatible evaluator using SkillBench WITH vs WITHOUT methodology. + """GEPA-compatible evaluator using judges for scoring + diagnostics. Args: - gen_model: LLM model for generating responses. Required -- no silent fallback. - scorer_config: Optional scorer config (unused, kept for interface compat). + gen_model: LLM model for generating responses. Required. original_token_counts: Token counts of original artifacts for efficiency scoring. token_budget: Hard token ceiling; candidates exceeding this are penalized. + skill_guidelines: Deduplicated guidelines from ground_truth.yaml for the quality judge. 
""" def __init__( self, gen_model: str, - scorer_config: dict[str, Any] | None = None, original_token_counts: dict[str, int] | None = None, token_budget: int | None = None, - use_judges: bool = False, skill_guidelines: list[str] | None = None, ): if not gen_model: @@ -158,19 +160,18 @@ def __init__( "Pass --gen-model or set GEPA_GEN_LM env var." ) self.gen_model = gen_model - self._baseline_cache: dict[str, list[AssertionResult]] = {} self._baseline_response_cache: dict[str, str] = {} self._original_token_counts = original_token_counts or {} self._total_original_tokens = sum(self._original_token_counts.values()) self._token_budget = token_budget - self._use_judges = use_judges - self._skill_guidelines = skill_guidelines or [] - def _generate_response(self, prompt: str, skill_context: str | None = None) -> str: - """Generate a response with or without skill context. + # Create judge instances + self._quality_judge = create_skill_quality_judge(skill_guidelines) + self._effectiveness_judge = create_effectiveness_judge() + self._regression_judge = create_regression_judge() - Uses temperature=0 for deterministic outputs. - """ + def _generate_response(self, prompt: str, skill_context: str | None = None) -> str: + """Generate a response with or without skill context.""" messages = [] if skill_context: messages.append({ @@ -190,202 +191,13 @@ def _generate_response(self, prompt: str, skill_context: str | None = None) -> s ) return resp.choices[0].message.content or "" - def _get_baseline( - self, prompt: str, expectations: dict[str, Any], - ) -> tuple[list[AssertionResult], str]: - """Get WITHOUT-skill baseline, computing once then caching. - - Returns: - Tuple of (assertion_results, raw_response). 
- """ + def _get_baseline_response(self, prompt: str) -> str: + """Get WITHOUT-skill baseline response, computing once then caching.""" key = _prompt_hash(prompt) - if key not in self._baseline_cache: + if key not in self._baseline_response_cache: response = self._generate_response(prompt, skill_context=None) self._baseline_response_cache[key] = response - self._baseline_cache[key] = run_all_assertions(response, expectations) - return self._baseline_cache[key], self._baseline_response_cache[key] - - def _run_llm_judges( - self, response: str, expectations: dict, prompt: str, reference: str, - ) -> list[dict]: - """Run MLflow LLM judges and return NL feedback dicts. - - Runs up to 6 judges for comprehensive evaluation: - 1. RelevanceToQuery — always (no ground truth needed) - 2. Completeness — always (no ground truth needed) - 3. Correctness — when expected_facts present - 4. Guidelines — when per-test guidelines present - 5. ExpectationsGuidelines — when BOTH facts + guidelines present - 6. Custom skill judge — MemAlign-inspired domain-specific judge - - Each judge returns {judge, passed, rationale} for GEPA reflection. - """ - results = [] - - inputs_dict = {"prompt": prompt} - outputs_dict = {"response": response} - - facts = expectations.get("expected_facts", []) - guidelines = expectations.get("guidelines", []) - - # 1. RelevanceToQuery: does response address the user's input? - # Always runs — catches off-topic or confused responses that binary - # assertions can't detect. - try: - from mlflow.genai.scorers import RelevanceToQuery - judge = RelevanceToQuery() - fb = judge(inputs=inputs_dict, outputs=outputs_dict) - results.append({ - "judge": "relevance", - "passed": fb.value == "yes", - "rationale": fb.rationale or "", - }) - except Exception as e: - logger.debug("RelevanceToQuery judge failed: %s", e) - - # 2. Completeness: does response fully answer all parts of the prompt? 
- # Always runs — catches partial answers where a response mentions the - # right topic but skips sub-questions. - try: - from mlflow.genai.scorers import Completeness - judge = Completeness() - fb = judge(inputs=inputs_dict, outputs=outputs_dict) - results.append({ - "judge": "completeness", - "passed": fb.value == "yes", - "rationale": fb.rationale or "", - }) - except Exception as e: - logger.debug("Completeness judge failed: %s", e) - - # 3. Correctness: are expected facts present in the response? - if facts and reference: - try: - from mlflow.genai.scorers import Correctness - judge = Correctness() - fb = judge( - inputs=inputs_dict, - outputs=outputs_dict, - expectations={"expected_facts": facts}, - ) - results.append({ - "judge": "correctness", - "passed": fb.value == "yes", - "rationale": fb.rationale or "", - }) - except Exception as e: - logger.debug("Correctness judge failed: %s", e) - - # 4. Guidelines: per-test custom evaluation rules - if guidelines: - try: - from mlflow.genai.scorers import Guidelines - judge = Guidelines( - name="skill_guidelines", guidelines=guidelines, - ) - fb = judge(inputs=inputs_dict, outputs=outputs_dict) - results.append({ - "judge": "guidelines", - "passed": fb.value == "yes", - "rationale": fb.rationale or "", - }) - except Exception as e: - logger.debug("Guidelines judge failed: %s", e) - - # 5. ExpectationsGuidelines: combined facts + guidelines in one pass. - # When both are available, this judge evaluates them together and - # produces a single holistic rationale — often more insightful than - # separate Correctness + Guidelines calls. 
- if facts and guidelines: - try: - from mlflow.genai.scorers import ExpectationsGuidelines - judge = ExpectationsGuidelines( - name="expectations_guidelines", - guidelines=guidelines, - ) - fb = judge( - inputs=inputs_dict, - outputs=outputs_dict, - expectations={"expected_facts": facts}, - ) - results.append({ - "judge": "expectations_guidelines", - "passed": fb.value == "yes", - "rationale": fb.rationale or "", - }) - except Exception as e: - logger.debug("ExpectationsGuidelines judge failed: %s", e) - - # 6. Custom skill judge (MemAlign-inspired): uses domain-specific - # evaluation principles extracted from the skill's ground_truth - # guidelines. This mimics MemAlign's "semantic memory" — a set of - # generalizable principles learned from labeled examples — without - # requiring MLflow trace alignment infrastructure. - if self._skill_guidelines: - try: - self._run_custom_skill_judge( - results, prompt, response, facts, - ) - except Exception as e: - logger.debug("Custom skill judge failed: %s", e) - - return results - - def _run_custom_skill_judge( - self, - results: list[dict], - prompt: str, - response: str, - facts: list[str], - ) -> None: - """Run a MemAlign-inspired custom judge with skill-domain principles. - - Uses ``mlflow.genai.judges.make_judge`` to create a domain-specific - judge whose instructions incorporate evaluation principles extracted - from all ground_truth.yaml guidelines across the skill's test cases. - - This is the "semantic memory" component of the MemAlign approach: - rather than aligning from traces, we extract and deduplicate the - skill's evaluation principles upfront and inject them as judge - instructions. - - Appends result dicts directly to ``results``. 
- """ - from mlflow.genai.judges import make_judge - - # Build instruction prompt from collected skill guidelines - principles = "\n".join( - f"- {g}" for g in self._skill_guidelines - ) - - judge = make_judge( - name="skill_domain_judge", - instructions=( - "You are an expert evaluator for a Databricks skill. " - "Evaluate whether the response correctly follows the " - "domain-specific principles below.\n\n" - "## Domain Principles (from skill evaluation criteria)\n" - f"{principles}\n\n" - "## Evaluation\n" - "Given the user question and response, determine if the " - "response adheres to the domain principles above. Focus on " - "technical accuracy, correct API usage, and completeness " - "of the domain-specific guidance.\n\n" - "Question: {{ inputs.prompt }}\n" - "Response: {{ outputs.response }}" - ), - feedback_value_type=bool, - ) - - fb = judge( - inputs={"prompt": prompt}, - outputs={"response": response}, - ) - results.append({ - "judge": "skill_domain", - "passed": fb.value == "yes" if isinstance(fb.value, str) else bool(fb.value), - "rationale": fb.rationale or "", - }) + return self._baseline_response_cache[key] def __call__( self, candidate: dict[str, str], example: dict, @@ -417,53 +229,81 @@ def __call__( except (json.JSONDecodeError, TypeError): pass - # If no prompt or no expectations, return minimal score if not prompt or not expectations: return 0.0, {"_error": "No prompt or expectations for this task"} - # Phase 1: WITH skill + tools + # Phase 1: Generate WITH-skill response with_response = self._generate_response(prompt, skill_context=full_context) - with_results = run_all_assertions(with_response, expectations) - - # Phase 2: WITHOUT skill (cached) - without_results, without_response = self._get_baseline(prompt, expectations) - - # Phase 3: Compute scores - total = len(with_results) - if total == 0: - return 0.0, {"_error": "No assertions to evaluate"} - - pass_with = sum(r.passed for r in with_results) / total - pass_without = sum(r.passed 
for r in without_results) / total - effectiveness = pass_with - pass_without - - # LLM judge scoring (optional) - judge_results = [] - if self._use_judges: - judge_results = self._run_llm_judges( - with_response, expectations, prompt, - reference=example.get("answer", ""), - ) - judge_pass_rate = 1.0 # default when judges disabled - if judge_results: - judge_pass_rate = sum(j["passed"] for j in judge_results) / len(judge_results) + # Phase 2: Generate WITHOUT-skill response (cached) + without_response = self._get_baseline_response(prompt) + + # Phase 3: Judge-driven scoring + facts = expectations.get("expected_facts", []) + patterns = expectations.get("expected_patterns", []) + guidelines = expectations.get("guidelines", []) + + # Build expectations string for judge templates + expectations_for_judge = { + "expected_facts": "\n".join(f"- {f}" for f in facts) if facts else "None specified", + "expected_patterns": "\n".join( + f"- {p}" if isinstance(p, str) else f"- {p.get('description', p.get('pattern', ''))}" + for p in patterns + ) if patterns else "None specified", + "guidelines": "\n".join(f"- {g}" for g in guidelines) if guidelines else "None specified", + } + + # Quality judge: score WITH response + quality_with_fb = run_judge_safe( + self._quality_judge, + inputs={"prompt": prompt}, + outputs={"response": with_response}, + expectations=expectations_for_judge, + name="quality_with", + ) + + # Quality judge: score WITHOUT response + quality_without_fb = run_judge_safe( + self._quality_judge, + inputs={"prompt": prompt}, + outputs={"response": without_response}, + expectations=expectations_for_judge, + name="quality_without", + ) + + # Effectiveness judge: compare both + effectiveness_fb = run_judge_safe( + self._effectiveness_judge, + inputs={ + "prompt": prompt, + "with_response": with_response, + "without_response": without_response, + }, + expectations={ + "expected_facts": "\n".join(f"- {f}" for f in facts) if facts else "None specified", + }, + 
name="effectiveness", + ) + + # Parse scores + score_with = _safe_parse_score(quality_with_fb.value) + score_without = _safe_parse_score(quality_without_fb.value) + effectiveness_delta = score_with - score_without + effectiveness_verdict = _effectiveness_score(effectiveness_fb.value) # Structure validation on the skill itself structure = _run_structure_scorers(skill_md) if skill_md else 1.0 # Token efficiency scoring - from .evaluator import count_tokens total_candidate_tokens = sum(count_tokens(v) for v in candidate.values()) if self._total_original_tokens > 0: ratio = total_candidate_tokens / self._total_original_tokens if ratio <= 1.0: - efficiency = 1.0 + 0.15 * (1.0 - ratio) # Bonus for smaller + efficiency = 1.0 + 0.15 * (1.0 - ratio) else: - efficiency = max(0.0, 2.0 - ratio) # Penalty for growth + efficiency = max(0.0, 2.0 - ratio) - # Hard penalty if over explicit budget if self._token_budget and total_candidate_tokens > self._token_budget: over_ratio = total_candidate_tokens / self._token_budget efficiency = min(efficiency, max(0.0, 2.0 - over_ratio)) @@ -471,64 +311,55 @@ def __call__( efficiency = 1.0 # Weighted final score - # When judges enabled: steal 10% from effectiveness for judge_quality - # When judges disabled: 10% folds back into effectiveness (original weights) - if self._use_judges and judge_results: - final_score = ( - 0.35 * max(0.0, effectiveness) - + 0.25 * pass_with - + 0.10 * judge_pass_rate - + 0.05 * structure - + 0.25 * efficiency - ) - else: - final_score = ( - 0.45 * max(0.0, effectiveness) - + 0.25 * pass_with - + 0.05 * structure - + 0.25 * efficiency - ) + final_score = ( + 0.35 * max(0.0, effectiveness_delta) + + 0.25 * score_with + + 0.10 * effectiveness_verdict + + 0.05 * structure + + 0.25 * efficiency + ) - # Build side info via skillbench_to_asi - score_breakdown = { - "skill_effectiveness": effectiveness, - "pass_rate_with": pass_with, - "pass_rate_without": pass_without, + # Build side info with FULL judge rationale 
(not truncated!) + reference_answer = example.get("answer", "") + + side_info: dict[str, Any] = {} + + # Task context + if prompt: + side_info["Task"] = prompt[:200] + + # Full judge feedback — the critical fix for GEPA optimization + side_info["Judge_quality_with"] = { + "score": score_with, + "rationale": quality_with_fb.rationale, + } + side_info["Judge_quality_without"] = { + "score": score_without, + "rationale": quality_without_fb.rationale, + } + side_info["Judge_effectiveness"] = { + "verdict": str(effectiveness_fb.value), + "rationale": effectiveness_fb.rationale, + } + + # Expected vs Actual for GEPA reflection + if reference_answer: + side_info["Expected"] = reference_answer[:500] + if with_response: + side_info["Actual"] = with_response[:500] + + # Score breakdown + side_info["scores"] = { + "quality_with": score_with, + "quality_without": score_without, + "skill_effectiveness": effectiveness_delta, + "effectiveness_verdict": effectiveness_verdict, "structure": structure, "token_efficiency": efficiency, "final": final_score, } - if self._use_judges and judge_results: - score_breakdown["judge_quality"] = judge_pass_rate - reference_answer = example.get("answer", "") - - side_info = skillbench_to_asi( - with_results, - without_results, - task_prompt=prompt, - scores=score_breakdown, - with_response=with_response, - without_response=without_response, - reference_answer=reference_answer or None, - candidate=candidate, - ) - - # Feed judge rationale into side_info for GEPA reflection - if judge_results: - failing_judges = [j for j in judge_results if not j["passed"]] - if failing_judges: - rationale_lines = [ - f"{j['judge']}: {j['rationale'][:200]}" for j in failing_judges - ] - side_info["Judge_feedback"] = "\n".join(rationale_lines) - - # Also route to skill_md_specific_info for component targeting - if "skill_md_specific_info" not in side_info: - side_info["skill_md_specific_info"] = {} - side_info["skill_md_specific_info"]["Judge_analysis"] = 
"\n".join(rationale_lines) - - # Add token counts to side_info for GEPA Pareto tracking + # Token counts for GEPA Pareto tracking side_info["token_counts"] = { "candidate_total": total_candidate_tokens, "original_total": self._total_original_tokens, @@ -536,15 +367,26 @@ def __call__( if self._token_budget: side_info["token_counts"]["budget"] = self._token_budget + # Derive diagnostic labels from judge verdicts for backward compat + if effectiveness_delta < -0.05: + side_info["Error"] = ( + f"REGRESSION: skill_effectiveness delta={effectiveness_delta:.2f} " + f"(with={score_with:.2f}, without={score_without:.2f})" + ) + side_info["skill_md_specific_info"] = { + "Regressions": effectiveness_fb.rationale, + } + elif score_with < 0.5: + side_info["Error"] = ( + f"NEEDS_SKILL: quality_with={score_with:.2f}, missing content. " + f"Judge: {quality_with_fb.rationale[:200]}" + ) + return final_score, side_info def _collect_skill_guidelines(skill_name: str) -> list[str]: - """Collect and deduplicate all guidelines from a skill's ground_truth.yaml. - - These form the "semantic memory" for the MemAlign-inspired custom judge. - Returns a deduplicated list of guideline strings. - """ + """Collect and deduplicate all guidelines from a skill's ground_truth.yaml.""" from pathlib import Path import yaml @@ -575,37 +417,31 @@ def create_skillbench_evaluator( gen_model: str, original_token_counts: dict[str, int] | None = None, token_budget: int | None = None, - use_judges: bool = False, ) -> Callable: """Factory for SkillBench-style evaluator. Returns a GEPA-compatible callable: (candidate, example) -> (score, side_info) - When ``use_judges`` is enabled, collects all guidelines from the skill's - ground_truth.yaml to build a MemAlign-inspired custom domain judge - alongside the standard MLflow predefined judges. + Judges are always enabled — they are the primary scoring mechanism. + Guidelines from ground_truth.yaml are incorporated into the quality judge. 
Args: skill_name: Name of the skill being evaluated. gen_model: LLM model for generating responses. Required. original_token_counts: Token counts of original artifacts for efficiency scoring. token_budget: Hard token ceiling; candidates exceeding this are penalized. - use_judges: Enable MLflow LLM judges for NL feedback. """ - skill_guidelines: list[str] = [] - if use_judges: - skill_guidelines = _collect_skill_guidelines(skill_name) - if skill_guidelines: - logger.info( - "Loaded %d domain guidelines for custom skill judge", - len(skill_guidelines), - ) + skill_guidelines = _collect_skill_guidelines(skill_name) + if skill_guidelines: + logger.info( + "Loaded %d domain guidelines for quality judge", + len(skill_guidelines), + ) return SkillBenchEvaluator( gen_model=gen_model, original_token_counts=original_token_counts, token_budget=token_budget, - use_judges=use_judges, skill_guidelines=skill_guidelines, ) @@ -617,14 +453,12 @@ def build_skillbench_background( baseline_scores: dict[str, float] | None = None, baseline_side_info: dict[str, dict] | None = None, token_budget: int | None = None, - use_judges: bool = False, ) -> str: """Build concise GEPA reflection context for SkillBench optimization. Kept short so GEPA's reflection LM spends its context on the per-example - diagnostics (Error/Expected/Actual) rather than methodology. + diagnostics (judge rationale) rather than methodology. """ - # Concise per-task baseline summary baseline_desc = "" if baseline_scores: mean_score = sum(baseline_scores.values()) / len(baseline_scores) @@ -655,43 +489,28 @@ def build_skillbench_background( tool_modules = [c.replace("tools_", "") for c in component_names if c.startswith("tools_")] components_desc = ( f"\nAlso optimizing MCP tool descriptions for: {', '.join(tool_modules)}. " - "Tool descriptions are included in the agent's context alongside the skill. " - "The agent uses them to decide which tools to call and how. 
" "Keep docstrings accurate and concise — every token counts toward the budget." ) - # Token efficiency guidance token_desc = ( f"\nTOKEN EFFICIENCY (25% of score): Current artifacts total {original_token_count:,} tokens. " - "Smaller candidates score HIGHER. Remove redundant examples, consolidate " - "overlapping sections, eliminate verbose explanations. Be ruthlessly concise." + "Smaller candidates score HIGHER. Be ruthlessly concise." ) if token_budget: token_desc += ( f"\nTOKEN BUDGET: {token_budget:,} tokens. Candidates exceeding this " - "are heavily penalized. Stay well under the budget." - ) - - judge_desc = "" - if use_judges: - judge_desc = ( - "\nLLM JUDGES (10% of score): Six judges provide natural-language feedback " - "in 'Judge_feedback': RelevanceToQuery (on-topic?), Completeness (fully " - "answered?), Correctness (facts present?), Guidelines (rules followed?), " - "ExpectationsGuidelines (combined holistic check), and a custom skill " - "domain judge (MemAlign-inspired, evaluates domain-specific principles). " - "Use judge rationale to understand WHY responses fail — it's more " - "actionable than binary NEEDS_SKILL/REGRESSION labels." + "are heavily penalized." ) return ( f"You are refining SKILL.md for '{skill_name}'.\n" - "The skill is scored by how much it HELPS an agent answer correctly.\n" - "Assertions labeled NEEDS_SKILL = add this content. REGRESSION = simplify or remove.\n" + "The skill is scored by MLflow judges that evaluate how much it HELPS an agent.\n" + "Judge rationale in side_info explains exactly WHAT failed and WHY.\n" + "Use Judge_quality_with to see missing facts/patterns.\n" + "Use Judge_effectiveness to see if the skill helped or hurt.\n" "Focus on: specific API syntax, version requirements, non-obvious patterns.\n" - "Do NOT add generic knowledge the agent already has (NEUTRAL assertions)." + "Do NOT add generic knowledge the agent already has." 
f"{baseline_desc}" f"{components_desc}" f"{token_desc}" - f"{judge_desc}" ) diff --git a/.test/src/skill_test/optimize/splitter.py b/.test/src/skill_test/optimize/splitter.py index 98f6c871..f567ca23 100644 --- a/.test/src/skill_test/optimize/splitter.py +++ b/.test/src/skill_test/optimize/splitter.py @@ -174,9 +174,9 @@ def generate_bootstrap_tasks(skill_name: str, base_path: Path | None = None) -> """ if base_path is None: # Find repo root for path resolution - from .evaluator import _find_repo_root + from .utils import find_repo_root - repo_root = _find_repo_root() + repo_root = find_repo_root() skill_md_candidates = [ repo_root / ".claude" / "skills" / skill_name / "SKILL.md", repo_root / "databricks-skills" / skill_name / "SKILL.md", diff --git a/.test/src/skill_test/optimize/tools.py b/.test/src/skill_test/optimize/tools.py index 29ef6c24..bc43d46c 100644 --- a/.test/src/skill_test/optimize/tools.py +++ b/.test/src/skill_test/optimize/tools.py @@ -35,9 +35,9 @@ def _find_tools_dir() -> Path: if MCP_TOOLS_DIR.exists(): return MCP_TOOLS_DIR # Fallback: search from repo root - from .evaluator import _find_repo_root + from .utils import find_repo_root - repo_root = _find_repo_root() + repo_root = find_repo_root() candidate = repo_root / "databricks-mcp-server" / "databricks_mcp_server" / "tools" if candidate.exists(): return candidate diff --git a/.test/src/skill_test/optimize/utils.py b/.test/src/skill_test/optimize/utils.py new file mode 100644 index 00000000..3cb5f89f --- /dev/null +++ b/.test/src/skill_test/optimize/utils.py @@ -0,0 +1,67 @@ +"""Shared utilities for skill optimization. + +Extracted from evaluator.py — provides path resolution, token counting, +and the SKILL_KEY constant used across the optimization package. 
+""" + +from pathlib import Path + +import tiktoken + +SKILL_KEY = "skill_md" + + +# --------------------------------------------------------------------------- +# Path utilities +# --------------------------------------------------------------------------- + +def find_repo_root() -> Path: + """Find the repo root by searching upward for .test/src/.""" + current = Path(__file__).resolve().parent + while current != current.parent: + if (current / ".test" / "src").exists(): + return current + if (current / "src" / "skill_test").exists() and current.name == ".test": + return current.parent + current = current.parent + return Path.cwd() + + +def find_skill_md(skill_name: str) -> Path | None: + """Locate the SKILL.md file for a given skill name.""" + repo_root = find_repo_root() + candidates = [ + repo_root / ".claude" / "skills" / skill_name / "SKILL.md", + repo_root / "databricks-skills" / skill_name / "SKILL.md", + ] + for p in candidates: + if p.exists(): + return p + return None + + +# --------------------------------------------------------------------------- +# Token utilities +# --------------------------------------------------------------------------- + +def count_tokens(text: str) -> int: + """Count tokens using cl100k_base encoding.""" + enc = tiktoken.get_encoding("cl100k_base") + return len(enc.encode(text)) + + +def token_efficiency_score(candidate_text: str, original_token_count: int) -> float: + """Score based on how concise the candidate is vs. the original. + + Smaller than original = bonus up to 1.15, same size = 1.0, + larger = linear penalty to 0.0 at 2x. 
+ """ + if original_token_count <= 0: + return 1.0 + enc = tiktoken.get_encoding("cl100k_base") + candidate_tokens = len(enc.encode(candidate_text)) + ratio = candidate_tokens / original_token_count + if ratio <= 1.0: + return 1.0 + 0.15 * (1.0 - ratio) + else: + return max(0.0, 2.0 - ratio) diff --git a/.test/src/skill_test/scorers/dynamic.py b/.test/src/skill_test/scorers/dynamic.py index 0c2e994e..adc8c164 100644 --- a/.test/src/skill_test/scorers/dynamic.py +++ b/.test/src/skill_test/scorers/dynamic.py @@ -1,4 +1,8 @@ -"""Dynamic scorers that use test case expectations from YAML config.""" +"""Dynamic scorers that use test case expectations from YAML config. + +DEPRECATED: For optimization, the quality judge in judges.py replaces this. +This module is kept for backward compatibility with runners/evaluate.py. +""" from mlflow.genai.scorers import scorer, Guidelines from mlflow.entities import Feedback diff --git a/.test/tests/test_optimize_e2e.py b/.test/tests/test_optimize_e2e.py index 355beba0..422a8828 100644 --- a/.test/tests/test_optimize_e2e.py +++ b/.test/tests/test_optimize_e2e.py @@ -11,7 +11,7 @@ import pytest -from skill_test.optimize.evaluator import token_efficiency_score, count_tokens, SKILL_KEY +from skill_test.optimize.utils import token_efficiency_score, count_tokens, SKILL_KEY from skill_test.optimize.splitter import create_gepa_datasets, generate_bootstrap_tasks, to_gepa_instances from skill_test.optimize.asi import feedback_to_score, feedback_to_asi @@ -39,8 +39,11 @@ def test_same_size_scores_one(self): tokens = count_tokens(text) assert token_efficiency_score(text, tokens) == 1.0 - def test_smaller_scores_one(self): - assert token_efficiency_score("short", 100) == 1.0 + def test_smaller_scores_bonus(self): + # Smaller than original gets a bonus (up to 1.15) + score = token_efficiency_score("short", 100) + assert score > 1.0 + assert score <= 1.15 def test_double_size_scores_zero(self): text = "word " * 200 diff --git 
a/databricks-builder-app/alembic/env.py b/databricks-builder-app/alembic/env.py index 07021cd5..0b8110db 100644 --- a/databricks-builder-app/alembic/env.py +++ b/databricks-builder-app/alembic/env.py @@ -150,6 +150,12 @@ def run_migrations_online(): """Run migrations in 'online' mode using sync engine.""" url, connect_args = get_url_and_connect_args() + # Get schema name from Alembic config or environment + schema_name = config.get_main_option('lakebase_schema_name') or os.environ.get('LAKEBASE_SCHEMA_NAME', 'builder_app') + + # Add search_path to connect_args so tables are created in the custom schema + connect_args.setdefault('options', f'-c search_path={schema_name},public') + connectable = create_engine( url, poolclass=pool.NullPool, @@ -157,6 +163,11 @@ def run_migrations_online(): ) with connectable.connect() as connection: + # Create the schema if it doesn't exist (SP has CREATE on the database) + from sqlalchemy import text + connection.execute(text(f'CREATE SCHEMA IF NOT EXISTS {schema_name}')) + connection.commit() + context.configure( connection=connection, target_metadata=target_metadata, diff --git a/databricks-builder-app/app.yaml b/databricks-builder-app/app.yaml index 94694545..a57d8526 100644 --- a/databricks-builder-app/app.yaml +++ b/databricks-builder-app/app.yaml @@ -55,6 +55,8 @@ env: value: "fe-shared-demo" - name: LAKEBASE_DATABASE_NAME value: "databricks_postgres" + - name: LAKEBASE_SCHEMA_NAME + value: "builder_app" # ============================================================================= # LLM Provider Configuration diff --git a/databricks-builder-app/scripts/deploy.sh b/databricks-builder-app/scripts/deploy.sh index 42651f9a..dc8cccc0 100755 --- a/databricks-builder-app/scripts/deploy.sh +++ b/databricks-builder-app/scripts/deploy.sh @@ -213,6 +213,11 @@ cp -r server "$STAGING_DIR/" cp app.yaml "$STAGING_DIR/" cp requirements.txt "$STAGING_DIR/" +# Copy Alembic migrations +echo " Copying Alembic migrations..." 
+cp alembic.ini "$STAGING_DIR/" +cp -r alembic "$STAGING_DIR/" + # Copy frontend build (server expects it at client/out/) echo " Copying frontend build..." mkdir -p "$STAGING_DIR/client" diff --git a/databricks-builder-app/server/db/database.py b/databricks-builder-app/server/db/database.py index 156e82dd..37bdba4c 100644 --- a/databricks-builder-app/server/db/database.py +++ b/databricks-builder-app/server/db/database.py @@ -375,6 +375,7 @@ def init_database(database_url: Optional[str] = None) -> AsyncEngine: # Connect args for psycopg3 with DNS workaround connect_args = { "sslmode": "require", + "options": f"-c search_path={os.environ.get('LAKEBASE_SCHEMA_NAME', 'builder_app')},public", } # Add hostaddr if DNS resolution was needed (bypasses Python's getaddrinfo) if _resolved_hostaddr: @@ -562,6 +563,10 @@ def run_migrations() -> None: if alembic_dir.exists(): alembic_cfg.set_main_option("script_location", str(alembic_dir)) + # Pass the schema name to Alembic env.py via config + schema_name = os.environ.get("LAKEBASE_SCHEMA_NAME", "builder_app") + alembic_cfg.set_main_option("lakebase_schema_name", schema_name) + command.upgrade(alembic_cfg, "head") logger.info("Database migrations completed") except Exception as e: diff --git a/databricks-builder-app/server/services/clusters.py b/databricks-builder-app/server/services/clusters.py index 74dad5cd..1295af1b 100644 --- a/databricks-builder-app/server/services/clusters.py +++ b/databricks-builder-app/server/services/clusters.py @@ -23,11 +23,14 @@ _cache_lock = Lock() +SERVERLESS_CLUSTER_ID = '__serverless__' + + def _fetch_clusters_sync(limit: int = 50, timeout: int = 15) -> list[dict]: """Synchronously fetch clusters from Databricks using SDK. - Returns clusters sorted by: running first, "shared" in name second, then alphabetically. - Filters out serverless clusters. 
+ Returns a "Serverless Compute" entry first (always), followed by real clusters + sorted by: running first, "shared" in name second, then alphabetically. Args: limit: Maximum number of clusters to return @@ -57,7 +60,17 @@ def sort_key(c): filtered_clusters.sort(key=sort_key) - return [ + # Build result with Serverless Compute as the first (default) entry + result = [ + { + 'cluster_id': SERVERLESS_CLUSTER_ID, + 'cluster_name': 'Serverless Compute', + 'state': 'RUNNING', + 'creator_user_name': None, + }, + ] + + result.extend( { 'cluster_id': c.cluster_id, 'cluster_name': c.cluster_name, @@ -65,7 +78,9 @@ def sort_key(c): 'creator_user_name': c.creator_user_name, } for c in filtered_clusters[:limit] - ] + ) + + return result async def _refresh_cache(timeout_seconds: int = 30) -> None: From c3c2772c1eaf26b2752f0d86c4eda26b6f0cfcdb Mon Sep 17 00:00:00 2001 From: CAholder Date: Mon, 2 Mar 2026 21:11:56 -0800 Subject: [PATCH 15/20] refactoring to use mlflow --- .../client/src/lib/types.ts | 1 + .../client/src/pages/ProjectPage.tsx | 3 +- .../server/services/clusters.py | 14 +++++++++- .../server/services/system_prompt.py | 12 +++++++- .../server/services/test_clusters.py | 19 +++++++++---- .../server/services/warehouses.py | 28 ++++++++++++------- 6 files changed, 58 insertions(+), 19 deletions(-) diff --git a/databricks-builder-app/client/src/lib/types.ts b/databricks-builder-app/client/src/lib/types.ts index 05f45511..efaf8cff 100644 --- a/databricks-builder-app/client/src/lib/types.ts +++ b/databricks-builder-app/client/src/lib/types.ts @@ -61,6 +61,7 @@ export interface Warehouse { state: string; cluster_size?: string | null; creator_name?: string | null; + is_serverless?: boolean; } /** Todo item from agent TodoWrite tool */ diff --git a/databricks-builder-app/client/src/pages/ProjectPage.tsx b/databricks-builder-app/client/src/pages/ProjectPage.tsx index 7b7ea193..0a59d91c 100644 --- a/databricks-builder-app/client/src/pages/ProjectPage.tsx +++ 
b/databricks-builder-app/client/src/pages/ProjectPage.tsx @@ -1161,7 +1161,8 @@ export default function ProjectPage() { } const cluster = clusters.find(c => c.cluster_id === selectedClusterId); if (cluster) { - chips.push({ label: cluster.cluster_name || 'Cluster', color: cluster.state === 'RUNNING' ? 'text-[var(--color-success)]' : 'text-[var(--color-text-muted)]' }); + const isServerless = cluster.cluster_id === '__serverless__'; + chips.push({ label: isServerless ? 'Serverless Compute' : (cluster.cluster_name || 'Cluster'), color: cluster.state === 'RUNNING' ? 'text-[var(--color-success)]' : 'text-[var(--color-text-muted)]' }); } const warehouse = warehouses.find(w => w.warehouse_id === selectedWarehouseId); if (warehouse) { diff --git a/databricks-builder-app/server/services/clusters.py b/databricks-builder-app/server/services/clusters.py index 1295af1b..7e3f24b4 100644 --- a/databricks-builder-app/server/services/clusters.py +++ b/databricks-builder-app/server/services/clusters.py @@ -157,4 +157,16 @@ async def list_clusters_async() -> list[dict]: # No cache - we must wait for the first fetch await _refresh_cache() - return _get_cached_clusters() or [] + result = _get_cached_clusters() + if result: + return result + + # Even if the API call failed, always return the serverless option + return [ + { + 'cluster_id': SERVERLESS_CLUSTER_ID, + 'cluster_name': 'Serverless Compute', + 'state': 'RUNNING', + 'creator_user_name': None, + }, + ] diff --git a/databricks-builder-app/server/services/system_prompt.py b/databricks-builder-app/server/services/system_prompt.py index 5b7b4fef..91f73f6b 100644 --- a/databricks-builder-app/server/services/system_prompt.py +++ b/databricks-builder-app/server/services/system_prompt.py @@ -107,7 +107,17 @@ def get_system_prompt( """ cluster_section = '' - if cluster_id: + if cluster_id == 'serverless' or cluster_id == '__serverless__': + cluster_section = """ +## Compute: Serverless + +You are configured to use **Databricks 
Serverless Compute** for code execution. + +When using `execute_databricks_command` or `run_python_file_on_databricks`: +- **Do NOT pass a cluster_id parameter** — serverless compute is used automatically when no cluster is specified. +- Serverless compute starts instantly with no cluster startup wait time. +""" + elif cluster_id: cluster_section = f""" ## Selected Cluster diff --git a/databricks-builder-app/server/services/test_clusters.py b/databricks-builder-app/server/services/test_clusters.py index a7cfc66a..80d949c9 100644 --- a/databricks-builder-app/server/services/test_clusters.py +++ b/databricks-builder-app/server/services/test_clusters.py @@ -24,17 +24,24 @@ def test_fetch_clusters_sync_performance(): def test_clusters_sorted_correctly(): - """Test that clusters are sorted: running first, shared second, then alphabetically.""" - from server.services.clusters import _fetch_clusters_sync + """Test that clusters are sorted: serverless first, then running, shared, then rest.""" + from server.services.clusters import _fetch_clusters_sync, SERVERLESS_CLUSTER_ID clusters = _fetch_clusters_sync(limit=50) - if len(clusters) < 2: - pytest.skip('Not enough clusters to test sorting') + # First entry should always be Serverless Compute + assert clusters[0]['cluster_id'] == SERVERLESS_CLUSTER_ID + assert clusters[0]['cluster_name'] == 'Serverless Compute' + + # Skip the synthetic serverless entry for sort order checking + real_clusters = [c for c in clusters if c['cluster_id'] != SERVERLESS_CLUSTER_ID] + + if len(real_clusters) < 2: + pytest.skip('Not enough real clusters to test sorting') - # Check running clusters come first + # Check running clusters come first among real clusters found_non_running = False - for c in clusters: + for c in real_clusters: if c['state'] != 'RUNNING': found_non_running = True elif found_non_running: diff --git a/databricks-builder-app/server/services/warehouses.py b/databricks-builder-app/server/services/warehouses.py index 
655b76a1..7922a7cf 100644 --- a/databricks-builder-app/server/services/warehouses.py +++ b/databricks-builder-app/server/services/warehouses.py @@ -25,10 +25,12 @@ def _fetch_warehouses_sync(limit: int = 50, timeout: int = 15) -> list[dict]: """Synchronously fetch SQL warehouses from Databricks using SDK. Returns warehouses sorted by priority: - 1. Running + "shared" in name (highest priority) - 2. Running (without "shared") - 3. Not running + "shared" in name - 4. Everything else + 1. Serverless + running (highest priority) + 2. Serverless + not running + 3. Running + "shared" in name + 4. Running (without "shared") + 5. Not running + "shared" in name + 6. Everything else Args: limit: Maximum number of warehouses to return @@ -42,19 +44,24 @@ def _fetch_warehouses_sync(limit: int = 50, timeout: int = 15) -> list[dict]: # Fetch warehouses warehouses = list(islice(client.warehouses.list(), limit * 2)) - # Sort by priority: running + shared > running > shared > rest + # Sort by priority: serverless first, then running + shared > running > shared > rest def sort_key(w): is_running = w.state == State.RUNNING if w.state else False is_shared = 'shared' in (w.name or '').lower() - # Priority: (running + shared) = 0, running = 1, shared = 2, other = 3 - if is_running and is_shared: + is_serverless = getattr(w, 'enable_serverless_compute', False) or False + # Serverless warehouses always come first + if is_serverless and is_running: priority = 0 - elif is_running: + elif is_serverless: priority = 1 - elif is_shared: + elif is_running and is_shared: priority = 2 - else: + elif is_running: priority = 3 + elif is_shared: + priority = 4 + else: + priority = 5 return priority warehouses.sort(key=sort_key) @@ -66,6 +73,7 @@ def sort_key(w): 'state': w.state.value if w.state else 'UNKNOWN', 'cluster_size': w.cluster_size, 'creator_name': w.creator_name, + 'is_serverless': getattr(w, 'enable_serverless_compute', False) or False, } for w in warehouses[:limit] ] From 
cb46f6aa397254837cf4357b1c9b51a839fe70ea Mon Sep 17 00:00:00 2001 From: CAholder Date: Tue, 3 Mar 2026 14:58:34 -0800 Subject: [PATCH 16/20] added mlflow judges and ability to optimize skills and tools independently. This is because tools are used universally so we may not be able to optimize the two together --- .test/README.md | 78 ++-- .test/scripts/optimize.py | 8 + .test/src/skill_test/optimize/config.py | 52 ++- .test/src/skill_test/optimize/judges.py | 334 ++++++++++++++---- .test/src/skill_test/optimize/runner.py | 146 ++++++-- .../optimize/skillbench_evaluator.py | 168 +++++---- .test/src/skill_test/optimize/splitter.py | 67 ++++ 7 files changed, 657 insertions(+), 196 deletions(-) diff --git a/.test/README.md b/.test/README.md index 92371239..42db3cbf 100644 --- a/.test/README.md +++ b/.test/README.md @@ -34,14 +34,15 @@ GEPA's reflection LM reads the `side_info` diagnostics, proposes mutations, eval ### MLflow Judges as the Evaluator -The evaluator uses [MLflow's `make_judge`](https://mlflow.org/docs/latest/llms/llm-evaluate/index.html) to score responses. Three judges replace the previous 6-judge + binary-assertion system: +The evaluator uses [MLflow's `make_judge`](https://mlflow.org/docs/latest/llms/llm-evaluate/index.html) to score responses. Two judges run by default during optimization: | Judge | What it does | Returns | |-------|-------------|---------| | **quality_judge** | Scores a single response against expected facts, patterns, and guidelines | `float` (0.0-1.0) + rationale | -| **effectiveness_judge** | Compares WITH-skill vs WITHOUT-skill responses | `"improved"` / `"same"` / `"regressed"` + rationale | | **regression_judge** | Identifies specific ways the skill harms responses | `bool` + rationale of what to fix | +Effectiveness is derived from the quality delta (`quality_with - quality_without`) — no separate LLM call needed. The `effectiveness_judge` is available in `judges.py` for standalone use but is not called during optimization. 
+ Each judge returns **full rationale** — not truncated — so GEPA's reflection LM sees exactly what failed and why: ```python @@ -59,9 +60,7 @@ side_info = { }, "Judge_effectiveness": { "verdict": "improved", - "rationale": "The skill successfully teaches WITH METRICS LANGUAGE YAML " - "syntax (+0.45 delta). However, the MEASURE() wrapping example " - "uses incorrect syntax that the model copies." + "delta": 0.45, } } ``` @@ -70,9 +69,8 @@ side_info = { | Weight | Dimension | Source | |--------|-----------|--------| -| **35%** | Skill Effectiveness | `quality_with - quality_without` (the delta) | -| **25%** | Absolute Quality | `quality_with` score from judge | -| **10%** | Judge Effectiveness | Effectiveness verdict score | +| **40%** | Skill Effectiveness | `quality_with - quality_without` (the delta) | +| **30%** | Absolute Quality | `quality_with` score from judge | | **5%** | Structure | Python/SQL syntax validation | | **25%** | Token Efficiency | Smaller = higher score (bonus up to 1.15x) | @@ -100,51 +98,50 @@ uv run python .test/scripts/optimize.py databricks-metric-views --preset quick - ## What Can Be Optimized -GEPA treats any text artifact as a candidate for optimization. The framework supports three types of artifacts, independently or together: +GEPA treats any text artifact as a candidate for optimization. Skills and tools are optimized **separately** to avoid cross-skill interference. + +### Skills (SKILL.md files) — default mode -### Skills (SKILL.md files) +SKILL.md files teach agents Databricks patterns — API syntax, code examples, best practices. Each skill is a standalone GEPA component (`skill_md`). Tool descriptions are loaded as **read-only context** — included in the generation prompt so the evaluator sees realistic agent behavior, but not mutated by GEPA. -SKILL.md files teach agents Databricks patterns — API syntax, code examples, best practices. Each skill is a standalone GEPA component (`skill_md`). 
+This means `--preset quick` always uses **1 component / 15 metric calls per pass**, regardless of how many tool modules exist. ```bash -# Optimize a skill +# Optimize a skill (tools loaded as read-only context) uv run python .test/scripts/optimize.py databricks-metric-views --preset quick # Optimize all skills that have test cases uv run python .test/scripts/optimize.py --all --preset quick ``` -### MCP Tool Descriptions +### MCP Tool Descriptions — `--tools-only` mode `@mcp.tool` docstrings in `databricks-mcp-server/` are what the agent sees when deciding which tool to call. Concise, accurate descriptions improve tool selection. Each tool module becomes a separate GEPA component (`tools_sql`, `tools_serving`, etc.). +Tool optimization uses a **cross-skill dataset** — tasks are sampled from all skills with `ground_truth.yaml` — so optimized docstrings work well across skills, not just one. + ```bash -# Optimize tool descriptions ONLY (no SKILL.md) -uv run python .test/scripts/optimize.py databricks-metric-views --tools-only --tool-modules sql +# Optimize tool descriptions with cross-skill evaluation +uv run python .test/scripts/optimize.py databricks-metric-views --tools-only -# Optimize specific tool modules +# Optimize specific tool modules only uv run python .test/scripts/optimize.py databricks-metric-views --tools-only --tool-modules sql serving compute ``` When applied (`--apply`), optimized docstrings are written back to the MCP server source files via AST, preserving all surrounding code. -### Skills + Tools Together +### Skills + Tools Together — `--include-tools` (advanced) -The most powerful mode: optimize the skill and its related tool descriptions in a single run. GEPA round-robins across all components, so each gets dedicated reflection and mutation budget. +For advanced use: optimize both skill and tool descriptions in a single GEPA run. Both are treated as GEPA components (round-robin mutation). Per-preset metric call caps prevent budget blowup. 
```bash # Skill + specific tool modules uv run python .test/scripts/optimize.py databricks-metric-views --include-tools --tool-modules sql -# Skill + ALL tool modules (auto-scaled budget) -uv run python .test/scripts/optimize.py databricks-metric-views --include-tools - # Dry run to see all components and their token counts uv run python .test/scripts/optimize.py databricks-metric-views --include-tools --dry-run ``` -By default, **tools are auto-included** even without `--include-tools` — tool descriptions are typically the largest token consumer (~17K tokens across 88 tools in 16 modules). Use `--tools-only` to skip the SKILL.md, or omit `--include-tools` to let the auto-include handle it. - Available tool modules: `agent_bricks`, `aibi_dashboards`, `apps`, `compute`, `file`, `genie`, `jobs`, `lakebase`, `manifest`, `pipelines`, `serving`, `sql`, `unity_catalog`, `user`, `vector_search`, `volume_files` --- @@ -220,10 +217,13 @@ uv run python .test/scripts/optimize.py databricks-metric-views --dry-run ``` === Dry Run: databricks-metric-views (skillbench) === SKILL.md path: databricks-skills/databricks-metric-views/SKILL.md -Components: ['skill_md', 'tools_sql', ...] -Total original tokens: 17,991 +Components: ['skill_md'] +Total original tokens: 1,234 + skill_md: 1,234 tokens +Tool context (read-only): 16,757 tokens Train tasks: 8 Evaluator: skillbench (judge-driven) +Preset: quick (max_metric_calls=15, scaled for 1 component(s)) Current score: 0.909 metric-views_create_sql_001: 0.952 metric-views_query_measure_002: 0.871 @@ -265,7 +265,7 @@ GEPA runs 15 iterations per component across up to 5 passes. Each iteration: Skill Effectiveness: 0.42 Quality (with): 0.78 Quality (without): 0.36 (baseline) - Tokens: 17,991 -> 18,265 (+1.5%) + Tokens: 1,234 -> 1,198 (-2.9%) Per-task: metric-views_create_sql_001 WITH 0.85 WITHOUT 0.35 delta +0.50 [OK] @@ -306,10 +306,11 @@ uv run python .test/scripts/optimize.py --preset thorough # 150 iterat --reflection-lm "..." 
# Override reflection model (default: databricks/databricks-claude-opus-4-6) --max-passes N # Max optimization passes (default: 5) --token-budget N # Hard token ceiling ---include-tools # Include MCP tool descriptions as components +--include-tools # Include MCP tool descriptions as GEPA components (advanced) --tool-modules sql ... # Specific tool modules to include ---tools-only # Optimize only tool descriptions +--tools-only # Optimize only tool descriptions (cross-skill evaluation) --all # Optimize all skills with ground_truth.yaml +--run-dir DIR # Directory for GEPA checkpoints (resumes if dir exists) # Test case generation --generate-from FILE # Generate test cases from requirements file @@ -328,6 +329,27 @@ Model strings use [litellm provider prefixes](https://docs.litellm.ai/docs/provi --- +## Resuming Long Runs + +GEPA saves optimization state to a run directory. If interrupted, resume from where you left off: + +```bash +# Start with checkpointing +uv run python .test/scripts/optimize.py databricks-metric-views \ + --preset standard --run-dir ./opt_runs/metric-views + +# Resume after interruption (same command) +uv run python .test/scripts/optimize.py databricks-metric-views \ + --preset standard --run-dir ./opt_runs/metric-views + +# Graceful stop (GEPA finishes current iteration then exits) +touch ./opt_runs/metric-views/pass_1/gepa.stop +``` + +Each pass gets its own subdirectory (`pass_1/`, `pass_2/`, ...) so checkpoints are isolated per pass. + +--- + ## Writing Test Cases Test cases in `ground_truth.yaml` define what each skill should teach. 
Minimal example: diff --git a/.test/scripts/optimize.py b/.test/scripts/optimize.py index 8feedd7b..622f7587 100644 --- a/.test/scripts/optimize.py +++ b/.test/scripts/optimize.py @@ -128,6 +128,12 @@ def main(): action="store_true", help="Use MemAlign to align judges with human feedback (requires alignment traces)", ) + parser.add_argument( + "--run-dir", + type=str, + default=None, + help="Directory for GEPA checkpoints. Resumes from last state if dir exists.", + ) parser.add_argument( "--generate-from", type=str, @@ -231,6 +237,7 @@ def main(): token_budget=args.token_budget, judge_model=args.judge_model, align=args.align, + run_dir=f"{args.run_dir}/{name}" if args.run_dir else None, ) review_optimization(result) if args.apply and not args.dry_run: @@ -267,6 +274,7 @@ def main(): token_budget=args.token_budget, judge_model=args.judge_model, align=args.align, + run_dir=args.run_dir, ) review_optimization(result) if args.apply and not args.dry_run: diff --git a/.test/src/skill_test/optimize/config.py b/.test/src/skill_test/optimize/config.py index 2aca7a6e..f9db04b8 100644 --- a/.test/src/skill_test/optimize/config.py +++ b/.test/src/skill_test/optimize/config.py @@ -90,6 +90,42 @@ def _register_litellm_models() -> None: "input_cost_per_token": 0, "output_cost_per_token": 0, }, + "databricks/databricks-gemini-3-1-pro": { + "max_tokens": 65_536, + "max_input_tokens": 1_048_576, + "max_output_tokens": 65_536, + "litellm_provider": "databricks", + "mode": "chat", + "input_cost_per_token": 0, + "output_cost_per_token": 0, + }, + "databricks/databricks-claude-opus-4-5": { + "max_tokens": 32_000, + "max_input_tokens": 200_000, + "max_output_tokens": 32_000, + "litellm_provider": "databricks", + "mode": "chat", + "input_cost_per_token": 0, + "output_cost_per_token": 0, + }, + "databricks/databricks-gpt-5": { + "max_tokens": 100_000, + "max_input_tokens": 1_048_576, + "max_output_tokens": 100_000, + "litellm_provider": "databricks", + "mode": "chat", + 
"input_cost_per_token": 0, + "output_cost_per_token": 0, + }, + "databricks/databricks-claude-sonnet-4-5": { + "max_tokens": 16_000, + "max_input_tokens": 200_000, + "max_output_tokens": 16_000, + "litellm_provider": "databricks", + "mode": "chat", + "input_cost_per_token": 0, + "output_cost_per_token": 0, + }, } for model_name, model_info in _models.items(): litellm.model_cost[model_name] = model_info @@ -136,6 +172,14 @@ def _register_litellm_models() -> None: "thorough": 150, } +# Per-preset caps: safety net so component scaling never exceeds a reasonable +# ceiling. Important for --tools-only mode which has many tool components. +PRESET_MAX_CALLS: dict[str, int] = { + "quick": 45, + "standard": 150, + "thorough": 300, +} + # Maximum total metric calls per pass to avoid runaway runtimes. # With many components, uncapped scaling (e.g., 50 * 17 = 850) can cause # multi-hour hangs with slower reflection models like Sonnet. @@ -289,8 +333,14 @@ def get_preset( # Apply explicit override if provided if max_metric_calls_override is not None: scaled_calls = max_metric_calls_override + else: + # Apply per-preset cap first (safety net for multi-component modes) + preset_cap = PRESET_MAX_CALLS[name] + if scaled_calls > preset_cap: + scaled_calls = preset_cap + # Cap for slower models to avoid multi-hour hangs - elif effective_lm not in _FAST_REFLECTION_MODELS and scaled_calls > MAX_METRIC_CALLS_PER_PASS: + if max_metric_calls_override is None and effective_lm not in _FAST_REFLECTION_MODELS and scaled_calls > MAX_METRIC_CALLS_PER_PASS: warnings.warn( f"Capping metric calls from {scaled_calls} to {MAX_METRIC_CALLS_PER_PASS} " f"for reflection model '{effective_lm}'. " diff --git a/.test/src/skill_test/optimize/judges.py b/.test/src/skill_test/optimize/judges.py index 10869c84..ab1d06ce 100644 --- a/.test/src/skill_test/optimize/judges.py +++ b/.test/src/skill_test/optimize/judges.py @@ -8,11 +8,28 @@ quality_judge — Scores a single response (0.0-1.0) against expectations. 
effectiveness_judge — Compares WITH vs WITHOUT responses, returns verdict. regression_judge — Identifies specific ways a skill harms responses. + +Judge model resolution (highest priority first): + 1. Explicit ``judge_model`` argument to factory functions + 2. ``GEPA_JUDGE_LM`` environment variable + 3. ``databricks:/databricks-claude-sonnet-4-6`` (default) + +Model fallback: + On rate limit errors (REQUEST_LIMIT_EXCEEDED), automatically retries with + fallback models. Configure via ``GEPA_FALLBACK_MODELS`` env var (comma-separated) + or use the built-in Databricks fallback chain. + +AI Gateway support: + Set ``DATABRICKS_AI_GATEWAY_URL`` to route calls through Databricks AI Gateway. + Example: https://1444828305810485.ai-gateway.cloud.databricks.com/mlflow/v1 + Works alongside the standard serving endpoint approach. """ from __future__ import annotations import logging +import os +import time from dataclasses import dataclass from typing import Any @@ -20,6 +37,176 @@ logger = logging.getLogger(__name__) +DEFAULT_JUDGE_LM = os.environ.get( + "GEPA_JUDGE_LM", "databricks:/databricks-claude-sonnet-4-6" +) + +# --------------------------------------------------------------------------- +# Fallback model chain for rate limit errors +# --------------------------------------------------------------------------- + +_DEFAULT_FALLBACK_MODELS = [ + "databricks/databricks-gpt-5-2", + "databricks/databricks-gemini-3-1-pro", + "databricks/databricks-claude-opus-4-5", + "databricks/databricks-gpt-5", + "databricks/databricks-claude-sonnet-4-6", + "databricks/databricks-claude-sonnet-4-5", +] + +def _get_fallback_models() -> list[str]: + """Get fallback model chain from env or defaults.""" + custom = os.environ.get("GEPA_FALLBACK_MODELS", "") + if custom.strip(): + return [m.strip() for m in custom.split(",") if m.strip()] + return list(_DEFAULT_FALLBACK_MODELS) + + +def _is_rate_limit_error(exc: Exception) -> bool: + """Check if an exception is a rate limit / request limit 
exceeded error."""
+    msg = str(exc).lower()
+    return any(phrase in msg for phrase in [
+        "rate_limit",
+        "rate limit",
+        "request_limit_exceeded",
+        "request limit exceeded",
+        "too many requests",
+        "429",
+        "per minute",  # was "token.*per.*minute" — a regex pattern that a plain substring check can never match
+    ])
+
+
+# ---------------------------------------------------------------------------
+# AI Gateway support
+# ---------------------------------------------------------------------------
+
+DATABRICKS_AI_GATEWAY_URL = os.environ.get("DATABRICKS_AI_GATEWAY_URL", "")
+
+
+def _get_gateway_base_url() -> str | None:
+    """Return the AI Gateway base URL if configured, else None."""
+    url = DATABRICKS_AI_GATEWAY_URL.strip()
+    if not url:
+        return None
+    return url.rstrip("/")
+
+
+def _to_litellm_model(model: str) -> tuple[str, str | None]:
+    """Convert a model string to (litellm_model, base_url) for completion calls.
+
+    If AI Gateway is configured and model is a databricks/ model, routes
+    through the gateway as an OpenAI-compatible endpoint. Otherwise returns
+    the model unchanged with no base_url override.
+
+    Returns:
+        (model_string, base_url_or_None)
+    """
+    gateway = _get_gateway_base_url()
+    if gateway and model.startswith("databricks/"):
+        # Route through AI Gateway as OpenAI-compatible endpoint
+        endpoint_name = model.split("/", 1)[1]
+        return f"openai/{endpoint_name}", gateway
+    return model, None
+
+
+# ---------------------------------------------------------------------------
+# URI conversion
+# ---------------------------------------------------------------------------
+
+def _to_judge_uri(model: str) -> str:
+    """Convert litellm-style model strings to MLflow judge URI format.
+
+    litellm uses ``provider/model`` (e.g. ``databricks/databricks-claude-sonnet-4-6``).
+    MLflow judges use ``provider:/model`` (e.g. ``databricks:/databricks-claude-sonnet-4-6``).
+ """ + if ":/" in model: + return model + if "/" in model: + provider, name = model.split("/", 1) + return f"{provider}:/{name}" + return model + + +def _judge_inference_params() -> dict[str, Any] | None: + """Build inference_params for make_judge if AI Gateway is configured.""" + gateway = _get_gateway_base_url() + if gateway: + return {"base_url": gateway} + return None + + +def _to_judge_model_and_params(model: str) -> tuple[str, dict[str, Any] | None]: + """Convert a model string to (judge_uri, inference_params) for make_judge. + + If AI Gateway is configured, uses ``openai:/endpoint-name`` with + ``inference_params.base_url`` pointing to the gateway. Otherwise + uses standard ``provider:/model`` format. + """ + gateway = _get_gateway_base_url() + if gateway and (model.startswith("databricks/") or model.startswith("databricks:/")): + # Extract the endpoint name + if ":/" in model: + endpoint_name = model.split(":/", 1)[1] + else: + endpoint_name = model.split("/", 1)[1] + return f"openai:/{endpoint_name}", {"base_url": gateway} + return _to_judge_uri(model), _judge_inference_params() + + +# --------------------------------------------------------------------------- +# Completion with fallback +# --------------------------------------------------------------------------- + +def completion_with_fallback(*, model: str, max_retries: int = 3, **kwargs) -> Any: + """Call litellm.completion with model fallback on rate limit errors. + + Tries the primary model first. On rate limit errors, cycles through + the fallback chain. Each model gets ``max_retries`` attempts with + exponential backoff before moving to the next. + + Also supports AI Gateway: if DATABRICKS_AI_GATEWAY_URL is set, + databricks/ models are routed through the gateway. 
+ """ + import litellm + + models_to_try = [model] + [ + m for m in _get_fallback_models() if m != model + ] + + last_err: Exception | None = None + for model_str in models_to_try: + litellm_model, base_url = _to_litellm_model(model_str) + + call_kwargs = dict(kwargs) + call_kwargs["model"] = litellm_model + if base_url: + call_kwargs["base_url"] = base_url + + for attempt in range(max_retries): + if attempt > 0: + delay = min(2 ** attempt, 30) + time.sleep(delay) + try: + return litellm.completion(**call_kwargs) + except Exception as e: + last_err = e + if _is_rate_limit_error(e): + if attempt == max_retries - 1: + logger.warning( + "Model '%s' rate limited after %d attempts, trying next fallback", + model_str, max_retries, + ) + continue + # Non-rate-limit error: don't retry, try next model + logger.warning("Model '%s' failed (non-rate-limit): %s", model_str, e) + break + + raise last_err # type: ignore[misc] + + +# --------------------------------------------------------------------------- +# Data types +# --------------------------------------------------------------------------- @dataclass class JudgeFeedback: @@ -69,20 +256,14 @@ def _safe_parse_score(raw_value: Any) -> float: 4. **Pattern adherence** (does the response follow expected code patterns?) 5. **API accuracy** (are function names, parameters, and syntax correct?) -## Expected Facts and Patterns - -{{ expectations.expected_facts }} - -{{ expectations.expected_patterns }} +## Expected Facts, Patterns, and Guidelines -## Skill-Specific Guidelines - -{{ expectations.guidelines }} +{{ expectations }} ## Input -Question: {{ inputs.prompt }} -Response: {{ outputs.response }} +Question: {{ inputs }} +Response: {{ outputs }} ## Instructions @@ -103,19 +284,15 @@ def _safe_parse_score(raw_value: Any) -> float: def create_skill_quality_judge( skill_guidelines: list[str] | None = None, + judge_model: str | None = None, ) -> Any: """Create a universal quality judge for scoring responses. 
- Uses ``make_judge`` with float output. Incorporates skill-specific - guidelines as "semantic memory" when available. - Args: skill_guidelines: Optional per-skill evaluation principles from ground_truth.yaml guidelines across all test cases. - - Returns: - A callable judge that accepts (inputs, outputs, expectations) and - returns an MLflow Feedback object with float value + rationale. + judge_model: LLM model for the judge. Defaults to GEPA_JUDGE_LM env + or databricks/databricks-claude-sonnet-4-6. """ instructions = _QUALITY_INSTRUCTIONS if skill_guidelines: @@ -125,10 +302,13 @@ def create_skill_quality_judge( f"{principles}\n" ) + model_uri, inference_params = _to_judge_model_and_params(judge_model or DEFAULT_JUDGE_LM) return make_judge( name="skill_quality", + model=model_uri, instructions=instructions, feedback_value_type=float, + inference_params=inference_params, ) @@ -140,24 +320,20 @@ def create_skill_quality_judge( You are comparing two responses to the same question to determine whether a skill document helped or hurt the agent's response quality. -## Context +The inputs contain three fields separated by markers: +- QUESTION: the user's question +- WITH-SKILL RESPONSE: generated with the skill document in context +- WITHOUT-SKILL RESPONSE: generated without any skill document -- **WITH-skill response**: Generated with the skill document in context. -- **WITHOUT-skill response**: Generated without any skill document. +The expectations contain the expected facts and patterns. 
-## Expected Information - -{{ expectations.expected_facts }} +## Inputs -## Input +{{ inputs }} -Question: {{ inputs.prompt }} - -WITH-skill response: -{{ inputs.with_response }} +## Expected Information -WITHOUT-skill response: -{{ inputs.without_response }} +{{ expectations }} ## Instructions @@ -181,17 +357,20 @@ def create_skill_quality_judge( """ -def create_effectiveness_judge() -> Any: +def create_effectiveness_judge(judge_model: str | None = None) -> Any: """Create a WITH vs WITHOUT comparison judge. - Returns a judge that evaluates whether the skill helped, hurt, or made - no difference. Returns Feedback with value in {"improved", "same", "regressed"} - and detailed rationale for GEPA. + Args: + judge_model: LLM model for the judge. Defaults to GEPA_JUDGE_LM env + or databricks/databricks-claude-sonnet-4-6. """ + model_uri, inference_params = _to_judge_model_and_params(judge_model or DEFAULT_JUDGE_LM) return make_judge( name="skill_effectiveness", + model=model_uri, instructions=_EFFECTIVENESS_INSTRUCTIONS, feedback_value_type=str, + inference_params=inference_params, ) @@ -203,20 +382,14 @@ def create_effectiveness_judge() -> Any: You are a regression detector for Databricks skill documents. Your job is to identify specific ways that a skill document HARMS agent responses. -## Context - -The skill document was added to an agent's context. Compare the agent's -response WITH the skill to the response WITHOUT it. 
+The inputs contain three fields separated by markers: +- QUESTION: the user's question +- WITH-SKILL RESPONSE: generated with the skill document in context +- WITHOUT-SKILL RESPONSE: generated without any skill document ## Input -Question: {{ inputs.prompt }} - -WITH-skill response: -{{ inputs.with_response }} - -WITHOUT-skill response: -{{ inputs.without_response }} +{{ inputs }} ## Instructions @@ -237,35 +410,40 @@ def create_effectiveness_judge() -> Any: """ -def create_regression_judge() -> Any: +def create_regression_judge(judge_model: str | None = None) -> Any: """Create a regression detection judge. - Returns structured feedback about what to REMOVE from the skill. - Rationale goes directly to GEPA's reflection LM for targeted fixes. + Args: + judge_model: LLM model for the judge. Defaults to GEPA_JUDGE_LM env + or databricks/databricks-claude-sonnet-4-6. """ + model_uri, inference_params = _to_judge_model_and_params(judge_model or DEFAULT_JUDGE_LM) return make_judge( name="skill_regression", + model=model_uri, instructions=_REGRESSION_INSTRUCTIONS, feedback_value_type=bool, + inference_params=inference_params, ) # --------------------------------------------------------------------------- -# Helper: run a judge safely and return structured feedback +# Helper: run a judge safely with fallback on rate limit # --------------------------------------------------------------------------- def run_judge_safe( judge: Any, *, - inputs: dict[str, Any], - outputs: dict[str, Any] | None = None, - expectations: dict[str, Any] | None = None, + inputs: Any, + outputs: Any | None = None, + expectations: Any | None = None, name: str = "judge", ) -> JudgeFeedback: - """Run a judge with error handling, return JudgeFeedback. + """Run a judge with error handling and model fallback. - Catches exceptions and returns a zero-score feedback with error rationale - so that evaluation never crashes from a judge failure. 
+ On rate limit errors, recreates the judge with fallback models and + retries. On other errors, returns zero-score feedback so evaluation + never crashes from a judge failure. """ kwargs: dict[str, Any] = {"inputs": inputs} if outputs is not None: @@ -273,6 +451,7 @@ def run_judge_safe( if expectations is not None: kwargs["expectations"] = expectations + # Try the primary judge first try: fb = judge(**kwargs) return JudgeFeedback( @@ -281,9 +460,42 @@ def run_judge_safe( name=name, ) except Exception as e: - logger.warning("Judge '%s' failed: %s", name, e) - return JudgeFeedback( - value=0.0, - rationale=f"Judge error: {e}", - name=name, - ) + if not _is_rate_limit_error(e): + logger.warning("Judge '%s' failed: %s", name, e) + return JudgeFeedback(value=0.0, rationale=f"Judge error: {e}", name=name) + + # Rate limit hit — try fallback models + logger.warning("Judge '%s' rate limited, trying fallback models", name) + fallbacks = _get_fallback_models() + + for fallback_model in fallbacks: + model_uri, inference_params = _to_judge_model_and_params(fallback_model) + try: + fallback_judge = make_judge( + name=judge.name, + model=model_uri, + instructions=judge._instructions, + feedback_value_type=judge._feedback_value_type, + inference_params=inference_params, + ) + fb = fallback_judge(**kwargs) + logger.info("Judge '%s' succeeded with fallback model '%s'", name, fallback_model) + return JudgeFeedback( + value=fb.value, + rationale=fb.rationale or "", + name=name, + ) + except Exception as fallback_err: + if _is_rate_limit_error(fallback_err): + logger.warning("Fallback '%s' also rate limited, trying next", fallback_model) + continue + logger.warning("Fallback '%s' failed: %s", fallback_model, fallback_err) + continue + + # All fallbacks exhausted + logger.error("Judge '%s': all models rate limited", name) + return JudgeFeedback( + value=0.0, + rationale="All models rate limited — no judge score available", + name=name, + ) diff --git 
a/.test/src/skill_test/optimize/runner.py b/.test/src/skill_test/optimize/runner.py index 8ec474ad..b2be0b05 100644 --- a/.test/src/skill_test/optimize/runner.py +++ b/.test/src/skill_test/optimize/runner.py @@ -21,7 +21,7 @@ create_skillbench_evaluator, build_skillbench_background, ) -from .splitter import create_gepa_datasets, generate_bootstrap_tasks, to_gepa_instances +from .splitter import create_gepa_datasets, generate_bootstrap_tasks, to_gepa_instances, create_cross_skill_dataset from .tools import ( extract_tool_descriptions, tools_to_gepa_components, @@ -85,23 +85,28 @@ def _compute_diff_summary(original: str, optimized: str) -> str: return summary -def _evaluate_on_tasks(evaluator, candidate, tasks): +def _evaluate_on_tasks(evaluator, candidate, tasks, label: str = "Evaluating"): """Run evaluator on tasks and return mean score, per-task scores, and per-task side_info. Returns: (mean_score, per_task_scores, side_info_by_id, side_info_by_input) """ + import sys + gepa_instances = to_gepa_instances(tasks) + total = len(gepa_instances) per_task = {} side_info_by_id = {} side_info_by_input = {} for i, inst in enumerate(gepa_instances): - score, side_info = evaluator(candidate, inst) task_id = tasks[i].get("id", f"task_{i}") + print(f"\r {label}: {i + 1}/{total} ({task_id})...", end="", flush=True) + score, side_info = evaluator(candidate, inst) per_task[task_id] = score side_info_by_id[task_id] = side_info side_info_by_input[inst.get("input", f"task_{i}")] = side_info mean = sum(per_task.values()) / len(per_task) if per_task else 0.0 + print(f"\r {label}: {total}/{total} done. 
Mean: {mean:.3f} ") return mean, per_task, side_info_by_id, side_info_by_input @@ -119,6 +124,7 @@ def optimize_skill( token_budget: int | None = None, judge_model: str | None = None, align: bool = False, + run_dir: str | None = None, # Deprecated params kept for backward compat mode: str = "static", task_lm: str | None = None, @@ -145,6 +151,7 @@ def optimize_skill( token_budget: Hard token ceiling judge_model: Override judge model (future use) align: Use MemAlign alignment (future use) + run_dir: Directory for GEPA checkpoints. Resumes from last state if dir exists. """ # 1. Load SKILL.md skill_path = find_skill_md(skill_name) @@ -156,48 +163,71 @@ def optimize_skill( # 1b. Load MCP tool descriptions tool_map = None tool_components: dict[str, str] = {} - if include_tools or tools_only: + tool_context_str: str | None = None + + # Always load tool descriptions for context + try: tool_map = extract_tool_descriptions(modules=tool_modules) tool_components = tools_to_gepa_components(tool_map, per_module=True) stats = get_tool_stats() print(f"Tool modules: {stats['modules']}, tools: {stats['total_tools']}, " f"description chars: {stats['total_description_chars']:,}") + except FileNotFoundError: + pass # No MCP tools directory — skip + + # Build read-only tool context string (for skill optimization) + if tool_components: + tool_context_str = "\n\n".join( + tool_components[k] for k in sorted(tool_components) + ) # 2. 
Build seed_candidate (multi-component dict) seed_candidate: dict[str, str] = {} original_token_counts: dict[str, int] = {} - if not tools_only: + if tools_only: + # Tools-only mode: tool descriptions ARE the GEPA components + for comp_name, comp_text in tool_components.items(): + seed_candidate[comp_name] = comp_text + original_token_counts[comp_name] = count_tokens(comp_text) + tool_context_str = None # tools are in candidate, not read-only context + elif include_tools: + # Explicit --include-tools: both skill and tools are GEPA components seed_candidate[SKILL_KEY] = original_content original_token_counts[SKILL_KEY] = count_tokens(original_content) - - for comp_name, comp_text in tool_components.items(): - seed_candidate[comp_name] = comp_text - original_token_counts[comp_name] = count_tokens(comp_text) - - total_original_tokens = sum(original_token_counts.values()) - - # Auto-include tools for SkillBench - if not tools_only and not include_tools and not tool_components: - include_tools = True - tool_map = extract_tool_descriptions(modules=tool_modules) - tool_components = tools_to_gepa_components(tool_map, per_module=True) - stats = get_tool_stats() - print(f"[SkillBench] Auto-including tools: {stats['modules']} modules, " - f"{stats['total_tools']} tools, {stats['total_description_chars']:,} chars") for comp_name, comp_text in tool_components.items(): seed_candidate[comp_name] = comp_text original_token_counts[comp_name] = count_tokens(comp_text) - total_original_tokens = sum(original_token_counts.values()) + tool_context_str = None # tools are in candidate, not read-only context + else: + # Default: skill is the only GEPA component; tools are read-only context + seed_candidate[SKILL_KEY] = original_content + original_token_counts[SKILL_KEY] = count_tokens(original_content) + + total_original_tokens = sum(original_token_counts.values()) # Resolve token budget token_budget = token_budget or DEFAULT_TOKEN_BUDGET # 3. 
Load datasets - try: - train, val = create_gepa_datasets(skill_name) - except FileNotFoundError: - train, val = [], None + if tools_only: + # Cross-skill dataset for tool optimization + train = create_cross_skill_dataset(max_per_skill=5) + val = None + if train: + source_skills = {t.get("metadata", {}).get("source_skill", "?") for t in train} + print(f"Cross-skill dataset: {len(train)} tasks from {len(source_skills)} skill(s)") + else: + # Fall back to single-skill dataset + try: + train, val = create_gepa_datasets(skill_name) + except FileNotFoundError: + train, val = [], None + else: + try: + train, val = create_gepa_datasets(skill_name) + except FileNotFoundError: + train, val = [], None if not train: train = generate_bootstrap_tasks(skill_name) @@ -208,6 +238,10 @@ def optimize_skill( effective_gen_model = gen_model or task_lm or DEFAULT_GEN_LM if effective_gen_model: print(f"Generation model: {effective_gen_model}") + + from .judges import DEFAULT_JUDGE_LM + effective_judge_model = judge_model or DEFAULT_JUDGE_LM + print(f"Judge model: {effective_judge_model}") print("Evaluator: skillbench (judge-driven)") if not effective_gen_model: @@ -220,6 +254,8 @@ def optimize_skill( gen_model=effective_gen_model, original_token_counts=original_token_counts, token_budget=token_budget, + judge_model=judge_model, + tool_context=tool_context_str, ) # 5. Get config (scaled by component count) @@ -237,6 +273,42 @@ def optimize_skill( config.reflection.reflection_lm, total_original_tokens, ) + # 5c. Replace GEPA's reflection_lm string with a fallback-aware callable. + # GEPA internally calls make_litellm_lm() which wraps litellm.completion + # with no fallback. We pre-convert it so GEPA uses our version with + # model fallback on rate limit errors. 
+ from .judges import completion_with_fallback + _reflection_model_name = config.reflection.reflection_lm or "" + if isinstance(config.reflection.reflection_lm, str): + def _reflection_lm_with_fallback(prompt): + if isinstance(prompt, str): + messages = [{"role": "user", "content": prompt}] + else: + messages = prompt + result = completion_with_fallback( + model=_reflection_model_name, + messages=messages, + ) + return result.choices[0].message.content + + config.reflection.reflection_lm = _reflection_lm_with_fallback + + # Same for refiner_lm if present + if config.refiner is not None and isinstance(config.refiner.refiner_lm, str): + _refiner_model_name = config.refiner.refiner_lm + def _refiner_lm_with_fallback(prompt): + if isinstance(prompt, str): + messages = [{"role": "user", "content": prompt}] + else: + messages = prompt + result = completion_with_fallback( + model=_refiner_model_name, + messages=messages, + ) + return result.choices[0].message.content + + config.refiner.refiner_lm = _refiner_lm_with_fallback + # Dry run if dry_run: print(f"\n=== Dry Run: {skill_name} (skillbench) ===") @@ -246,16 +318,21 @@ def optimize_skill( print(f"Total original tokens: {total_original_tokens:,}") for comp, tokens in original_token_counts.items(): print(f" {comp}: {tokens:,} tokens") + if tool_context_str: + print(f"Tool context (read-only): {count_tokens(tool_context_str):,} tokens") print(f"Train tasks: {len(train)}") print(f"Val tasks: {len(val) if val else 'None (single-task mode)'}") print(f"Generation model: {effective_gen_model}") print(f"Preset: {preset} (max_metric_calls={config.engine.max_metric_calls}, " f"scaled for {num_components} component(s))") print(f"Max passes: {max_passes}") + if run_dir: + print(f"Run dir: {run_dir}") print(f"Reflection LM: {config.reflection.reflection_lm}") + print(f"\nScoring baseline ({len(train)} tasks, ~5 LLM calls each)...") original_score, original_per_task, si_by_id, _ = _evaluate_on_tasks( - evaluator, seed_candidate, 
train + evaluator, seed_candidate, train, label="Baseline" ) print(f"Current score: {original_score:.3f}") for task_id, score in original_per_task.items(): @@ -292,8 +369,9 @@ def optimize_skill( ) # Evaluate original and capture per-task detail for baseline context + print(f"\nScoring baseline ({len(train)} tasks, ~5 LLM calls each)...") original_score, original_per_task, si_by_id, si_by_input = _evaluate_on_tasks( - evaluator, seed_candidate, train + evaluator, seed_candidate, train, label="Baseline" ) # 6. Build background and objective @@ -327,9 +405,11 @@ def optimize_skill( print(f"\n Starting multi-pass optimization (up to {max_passes} passes, " f"{num_components} component(s), {config.engine.max_metric_calls} metric calls/pass)") + # estimate_pass_duration expects the model name string, not the callable + _est_reflection_lm = _reflection_model_name if _reflection_model_name else str(reflection_lm or DEFAULT_GEN_LM) est_secs = estimate_pass_duration( config.engine.max_metric_calls, - config.reflection.reflection_lm, + _est_reflection_lm, total_original_tokens, num_dataset_examples=len(train), ) @@ -343,6 +423,10 @@ def optimize_skill( pass_config = copy.deepcopy(config) + # Set per-pass checkpoint directory + if run_dir: + pass_config.engine.run_dir = f"{run_dir}/pass_{pass_num}" + result = optimize_anything( seed_candidate=current_seed, evaluator=evaluator, @@ -355,7 +439,7 @@ def optimize_skill( total_metric_calls += result.total_metric_calls or 0 candidate = result.best_candidate - pass_score, _, _, _ = _evaluate_on_tasks(evaluator, candidate, train) + pass_score, _, _, _ = _evaluate_on_tasks(evaluator, candidate, train, label=f"Pass {pass_num}") improvement = pass_score - best_score print(f" Pass {pass_num} score: {pass_score:.4f} " @@ -385,7 +469,7 @@ def optimize_skill( val_scores: dict[str, float] = {} if val: - _, val_scores, _, _ = _evaluate_on_tasks(evaluator, best, val) + _, val_scores, _, _ = _evaluate_on_tasks(evaluator, best, val, 
label="Validation") token_reduction_pct = ( (total_original_tokens - optimized_token_count) / total_original_tokens * 100 @@ -416,7 +500,7 @@ def optimize_skill( pass # Capture final side_info for review output - _, _, final_si_by_id, _ = _evaluate_on_tasks(evaluator, best, train) + _, _, final_si_by_id, _ = _evaluate_on_tasks(evaluator, best, train, label="Final eval") return OptimizationResult( skill_name=skill_name, diff --git a/.test/src/skill_test/optimize/skillbench_evaluator.py b/.test/src/skill_test/optimize/skillbench_evaluator.py index 322feb72..fd090d2a 100644 --- a/.test/src/skill_test/optimize/skillbench_evaluator.py +++ b/.test/src/skill_test/optimize/skillbench_evaluator.py @@ -6,12 +6,11 @@ Phase 1: WITH-SKILL -- LLM generates response with SKILL.md in context Phase 2: WITHOUT-SKILL -- LLM generates response with NO skill (cached once) - Phase 3: JUDGE -- quality_judge scores both, effectiveness_judge compares + Phase 3: JUDGE -- quality_judge scores both, effectiveness derived from delta Scoring weights: - 35% Skill Effectiveness (quality_with - quality_without) - 25% Absolute Quality (quality_with score from judge) - 10% Judge Effectiveness (effectiveness verdict) + 40% Skill Effectiveness (quality_with - quality_without delta) + 30% Absolute Quality (quality_with score from judge) 5% Structure (syntax validity) 25% Token Efficiency (smaller candidates score higher) """ @@ -25,17 +24,16 @@ import time from typing import Any, Callable -import litellm from mlflow.entities import Feedback from ..scorers.universal import python_syntax, sql_syntax, no_hallucinated_apis from .judges import ( JudgeFeedback, create_skill_quality_judge, - create_effectiveness_judge, create_regression_judge, run_judge_safe, _safe_parse_score, + completion_with_fallback, ) from .utils import count_tokens @@ -70,29 +68,22 @@ def release(self) -> None: # Module-level rate limiter shared across evaluator instances. 
-_rate_limiter = _RateLimiter(max_concurrent=2, min_interval=1.0) - - -def _completion_with_backoff(*, max_retries: int = 6, **kwargs) -> Any: - """Call litellm.completion with explicit exponential backoff for rate limits.""" - last_err: Exception | None = None - for attempt in range(max_retries + 1): - if attempt > 0: - delay = min(2 ** attempt, 60) - logger.warning( - "Rate limited (attempt %d/%d), backing off %.0fs", - attempt, max_retries, delay, - ) - time.sleep(delay) - _rate_limiter.acquire() - try: - result = litellm.completion(**kwargs) - return result - except litellm.RateLimitError as e: - last_err = e - finally: - _rate_limiter.release() - raise last_err # type: ignore[misc] +_rate_limiter = _RateLimiter(max_concurrent=4, min_interval=0.2) + + +def _completion_with_backoff(*, max_retries: int = 3, **kwargs) -> Any: + """Call litellm.completion with rate limiting and model fallback. + + Uses the centralized completion_with_fallback which handles: + - Rate limit errors with exponential backoff + - Model fallback chain on persistent rate limits + - AI Gateway routing when configured + """ + _rate_limiter.acquire() + try: + return completion_with_fallback(max_retries=max_retries, **kwargs) + finally: + _rate_limiter.release() def _run_structure_scorers(text: str) -> float: @@ -145,6 +136,8 @@ class SkillBenchEvaluator: original_token_counts: Token counts of original artifacts for efficiency scoring. token_budget: Hard token ceiling; candidates exceeding this are penalized. skill_guidelines: Deduplicated guidelines from ground_truth.yaml for the quality judge. + judge_model: LLM model for judges. Defaults to GEPA_JUDGE_LM env + or databricks/databricks-claude-sonnet-4-6. 
""" def __init__( @@ -153,6 +146,8 @@ def __init__( original_token_counts: dict[str, int] | None = None, token_budget: int | None = None, skill_guidelines: list[str] | None = None, + judge_model: str | None = None, + tool_context: str | None = None, ): if not gen_model: raise ValueError( @@ -161,14 +156,15 @@ def __init__( ) self.gen_model = gen_model self._baseline_response_cache: dict[str, str] = {} + self._baseline_judge_cache: dict[str, JudgeFeedback] = {} self._original_token_counts = original_token_counts or {} self._total_original_tokens = sum(self._original_token_counts.values()) self._token_budget = token_budget + self._tool_context = tool_context or "" - # Create judge instances - self._quality_judge = create_skill_quality_judge(skill_guidelines) - self._effectiveness_judge = create_effectiveness_judge() - self._regression_judge = create_regression_judge() + # Create judge instances with configurable model + self._quality_judge = create_skill_quality_judge(skill_guidelines, judge_model=judge_model) + self._regression_judge = create_regression_judge(judge_model=judge_model) def _generate_response(self, prompt: str, skill_context: str | None = None) -> str: """Generate a response with or without skill context.""" @@ -208,7 +204,9 @@ def __call__( """ skill_md = candidate.get("skill_md", "") - # Build combined context: skill + tool descriptions + # Build combined context: skill + read-only tool descriptions + # During skill optimization, tools come from self._tool_context (read-only). + # During tool optimization, tools come from candidate keys (optimizable). 
tool_parts = [] for key in sorted(candidate): if key.startswith("tools_"): @@ -217,6 +215,8 @@ def __call__( full_context = skill_md if tool_parts: full_context += "\n\n## Available MCP Tools\n\n" + "\n\n".join(tool_parts) + elif self._tool_context: + full_context += "\n\n## Available MCP Tools\n\n" + self._tool_context prompt = example.get("input", "") @@ -243,53 +243,59 @@ def __call__( patterns = expectations.get("expected_patterns", []) guidelines = expectations.get("guidelines", []) - # Build expectations string for judge templates - expectations_for_judge = { - "expected_facts": "\n".join(f"- {f}" for f in facts) if facts else "None specified", - "expected_patterns": "\n".join( - f"- {p}" if isinstance(p, str) else f"- {p.get('description', p.get('pattern', ''))}" - for p in patterns - ) if patterns else "None specified", - "guidelines": "\n".join(f"- {g}" for g in guidelines) if guidelines else "None specified", - } + # Build flat strings for judge templates — make_judge only supports + # top-level {{ inputs }}, {{ outputs }}, {{ expectations }} variables. + facts_str = "\n".join(f"- {f}" for f in facts) if facts else "None specified" + patterns_str = "\n".join( + f"- {p}" if isinstance(p, str) else f"- {p.get('description', p.get('pattern', ''))}" + for p in patterns + ) if patterns else "None specified" + guidelines_str = "\n".join(f"- {g}" for g in guidelines) if guidelines else "None specified" + + expectations_text = ( + f"Expected facts:\n{facts_str}\n\n" + f"Expected patterns:\n{patterns_str}\n\n" + f"Guidelines:\n{guidelines_str}" + ) + + # make_judge requires expectations as dict, inputs/outputs as Any. + # The template renders {{ expectations }} as the dict's string repr, + # so we pack our formatted text into a single-key dict. 
+ expectations_dict = {"criteria": expectations_text} # Quality judge: score WITH response quality_with_fb = run_judge_safe( self._quality_judge, - inputs={"prompt": prompt}, - outputs={"response": with_response}, - expectations=expectations_for_judge, + inputs=prompt, + outputs=with_response, + expectations=expectations_dict, name="quality_with", ) - # Quality judge: score WITHOUT response - quality_without_fb = run_judge_safe( - self._quality_judge, - inputs={"prompt": prompt}, - outputs={"response": without_response}, - expectations=expectations_for_judge, - name="quality_without", - ) - - # Effectiveness judge: compare both - effectiveness_fb = run_judge_safe( - self._effectiveness_judge, - inputs={ - "prompt": prompt, - "with_response": with_response, - "without_response": without_response, - }, - expectations={ - "expected_facts": "\n".join(f"- {f}" for f in facts) if facts else "None specified", - }, - name="effectiveness", - ) + # Quality judge: score WITHOUT response (cached — baseline never changes) + baseline_key = _prompt_hash(prompt) + if baseline_key not in self._baseline_judge_cache: + self._baseline_judge_cache[baseline_key] = run_judge_safe( + self._quality_judge, + inputs=prompt, + outputs=without_response, + expectations=expectations_dict, + name="quality_without", + ) + quality_without_fb = self._baseline_judge_cache[baseline_key] # Parse scores score_with = _safe_parse_score(quality_with_fb.value) score_without = _safe_parse_score(quality_without_fb.value) effectiveness_delta = score_with - score_without - effectiveness_verdict = _effectiveness_score(effectiveness_fb.value) + + # Derive effectiveness verdict from quality delta (no LLM call needed) + if effectiveness_delta > 0.05: + effectiveness_verdict = 1.0 # improved + elif effectiveness_delta < -0.05: + effectiveness_verdict = 0.0 # regressed + else: + effectiveness_verdict = 0.5 # same # Structure validation on the skill itself structure = _run_structure_scorers(skill_md) if skill_md else 
1.0 @@ -312,9 +318,8 @@ def __call__( # Weighted final score final_score = ( - 0.35 * max(0.0, effectiveness_delta) - + 0.25 * score_with - + 0.10 * effectiveness_verdict + 0.40 * max(0.0, effectiveness_delta) + + 0.30 * score_with + 0.05 * structure + 0.25 * efficiency ) @@ -338,8 +343,8 @@ def __call__( "rationale": quality_without_fb.rationale, } side_info["Judge_effectiveness"] = { - "verdict": str(effectiveness_fb.value), - "rationale": effectiveness_fb.rationale, + "verdict": "improved" if effectiveness_verdict == 1.0 else "regressed" if effectiveness_verdict == 0.0 else "same", + "delta": effectiveness_delta, } # Expected vs Actual for GEPA reflection @@ -374,7 +379,7 @@ def __call__( f"(with={score_with:.2f}, without={score_without:.2f})" ) side_info["skill_md_specific_info"] = { - "Regressions": effectiveness_fb.rationale, + "Regressions": quality_with_fb.rationale, } elif score_with < 0.5: side_info["Error"] = ( @@ -417,6 +422,8 @@ def create_skillbench_evaluator( gen_model: str, original_token_counts: dict[str, int] | None = None, token_budget: int | None = None, + judge_model: str | None = None, + tool_context: str | None = None, ) -> Callable: """Factory for SkillBench-style evaluator. @@ -430,6 +437,11 @@ def create_skillbench_evaluator( gen_model: LLM model for generating responses. Required. original_token_counts: Token counts of original artifacts for efficiency scoring. token_budget: Hard token ceiling; candidates exceeding this are penalized. + judge_model: LLM model for judges. Defaults to GEPA_JUDGE_LM env + or databricks/databricks-claude-sonnet-4-6. + tool_context: Read-only tool descriptions included in generation context + but not optimized. Used during skill optimization so tools provide + context without being GEPA components. 
""" skill_guidelines = _collect_skill_guidelines(skill_name) if skill_guidelines: @@ -438,11 +450,17 @@ def create_skillbench_evaluator( len(skill_guidelines), ) + from .judges import DEFAULT_JUDGE_LM + effective_judge_model = judge_model or DEFAULT_JUDGE_LM + logger.info("Judge model: %s", effective_judge_model) + return SkillBenchEvaluator( gen_model=gen_model, original_token_counts=original_token_counts, token_budget=token_budget, skill_guidelines=skill_guidelines, + judge_model=judge_model, + tool_context=tool_context, ) diff --git a/.test/src/skill_test/optimize/splitter.py b/.test/src/skill_test/optimize/splitter.py index f567ca23..c268491f 100644 --- a/.test/src/skill_test/optimize/splitter.py +++ b/.test/src/skill_test/optimize/splitter.py @@ -159,6 +159,73 @@ def create_gepa_datasets( return train, val +def create_cross_skill_dataset( + skill_names: list[str] | None = None, + max_per_skill: int = 5, + base_path: Path | None = None, + seed: int = 42, +) -> list[SkillTask]: + """Create a merged dataset from multiple skills for cross-skill tool optimization. + + If ``skill_names`` is None, discovers all skills that have a ``ground_truth.yaml``. + Loads tasks from each, caps at ``max_per_skill``, and tags each task with + ``metadata["source_skill"]``. + + Args: + skill_names: Specific skills to include. None = auto-discover all. + max_per_skill: Maximum tasks per skill to keep the dataset balanced. + base_path: Override base path for skills directory. + seed: Random seed for reproducible sampling. + + Returns: + Merged list of SkillTask dicts, each tagged with source_skill. 
+ """ + if base_path is None: + base_path = Path(".test/skills") + + # Auto-discover skills with ground_truth.yaml + if skill_names is None: + if not base_path.exists(): + return [] + skill_names = sorted( + d.name + for d in base_path.iterdir() + if d.is_dir() + and (d / "ground_truth.yaml").exists() + and not d.name.startswith("_") + ) + + if not skill_names: + return [] + + rng = random.Random(seed) + merged: list[SkillTask] = [] + + for skill_name in skill_names: + try: + source = get_dataset_source(skill_name, base_path) + records = source.load() + except Exception: + continue + + tasks = [_record_to_task(r) for r in records] + + # Tag with source skill + for t in tasks: + meta = t.get("metadata", {}) + meta["source_skill"] = skill_name + t["metadata"] = meta + + # Cap per skill + if len(tasks) > max_per_skill: + rng.shuffle(tasks) + tasks = tasks[:max_per_skill] + + merged.extend(tasks) + + return merged + + def generate_bootstrap_tasks(skill_name: str, base_path: Path | None = None) -> list[SkillTask]: """Generate synthetic tasks from a SKILL.md when no ground_truth.yaml exists. 
From 381bf0d8b991fb59413a96405f2bf349801cb294 Mon Sep 17 00:00:00 2001 From: CAholder Date: Tue, 3 Mar 2026 15:14:09 -0800 Subject: [PATCH 17/20] lint fixes --- .test/README.md | 47 +++++++++++++++++++ .test/src/skill_test/optimize/assertions.py | 4 +- .test/src/skill_test/optimize/config.py | 6 ++- .test/src/skill_test/optimize/judges.py | 2 +- .test/src/skill_test/optimize/review.py | 4 +- .test/src/skill_test/optimize/runner.py | 15 ++++-- .../optimize/skillbench_evaluator.py | 6 ++- .test/src/skill_test/optimize/splitter.py | 2 +- .test/src/skill_test/optimize/tools.py | 2 +- 9 files changed, 76 insertions(+), 12 deletions(-) diff --git a/.test/README.md b/.test/README.md index 42db3cbf..4de81e7d 100644 --- a/.test/README.md +++ b/.test/README.md @@ -65,6 +65,53 @@ side_info = { } ``` +### How Baseline Evaluation Works + +This section walks through how a single test case is evaluated end-to-end, from dataset loading through to the baseline score that GEPA uses for optimization. + +#### 1. Dataset Loading (`splitter.py`) + +- Loads `ground_truth.yaml` test cases via `create_gepa_datasets()` +- If >= 5 test cases: stratified train/val split by `metadata.category` (80/20 default) +- If < 5: all used as train, no val set (single-task mode) +- If no `ground_truth.yaml` exists: `generate_bootstrap_tasks()` auto-generates tasks from SKILL.md headers + +#### 2. 
Evaluator Construction (`skillbench_evaluator.py`) + +`create_skillbench_evaluator()` builds a `SkillBenchEvaluator` with: + +| Parameter | Purpose | +|-----------|---------| +| `gen_model` | LLM that generates responses (plays the role of the agent) | +| `original_token_counts` | Token count of the original SKILL.md (for efficiency scoring) | +| `skill_guidelines` | Deduplicated guidelines from all test cases (injected into quality judge) | +| `tool_context` | Read-only MCP tool descriptions (included in generation prompt but not mutated) | + +The evaluator instantiates two MLflow judges: `quality_judge` and `regression_judge`. + +#### 3. Per-Task Evaluation Flow (the `__call__` method) + +Each test case goes through four phases: + +1. **Phase 1: WITH-skill generation** -- Sends the SKILL.md + tool descriptions as system context, user prompt as user message, generates response at temperature=0 +2. **Phase 2: WITHOUT-skill generation** -- Same prompt, NO skill in context. Result is **cached by prompt hash** -- computed once and reused across all GEPA iterations (the baseline never changes) +3. **Phase 3: Judge scoring** -- `quality_judge` scores both responses against `expected_facts`, `expected_patterns`, and `guidelines` from the test case. WITHOUT-skill judge results are also cached. +4. **Phase 4: Compute composite score** -- Weighted combination of effectiveness delta, absolute quality, structure validation, and token efficiency + +#### 4. Baseline Scoring (`runner.py` step 5) + +Before optimization starts, `_evaluate_on_tasks()` runs the evaluator on ALL training tasks with the original SKILL.md: + +- Collects per-task scores and `side_info` diagnostics +- `build_skillbench_background()` summarizes: mean baseline score, which tasks are NEEDS_SKILL vs REGRESSION +- This baseline context tells GEPA's reflection LM what's already working and what needs improvement + +#### 5. 
Why This Matters for GEPA + +- The `side_info` dict returned per-task contains **full judge rationale** (not truncated) +- GEPA's reflection LM reads this rationale to understand exactly what failed +- Better diagnostics lead to more targeted mutations and faster convergence + ### Scoring Weights | Weight | Dimension | Source | diff --git a/.test/src/skill_test/optimize/assertions.py b/.test/src/skill_test/optimize/assertions.py index 3b6815ab..88a7f089 100644 --- a/.test/src/skill_test/optimize/assertions.py +++ b/.test/src/skill_test/optimize/assertions.py @@ -159,7 +159,7 @@ def summarize_failures( needs_skill: list[tuple[AssertionResult, AssertionResult]] = [] regressions: list[tuple[AssertionResult, AssertionResult]] = [] - for w, wo in zip(with_results, without_results): + for w, wo in zip(with_results, without_results, strict=True): label = _classify_assertion(w, wo) if label == "NEEDS_SKILL": needs_skill.append((w, wo)) @@ -182,7 +182,7 @@ def summarize_failures( # Regressions: concise NL (only when non-empty) if regressions: lines: list[str] = [] - for i, (w, wo) in enumerate(regressions, 1): + for i, (w, _wo) in enumerate(regressions, 1): content = _extract_content(w) lines.append(f"{i}. 
'{content}' — passes without skill, fails with it") result["Regressions"] = "\n".join(lines) diff --git a/.test/src/skill_test/optimize/config.py b/.test/src/skill_test/optimize/config.py index f9db04b8..5784cadb 100644 --- a/.test/src/skill_test/optimize/config.py +++ b/.test/src/skill_test/optimize/config.py @@ -340,7 +340,11 @@ def get_preset( scaled_calls = preset_cap # Cap for slower models to avoid multi-hour hangs - if max_metric_calls_override is None and effective_lm not in _FAST_REFLECTION_MODELS and scaled_calls > MAX_METRIC_CALLS_PER_PASS: + if ( + max_metric_calls_override is None + and effective_lm not in _FAST_REFLECTION_MODELS + and scaled_calls > MAX_METRIC_CALLS_PER_PASS + ): warnings.warn( f"Capping metric calls from {scaled_calls} to {MAX_METRIC_CALLS_PER_PASS} " f"for reflection model '{effective_lm}'. " diff --git a/.test/src/skill_test/optimize/judges.py b/.test/src/skill_test/optimize/judges.py index ab1d06ce..e17d6c5b 100644 --- a/.test/src/skill_test/optimize/judges.py +++ b/.test/src/skill_test/optimize/judges.py @@ -143,7 +143,7 @@ def _to_judge_model_and_params(model: str) -> tuple[str, dict[str, Any] | None]: uses standard ``provider:/model`` format. 
""" gateway = _get_gateway_base_url() - if gateway and (model.startswith("databricks/") or model.startswith("databricks:/")): + if gateway and model.startswith(("databricks/", "databricks:/")): # Extract the endpoint name if ":/" in model: endpoint_name = model.split(":/", 1)[1] diff --git a/.test/src/skill_test/optimize/review.py b/.test/src/skill_test/optimize/review.py index 3a0c1aa5..8e4f4a28 100644 --- a/.test/src/skill_test/optimize/review.py +++ b/.test/src/skill_test/optimize/review.py @@ -267,7 +267,7 @@ def review_optimization(result: OptimizationResult) -> None: print(f" Saved: {saved_skill}") print(f" Apply: uv run python .test/scripts/optimize.py {result.skill_name} --apply-last") elif result.original_content == result.optimized_content: - print(f" No improvement found -- nothing saved.") + print(" No improvement found -- nothing saved.") print(f"{'=' * 60}\n") @@ -335,7 +335,7 @@ def apply_optimization(result: OptimizationResult) -> Path | None: metrics={"optimized_score": result.optimized_score}, test_count=len(result.val_scores) if result.val_scores else 0, ) - print(f" Baseline updated.") + print(" Baseline updated.") except Exception: pass diff --git a/.test/src/skill_test/optimize/runner.py b/.test/src/skill_test/optimize/runner.py index b2be0b05..2e0ac3ac 100644 --- a/.test/src/skill_test/optimize/runner.py +++ b/.test/src/skill_test/optimize/runner.py @@ -15,7 +15,10 @@ from ..config import SkillTestConfig from ..runners.evaluate import setup_mlflow -from .config import get_preset, validate_reflection_context, estimate_pass_duration, DEFAULT_GEN_LM, DEFAULT_TOKEN_BUDGET +from .config import ( + get_preset, validate_reflection_context, estimate_pass_duration, + DEFAULT_GEN_LM, DEFAULT_TOKEN_BUDGET, +) from .utils import SKILL_KEY, count_tokens, find_skill_md from .skillbench_evaluator import ( create_skillbench_evaluator, @@ -416,7 +419,10 @@ def _refiner_lm_with_fallback(prompt): if est_secs is not None: est_mins = est_secs / 60 if est_mins > 
5: - print(f" Estimated ~{est_mins:.0f} min/pass ({est_mins * max_passes:.0f} min total for {max_passes} passes)") + print( + f" Estimated ~{est_mins:.0f} min/pass " + f"({est_mins * max_passes:.0f} min total for {max_passes} passes)" + ) for pass_num in range(1, max_passes + 1): print(f"\n --- Pass {pass_num}/{max_passes} (best score so far: {best_score:.4f}) ---") @@ -485,7 +491,10 @@ def _refiner_lm_with_fallback(prompt): stc = SkillTestConfig() setup_mlflow(stc) with mlflow.start_run(run_name=f"{skill_name}_optimize_{preset}"): - mlflow.set_tags({"optimizer": "gepa", "skill_name": skill_name, "preset": preset, "evaluator_type": "skillbench"}) + mlflow.set_tags({ + "optimizer": "gepa", "skill_name": skill_name, + "preset": preset, "evaluator_type": "skillbench", + }) mlflow.log_metrics({ "original_score": original_score, "optimized_score": optimized_score, diff --git a/.test/src/skill_test/optimize/skillbench_evaluator.py b/.test/src/skill_test/optimize/skillbench_evaluator.py index fd090d2a..feea8239 100644 --- a/.test/src/skill_test/optimize/skillbench_evaluator.py +++ b/.test/src/skill_test/optimize/skillbench_evaluator.py @@ -343,7 +343,11 @@ def __call__( "rationale": quality_without_fb.rationale, } side_info["Judge_effectiveness"] = { - "verdict": "improved" if effectiveness_verdict == 1.0 else "regressed" if effectiveness_verdict == 0.0 else "same", + "verdict": ( + "improved" if effectiveness_verdict == 1.0 + else "regressed" if effectiveness_verdict == 0.0 + else "same" + ), "delta": effectiveness_delta, } diff --git a/.test/src/skill_test/optimize/splitter.py b/.test/src/skill_test/optimize/splitter.py index c268491f..064144d8 100644 --- a/.test/src/skill_test/optimize/splitter.py +++ b/.test/src/skill_test/optimize/splitter.py @@ -138,7 +138,7 @@ def create_gepa_datasets( train: list[SkillTask] = [] val: list[SkillTask] = [] - for cat, cat_tasks in by_category.items(): + for _cat, cat_tasks in by_category.items(): rng.shuffle(cat_tasks) n_val = max(1, 
int(len(cat_tasks) * val_ratio)) diff --git a/.test/src/skill_test/optimize/tools.py b/.test/src/skill_test/optimize/tools.py index bc43d46c..586baf7a 100644 --- a/.test/src/skill_test/optimize/tools.py +++ b/.test/src/skill_test/optimize/tools.py @@ -180,7 +180,7 @@ def write_tool_descriptions( """ # Group updates by file updates_by_file: dict[Path, list[tuple[ToolDescription, str]]] = {} - for module_name, tools in tool_map.items(): + for _module_name, tools in tool_map.items(): for td in tools: if td.name in optimized: updates_by_file.setdefault(td.source_path, []).append((td, optimized[td.name])) From b91f1e64df89f76ddfeeb99f3704af90584d93bc Mon Sep 17 00:00:00 2001 From: CAholder Date: Tue, 3 Mar 2026 15:19:27 -0800 Subject: [PATCH 18/20] lint fixes --- .test/src/skill_test/cli/__init__.py | 10 +- .test/src/skill_test/grp/executor.py | 4 +- .test/src/skill_test/optimize/alignment.py | 10 +- .test/src/skill_test/optimize/assertions.py | 30 +++--- .test/src/skill_test/optimize/config.py | 15 ++- .../skill_test/optimize/gskill/pipeline.py | 7 +- .test/src/skill_test/optimize/judges.py | 44 ++++----- .test/src/skill_test/optimize/review.py | 49 +++++----- .test/src/skill_test/optimize/runner.py | 94 +++++++++++-------- .../optimize/skillbench_evaluator.py | 72 ++++++-------- .test/src/skill_test/optimize/splitter.py | 8 +- .test/src/skill_test/optimize/tools.py | 12 +-- .test/src/skill_test/optimize/utils.py | 2 + .test/src/skill_test/runners/compare.py | 2 +- .test/src/skill_test/runners/evaluate.py | 3 +- .test/src/skill_test/scorers/trace.py | 2 - .test/src/skill_test/scorers/universal.py | 12 ++- .../skill_test/trace/mlflow_integration.py | 2 +- .test/src/skill_test/trace/source.py | 1 - .../databricks_mcp_server/tools/compute.py | 2 +- .../databricks_mcp_server/tools/lakebase.py | 2 - .../tools/volume_files.py | 2 +- .../agent_bricks/manager.py | 3 +- .../aibi_dashboards/dashboards.py | 2 +- .../databricks_tools_core/jobs/runs.py | 2 +- 
.../spark_declarative_pipelines/__init__.py | 2 +- .../unity_catalog/volume_files.py | 1 - .../lakebase_autoscale/conftest.py | 1 - .../lakebase_autoscale/test_branches.py | 1 - .../lakebase_autoscale/test_computes.py | 1 - .../lakebase_autoscale/test_credentials.py | 1 - .../sql/test_data/generate_test_files.py | 1 - .../tests/integration/sql/test_table_stats.py | 1 - .../unity_catalog/test_monitors.py | 2 - .../integration/unity_catalog/test_sharing.py | 2 - 35 files changed, 193 insertions(+), 212 deletions(-) diff --git a/.test/src/skill_test/cli/__init__.py b/.test/src/skill_test/cli/__init__.py index 246d7113..4afe3a7d 100644 --- a/.test/src/skill_test/cli/__init__.py +++ b/.test/src/skill_test/cli/__init__.py @@ -178,10 +178,14 @@ def main(): i += 1 result = optimize( - skill_name, ctx, - preset=opt_preset, mode=opt_mode, task_lm=opt_task_lm, + skill_name, + ctx, + preset=opt_preset, + mode=opt_mode, + task_lm=opt_task_lm, reflection_lm=opt_reflection_lm, - dry_run=opt_dry_run, apply=opt_apply, + dry_run=opt_dry_run, + apply=opt_apply, ) else: print(f"Unknown subcommand: {subcommand}") diff --git a/.test/src/skill_test/grp/executor.py b/.test/src/skill_test/grp/executor.py index 5cd393bb..0413ea3c 100644 --- a/.test/src/skill_test/grp/executor.py +++ b/.test/src/skill_test/grp/executor.py @@ -4,8 +4,8 @@ import re import time import yaml -from dataclasses import dataclass, field -from typing import List, Tuple, Optional, Dict, Any, Callable, Protocol +from dataclasses import dataclass +from typing import List, Tuple, Optional, Dict, Any, Protocol @dataclass diff --git a/.test/src/skill_test/optimize/alignment.py b/.test/src/skill_test/optimize/alignment.py index 190d3456..6151f3f2 100644 --- a/.test/src/skill_test/optimize/alignment.py +++ b/.test/src/skill_test/optimize/alignment.py @@ -75,7 +75,8 @@ def align_judge( if traces: logger.info( "Only %d alignment traces for %s (need >=3). 
Using base judge.", - len(traces), skill_name, + len(traces), + skill_name, ) return judge @@ -86,13 +87,12 @@ def align_judge( aligned = judge.align(traces=traces, optimizer=optimizer) logger.info( "Aligned judge with %d traces for %s", - len(traces), skill_name, + len(traces), + skill_name, ) return aligned except ImportError: - logger.warning( - "MemAlignOptimizer not available. Install mlflow-deepeval for alignment support." - ) + logger.warning("MemAlignOptimizer not available. Install mlflow-deepeval for alignment support.") return judge except Exception as e: logger.warning("MemAlign alignment failed for %s: %s", skill_name, e) diff --git a/.test/src/skill_test/optimize/assertions.py b/.test/src/skill_test/optimize/assertions.py index 88a7f089..5906e5fa 100644 --- a/.test/src/skill_test/optimize/assertions.py +++ b/.test/src/skill_test/optimize/assertions.py @@ -48,12 +48,14 @@ def _run_pattern_assertions(response: str, expected_patterns: list) -> list[Asse passed = matches >= min_count rationale = f"Found {matches} matches (need >={min_count})" - results.append(AssertionResult( - name=f"pattern_{description}", - passed=passed, - rationale=rationale, - assertion_type="pattern", - )) + results.append( + AssertionResult( + name=f"pattern_{description}", + passed=passed, + rationale=rationale, + assertion_type="pattern", + ) + ) return results @@ -66,12 +68,14 @@ def _run_fact_assertions(response: str, expected_facts: list[str]) -> list[Asser results = [] for fact in expected_facts: found = fact.lower() in response_lower - results.append(AssertionResult( - name=f"fact_{fact[:40]}", - passed=found, - rationale=f"{'Found' if found else 'Missing'}: {fact}", - assertion_type="fact", - )) + results.append( + AssertionResult( + name=f"fact_{fact[:40]}", + passed=found, + rationale=f"{'Found' if found else 'Missing'}: {fact}", + assertion_type="fact", + ) + ) return results @@ -130,7 +134,7 @@ def _extract_content(result: AssertionResult) -> str: if 
result.assertion_type == "fact": for prefix in ("Missing: ", "Found: "): if result.rationale.startswith(prefix): - return result.rationale[len(prefix):] + return result.rationale[len(prefix) :] return result.rationale else: # Pattern: name is "pattern_{description}", rationale is match count diff --git a/.test/src/skill_test/optimize/config.py b/.test/src/skill_test/optimize/config.py index 5784cadb..e361b3d6 100644 --- a/.test/src/skill_test/optimize/config.py +++ b/.test/src/skill_test/optimize/config.py @@ -8,17 +8,12 @@ from gepa.optimize_anything import GEPAConfig, EngineConfig, ReflectionConfig, RefinerConfig -DEFAULT_REFLECTION_LM = os.environ.get( - "GEPA_REFLECTION_LM", "databricks/databricks-claude-opus-4-6" -) +DEFAULT_REFLECTION_LM = os.environ.get("GEPA_REFLECTION_LM", "databricks/databricks-claude-opus-4-6") -DEFAULT_GEN_LM = os.environ.get( - "GEPA_GEN_LM", "databricks/databricks-claude-sonnet-4-6" -) +DEFAULT_GEN_LM = os.environ.get("GEPA_GEN_LM", "databricks/databricks-claude-sonnet-4-6") + +DEFAULT_TOKEN_BUDGET: int | None = int(os.environ.get("GEPA_TOKEN_BUDGET", "0")) or None -DEFAULT_TOKEN_BUDGET: int | None = int( - os.environ.get("GEPA_TOKEN_BUDGET", "0") -) or None # --------------------------------------------------------------------------- # Register Databricks models with litellm so it knows their true context @@ -62,6 +57,7 @@ def _register_litellm_models() -> None: """Register Databricks model context windows with litellm.""" try: import litellm + _models = { "databricks/databricks-claude-opus-4-6": { "max_tokens": 32_000, @@ -232,6 +228,7 @@ def validate_reflection_context( """ try: import litellm + info = litellm.get_model_info(reflection_lm) limit = info.get("max_input_tokens") or info.get("max_tokens") or 0 except Exception: diff --git a/.test/src/skill_test/optimize/gskill/pipeline.py b/.test/src/skill_test/optimize/gskill/pipeline.py index 7badfe7c..76a263f5 100644 --- a/.test/src/skill_test/optimize/gskill/pipeline.py +++ 
b/.test/src/skill_test/optimize/gskill/pipeline.py @@ -8,7 +8,7 @@ from pathlib import Path from typing import Any -from gepa.optimize_anything import optimize_anything, GEPAConfig, EngineConfig, ReflectionConfig +from gepa.optimize_anything import optimize_anything import gepa.optimize_anything as oa from ..config import get_preset @@ -45,10 +45,7 @@ def run_gskill( repo_context = _scan_repo(repo_path, context_files) seed_content = ( - f"# {skill_name}\n\n" - "## Overview\n\n" - f"Patterns and best practices for the {skill_name} project.\n\n" - + repo_context + f"# {skill_name}\n\n## Overview\n\nPatterns and best practices for the {skill_name} project.\n\n" + repo_context ) def evaluate(candidate: str, example: dict) -> tuple[float, dict]: diff --git a/.test/src/skill_test/optimize/judges.py b/.test/src/skill_test/optimize/judges.py index e17d6c5b..dfc748a3 100644 --- a/.test/src/skill_test/optimize/judges.py +++ b/.test/src/skill_test/optimize/judges.py @@ -37,9 +37,7 @@ logger = logging.getLogger(__name__) -DEFAULT_JUDGE_LM = os.environ.get( - "GEPA_JUDGE_LM", "databricks:/databricks-claude-sonnet-4-6" -) +DEFAULT_JUDGE_LM = os.environ.get("GEPA_JUDGE_LM", "databricks:/databricks-claude-sonnet-4-6") # --------------------------------------------------------------------------- # Fallback model chain for rate limit errors @@ -54,6 +52,7 @@ "databricks/databricks-claude-sonnet-4-5", ] + def _get_fallback_models() -> list[str]: """Get fallback model chain from env or defaults.""" custom = os.environ.get("GEPA_FALLBACK_MODELS", "") @@ -65,15 +64,18 @@ def _get_fallback_models() -> list[str]: def _is_rate_limit_error(exc: Exception) -> bool: """Check if an exception is a rate limit / request limit exceeded error.""" msg = str(exc).lower() - return any(phrase in msg for phrase in [ - "rate_limit", - "rate limit", - "request_limit_exceeded", - "request limit exceeded", - "too many requests", - "429", - "token.*per.*minute", - ]) + return any( + phrase in msg + for 
phrase in [ + "rate_limit", + "rate limit", + "request_limit_exceeded", + "request limit exceeded", + "too many requests", + "429", + "token.*per.*minute", + ] + ) # --------------------------------------------------------------------------- @@ -113,6 +115,7 @@ def _to_litellm_model(model: str) -> tuple[str, str | None]: # URI conversion # --------------------------------------------------------------------------- + def _to_judge_uri(model: str) -> str: """Convert litellm-style model strings to MLflow judge URI format. @@ -157,6 +160,7 @@ def _to_judge_model_and_params(model: str) -> tuple[str, dict[str, Any] | None]: # Completion with fallback # --------------------------------------------------------------------------- + def completion_with_fallback(*, model: str, max_retries: int = 3, **kwargs) -> Any: """Call litellm.completion with model fallback on rate limit errors. @@ -169,9 +173,7 @@ def completion_with_fallback(*, model: str, max_retries: int = 3, **kwargs) -> A """ import litellm - models_to_try = [model] + [ - m for m in _get_fallback_models() if m != model - ] + models_to_try = [model] + [m for m in _get_fallback_models() if m != model] last_err: Exception | None = None for model_str in models_to_try: @@ -184,7 +186,7 @@ def completion_with_fallback(*, model: str, max_retries: int = 3, **kwargs) -> A for attempt in range(max_retries): if attempt > 0: - delay = min(2 ** attempt, 30) + delay = min(2**attempt, 30) time.sleep(delay) try: return litellm.completion(**call_kwargs) @@ -194,7 +196,8 @@ def completion_with_fallback(*, model: str, max_retries: int = 3, **kwargs) -> A if attempt == max_retries - 1: logger.warning( "Model '%s' rate limited after %d attempts, trying next fallback", - model_str, max_retries, + model_str, + max_retries, ) continue # Non-rate-limit error: don't retry, try next model @@ -208,6 +211,7 @@ def completion_with_fallback(*, model: str, max_retries: int = 3, **kwargs) -> A # Data types # 
--------------------------------------------------------------------------- + @dataclass class JudgeFeedback: """Structured feedback from a judge call.""" @@ -297,10 +301,7 @@ def create_skill_quality_judge( instructions = _QUALITY_INSTRUCTIONS if skill_guidelines: principles = "\n".join(f"- {g}" for g in skill_guidelines) - instructions += ( - "\n\n## Domain-Specific Principles\n" - f"{principles}\n" - ) + instructions += f"\n\n## Domain-Specific Principles\n{principles}\n" model_uri, inference_params = _to_judge_model_and_params(judge_model or DEFAULT_JUDGE_LM) return make_judge( @@ -431,6 +432,7 @@ def create_regression_judge(judge_model: str | None = None) -> Any: # Helper: run a judge safely with fallback on rate limit # --------------------------------------------------------------------------- + def run_judge_safe( judge: Any, *, diff --git a/.test/src/skill_test/optimize/review.py b/.test/src/skill_test/optimize/review.py index 8e4f4a28..2374d429 100644 --- a/.test/src/skill_test/optimize/review.py +++ b/.test/src/skill_test/optimize/review.py @@ -75,9 +75,7 @@ def save_result(result: OptimizationResult) -> tuple[Path | None, Path | None]: # Save tool components if present if result.components: - tool_components = { - k: v for k, v in result.components.items() if k.startswith("tools_") - } + tool_components = {k: v for k, v in result.components.items() if k.startswith("tools_")} if tool_components: metadata["has_tool_components"] = True # Save each tool component @@ -185,10 +183,7 @@ def review_optimization(result: OptimizationResult) -> None: if not notes: notes.append("OK") note_str = f" [{'; '.join(notes)}]" - per_task_lines.append( - f" {task_id:<30s} WITH {pw:.2f} WITHOUT {pwo:.2f} " - f"delta {eff:+.2f}{note_str}" - ) + per_task_lines.append(f" {task_id:<30s} WITH {pw:.2f} WITHOUT {pwo:.2f} delta {eff:+.2f}{note_str}") if task_count > 0: agg_with = sum_with / task_count @@ -199,16 +194,20 @@ def review_optimization(result: OptimizationResult) -> None: 
# Score summary improvement_sign = "+" if result.improvement >= 0 else "" - print(f" Score: {result.original_score:.3f} -> {result.optimized_score:.3f} " - f"({improvement_sign}{result.improvement:.3f})") + print( + f" Score: {result.original_score:.3f} -> {result.optimized_score:.3f} " + f"({improvement_sign}{result.improvement:.3f})" + ) print(f" Skill Effectiveness: {agg_eff:.2f}") print(f" Quality (with): {agg_with:.2f}") print(f" Quality (without): {agg_without:.2f} (baseline)") # Token counts reduction_sign = "+" if result.token_reduction_pct >= 0 else "" - print(f" Tokens: {result.original_token_count:,} -> {result.optimized_token_count:,} " - f"({reduction_sign}{result.token_reduction_pct:.1f}%)") + print( + f" Tokens: {result.original_token_count:,} -> {result.optimized_token_count:,} " + f"({reduction_sign}{result.token_reduction_pct:.1f}%)" + ) if result.gepa_result and hasattr(result.gepa_result, "iterations"): print(f" Iterations: {result.gepa_result.iterations}") @@ -233,13 +232,15 @@ def review_optimization(result: OptimizationResult) -> None: # Detailed diff (first 50 lines) if result.original_content != result.optimized_content: - diff_lines = list(difflib.unified_diff( - result.original_content.splitlines(keepends=True), - result.optimized_content.splitlines(keepends=True), - fromfile="original SKILL.md", - tofile="optimized SKILL.md", - n=2, - )) + diff_lines = list( + difflib.unified_diff( + result.original_content.splitlines(keepends=True), + result.optimized_content.splitlines(keepends=True), + fromfile="original SKILL.md", + tofile="optimized SKILL.md", + n=2, + ) + ) if len(diff_lines) > 50: print(f" Diff (first 50 of {len(diff_lines)} lines):") for line in diff_lines[:50]: @@ -289,8 +290,7 @@ def apply_optimization(result: OptimizationResult) -> Path | None: """ if result.improvement < 0: raise ValueError( - f"Optimization regressed quality ({result.improvement:+.3f}). " - "Refusing to apply. Use --force to override." 
+ f"Optimization regressed quality ({result.improvement:+.3f}). Refusing to apply. Use --force to override." ) skill_path = None @@ -319,10 +319,11 @@ def apply_optimization(result: OptimizationResult) -> Path | None: for f in modified: print(f" {f}") - print(f" Quality: {result.original_score:.3f} -> {result.optimized_score:.3f} " - f"({result.improvement:+.3f})") - print(f" Tokens: {result.original_token_count:,} -> {result.optimized_token_count:,} " - f"({result.token_reduction_pct:+.1f}%)") + print(f" Quality: {result.original_score:.3f} -> {result.optimized_score:.3f} ({result.improvement:+.3f})") + print( + f" Tokens: {result.original_token_count:,} -> {result.optimized_token_count:,} " + f"({result.token_reduction_pct:+.1f}%)" + ) # Try to update baseline try: diff --git a/.test/src/skill_test/optimize/runner.py b/.test/src/skill_test/optimize/runner.py index 2e0ac3ac..d1d0ef16 100644 --- a/.test/src/skill_test/optimize/runner.py +++ b/.test/src/skill_test/optimize/runner.py @@ -6,18 +6,19 @@ import copy import difflib -import re from dataclasses import dataclass -from pathlib import Path from typing import Any -from gepa.optimize_anything import optimize_anything, GEPAConfig +from gepa.optimize_anything import optimize_anything from ..config import SkillTestConfig from ..runners.evaluate import setup_mlflow from .config import ( - get_preset, validate_reflection_context, estimate_pass_duration, - DEFAULT_GEN_LM, DEFAULT_TOKEN_BUDGET, + get_preset, + validate_reflection_context, + estimate_pass_duration, + DEFAULT_GEN_LM, + DEFAULT_TOKEN_BUDGET, ) from .utils import SKILL_KEY, count_tokens, find_skill_md from .skillbench_evaluator import ( @@ -94,7 +95,6 @@ def _evaluate_on_tasks(evaluator, candidate, tasks, label: str = "Evaluating"): Returns: (mean_score, per_task_scores, side_info_by_id, side_info_by_input) """ - import sys gepa_instances = to_gepa_instances(tasks) total = len(gepa_instances) @@ -173,16 +173,16 @@ def optimize_skill( tool_map = 
extract_tool_descriptions(modules=tool_modules) tool_components = tools_to_gepa_components(tool_map, per_module=True) stats = get_tool_stats() - print(f"Tool modules: {stats['modules']}, tools: {stats['total_tools']}, " - f"description chars: {stats['total_description_chars']:,}") + print( + f"Tool modules: {stats['modules']}, tools: {stats['total_tools']}, " + f"description chars: {stats['total_description_chars']:,}" + ) except FileNotFoundError: pass # No MCP tools directory — skip # Build read-only tool context string (for skill optimization) if tool_components: - tool_context_str = "\n\n".join( - tool_components[k] for k in sorted(tool_components) - ) + tool_context_str = "\n\n".join(tool_components[k] for k in sorted(tool_components)) # 2. Build seed_candidate (multi-component dict) seed_candidate: dict[str, str] = {} @@ -243,15 +243,13 @@ def optimize_skill( print(f"Generation model: {effective_gen_model}") from .judges import DEFAULT_JUDGE_LM + effective_judge_model = judge_model or DEFAULT_JUDGE_LM print(f"Judge model: {effective_judge_model}") print("Evaluator: skillbench (judge-driven)") if not effective_gen_model: - raise ValueError( - "SkillBench evaluator requires a gen_model. " - "Pass --gen-model or set GEPA_GEN_LM env var." - ) + raise ValueError("SkillBench evaluator requires a gen_model. Pass --gen-model or set GEPA_GEN_LM env var.") evaluator = create_skillbench_evaluator( skill_name, gen_model=effective_gen_model, @@ -273,7 +271,8 @@ def optimize_skill( # 5b. Validate reflection model context window validate_reflection_context( - config.reflection.reflection_lm, total_original_tokens, + config.reflection.reflection_lm, + total_original_tokens, ) # 5c. Replace GEPA's reflection_lm string with a fallback-aware callable. @@ -281,8 +280,10 @@ def optimize_skill( # with no fallback. We pre-convert it so GEPA uses our version with # model fallback on rate limit errors. 
from .judges import completion_with_fallback + _reflection_model_name = config.reflection.reflection_lm or "" if isinstance(config.reflection.reflection_lm, str): + def _reflection_lm_with_fallback(prompt): if isinstance(prompt, str): messages = [{"role": "user", "content": prompt}] @@ -299,6 +300,7 @@ def _reflection_lm_with_fallback(prompt): # Same for refiner_lm if present if config.refiner is not None and isinstance(config.refiner.refiner_lm, str): _refiner_model_name = config.refiner.refiner_lm + def _refiner_lm_with_fallback(prompt): if isinstance(prompt, str): messages = [{"role": "user", "content": prompt}] @@ -326,8 +328,10 @@ def _refiner_lm_with_fallback(prompt): print(f"Train tasks: {len(train)}") print(f"Val tasks: {len(val) if val else 'None (single-task mode)'}") print(f"Generation model: {effective_gen_model}") - print(f"Preset: {preset} (max_metric_calls={config.engine.max_metric_calls}, " - f"scaled for {num_components} component(s))") + print( + f"Preset: {preset} (max_metric_calls={config.engine.max_metric_calls}, " + f"scaled for {num_components} component(s))" + ) print(f"Max passes: {max_passes}") if run_dir: print(f"Run dir: {run_dir}") @@ -342,7 +346,8 @@ def _refiner_lm_with_fallback(prompt): print(f" {task_id}: {score:.3f}") background = build_skillbench_background( - skill_name, total_original_tokens, + skill_name, + total_original_tokens, component_names=list(seed_candidate.keys()), baseline_scores=original_per_task, baseline_side_info=si_by_id, @@ -379,7 +384,8 @@ def _refiner_lm_with_fallback(prompt): # 6. 
Build background and objective background = build_skillbench_background( - skill_name, total_original_tokens, + skill_name, + total_original_tokens, component_names=list(seed_candidate.keys()), baseline_scores=original_per_task, baseline_side_info=si_by_id, @@ -405,8 +411,10 @@ def _refiner_lm_with_fallback(prompt): total_metric_calls = 0 improvement_threshold = 0.0005 - print(f"\n Starting multi-pass optimization (up to {max_passes} passes, " - f"{num_components} component(s), {config.engine.max_metric_calls} metric calls/pass)") + print( + f"\n Starting multi-pass optimization (up to {max_passes} passes, " + f"{num_components} component(s), {config.engine.max_metric_calls} metric calls/pass)" + ) # estimate_pass_duration expects the model name string, not the callable _est_reflection_lm = _reflection_model_name if _reflection_model_name else str(reflection_lm or DEFAULT_GEN_LM) @@ -420,8 +428,7 @@ def _refiner_lm_with_fallback(prompt): est_mins = est_secs / 60 if est_mins > 5: print( - f" Estimated ~{est_mins:.0f} min/pass " - f"({est_mins * max_passes:.0f} min total for {max_passes} passes)" + f" Estimated ~{est_mins:.0f} min/pass ({est_mins * max_passes:.0f} min total for {max_passes} passes)" ) for pass_num in range(1, max_passes + 1): @@ -448,8 +455,7 @@ def _refiner_lm_with_fallback(prompt): pass_score, _, _, _ = _evaluate_on_tasks(evaluator, candidate, train, label=f"Pass {pass_num}") improvement = pass_score - best_score - print(f" Pass {pass_num} score: {pass_score:.4f} " - f"(delta: {'+' if improvement >= 0 else ''}{improvement:.4f})") + print(f" Pass {pass_num} score: {pass_score:.4f} (delta: {'+' if improvement >= 0 else ''}{improvement:.4f})") if pass_score > best_score + improvement_threshold: best = dict(candidate) @@ -479,7 +485,8 @@ def _refiner_lm_with_fallback(prompt): token_reduction_pct = ( (total_original_tokens - optimized_token_count) / total_original_tokens * 100 - if total_original_tokens > 0 else 0.0 + if total_original_tokens > 0 + else 
0.0 ) diff_summary = _compute_diff_summary(original_content, optimized_content) @@ -488,22 +495,29 @@ def _refiner_lm_with_fallback(prompt): mlflow_run_id = None try: import mlflow + stc = SkillTestConfig() setup_mlflow(stc) with mlflow.start_run(run_name=f"{skill_name}_optimize_{preset}"): - mlflow.set_tags({ - "optimizer": "gepa", "skill_name": skill_name, - "preset": preset, "evaluator_type": "skillbench", - }) - mlflow.log_metrics({ - "original_score": original_score, - "optimized_score": optimized_score, - "improvement": optimized_score - original_score, - "original_tokens": float(total_original_tokens), - "optimized_tokens": float(optimized_token_count), - "token_reduction_pct": token_reduction_pct, - "total_metric_calls": float(total_metric_calls), - }) + mlflow.set_tags( + { + "optimizer": "gepa", + "skill_name": skill_name, + "preset": preset, + "evaluator_type": "skillbench", + } + ) + mlflow.log_metrics( + { + "original_score": original_score, + "optimized_score": optimized_score, + "improvement": optimized_score - original_score, + "original_tokens": float(total_original_tokens), + "optimized_tokens": float(optimized_token_count), + "token_reduction_pct": token_reduction_pct, + "total_metric_calls": float(total_metric_calls), + } + ) mlflow_run_id = mlflow.active_run().info.run_id except Exception: pass diff --git a/.test/src/skill_test/optimize/skillbench_evaluator.py b/.test/src/skill_test/optimize/skillbench_evaluator.py index feea8239..d13601e8 100644 --- a/.test/src/skill_test/optimize/skillbench_evaluator.py +++ b/.test/src/skill_test/optimize/skillbench_evaluator.py @@ -150,10 +150,7 @@ def __init__( tool_context: str | None = None, ): if not gen_model: - raise ValueError( - "SkillBench evaluator requires a gen_model. " - "Pass --gen-model or set GEPA_GEN_LM env var." - ) + raise ValueError("SkillBench evaluator requires a gen_model. 
Pass --gen-model or set GEPA_GEN_LM env var.") self.gen_model = gen_model self._baseline_response_cache: dict[str, str] = {} self._baseline_judge_cache: dict[str, JudgeFeedback] = {} @@ -170,14 +167,16 @@ def _generate_response(self, prompt: str, skill_context: str | None = None) -> s """Generate a response with or without skill context.""" messages = [] if skill_context: - messages.append({ - "role": "system", - "content": ( - "Use ONLY the following skill documentation to answer " - "the user's question. Do not use any other knowledge.\n\n" - f"{skill_context}" - ), - }) + messages.append( + { + "role": "system", + "content": ( + "Use ONLY the following skill documentation to answer " + "the user's question. Do not use any other knowledge.\n\n" + f"{skill_context}" + ), + } + ) messages.append({"role": "user", "content": prompt}) resp = _completion_with_backoff( @@ -196,7 +195,9 @@ def _get_baseline_response(self, prompt: str) -> str: return self._baseline_response_cache[key] def __call__( - self, candidate: dict[str, str], example: dict, + self, + candidate: dict[str, str], + example: dict, ) -> tuple[float, dict]: """Evaluate a candidate skill against a single task example. @@ -246,16 +247,17 @@ def __call__( # Build flat strings for judge templates — make_judge only supports # top-level {{ inputs }}, {{ outputs }}, {{ expectations }} variables. 
facts_str = "\n".join(f"- {f}" for f in facts) if facts else "None specified" - patterns_str = "\n".join( - f"- {p}" if isinstance(p, str) else f"- {p.get('description', p.get('pattern', ''))}" - for p in patterns - ) if patterns else "None specified" + patterns_str = ( + "\n".join( + f"- {p}" if isinstance(p, str) else f"- {p.get('description', p.get('pattern', ''))}" for p in patterns + ) + if patterns + else "None specified" + ) guidelines_str = "\n".join(f"- {g}" for g in guidelines) if guidelines else "None specified" expectations_text = ( - f"Expected facts:\n{facts_str}\n\n" - f"Expected patterns:\n{patterns_str}\n\n" - f"Guidelines:\n{guidelines_str}" + f"Expected facts:\n{facts_str}\n\nExpected patterns:\n{patterns_str}\n\nGuidelines:\n{guidelines_str}" ) # make_judge requires expectations as dict, inputs/outputs as Any. @@ -317,12 +319,7 @@ def __call__( efficiency = 1.0 # Weighted final score - final_score = ( - 0.40 * max(0.0, effectiveness_delta) - + 0.30 * score_with - + 0.05 * structure - + 0.25 * efficiency - ) + final_score = 0.40 * max(0.0, effectiveness_delta) + 0.30 * score_with + 0.05 * structure + 0.25 * efficiency # Build side info with FULL judge rationale (not truncated!) reference_answer = example.get("answer", "") @@ -344,9 +341,7 @@ def __call__( } side_info["Judge_effectiveness"] = { "verdict": ( - "improved" if effectiveness_verdict == 1.0 - else "regressed" if effectiveness_verdict == 0.0 - else "same" + "improved" if effectiveness_verdict == 1.0 else "regressed" if effectiveness_verdict == 0.0 else "same" ), "delta": effectiveness_delta, } @@ -387,8 +382,7 @@ def __call__( } elif score_with < 0.5: side_info["Error"] = ( - f"NEEDS_SKILL: quality_with={score_with:.2f}, missing content. " - f"Judge: {quality_with_fb.rationale[:200]}" + f"NEEDS_SKILL: quality_with={score_with:.2f}, missing content. 
Judge: {quality_with_fb.rationale[:200]}" ) return final_score, side_info @@ -455,6 +449,7 @@ def create_skillbench_evaluator( ) from .judges import DEFAULT_JUDGE_LM + effective_judge_model = judge_model or DEFAULT_JUDGE_LM logger.info("Judge model: %s", effective_judge_model) @@ -496,15 +491,9 @@ def build_skillbench_background( if "REGRESSION" in error: regression_ids.append(tid) if needs_skill_ids: - baseline_desc += ( - f"\n NEEDS_SKILL ({len(needs_skill_ids)} tasks): " - f"{', '.join(needs_skill_ids[:5])}" - ) + baseline_desc += f"\n NEEDS_SKILL ({len(needs_skill_ids)} tasks): {', '.join(needs_skill_ids[:5])}" if regression_ids: - baseline_desc += ( - f"\n REGRESSION ({len(regression_ids)} tasks): " - f"{', '.join(regression_ids[:5])}" - ) + baseline_desc += f"\n REGRESSION ({len(regression_ids)} tasks): {', '.join(regression_ids[:5])}" components_desc = "" if component_names and any(c.startswith("tools_") for c in component_names): @@ -519,10 +508,7 @@ def build_skillbench_background( "Smaller candidates score HIGHER. Be ruthlessly concise." ) if token_budget: - token_desc += ( - f"\nTOKEN BUDGET: {token_budget:,} tokens. Candidates exceeding this " - "are heavily penalized." - ) + token_desc += f"\nTOKEN BUDGET: {token_budget:,} tokens. Candidates exceeding this are heavily penalized." 
return ( f"You are refining SKILL.md for '{skill_name}'.\n" diff --git a/.test/src/skill_test/optimize/splitter.py b/.test/src/skill_test/optimize/splitter.py index 064144d8..0fc6f170 100644 --- a/.test/src/skill_test/optimize/splitter.py +++ b/.test/src/skill_test/optimize/splitter.py @@ -75,9 +75,7 @@ def _record_to_task(record: EvalRecord) -> SkillTask: # Also encode expectations into additional_context for GEPA reflection task["additional_context"]["expectations"] = json.dumps(record.expectations) # Human-readable summary for GEPA's reflection LM - task["additional_context"]["evaluation_criteria"] = _summarize_expectations( - record.expectations - ) + task["additional_context"]["evaluation_criteria"] = _summarize_expectations(record.expectations) return task @@ -190,9 +188,7 @@ def create_cross_skill_dataset( skill_names = sorted( d.name for d in base_path.iterdir() - if d.is_dir() - and (d / "ground_truth.yaml").exists() - and not d.name.startswith("_") + if d.is_dir() and (d / "ground_truth.yaml").exists() and not d.name.startswith("_") ) if not skill_names: diff --git a/.test/src/skill_test/optimize/tools.py b/.test/src/skill_test/optimize/tools.py index 586baf7a..f519d099 100644 --- a/.test/src/skill_test/optimize/tools.py +++ b/.test/src/skill_test/optimize/tools.py @@ -42,8 +42,7 @@ def _find_tools_dir() -> Path: if candidate.exists(): return candidate raise FileNotFoundError( - "Could not find MCP tools directory. " - "Expected at databricks-mcp-server/databricks_mcp_server/tools/" + "Could not find MCP tools directory. 
Expected at databricks-mcp-server/databricks_mcp_server/tools/" ) @@ -247,19 +246,14 @@ def list_tool_modules(tools_dir: Path | None = None) -> list[str]: """List available tool module names.""" if tools_dir is None: tools_dir = _find_tools_dir() - return sorted( - f.stem for f in tools_dir.glob("*.py") - if f.stem != "__init__" - ) + return sorted(f.stem for f in tools_dir.glob("*.py") if f.stem != "__init__") def get_tool_stats(tools_dir: Path | None = None) -> dict[str, Any]: """Get statistics about available MCP tools.""" tool_map = extract_tool_descriptions(tools_dir=tools_dir) total_tools = sum(len(tools) for tools in tool_map.values()) - total_chars = sum( - len(td.docstring) for tools in tool_map.values() for td in tools - ) + total_chars = sum(len(td.docstring) for tools in tool_map.values() for td in tools) return { "modules": len(tool_map), "total_tools": total_tools, diff --git a/.test/src/skill_test/optimize/utils.py b/.test/src/skill_test/optimize/utils.py index 3cb5f89f..464ee02c 100644 --- a/.test/src/skill_test/optimize/utils.py +++ b/.test/src/skill_test/optimize/utils.py @@ -15,6 +15,7 @@ # Path utilities # --------------------------------------------------------------------------- + def find_repo_root() -> Path: """Find the repo root by searching upward for .test/src/.""" current = Path(__file__).resolve().parent @@ -44,6 +45,7 @@ def find_skill_md(skill_name: str) -> Path | None: # Token utilities # --------------------------------------------------------------------------- + def count_tokens(text: str) -> int: """Count tokens using cl100k_base encoding.""" enc = tiktoken.get_encoding("cl100k_base") diff --git a/.test/src/skill_test/runners/compare.py b/.test/src/skill_test/runners/compare.py index 460d03db..d2d95b56 100644 --- a/.test/src/skill_test/runners/compare.py +++ b/.test/src/skill_test/runners/compare.py @@ -3,7 +3,7 @@ import json from datetime import datetime from pathlib import Path -from typing import Dict, Any, Optional, List 
+from typing import Dict, Optional, List from dataclasses import dataclass, asdict diff --git a/.test/src/skill_test/runners/evaluate.py b/.test/src/skill_test/runners/evaluate.py index 1dff1009..dc2530f4 100644 --- a/.test/src/skill_test/runners/evaluate.py +++ b/.test/src/skill_test/runners/evaluate.py @@ -4,7 +4,7 @@ from typing import Optional, Dict, Any, List import yaml import mlflow -from mlflow.genai.scorers import Guidelines, Safety +from mlflow.genai.scorers import Safety from ..config import SkillTestConfig from ..dataset import get_dataset_source @@ -25,7 +25,6 @@ file_existence, tool_sequence, category_limits, - get_trace_scorers, ) diff --git a/.test/src/skill_test/scorers/trace.py b/.test/src/skill_test/scorers/trace.py index 2532a01d..8cd6bf13 100644 --- a/.test/src/skill_test/scorers/trace.py +++ b/.test/src/skill_test/scorers/trace.py @@ -12,8 +12,6 @@ from mlflow.entities import Feedback from mlflow.genai.scorers import scorer -from ..trace.models import TraceMetrics - @scorer def tool_count( diff --git a/.test/src/skill_test/scorers/universal.py b/.test/src/skill_test/scorers/universal.py index 40e2a8b2..d1e5ac47 100644 --- a/.test/src/skill_test/scorers/universal.py +++ b/.test/src/skill_test/scorers/universal.py @@ -136,9 +136,11 @@ def expected_facts_present(outputs: Dict[str, Any], expectations: Dict[str, Any] feedbacks = [] for fact in expected_facts: found = fact.lower() in response - feedbacks.append(Feedback( - name=f"fact_{fact[:40]}", - value="yes" if found else "no", - rationale=f"{'Found' if found else 'Missing'}: {fact}", - )) + feedbacks.append( + Feedback( + name=f"fact_{fact[:40]}", + value="yes" if found else "no", + rationale=f"{'Found' if found else 'Missing'}: {fact}", + ) + ) return feedbacks diff --git a/.test/src/skill_test/trace/mlflow_integration.py b/.test/src/skill_test/trace/mlflow_integration.py index 62f8ffb1..179391f7 100644 --- a/.test/src/skill_test/trace/mlflow_integration.py +++ 
b/.test/src/skill_test/trace/mlflow_integration.py @@ -13,7 +13,7 @@ from pathlib import Path from typing import Any, Dict, List, Optional, Union -from .models import TraceMetrics, ToolCall, FileOperation +from .models import TraceMetrics, ToolCall from .parser import parse_and_compute_metrics diff --git a/.test/src/skill_test/trace/source.py b/.test/src/skill_test/trace/source.py index 74918875..8b7abef9 100644 --- a/.test/src/skill_test/trace/source.py +++ b/.test/src/skill_test/trace/source.py @@ -5,7 +5,6 @@ 2. Local fallback (~/.claude/projects/{hash}/*.jsonl) """ -import os import subprocess from dataclasses import dataclass from pathlib import Path diff --git a/databricks-mcp-server/databricks_mcp_server/tools/compute.py b/databricks-mcp-server/databricks_mcp_server/tools/compute.py index e2462d44..06e52eff 100644 --- a/databricks-mcp-server/databricks_mcp_server/tools/compute.py +++ b/databricks-mcp-server/databricks_mcp_server/tools/compute.py @@ -1,6 +1,6 @@ """Compute tools - Execute code on Databricks clusters.""" -from typing import Dict, Any, List, Optional +from typing import Dict, Any, List from databricks_tools_core.compute import ( list_clusters as _list_clusters, diff --git a/databricks-mcp-server/databricks_mcp_server/tools/lakebase.py b/databricks-mcp-server/databricks_mcp_server/tools/lakebase.py index 08324bd6..2727c999 100644 --- a/databricks-mcp-server/databricks_mcp_server/tools/lakebase.py +++ b/databricks-mcp-server/databricks_mcp_server/tools/lakebase.py @@ -31,14 +31,12 @@ update_project as _update_project, delete_project as _delete_project, create_branch as _create_branch, - get_branch as _get_branch, list_branches as _list_branches, update_branch as _update_branch, delete_branch as _delete_branch, create_endpoint as _create_endpoint, list_endpoints as _list_endpoints, update_endpoint as _update_endpoint, - delete_endpoint as _delete_endpoint, generate_credential as _generate_autoscale_credential, ) diff --git 
a/databricks-mcp-server/databricks_mcp_server/tools/volume_files.py b/databricks-mcp-server/databricks_mcp_server/tools/volume_files.py index e4f1a04d..a75da32e 100644 --- a/databricks-mcp-server/databricks_mcp_server/tools/volume_files.py +++ b/databricks-mcp-server/databricks_mcp_server/tools/volume_files.py @@ -1,6 +1,6 @@ """Volume file tools - Manage files in Unity Catalog Volumes.""" -from typing import Dict, Any, List +from typing import Dict, Any from databricks_tools_core.unity_catalog import ( list_volume_files as _list_volume_files, diff --git a/databricks-tools-core/databricks_tools_core/agent_bricks/manager.py b/databricks-tools-core/databricks_tools_core/agent_bricks/manager.py index 13789798..7b829bbc 100644 --- a/databricks-tools-core/databricks_tools_core/agent_bricks/manager.py +++ b/databricks-tools-core/databricks_tools_core/agent_bricks/manager.py @@ -13,7 +13,7 @@ import threading import time from concurrent.futures import ThreadPoolExecutor, as_completed -from typing import Any, Callable, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple import requests from databricks.sdk import WorkspaceClient @@ -21,7 +21,6 @@ from ..auth import get_workspace_client, get_current_username from .models import ( EndpointStatus, - EvaluationRunDict, GenieIds, GenieListInstructionsResponseDict, GenieListQuestionsResponseDict, diff --git a/databricks-tools-core/databricks_tools_core/aibi_dashboards/dashboards.py b/databricks-tools-core/databricks_tools_core/aibi_dashboards/dashboards.py index beabb5e2..7f3cceb2 100644 --- a/databricks-tools-core/databricks_tools_core/aibi_dashboards/dashboards.py +++ b/databricks-tools-core/databricks_tools_core/aibi_dashboards/dashboards.py @@ -8,7 +8,7 @@ import asyncio import json import logging -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, Optional, Union from databricks.sdk.service.dashboards import Dashboard diff --git 
a/databricks-tools-core/databricks_tools_core/jobs/runs.py b/databricks-tools-core/databricks_tools_core/jobs/runs.py index 665280b4..6b74c055 100644 --- a/databricks-tools-core/databricks_tools_core/jobs/runs.py +++ b/databricks-tools-core/databricks_tools_core/jobs/runs.py @@ -13,7 +13,7 @@ ) from ..auth import get_workspace_client -from .models import JobRunResult, JobError, RunLifecycleState, RunResultState as RunResultStateEnum +from .models import JobRunResult, JobError # Terminal states - run has finished (success or failure) diff --git a/databricks-tools-core/databricks_tools_core/spark_declarative_pipelines/__init__.py b/databricks-tools-core/databricks_tools_core/spark_declarative_pipelines/__init__.py index 1808e0a1..44e53763 100644 --- a/databricks-tools-core/databricks_tools_core/spark_declarative_pipelines/__init__.py +++ b/databricks-tools-core/databricks_tools_core/spark_declarative_pipelines/__init__.py @@ -1,3 +1,3 @@ """Spark Declarative Pipelines (SDP) operations""" -from . import pipelines, workspace_files +from . 
import pipelines as pipelines, workspace_files as workspace_files diff --git a/databricks-tools-core/databricks_tools_core/unity_catalog/volume_files.py b/databricks-tools-core/databricks_tools_core/unity_catalog/volume_files.py index a8a2d208..958684d8 100644 --- a/databricks-tools-core/databricks_tools_core/unity_catalog/volume_files.py +++ b/databricks-tools-core/databricks_tools_core/unity_catalog/volume_files.py @@ -12,7 +12,6 @@ from pathlib import Path from typing import List, Optional -from databricks.sdk.service.files import DirectoryEntry from ..auth import get_workspace_client diff --git a/databricks-tools-core/tests/integration/lakebase_autoscale/conftest.py b/databricks-tools-core/tests/integration/lakebase_autoscale/conftest.py index bfe4fb68..edef3450 100644 --- a/databricks-tools-core/tests/integration/lakebase_autoscale/conftest.py +++ b/databricks-tools-core/tests/integration/lakebase_autoscale/conftest.py @@ -11,7 +11,6 @@ """ import logging -import os import time import uuid diff --git a/databricks-tools-core/tests/integration/lakebase_autoscale/test_branches.py b/databricks-tools-core/tests/integration/lakebase_autoscale/test_branches.py index 6a27d934..f0f24635 100644 --- a/databricks-tools-core/tests/integration/lakebase_autoscale/test_branches.py +++ b/databricks-tools-core/tests/integration/lakebase_autoscale/test_branches.py @@ -11,7 +11,6 @@ import logging import time -import uuid import pytest diff --git a/databricks-tools-core/tests/integration/lakebase_autoscale/test_computes.py b/databricks-tools-core/tests/integration/lakebase_autoscale/test_computes.py index b46b5fae..b4317718 100644 --- a/databricks-tools-core/tests/integration/lakebase_autoscale/test_computes.py +++ b/databricks-tools-core/tests/integration/lakebase_autoscale/test_computes.py @@ -10,7 +10,6 @@ import logging import time -import uuid import pytest diff --git a/databricks-tools-core/tests/integration/lakebase_autoscale/test_credentials.py 
b/databricks-tools-core/tests/integration/lakebase_autoscale/test_credentials.py index 1da5ff66..f2a7e465 100644 --- a/databricks-tools-core/tests/integration/lakebase_autoscale/test_credentials.py +++ b/databricks-tools-core/tests/integration/lakebase_autoscale/test_credentials.py @@ -7,7 +7,6 @@ import logging -import pytest from databricks_tools_core.lakebase_autoscale import ( generate_credential, diff --git a/databricks-tools-core/tests/integration/sql/test_data/generate_test_files.py b/databricks-tools-core/tests/integration/sql/test_data/generate_test_files.py index b5f445db..e6bdd178 100644 --- a/databricks-tools-core/tests/integration/sql/test_data/generate_test_files.py +++ b/databricks-tools-core/tests/integration/sql/test_data/generate_test_files.py @@ -6,7 +6,6 @@ - txt_files/: Simple text files """ -import os from pathlib import Path import pandas as pd diff --git a/databricks-tools-core/tests/integration/sql/test_table_stats.py b/databricks-tools-core/tests/integration/sql/test_table_stats.py index 4894108b..66c9515b 100644 --- a/databricks-tools-core/tests/integration/sql/test_table_stats.py +++ b/databricks-tools-core/tests/integration/sql/test_table_stats.py @@ -13,7 +13,6 @@ get_table_details, TableStatLevel, TableSchemaResult, - TableInfo, ) diff --git a/databricks-tools-core/tests/integration/unity_catalog/test_monitors.py b/databricks-tools-core/tests/integration/unity_catalog/test_monitors.py index 824f3996..5134baae 100644 --- a/databricks-tools-core/tests/integration/unity_catalog/test_monitors.py +++ b/databricks-tools-core/tests/integration/unity_catalog/test_monitors.py @@ -18,9 +18,7 @@ from databricks_tools_core.unity_catalog import ( create_monitor, get_monitor, - run_monitor_refresh, list_monitor_refreshes, - delete_monitor, ) logger = logging.getLogger(__name__) diff --git a/databricks-tools-core/tests/integration/unity_catalog/test_sharing.py b/databricks-tools-core/tests/integration/unity_catalog/test_sharing.py index 
7f427768..4eb1dfc9 100644 --- a/databricks-tools-core/tests/integration/unity_catalog/test_sharing.py +++ b/databricks-tools-core/tests/integration/unity_catalog/test_sharing.py @@ -21,13 +21,11 @@ create_share, add_table_to_share, remove_table_from_share, - delete_share, grant_share_to_recipient, revoke_share_from_recipient, list_recipients, get_recipient, create_recipient, - delete_recipient, list_providers, ) From de782e89287b4a36e9002c02e7534bef76934fd7 Mon Sep 17 00:00:00 2001 From: CAholder Date: Tue, 3 Mar 2026 18:22:20 -0800 Subject: [PATCH 19/20] removed unreferenced files and updated .test/README.md --- .test/README.md | 251 ++++++++++++++++++ .../src/skill_test/optimize/gskill/README.md | 128 --------- .../skill_test/optimize/gskill/__init__.py | 5 - .../skill_test/optimize/gskill/pipeline.py | 141 ---------- 4 files changed, 251 insertions(+), 274 deletions(-) delete mode 100644 .test/src/skill_test/optimize/gskill/README.md delete mode 100644 .test/src/skill_test/optimize/gskill/__init__.py delete mode 100644 .test/src/skill_test/optimize/gskill/pipeline.py diff --git a/.test/README.md b/.test/README.md index 4de81e7d..015133ad 100644 --- a/.test/README.md +++ b/.test/README.md @@ -32,6 +32,77 @@ SKILL.md files teach AI agents (like Claude Code) how to use Databricks features GEPA's reflection LM reads the `side_info` diagnostics, proposes mutations, evaluates them, and selects the best via Pareto frontier. The critical insight: the richer the `side_info` diagnostics, the better GEPA's mutations. +### Evaluation Methodology: How We Measure Skill Quality + +Before understanding the judges and scoring, it's important to understand **what we're measuring and why the measurement is trustworthy**. + +#### The core question: "Does this skill actually help?" + +A SKILL.md is only valuable if an agent produces **better responses with the skill than without it**. This is a testable claim — we can generate responses both ways and compare. 
That comparison is the foundation of all evaluation and optimization in this framework. + +#### Two layers of comparison + +There are two distinct comparisons happening — understanding both is key to reading the scores: + +1. **Within each evaluation** (WITH vs WITHOUT skill): measures whether a given SKILL.md adds value over a bare LLM. This is what `quality_with` and `quality_without` refer to. +2. **Across the optimization loop** (original vs optimized): measures whether GEPA's mutations improved the SKILL.md. This is what `original_score` vs `optimized_score` refer to. + +The first comparison runs inside the evaluator on every iteration. The second comparison runs in the runner to decide whether to keep GEPA's changes. + +#### The WITH vs WITHOUT experimental design + +Every evaluation follows a controlled experiment that measures whether a specific SKILL.md candidate helps the LLM produce better responses: + +1. **WITH-skill trial** (`quality_with`) — An LLM generates a response with the SKILL.md injected as system context. The skill teaches the model Databricks-specific patterns, syntax, and constraints it wouldn't otherwise know. +2. **WITHOUT-skill trial** (`quality_without`) — The **same LLM** generates a response to the **same prompt** with **no SKILL.md in context**. This is the control — it shows what the model already knows on its own. **This is NOT "without optimization"** — it is the bare model with no skill document at all. +3. **Judge both** — An MLflow judge scores each response against the test case's expected facts, patterns, and guidelines, returning a 0.0-1.0 quality score plus a written rationale. + +The WITHOUT-skill response is **computed once and cached by prompt hash** — since the model and prompt don't change, the baseline is stable across all GEPA iterations. This means every candidate SKILL.md is compared against the same fixed control (the bare model). 
+ +#### What "baseline score" means + +Before optimization begins, the runner evaluates the **original SKILL.md** on all training tasks using the WITH/WITHOUT protocol above. This produces: + +- A **per-task score** — the composite score (see [Scoring Weights](#scoring-weights)) for each test case +- A **mean baseline score** — the average across all tasks (e.g., `0.909`) +- **Diagnostic labels** — each task is classified: + - **OK** — skill helped (quality delta > +0.05) + - **NEEDS_SKILL** — WITH-skill quality is below 0.5 (skill isn't teaching enough) + - **REGRESSION** — skill actively hurt the response (quality delta < −0.05) + +This baseline tells you exactly where the skill stands *before* any optimization. + +#### What "improvement" means (the second layer) + +This is the **outer comparison** — original SKILL.md vs optimized SKILL.md. After GEPA produces an optimized candidate, it's re-evaluated on all training tasks using the same WITH/WITHOUT protocol. Improvement is the difference between the optimized mean score and the original mean score: + +``` +improvement = optimized_score - original_score +``` + +Both scores come from the same evaluator, which internally runs the WITH vs WITHOUT comparison. So "improvement" means the optimized SKILL.md produced a larger quality delta (WITH minus WITHOUT) than the original SKILL.md did — i.e., the optimized skill helps the LLM more than the original skill did. + +This is **not** a subjective assessment. Both scores come from the same judges, same prompts, same cached WITHOUT-skill baselines. The only variable is the SKILL.md content. 
+ +The composite score itself is a weighted combination of four dimensions (detailed in [Scoring Weights](#scoring-weights)): + +| Dimension | What it measures | Why it matters | +|-----------|-----------------|----------------| +| **Skill Effectiveness (40%)** | `quality_with - quality_without` | The skill's unique contribution — what the model gets right *because* of the skill | +| **Absolute Quality (30%)** | `quality_with` score | Overall response quality with the skill present | +| **Structure (5%)** | Python/SQL syntax validity | Code in the skill must be syntactically correct | +| **Token Efficiency (25%)** | Token count vs original | Smaller skills save context window — candidates that shrink get a bonus up to 1.15x | + +A skill that scores 0.91 after optimization vs 0.88 at baseline has a measurable, reproducible improvement of +0.03 — driven by higher quality deltas, fewer regressions, or better token efficiency. + +#### Why this is rigorous, not made up + +- **Same model, same prompts** — the only variable is the skill content, isolating its effect +- **Cached baselines** — WITHOUT-skill responses don't change between iterations, so score deltas are real +- **Judge rationale** — every score comes with a written explanation of which facts were present/missing and which patterns matched/failed, making scores auditable +- **Train/val split** — with 5+ test cases, stratified splitting prevents overfitting to the training set +- **Deterministic structure checks** — syntax validation and pattern adherence use regex/AST parsing, not LLM judgment + ### MLflow Judges as the Evaluator The evaluator uses [MLflow's `make_judge`](https://mlflow.org/docs/latest/llms/llm-evaluate/index.html) to score responses. 
Two judges run by default during optimization: @@ -121,6 +192,48 @@ Before optimization starts, `_evaluate_on_tasks()` runs the evaluator on ALL tra | **5%** | Structure | Python/SQL syntax validation | | **25%** | Token Efficiency | Smaller = higher score (bonus up to 1.15x) | +### How Multi-Pass Optimization Works + +Optimization runs as a multi-pass loop where each pass feeds its best result into the next. This section explains what happens inside a single pass and how the runner decides when to stop. + +#### What happens inside a single GEPA pass + +GEPA's `optimize_anything` receives the seed candidate (current SKILL.md text), the evaluator, the training dataset, and the preset config. Within a pass, GEPA runs up to `max_metric_calls` iterations — **15** for `quick`, **50** for `standard`, **150** for `thorough`. + +Each iteration follows this cycle: + +1. **Reflect** — The reflection LM reads `side_info` from the previous evaluation. This includes the full judge rationale: which expected facts were missing, which regex patterns weren't found, which guidelines were violated, and whether regressions occurred. +2. **Mutate** — Based on the rationale, the reflection LM proposes a targeted mutation to the SKILL.md (or tool docstring). Mutations are surgical — informed by exactly what the judges flagged. +3. **Evaluate** — The evaluator scores the mutated candidate on a task from the dataset. This involves generating responses WITH the candidate, running MLflow judges, and computing the composite score. +4. **Select** — GEPA tracks a Pareto frontier of best candidates. If the mutation improves the frontier, it's kept; otherwise, it's discarded. + +The key insight: because `side_info` contains **full judge rationale** (not truncated summaries), the reflection LM sees exactly which facts were missed, which patterns were absent, and which regressions occurred — leading to more targeted mutations. 
+ +#### How multi-pass works and when it stops + +The runner (`runner.py`) wraps GEPA in a multi-pass loop (default: up to 5 passes, controlled by `--max-passes`): + +1. **Pass N starts** — The best candidate from pass N-1 (or the original SKILL.md for pass 1) becomes the seed. +2. **GEPA optimizes** — Runs up to `max_metric_calls` iterations within the pass. +3. **Re-evaluate** — After the pass completes, the best candidate is re-evaluated on **all** training tasks to get a stable score. +4. **Compare** — The pass score is compared to the previous best score. +5. **Decision:** + - If improvement > **0.0005** (the `improvement_threshold`): the best candidate becomes the seed for pass N+1, and optimization continues. + - If improvement ≤ **0.0005**: early stop — no further passes are run. + +This creates a refinement chain: each pass starts from the previous pass's best, allowing incremental improvements that compound across passes. Early stopping prevents wasting compute when the skill has converged. + +#### Component scaling + +When optimizing multiple components (e.g., SKILL.md + tool modules with `--include-tools`), metric calls scale: + +- **Base formula:** `base_calls × num_components` +- **Per-preset caps:** quick → 45, standard → 150, thorough → 300 +- **Global cap:** 300 (applied for slower reflection models) +- **Round-robin:** GEPA's component selector alternates which component to mutate each iteration, so all components get roughly equal optimization effort. + +For example, with `--include-tools --tool-modules sql serving` (3 components: `skill_md` + `tools_sql` + `tools_serving`), a `quick` preset uses min(15 × 3, 45) = **45** metric calls per pass. + --- ## Quick Start @@ -137,6 +250,13 @@ export OPENAI_API_KEY="sk-..." export GEPA_REFLECTION_LM="openai/gpt-4o" export GEPA_GEN_LM="openai/gpt-4o" +# OR use Databricks AI Gateway (routes through a centralized gateway with rate limits and logging) +export DATABRICKS_API_KEY="dapi..." 
+export DATABRICKS_API_BASE="https://<workspace>.ai-gateway.cloud.databricks.com/mlflow/v1/serving-endpoints" +# IMPORTANT: When using AI Gateway, OPENAI_API_KEY must also be set to your Databricks API token. +# The MLflow judges and litellm call OpenAI-compatible endpoints, which read OPENAI_API_KEY for auth. +export OPENAI_API_KEY="$DATABRICKS_API_KEY" + # Optimize uv run python .test/scripts/optimize.py databricks-metric-views --preset quick --apply ``` @@ -364,6 +484,28 @@ uv run python .test/scripts/optimize.py --preset thorough # 150 iterat --requirement "..." # Inline requirement (repeatable) ``` +### Flag Details + +- **`--dry-run`**: Runs baseline evaluation on all training tasks — scores the current SKILL.md WITH and WITHOUT the skill in context, shows per-task scores and a cost estimate, then exits without running optimization. Useful for checking your baseline before committing to a full run. + +- **`--apply`**: Runs optimization to completion, then immediately writes the optimized SKILL.md back to `databricks-skills/`. Combines `optimize` + `--apply-last` in one step. Use when you're confident in the preset and want a hands-off workflow. + +- **`--apply-last`**: Loads the previously saved `optimized_SKILL.md` and `last_optimization.json` from `.test/skills/<skill-name>/` and writes the optimized content back to the repo. Does **not** re-run optimization. Use after reviewing a previous run's diff to confirm the changes look good. + +- **`--include-tools`**: Makes MCP tool docstrings optimizable GEPA components alongside SKILL.md. Both are mutated by GEPA via round-robin selection. Tool descriptions are no longer read-only context — they become first-class candidates. Metric calls scale with component count (see [Component scaling](#component-scaling)). + +- **`--tools-only`**: Drops SKILL.md entirely. Only tool module docstrings become GEPA components. 
Uses a **cross-skill dataset** (tasks sampled from ALL skills with `ground_truth.yaml`, max 5 per skill) so optimized descriptions generalize across skills rather than overfitting to one. + +- **`--tool-modules`**: Filters which tool modules are extracted for optimization. Without this flag, all modules are included. Example: `--tool-modules sql serving` optimizes only the `tools_sql` and `tools_serving` components. + +- **`--all`**: Discovers all skills with `ground_truth.yaml` in `.test/skills/`, runs optimization sequentially for each, and prints per-skill results plus a summary table at the end. + +- **`--run-dir`**: Enables GEPA checkpointing. Each pass saves state to `{run_dir}/pass_{N}/`. If the same `--run-dir` is passed on a subsequent run, GEPA resumes from the last checkpoint. Use `touch {run_dir}/pass_N/gepa.stop` for graceful mid-pass stop. + +- **`--max-passes`**: Maximum number of optimization passes (default 5). Each pass feeds the previous best as seed. Early stops if improvement falls below the threshold (0.0005). Lower values trade potential quality for faster completion. + +- **`--token-budget`**: Hard ceiling on candidate token count. The efficiency scorer penalizes candidates that exceed this budget. Also available via `GEPA_TOKEN_BUDGET` env var. + ### Model Configuration | Env Var | Default | Purpose | @@ -437,6 +579,115 @@ test_cases: --- +## Test Case & Configuration Files + +Each skill under `.test/skills//` has two configuration files that drive evaluation and optimization. + +### `ground_truth.yaml` — What the skill must teach + +The evaluation dataset. Each test case represents a user prompt and the expected behavior when the skill is in context. 
+ +**Full field schema:** + +| Field | Required | Description | +|-------|----------|-------------| +| `metadata.skill_name` | yes | Identifier matching the skill directory name | +| `metadata.version` | yes | Schema version (e.g., `"1.0"`) | +| `metadata.created_at` | no | ISO timestamp of creation | +| `test_cases[].id` | yes | Unique identifier (convention: `<skill>_<description>_<number>`, e.g. `metric-views_create_sql_001`) | +| `test_cases[].inputs.prompt` | yes | The user question sent to the generation model | +| `test_cases[].outputs.response` | no | Expected reference answer. Used for judge comparison, **not** exact matching. Omit if you only want pattern/fact checks. | +| `test_cases[].expectations.expected_facts` | yes | List of factual claims the response must contain. The quality judge checks each one. | +| `test_cases[].expectations.expected_patterns` | no | Regex patterns with fields: `pattern`, `description`, and optionally `min_count` / `max_count`. Checked deterministically. | +| `test_cases[].expectations.guidelines` | no | Soft rules evaluated by the quality judge for things regex can't check (e.g., "Should explain why SELECT * doesn't work"). | +| `test_cases[].metadata.category` | recommended | Used for stratified train/val splitting. Common values: `happy_path`, `error_handling`, `advanced`, `conceptual`, `edge_case`. 
| + +**Example with all fields:** + +```yaml +metadata: + skill_name: databricks-metric-views + version: "1.0" + created_at: "2025-01-15T10:00:00Z" + +test_cases: + - id: metric-views_create_sql_001 + inputs: + prompt: "Create a metric view for order analytics" + outputs: + response: | + ```sql + CREATE OR REPLACE VIEW main.default.order_metrics + WITH METRICS LANGUAGE YAML + $$ + source: main.default.orders + measures: + - name: Total Revenue + expr: SUM(amount) + $$ + ``` + expectations: + expected_facts: + - "Uses CREATE OR REPLACE VIEW with WITH METRICS LANGUAGE YAML" + - "Defines measures with name and expr using aggregate functions" + expected_patterns: + - pattern: "WITH METRICS LANGUAGE YAML" + description: "Metric view DDL syntax" + min_count: 1 + - pattern: "MEASURE\\(" + description: "MEASURE() function for querying" + min_count: 0 + max_count: 5 + guidelines: + - "Must use WITH METRICS LANGUAGE YAML syntax, not CREATE METRIC VIEW" + - "Should include a complete YAML block between $$ delimiters" + metadata: + category: happy_path +``` + +### `manifest.yaml` — How to evaluate the skill + +Configures which scorers run and what quality thresholds apply during evaluation. + +**Full field schema:** + +| Field | Description | +|-------|-------------| +| `skill_name` | Identifier matching the skill directory name | +| `scorers.enabled` | List of deterministic scorers to run: `python_syntax`, `sql_syntax`, `pattern_adherence`, `no_hallucinated_apis`, `expected_facts_present` | +| `scorers.llm_scorers` | List of LLM-based scorers: `Safety`, `guidelines_from_expectations`, `Guidelines` | +| `scorers.default_guidelines` | Fallback guidelines applied when a test case doesn't specify its own `guidelines` field | +| `quality_gates` | Minimum score thresholds per scorer (e.g., `syntax_valid: 1.0`, `pattern_adherence: 0.9`). Failing a gate flags the test case. 
| +| `scorers.trace_expectations.tool_limits` | Max number of tool calls allowed (for trace-based scoring) | +| `scorers.trace_expectations.token_budget` | Max tokens allowed in the response | +| `scorers.trace_expectations.required_tools` | Tools that must be called (e.g., `["execute_sql"]`) | +| `scorers.trace_expectations.banned_tools` | Tools that must not be called | + +**Example:** + +```yaml +skill_name: databricks-metric-views + +scorers: + enabled: + - sql_syntax + - pattern_adherence + - expected_facts_present + llm_scorers: + - Safety + - guidelines_from_expectations + default_guidelines: + - "Responses must use Databricks-specific syntax, not generic SQL" + - "Code examples must be runnable without modification" + +quality_gates: + syntax_valid: 1.0 + pattern_adherence: 0.9 + safety: 1.0 +``` + +--- + ## Architecture ``` diff --git a/.test/src/skill_test/optimize/gskill/README.md b/.test/src/skill_test/optimize/gskill/README.md deleted file mode 100644 index 1c4621b0..00000000 --- a/.test/src/skill_test/optimize/gskill/README.md +++ /dev/null @@ -1,128 +0,0 @@ -# gskill: Auto-Generate Optimized Skills for Your Repository - -`gskill` uses [GEPA](https://github.com/databricks/gepa) to automatically generate optimized SKILL.md files that teach Claude Code how to work with your specific Databricks project patterns. - -## What It Does - -1. **Scans your repository** for Databricks patterns (SDK usage, SQL, notebooks, configs) -2. **Generates a SKILL.md** optimized for AI agent consumption -3. **Validates quality** using the skill-test evaluation framework -4. 
**Outputs to `.claude/skills/`** so Claude Code automatically picks it up - -## Quick Start - -### Prerequisites - -```bash -# Install GEPA -pip install gepa>=0.1.0 - -# Set up LLM API keys (for GEPA reflection) -export OPENAI_API_KEY=your-key-here -``` - -### Generate a Skill - -```bash -# From the ai-dev-kit repository -cd /path/to/ai-dev-kit - -# Generate a skill for your project repo -uv run python -c " -from skill_test.optimize.gskill import run_gskill -result = run_gskill('/path/to/your/databricks-project') -print(f'Generated: {result[\"skill_path\"]}') -" -``` - -### Using with Claude Code - -Once generated, the skill is automatically available to Claude Code: - -``` -your-repo/ -├── .claude/ -│ └── skills/ -│ └── your-repo/ -│ └── SKILL.md # <- Generated skill -├── src/ -│ └── ... -``` - -Claude Code reads `.claude/skills/*/SKILL.md` files and uses them as context when helping with your code. - -## Configuration - -### Presets - -| Preset | Iterations | Best For | -|--------|-----------|----------| -| `quick` | 15 | Initial generation, small repos | -| `standard` | 50 | Most repos (default) | -| `thorough` | 150 | Large repos, production quality | - -### Custom Context - -Provide additional files for gskill to consider: - -```python -from skill_test.optimize.gskill import run_gskill - -result = run_gskill( - repo_path="/path/to/your/repo", - preset="standard", - context_files=[ - "docs/architecture.md", - "README.md", - "src/config.py", - ], -) -``` - -## Evaluating Generated Skills - -Use the ai-dev-kit evaluation framework to validate generated skills: - -```bash -# 1. Add test cases for the generated skill -uv run python .test/scripts/init_skill.py your-skill-name - -# 2. Add ground truth test cases -uv run python .test/scripts/add.py your-skill-name - -# 3. Run evaluation -uv run python .test/scripts/run_eval.py your-skill-name - -# 4. 
Optimize further with GEPA -uv run python .test/scripts/optimize.py your-skill-name -``` - -## How It Works - -``` -Your Repository - │ - ▼ -┌──────────┐ ┌──────────┐ ┌──────────┐ -│ Scan for │ --> │ Generate │ --> │ Validate │ --> SKILL.md -│ patterns │ │ SKILL.md │ │ quality │ -└──────────┘ └──────────┘ └──────────┘ - │ - ▼ - GEPA optimize_anything - (reflects on quality, - iterates to improve) -``` - -GEPA's `optimize_anything` treats the SKILL.md as the artifact to optimize. It: -- Starts with patterns found in your repo as the seed -- Uses GEPA's reflection LM to propose improvements -- Scores each iteration for quality, correctness, and conciseness -- Selects the best candidate via Pareto frontier optimization - -## Tips - -- **Keep skills focused**: One skill per domain (e.g., separate skills for "data pipeline" and "model serving") -- **Add test cases**: Skills with ground truth test cases optimize much better than bootstrap mode -- **Iterate**: Run `optimize.py` after adding test cases for incremental improvement -- **Token budget**: Skills should be as concise as possible -- every token consumed is agent context window budget diff --git a/.test/src/skill_test/optimize/gskill/__init__.py b/.test/src/skill_test/optimize/gskill/__init__.py deleted file mode 100644 index d27b7b51..00000000 --- a/.test/src/skill_test/optimize/gskill/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -"""gskill: Generate optimized skills for customer repositories using GEPA.""" - -from .pipeline import run_gskill - -__all__ = ["run_gskill"] diff --git a/.test/src/skill_test/optimize/gskill/pipeline.py b/.test/src/skill_test/optimize/gskill/pipeline.py deleted file mode 100644 index 76a263f5..00000000 --- a/.test/src/skill_test/optimize/gskill/pipeline.py +++ /dev/null @@ -1,141 +0,0 @@ -"""gskill pipeline: generate optimized skills for customer repositories. - -Uses optimize_anything to produce SKILL.md files from repository context. 
-""" - -import ast -import re -from pathlib import Path -from typing import Any - -from gepa.optimize_anything import optimize_anything -import gepa.optimize_anything as oa - -from ..config import get_preset - - -def run_gskill( - repo_path: str | Path, - skill_name: str | None = None, - output_dir: str | Path | None = None, - preset: str = "standard", - context_files: list[str] | None = None, -) -> dict[str, Any]: - """Generate an optimized skill for a customer repository. - - Args: - repo_path: Path to the customer's repository - skill_name: Name for the generated skill (auto-detected if None) - output_dir: Override output directory - preset: GEPA optimization preset - context_files: Additional files to provide as context - - Returns: - Dict with generated skill path and metadata - """ - repo_path = Path(repo_path).resolve() - if not repo_path.exists(): - raise FileNotFoundError(f"Repository not found: {repo_path}") - - config = get_preset(preset) - - if skill_name is None: - skill_name = repo_path.name - - repo_context = _scan_repo(repo_path, context_files) - - seed_content = ( - f"# {skill_name}\n\n## Overview\n\nPatterns and best practices for the {skill_name} project.\n\n" + repo_context - ) - - def evaluate(candidate: str, example: dict) -> tuple[float, dict]: - """Score structural quality of generated skill.""" - score = 0.0 - parts = 0 - - if re.search(r"^#{1,3}\s+", candidate, re.MULTILINE): - score += 1.0 - parts += 1 - - code_blocks = re.findall(r"```(\w+)\n(.*?)```", candidate, re.DOTALL) - if code_blocks: - score += 1.0 - parts += 1 - py_blocks = [b for lang, b in code_blocks if lang == "python"] - if py_blocks: - valid = sum(1 for b in py_blocks if _parses(b)) - score += valid / len(py_blocks) - parts += 1 - - word_count = len(candidate.split()) - if 200 <= word_count <= 5000: - score += 1.0 - parts += 1 - - final = score / parts if parts > 0 else 0.0 - oa.log(f"Structure score: {final:.2f}, words: {word_count}") - - return final, {"structure_score": 
final, "word_count": word_count} - - trainset = [ - {"input": f"Help me understand patterns in {skill_name}", "additional_context": {}, "answer": ""}, - {"input": f"Show code examples from {skill_name}", "additional_context": {}, "answer": ""}, - ] - - result = optimize_anything( - seed_candidate=seed_content, - evaluator=evaluate, - dataset=trainset, - objective=f"Generate a SKILL.md that teaches an AI coding agent the patterns in {skill_name}.", - background=( - "SKILL.md files teach AI agents (Claude Code) repository-specific patterns. " - "Focus on Databricks patterns: Unity Catalog, MLflow, Spark, Delta Lake, etc. " - "Be CONCISE and ACTION-ORIENTED. Lead with code examples." - ), - config=config, - ) - - generated_content = result.best_candidate - if isinstance(generated_content, dict): - generated_content = list(generated_content.values())[0] - - if output_dir is None: - output_dir = repo_path / ".claude" / "skills" / skill_name - else: - output_dir = Path(output_dir) - - output_dir.mkdir(parents=True, exist_ok=True) - skill_path = output_dir / "SKILL.md" - skill_path.write_text(generated_content) - - return { - "skill_name": skill_name, - "skill_path": str(skill_path), - "content_length": len(generated_content), - "repo_path": str(repo_path), - "preset": preset, - } - - -def _parses(code: str) -> bool: - try: - ast.parse(code) - return True - except SyntaxError: - return False - - -def _scan_repo(repo_path: Path, context_files: list[str] | None = None) -> str: - context_parts = [] - if context_files: - for f in context_files: - p = Path(f) if Path(f).is_absolute() else repo_path / f - if p.exists(): - content = p.read_text()[:5000] - context_parts.append(f"### {p.name}\n\n```\n{content}\n```\n") - - readme = repo_path / "README.md" - if readme.exists() and not context_files: - context_parts.append(f"### README\n\n{readme.read_text()[:3000]}\n") - - return "\n".join(context_parts) if context_parts else "" From 4256c020c1dda2c5de20d1f4cf099061b807db30 Mon Sep 
17 00:00:00 2001 From: calreynolds Date: Wed, 4 Mar 2026 10:39:23 -0500 Subject: [PATCH 20/20] fix: remove hardcoded email from app.yaml, deduplicate DNS resolution, validate schema name - Replace austin.choi@databricks.com with empty placeholder in MLFLOW_EXPERIMENT_NAME - Import _resolve_hostname from database.py instead of duplicating in alembic/env.py - Add regex validation on schema name before interpolation into SQL Co-Authored-By: Claude Opus 4.6 --- databricks-builder-app/alembic/env.py | 34 +++++---------------------- databricks-builder-app/app.yaml | 2 +- 2 files changed, 7 insertions(+), 29 deletions(-) diff --git a/databricks-builder-app/alembic/env.py b/databricks-builder-app/alembic/env.py index 0b8110db..4d39d0ce 100644 --- a/databricks-builder-app/alembic/env.py +++ b/databricks-builder-app/alembic/env.py @@ -5,8 +5,6 @@ """ import os -import socket -import subprocess from logging.config import fileConfig from alembic import context @@ -18,6 +16,7 @@ # Import models for autogenerate support from server.db.models import Base +from server.db.database import _resolve_hostname # this is the Alembic Config object config = context.config @@ -33,32 +32,6 @@ _resolved_hostaddr = None -def _resolve_hostname(hostname): - """Resolve hostname to IP using dig fallback for macOS DNS issues.""" - try: - result = socket.getaddrinfo(hostname, 5432) - if result: - return result[0][4][0] - except socket.gaierror: - pass - - try: - result = subprocess.run( - ['dig', '+short', hostname, 'A'], - capture_output=True, - text=True, - timeout=10, - ) - ips = [line for line in result.stdout.strip().split('\n') if line and line[0].isdigit()] - if ips: - print(f'[Alembic] Resolved {hostname} -> {ips[0]} via dig') - return ips[0] - except Exception: - pass - - return None - - def get_url_and_connect_args(): """Get database URL and connect_args from environment. 
@@ -153,6 +126,11 @@ def run_migrations_online(): # Get schema name from Alembic config or environment schema_name = config.get_main_option('lakebase_schema_name') or os.environ.get('LAKEBASE_SCHEMA_NAME', 'builder_app') + # Validate schema name to prevent SQL injection (only allow alphanumeric + underscores) + import re + if not re.match(r'^[a-zA-Z_][a-zA-Z0-9_]*$', schema_name): + raise ValueError(f'Invalid schema name: {schema_name!r} — must be alphanumeric/underscores only') + # Add search_path to connect_args so tables are created in the custom schema connect_args.setdefault('options', f'-c search_path={schema_name},public') diff --git a/databricks-builder-app/app.yaml b/databricks-builder-app/app.yaml index a57d8526..1349cb8c 100644 --- a/databricks-builder-app/app.yaml +++ b/databricks-builder-app/app.yaml @@ -114,7 +114,7 @@ env: value: "databricks-uc" # Optional: Default MLflow experiment for traces (can be overridden per-session in the UI) - name: MLFLOW_EXPERIMENT_NAME - value: "/Users/austin.choi@databricks.com/claude-code-traces" + value: "" # Set to your MLflow experiment path, e.g. "/Users/your.email@databricks.com/claude-code-traces" # ============================================================================= # Permission Configuration