Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
35c48c6
adding skill evals and skill optimization capabilities using GEPA opt…
auschoi96 Feb 21, 2026
24419cb
optimization gepa updates and a demo/tutorial notebook
auschoi96 Feb 22, 2026
5f0c102
fixed config json extra parameter issues
auschoi96 Feb 22, 2026
33ae5a5
added optimizations for tools
auschoi96 Feb 22, 2026
7b6e1ea
fix to use gepa 0.1.0
auschoi96 Feb 22, 2026
bafd451
addition and updates to evaluation powering GEPA. added capability to…
auschoi96 Feb 24, 2026
d3757df
readme updates
auschoi96 Feb 24, 2026
ba0705f
Merge branch 'databricks-solutions:main' into main
auschoi96 Feb 26, 2026
37be3ee
Merge remote-tracking branch 'upstream/main'
auschoi96 Feb 27, 2026
ede86aa
readme updates and skillbench inclusion
auschoi96 Feb 27, 2026
e6c6410
Merge branch 'databricks-solutions:main' into main
auschoi96 Feb 27, 2026
54b783d
Merge branch 'main' of https://github.com/auschoi96/ai-dev-kit
auschoi96 Feb 27, 2026
d0a872e
Merge branch 'databricks-solutions:main' into main
auschoi96 Mar 2, 2026
50b7cdd
add the parsing skill for parsing documents/custom rag
NatyraB Mar 2, 2026
c0ca2c2
feat: add lakehouse monitoring reference to Unity Catalog skill
Feb 27, 2026
4ca586a
feat: updated lakehouse monitoring to use new naming data profiling
Mar 2, 2026
0953f6f
updated client files with latest sdk syntax recommendations
dbderek Mar 1, 2026
3929bc6
install databricks parsing skill
NatyraB Mar 2, 2026
93292d1
refactoring to use mlflow
auschoi96 Mar 3, 2026
c3c2772
refactoring to use mlflow
auschoi96 Mar 3, 2026
5dbabfb
Merge remote-tracking branch 'upstream/main'
auschoi96 Mar 3, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
477 changes: 320 additions & 157 deletions .test/README.md

Large diffs are not rendered by default.

502 changes: 502 additions & 0 deletions .test/notebooks/gepa_skill_optimization_demo.ipynb

Large diffs are not rendered by default.

5 changes: 4 additions & 1 deletion .test/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,10 @@ dependencies = [
[project.optional-dependencies]
databricks = ["databricks-sdk>=0.20.0"]
dev = ["pytest>=8.0", "pytest-asyncio>=0.23"]
all = ["skill-test[databricks,dev]"]
optimize = ["gepa>=0.1.0", "tiktoken>=0.7.0"]
# judges group: install separately when mlflow-deepeval/mlflow-ragas are published
# judges = ["mlflow-deepeval>=0.1.0", "mlflow-ragas>=0.1.0"]
all = ["skill-test[databricks,dev,optimize]"]

[project.scripts]
skill-test = "skill_test.cli:main"
Expand Down
272 changes: 272 additions & 0 deletions .test/scripts/add_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,272 @@
#!/usr/bin/env python3
"""Manually add a test case to a skill's ground_truth.yaml.

Usage:
# Interactive mode — prompts for each field
uv run python .test/scripts/add_example.py databricks-model-serving

# Inline mode — provide prompt and response directly
uv run python .test/scripts/add_example.py databricks-model-serving \
--prompt "Create a ChatAgent with tool calling" \
--response-file /path/to/response.md \
--facts "Uses ChatAgent class" "Implements predict method" \
--patterns "ChatAgent" "def predict"

# From clipboard
uv run python .test/scripts/add_example.py databricks-model-serving --from-clipboard
"""

import argparse
import re
import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).resolve().parent))
from _common import setup_path

setup_path()


def _detect_language(response: str) -> str | None:
"""Auto-detect code language from response code blocks."""
langs = re.findall(r"```(\w+)\n", response)
if not langs:
return None
# Most common language wins
from collections import Counter
counts = Counter(l for l in langs if l != "text")
return counts.most_common(1)[0][0] if counts else None


def _auto_extract_patterns(response: str) -> list[str]:
"""Extract patterns from code blocks."""
patterns = []
for match in re.finditer(r"```(?:python)\n(.*?)```", response, re.DOTALL):
code = match.group(1)
for m in re.finditer(r"class\s+(\w+)", code):
patterns.append(m.group(1))
for m in re.finditer(r"def\s+(\w+)", code):
patterns.append(m.group(1))

for match in re.finditer(r"```(?:sql)\n(.*?)```", response, re.DOTALL):
code = match.group(1)
for m in re.finditer(r"(?:CREATE|ALTER)\s+(?:TABLE|VIEW)\s+(\S+)", code, re.I):
patterns.append(m.group(1))

return list(dict.fromkeys(patterns))


def _next_id(skill_name: str, existing_ids: set[str]) -> str:
"""Generate the next sequential ID for a skill."""
prefix = skill_name.replace("-", "_")
idx = 1
while True:
candidate = f"{prefix}_{idx:03d}"
if candidate not in existing_ids:
return candidate
idx += 1


def _read_clipboard() -> str:
    """Return clipboard text via ``pbpaste`` (macOS) or ``xclip`` (Linux).

    Tries each tool in order; exits the process with status 1 when
    neither invocation succeeds.
    """
    import subprocess

    commands = [
        ["pbpaste"],
        ["xclip", "-selection", "clipboard", "-o"],
    ]
    for cmd in commands:
        try:
            proc = subprocess.run(cmd, capture_output=True, text=True, check=True)
        except (subprocess.CalledProcessError, FileNotFoundError):
            continue
        return proc.stdout
    print("Error: Could not read clipboard (tried pbpaste and xclip)")
    sys.exit(1)


def main():
    """CLI entry point: append a new test case to a skill's ground_truth.yaml.

    The prompt/response pair can come from flags, a file, the system
    clipboard, or interactive stdin. Patterns are auto-extracted from
    code blocks in the response and merged with any user-supplied
    ``--patterns``; the test case is previewed and (interactively)
    confirmed before being written.

    Exits with status 1 on missing skill directory or missing prompt,
    and status 0 when the user aborts at the confirmation step.
    """
    parser = argparse.ArgumentParser(
        description="Add a test case to a skill's ground_truth.yaml",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument(
        "skill_name",
        help="Name of the skill (e.g., databricks-model-serving)",
    )
    parser.add_argument(
        "--prompt", "-p",
        default=None,
        help="The user prompt for the test case",
    )
    parser.add_argument(
        "--response", "-r",
        default=None,
        help="The expected response text (inline)",
    )
    parser.add_argument(
        "--response-file",
        type=Path,
        default=None,
        help="Path to a file containing the expected response",
    )
    parser.add_argument(
        "--facts", "-f",
        nargs="*",
        default=None,
        help="Expected facts that must appear in the response",
    )
    parser.add_argument(
        "--patterns",
        nargs="*",
        default=None,
        help="Expected patterns (regex) that must match in the response",
    )
    parser.add_argument(
        "--category", "-c",
        default="happy_path",
        help="Test case category (default: happy_path)",
    )
    parser.add_argument(
        "--from-clipboard",
        action="store_true",
        help="Read prompt and response from clipboard (separated by ---)",
    )
    parser.add_argument(
        "--id",
        default=None,
        help="Override the auto-generated test case ID",
    )

    args = parser.parse_args()

    # Deferred imports: only needed once argument parsing succeeded.
    # (Removed unused import of get_dataset_source.)
    import yaml
    from skill_test.dataset import YAMLDatasetSource

    # Validate skill exists
    skill_dir = Path(".test/skills") / args.skill_name
    gt_path = skill_dir / "ground_truth.yaml"

    if not skill_dir.exists():
        print(f"Error: skill directory not found: {skill_dir}")
        print("Available skills:")
        for d in sorted(Path(".test/skills").iterdir()):
            if d.is_dir() and not d.name.startswith("_"):
                print(f"  {d.name}")
        sys.exit(1)

    # Load existing record IDs so the auto-generated ID does not collide.
    existing_ids = set()
    if gt_path.exists():
        try:
            source = YAMLDatasetSource(gt_path)
            existing = source.load()
            existing_ids = {r.id for r in existing}
        except Exception:
            # Best effort: a malformed file just means we cannot guarantee
            # ID uniqueness; appending still works below.
            pass

    # Resolve prompt/response from flags, clipboard, file, or stdin —
    # later sources only fill in what earlier ones left empty.
    prompt = args.prompt
    response = args.response

    if args.from_clipboard:
        clipboard = _read_clipboard()
        if "---" in clipboard:
            # Convention: prompt and response separated by the first ---.
            parts = clipboard.split("---", 1)
            prompt = parts[0].strip()
            response = parts[1].strip()
        else:
            prompt = clipboard.strip()
            print("Clipboard content set as prompt (no --- separator found for response)")

    if args.response_file:
        response = args.response_file.read_text()

    if not prompt:
        print("Enter the user prompt (Ctrl+D to finish):")
        prompt = sys.stdin.read().strip()

    if not prompt:
        print("Error: prompt is required")
        sys.exit(1)

    if not response:
        print("Enter the expected response (Ctrl+D to finish):")
        response = sys.stdin.read().strip()

    # Generate ID
    test_id = args.id or _next_id(args.skill_name, existing_ids)

    # Patterns are auto-extracted from the response; facts only come from
    # the user (renamed from the misleading "auto_facts").
    auto_patterns = _auto_extract_patterns(response) if response else []
    facts = args.facts or []
    user_patterns = args.patterns or []

    # Merge with user patterns first so they take precedence in ordering;
    # dict.fromkeys de-duplicates while preserving order.
    all_patterns = list(dict.fromkeys(user_patterns + auto_patterns))

    # Detect language
    language = _detect_language(response) if response else None

    # Build test case
    test_case = {
        "id": test_id,
        "inputs": {"prompt": prompt},
        "metadata": {
            "category": args.category,
            "source": "manual",
        },
    }

    if response:
        test_case["outputs"] = {"response": response}
        if language:
            test_case["metadata"]["language"] = language

    expectations = {}
    if facts:
        expectations["expected_facts"] = facts
    if all_patterns:
        expectations["expected_patterns"] = all_patterns
    if expectations:
        test_case["expectations"] = expectations

    # Show summary
    print("\n--- Test Case Preview ---")
    print(f"ID: {test_id}")
    print(f"Skill: {args.skill_name}")
    print(f"Prompt: {prompt[:100]}{'...' if len(prompt) > 100 else ''}")
    if response:
        print(f"Response: {len(response)} chars")
    if all_patterns:
        print(f"Patterns: {all_patterns}")
    if facts:
        print(f"Facts: {facts}")
    print(f"Category: {args.category}")

    # Confirm interactively; non-TTY (piped) invocations skip the prompt.
    if sys.stdin.isatty():
        confirm = input("\nAppend to ground_truth.yaml? [Y/n] ").strip().lower()
        if confirm and confirm != "y":
            print("Aborted.")
            sys.exit(0)

    # Save
    if gt_path.exists():
        with open(gt_path) as f:
            data = yaml.safe_load(f) or {"test_cases": []}
    else:
        gt_path.parent.mkdir(parents=True, exist_ok=True)
        data = {"test_cases": []}

    # setdefault guards against an existing file that parses to a mapping
    # without a "test_cases" key (previously a KeyError).
    data.setdefault("test_cases", []).append(test_case)

    with open(gt_path, "w") as f:
        yaml.dump(data, f, default_flow_style=False, sort_keys=False, allow_unicode=True)

    print(f"Added test case '{test_id}' to {gt_path}")


if __name__ == "__main__":
    main()
Loading
Loading