forked from hidai25/eval-view
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathMakefile
More file actions
195 lines (159 loc) · 6.03 KB
/
Makefile
File metadata and controls
195 lines (159 loc) · 6.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
# Declare every command target as phony so a file or directory with the
# same name (e.g. ./test, ./clean, ./dev) can never shadow the target and
# silently skip its recipe. Keep this list in sync with the rules below.
.PHONY: help install dev-install pip-install pip-dev \
	format lint typecheck check test test-cov \
	pip-format pip-lint pip-typecheck pip-check pip-test pip-test-cov \
	clean run-example agent-tests dev quick-test \
	gym gym-list gym-failures gym-security gym-agent \
	dogfood-agent dogfood-check dogfood-snapshot dogfood-run
# Default target.
# Prints an overview of every user-facing command. Keep this listing in
# sync with the targets below — when a target is added, document it here.
help:
	@echo "EvalView Development Commands"
	@echo ""
	@echo "Setup (choose pip OR uv):"
	@echo "  make install         Install package (uv sync)"
	@echo "  make dev-install     Install with all extras (uv sync --all-extras)"
	@echo "  make pip-install     Install package (pip install -e .)"
	@echo "  make pip-dev         Install with dev extras (pip install -e '.[dev]')"
	@echo ""
	@echo "Development (uv - faster):"
	@echo "  make format          Format code with black"
	@echo "  make lint            Lint code with ruff"
	@echo "  make typecheck       Type check with mypy"
	@echo "  make check           Run all checks (format + lint + typecheck)"
	@echo "  make test            Run tests with pytest"
	@echo "  make test-cov        Run tests with coverage report"
	@echo "  make quick-test      Run tests, stopping at the first failure"
	@echo ""
	@echo "Development (pip - traditional):"
	@echo "  make pip-format      Format code with black (pip)"
	@echo "  make pip-lint        Lint code with ruff (pip)"
	@echo "  make pip-typecheck   Type check with mypy (pip)"
	@echo "  make pip-check       Run all checks (pip)"
	@echo "  make pip-test        Run tests with pytest (pip)"
	@echo ""
	@echo "Utilities:"
	@echo "  make clean           Clean build artifacts and cache"
	@echo "  make run-example     Run example test case"
	@echo "  make agent-tests     Run EvalView agent tests (no CI required)"
	@echo ""
	@echo "Gym (Practice Evals):"
	@echo "  make gym             Run all gym scenarios"
	@echo "  make gym-list        List available scenarios"
	@echo "  make gym-failures    Run failure-mode scenarios only"
	@echo "  make gym-security    Run security scenarios only"
	@echo "  make gym-agent       Start the gym agent (localhost:2024)"
	@echo ""
	@echo "Dogfood (EvalView testing itself):"
	@echo "  make dogfood-agent   Start the deterministic mock agent (port 8002)"
	@echo "  make dogfood-check   Check for evaluation regressions"
	@echo "  make dogfood-snapshot Save results as the new golden baseline"
	@echo "  make dogfood-run     Run the full dogfood suite"
	@echo ""
# ============================================
# UV-based commands (faster, recommended)
# ============================================

## Install the package and its locked dependencies with uv.
install:
	uv sync

## Install the package plus every optional extra (dev tooling, etc.).
dev-install:
	uv sync --all-extras

## Reformat evalview/ and tests/ in place with black (100-column lines).
format:
	@echo "Formatting code with black..."
	uv run black evalview/ tests/ --line-length 100

## Lint evalview/ and tests/ with ruff (check only, no autofix).
lint:
	@echo "Linting code with ruff..."
	uv run ruff check evalview/ tests/

## Type-check the package with mypy in --strict mode.
typecheck:
	@echo "Type checking with mypy..."
	uv run mypy evalview/ --strict

## Run the full quality gate. Note: `format` rewrites files before
## `lint` and `typecheck` run, so checks see the formatted code.
check: format lint typecheck
	@echo "✅ All checks passed!"

## Run the test suite verbosely with pytest.
test:
	@echo "Running tests with pytest..."
	uv run pytest tests/ -v

## Run the test suite with coverage (terminal summary + htmlcov/ report).
test-cov:
	@echo "Running tests with coverage..."
	uv run pytest tests/ --cov=evalview --cov-report=html --cov-report=term
# ============================================
# Pip-based commands (traditional)
# ============================================
# Mirrors of the uv targets above for environments without uv; the tools
# (black/ruff/mypy/pytest) must already be on PATH, e.g. via `make pip-dev`.

## Editable install of the package with pip.
pip-install:
	pip install -e .

## Editable install including the dev extras.
pip-dev:
	pip install -e ".[dev]"

## Reformat evalview/ and tests/ in place with black (100-column lines).
pip-format:
	@echo "Formatting code with black..."
	black evalview/ tests/ --line-length 100

## Lint evalview/ and tests/ with ruff (check only, no autofix).
pip-lint:
	@echo "Linting code with ruff..."
	ruff check evalview/ tests/

## Type-check the package with mypy in --strict mode.
pip-typecheck:
	@echo "Type checking with mypy..."
	mypy evalview/ --strict

## Run the full quality gate (format rewrites files before the checks run).
pip-check: pip-format pip-lint pip-typecheck
	@echo "✅ All checks passed!"

## Run the test suite verbosely with pytest.
pip-test:
	@echo "Running tests with pytest..."
	pytest tests/ -v

## Run the test suite with coverage (terminal summary + htmlcov/ report).
pip-test-cov:
	@echo "Running tests with coverage..."
	pytest tests/ --cov=evalview --cov-report=html --cov-report=term
# ============================================
# Shared utilities
# ============================================

## Remove build artifacts, tool caches, coverage output, and virtualenvs.
# NOTE(review): this also deletes .venv/ and venv/, so `make install` (or
# `make pip-install`) must be re-run after cleaning — confirm that is the
# intended workflow rather than an overly aggressive clean.
clean:
	@echo "Cleaning build artifacts..."
	rm -rf build/ dist/ *.egg-info/
	rm -rf .pytest_cache/ .mypy_cache/ .ruff_cache/
	rm -rf htmlcov/ .coverage
	rm -rf .venv/ venv/
	# `|| true` keeps the recipe alive when find hits dirs already removed.
	find . -type d -name __pycache__ -exec rm -rf {} + 2>/dev/null || true
	find . -type f -name "*.pyc" -delete
	@echo "✅ Cleaned!"
## Run the bundled example test case, bootstrapping the project first:
## initializes .evalview/ if absent, then runs example.yaml if present.
run-example:
	@echo "Running example test case..."
	@if [ ! -d ".evalview" ]; then \
		echo "Initializing EvalView..."; \
		uv run evalview init --dir .; \
	fi
	@if [ -f "tests/test-cases/example.yaml" ]; then \
		uv run evalview run --pattern "example.yaml" --verbose; \
	else \
		echo "❌ No example.yaml found. Run 'evalview init' first."; \
	fi
## Run every YAML test case under tests/test-cases/ through EvalView.
agent-tests:
	@echo "Running EvalView agent tests..."
	uv run evalview run --pattern "tests/test-cases/*.yaml" --verbose
## One-shot developer bootstrap: installs all extras (via dev-install)
## and prints the suggested next steps.
dev: dev-install
	@echo "✅ Development environment ready!"
	@echo ""
	@echo "Next steps:"
	@echo "  1. Run 'uv run evalview init' to create a test project"
	@echo "  2. Make your changes"
	@echo "  3. Run 'make check' to verify code quality"
	@echo "  4. Run 'make test' to run tests"
## Fast feedback loop: stop at the first failure (-x), short tracebacks,
## no coverage instrumentation.
quick-test:
	@echo "Running quick test (no coverage)..."
	uv run pytest tests/ -x --tb=short
# ============================================
# Gym - Agent Resilience Training
# ============================================

## Run every gym scenario.
gym:
	@echo "━━━ EvalView Gym ━━━"
	@echo "Running all gym scenarios..."
	uv run evalview gym

## List the available gym scenarios without running them.
gym-list:
	@echo "━━━ Gym Scenarios ━━━"
	uv run evalview gym --list-only

## Run only the failure-mode scenario suite.
gym-failures:
	@echo "━━━ Running Failure Mode Scenarios ━━━"
	uv run evalview gym --suite failure-modes

## Run only the security scenario suite.
gym-security:
	@echo "━━━ Running Security Scenarios ━━━"
	uv run evalview gym --suite security

## Start the gym's support-bot agent locally via langgraph dev.
# The cd and the command share one line because each recipe line runs in
# its own shell — a bare `cd` on its own line would have no effect.
gym-agent:
	@echo "━━━ Starting Gym Agent ━━━"
	@echo "Agent will run at http://localhost:2024"
	@echo ""
	cd gym/agents/support-bot && uv run langgraph dev
# ── Dogfood regression tests ──────────────────────────────────────────────────
# EvalView testing itself using the deterministic mock agent (port 8002).
# Tests the 3 correct-behavior scenarios to catch EvalView evaluation regressions.

## Start the deterministic mock agent (keep running in a separate terminal)
# Blocks the terminal; the dogfood-* targets below talk to this process.
dogfood-agent:
	uv run python dogfood/mock_agent.py

## Check for regressions in EvalView's evaluation logic (requires dogfood-agent)
# --fail-on REGRESSION makes the command exit non-zero on a detected
# regression, so this target is CI-friendly.
dogfood-check:
	uv run evalview check dogfood/mock-agent-tests/ --fail-on REGRESSION

## Save current evaluation results as the new golden baseline (requires dogfood-agent)
# Run this deliberately — it overwrites the baseline dogfood-check compares against.
dogfood-snapshot:
	uv run evalview snapshot dogfood/mock-agent-tests/

## Run the full dogfood suite including failure-detection tests (requires dogfood-agent)
dogfood-run:
	uv run evalview run dogfood/mock-agent-tests/