forked from hidai25/eval-view
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathMakefile
More file actions
195 lines (159 loc) · 6.03 KB
/
Makefile
File metadata and controls
195 lines (159 loc) · 6.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
# Declare every command target as phony so a file or directory with the
# same name (e.g. ./test, ./clean, ./dev) can never shadow the target and
# silently skip its recipe. Keep this list in sync with the rules below.
.PHONY: help install dev-install pip-install pip-dev \
	format lint typecheck check test test-cov \
	pip-format pip-lint pip-typecheck pip-check pip-test pip-test-cov \
	clean run-example agent-tests dev quick-test \
	gym gym-list gym-failures gym-security gym-agent \
	dogfood-agent dogfood-check dogfood-snapshot dogfood-run
# Default target.
# Prints an overview of every user-facing command. Keep this listing in
# sync with the targets below — when a target is added, document it here.
help:
	@echo "EvalView Development Commands"
	@echo ""
	@echo "Setup (choose pip OR uv):"
	@echo "  make install         Install package (uv sync)"
	@echo "  make dev-install     Install with all extras (uv sync --all-extras)"
	@echo "  make pip-install     Install package (pip install -e .)"
	@echo "  make pip-dev         Install with dev extras (pip install -e '.[dev]')"
	@echo ""
	@echo "Development (uv - faster):"
	@echo "  make format          Format code with black"
	@echo "  make lint            Lint code with ruff"
	@echo "  make typecheck       Type check with mypy"
	@echo "  make check           Run all checks (format + lint + typecheck)"
	@echo "  make test            Run tests with pytest"
	@echo "  make test-cov        Run tests with coverage report"
	@echo "  make quick-test      Run tests, stopping at the first failure"
	@echo ""
	@echo "Development (pip - traditional):"
	@echo "  make pip-format      Format code with black (pip)"
	@echo "  make pip-lint        Lint code with ruff (pip)"
	@echo "  make pip-typecheck   Type check with mypy (pip)"
	@echo "  make pip-check       Run all checks (pip)"
	@echo "  make pip-test        Run tests with pytest (pip)"
	@echo ""
	@echo "Utilities:"
	@echo "  make clean           Clean build artifacts and cache"
	@echo "  make run-example     Run example test case"
	@echo "  make agent-tests     Run EvalView agent tests (no CI required)"
	@echo ""
	@echo "Gym (Practice Evals):"
	@echo "  make gym             Run all gym scenarios"
	@echo "  make gym-list        List available scenarios"
	@echo "  make gym-failures    Run failure-mode scenarios only"
	@echo "  make gym-security    Run security scenarios only"
	@echo "  make gym-agent       Start the gym agent (localhost:2024)"
	@echo ""
	@echo "Dogfood (EvalView testing itself):"
	@echo "  make dogfood-agent   Start the deterministic mock agent (port 8002)"
	@echo "  make dogfood-check   Check for evaluation regressions"
	@echo "  make dogfood-snapshot Save results as the new golden baseline"
	@echo "  make dogfood-run     Run the full dogfood suite"
	@echo ""
# ============================================
# UV-based commands (faster, recommended)
# ============================================

## Install the package and its locked dependencies with uv.
install:
	uv sync

## Install the package plus every optional extra (dev tooling, etc.).
dev-install:
	uv sync --all-extras

## Reformat evalview/ and tests/ in place with black (100-column lines).
format:
	@echo "Formatting code with black..."
	uv run black evalview/ tests/ --line-length 100

## Lint evalview/ and tests/ with ruff (check only, no autofix).
lint:
	@echo "Linting code with ruff..."
	uv run ruff check evalview/ tests/

## Type-check the package with mypy in --strict mode.
typecheck:
	@echo "Type checking with mypy..."
	uv run mypy evalview/ --strict

## Run the full quality gate. Note: `format` rewrites files before
## `lint` and `typecheck` run, so checks see the formatted code.
check: format lint typecheck
	@echo "✅ All checks passed!"

## Run the test suite verbosely with pytest.
test:
	@echo "Running tests with pytest..."
	uv run pytest tests/ -v

## Run the test suite with coverage (terminal summary + htmlcov/ report).
test-cov:
	@echo "Running tests with coverage..."
	uv run pytest tests/ --cov=evalview --cov-report=html --cov-report=term
# ============================================
# Pip-based commands (traditional)
# ============================================
# Mirrors of the uv targets above for environments without uv; the tools
# (black/ruff/mypy/pytest) must already be on PATH, e.g. via `make pip-dev`.

## Editable install of the package with pip.
pip-install:
	pip install -e .

## Editable install including the dev extras.
pip-dev:
	pip install -e ".[dev]"

## Reformat evalview/ and tests/ in place with black (100-column lines).
pip-format:
	@echo "Formatting code with black..."
	black evalview/ tests/ --line-length 100

## Lint evalview/ and tests/ with ruff (check only, no autofix).
pip-lint:
	@echo "Linting code with ruff..."
	ruff check evalview/ tests/

## Type-check the package with mypy in --strict mode.
pip-typecheck:
	@echo "Type checking with mypy..."
	mypy evalview/ --strict

## Run the full quality gate (format rewrites files before the checks run).
pip-check: pip-format pip-lint pip-typecheck
	@echo "✅ All checks passed!"

## Run the test suite verbosely with pytest.
pip-test:
	@echo "Running tests with pytest..."
	pytest tests/ -v

## Run the test suite with coverage (terminal summary + htmlcov/ report).
pip-test-cov:
	@echo "Running tests with coverage..."
	pytest tests/ --cov=evalview --cov-report=html --cov-report=term
# ============================================
# Shared utilities
# ============================================

## Remove build artifacts, tool caches, coverage output, and virtualenvs.
# NOTE(review): this also deletes .venv/ and venv/, so `make install` (or
# `make pip-install`) must be re-run after cleaning — confirm that is the
# intended workflow rather than an overly aggressive clean.
clean:
	@echo "Cleaning build artifacts..."
	rm -rf build/ dist/ *.egg-info/
	rm -rf .pytest_cache/ .mypy_cache/ .ruff_cache/
	rm -rf htmlcov/ .coverage
	rm -rf .venv/ venv/
	# `|| true` keeps the recipe alive when find hits dirs already removed.
	find . -type d -name __pycache__ -exec rm -rf {} + 2>/dev/null || true
	find . -type f -name "*.pyc" -delete
	@echo "✅ Cleaned!"
## Run the bundled example test case, bootstrapping the project first:
## initializes .evalview/ if absent, then runs example.yaml if present.
run-example:
	@echo "Running example test case..."
	@if [ ! -d ".evalview" ]; then \
		echo "Initializing EvalView..."; \
		uv run evalview init --dir .; \
	fi
	@if [ -f "tests/test-cases/example.yaml" ]; then \
		uv run evalview run --pattern "example.yaml" --verbose; \
	else \
		echo "❌ No example.yaml found. Run 'evalview init' first."; \
	fi
## Run every YAML test case under tests/test-cases/ through EvalView.
agent-tests:
	@echo "Running EvalView agent tests..."
	uv run evalview run --pattern "tests/test-cases/*.yaml" --verbose
## One-shot developer bootstrap: installs all extras (via dev-install)
## and prints the suggested next steps.
dev: dev-install
	@echo "✅ Development environment ready!"
	@echo ""
	@echo "Next steps:"
	@echo "  1. Run 'uv run evalview init' to create a test project"
	@echo "  2. Make your changes"
	@echo "  3. Run 'make check' to verify code quality"
	@echo "  4. Run 'make test' to run tests"
## Fast feedback loop: stop at the first failure (-x), short tracebacks,
## no coverage instrumentation.
quick-test:
	@echo "Running quick test (no coverage)..."
	uv run pytest tests/ -x --tb=short
# ============================================
# Gym - Agent Resilience Training
# ============================================

## Run every gym scenario.
gym:
	@echo "━━━ EvalView Gym ━━━"
	@echo "Running all gym scenarios..."
	uv run evalview gym

## List the available gym scenarios without running them.
gym-list:
	@echo "━━━ Gym Scenarios ━━━"
	uv run evalview gym --list-only

## Run only the failure-mode scenario suite.
gym-failures:
	@echo "━━━ Running Failure Mode Scenarios ━━━"
	uv run evalview gym --suite failure-modes

## Run only the security scenario suite.
gym-security:
	@echo "━━━ Running Security Scenarios ━━━"
	uv run evalview gym --suite security

## Start the gym's support-bot agent locally via langgraph dev.
# The cd and the command share one line because each recipe line runs in
# its own shell — a bare `cd` on its own line would have no effect.
gym-agent:
	@echo "━━━ Starting Gym Agent ━━━"
	@echo "Agent will run at http://localhost:2024"
	@echo ""
	cd gym/agents/support-bot && uv run langgraph dev
# ── Dogfood regression tests ──────────────────────────────────────────────────
# EvalView testing itself using the deterministic mock agent (port 8002).
# Tests the 3 correct-behavior scenarios to catch EvalView evaluation regressions.

## Start the deterministic mock agent (keep running in a separate terminal)
# Blocks the terminal; the dogfood-* targets below talk to this process.
dogfood-agent:
	uv run python dogfood/mock_agent.py

## Check for regressions in EvalView's evaluation logic (requires dogfood-agent)
# --fail-on REGRESSION makes the command exit non-zero on a detected
# regression, so this target is CI-friendly.
dogfood-check:
	uv run evalview check dogfood/mock-agent-tests/ --fail-on REGRESSION

## Save current evaluation results as the new golden baseline (requires dogfood-agent)
# Run this deliberately — it overwrites the baseline dogfood-check compares against.
dogfood-snapshot:
	uv run evalview snapshot dogfood/mock-agent-tests/

## Run the full dogfood suite including failure-detection tests (requires dogfood-agent)
dogfood-run:
	uv run evalview run dogfood/mock-agent-tests/