diff --git a/packages/ai/package.json b/packages/ai/package.json
index 4c4fdb4c..089d86d6 100644
--- a/packages/ai/package.json
+++ b/packages/ai/package.json
@@ -82,8 +82,10 @@
     "c12": "^2.0.4",
     "commander": "^14.0.0",
     "defu": "^6.1.4",
+    "esbuild": "^0.25.8",
     "handlebars": "^4.7.8",
-    "nanoid": "^5.1.5"
+    "nanoid": "^5.1.5",
+    "vitest": "catalog:"
   },
   "peerDependencies": {
     "@opentelemetry/api": "^1.9.0",
@@ -105,13 +107,11 @@
     "@vitest/coverage-v8": "^3.2.4",
     "aiv4": "npm:ai@^4.3.19",
     "aiv5": "npm:ai@^5.0.0",
-    "esbuild": "^0.25.8",
     "eslint": "catalog:",
     "prettier": "catalog:",
     "tinyrainbow": "^2.0.0",
     "tsup": "catalog:",
     "typescript": "catalog:",
-    "vitest": "catalog:",
     "zod": "catalog:"
   },
   "files": [
diff --git a/packages/ai/test/evals/eval-integration.test.ts b/packages/ai/test/evals/eval-integration.test.ts
new file mode 100644
index 00000000..731d4474
--- /dev/null
+++ b/packages/ai/test/evals/eval-integration.test.ts
@@ -0,0 +1,234 @@
+// @vitest-environment node
+// @vitest-pool forks
+
+/**
+ * Integration test for Eval() that captures network calls and spans.
+ *
+ * The instrumentation, context-storage, global-flags and vitest `inject()`
+ * modules are mocked, three evals are registered, and `afterAll` checks the
+ * recorded fetch calls and the spans collected by the in-memory exporter.
+ */
+
+import { afterAll, vi } from 'vitest';
+import { InMemorySpanExporter, SimpleSpanProcessor } from '@opentelemetry/sdk-trace-base';
+import { NodeTracerProvider } from '@opentelemetry/sdk-trace-node';
+import type { ResolvedAxiomConfig } from '../../src/config/index';
+import type { ReadableSpan } from '@opentelemetry/sdk-trace-base';
+
+// Every call made through the mocked global fetch, in call order.
+const fetchCalls: Array<{ url: string; options: RequestInit | undefined }> = [];
+// console.log output is redirected here while the evals run.
+const consoleOutput: string[] = [];
+const spanExporter = new InMemorySpanExporter();
+
+const tracerProvider = new NodeTracerProvider({
+  spanProcessors: [new SimpleSpanProcessor(spanExporter)],
+});
+
+const mockConfig: ResolvedAxiomConfig = {
+  eval: {
+    url: 'https://test.axiom.co',
+    token: 'test-token',
+    dataset: 'test-dataset',
+    instrumentation: null,
+    include: [],
+    exclude: [],
+    timeoutMs: 60_000,
+  },
+} as ResolvedAxiomConfig;
+
+// Replace the global fetch: record each call and answer with canned JSON.
+global.fetch = vi.fn(async (input: string | URL | Request, options?: RequestInit) => {
+  // fetch also accepts URL and Request objects; normalise to a string so the
+  // `_apl` check below cannot throw on a non-string input.
+  const url = input instanceof Request ? input.url : String(input);
+  fetchCalls.push({ url, options });
+
+  // Baseline queries (URLs containing `_apl`) get an empty match list.
+  const body = url.includes('_apl') ? { matches: [] } : { success: true };
+
+  return new Response(JSON.stringify(body), {
+    status: 200,
+    headers: { 'Content-Type': 'application/json' },
+  });
+}) as any;
+
+const originalLog = console.log;
+console.log = (...args: any[]) => {
+  consoleOutput.push(args.map(String).join(' '));
+};
+
+// Mock instrumentation so spans go to the in-memory exporter above.
+vi.doMock('../../src/evals/instrument', () => {
+  const tracer = tracerProvider.getTracer('axiom-eval-test');
+
+  return {
+    ensureInstrumentationInitialized: vi.fn(async () => {}),
+    initInstrumentation: vi.fn(async () => {}),
+    flush: vi.fn(async () => {
+      await tracerProvider.forceFlush();
+    }),
+    startSpan: vi.fn((name: string, opts: any, ctx?: any) => {
+      return tracer.startSpan(name, opts, ctx);
+    }),
+    startActiveSpan: vi.fn(async (name: string, opts: any, fn: any, ctx?: any) => {
+      const span = tracer.startSpan(name, opts, ctx);
+      try {
+        return await fn(span);
+      } catch (error) {
+        span.recordException(error as Error);
+        throw error;
+      } finally {
+        // End the span exactly once on both the success and the failure path.
+        span.end();
+      }
+    }),
+  };
+});
+
+// Mock context storage
+vi.doMock('../../src/evals/context/storage', () => ({
+  getAxiomConfig: vi.fn(() => mockConfig),
+  setAxiomConfig: vi.fn(),
+  getConfigScope: vi.fn(() => ({
+    getAllDefaultFlags: () => ({}),
+  })),
+  withEvalContext: vi.fn(async (_opts: any, fn: any) => fn()),
+  getEvalContext: vi.fn(() => ({
+    flags: {},
+    outOfScopeFlags: [],
+  })),
+}));
+
+// Mock global flags
+vi.doMock('../../src/evals/context/global-flags', () => ({
+  getGlobalFlagOverrides: vi.fn(() => ({})),
+  setGlobalFlagOverrides: vi.fn(),
+}));
+
+// Mock vitest inject() to provide context
+vi.doMock('vitest', async () => {
+  const actual = await vi.importActual('vitest');
+  return {
+    ...actual,
+    inject: vi.fn((key: string) => {
+      const context: Record<string, unknown> = {
+        baseline: undefined,
+        debug: false,
+        list: false,
+        overrides: {},
+        axiomConfig: mockConfig,
+        runId: 'test-run-123',
+      };
+      return context[key];
+    }),
+  };
+});
+
+const { Eval } = await import('../../src/evals/eval');
+const { createScorer: Scorer } = await import('../../src/evals/scorers');
+
+const testScorer = Scorer('test-scorer', async ({ output }: { output: any }) => {
+  return {
+    score: typeof output === 'string' && output.includes('output') ? 1.0 : 0.0,
+  };
+});
+
+const scorer1 = Scorer('scorer-1', async ({ output: _output }: { output: any }) => ({
+  score: 1.0,
+}));
+
+const scorer2 = Scorer('scorer-2', async ({ output: _output }: { output: any }) => ({
+  score: 0.8,
+}));
+
+Eval('Integration-Test-Eval', {
+  data: async () => [
+    { input: 'test input 1', expected: 'expected output 1' },
+    { input: 'test input 2', expected: 'expected output 2' },
+  ],
+  task: async ({ input }) => {
+    return `output for ${input}`;
+  },
+  scorers: [testScorer as any],
+});
+
+Eval('Second-Eval', {
+  data: async () => [{ input: 'input A', expected: 'expected A' }],
+  task: async ({ input }) => `output for ${input}`,
+  scorers: [scorer1 as any],
+});
+
+Eval('Third-Eval', {
+  data: async () => [
+    { input: 'input X', expected: 'expected X' },
+    { input: 'input Y', expected: 'expected Y' },
+  ],
+  task: async ({ input }) => `result for ${input}`,
+  scorers: [scorer2 as any],
+});
+
+afterAll(async () => {
+  console.log = originalLog;
+
+  try {
+    const spans: ReadableSpan[] = spanExporter.getFinishedSpans();
+
+    const baselineCall = fetchCalls.find((c) => c.url.includes('_apl'));
+    const createCall = fetchCalls.find(
+      (c) => c.url.includes('/api/evaluations/v3') && c.options?.method === 'POST',
+    );
+    const updateCall = fetchCalls.find(
+      (c) => c.url.includes('/api/evaluations/v3') && c.options?.method === 'PATCH',
+    );
+
+    if (!baselineCall) throw new Error('Expected baseline query call');
+    if (!createCall) throw new Error('Expected create evaluation call');
+    if (!updateCall) throw new Error('Expected update evaluation call');
+
+    // Assert span structure for Integration-Test-Eval (comprehensive)
+    const evalSpan = spans.find((s) => s.name.includes('eval Integration-Test-Eval'));
+    const allCaseSpans = spans.filter((s) => s.name.startsWith('case'));
+    const integrationCaseSpans = allCaseSpans.slice(0, 2); // First 2 cases are from Integration-Test-Eval
+    const taskSpans = spans.filter((s) => s.name === 'task');
+    const scorerSpans = spans.filter((s) => s.name.includes('score'));
+
+    if (!evalSpan) throw new Error('Expected eval span');
+    if (integrationCaseSpans.length !== 2) {
+      throw new Error(
+        `Expected 2 Integration-Test-Eval case spans, got ${integrationCaseSpans.length}`,
+      );
+    }
+    if (taskSpans.length < 2) {
+      throw new Error(`Expected at least 2 task spans, got ${taskSpans.length}`);
+    }
+    if (scorerSpans.length < 2) {
+      throw new Error(`Expected at least 2 scorer spans, got ${scorerSpans.length}`);
+    }
+
+    // Assert span attributes
+    const firstCaseSpan = integrationCaseSpans[0];
+    const attrs = firstCaseSpan.attributes;
+
+    if (!attrs['eval.case.input']) throw new Error('Expected eval.case.input attribute');
+    if (!attrs['eval.case.output']) throw new Error('Expected eval.case.output attribute');
+    if (!attrs['eval.case.scores']) throw new Error('Expected eval.case.scores attribute');
+
+    // Assert multiple evals ran (light validation)
+    const secondEvalSpan = spans.find((s) => s.name.includes('Second-Eval'));
+    const thirdEvalSpan = spans.find((s) => s.name.includes('Third-Eval'));
+
+    if (!secondEvalSpan) throw new Error('Expected Second-Eval span');
+    if (!thirdEvalSpan) throw new Error('Expected Third-Eval span');
+
+    // Total case count should be 2 + 1 + 2 = 5
+    if (allCaseSpans.length !== 5) {
+      throw new Error(`Expected 5 total case spans, got ${allCaseSpans.length}`);
+    }
+  } finally {
+    // Cleanup runs even when an assertion above throws, so the provider and
+    // exporter are always shut down.
+    await tracerProvider.shutdown();
+    await spanExporter.shutdown();
+  }
+});
diff --git a/packages/ai/vitest.config.ts b/packages/ai/vitest.config.ts
index 887054f4..8f3512af 100644
--- a/packages/ai/vitest.config.ts
+++ b/packages/ai/vitest.config.ts
@@ -5,6 +5,7 @@ export default defineConfig({
   test: {
     environment: 'node',
     include: ['test/**/*.test.ts'],
+    exclude: ['test/evals/**/*.integration.test.ts', '**/node_modules/**'],
     globals: true,
     pool: 'forks',
     // TODO: ensure that this allows parallel tests