From ce596606b933506d6fe7f016a6d1e69c37be40d7 Mon Sep 17 00:00:00 2001 From: Christopher Ehrlich Date: Fri, 14 Nov 2025 14:40:00 +0700 Subject: [PATCH 1/2] initial testing idea --- packages/ai/package.json | 6 +- .../ai/test/evals/eval-integration.test.ts | 205 ++++++++++++++++++ packages/ai/test/evals/eval-multi.test.ts | 163 ++++++++++++++ packages/ai/vitest.config.ts | 1 + 4 files changed, 372 insertions(+), 3 deletions(-) create mode 100644 packages/ai/test/evals/eval-integration.test.ts create mode 100644 packages/ai/test/evals/eval-multi.test.ts diff --git a/packages/ai/package.json b/packages/ai/package.json index 4c4fdb4c..089d86d6 100644 --- a/packages/ai/package.json +++ b/packages/ai/package.json @@ -82,8 +82,10 @@ "c12": "^2.0.4", "commander": "^14.0.0", "defu": "^6.1.4", + "esbuild": "^0.25.8", "handlebars": "^4.7.8", - "nanoid": "^5.1.5" + "nanoid": "^5.1.5", + "vitest": "catalog:" }, "peerDependencies": { "@opentelemetry/api": "^1.9.0", @@ -105,13 +107,11 @@ "@vitest/coverage-v8": "^3.2.4", "aiv4": "npm:ai@^4.3.19", "aiv5": "npm:ai@^5.0.0", - "esbuild": "^0.25.8", "eslint": "catalog:", "prettier": "catalog:", "tinyrainbow": "^2.0.0", "tsup": "catalog:", "typescript": "catalog:", - "vitest": "catalog:", "zod": "catalog:" }, "files": [ diff --git a/packages/ai/test/evals/eval-integration.test.ts b/packages/ai/test/evals/eval-integration.test.ts new file mode 100644 index 00000000..17a94a10 --- /dev/null +++ b/packages/ai/test/evals/eval-integration.test.ts @@ -0,0 +1,205 @@ +// @vitest-environment node +// @vitest-pool forks + +/** + * Integration test for Eval() that captures network calls, spans, and console output. + * + * IMPORTANT: Eval() must be called at the MODULE TOP-LEVEL, not inside it() blocks, + * because it calls describe() to create vitest suites dynamically. 
+ */
+
+import { afterAll, vi } from 'vitest';
+import { InMemorySpanExporter, SimpleSpanProcessor } from '@opentelemetry/sdk-trace-base';
+import { NodeTracerProvider } from '@opentelemetry/sdk-trace-node';
+import type { ResolvedAxiomConfig } from '../../src/config/index';
+import type { ReadableSpan } from '@opentelemetry/sdk-trace-base';
+
+// ===== SETUP: Capture side effects =====
+
+const fetchCalls: Array<{ url: string; options: any }> = [];
+const consoleOutput: string[] = [];
+const spanExporter = new InMemorySpanExporter();
+
+// Setup OTel tracer provider with in-memory exporter
+const tracerProvider = new NodeTracerProvider({
+  spanProcessors: [new SimpleSpanProcessor(spanExporter)],
+});
+
+const mockConfig: ResolvedAxiomConfig = {
+  eval: {
+    url: 'https://test.axiom.co',
+    token: 'test-token',
+    dataset: 'test-dataset',
+    instrumentation: null,
+    include: [],
+    exclude: [],
+    timeoutMs: 60_000,
+  },
+} as ResolvedAxiomConfig;
+
+// Mock fetch to capture network calls
+global.fetch = vi.fn(async (url: string, options?: any) => {
+  fetchCalls.push({ url: String(url), options });
+
+  // Return empty baseline response for APL queries
+  if (url.includes('_apl')) {
+    return new Response(JSON.stringify({ matches: [] }), {
+      status: 200,
+      headers: { 'Content-Type': 'application/json' },
+    });
+  }
+
+  // Return success for evaluation API calls
+  return new Response(JSON.stringify({ success: true }), {
+    status: 200,
+    headers: { 'Content-Type': 'application/json' },
+  });
+}) as any;
+
+// Mock console.log to capture output
+const originalLog = console.log;
+console.log = (...args: any[]) => {
+  consoleOutput.push(args.map(String).join(' '));
+};
+
+// Mock the instrumentation module to use our test provider
+vi.doMock('../../src/evals/instrument', async () => {
+  const { trace: _trace } = await import('@opentelemetry/api');
+  const tracer = tracerProvider.getTracer('axiom-eval-test');
+
+  return {
+    ensureInstrumentationInitialized: vi.fn(async () => {}),
+    initInstrumentation: vi.fn(async () => {}),
+    flush: vi.fn(async () => {
+      await tracerProvider.forceFlush();
+    }),
+    startSpan: vi.fn((name: string, opts: any, ctx?: any) => {
+      return tracer.startSpan(name, opts, ctx);
+    }),
+    startActiveSpan: vi.fn(async (name: string, opts: any, fn: any, ctx?: any) => {
+      const span = tracer.startSpan(name, opts, ctx);
+      try {
+        const result = await fn(span);
+        span.end();
+        return result;
+      } catch (error) {
+        span.recordException(error as Error);
+        span.end();
+        throw error;
+      }
+    }),
+  };
+});
+
+// Mock context storage
+vi.doMock('../../src/evals/context/storage', () => ({
+  getAxiomConfig: vi.fn(() => mockConfig),
+  setAxiomConfig: vi.fn(),
+  getConfigScope: vi.fn(() => ({
+    getAllDefaultFlags: () => ({}),
+  })),
+  withEvalContext: vi.fn(async (_opts: any, fn: any) => {
+    const result = await fn();
+    return result;
+  }),
+  getEvalContext: vi.fn(() => ({
+    flags: {},
+    outOfScopeFlags: [],
+  })),
+}));
+
+// Mock global flags
+vi.doMock('../../src/evals/context/global-flags', () => ({
+  getGlobalFlagOverrides: vi.fn(() => ({})),
+  setGlobalFlagOverrides: vi.fn(),
+}));
+
+// Mock vitest inject() to provide context
+vi.doMock('vitest', async () => {
+  const actual = await vi.importActual('vitest');
+  return {
+    ...actual,
+    inject: vi.fn((key: string) => {
+      const context: Record<string, any> = {
+        baseline: undefined,
+        debug: false,
+        list: false,
+        overrides: {},
+        axiomConfig: mockConfig,
+        runId: 'test-run-123',
+      };
+      return context[key];
+    }),
+  };
+});
+
+// ===== CALL Eval() AT TOP LEVEL 
===== + +const { Eval } = await import('../../src/evals/eval'); + +// Create scorer function with name property +const testScorer = async ({ output }: { output: any }) => { + return { + score: typeof output === 'string' && output.includes('output') ? 1.0 : 0.0, + }; +}; +Object.defineProperty(testScorer, 'name', { value: 'test-scorer' }); + +Eval('Integration-Test-Eval', { + data: async () => [ + { input: 'test input 1', expected: 'expected output 1' }, + { input: 'test input 2', expected: 'expected output 2' }, + ], + task: async ({ input }) => { + return `output for ${input}`; + }, + scorers: [testScorer as any], +}); + +// ===== ASSERTIONS: Run after all tests complete ===== + +afterAll(async () => { + // Restore console + console.log = originalLog; + + // Get all captured data + const spans: ReadableSpan[] = spanExporter.getFinishedSpans(); + + // Assert network calls + const baselineCall = fetchCalls.find((c) => c.url.includes('_apl')); + const createCall = fetchCalls.find( + (c) => c.url.includes('/api/evaluations/v3') && c.options?.method === 'POST', + ); + const updateCall = fetchCalls.find( + (c) => c.url.includes('/api/evaluations/v3') && c.options?.method === 'PATCH', + ); + + if (!baselineCall) throw new Error('Expected baseline query call'); + if (!createCall) throw new Error('Expected create evaluation call'); + if (!updateCall) throw new Error('Expected update evaluation call'); + + // Assert span structure + const evalSpan = spans.find((s) => s.name.includes('eval Integration-Test-Eval')); + const caseSpans = spans.filter((s) => s.name.startsWith('case')); + const taskSpans = spans.filter((s) => s.name === 'task'); + const scorerSpans = spans.filter((s) => s.name.includes('score')); + + if (!evalSpan) throw new Error('Expected eval span'); + if (caseSpans.length !== 2) throw new Error(`Expected 2 case spans, got ${caseSpans.length}`); + if (taskSpans.length !== 2) throw new Error(`Expected 2 task spans, got ${taskSpans.length}`); + if (scorerSpans.length !== 2) { + throw new Error(`Expected 2 scorer spans, got ${scorerSpans.length}`); + } + + // Assert span attributes + const firstCaseSpan = caseSpans[0]; + const attrs = firstCaseSpan.attributes; + + if (!attrs['eval.case.input']) throw new Error('Expected eval.case.input attribute'); + if (!attrs['eval.case.output']) throw new Error('Expected eval.case.output attribute'); + if (!attrs['eval.case.scores']) throw new Error('Expected eval.case.scores attribute'); + + // Cleanup + await tracerProvider.shutdown(); + await spanExporter.shutdown(); +}); diff --git a/packages/ai/test/evals/eval-multi.test.ts b/packages/ai/test/evals/eval-multi.test.ts new file mode 100644 index 00000000..dc896e4b --- /dev/null +++ b/packages/ai/test/evals/eval-multi.test.ts @@ -0,0 +1,163 @@ +// @vitest-environment node +// @vitest-pool forks + +/** + * Test showing multiple Eval() calls in one file + */ + +import { afterAll, vi } from 'vitest'; +import { InMemorySpanExporter, SimpleSpanProcessor } from '@opentelemetry/sdk-trace-base'; +import { NodeTracerProvider } from '@opentelemetry/sdk-trace-node'; +import type { ResolvedAxiomConfig } from '../../src/config/index'; + +// Setup +const fetchCalls: Array<{ url: string; evalName?: string }> = []; +const spanExporter = new InMemorySpanExporter(); +const tracerProvider = new NodeTracerProvider({ + spanProcessors: [new SimpleSpanProcessor(spanExporter)], +}); + +const mockConfig: ResolvedAxiomConfig = { + eval: { + url: 'https://test.axiom.co', + token: 'test-token', + dataset: 'test-dataset', + 
instrumentation: null,
+    include: [],
+    exclude: [],
+    timeoutMs: 60_000,
+  },
+} as ResolvedAxiomConfig;
+
+global.fetch = vi.fn(async (url: string, _options?: any) => {
+  fetchCalls.push({ url: String(url) });
+  if (url.includes('_apl')) {
+    return new Response(JSON.stringify({ matches: [] }), {
+      status: 200,
+      headers: { 'Content-Type': 'application/json' },
+    });
+  }
+  return new Response(JSON.stringify({ success: true }), {
+    status: 200,
+    headers: { 'Content-Type': 'application/json' },
+  });
+}) as any;
+
+const originalLog = console.log;
+console.log = (..._args: any[]) => {
+  // Suppress output
+};
+
+// Mock instrumentation
+vi.doMock('../../src/evals/instrument', async () => {
+  const { trace: _trace } = await import('@opentelemetry/api');
+  const tracer = tracerProvider.getTracer('axiom-eval-test');
+  return {
+    ensureInstrumentationInitialized: vi.fn(async () => {}),
+    initInstrumentation: vi.fn(async () => {}),
+    flush: vi.fn(async () => {
+      await tracerProvider.forceFlush();
+    }),
+    startSpan: vi.fn((name: string, opts: any, ctx?: any) => tracer.startSpan(name, opts, ctx)),
+    startActiveSpan: vi.fn(async (name: string, opts: any, fn: any, ctx?: any) => {
+      const span = tracer.startSpan(name, opts, ctx);
+      try {
+        const result = await fn(span);
+        span.end();
+        return result;
+      } catch (error) {
+        span.recordException(error as Error);
+        span.end();
+        throw error;
+      }
+    }),
+  };
+});
+
+vi.doMock('../../src/evals/context/storage', () => ({
+  getAxiomConfig: vi.fn(() => mockConfig),
+  setAxiomConfig: vi.fn(),
+  getConfigScope: vi.fn(() => ({ getAllDefaultFlags: () => ({}) })),
+  withEvalContext: vi.fn(async (_opts: any, fn: any) => await fn()),
+  getEvalContext: vi.fn(() => ({ flags: {}, outOfScopeFlags: [] })),
+}));
+
+vi.doMock('../../src/evals/context/global-flags', () => ({
+  getGlobalFlagOverrides: vi.fn(() => ({})),
+  setGlobalFlagOverrides: vi.fn(),
+}));
+
+vi.doMock('vitest', async () => {
+  const actual = await vi.importActual('vitest');
+  return {
+    ...actual,
+    inject: vi.fn((key: string) => {
+      const context: Record<string, any> = {
+        baseline: undefined,
+        debug: false,
+        list: false,
+        overrides: {},
+        axiomConfig: mockConfig,
+        runId: 'test-run-123',
+      };
+      return context[key];
+    }),
+  };
+});
+
+const { Eval } = await import('../../src/evals/eval');
+
+// Create scorers
+const scorer1 = async ({ output: _output }: { output: any }) => ({ score: 1.0 });
+Object.defineProperty(scorer1, 'name', { value: 'scorer-1' });
+
+const scorer2 = async ({ output: _output }: { output: any }) => ({ score: 0.8 });
+Object.defineProperty(scorer2, 'name', { value: 'scorer-2' });
+
+// ===== FIRST EVAL =====
+Eval('First-Eval', {
+  data: async () => [{ input: 'input 1', expected: 'expected 1' }],
+  task: async ({ input }) => `output for ${input}`,
+  scorers: [scorer1 as any],
+});
+
+// ===== SECOND EVAL =====
+Eval('Second-Eval', {
+  data: async () => [
+    { input: 'input A', expected: 'expected A' },
+    { input: 'input B', expected: 'expected B' },
+  ],
+  task: async ({ input }) => `result for ${input}`,
+  scorers: [scorer2 as any],
+});
+
+afterAll(async () => {
+  console.log = originalLog;
+
+  const spans = spanExporter.getFinishedSpans();
+
+  console.log('\n=== MULTIPLE EVALS IN ONE FILE ===');
+  console.log(`Total fetch calls: ${fetchCalls.length}`);
+  console.log(`Total spans: ${spans.length}`);
+
+  const firstEvalSpan = spans.find((s) => s.name.includes('First-Eval'));
+  const secondEvalSpan = spans.find((s) => s.name.includes('Second-Eval'));
+
+  if (!firstEvalSpan) throw new 
Error('Expected First Eval span');
+  if (!secondEvalSpan) throw new Error('Expected Second Eval span');
+
+  console.log('✓ Both evals ran');
+  console.log(`✓ First Eval: ${firstEvalSpan.name}`);
+  console.log(`✓ Second Eval: ${secondEvalSpan.name}`);
+
+  // We should have 3 tests total (1 from First + 2 from Second)
+  const caseSpans = spans.filter((s) => s.name.startsWith('case'));
+  if (caseSpans.length !== 3) {
+    throw new Error(`Expected 3 case spans, got ${caseSpans.length}`);
+  }
+
+  console.log(`✓ All 3 test cases ran\n`);
+
+  await tracerProvider.shutdown();
+  await spanExporter.shutdown();
+});
diff --git a/packages/ai/vitest.config.ts b/packages/ai/vitest.config.ts
index 887054f4..8f3512af 100644
--- a/packages/ai/vitest.config.ts
+++ b/packages/ai/vitest.config.ts
@@ -5,6 +5,7 @@ export default defineConfig({
   test: {
     environment: 'node',
     include: ['test/**/*.test.ts'],
+    exclude: ['test/evals/**/*.integration.test.ts', '**/node_modules/**'],
     globals: true,
     pool: 'forks', // TODO: ensure that this allows parallel tests

From 049667092f735e3c3d7808ba84dd013d58bf3e52 Mon Sep 17 00:00:00 2001
From: Christopher Ehrlich
Date: Fri, 14 Nov 2025 15:00:07 +0700
Subject: [PATCH 2/2] better integration

---
 .../ai/test/evals/eval-integration.test.ts |  81 +++++----
 packages/ai/test/evals/eval-multi.test.ts  | 163 ------------------
 2 files changed, 52 insertions(+), 192 deletions(-)
 delete mode 100644 packages/ai/test/evals/eval-multi.test.ts

diff --git a/packages/ai/test/evals/eval-integration.test.ts b/packages/ai/test/evals/eval-integration.test.ts
index 17a94a10..731d4474 100644
--- a/packages/ai/test/evals/eval-integration.test.ts
+++ b/packages/ai/test/evals/eval-integration.test.ts
@@ -2,10 +2,7 @@
 // @vitest-pool forks
 
 /**
- * Integration test for Eval() that captures network calls, spans, and console output.
- *
- * IMPORTANT: Eval() must be called at the MODULE TOP-LEVEL, not inside it() blocks,
- * because it calls describe() to create vitest suites dynamically. 
+ * Integration test for Eval() that captures network calls and spans */ import { afterAll, vi } from 'vitest'; @@ -14,13 +11,10 @@ import { NodeTracerProvider } from '@opentelemetry/sdk-trace-node'; import type { ResolvedAxiomConfig } from '../../src/config/index'; import type { ReadableSpan } from '@opentelemetry/sdk-trace-base'; -// ===== SETUP: Capture side effects ===== - const fetchCalls: Array<{ url: string; options: any }> = []; const consoleOutput: string[] = []; const spanExporter = new InMemorySpanExporter(); -// Setup OTel tracer provider with in-memory exporter const tracerProvider = new NodeTracerProvider({ spanProcessors: [new SimpleSpanProcessor(spanExporter)], }); @@ -37,11 +31,9 @@ const mockConfig: ResolvedAxiomConfig = { }, } as ResolvedAxiomConfig; -// Mock fetch to capture network calls global.fetch = vi.fn(async (url: string, options?: any) => { fetchCalls.push({ url: String(url), options }); - // Return empty baseline response for APL queries if (url.includes('_apl')) { return new Response(JSON.stringify({ matches: [] }), { status: 200, @@ -49,20 +41,17 @@ global.fetch = vi.fn(async (url: string, options?: any) => { }); } - // Return success for evaluation API calls return new Response(JSON.stringify({ success: true }), { status: 200, headers: { 'Content-Type': 'application/json' }, }); }) as any; -// Mock console.log to capture output const originalLog = console.log; console.log = (...args: any[]) => { consoleOutput.push(args.map(String).join(' ')); }; -// Mock the instrumentation module to use our test provider vi.doMock('../../src/evals/instrument', async () => { const { trace: _trace } = await import('@opentelemetry/api'); const tracer = tracerProvider.getTracer('axiom-eval-test'); @@ -133,17 +122,22 @@ vi.doMock('vitest', async () => { }; }); -// ===== CALL Eval() AT TOP LEVEL ===== - const { Eval } = await import('../../src/evals/eval'); +const { createScorer: Scorer } = await import('../../src/evals/scorers'); -// Create scorer function with name property -const testScorer = async ({ output }: { output: any }) => { +const testScorer = Scorer('test-scorer', async ({ output }: { output: any }) => { return { score: typeof output === 'string' && output.includes('output') ? 
1.0 : 0.0, }; -}; -Object.defineProperty(testScorer, 'name', { value: 'test-scorer' }); +}); + +const scorer1 = Scorer('scorer-1', async ({ output: _output }: { output: any }) => ({ + score: 1.0, +})); + +const scorer2 = Scorer('scorer-2', async ({ output: _output }: { output: any }) => ({ + score: 0.8, +})); Eval('Integration-Test-Eval', { data: async () => [ @@ -156,16 +150,26 @@ Eval('Integration-Test-Eval', { scorers: [testScorer as any], }); -// ===== ASSERTIONS: Run after all tests complete ===== +Eval('Second-Eval', { + data: async () => [{ input: 'input A', expected: 'expected A' }], + task: async ({ input }) => `output for ${input}`, + scorers: [scorer1 as any], +}); + +Eval('Third-Eval', { + data: async () => [ + { input: 'input X', expected: 'expected X' }, + { input: 'input Y', expected: 'expected Y' }, + ], + task: async ({ input }) => `result for ${input}`, + scorers: [scorer2 as any], +}); afterAll(async () => { - // Restore console console.log = originalLog; - // Get all captured data const spans: ReadableSpan[] = spanExporter.getFinishedSpans(); - // Assert network calls const baselineCall = fetchCalls.find((c) => c.url.includes('_apl')); const createCall = fetchCalls.find( (c) => c.url.includes('/api/evaluations/v3') && c.options?.method === 'POST', @@ -178,27 +182,46 @@ afterAll(async () => { if (!createCall) throw new Error('Expected create evaluation call'); if (!updateCall) throw new Error('Expected update evaluation call'); - // Assert span structure + // Assert span structure for Integration-Test-Eval (comprehensive) const evalSpan = spans.find((s) => s.name.includes('eval Integration-Test-Eval')); - const caseSpans = spans.filter((s) => s.name.startsWith('case')); + const allCaseSpans = spans.filter((s) => s.name.startsWith('case')); + const integrationCaseSpans = allCaseSpans.slice(0, 2); // First 2 cases are from Integration-Test-Eval const taskSpans = spans.filter((s) => s.name === 'task'); const scorerSpans = spans.filter((s) => s.name.includes('score')); if (!evalSpan) throw new Error('Expected eval span'); - if (caseSpans.length !== 2) throw new Error(`Expected 2 case spans, got ${caseSpans.length}`); - if (taskSpans.length !== 2) throw new Error(`Expected 2 task spans, got ${taskSpans.length}`); - if (scorerSpans.length !== 2) { - throw new Error(`Expected 2 scorer spans, got ${scorerSpans.length}`); + if (integrationCaseSpans.length !== 2) { + throw new Error( + `Expected 2 Integration-Test-Eval case spans, got ${integrationCaseSpans.length}`, + ); + } + if (taskSpans.length < 2) { + throw new Error(`Expected at least 2 task spans, got ${taskSpans.length}`); + } + if (scorerSpans.length < 2) { + throw new Error(`Expected at least 2 scorer spans, got ${scorerSpans.length}`); } // Assert span attributes - const firstCaseSpan = caseSpans[0]; + const firstCaseSpan = integrationCaseSpans[0]; const attrs = firstCaseSpan.attributes; if (!attrs['eval.case.input']) throw new Error('Expected eval.case.input attribute'); if (!attrs['eval.case.output']) throw new Error('Expected eval.case.output attribute'); if (!attrs['eval.case.scores']) throw new Error('Expected eval.case.scores attribute'); + // Assert multiple evals ran (light validation) + const secondEvalSpan = spans.find((s) => s.name.includes('Second-Eval')); + const thirdEvalSpan = spans.find((s) => s.name.includes('Third-Eval')); + + if (!secondEvalSpan) throw new Error('Expected Second-Eval span'); + if (!thirdEvalSpan) throw new Error('Expected Third-Eval span'); + + // Total case count should be 2 + 1 + 2 
= 5
+  if (allCaseSpans.length !== 5) {
+    throw new Error(`Expected 5 total case spans, got ${allCaseSpans.length}`);
+  }
+
   // Cleanup
   await tracerProvider.shutdown();
   await spanExporter.shutdown();
diff --git a/packages/ai/test/evals/eval-multi.test.ts b/packages/ai/test/evals/eval-multi.test.ts
deleted file mode 100644
index dc896e4b..00000000
--- a/packages/ai/test/evals/eval-multi.test.ts
+++ /dev/null
@@ -1,163 +0,0 @@
-// @vitest-environment node
-// @vitest-pool forks
-
-/**
- * Test showing multiple Eval() calls in one file
- */
-
-import { afterAll, vi } from 'vitest';
-import { InMemorySpanExporter, SimpleSpanProcessor } from '@opentelemetry/sdk-trace-base';
-import { NodeTracerProvider } from '@opentelemetry/sdk-trace-node';
-import type { ResolvedAxiomConfig } from '../../src/config/index';
-
-// Setup
-const fetchCalls: Array<{ url: string; evalName?: string }> = [];
-const spanExporter = new InMemorySpanExporter();
-const tracerProvider = new NodeTracerProvider({
-  spanProcessors: [new SimpleSpanProcessor(spanExporter)],
-});
-
-const mockConfig: ResolvedAxiomConfig = {
-  eval: {
-    url: 'https://test.axiom.co',
-    token: 'test-token',
-    dataset: 'test-dataset',
-    instrumentation: null,
-    include: [],
-    exclude: [],
-    timeoutMs: 60_000,
-  },
-} as ResolvedAxiomConfig;
-
-global.fetch = vi.fn(async (url: string, _options?: any) => {
-  fetchCalls.push({ url: String(url) });
-  if (url.includes('_apl')) {
-    return new Response(JSON.stringify({ matches: [] }), {
-      status: 200,
-      headers: { 'Content-Type': 'application/json' },
-    });
-  }
-  return new Response(JSON.stringify({ success: true }), {
-    status: 200,
-    headers: { 'Content-Type': 'application/json' },
-  });
-}) as any;
-
-const originalLog = console.log;
-console.log = (..._args: any[]) => {
-  // Suppress output
-};
-
-// Mock instrumentation
-vi.doMock('../../src/evals/instrument', async () => {
-  const { trace: _trace } = await import('@opentelemetry/api');
-  const tracer = tracerProvider.getTracer('axiom-eval-test');
-  return {
-    ensureInstrumentationInitialized: vi.fn(async () => {}),
-    initInstrumentation: vi.fn(async () => {}),
-    flush: vi.fn(async () => {
-      await tracerProvider.forceFlush();
-    }),
-    startSpan: vi.fn((name: string, opts: any, ctx?: any) => tracer.startSpan(name, opts, ctx)),
-    startActiveSpan: vi.fn(async (name: string, opts: any, fn: any, ctx?: any) => {
-      const span = tracer.startSpan(name, opts, ctx);
-      try {
-        const result = await fn(span);
-        span.end();
-        return result;
-      } catch (error) {
-        span.recordException(error as Error);
-        span.end();
-        throw error;
-      }
-    }),
-  };
-});
-
-vi.doMock('../../src/evals/context/storage', () => ({
-  getAxiomConfig: vi.fn(() => mockConfig),
-  setAxiomConfig: vi.fn(),
-  getConfigScope: vi.fn(() => ({ getAllDefaultFlags: () => ({}) })),
-  withEvalContext: vi.fn(async (_opts: any, fn: any) => await fn()),
-  getEvalContext: vi.fn(() => ({ flags: {}, outOfScopeFlags: [] })),
-}));
-
-vi.doMock('../../src/evals/context/global-flags', () => ({
-  getGlobalFlagOverrides: vi.fn(() => ({})),
-  setGlobalFlagOverrides: vi.fn(),
-}));
-
-vi.doMock('vitest', async () => {
-  const actual = await vi.importActual('vitest');
-  return {
-    ...actual,
-    inject: vi.fn((key: string) => {
-      const context: Record<string, any> = {
-        baseline: undefined,
-        debug: false,
-        list: false,
-        overrides: {},
-        axiomConfig: mockConfig,
-        runId: 'test-run-123',
-      };
-      return context[key];
-    }),
-  };
-});
-
-const { Eval } = await import('../../src/evals/eval');
-
-// Create scorers
-const scorer1 = async ({ 
output: _output }: { output: any }) => ({ score: 1.0 });
-Object.defineProperty(scorer1, 'name', { value: 'scorer-1' });
-
-const scorer2 = async ({ output: _output }: { output: any }) => ({ score: 0.8 });
-Object.defineProperty(scorer2, 'name', { value: 'scorer-2' });
-
-// ===== FIRST EVAL =====
-Eval('First-Eval', {
-  data: async () => [{ input: 'input 1', expected: 'expected 1' }],
-  task: async ({ input }) => `output for ${input}`,
-  scorers: [scorer1 as any],
-});
-
-// ===== SECOND EVAL =====
-Eval('Second-Eval', {
-  data: async () => [
-    { input: 'input A', expected: 'expected A' },
-    { input: 'input B', expected: 'expected B' },
-  ],
-  task: async ({ input }) => `result for ${input}`,
-  scorers: [scorer2 as any],
-});
-
-afterAll(async () => {
-  console.log = originalLog;
-
-  const spans = spanExporter.getFinishedSpans();
-
-  console.log('\n=== MULTIPLE EVALS IN ONE FILE ===');
-  console.log(`Total fetch calls: ${fetchCalls.length}`);
-  console.log(`Total spans: ${spans.length}`);
-
-  const firstEvalSpan = spans.find((s) => s.name.includes('First-Eval'));
-  const secondEvalSpan = spans.find((s) => s.name.includes('Second-Eval'));
-
-  if (!firstEvalSpan) throw new Error('Expected First Eval span');
-  if (!secondEvalSpan) throw new Error('Expected Second Eval span');
-
-  console.log('✓ Both evals ran');
-  console.log(`✓ First Eval: ${firstEvalSpan.name}`);
-  console.log(`✓ Second Eval: ${secondEvalSpan.name}`);
-
-  // We should have 3 tests total (1 from First + 2 from Second)
-  const caseSpans = spans.filter((s) => s.name.startsWith('case'));
-  if (caseSpans.length !== 3) {
-    throw new Error(`Expected 3 case spans, got ${caseSpans.length}`);
-  }
-
-  console.log(`✓ All 3 test cases ran\n`);
-
-  await tracerProvider.shutdown();
-  await spanExporter.shutdown();
-});
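
Note on the top-level constraint documented in PATCH 1/2: Eval() creates vitest suites by
calling describe() during module evaluation, which is why it cannot be invoked inside an
it() block. A minimal sketch of that registration pattern follows, assuming a hypothetical
registerEval() stand-in (the real implementation lives in packages/ai/src/evals/eval.ts and
differs in detail):

import { describe, expect, it } from 'vitest';

type EvalCase = { input: string; expected: string };
type ScorerFn = (args: { output: string }) => Promise<{ score: number }>;

// Hypothetical stand-in for the real Eval(). It calls describe() while the
// module is being imported, which is why Eval() must run at module top level:
// vitest only collects suites registered during file import, never ones
// created inside an already-running it() block.
function registerEval(
  name: string,
  opts: {
    data: () => Promise<EvalCase[]>;
    task: (args: { input: string }) => Promise<string>;
    scorers: ScorerFn[];
  },
): void {
  describe(`eval ${name}`, () => {
    it('runs every case through the task and all scorers', async () => {
      for (const evalCase of await opts.data()) {
        const output = await opts.task({ input: evalCase.input });
        for (const scorer of opts.scorers) {
          const { score } = await scorer({ output });
          expect(score).toBeGreaterThanOrEqual(0);
          expect(score).toBeLessThanOrEqual(1);
        }
      }
    });
  });
}

// Usage mirrors the tests above: the call sits at module top level.
registerEval('Sketch-Eval', {
  data: async () => [{ input: 'test input', expected: 'expected output' }],
  task: async ({ input }) => `output for ${input}`,
  scorers: [async ({ output }) => ({ score: output.includes('output') ? 1.0 : 0.0 })],
});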