From ce596606b933506d6fe7f016a6d1e69c37be40d7 Mon Sep 17 00:00:00 2001 From: Christopher Ehrlich Date: Fri, 14 Nov 2025 14:40:00 +0700 Subject: [PATCH 1/2] initial testing idea --- packages/ai/package.json | 6 +- .../ai/test/evals/eval-integration.test.ts | 205 ++++++++++++++++++ packages/ai/test/evals/eval-multi.test.ts | 163 ++++++++++++++ packages/ai/vitest.config.ts | 1 + 4 files changed, 372 insertions(+), 3 deletions(-) create mode 100644 packages/ai/test/evals/eval-integration.test.ts create mode 100644 packages/ai/test/evals/eval-multi.test.ts diff --git a/packages/ai/package.json b/packages/ai/package.json index 4c4fdb4c..089d86d6 100644 --- a/packages/ai/package.json +++ b/packages/ai/package.json @@ -82,8 +82,10 @@ "c12": "^2.0.4", "commander": "^14.0.0", "defu": "^6.1.4", + "esbuild": "^0.25.8", "handlebars": "^4.7.8", - "nanoid": "^5.1.5" + "nanoid": "^5.1.5", + "vitest": "catalog:" }, "peerDependencies": { "@opentelemetry/api": "^1.9.0", @@ -105,13 +107,11 @@ "@vitest/coverage-v8": "^3.2.4", "aiv4": "npm:ai@^4.3.19", "aiv5": "npm:ai@^5.0.0", - "esbuild": "^0.25.8", "eslint": "catalog:", "prettier": "catalog:", "tinyrainbow": "^2.0.0", "tsup": "catalog:", "typescript": "catalog:", - "vitest": "catalog:", "zod": "catalog:" }, "files": [ diff --git a/packages/ai/test/evals/eval-integration.test.ts b/packages/ai/test/evals/eval-integration.test.ts new file mode 100644 index 00000000..17a94a10 --- /dev/null +++ b/packages/ai/test/evals/eval-integration.test.ts @@ -0,0 +1,205 @@ +// @vitest-environment node +// @vitest-pool forks + +/** + * Integration test for Eval() that captures network calls, spans, and console output. + * + * IMPORTANT: Eval() must be called at the MODULE TOP-LEVEL, not inside it() blocks, + * because it calls describe() to create vitest suites dynamically. 
+ */
+
+import { afterAll, vi } from 'vitest';
+import { InMemorySpanExporter, SimpleSpanProcessor } from '@opentelemetry/sdk-trace-base';
+import { NodeTracerProvider } from '@opentelemetry/sdk-trace-node';
+import type { ResolvedAxiomConfig } from '../../src/config/index';
+import type { ReadableSpan } from '@opentelemetry/sdk-trace-base';
+
+// ===== SETUP: Capture side effects =====
+
+const fetchCalls: Array<{ url: string; options: any }> = [];
+const consoleOutput: string[] = [];
+const spanExporter = new InMemorySpanExporter();
+
+// Setup OTel tracer provider with in-memory exporter
+const tracerProvider = new NodeTracerProvider({
+  spanProcessors: [new SimpleSpanProcessor(spanExporter)],
+});
+
+const mockConfig: ResolvedAxiomConfig = {
+  eval: {
+    url: 'https://test.axiom.co',
+    token: 'test-token',
+    dataset: 'test-dataset',
+    instrumentation: null,
+    include: [],
+    exclude: [],
+    timeoutMs: 60_000,
+  },
+} as ResolvedAxiomConfig;
+
+// Mock fetch to capture network calls
+global.fetch = vi.fn(async (url: string, options?: any) => {
+  fetchCalls.push({ url: String(url), options });
+
+  // Return empty baseline response for APL queries
+  if (url.includes('_apl')) {
+    return new Response(JSON.stringify({ matches: [] }), {
+      status: 200,
+      headers: { 'Content-Type': 'application/json' },
+    });
+  }
+
+  // Return success for evaluation API calls
+  return new Response(JSON.stringify({ success: true }), {
+    status: 200,
+    headers: { 'Content-Type': 'application/json' },
+  });
+}) as any;
+
+// Mock console.log to capture output
+const originalLog = console.log;
+console.log = (...args: any[]) => {
+  consoleOutput.push(args.map(String).join(' '));
+};
+
+// Mock the instrumentation module to use our test provider
+vi.doMock('../../src/evals/instrument', async () => {
+  const { trace: _trace } = await import('@opentelemetry/api');
+  const tracer = tracerProvider.getTracer('axiom-eval-test');
+
+  return {
+    ensureInstrumentationInitialized: vi.fn(async () => {}),
+    initInstrumentation: vi.fn(async () => {}),
+    flush: vi.fn(async () => {
+      await tracerProvider.forceFlush();
+    }),
+    startSpan: vi.fn((name: string, opts: any, ctx?: any) => {
+      return tracer.startSpan(name, opts, ctx);
+    }),
+    startActiveSpan: vi.fn(async (name: string, opts: any, fn: any, ctx?: any) => {
+      const span = tracer.startSpan(name, opts, ctx);
+      try {
+        const result = await fn(span);
+        span.end();
+        return result;
+      } catch (error) {
+        span.recordException(error as Error);
+        span.end();
+        throw error;
+      }
+    }),
+  };
+});
+
+// Mock context storage
+vi.doMock('../../src/evals/context/storage', () => ({
+  getAxiomConfig: vi.fn(() => mockConfig),
+  setAxiomConfig: vi.fn(),
+  getConfigScope: vi.fn(() => ({
+    getAllDefaultFlags: () => ({}),
+  })),
+  withEvalContext: vi.fn(async (_opts: any, fn: any) => {
+    const result = await fn();
+    return result;
+  }),
+  getEvalContext: vi.fn(() => ({
+    flags: {},
+    outOfScopeFlags: [],
+  })),
+}));
+
+// Mock global flags
+vi.doMock('../../src/evals/context/global-flags', () => ({
+  getGlobalFlagOverrides: vi.fn(() => ({})),
+  setGlobalFlagOverrides: vi.fn(),
+}));
+
+// Mock vitest inject() to provide context
+vi.doMock('vitest', async () => {
+  const actual = await vi.importActual('vitest');
+  return {
+    ...actual,
+    inject: vi.fn((key: string) => {
+      const context: Record<string, any> = {
+        baseline: undefined,
+        debug: false,
+        list: false,
+        overrides: {},
+        axiomConfig: mockConfig,
+        runId: 'test-run-123',
+      };
+      return context[key];
+    }),
+  };
+});
+
+// ===== CALL Eval() AT TOP LEVEL 
===== + +const { Eval } = await import('../../src/evals/eval'); + +// Create scorer function with name property +const testScorer = async ({ output }: { output: any }) => { + return { + score: typeof output === 'string' && output.includes('output') ? 1.0 : 0.0, + }; +}; +Object.defineProperty(testScorer, 'name', { value: 'test-scorer' }); + +Eval('Integration-Test-Eval', { + data: async () => [ + { input: 'test input 1', expected: 'expected output 1' }, + { input: 'test input 2', expected: 'expected output 2' }, + ], + task: async ({ input }) => { + return `output for ${input}`; + }, + scorers: [testScorer as any], +}); + +// ===== ASSERTIONS: Run after all tests complete ===== + +afterAll(async () => { + // Restore console + console.log = originalLog; + + // Get all captured data + const spans: ReadableSpan[] = spanExporter.getFinishedSpans(); + + // Assert network calls + const baselineCall = fetchCalls.find((c) => c.url.includes('_apl')); + const createCall = fetchCalls.find( + (c) => c.url.includes('/api/evaluations/v3') && c.options?.method === 'POST', + ); + const updateCall = fetchCalls.find( + (c) => c.url.includes('/api/evaluations/v3') && c.options?.method === 'PATCH', + ); + + if (!baselineCall) throw new Error('Expected baseline query call'); + if (!createCall) throw new Error('Expected create evaluation call'); + if (!updateCall) throw new Error('Expected update evaluation call'); + + // Assert span structure + const evalSpan = spans.find((s) => s.name.includes('eval Integration-Test-Eval')); + const caseSpans = spans.filter((s) => s.name.startsWith('case')); + const taskSpans = spans.filter((s) => s.name === 'task'); + const scorerSpans = spans.filter((s) => s.name.includes('score')); + + if (!evalSpan) throw new Error('Expected eval span'); + if (caseSpans.length !== 2) throw new Error(`Expected 2 case spans, got ${caseSpans.length}`); + if (taskSpans.length !== 2) throw new Error(`Expected 2 task spans, got ${taskSpans.length}`); + if (scorerSpans.length !== 2) { + throw new Error(`Expected 2 scorer spans, got ${scorerSpans.length}`); + } + + // Assert span attributes + const firstCaseSpan = caseSpans[0]; + const attrs = firstCaseSpan.attributes; + + if (!attrs['eval.case.input']) throw new Error('Expected eval.case.input attribute'); + if (!attrs['eval.case.output']) throw new Error('Expected eval.case.output attribute'); + if (!attrs['eval.case.scores']) throw new Error('Expected eval.case.scores attribute'); + + // Cleanup + await tracerProvider.shutdown(); + await spanExporter.shutdown(); +}); diff --git a/packages/ai/test/evals/eval-multi.test.ts b/packages/ai/test/evals/eval-multi.test.ts new file mode 100644 index 00000000..dc896e4b --- /dev/null +++ b/packages/ai/test/evals/eval-multi.test.ts @@ -0,0 +1,163 @@ +// @vitest-environment node +// @vitest-pool forks + +/** + * Test showing multiple Eval() calls in one file + */ + +import { afterAll, vi } from 'vitest'; +import { InMemorySpanExporter, SimpleSpanProcessor } from '@opentelemetry/sdk-trace-base'; +import { NodeTracerProvider } from '@opentelemetry/sdk-trace-node'; +import type { ResolvedAxiomConfig } from '../../src/config/index'; + +// Setup +const fetchCalls: Array<{ url: string; evalName?: string }> = []; +const spanExporter = new InMemorySpanExporter(); +const tracerProvider = new NodeTracerProvider({ + spanProcessors: [new SimpleSpanProcessor(spanExporter)], +}); + +const mockConfig: ResolvedAxiomConfig = { + eval: { + url: 'https://test.axiom.co', + token: 'test-token', + dataset: 'test-dataset', + 
instrumentation: null,
+    include: [],
+    exclude: [],
+    timeoutMs: 60_000,
+  },
+} as ResolvedAxiomConfig;
+
+global.fetch = vi.fn(async (url: string, _options?: any) => {
+  fetchCalls.push({ url: String(url) });
+  if (url.includes('_apl')) {
+    return new Response(JSON.stringify({ matches: [] }), {
+      status: 200,
+      headers: { 'Content-Type': 'application/json' },
+    });
+  }
+  return new Response(JSON.stringify({ success: true }), {
+    status: 200,
+    headers: { 'Content-Type': 'application/json' },
+  });
+}) as any;
+
+const originalLog = console.log;
+console.log = (..._args: any[]) => {
+  // Suppress output
+};
+
+// Mock instrumentation
+vi.doMock('../../src/evals/instrument', async () => {
+  const { trace: _trace } = await import('@opentelemetry/api');
+  const tracer = tracerProvider.getTracer('axiom-eval-test');
+  return {
+    ensureInstrumentationInitialized: vi.fn(async () => {}),
+    initInstrumentation: vi.fn(async () => {}),
+    flush: vi.fn(async () => {
+      await tracerProvider.forceFlush();
+    }),
+    startSpan: vi.fn((name: string, opts: any, ctx?: any) => tracer.startSpan(name, opts, ctx)),
+    startActiveSpan: vi.fn(async (name: string, opts: any, fn: any, ctx?: any) => {
+      const span = tracer.startSpan(name, opts, ctx);
+      try {
+        const result = await fn(span);
+        span.end();
+        return result;
+      } catch (error) {
+        span.recordException(error as Error);
+        span.end();
+        throw error;
+      }
+    }),
+  };
+});
+
+vi.doMock('../../src/evals/context/storage', () => ({
+  getAxiomConfig: vi.fn(() => mockConfig),
+  setAxiomConfig: vi.fn(),
+  getConfigScope: vi.fn(() => ({ getAllDefaultFlags: () => ({}) })),
+  withEvalContext: vi.fn(async (_opts: any, fn: any) => await fn()),
+  getEvalContext: vi.fn(() => ({ flags: {}, outOfScopeFlags: [] })),
+}));
+
+vi.doMock('../../src/evals/context/global-flags', () => ({
+  getGlobalFlagOverrides: vi.fn(() => ({})),
+  setGlobalFlagOverrides: vi.fn(),
+}));
+
+vi.doMock('vitest', async () => {
+  const actual = await vi.importActual('vitest');
+  return {
+    ...actual,
+    inject: vi.fn((key: string) => {
+      const context: Record<string, any> = {
+        baseline: undefined,
+        debug: false,
+        list: false,
+        overrides: {},
+        axiomConfig: mockConfig,
+        runId: 'test-run-123',
+      };
+      return context[key];
+    }),
+  };
+});
+
+const { Eval } = await import('../../src/evals/eval');
+
+// Create scorers
+const scorer1 = async ({ output: _output }: { output: any }) => ({ score: 1.0 });
+Object.defineProperty(scorer1, 'name', { value: 'scorer-1' });
+
+const scorer2 = async ({ output: _output }: { output: any }) => ({ score: 0.8 });
+Object.defineProperty(scorer2, 'name', { value: 'scorer-2' });
+
+// ===== FIRST EVAL =====
+Eval('First-Eval', {
+  data: async () => [{ input: 'input 1', expected: 'expected 1' }],
+  task: async ({ input }) => `output for ${input}`,
+  scorers: [scorer1 as any],
+});
+
+// ===== SECOND EVAL =====
+Eval('Second-Eval', {
+  data: async () => [
+    { input: 'input A', expected: 'expected A' },
+    { input: 'input B', expected: 'expected B' },
+  ],
+  task: async ({ input }) => `result for ${input}`,
+  scorers: [scorer2 as any],
+});
+
+afterAll(async () => {
+  console.log = originalLog;
+
+  const spans = spanExporter.getFinishedSpans();
+
+  console.log('\n=== MULTIPLE EVALS IN ONE FILE ===');
+  console.log(`Total fetch calls: ${fetchCalls.length}`);
+  console.log(`Total spans: ${spans.length}`);
+
+  const firstEvalSpan = spans.find((s) => s.name.includes('First-Eval'));
+  const secondEvalSpan = spans.find((s) => s.name.includes('Second-Eval'));
+
+  if (!firstEvalSpan) throw new 
Error('Expected First Eval span');
+  if (!secondEvalSpan) throw new Error('Expected Second Eval span');
+
+  console.log('✓ Both evals ran');
+  console.log(`✓ First Eval: ${firstEvalSpan.name}`);
+  console.log(`✓ Second Eval: ${secondEvalSpan.name}`);
+
+  // We should have 3 tests total (1 from First + 2 from Second)
+  const caseSpans = spans.filter((s) => s.name.startsWith('case'));
+  if (caseSpans.length !== 3) {
+    throw new Error(`Expected 3 case spans, got ${caseSpans.length}`);
+  }
+
+  console.log(`✓ All 3 test cases ran\n`);
+
+  await tracerProvider.shutdown();
+  await spanExporter.shutdown();
+});
diff --git a/packages/ai/vitest.config.ts b/packages/ai/vitest.config.ts
index 887054f4..8f3512af 100644
--- a/packages/ai/vitest.config.ts
+++ b/packages/ai/vitest.config.ts
@@ -5,6 +5,7 @@ export default defineConfig({
   test: {
     environment: 'node',
     include: ['test/**/*.test.ts'],
+    exclude: ['test/evals/**/*.integration.test.ts', '**/node_modules/**'],
     globals: true,
     pool: 'forks', // TODO: ensure that this allows parallel tests

From 049667092f735e3c3d7808ba84dd013d58bf3e52 Mon Sep 17 00:00:00 2001
From: Christopher Ehrlich
Date: Fri, 14 Nov 2025 15:00:07 +0700
Subject: [PATCH 2/2] better integration

---
 .../ai/test/evals/eval-integration.test.ts |  81 +++++----
 packages/ai/test/evals/eval-multi.test.ts  | 163 ------------------
 2 files changed, 52 insertions(+), 192 deletions(-)
 delete mode 100644 packages/ai/test/evals/eval-multi.test.ts

diff --git a/packages/ai/test/evals/eval-integration.test.ts b/packages/ai/test/evals/eval-integration.test.ts
index 17a94a10..731d4474 100644
--- a/packages/ai/test/evals/eval-integration.test.ts
+++ b/packages/ai/test/evals/eval-integration.test.ts
@@ -2,10 +2,7 @@
 // @vitest-pool forks
 
 /**
- * Integration test for Eval() that captures network calls, spans, and console output.
- *
- * IMPORTANT: Eval() must be called at the MODULE TOP-LEVEL, not inside it() blocks,
- * because it calls describe() to create vitest suites dynamically. 
+ * Integration test for Eval() that captures network calls and spans */ import { afterAll, vi } from 'vitest'; @@ -14,13 +11,10 @@ import { NodeTracerProvider } from '@opentelemetry/sdk-trace-node'; import type { ResolvedAxiomConfig } from '../../src/config/index'; import type { ReadableSpan } from '@opentelemetry/sdk-trace-base'; -// ===== SETUP: Capture side effects ===== - const fetchCalls: Array<{ url: string; options: any }> = []; const consoleOutput: string[] = []; const spanExporter = new InMemorySpanExporter(); -// Setup OTel tracer provider with in-memory exporter const tracerProvider = new NodeTracerProvider({ spanProcessors: [new SimpleSpanProcessor(spanExporter)], }); @@ -37,11 +31,9 @@ const mockConfig: ResolvedAxiomConfig = { }, } as ResolvedAxiomConfig; -// Mock fetch to capture network calls global.fetch = vi.fn(async (url: string, options?: any) => { fetchCalls.push({ url: String(url), options }); - // Return empty baseline response for APL queries if (url.includes('_apl')) { return new Response(JSON.stringify({ matches: [] }), { status: 200, @@ -49,20 +41,17 @@ global.fetch = vi.fn(async (url: string, options?: any) => { }); } - // Return success for evaluation API calls return new Response(JSON.stringify({ success: true }), { status: 200, headers: { 'Content-Type': 'application/json' }, }); }) as any; -// Mock console.log to capture output const originalLog = console.log; console.log = (...args: any[]) => { consoleOutput.push(args.map(String).join(' ')); }; -// Mock the instrumentation module to use our test provider vi.doMock('../../src/evals/instrument', async () => { const { trace: _trace } = await import('@opentelemetry/api'); const tracer = tracerProvider.getTracer('axiom-eval-test'); @@ -133,17 +122,22 @@ vi.doMock('vitest', async () => { }; }); -// ===== CALL Eval() AT TOP LEVEL ===== - const { Eval } = await import('../../src/evals/eval'); +const { createScorer: Scorer } = await import('../../src/evals/scorers'); -// Create scorer function with name property -const testScorer = async ({ output }: { output: any }) => { +const testScorer = Scorer('test-scorer', async ({ output }: { output: any }) => { return { score: typeof output === 'string' && output.includes('output') ? 
1.0 : 0.0, }; -}; -Object.defineProperty(testScorer, 'name', { value: 'test-scorer' }); +}); + +const scorer1 = Scorer('scorer-1', async ({ output: _output }: { output: any }) => ({ + score: 1.0, +})); + +const scorer2 = Scorer('scorer-2', async ({ output: _output }: { output: any }) => ({ + score: 0.8, +})); Eval('Integration-Test-Eval', { data: async () => [ @@ -156,16 +150,26 @@ Eval('Integration-Test-Eval', { scorers: [testScorer as any], }); -// ===== ASSERTIONS: Run after all tests complete ===== +Eval('Second-Eval', { + data: async () => [{ input: 'input A', expected: 'expected A' }], + task: async ({ input }) => `output for ${input}`, + scorers: [scorer1 as any], +}); + +Eval('Third-Eval', { + data: async () => [ + { input: 'input X', expected: 'expected X' }, + { input: 'input Y', expected: 'expected Y' }, + ], + task: async ({ input }) => `result for ${input}`, + scorers: [scorer2 as any], +}); afterAll(async () => { - // Restore console console.log = originalLog; - // Get all captured data const spans: ReadableSpan[] = spanExporter.getFinishedSpans(); - // Assert network calls const baselineCall = fetchCalls.find((c) => c.url.includes('_apl')); const createCall = fetchCalls.find( (c) => c.url.includes('/api/evaluations/v3') && c.options?.method === 'POST', @@ -178,27 +182,46 @@ afterAll(async () => { if (!createCall) throw new Error('Expected create evaluation call'); if (!updateCall) throw new Error('Expected update evaluation call'); - // Assert span structure + // Assert span structure for Integration-Test-Eval (comprehensive) const evalSpan = spans.find((s) => s.name.includes('eval Integration-Test-Eval')); - const caseSpans = spans.filter((s) => s.name.startsWith('case')); + const allCaseSpans = spans.filter((s) => s.name.startsWith('case')); + const integrationCaseSpans = allCaseSpans.slice(0, 2); // First 2 cases are from Integration-Test-Eval const taskSpans = spans.filter((s) => s.name === 'task'); const scorerSpans = spans.filter((s) => s.name.includes('score')); if (!evalSpan) throw new Error('Expected eval span'); - if (caseSpans.length !== 2) throw new Error(`Expected 2 case spans, got ${caseSpans.length}`); - if (taskSpans.length !== 2) throw new Error(`Expected 2 task spans, got ${taskSpans.length}`); - if (scorerSpans.length !== 2) { - throw new Error(`Expected 2 scorer spans, got ${scorerSpans.length}`); + if (integrationCaseSpans.length !== 2) { + throw new Error( + `Expected 2 Integration-Test-Eval case spans, got ${integrationCaseSpans.length}`, + ); + } + if (taskSpans.length < 2) { + throw new Error(`Expected at least 2 task spans, got ${taskSpans.length}`); + } + if (scorerSpans.length < 2) { + throw new Error(`Expected at least 2 scorer spans, got ${scorerSpans.length}`); } // Assert span attributes - const firstCaseSpan = caseSpans[0]; + const firstCaseSpan = integrationCaseSpans[0]; const attrs = firstCaseSpan.attributes; if (!attrs['eval.case.input']) throw new Error('Expected eval.case.input attribute'); if (!attrs['eval.case.output']) throw new Error('Expected eval.case.output attribute'); if (!attrs['eval.case.scores']) throw new Error('Expected eval.case.scores attribute'); + // Assert multiple evals ran (light validation) + const secondEvalSpan = spans.find((s) => s.name.includes('Second-Eval')); + const thirdEvalSpan = spans.find((s) => s.name.includes('Third-Eval')); + + if (!secondEvalSpan) throw new Error('Expected Second-Eval span'); + if (!thirdEvalSpan) throw new Error('Expected Third-Eval span'); + + // Total case count should be 2 + 1 + 2 
= 5
+  if (allCaseSpans.length !== 5) {
+    throw new Error(`Expected 5 total case spans, got ${allCaseSpans.length}`);
+  }
+
   // Cleanup
   await tracerProvider.shutdown();
   await spanExporter.shutdown();
diff --git a/packages/ai/test/evals/eval-multi.test.ts b/packages/ai/test/evals/eval-multi.test.ts
deleted file mode 100644
index dc896e4b..00000000
--- a/packages/ai/test/evals/eval-multi.test.ts
+++ /dev/null
@@ -1,163 +0,0 @@
-// @vitest-environment node
-// @vitest-pool forks
-
-/**
- * Test showing multiple Eval() calls in one file
- */
-
-import { afterAll, vi } from 'vitest';
-import { InMemorySpanExporter, SimpleSpanProcessor } from '@opentelemetry/sdk-trace-base';
-import { NodeTracerProvider } from '@opentelemetry/sdk-trace-node';
-import type { ResolvedAxiomConfig } from '../../src/config/index';
-
-// Setup
-const fetchCalls: Array<{ url: string; evalName?: string }> = [];
-const spanExporter = new InMemorySpanExporter();
-const tracerProvider = new NodeTracerProvider({
-  spanProcessors: [new SimpleSpanProcessor(spanExporter)],
-});
-
-const mockConfig: ResolvedAxiomConfig = {
-  eval: {
-    url: 'https://test.axiom.co',
-    token: 'test-token',
-    dataset: 'test-dataset',
-    instrumentation: null,
-    include: [],
-    exclude: [],
-    timeoutMs: 60_000,
-  },
-} as ResolvedAxiomConfig;
-
-global.fetch = vi.fn(async (url: string, _options?: any) => {
-  fetchCalls.push({ url: String(url) });
-  if (url.includes('_apl')) {
-    return new Response(JSON.stringify({ matches: [] }), {
-      status: 200,
-      headers: { 'Content-Type': 'application/json' },
-    });
-  }
-  return new Response(JSON.stringify({ success: true }), {
-    status: 200,
-    headers: { 'Content-Type': 'application/json' },
-  });
-}) as any;
-
-const originalLog = console.log;
-console.log = (..._args: any[]) => {
-  // Suppress output
-};
-
-// Mock instrumentation
-vi.doMock('../../src/evals/instrument', async () => {
-  const { trace: _trace } = await import('@opentelemetry/api');
-  const tracer = tracerProvider.getTracer('axiom-eval-test');
-  return {
-    ensureInstrumentationInitialized: vi.fn(async () => {}),
-    initInstrumentation: vi.fn(async () => {}),
-    flush: vi.fn(async () => {
-      await tracerProvider.forceFlush();
-    }),
-    startSpan: vi.fn((name: string, opts: any, ctx?: any) => tracer.startSpan(name, opts, ctx)),
-    startActiveSpan: vi.fn(async (name: string, opts: any, fn: any, ctx?: any) => {
-      const span = tracer.startSpan(name, opts, ctx);
-      try {
-        const result = await fn(span);
-        span.end();
-        return result;
-      } catch (error) {
-        span.recordException(error as Error);
-        span.end();
-        throw error;
-      }
-    }),
-  };
-});
-
-vi.doMock('../../src/evals/context/storage', () => ({
-  getAxiomConfig: vi.fn(() => mockConfig),
-  setAxiomConfig: vi.fn(),
-  getConfigScope: vi.fn(() => ({ getAllDefaultFlags: () => ({}) })),
-  withEvalContext: vi.fn(async (_opts: any, fn: any) => await fn()),
-  getEvalContext: vi.fn(() => ({ flags: {}, outOfScopeFlags: [] })),
-}));
-
-vi.doMock('../../src/evals/context/global-flags', () => ({
-  getGlobalFlagOverrides: vi.fn(() => ({})),
-  setGlobalFlagOverrides: vi.fn(),
-}));
-
-vi.doMock('vitest', async () => {
-  const actual = await vi.importActual('vitest');
-  return {
-    ...actual,
-    inject: vi.fn((key: string) => {
-      const context: Record<string, any> = {
-        baseline: undefined,
-        debug: false,
-        list: false,
-        overrides: {},
-        axiomConfig: mockConfig,
-        runId: 'test-run-123',
-      };
-      return context[key];
-    }),
-  };
-});
-
-const { Eval } = await import('../../src/evals/eval');
-
-// Create scorers
-const scorer1 = async ({ 
output: _output }: { output: any }) => ({ score: 1.0 });
-Object.defineProperty(scorer1, 'name', { value: 'scorer-1' });
-
-const scorer2 = async ({ output: _output }: { output: any }) => ({ score: 0.8 });
-Object.defineProperty(scorer2, 'name', { value: 'scorer-2' });
-
-// ===== FIRST EVAL =====
-Eval('First-Eval', {
-  data: async () => [{ input: 'input 1', expected: 'expected 1' }],
-  task: async ({ input }) => `output for ${input}`,
-  scorers: [scorer1 as any],
-});
-
-// ===== SECOND EVAL =====
-Eval('Second-Eval', {
-  data: async () => [
-    { input: 'input A', expected: 'expected A' },
-    { input: 'input B', expected: 'expected B' },
-  ],
-  task: async ({ input }) => `result for ${input}`,
-  scorers: [scorer2 as any],
-});
-
-afterAll(async () => {
-  console.log = originalLog;
-
-  const spans = spanExporter.getFinishedSpans();
-
-  console.log('\n=== MULTIPLE EVALS IN ONE FILE ===');
-  console.log(`Total fetch calls: ${fetchCalls.length}`);
-  console.log(`Total spans: ${spans.length}`);
-
-  const firstEvalSpan = spans.find((s) => s.name.includes('First-Eval'));
-  const secondEvalSpan = spans.find((s) => s.name.includes('Second-Eval'));
-
-  if (!firstEvalSpan) throw new Error('Expected First Eval span');
-  if (!secondEvalSpan) throw new Error('Expected Second Eval span');
-
-  console.log('✓ Both evals ran');
-  console.log(`✓ First Eval: ${firstEvalSpan.name}`);
-  console.log(`✓ Second Eval: ${secondEvalSpan.name}`);
-
-  // We should have 3 tests total (1 from First + 2 from Second)
-  const caseSpans = spans.filter((s) => s.name.startsWith('case'));
-  if (caseSpans.length !== 3) {
-    throw new Error(`Expected 3 case spans, got ${caseSpans.length}`);
-  }
-
-  console.log(`✓ All 3 test cases ran\n`);
-
-  await tracerProvider.shutdown();
-  await spanExporter.shutdown();
-});
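
Note on the top-level constraint documented in PATCH 1/2: Eval() creates vitest suites by
calling describe() during module evaluation, which is why it cannot be invoked inside an
it() block. A minimal sketch of that registration pattern follows, assuming a hypothetical
registerEval() stand-in (the real implementation lives in packages/ai/src/evals/eval.ts and
differs in detail):

import { describe, expect, it } from 'vitest';

type EvalCase = { input: string; expected: string };
type ScorerFn = (args: { output: string }) => Promise<{ score: number }>;

// Hypothetical stand-in for the real Eval(). It calls describe() while the
// module is being imported, which is why Eval() must run at module top level:
// vitest only collects suites registered during file import, never ones
// created inside an already-running it() block.
function registerEval(
  name: string,
  opts: {
    data: () => Promise<EvalCase[]>;
    task: (args: { input: string }) => Promise<string>;
    scorers: ScorerFn[];
  },
): void {
  describe(`eval ${name}`, () => {
    it('runs every case through the task and all scorers', async () => {
      for (const evalCase of await opts.data()) {
        const output = await opts.task({ input: evalCase.input });
        for (const scorer of opts.scorers) {
          const { score } = await scorer({ output });
          expect(score).toBeGreaterThanOrEqual(0);
          expect(score).toBeLessThanOrEqual(1);
        }
      }
    });
  });
}

// Usage mirrors the tests above: the call sits at module top level.
registerEval('Sketch-Eval', {
  data: async () => [{ input: 'test input', expected: 'expected output' }],
  task: async ({ input }) => `output for ${input}`,
  scorers: [async ({ output }) => ({ score: output.includes('output') ? 1.0 : 0.0 })],
});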