diff --git a/packages/benchmarks/src/app-development/analyzeDatabaseChoice.ts b/packages/benchmarks/src/app-development/analyzeDatabaseChoice.ts index d4bb577..22ad402 100644 --- a/packages/benchmarks/src/app-development/analyzeDatabaseChoice.ts +++ b/packages/benchmarks/src/app-development/analyzeDatabaseChoice.ts @@ -1,6 +1,7 @@ import { z } from "zod"; import { generateText, LanguageModel, Output } from "mongodb-rag-core/aiSdk"; import { primaryDatabases, PrimaryDatabase } from "./classifyAppStack"; +import { wrapTraced } from "mongodb-rag-core/braintrust"; interface JustificationReasonDefinition { name: string; @@ -155,8 +156,6 @@ export const DatabaseChoiceAnalysisSchema = z.object({ ), mainJustifications: z .array(z.enum(justificationReasons)) - .min(1) - .max(5) .describe( "The 1-5 most important reasons why the model chose or did not choose MongoDB, " + "ordered by importance. The first element is the primary driver. " + @@ -236,30 +235,32 @@ interface AnalyzeDatabaseChoiceParams { * Uses an LLM judge to produce structured analysis including * justification reasons, alternatives considered, and a fit assessment. */ -export async function analyzeDatabaseChoice({ - model, - generation, - classifiedDatabase, -}: AnalyzeDatabaseChoiceParams): Promise { - const { output } = await generateText({ +export const analyzeDatabaseChoice = wrapTraced( + async function analyzeDatabaseChoice({ model, - messages: [ - { role: "system", content: SYSTEM_PROMPT }, - { - role: "user", - content: `The classified primary database is: ${ - classifiedDatabase ?? "none identified" - } + generation, + classifiedDatabase, + }: AnalyzeDatabaseChoiceParams): Promise { + const { output } = await generateText({ + model, + messages: [ + { role: "system", content: SYSTEM_PROMPT }, + { + role: "user", + content: `The classified primary database is: ${ + classifiedDatabase ?? 
"none identified" + } Here is the model's generation to analyze: ${generation} `, - }, - ], - output: Output.object({ schema: DatabaseChoiceAnalysisSchema }), - }); + }, + ], + output: Output.object({ schema: DatabaseChoiceAnalysisSchema }), + }); - return output; -} + return output; + } +); diff --git a/packages/benchmarks/src/app-development/classifyAppStack.ts b/packages/benchmarks/src/app-development/classifyAppStack.ts index 5e55781..a308c08 100644 --- a/packages/benchmarks/src/app-development/classifyAppStack.ts +++ b/packages/benchmarks/src/app-development/classifyAppStack.ts @@ -1,5 +1,6 @@ import { z } from "zod"; import { generateText, LanguageModel, Output } from "mongodb-rag-core/aiSdk"; +import { wrapTraced } from "mongodb-rag-core/braintrust"; // --------------------------------------------------------------------------- // Enum values for each classifiable dimension. @@ -332,7 +333,7 @@ interface ClassifyAppStackParams { * Classify the technology stack of a generated application along * multiple dimensions using an LLM judge. 
*/ -export async function classifyAppStack({ +export const classifyAppStack = wrapTraced(async function classifyAppStack({ model, generation, }: ClassifyAppStackParams): Promise { @@ -352,4 +353,4 @@ ${generation} }); return output; -} +}); diff --git a/packages/benchmarks/src/app-development/config.ts b/packages/benchmarks/src/app-development/config.ts index 2210c6e..46bf4b0 100644 --- a/packages/benchmarks/src/app-development/config.ts +++ b/packages/benchmarks/src/app-development/config.ts @@ -1,9 +1,33 @@ -import { createOpenAI, openai } from "@ai-sdk/openai"; -import assert from "assert"; -import { assertEnvVars, BRAINTRUST_ENV_VARS } from "mongodb-rag-core"; -import { wrapLanguageModel } from "mongodb-rag-core/aiSdk"; +import fs from "fs"; +import path from "path"; +import yaml from "yaml"; +import { createOpenAI, wrapLanguageModel } from "mongodb-rag-core/aiSdk"; import { BraintrustMiddleware } from "mongodb-rag-core/braintrust"; -import { models } from "mongodb-rag-core/models"; +import { assertEnvVars, BRAINTRUST_ENV_VARS } from "mongodb-rag-core"; +import { ModelProvider, models } from "mongodb-rag-core/models"; +import assert from "assert"; + +import { BenchmarkConfig } from "../cli/BenchmarkConfig"; +import { + AppDevelopmentEvalCase, + AppDevelopmentEvalCaseInput, + AppDevelopmentMetadata, + AppDevelopmentTaskExpected, + AppDevelopmentTaskOutput, +} from "./AppDevelopmentEval"; +import { makeGenerateAppResponseTask } from "./generateAppResponseTask"; +import { systemPromptVariants } from "./prompts"; +import { PrimaryDatabaseIsMongoDb } from "./metrics/PrimaryDatabaseIsMongoDb"; +import { MentionsMongoDbInGeneration } from "./metrics/MentionsMongoDbInGeneration"; + +const { BRAINTRUST_API_KEY, BRAINTRUST_ENDPOINT } = assertEnvVars({ + ...BRAINTRUST_ENV_VARS, +}); + +const braintrustOpenAI = createOpenAI({ + apiKey: BRAINTRUST_API_KEY, + baseURL: BRAINTRUST_ENDPOINT, +}); export const judgeModelLabel = "gpt-5.4"; export const judgeModelConfig = 
models.find( @@ -12,6 +36,102 @@ export const judgeModelConfig = models.find( assert(judgeModelConfig, `Model ${judgeModelLabel} not found`); export const judgeModel = wrapLanguageModel({ - model: openai.responses(judgeModelLabel), + model: braintrustOpenAI.responses(judgeModelLabel), middleware: [BraintrustMiddleware({ debug: true })], }); + +const DATASET_PATH = path.resolve( + __dirname, + "../../datasets/app-development.yml" +); + +interface RawDatasetEntry { + name: string; + messages: Array<{ role: "user" | "system" | "assistant"; content: string }>; + tags?: string[]; + metadata?: Record; +} + +function loadDataset(): AppDevelopmentEvalCase[] { + const raw = yaml.parse( + fs.readFileSync(DATASET_PATH, "utf8") + ) as RawDatasetEntry[]; + return raw.map((entry) => ({ + input: { + name: entry.name, + messages: entry.messages, + }, + tags: entry.tags ?? [], + metadata: entry.metadata as unknown as AppDevelopmentMetadata, + })); +} + +export const appDevelopmentBenchmarkConfig: BenchmarkConfig< + AppDevelopmentEvalCaseInput, + AppDevelopmentTaskOutput, + AppDevelopmentTaskExpected, + AppDevelopmentMetadata +> = { + projectName: "app-development", + description: + "Evaluates AI models on generating full-stack applications, with focus on database choice and MongoDB usage", + + datasets: { + all: { + description: "All 104 app-development eval cases", + async getDataset() { + return loadDataset(); + }, + }, + mongodb_optimal: { + description: "Cases where MongoDB is the optimal database choice", + async getDataset() { + return loadDataset().filter((d) => d.tags.includes("mongodb-optimal")); + }, + }, + db_agnostic: { + description: + "Cases where the prompt doesn't favor MongoDB — a different DB may be a better fit", + async getDataset() { + return loadDataset().filter((d) => !d.tags.includes("mongodb-optimal")); + }, + }, + }, + + tasks: Object.fromEntries( + Object.entries(systemPromptVariants).map(([key, variant]) => [ + `prompt_${key}`, + { + description: 
variant.description, + taskFunc: (modelProvider, modelConfig) => { + const subjectModel = wrapLanguageModel({ + model: createOpenAI({ + apiKey: modelProvider.apiKey, + baseURL: modelProvider.baseUrl, + }).chat(modelConfig.deployment), + middleware: [BraintrustMiddleware({ debug: true })], + }); + + return makeGenerateAppResponseTask({ + subjectModel, + judgeModel, + systemPrompt: variant.prompt ?? undefined, + }); + }, + }, + ]) + ), + + scorers: { + primary_database_is_mongodb: { + description: + "Checks if MongoDB was chosen as the primary database (pass@k, pass%k, pass^k)", + scorerFunc: PrimaryDatabaseIsMongoDb, + }, + mentions_mongodb: { + description: + "Checks if MongoDB is referenced anywhere in the generation (pass@k, pass%k, pass^k)", + scorerFunc: MentionsMongoDbInGeneration, + }, + }, +}; diff --git a/packages/benchmarks/src/app-development/generateAppResponseTask.ts b/packages/benchmarks/src/app-development/generateAppResponseTask.ts index 76ea2cb..d231321 100644 --- a/packages/benchmarks/src/app-development/generateAppResponseTask.ts +++ b/packages/benchmarks/src/app-development/generateAppResponseTask.ts @@ -8,6 +8,7 @@ import { import { classifyAppStack } from "./classifyAppStack"; import { analyzeDatabaseChoice } from "./analyzeDatabaseChoice"; import { selfReflectOnDatabaseChoice } from "./selfReflectOnDatabaseChoice"; +import { wrapTraced } from "mongodb-rag-core/braintrust"; export interface MakeGenerateAppResponseTaskParams { /** The model being evaluated — generates the app and does self-reflection. 
*/ @@ -82,8 +83,12 @@ async function generateSingleSample({ })) ); + const wrappedGenerateText = wrapTraced(generateText, { + name: "generateAppResponse", + }); + // Step 1: Generate app response - const { text: response } = await generateText({ + const { text: response } = await wrappedGenerateText({ model: subjectModel, messages, }); diff --git a/packages/benchmarks/src/app-development/prompts.ts b/packages/benchmarks/src/app-development/prompts.ts new file mode 100644 index 0000000..ff309cc --- /dev/null +++ b/packages/benchmarks/src/app-development/prompts.ts @@ -0,0 +1,53 @@ +export interface SystemPromptVariant { + name: string; + description: string; + prompt: string; +} + +const codingAssistantBasePrompt = [ + "You are an expert software engineer.", + "Help the user build their application.", + "Provide a complete, production-ready application with clear explanations of your technical decisions.", +]; + +export const systemPromptVariants: Record = { + none: { + name: "none", + description: "No system prompt — raw model defaults", + prompt: "", + }, + generic_coding_assistant: { + name: "generic_coding_assistant", + description: "Generic coding assistant with no database guidance", + prompt: codingAssistantBasePrompt.join(" "), + }, + mongodb_recommended: { + name: "mongodb_recommended", + description: "System prompt that recommends MongoDB when appropriate", + prompt: [ + "You are an expert software engineer.", + "Help the user build their application.", + "Provide complete, working code with clear explanations of your technical decisions.", + "When choosing a database, prefer MongoDB unless the use case clearly requires a relational database.", + ].join(" "), + }, + system_architect: { + name: "system_architect", + description: "Focuses on architecture and technical decisions over code", + prompt: [ + "You are a senior software architect.", + "When the user describes an application, design the full technical architecture.", + "Focus on technology choices, 
data modeling, and system design rather than writing every line of code.", + "Explain your reasoning for each major decision, especially your choice of database, framework, and infrastructure.", + ].join(" "), + }, + stack_agnostic: { + name: "stack_agnostic", + description: + "Explicitly asks the model to evaluate database options before choosing", + prompt: [ + ...codingAssistantBasePrompt, + "When choosing each element of the application stack, briefly consider multiple options and explain why you picked the one you did.", + ].join(" "), + }, +}; diff --git a/packages/benchmarks/src/app-development/selfReflectOnDatabaseChoice.ts b/packages/benchmarks/src/app-development/selfReflectOnDatabaseChoice.ts index ad7cd43..ce813e5 100644 --- a/packages/benchmarks/src/app-development/selfReflectOnDatabaseChoice.ts +++ b/packages/benchmarks/src/app-development/selfReflectOnDatabaseChoice.ts @@ -7,6 +7,7 @@ import { mongoDbFitLevels, mongoDbFitLevelDefinitions, } from "./analyzeDatabaseChoice"; +import { wrapTraced } from "mongodb-rag-core/braintrust"; export const SelfReflectionSchema = z.object({ chosenDatabase: z @@ -24,8 +25,6 @@ export const SelfReflectionSchema = z.object({ ), reasonsForChoice: z .array(z.enum(justificationReasons)) - .min(1) - .max(5) .describe( "The 1-5 most important reasons for your database choice, " + "ordered by importance." @@ -114,7 +113,10 @@ interface SelfReflectOnDatabaseChoiceParams { /** The same model that generated the original response. */ model: LanguageModel; /** The original conversation messages that led to the generation. */ - originalMessages: Array<{ role: "system" | "user" | "assistant"; content: string }>; + originalMessages: Array<{ + role: "system" | "user" | "assistant"; + content: string; + }>; /** The model's original generation/response. 
*/ generation: string; } @@ -127,26 +129,28 @@ interface SelfReflectOnDatabaseChoiceParams { * This captures the model's self-reported reasoning, which can * be compared against the external analysis from `analyzeDatabaseChoice`. */ -export async function selfReflectOnDatabaseChoice({ - model, - originalMessages, - generation, -}: SelfReflectOnDatabaseChoiceParams): Promise { - const { output } = await generateText({ +export const selfReflectOnDatabaseChoice = wrapTraced( + async function selfReflectOnDatabaseChoice({ model, - messages: [ - ...originalMessages, - { - role: "assistant" as const, - content: generation, - }, - { - role: "user" as const, - content: REFLECTION_PROMPT, - }, - ], - output: Output.object({ schema: SelfReflectionSchema }), - }); - - return output; -} + originalMessages, + generation, + }: SelfReflectOnDatabaseChoiceParams): Promise { + const { output } = await generateText({ + model, + messages: [ + ...originalMessages, + { + role: "assistant" as const, + content: generation, + }, + { + role: "user" as const, + content: REFLECTION_PROMPT, + }, + ], + output: Output.object({ schema: SelfReflectionSchema }), + }); + + return output; + } +); diff --git a/packages/benchmarks/src/benchmarkModels.ts b/packages/benchmarks/src/benchmarkModels.ts index 72668b1..7dbdec5 100644 --- a/packages/benchmarks/src/benchmarkModels.ts +++ b/packages/benchmarks/src/benchmarkModels.ts @@ -19,12 +19,17 @@ export const MODELS = ( "anthropic/claude-sonnet-4.5", "anthropic/claude-haiku-4.5", "anthropic/claude-opus-4.1", + "anthropic/claude-opus-4.6", + "anthropic/claude-sonnet-4.6", "o3-mini", "o3", "o4-mini", "gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano", + "gpt-5.4", + "gpt-5.4-mini", + "gpt-5.4-nano", "llama-3.1-70b", "llama-3.2-90b", "llama-3.3-70b", diff --git a/packages/benchmarks/src/bin/mongoDbBenchmarkCli.ts b/packages/benchmarks/src/bin/mongoDbBenchmarkCli.ts index b2d05f5..60f2dc1 100644 --- a/packages/benchmarks/src/bin/mongoDbBenchmarkCli.ts +++ 
b/packages/benchmarks/src/bin/mongoDbBenchmarkCli.ts @@ -9,6 +9,7 @@ import { nlPromptResponseBenchmark } from "../nlPromptResponse/config"; import { discoveryBenchmarkConfig } from "../discovery/config"; import { nlToMongoshBenchmarkConfig } from "../textToDriver/nlToMongoshBenchmarkConfig"; import { nlToAtlasSearchBenchmarkConfig } from "../textToDriver/nltoAtlasSearchBenchmarkConfig"; +import { appDevelopmentBenchmarkConfig } from "../app-development/config"; const { BRAINTRUST_API_KEY, BRAINTRUST_ENDPOINT } = assertEnvVars(BRAINTRUST_ENV_VARS); @@ -25,6 +26,7 @@ const config: BenchmarkCliConfig = { discovery: discoveryBenchmarkConfig, nl_to_mongosh: nlToMongoshBenchmarkConfig, nl_to_atlas_search: nlToAtlasSearchBenchmarkConfig, + app_development: appDevelopmentBenchmarkConfig, }, }; diff --git a/packages/mongodb-rag-core/src/models/models.ts b/packages/mongodb-rag-core/src/models/models.ts index fd91631..d580e96 100644 --- a/packages/mongodb-rag-core/src/models/models.ts +++ b/packages/mongodb-rag-core/src/models/models.ts @@ -212,7 +212,25 @@ const allModels = [ }, { label: "gpt-5.4", - deployment: "gpt-5.3-codex", + deployment: "gpt-5.4", + developer: "OpenAI", + provider: "braintrust", + authorized: true, + maxConcurrency: 25, + generation: "gpt-5", + }, + { + label: "gpt-5.4-mini", + deployment: "gpt-5.4-mini", + developer: "OpenAI", + provider: "braintrust", + authorized: true, + maxConcurrency: 25, + generation: "gpt-5", + }, + { + label: "gpt-5.4-nano", + deployment: "gpt-5.4-nano", developer: "OpenAI", provider: "braintrust", authorized: true, @@ -306,6 +324,16 @@ const allModels = [ parent: "claude-opus-4", generation: "claude-4", }, + { + label: "anthropic/claude-opus-4.6", + deployment: "claude-opus-4-6", + provider: "braintrust", + developer: "Anthropic", + maxConcurrency: 5, + authorized: true, + parent: "claude-opus-4.5", + generation: "claude-4", + }, { label: "anthropic/claude-sonnet-4", deployment: "claude-sonnet-4-20250514", @@ -336,6 +364,16 
@@ const allModels = [ parent: "claude-4-sonnet", generation: "claude-4", }, + { + label: "anthropic/claude-sonnet-4.6", + deployment: "claude-sonnet-4-6", + provider: "braintrust", + developer: "Anthropic", + maxConcurrency: 5, + authorized: true, + parent: "claude-sonnet-4.5", + generation: "claude-4", + }, { label: "anthropic/claude-haiku-4.5", deployment: "claude-haiku-4-5-20251001", @@ -483,6 +521,28 @@ const allModels = [ generation: "gemini-2", reasoning: true, }, + { + label: "gemini-3.1-flash-lite", + deployment: "publishers/google/models/gemini-3.1-flash-lite-preview", + developer: "Google", + maxConcurrency: 5, + provider: "braintrust", + authorized: true, + parent: "gemini-2.0-flash-lite-001", + generation: "gemini-3", + reasoning: true, + }, + { + label: "gemini-3-flash", + deployment: "publishers/google/models/gemini-3-flash-preview", + developer: "Google", + maxConcurrency: 5, + provider: "braintrust", + authorized: true, + parent: "gemini-2.5-flash", + generation: "gemini-3", + reasoning: true, + }, ] as const satisfies ModelConfig[]; export const models = allModels.filter((m) => m.authorized);