Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 22 additions & 21 deletions packages/benchmarks/src/app-development/analyzeDatabaseChoice.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import { z } from "zod";
import { generateText, LanguageModel, Output } from "mongodb-rag-core/aiSdk";
import { primaryDatabases, PrimaryDatabase } from "./classifyAppStack";
import { wrapTraced } from "mongodb-rag-core/braintrust";

interface JustificationReasonDefinition {
name: string;
Expand Down Expand Up @@ -155,8 +156,6 @@ export const DatabaseChoiceAnalysisSchema = z.object({
),
mainJustifications: z
.array(z.enum(justificationReasons))
.min(1)
.max(5)
.describe(
"The 1-5 most important reasons why the model chose or did not choose MongoDB, " +
"ordered by importance. The first element is the primary driver. " +
Expand Down Expand Up @@ -236,30 +235,32 @@ interface AnalyzeDatabaseChoiceParams {
* Uses an LLM judge to produce structured analysis including
* justification reasons, alternatives considered, and a fit assessment.
*/
export async function analyzeDatabaseChoice({
model,
generation,
classifiedDatabase,
}: AnalyzeDatabaseChoiceParams): Promise<DatabaseChoiceAnalysis> {
const { output } = await generateText({
export const analyzeDatabaseChoice = wrapTraced(
async function analyzeDatabaseChoice({
model,
messages: [
{ role: "system", content: SYSTEM_PROMPT },
{
role: "user",
content: `The classified primary database is: ${
classifiedDatabase ?? "none identified"
}
generation,
classifiedDatabase,
}: AnalyzeDatabaseChoiceParams): Promise<DatabaseChoiceAnalysis> {
const { output } = await generateText({
model,
messages: [
{ role: "system", content: SYSTEM_PROMPT },
{
role: "user",
content: `The classified primary database is: ${
classifiedDatabase ?? "none identified"
}

Here is the model's generation to analyze:

<generation>
${generation}
</generation>`,
},
],
output: Output.object({ schema: DatabaseChoiceAnalysisSchema }),
});
},
],
output: Output.object({ schema: DatabaseChoiceAnalysisSchema }),
});

return output;
}
return output;
}
);
5 changes: 3 additions & 2 deletions packages/benchmarks/src/app-development/classifyAppStack.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { z } from "zod";
import { generateText, LanguageModel, Output } from "mongodb-rag-core/aiSdk";
import { wrapTraced } from "mongodb-rag-core/braintrust";

// ---------------------------------------------------------------------------
// Enum values for each classifiable dimension.
Expand Down Expand Up @@ -332,7 +333,7 @@ interface ClassifyAppStackParams {
* Classify the technology stack of a generated application along
* multiple dimensions using an LLM judge.
*/
export async function classifyAppStack({
export const classifyAppStack = wrapTraced(async function classifyAppStack({
model,
generation,
}: ClassifyAppStackParams): Promise<AppStackClassification> {
Expand All @@ -352,4 +353,4 @@ ${generation}
});

return output;
}
});
132 changes: 126 additions & 6 deletions packages/benchmarks/src/app-development/config.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,33 @@
import { createOpenAI, openai } from "@ai-sdk/openai";
import assert from "assert";
import { assertEnvVars, BRAINTRUST_ENV_VARS } from "mongodb-rag-core";
import { wrapLanguageModel } from "mongodb-rag-core/aiSdk";
import fs from "fs";
import path from "path";
import yaml from "yaml";
import { createOpenAI, wrapLanguageModel } from "mongodb-rag-core/aiSdk";
import { BraintrustMiddleware } from "mongodb-rag-core/braintrust";
import { models } from "mongodb-rag-core/models";
import { assertEnvVars, BRAINTRUST_ENV_VARS } from "mongodb-rag-core";
import { ModelProvider, models } from "mongodb-rag-core/models";
import assert from "assert";

import { BenchmarkConfig } from "../cli/BenchmarkConfig";
import {
AppDevelopmentEvalCase,
AppDevelopmentEvalCaseInput,
AppDevelopmentMetadata,
AppDevelopmentTaskExpected,
AppDevelopmentTaskOutput,
} from "./AppDevelopmentEval";
import { makeGenerateAppResponseTask } from "./generateAppResponseTask";
import { systemPromptVariants } from "./prompts";
import { PrimaryDatabaseIsMongoDb } from "./metrics/PrimaryDatabaseIsMongoDb";
import { MentionsMongoDbInGeneration } from "./metrics/MentionsMongoDbInGeneration";

// Required Braintrust credentials — assertEnvVars throws at module load if
// any are missing, so misconfiguration fails fast rather than mid-eval.
const { BRAINTRUST_API_KEY, BRAINTRUST_ENDPOINT } = assertEnvVars({
  ...BRAINTRUST_ENV_VARS,
});

// OpenAI-compatible client pointed at the Braintrust endpoint — presumably
// so judge-model calls are proxied/logged through Braintrust; confirm the
// endpoint is the AI proxy and not the plain API base.
const braintrustOpenAI = createOpenAI({
  apiKey: BRAINTRUST_API_KEY,
  baseURL: BRAINTRUST_ENDPOINT,
});

export const judgeModelLabel = "gpt-5.4";
export const judgeModelConfig = models.find(
Expand All @@ -12,6 +36,102 @@ export const judgeModelConfig = models.find(
assert(judgeModelConfig, `Model ${judgeModelLabel} not found`);

export const judgeModel = wrapLanguageModel({
model: openai.responses(judgeModelLabel),
model: braintrustOpenAI.responses(judgeModelLabel),
middleware: [BraintrustMiddleware({ debug: true })],
});

// Absolute path to the YAML dataset, resolved relative to this module so it
// works regardless of the process's working directory.
const DATASET_PATH = path.resolve(
  __dirname,
  "../../datasets/app-development.yml"
);

/**
 * Shape of one entry in the app-development YAML dataset file.
 * Parsed from untyped YAML, so conformance is by convention only.
 */
interface RawDatasetEntry {
  /** Unique, human-readable case name. */
  name: string;
  /** Conversation seed for the eval case; roles mirror chat-completion roles. */
  messages: Array<{ role: "user" | "system" | "assistant"; content: string }>;
  /** Optional labels used to slice the dataset (e.g. "mongodb-optimal"). */
  tags?: string[];
  // Free-form metadata; cast to AppDevelopmentMetadata downstream —
  // NOTE(review): confirm dataset entries actually match that shape.
  metadata?: Record<string, unknown>;
}

/**
 * Read the app-development dataset from disk and map each raw YAML entry
 * into an eval case. Synchronous file I/O — runs when a dataset is requested.
 */
function loadDataset(): AppDevelopmentEvalCase[] {
  const fileContents = fs.readFileSync(DATASET_PATH, "utf8");
  const entries = yaml.parse(fileContents) as RawDatasetEntry[];
  return entries.map((entry) => {
    const evalCase: AppDevelopmentEvalCase = {
      input: {
        name: entry.name,
        messages: entry.messages,
      },
      tags: entry.tags ?? [],
      // YAML metadata is untyped, hence the double assertion —
      // NOTE(review): confirm entries conform to AppDevelopmentMetadata.
      metadata: entry.metadata as unknown as AppDevelopmentMetadata,
    };
    return evalCase;
  });
}

/**
 * Benchmark configuration for the app-development suite.
 *
 * Wires together the YAML-backed dataset, one generation task per system
 * prompt variant, and the MongoDB-focused scorers.
 */
export const appDevelopmentBenchmarkConfig: BenchmarkConfig<
  AppDevelopmentEvalCaseInput,
  AppDevelopmentTaskOutput,
  AppDevelopmentTaskExpected,
  AppDevelopmentMetadata
> = {
  projectName: "app-development",
  description:
    "Evaluates AI models on generating full-stack applications, with focus on database choice and MongoDB usage",

  datasets: {
    all: {
      // NOTE(review): the "104" count is hard-coded and will drift if the
      // YAML file changes — consider deriving it or dropping the number.
      description: "All 104 app-development eval cases",
      async getDataset() {
        return loadDataset();
      },
    },
    mongodb_optimal: {
      description: "Cases where MongoDB is the optimal database choice",
      async getDataset() {
        return loadDataset().filter((d) => d.tags.includes("mongodb-optimal"));
      },
    },
    db_agnostic: {
      description:
        "Cases where the prompt doesn't favor MongoDB — a different DB may be a better fit",
      async getDataset() {
        return loadDataset().filter((d) => !d.tags.includes("mongodb-optimal"));
      },
    },
  },

  // One task per system prompt variant, keyed as `prompt_<variantKey>`.
  tasks: Object.fromEntries(
    Object.entries(systemPromptVariants).map(([key, variant]) => [
      `prompt_${key}`,
      {
        description: variant.description,
        taskFunc: (modelProvider, modelConfig) => {
          // The model under evaluation, traced via Braintrust middleware.
          const subjectModel = wrapLanguageModel({
            model: createOpenAI({
              apiKey: modelProvider.apiKey,
              baseURL: modelProvider.baseUrl,
            }).chat(modelConfig.deployment),
            middleware: [BraintrustMiddleware({ debug: true })],
          });

          return makeGenerateAppResponseTask({
            subjectModel,
            judgeModel,
            // Bug fix: previously `variant.prompt ?? undefined`, which is a
            // no-op since `prompt` is a non-nullable string — the "none"
            // variant's empty string was still sent as a system prompt.
            // `||` maps "" to undefined, matching the documented
            // "no system prompt — raw model defaults" behavior.
            systemPrompt: variant.prompt || undefined,
          });
        },
      },
    ])
  ),

  scorers: {
    primary_database_is_mongodb: {
      description:
        "Checks if MongoDB was chosen as the primary database (pass@k, pass%k, pass^k)",
      scorerFunc: PrimaryDatabaseIsMongoDb,
    },
    mentions_mongodb: {
      description:
        "Checks if MongoDB is referenced anywhere in the generation (pass@k, pass%k, pass^k)",
      scorerFunc: MentionsMongoDbInGeneration,
    },
  },
};
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import {
import { classifyAppStack } from "./classifyAppStack";
import { analyzeDatabaseChoice } from "./analyzeDatabaseChoice";
import { selfReflectOnDatabaseChoice } from "./selfReflectOnDatabaseChoice";
import { wrapTraced } from "mongodb-rag-core/braintrust";

export interface MakeGenerateAppResponseTaskParams {
/** The model being evaluated — generates the app and does self-reflection. */
Expand Down Expand Up @@ -82,8 +83,12 @@ async function generateSingleSample({
}))
);

const wrappedGenerateText = wrapTraced(generateText, {
name: "generateAppResponse",
});

// Step 1: Generate app response
const { text: response } = await generateText({
const { text: response } = await wrappedGenerateText({
model: subjectModel,
messages,
});
Expand Down
53 changes: 53 additions & 0 deletions packages/benchmarks/src/app-development/prompts.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/**
 * A named system prompt configuration used to measure how prompt framing
 * affects the model's technology and database choices.
 */
export interface SystemPromptVariant {
  /** Stable identifier; mirrors the key in `systemPromptVariants`. */
  name: string;
  /** Human-readable summary surfaced in benchmark task descriptions. */
  description: string;
  /** The system prompt text; empty string means no system prompt. */
  prompt: string;
}

// Shared sentences for generic coding-assistant prompts; joined with spaces
// when building a variant's `prompt` string.
const codingAssistantBasePrompt = [
  "You are an expert software engineer.",
  "Help the user build their application.",
  "Provide a complete, production-ready application with clear explanations of your technical decisions.",
];

/**
 * The set of system prompt variants benchmarked against each other, ranging
 * from no prompt at all to explicit MongoDB steering.
 */
export const systemPromptVariants: Record<string, SystemPromptVariant> = {
  none: {
    name: "none",
    description: "No system prompt — raw model defaults",
    prompt: "",
  },
  generic_coding_assistant: {
    name: "generic_coding_assistant",
    description: "Generic coding assistant with no database guidance",
    prompt: codingAssistantBasePrompt.join(" "),
  },
  mongodb_recommended: {
    name: "mongodb_recommended",
    description: "System prompt that recommends MongoDB when appropriate",
    prompt: [
      "You are an expert software engineer.",
      "Help the user build their application.",
      "Provide complete, working code with clear explanations of your technical decisions.",
      "When choosing a database, prefer MongoDB unless the use case clearly requires a relational database.",
    ].join(" "),
  },
  system_architect: {
    name: "system_architect",
    description: "Focuses on architecture and technical decisions over code",
    prompt: [
      "You are a senior software architect.",
      "When the user describes an application, design the full technical architecture.",
      "Focus on technology choices, data modeling, and system design rather than writing every line of code.",
      "Explain your reasoning for each major decision, especially your choice of database, framework, and infrastructure.",
    ].join(" "),
  },
  stack_agnostic: {
    name: "stack_agnostic",
    description:
      "Explicitly asks the model to evaluate database options before choosing",
    prompt: [
      ...codingAssistantBasePrompt,
      // Typo fix: original read "When choosing a each element".
      "When choosing each element of the application stack, briefly consider multiple options and explain why you picked the one you did.",
    ].join(" "),
  },
};
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import {
mongoDbFitLevels,
mongoDbFitLevelDefinitions,
} from "./analyzeDatabaseChoice";
import { wrapTraced } from "mongodb-rag-core/braintrust";

export const SelfReflectionSchema = z.object({
chosenDatabase: z
Expand All @@ -24,8 +25,6 @@ export const SelfReflectionSchema = z.object({
),
reasonsForChoice: z
.array(z.enum(justificationReasons))
.min(1)
.max(5)
.describe(
"The 1-5 most important reasons for your database choice, " +
"ordered by importance."
Expand Down Expand Up @@ -114,7 +113,10 @@ interface SelfReflectOnDatabaseChoiceParams {
/** The same model that generated the original response. */
model: LanguageModel;
/** The original conversation messages that led to the generation. */
originalMessages: Array<{ role: "system" | "user" | "assistant"; content: string }>;
originalMessages: Array<{
role: "system" | "user" | "assistant";
content: string;
}>;
/** The model's original generation/response. */
generation: string;
}
Expand All @@ -127,26 +129,28 @@ interface SelfReflectOnDatabaseChoiceParams {
* This captures the model's self-reported reasoning, which can
* be compared against the external analysis from `analyzeDatabaseChoice`.
*/
export async function selfReflectOnDatabaseChoice({
model,
originalMessages,
generation,
}: SelfReflectOnDatabaseChoiceParams): Promise<SelfReflection> {
const { output } = await generateText({
export const selfReflectOnDatabaseChoice = wrapTraced(
async function selfReflectOnDatabaseChoice({
model,
messages: [
...originalMessages,
{
role: "assistant" as const,
content: generation,
},
{
role: "user" as const,
content: REFLECTION_PROMPT,
},
],
output: Output.object({ schema: SelfReflectionSchema }),
});

return output;
}
originalMessages,
generation,
}: SelfReflectOnDatabaseChoiceParams): Promise<SelfReflection> {
const { output } = await generateText({
model,
messages: [
...originalMessages,
{
role: "assistant" as const,
content: generation,
},
{
role: "user" as const,
content: REFLECTION_PROMPT,
},
],
output: Output.object({ schema: SelfReflectionSchema }),
});

return output;
}
);
Loading
Loading