From f1ffc82122ac471a9835dbe56a17b3055193ac71 Mon Sep 17 00:00:00 2001 From: Alla Yakubova Date: Tue, 19 Aug 2025 10:38:37 -0400 Subject: [PATCH 1/4] process answer options through F, handle multiple correct --- .../bin/loadBadgeQuizQuestions.ts | 28 ++++++++++++------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/packages/benchmarks/src/quizQuestions/bin/loadBadgeQuizQuestions.ts b/packages/benchmarks/src/quizQuestions/bin/loadBadgeQuizQuestions.ts index e9586564f..e7d0509b9 100644 --- a/packages/benchmarks/src/quizQuestions/bin/loadBadgeQuizQuestions.ts +++ b/packages/benchmarks/src/quizQuestions/bin/loadBadgeQuizQuestions.ts @@ -2,12 +2,26 @@ import fs from "fs"; import path from "path"; import csv from "csv-parser"; import { QuizQuestionData, QuizQuestionDataSchema } from "../QuizQuestionData"; -import { makeTags } from "./makeTags"; const testDataPath = path.resolve(__dirname, "..", "..", "..", "testData"); const csvFileInPath = path.resolve(testDataPath, "badge-questions.csv"); const jsonFileOutPath = path.resolve(testDataPath, "badge-questions.json"); +const handleAnswers = (row: any) => { + const correctAnswers = row.Answer.trim()?.split("") || []; + const answers = ["A", "B", "C", "D", "E", "F"] + .map((label) => { + const isCorrect = correctAnswers.includes(label); + return { + answer: row[label], + isCorrect, + label, + }; + }) + .filter((answer) => answer.answer && answer.answer.trim() !== ""); // Remove empty answers + return answers; +}; + const parseCSV = async (filePath: string): Promise => { return new Promise((resolve, reject) => { const results: QuizQuestionData[] = []; @@ -15,22 +29,16 @@ const parseCSV = async (filePath: string): Promise => { .pipe(csv()) .on("data", (row) => { try { - const answers = ["A", "B", "C", "D"].map((label, index) => ({ - answer: row[label], - isCorrect: row.Answer === (index + 1).toString(), - label, - })); - + const answers = handleAnswers(row); const questionData: QuizQuestionData = QuizQuestionDataSchema.parse({ questionText: row["Question Text"], title: row["Assessment"], topicType: "badge", // Defaulting topic type - questionType: "singleCorrect", // Assuming single correct answer + questionType: + row["Answer"].length > 1 ? "multipleCorrect" : "singleCorrect", answers, - explanation: row["Reference"], tags: row["tags"] ? row["tags"].split(",") : [], }); - questionData.tags = makeTags(questionData); results.push(questionData); } catch (error) { console.error("Validation error:", error); From 27f65ee39724ebb78887c554c279a7a8983cbdb0 Mon Sep 17 00:00:00 2001 From: Alla Yakubova Date: Fri, 29 Aug 2025 15:25:16 -0400 Subject: [PATCH 2/4] wip --- .../bin/loadBadgeQuizQuestions.ts | 51 ++++++++++++++++++- 1 file changed, 49 insertions(+), 2 deletions(-) diff --git a/packages/benchmarks/src/quizQuestions/bin/loadBadgeQuizQuestions.ts b/packages/benchmarks/src/quizQuestions/bin/loadBadgeQuizQuestions.ts index e7d0509b9..2702d172d 100644 --- a/packages/benchmarks/src/quizQuestions/bin/loadBadgeQuizQuestions.ts +++ b/packages/benchmarks/src/quizQuestions/bin/loadBadgeQuizQuestions.ts @@ -22,24 +22,71 @@ const handleAnswers = (row: any) => { return answers; }; +// const createTagsFromAssessmentName = (assessmentName: string) => { +// return assessmentName.split(",").map((tag) => tag.trim()); +// }; + +const assessmentNameToTagsMap = { + 'MongoDB Aggregation Fundamentals': ['aggregation'], + 'MongoDB Query Optimization Techniques': ['query'], + "From Relational Model (SQL) to MongoDB's Document Model": ['data modeling'], + 'MongoDB Schema Design Patterns and Antipatterns': ['data modeling'], + 'MongoDB Advanced Schema Design Patterns and Antipatterns': ['data modeling'], + 'MongoDB Schema Design Optimization': ['data modeling'], + 'Building AI Agents with MongoDB': ['gen-ai'], + 'Building AI-Powered Search with MongoDB Vector Search': ['gen-ai'], + 'Building RAG Apps Using MongoDB': ['gen-ai'], + 'MongoDB Indexing Design Fundamentals': ['indexing'], + 'Monitoring MongoDB with Built-in Tools': ['monitoring tuning and automation'], + 'Optimizing MongoDB Performance with Tuning Tools': ['monitoring tuning and automation'], + 'CRUD Operations in MongoDB': ['query'], + 'Search with MongoDB': ['search'], + 'Securing MongoDB Atlas: Authentication & Authorization': ['security'], + 'Securing MongoDB Self-Managed: Authentication & Authorization': ['security'], + 'MongoDB Sharding Strategies': ['sharding'], + 'Optimizing and Maintaining MongoDB Cluster Reliability': ['performance at scale'], +}; + +// excluded: +// 'MongoDB Overview: Core Concepts and Architecture' + const parseCSV = async (filePath: string): Promise => { return new Promise((resolve, reject) => { const results: QuizQuestionData[] = []; + const assessments = new Set(); fs.createReadStream(filePath) .pipe(csv()) .on("data", (row) => { + // console.log("HIT TRY"); try { + const assessmentName = row["Assessment"]?.trim(); + if (!assessmentName) { + console.warn("Skipping row with missing assessment name"); + return; + } + + // Type guard to ensure assessmentName is a valid key + if (assessmentName in assessmentNameToTagsMap) { + console.log('>> tags', assessmentNameToTagsMap[assessmentName as keyof typeof assessmentNameToTagsMap]); + } else { + console.warn(`Assessment name not found in map: "${assessmentName}"`); + } + const answers = handleAnswers(row); const questionData: QuizQuestionData = QuizQuestionDataSchema.parse({ questionText: row["Question Text"], - title: row["Assessment"], + title: assessmentName, topicType: "badge", // Defaulting topic type questionType: row["Answer"].length > 1 ? "multipleCorrect" : "singleCorrect", answers, - tags: row["tags"] ? row["tags"].split(",") : [], + // tags: row["tags"] ? row["tags"].split(",") : [], + tags: assessmentName in assessmentNameToTagsMap + ? assessmentNameToTagsMap[assessmentName as keyof typeof assessmentNameToTagsMap] + : [], }); results.push(questionData); + // console.log(">>>> assessments >>>>", assessments); } catch (error) { console.error("Validation error:", error); } From 3712018f81c741ad06a25c157fbd245a66ebe069 Mon Sep 17 00:00:00 2001 From: Alla Yakubova Date: Sun, 7 Sep 2025 19:59:37 -0400 Subject: [PATCH 3/4] run reports --- .../src/quizQuestions/QuizQuestionEval.ts | 2 +- .../bin/loadBadgeQuizQuestions.ts | 20 +++--- .../src/quizQuestions/bin/makeTags.ts | 5 +- ...reportMongoDbUniversityAllQuizQuestions.ts | 46 ++++++++++-- ...MongoDbUniversityBadgeQuestionBenchmark.ts | 72 ++++++++++--------- 5 files changed, 89 insertions(+), 56 deletions(-) diff --git a/packages/benchmarks/src/quizQuestions/QuizQuestionEval.ts b/packages/benchmarks/src/quizQuestions/QuizQuestionEval.ts index 0b4733ded..44021bbdb 100644 --- a/packages/benchmarks/src/quizQuestions/QuizQuestionEval.ts +++ b/packages/benchmarks/src/quizQuestions/QuizQuestionEval.ts @@ -139,7 +139,7 @@ export function runQuizQuestionEval({ reasoning_enabled: modelId.includes("gemini-2.5") ? true : undefined, reasoning_budget: modelId.includes("gemini-2.5") ? 1024 : undefined, }; - + console.log("reasoningOptions", reasoningOptions); llmOptions = { ...reasoningOptions, ...(llmOptions ?? {}), diff --git a/packages/benchmarks/src/quizQuestions/bin/loadBadgeQuizQuestions.ts b/packages/benchmarks/src/quizQuestions/bin/loadBadgeQuizQuestions.ts index 2702d172d..4eb172cbf 100644 --- a/packages/benchmarks/src/quizQuestions/bin/loadBadgeQuizQuestions.ts +++ b/packages/benchmarks/src/quizQuestions/bin/loadBadgeQuizQuestions.ts @@ -29,22 +29,22 @@ const handleAnswers = (row: any) => { const assessmentNameToTagsMap = { 'MongoDB Aggregation Fundamentals': ['aggregation'], 'MongoDB Query Optimization Techniques': ['query'], - "From Relational Model (SQL) to MongoDB's Document Model": ['data modeling'], - 'MongoDB Schema Design Patterns and Antipatterns': ['data modeling'], - 'MongoDB Advanced Schema Design Patterns and Antipatterns': ['data modeling'], - 'MongoDB Schema Design Optimization': ['data modeling'], - 'Building AI Agents with MongoDB': ['gen-ai'], - 'Building AI-Powered Search with MongoDB Vector Search': ['gen-ai'], - 'Building RAG Apps Using MongoDB': ['gen-ai'], + "From Relational Model (SQL) to MongoDB's Document Model": ['data_modeling'], + 'MongoDB Schema Design Patterns and Antipatterns': ['data_modeling'], + 'MongoDB Advanced Schema Design Patterns and Antipatterns': ['data_modeling'], + 'MongoDB Schema Design Optimization': ['data_modeling'], + 'Building AI Agents with MongoDB': ['gen_ai'], + 'Building AI-Powered Search with MongoDB Vector Search': ['gen_ai'], + 'Building RAG Apps Using MongoDB': ['gen_ai'], 'MongoDB Indexing Design Fundamentals': ['indexing'], - 'Monitoring MongoDB with Built-in Tools': ['monitoring tuning and automation'], - 'Optimizing MongoDB Performance with Tuning Tools': ['monitoring tuning and automation'], + 'Monitoring MongoDB with Built-in Tools': ['monitoring_tuning_and_automation'], + 'Optimizing MongoDB Performance with Tuning Tools': ['monitoring_tuning_and_automation'], 'CRUD Operations in MongoDB': ['query'], 'Search with MongoDB': ['search'], 'Securing MongoDB Atlas: Authentication & Authorization': ['security'], 'Securing MongoDB Self-Managed: Authentication & Authorization': ['security'], 'MongoDB Sharding Strategies': ['sharding'], - 'Optimizing and Maintaining MongoDB Cluster Reliability': ['performance at scale'], + 'Optimizing and Maintaining MongoDB Cluster Reliability': ['performance_at_scale'], }; // excluded: diff --git a/packages/benchmarks/src/quizQuestions/bin/makeTags.ts b/packages/benchmarks/src/quizQuestions/bin/makeTags.ts index 8f42529fc..7b30345c1 100644 --- a/packages/benchmarks/src/quizQuestions/bin/makeTags.ts +++ b/packages/benchmarks/src/quizQuestions/bin/makeTags.ts @@ -1,9 +1,6 @@ -import mongoDbMetadata from "mongodb-rag-core/mongoDbMetadata"; +import { mongoDbProductNames, mongoDbTopics, mongoDbProgrammingLanguages } from "mongodb-rag-core/mongoDbMetadata"; import { QuizQuestionData } from "../QuizQuestionData"; -const { mongoDbProductNames, mongoDbTopics, mongoDbProgrammingLanguages } = - mongoDbMetadata; - const programmingLanguageTags = [ ...mongoDbProgrammingLanguages .map((pl) => pl.id) diff --git a/packages/benchmarks/src/quizQuestions/bin/reportMongoDbUniversityAllQuizQuestions.ts b/packages/benchmarks/src/quizQuestions/bin/reportMongoDbUniversityAllQuizQuestions.ts index 95b913888..10828b8c7 100644 --- a/packages/benchmarks/src/quizQuestions/bin/reportMongoDbUniversityAllQuizQuestions.ts +++ b/packages/benchmarks/src/quizQuestions/bin/reportMongoDbUniversityAllQuizQuestions.ts @@ -14,12 +14,46 @@ import { aggregateExperimentScoreMean } from "../../reporting/aggregateExperimen const { BRAINTRUST_API_KEY } = assertEnvVars(BRAINTRUST_ENV_VARS); const projectName = "mongodb-multiple-choice"; const experiments = [ - { experimentName: "mistral-large-2", model: "Mistral Large 2" }, - { experimentName: "gemini-2-flash", model: "Gemini 2 Flash" }, - { experimentName: "claude-35-sonnet-v2", model: "Claude 3.5 Sonnet v2" }, - { experimentName: "llama-3.1-70b", model: "Llama 3.1 70B" }, - { experimentName: "nova-pro-v1:0", model: "Nova Pro v1" }, - { experimentName: "gpt-4o", model: "GPT-4o" }, + { + model: "GPT 5", + experimentName: "multiple_choice?experimentType=answer_question&model=gpt-5&datasets=mdbu_quiz_all-ad20cb9f", + }, + { + model: "GPT 5 Mini", + experimentName: "multiple_choice?experimentType=answer_question&model=gpt-5-mini&datasets=mdbu_quiz_all-ab823151", + }, + { + model: "GPT 4o", + experimentName: "multiple_choice?experimentType=answer_question&model=gpt-4o&datasets=mdbu_quiz_all-80e9b77e", + }, + { + model: "GPT 4o Mini", + experimentName: "multiple_choice?experimentType=answer_question&model=gpt-4o-mini&datasets=mdbu_quiz_all", + }, + { + model: "Gemini 2.5 Pro", + experimentName: "multiple_choice?experimentType=answer_question&model=gemini-2.5-pro&datasets=mdbu_quiz_all", + }, + { + model: "Gemini 2.5 Flash", + experimentName: "multiple_choice?experimentType=answer_question&model=gemini-2.5-flash&datasets=mdbu_quiz_all", + }, + { + model: "Claude 4.1 Opus", + experimentName: "multiple_choice?experimentType=answer_question&model=claude-opus-4.1&datasets=mdbu_quiz_all", + }, + { + model: "Claude 4 Sonnet", + experimentName: "multiple_choice?experimentType=answer_question&model=claude-sonnet-4&datasets=mdbu_quiz_all", + }, + { + model: "Claude 3.7 Sonnet", + experimentName: "multiple_choice?experimentType=answer_question&model=claude-37-sonnet&datasets=mdbu_quiz_all", + }, + { + model: "Mistral Large 2", + experimentName: "multiple_choice?experimentType=answer_question&model=mistral-large-2&datasets=mdbu_quiz_all", + }, ]; const basePathOut = path.resolve(__dirname, "..", "..", "..", "testData"); diff --git a/packages/benchmarks/src/quizQuestions/bin/reportMongoDbUniversityBadgeQuestionBenchmark.ts b/packages/benchmarks/src/quizQuestions/bin/reportMongoDbUniversityBadgeQuestionBenchmark.ts index 7ef0aa93b..342f69437 100644 --- a/packages/benchmarks/src/quizQuestions/bin/reportMongoDbUniversityBadgeQuestionBenchmark.ts +++ b/packages/benchmarks/src/quizQuestions/bin/reportMongoDbUniversityBadgeQuestionBenchmark.ts @@ -19,56 +19,44 @@ const { BRAINTRUST_API_KEY } = assertEnvVars(BRAINTRUST_ENV_VARS); const projectName = "mongodb-multiple-choice"; const experiments = [ { - model: "GPT-4o", - experimentName: "gpt-4o-badge-631d3a9b", + model: "GPT 5", + experimentName: "multiple_choice?experimentType=answer_question&model=gpt-5&datasets=mdbu_quiz_badge-0bf72a0a", }, { - model: "Claude 3.5 Sonnet v2", - experimentName: "claude-35-sonnet-v2-badge-cb743d9f", + model: "GPT 5 Mini", + experimentName: "multiple_choice?experimentType=answer_question&model=gpt-5-mini&datasets=mdbu_quiz_badge-7ed6151c", }, { - model: "Claude 3.5 Sonnet", - experimentName: "claude-35-sonnet-badge-f3427e16", + model: "GPT 4o", + experimentName: "multiple_choice?experimentType=answer_question&model=gpt-4o&datasets=mdbu_quiz_badge-3de142aa", }, - { model: "Gemini 2 Flash", experimentName: "gemini-2-flash-badge-76fea4f5" }, { - model: "Claude 3.5 Haiku", - experimentName: "claude-35-haiku-badge-4f4d32bb", + model: "GPT 4o Mini", + experimentName: "multiple_choice?experimentType=answer_question&model=gpt-4o-mini&datasets=mdbu_quiz_badge-470f00b2", }, - { model: "Nova Pro v1", experimentName: "nova-pro-v1:0-badge-e76a0833" }, - { model: "Llama 3.1 70B", experimentName: "llama-3.1-70b-badge-f2e28e86" }, - { model: "Llama 3.2 90B", experimentName: "llama-3.2-90b-badge-81111f12" }, { - model: "Claude 3.5 Haiku", - experimentName: "claude-35-haiku-badge-4f4d32bb", + model: "Gemini 2.5 Pro", + experimentName: "multiple_choice?experimentType=answer_question&model=gemini-2.5-pro&datasets=mdbu_quiz_badge-9be5ad74", }, { - model: "Gemini 1.5 Flash", - experimentName: "gemini-1.5-flash-002-badge-e0141bec", + model: "Gemini 2.5 Flash", + experimentName: "multiple_choice?experimentType=answer_question&model=gemini-2.5-flash&datasets=mdbu_quiz_badge-1cc693a9", }, - { model: "Nova Lite v1", experimentName: "nova-lite-v1:0-badge-c896c5f3" }, - { model: "Llama 3 70B", experimentName: "llama-3-70b-badge-54545f72" }, - { model: "GPT-4o Mini", experimentName: "gpt-4o-mini-badge" }, { - model: "Gemini 1.5 Pro", - experimentName: "gemini-1.5-pro-002-badge-fc8268f2", + model: "Claude 4.1 Opus", + experimentName: "multiple_choice?experimentType=answer_question&model=claude-opus-4.1&datasets=mdbu_quiz_badge-6308714f", }, { - model: "GPT-35 Turbo 16k", - experimentName: "gpt-35-turbo-16k-badge-6282561d", + model: "Claude 4 Sonnet", + experimentName: "multiple_choice?experimentType=answer_question&model=claude-sonnet-4&datasets=mdbu_quiz_badge-98f0f3af", }, { - model: "Gemini 1.0 Pro", - experimentName: "gemini-1.0-pro-002-badge-62c646b1", + model: "Claude 3.7 Sonnet", + experimentName: "multiple_choice?experimentType=answer_question&model=claude-37-sonnet&datasets=mdbu_quiz_badge-d9c09975", }, - { model: "Nova Micro v1", experimentName: "nova-micro-v1:0-badge-e415a9d7" }, { model: "Mistral Large 2", - experimentName: "mistral-large-2-badge-2526d48b", - }, - { - model: "Claude 3 Sonnet", - experimentName: "claude-3-sonnet-badge-a8791d43", + experimentName: "multiple_choice?experimentType=answer_question&model=mistral-large-2&datasets=mdbu_quiz_badge-5aad3637", }, ]; @@ -132,10 +120,24 @@ async function main() { ensureOutputDirectory(outputDir); const titleTags = [ - "Relational to Document Model", - "Schema Patterns and Antipatterns", - "Schema Design Optimization", - "Advanced Schema Patterns and Antipatterns", + "MongoDB Aggregation Fundamentals", + "MongoDB Query Optimization Techniques", + "From Relational Model (SQL) to MongoDB's Document Model", + "MongoDB Schema Design Patterns and Antipatterns", + "MongoDB Advanced Schema Design Patterns and Antipatterns", + "MongoDB Schema Design Optimization", + "Building AI Agents with MongoDB", + "Building AI-Powered Search with MongoDB Vector Search", + "Building RAG Apps Using MongoDB", + "MongoDB Indexing Design Fundamentals", + "Monitoring MongoDB with Built-in Tools", + "Optimizing MongoDB Performance with Tuning Tools", + "CRUD Operations in MongoDB", + "Search with MongoDB", + "Securing MongoDB Atlas: Authentication & Authorization", + "Securing MongoDB Self-Managed: Authentication & Authorization", + "MongoDB Sharding Strategies", + "Optimizing and Maintaining MongoDB Cluster Reliability", ] as const; // Define a type for the quiz titles From cacf9237dcf47f586d5136ebe19e86a98eb8c3b7 Mon Sep 17 00:00:00 2001 From: Alla Yakubova Date: Mon, 15 Sep 2025 12:40:38 -0400 Subject: [PATCH 4/4] create csv with breakdown by question --- ...MongoDbUniversityBadgeQuestionBenchmark.ts | 65 +++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/packages/benchmarks/src/quizQuestions/bin/reportMongoDbUniversityBadgeQuestionBenchmark.ts b/packages/benchmarks/src/quizQuestions/bin/reportMongoDbUniversityBadgeQuestionBenchmark.ts index 342f69437..474e0cfb7 100644 --- a/packages/benchmarks/src/quizQuestions/bin/reportMongoDbUniversityBadgeQuestionBenchmark.ts +++ b/packages/benchmarks/src/quizQuestions/bin/reportMongoDbUniversityBadgeQuestionBenchmark.ts @@ -115,6 +115,30 @@ function createCsvHeaders(data: CsvData[]): CsvHeader[] { })); } +/** +Extracts model name from experiment name. +The model name appears between "&model=" and "&datasets". + +@param experimentName The experiment name to extract model from. +@returns The extracted model name. +*/ +function extractModelName(experimentName: string): string { + const modelMatch = experimentName.match(/&model=([^&]+)&datasets/); + return modelMatch ? modelMatch[1] : 'unknown'; +} + +/** +Type definition for detailed quiz results CSV data. +*/ +type DetailedQuizResult = { + title: string; + questionText: string; + answers: string; + expected: string; + "Correct Count": number; + [key: string]: string | number; // Dynamic model columns and Correct Count +}; + async function main() { const outputDir = path.join(basePathOut, "csv", "badge"); ensureOutputDirectory(outputDir); @@ -150,6 +174,7 @@ async function main() { } & Partial>; const experimentAggregates: ExperimentAggregate[] = []; + const detailedResults: DetailedQuizResult[] = []; // Process each experiment for (const { experimentName, model } of experiments) { @@ -166,6 +191,39 @@ async function main() { apiKey: BRAINTRUST_API_KEY, }); + // Extract model name from experiment name + const extractedModelName = extractModelName(experimentName); + + // Process detailed results for the new CSV + results.forEach((result) => { + // Find existing detailed result or create new one + let detailedResult = detailedResults.find( + (dr) => + dr.questionText === result.input.questionText && + dr.expected === result.expected + ); + + if (!detailedResult) { + // Create new detailed result + detailedResult = { + title: result.metadata?.title || '', + questionText: result.input.questionText, + answers: JSON.stringify(result.input.answers), + expected: result.expected, + "Correct Count": 0, + }; + detailedResults.push(detailedResult); + } + + // Add the model's output to the detailed result + detailedResult[extractedModelName] = result.output || ''; + + // Add to correct count if this result is correct + if (result.scores?.CorrectQuizAnswer === 1) { + detailedResult["Correct Count"] += 1; + } + }); + // Add the quiz name as a tag if metadata.title is defined // Note: in the future we should do better tagging in advance to avoid hacks like this. const resultsWithQuizNameTag = results.map((result) => { @@ -227,6 +285,13 @@ async function main() { createCsvHeaders(experimentAggregates), path.join(outputDir, "badge_quiz_question_experiment_aggregates.csv") ); + + // Write detailed quiz results to CSV + await writeDataToCsv( + detailedResults, + createCsvHeaders(detailedResults), + path.join(outputDir, "detailed_quiz_question_results.csv") + ); } main().catch(console.error);