Phase 9: Doc Evaluation Harness

- eval-questions.js: Generates ground-truth questions from raw source data
- eval.js: LLM-as-judge scoring harness (answers from docs, scores against truth)
- Generated 33 questions covering config, dependencies, resources, and interactions
- Baseline score: 66.7% (configuration 93%, dependencies 77%, structural 31%)

This commit is contained in:
Jarvis Prime
2026-03-09 22:32:41 +00:00
parent d9fa087e22
commit b99341e8bc
4 changed files with 1680 additions and 0 deletions

392
eval-questions.js Normal file
View File

@@ -0,0 +1,392 @@
/**
* Eval Harness: Question Bank Generator
*
* Generates ground-truth Q&A pairs directly from source code and Helm charts.
* Questions are categorized by type and difficulty.
*
* Usage: node eval-questions.js <foxtrot-root> <snapshot.json> [output.json]
*/
const fs = require('fs');
const path = require('path');
const { discoverCharts } = require('./extract-helm.js');
const GraphStore = require('./graph.js');
const { buildSubsystems } = require('./subsystem.js');
/**
 * Build the ground-truth question bank used by the doc evaluation harness.
 *
 * Answers are computed from the Helm charts discovered under `srcRoot` and
 * from the subsystem view built on top of the graph snapshot at
 * `snapshotPath`. Questions whose underlying data is missing (no shared
 * secrets, chart not found, no subsystems, ...) are skipped rather than
 * emitted with an empty or misleading answer.
 *
 * @param {string} srcRoot - Directory scanned for Helm charts; also passed
 *   to buildSubsystems as `srcDir` (a trailing slash is appended if missing).
 * @param {string} snapshotPath - Path handed to GraphStore.loadSnapshot.
 * @returns {Array<Object>} Question records with `id`, `category`,
 *   `difficulty`, `audience`, `question`, `answer`, `answerType` and `source`.
 */
function generateQuestions(srcRoot, snapshotPath) {
  const questions = [];
  const helmIgnore = new Set([
    'node_modules', '.git', 'venv', '__pycache__', '.terraform',
    '_bmad', '_bmad-output', '.codex', '.claude', '.cursor', '.gemini', '.kiro', '.agents'
  ]);

  // Load Helm data
  const charts = discoverCharts(srcRoot, helmIgnore);

  // Load graph + subsystems
  const graph = GraphStore.loadSnapshot(snapshotPath);
  const subs = buildSubsystems(graph, {
    srcDir: srcRoot.endsWith('/') ? srcRoot : srcRoot + '/',
    minTraffic: 3,
    crossCuttingThreshold: 0.6
  });

  // ─── Category 1: Structural (chart metadata) ───

  // Q: How many Helm charts exist?
  questions.push({
    id: 'struct-001',
    category: 'structural',
    difficulty: 'easy',
    audience: ['human', 'machine'],
    question: 'How many Helm charts are defined across the Foxtrot monorepo?',
    answer: String(charts.length),
    answerType: 'exact',
    source: 'Chart.yaml discovery',
  });

  // Q: Which charts have the most K8s resources?
  const topByResources = [...charts]
    .sort((a, b) => b.templates.resources.length - a.templates.resources.length)
    .slice(0, 5);
  questions.push({
    id: 'struct-002',
    category: 'structural',
    difficulty: 'medium',
    audience: ['human', 'machine'],
    question: 'Which 5 Helm charts produce the most Kubernetes resources? List them with their resource counts.',
    answer: topByResources.map(c => `${c.chart.name} (${c.dir}): ${c.templates.resources.length}`).join('\n'),
    answerType: 'ranked-list',
    source: 'template scanning',
  });

  // Q: How many subsystems are detected?
  questions.push({
    id: 'struct-003',
    category: 'structural',
    difficulty: 'easy',
    audience: ['human', 'machine'],
    question: 'How many subsystems does the Foxtrot codebase contain?',
    answer: String(subs.subsystems.length),
    answerType: 'exact',
    source: 'subsystem aggregation',
  });

  // Q: Which subsystem has the most files?
  // Guarded: with zero subsystems there is no "largest" one to report.
  const topBySub = [...subs.subsystems].sort((a, b) => b.files.length - a.files.length)[0];
  if (topBySub) {
    questions.push({
      id: 'struct-004',
      category: 'structural',
      difficulty: 'easy',
      audience: ['human'],
      question: 'Which subsystem contains the most files, and how many?',
      answer: `${topBySub.name}: ${topBySub.files.length} files`,
      answerType: 'exact',
      source: 'subsystem aggregation',
    });
  }

  // ─── Category 2: Dependencies (chart-to-chart) ───

  // Charts with dependencies
  const chartsWithDeps = charts.filter(c => c.chart.dependencies.length > 0);
  for (const c of chartsWithDeps.slice(0, 5)) {
    questions.push({
      id: `dep-${c.chart.name.replace(/[^a-z0-9]/g, '-')}-001`,
      category: 'dependency',
      difficulty: 'medium',
      audience: ['human', 'machine'],
      question: `What are the dependencies of the "${c.chart.name}" chart (at ${c.dir})?`,
      answer: c.chart.dependencies.map(d => `${d.name} (${d.version})${d.condition ? ` [condition: ${d.condition}]` : ''}`).join('\n'),
      answerType: 'list',
      source: `${c.dir}/Chart.yaml`,
    });
  }

  // ─── Category 3: Contracts (shared secrets, config refs) ───

  // Map each config-ref target to the set of chart names that reference it.
  // A Set is used so that a chart referencing the same target several times
  // counts once; otherwise a single chart could look like "multiple" users.
  // NOTE(review): charts are identified by name here (same as the port map
  // below), so two charts sharing a name count as one consumer — confirm
  // that is the intended granularity.
  const configUsers = {};
  for (const c of charts) {
    for (const i of c.interactions) {
      if (i.type === 'config-ref') {
        if (!configUsers[i.target]) configUsers[i.target] = new Set();
        configUsers[i.target].add(c.chart.name);
      }
    }
  }
  const sharedSecrets = Object.entries(configUsers)
    .map(([name, users]) => [name, [...users]])
    .filter(([, users]) => users.length > 1);
  if (sharedSecrets.length > 0) {
    questions.push({
      id: 'contract-001',
      category: 'contract',
      difficulty: 'medium',
      audience: ['human', 'machine'],
      question: 'Which secrets or ConfigMaps are shared across multiple Helm charts? List each with the charts that use it.',
      answer: sharedSecrets.map(([name, users]) => `${name}: ${users.join(', ')}`).join('\n'),
      answerType: 'list',
      source: 'template interaction scanning',
    });
  }

  // ─── Category 4: Configuration Surface (values.yaml) ───

  // Specific value questions from important charts
  const importantCharts = ['mdm-app', 'ai-app', 'elasticsearch', 'hazelcast', 'cassandra', 'kong', 'redis'];
  for (const name of importantCharts) {
    const chart = charts.find(c => c.chart.name === name && c.values.keys.length > 10);
    if (!chart) continue;

    // What's the default domain?
    const domainKey = chart.values.keys.find(k => k.name === 'domain' || k.name === 'rootDomain');
    if (domainKey && domainKey.defaultValue) {
      questions.push({
        id: `config-${name}-domain`,
        category: 'configuration',
        difficulty: 'easy',
        audience: ['human'],
        question: `What is the default value of "${domainKey.name}" in the ${name} chart?`,
        answer: String(domainKey.defaultValue),
        answerType: 'exact',
        source: `${chart.dir}/values.yaml`,
      });
    }

    // How many configurable values?
    questions.push({
      id: `config-${name}-count`,
      category: 'configuration',
      difficulty: 'easy',
      audience: ['machine'],
      question: `How many top-level configuration keys does the ${name} chart (at ${chart.dir}) expose in its values.yaml?`,
      answer: String(chart.values.keys.length),
      answerType: 'exact',
      source: `${chart.dir}/values.yaml`,
    });
  }

  // ─── Category 5: Interactions (service-to-service) ───

  const svcRefs = [];
  for (const c of charts) {
    for (const i of c.interactions) {
      if (i.type === 'k8s-service') {
        svcRefs.push({ from: c.chart.name, dir: c.dir, to: i.target });
      }
    }
  }
  if (svcRefs.length > 0) {
    questions.push({
      id: 'interaction-001',
      category: 'interaction',
      difficulty: 'medium',
      audience: ['human', 'machine'],
      question: 'Which Helm charts reference Kubernetes services from other components? List each chart and the service it calls.',
      answer: svcRefs.map(r => `${r.from} (${r.dir}) → ${r.to}`).join('\n'),
      answerType: 'list',
      source: 'template interaction scanning',
    });
  }

  // Port allocation (port '0' is ignored; chart names are de-duplicated per port)
  const portMap = {};
  for (const c of charts) {
    for (const i of c.interactions) {
      if (i.type === 'port' && i.target !== '0') {
        if (!portMap[i.target]) portMap[i.target] = [];
        if (!portMap[i.target].includes(c.chart.name)) portMap[i.target].push(c.chart.name);
      }
    }
  }
  const sharedPorts = Object.entries(portMap).filter(([, users]) => users.length > 1);
  if (sharedPorts.length > 0) {
    questions.push({
      id: 'interaction-002',
      category: 'interaction',
      difficulty: 'hard',
      audience: ['human', 'machine'],
      question: 'Which network ports are used by multiple Helm charts? List each port and the charts that expose it.',
      answer: sharedPorts.sort((a, b) => Number(a[0]) - Number(b[0])).map(([port, users]) => `Port ${port}: ${users.join(', ')}`).join('\n'),
      answerType: 'list',
      source: 'template port scanning',
    });
  }

  // ─── Category 6: Resource Types ───

  const kindCounts = {};
  for (const c of charts) {
    for (const r of c.templates.resources) {
      kindCounts[r.kind] = (kindCounts[r.kind] || 0) + 1;
    }
  }
  const topKinds = Object.entries(kindCounts).sort((a, b) => b[1] - a[1]).slice(0, 10);
  questions.push({
    id: 'resource-001',
    category: 'resource',
    difficulty: 'medium',
    audience: ['human', 'machine'],
    question: 'What are the 10 most common Kubernetes resource types generated across all Foxtrot Helm charts?',
    answer: topKinds.map(([kind, count]) => `${kind}: ${count}`).join('\n'),
    answerType: 'ranked-list',
    source: 'template resource scanning',
  });

  // Specific chart resource questions
  for (const name of ['mdm-app', 'cassandra', 'jenkins']) {
    const chart = charts.find(c => c.chart.name === name && c.templates.resources.length > 5);
    if (!chart) continue;
    const kinds = {};
    for (const r of chart.templates.resources) {
      kinds[r.kind] = (kinds[r.kind] || 0) + 1;
    }
    questions.push({
      id: `resource-${name}-001`,
      category: 'resource',
      difficulty: 'medium',
      audience: ['human', 'machine'],
      question: `What Kubernetes resource types does the ${name} chart generate? List each type and count.`,
      answer: Object.entries(kinds).sort((a, b) => b[1] - a[1]).map(([k, v]) => `${k}: ${v}`).join('\n'),
      answerType: 'list',
      source: `${chart.dir}/templates/`,
    });
  }

  // ─── Category 7: Cross-Subsystem (code-level) ───

  // Which subsystems depend on app-tools?
  // Dependency matrix keys have the form "<from>→<to>".
  const appToolsDeps = Object.entries(subs.dependencyMatrix)
    .filter(([k, v]) => k.endsWith('→app-tools') && (v.calls + v.imports) > 0)
    .map(([k, v]) => ({ from: k.split('→')[0], weight: v.calls + v.imports }));
  if (appToolsDeps.length > 0) {
    questions.push({
      id: 'cross-sub-001',
      category: 'cross-subsystem',
      difficulty: 'hard',
      audience: ['human', 'machine'],
      question: 'Which subsystems depend on app-tools, and how heavily (by call+import count)?',
      answer: appToolsDeps.sort((a, b) => b.weight - a.weight).map(d => `${d.from}: ${d.weight}`).join('\n'),
      answerType: 'list',
      source: 'dependency matrix',
    });
  }

  // Cross-cutting concerns
  if (subs.crossCutting.length > 0) {
    questions.push({
      id: 'cross-sub-002',
      category: 'cross-subsystem',
      difficulty: 'easy',
      audience: ['human'],
      question: 'Which subsystems are identified as cross-cutting concerns?',
      answer: subs.crossCutting.join(', '),
      answerType: 'list',
      source: 'subsystem aggregation',
    });
  }

  // ─── Category 8: Architectural Reasoning ───

  // Why are certain subsystems empty?
  const emptySubs = subs.subsystems.filter(s => s.entities.functions === 0 && s.entities.modules === 0);
  if (emptySubs.length > 0) {
    questions.push({
      id: 'arch-001',
      category: 'architectural',
      difficulty: 'hard',
      audience: ['human'],
      question: `The following subsystems have 0 detected functions and 0 modules: ${emptySubs.map(s => s.name).join(', ')}. Why might this be the case, and what do they actually contain?`,
      answer: `These subsystems primarily contain Helm charts with Go-templated YAML, Terraform HCL, and Crossplane compositions. The code analysis pipeline detects functions/modules from Python, Go, TypeScript, and shell scripts — but Helm templates use Go template syntax ({{ }}) which doesn't produce traditional function/module entities. Their content is captured through the Helm chart extraction phase instead.`,
      answerType: 'explanation',
      source: 'architectural analysis',
    });
  }

  // Chart version question
  const mdmChart = charts.find(c => c.chart.name === 'mdm-app');
  if (mdmChart) {
    questions.push({
      id: 'arch-002',
      category: 'architectural',
      difficulty: 'easy',
      audience: ['human', 'machine'],
      question: 'What is the current version and appVersion of the mdm-app Helm chart?',
      answer: `version: ${mdmChart.chart.version}, appVersion: ${mdmChart.chart.appVersion}`,
      answerType: 'exact',
      source: `${mdmChart.dir}/Chart.yaml`,
    });
  }

  // ─── Category 9: Scenario-Based (human-focused) ───

  // Only ask about vault-secret when some chart actually references it;
  // an empty ground-truth answer cannot be scored meaningfully.
  const vaultSecretUsers = [...(configUsers['vault-secret'] || [])];
  if (vaultSecretUsers.length > 0) {
    questions.push({
      id: 'scenario-001',
      category: 'scenario',
      difficulty: 'hard',
      audience: ['human'],
      question: 'If you need to rotate the "vault-secret" shared secret, which Helm charts would be affected and need redeployment?',
      answer: vaultSecretUsers.join(', '),
      answerType: 'list',
      source: 'template interaction scanning',
    });
  }

  questions.push({
    id: 'scenario-002',
    category: 'scenario',
    difficulty: 'hard',
    audience: ['human'],
    question: 'A new engineer needs to deploy the MDM application stack. What charts, configuration values, and external dependencies should they understand first?',
    answer: mdmChart ? `Chart: ${mdmChart.chart.name} (${mdmChart.dir}), Version: ${mdmChart.chart.version}\nKey values: ${mdmChart.values.keys.slice(0, 10).map(k => k.name).join(', ')} (+${Math.max(0, mdmChart.values.keys.length - 10)} more)\nResources generated: ${mdmChart.templates.resources.length} K8s resources\nInteractions: ${mdmChart.interactions.map(i => `${i.type}: ${i.target}`).join(', ') || 'none detected'}` : 'mdm-app chart not found',
    answerType: 'explanation',
    source: `${mdmChart?.dir || 'N/A'}`,
  });

  // Cassandra ports
  const cassChart = charts.find(c => c.chart.name === 'cassandra');
  if (cassChart) {
    // Conventional meaning of well-known Cassandra ports. Integer-like keys
    // enumerate in ascending numeric order, which fixes the output order.
    const cassPortNotes = {
      7000: 'Cassandra inter-node cluster communication',
      7001: 'Cassandra SSL inter-node',
      7199: 'JMX monitoring',
      9042: 'CQL native transport (client connections)',
      9160: 'Thrift client API (legacy)',
    };
    const cassPorts = [...new Set(
      cassChart.interactions.filter(i => i.type === 'port').map(i => i.target)
    )];
    // Describe only ports the chart was actually seen to expose, so the
    // ground truth never claims a port that the templates do not define.
    const describedPorts = Object.keys(cassPortNotes)
      .filter(known => cassPorts.some(p => String(p) === known));
    questions.push({
      id: 'scenario-003',
      category: 'scenario',
      difficulty: 'medium',
      audience: ['human'],
      question: 'What network ports does the Cassandra chart expose, and what are they typically used for?',
      answer: [
        `Ports: ${cassPorts.join(', ')}`,
        ...describedPorts.map(p => `${p}: ${cassPortNotes[p]}`),
      ].join('\n'),
      answerType: 'explanation',
      source: `${cassChart.dir}/templates/`,
    });
  }

  return questions;
}
// CLI entry point: generate the question bank, print a short summary to
// stdout, and write the bank as pretty-printed JSON.
if (require.main === module) {
  const [, , srcRoot, snapshotPath, requestedOut] = process.argv;
  const outPath = requestedOut || './eval-questions.json';

  // Both the source root and the graph snapshot are mandatory.
  if (!(srcRoot && snapshotPath)) {
    console.error('Usage: node eval-questions.js <foxtrot-root> <snapshot.json> [output.json]');
    process.exit(1);
  }

  const questions = generateQuestions(srcRoot, snapshotPath);

  // Per-category totals, keyed in first-seen order.
  const perCategory = questions.reduce((totals, { category }) => {
    totals[category] = (totals[category] || 0) + 1;
    return totals;
  }, {});
  const countWhere = (predicate) => questions.filter(predicate).length;
  const withDifficulty = (level) => countWhere((q) => q.difficulty === level);
  const forAudience = (who) => countWhere((q) => q.audience.includes(who));

  console.log(`Generated ${questions.length} questions:`);
  Object.entries(perCategory)
    .sort(([, left], [, right]) => right - left)
    .forEach(([cat, count]) => {
      console.log(` ${cat}: ${count}`);
    });
  console.log(`\nDifficulty: easy=${withDifficulty('easy')}, medium=${withDifficulty('medium')}, hard=${withDifficulty('hard')}`);
  console.log(`Audience: human=${forAudience('human')}, machine=${forAudience('machine')}`);

  const payload = {
    generated: new Date().toISOString(),
    count: questions.length,
    questions,
  };
  fs.writeFileSync(outPath, JSON.stringify(payload, null, 2));
  console.log(`\nWritten to ${outPath}`);
}
module.exports = { generateQuestions };

419
eval-questions.json Normal file
View File

@@ -0,0 +1,419 @@
{
"generated": "2026-03-09T21:29:29.763Z",
"count": 33,
"questions": [
{
"id": "struct-001",
"category": "structural",
"difficulty": "easy",
"audience": [
"human",
"machine"
],
"question": "How many Helm charts are defined across the Foxtrot monorepo?",
"answer": "76",
"answerType": "exact",
"source": "Chart.yaml discovery"
},
{
"id": "struct-002",
"category": "structural",
"difficulty": "medium",
"audience": [
"human",
"machine"
],
"question": "Which 5 Helm charts produce the most Kubernetes resources? List them with their resource counts.",
"answer": "cluster (app-common/apps-provider/addons/charts/common/cluster): 55\ncluster (app-common/charts/cluster): 32\njenkins (app-common/charts/jenkins): 29\nkube-state-metrics (app-common/charts/otel-collector/charts/kube-state-metrics): 27\nfoxtrot-eks (compute-common/templates/foxtrot-eks): 24",
"answerType": "ranked-list",
"source": "template scanning"
},
{
"id": "struct-003",
"category": "structural",
"difficulty": "easy",
"audience": [
"human",
"machine"
],
"question": "How many subsystems does the Foxtrot codebase contain?",
"answer": "11",
"answerType": "exact",
"source": "subsystem aggregation"
},
{
"id": "struct-004",
"category": "structural",
"difficulty": "easy",
"audience": [
"human"
],
"question": "Which subsystem contains the most files, and how many?",
"answer": "runtime: 16309 files",
"answerType": "exact",
"source": "subsystem aggregation"
},
{
"id": "dep-external-dns-001",
"category": "dependency",
"difficulty": "medium",
"audience": [
"human",
"machine"
],
"question": "What are the dependencies of the \"external-dns\" chart (at app-common/apps-provider/addons/charts/common/external-dns)?",
"answer": "external-dns (1.17.0) [condition: external-dns.enabled]",
"answerType": "list",
"source": "app-common/apps-provider/addons/charts/common/external-dns/Chart.yaml"
},
{
"id": "dep-ingress-nginx-001",
"category": "dependency",
"difficulty": "medium",
"audience": [
"human",
"machine"
],
"question": "What are the dependencies of the \"ingress-nginx\" chart (at app-common/apps-provider/addons/charts/common/ingress-nginx)?",
"answer": "ingress-nginx (4.12.0) [condition: ingress-nginx.enabled]",
"answerType": "list",
"source": "app-common/apps-provider/addons/charts/common/ingress-nginx/Chart.yaml"
},
{
"id": "dep-kube-state-metrics-001",
"category": "dependency",
"difficulty": "medium",
"audience": [
"human",
"machine"
],
"question": "What are the dependencies of the \"kube-state-metrics\" chart (at app-common/apps-provider/addons/charts/common/kube-state-metrics)?",
"answer": "kube-state-metrics (5.26.0) [condition: kube-state-metrics.enabled]",
"answerType": "list",
"source": "app-common/apps-provider/addons/charts/common/kube-state-metrics/Chart.yaml"
},
{
"id": "dep-open-telemetry-001",
"category": "dependency",
"difficulty": "medium",
"audience": [
"human",
"machine"
],
"question": "What are the dependencies of the \"open-telemetry\" chart (at app-common/apps-provider/addons/charts/common/otel-collector)?",
"answer": "opentelemetry-operator (0.26.4) [condition: opentelemetry-operator.enabled]\nk8s-metrics (0.1.0) [condition: k8s-metrics.enabled]\nagent (0.1.1) [condition: agent.enabled]\ncommon (0.1.0) [condition: common.enabled]\ngw-metrics (0.1.0) [condition: gw-metrics.enabled]\ngw-traces (0.1.0) [condition: gw-traces.enabled]\nprom-scraper (0.1.0) [condition: prom-scraper.enabled]",
"answerType": "list",
"source": "app-common/apps-provider/addons/charts/common/otel-collector/Chart.yaml"
},
{
"id": "dep-reltio-prometheus-001",
"category": "dependency",
"difficulty": "medium",
"audience": [
"human",
"machine"
],
"question": "What are the dependencies of the \"reltio-prometheus\" chart (at app-common/apps-provider/addons/charts/common/prometheus)?",
"answer": "kube-prometheus-stack (69.4.1) [condition: kube-prometheus-stack.enabled]\nprometheus-adapter (4.11.0) [condition: prometheus-adapter.enabled]",
"answerType": "list",
"source": "app-common/apps-provider/addons/charts/common/prometheus/Chart.yaml"
},
{
"id": "contract-001",
"category": "contract",
"difficulty": "medium",
"audience": [
"human",
"machine"
],
"question": "Which secrets or ConfigMaps are shared across multiple Helm charts? List each with the charts that use it.",
"answer": "logdna-agent-key: cluster\nlogdna-agent-tags: cluster\nvault-secret: ai-app, cleanse-app, pms-app, rdm-app, shared-app\nvault-secret-rdm-tasks: ai-app, cleanse-app, pms-app, rdm-app, shared-app",
"answerType": "list",
"source": "template interaction scanning"
},
{
"id": "config-mdm-app-domain",
"category": "configuration",
"difficulty": "easy",
"audience": [
"human"
],
"question": "What is the default value of \"domain\" in the mdm-app chart?",
"answer": "cloud.reltio.com",
"answerType": "exact",
"source": "app-common/charts/mdm-app/values.yaml"
},
{
"id": "config-mdm-app-count",
"category": "configuration",
"difficulty": "easy",
"audience": [
"machine"
],
"question": "How many top-level configuration keys does the mdm-app chart (at app-common/charts/mdm-app) expose in its values.yaml?",
"answer": "38",
"answerType": "exact",
"source": "app-common/charts/mdm-app/values.yaml"
},
{
"id": "config-ai-app-domain",
"category": "configuration",
"difficulty": "easy",
"audience": [
"human"
],
"question": "What is the default value of \"domain\" in the ai-app chart?",
"answer": "cloud.reltio.com",
"answerType": "exact",
"source": "app-common/charts/ai-app/values.yaml"
},
{
"id": "config-ai-app-count",
"category": "configuration",
"difficulty": "easy",
"audience": [
"machine"
],
"question": "How many top-level configuration keys does the ai-app chart (at app-common/charts/ai-app) expose in its values.yaml?",
"answer": "34",
"answerType": "exact",
"source": "app-common/charts/ai-app/values.yaml"
},
{
"id": "config-elasticsearch-domain",
"category": "configuration",
"difficulty": "easy",
"audience": [
"human"
],
"question": "What is the default value of \"domain\" in the elasticsearch chart?",
"answer": "etalon.reltio.com",
"answerType": "exact",
"source": "app-common/apps-provider/addons/charts/state-apps/es/values.yaml"
},
{
"id": "config-elasticsearch-count",
"category": "configuration",
"difficulty": "easy",
"audience": [
"machine"
],
"question": "How many top-level configuration keys does the elasticsearch chart (at app-common/apps-provider/addons/charts/state-apps/es) expose in its values.yaml?",
"answer": "32",
"answerType": "exact",
"source": "app-common/apps-provider/addons/charts/state-apps/es/values.yaml"
},
{
"id": "config-hazelcast-domain",
"category": "configuration",
"difficulty": "easy",
"audience": [
"human"
],
"question": "What is the default value of \"domain\" in the hazelcast chart?",
"answer": "etalon.reltio.com",
"answerType": "exact",
"source": "app-common/apps-provider/addons/charts/state-apps/hz/values.yaml"
},
{
"id": "config-hazelcast-count",
"category": "configuration",
"difficulty": "easy",
"audience": [
"machine"
],
"question": "How many top-level configuration keys does the hazelcast chart (at app-common/apps-provider/addons/charts/state-apps/hz) expose in its values.yaml?",
"answer": "32",
"answerType": "exact",
"source": "app-common/apps-provider/addons/charts/state-apps/hz/values.yaml"
},
{
"id": "config-redis-domain",
"category": "configuration",
"difficulty": "easy",
"audience": [
"human"
],
"question": "What is the default value of \"domain\" in the redis chart?",
"answer": "internal.reltio.com",
"answerType": "exact",
"source": "app-common/charts/redis/values.yaml"
},
{
"id": "config-redis-count",
"category": "configuration",
"difficulty": "easy",
"audience": [
"machine"
],
"question": "How many top-level configuration keys does the redis chart (at app-common/charts/redis) expose in its values.yaml?",
"answer": "23",
"answerType": "exact",
"source": "app-common/charts/redis/values.yaml"
},
{
"id": "interaction-001",
"category": "interaction",
"difficulty": "medium",
"audience": [
"human",
"machine"
],
"question": "Which Helm charts reference Kubernetes services from other components? List each chart and the service it calls.",
"answer": "hazelcast (app-common/apps-provider/addons/charts/state-apps/hz) → kubernetes\nhazelcast (app-common/charts/hazelcast) → kubernetes\ngw-metrics (app-common/charts/otel-collector/charts/gw-metrics) → argo-rollouts-metrics\nredis (app-common/charts/redis) → headless",
"answerType": "list",
"source": "template interaction scanning"
},
{
"id": "interaction-002",
"category": "interaction",
"difficulty": "hard",
"audience": [
"human",
"machine"
],
"question": "Which network ports are used by multiple Helm charts? List each port and the charts that expose it.",
"answer": "Port 443: cluster, twistlock-defender, eck-operator, twistlock\nPort 8889: gw-metrics, gw-traces, kube-state-metrics\nPort 9200: eck-operator, elasticsearch\nPort 13133: agent, gw-metrics, gw-traces, k8s-metrics, prom-scraper, elasticsearch",
"answerType": "list",
"source": "template port scanning"
},
{
"id": "resource-001",
"category": "resource",
"difficulty": "medium",
"audience": [
"human",
"machine"
],
"question": "What are the 10 most common Kubernetes resource types generated across all Foxtrot Helm charts?",
"answer": "ClusterRole: 51\nServiceAccount: 49\nSecret: 43\nExternalSecret: 38\nService: 36\nRole: 30\nClusterRoleBinding: 29\nDeployment: 29\nConfigMap: 25\nRoleBinding: 25",
"answerType": "ranked-list",
"source": "template resource scanning"
},
{
"id": "resource-mdm-app-001",
"category": "resource",
"difficulty": "medium",
"audience": [
"human",
"machine"
],
"question": "What Kubernetes resource types does the mdm-app chart generate? List each type and count.",
"answer": "Secret: 5\nJob: 4\nDeployment: 2\nPersistentVolumeClaim: 2\nService: 2\nExternalSecret: 2\nHorizontalPodAutoscaler: 1\nRollout: 1\nPersistentVolume: 1\nPodDisruptionBudget: 1\nServiceAccount: 1",
"answerType": "list",
"source": "app-common/charts/mdm-app/templates/"
},
{
"id": "resource-cassandra-001",
"category": "resource",
"difficulty": "medium",
"audience": [
"human",
"machine"
],
"question": "What Kubernetes resource types does the cassandra chart generate? List each type and count.",
"answer": "Service: 2\nStatefulSet: 1\nExternalSecret: 1\nClusterSecretStore: 1\nConfigMap: 1\nSecret: 1\nServiceAccount: 1\nRole: 1\nRoleBinding: 1\nClusterRole: 1\nClusterRoleBinding: 1\nCronJob: 1\nJob: 1",
"answerType": "list",
"source": "app-common/apps-provider/addons/charts/state-cluster/cassandra/templates/"
},
{
"id": "resource-jenkins-001",
"category": "resource",
"difficulty": "medium",
"audience": [
"human",
"machine"
],
"question": "What Kubernetes resource types does the jenkins chart generate? List each type and count.",
"answer": "ConfigMap: 5\nService: 3\nSecret: 3\nIngress: 2\nServiceAccount: 2\nPersistentVolumeClaim: 1\nSecurityGroupPolicy: 1\nPrometheusRule: 1\nBackendConfig: 1\nNetworkPolicy: 1\nPodDisruptionBudget: 1\nPodMonitoring: 1\nRoute: 1\nServiceMonitor: 1\nStatefulSet: 1\nRole: 1\nRoleBinding: 1\nSecretClaim: 1\nPod: 1",
"answerType": "list",
"source": "app-common/charts/jenkins/templates/"
},
{
"id": "cross-sub-001",
"category": "cross-subsystem",
"difficulty": "hard",
"audience": [
"human",
"machine"
],
"question": "Which subsystems depend on app-tools, and how heavily (by call+import count)?",
"answer": "ipam-tools: 6\ncompute-tools: 4\napp-common: 1",
"answerType": "list",
"source": "dependency matrix"
},
{
"id": "cross-sub-002",
"category": "cross-subsystem",
"difficulty": "easy",
"audience": [
"human"
],
"question": "Which subsystems are identified as cross-cutting concerns?",
"answer": "app-tools",
"answerType": "list",
"source": "subsystem aggregation"
},
{
"id": "arch-001",
"category": "architectural",
"difficulty": "hard",
"audience": [
"human"
],
"question": "The following subsystems have 0 detected functions and 0 modules: account-common, network-common, network-core. Why might this be the case, and what do they actually contain?",
"answer": "These subsystems primarily contain Helm charts with Go-templated YAML, Terraform HCL, and Crossplane compositions. The code analysis pipeline detects functions/modules from Python, Go, TypeScript, and shell scripts — but Helm templates use Go template syntax ({{ }}) which doesn't produce traditional function/module entities. Their content is captured through the Helm chart extraction phase instead.",
"answerType": "explanation",
"source": "architectural analysis"
},
{
"id": "arch-002",
"category": "architectural",
"difficulty": "easy",
"audience": [
"human",
"machine"
],
"question": "What is the current version and appVersion of the mdm-app Helm chart?",
"answer": "version: 0.7.4, appVersion: 2025.12-12-develop",
"answerType": "exact",
"source": "app-common/charts/mdm-app/Chart.yaml"
},
{
"id": "scenario-001",
"category": "scenario",
"difficulty": "hard",
"audience": [
"human"
],
"question": "If you need to rotate the \"vault-secret\" shared secret, which Helm charts would be affected and need redeployment?",
"answer": "ai-app, cleanse-app, pms-app, rdm-app, shared-app",
"answerType": "list",
"source": "template interaction scanning"
},
{
"id": "scenario-002",
"category": "scenario",
"difficulty": "hard",
"audience": [
"human"
],
"question": "A new engineer needs to deploy the MDM application stack. What charts, configuration values, and external dependencies should they understand first?",
"answer": "Chart: mdm-app (app-common/charts/mdm-app), Version: 0.7.4\nKey values: clusterName, environmentType, arch, cloudProvider, region, vaultEnv, imagePullPolicy, clusterDomain, terminationGracePeriodSeconds, etalonsubDomain (+28 more)\nResources generated: 22 K8s resources\nInteractions: none detected",
"answerType": "explanation",
"source": "app-common/charts/mdm-app"
},
{
"id": "scenario-003",
"category": "scenario",
"difficulty": "medium",
"audience": [
"human"
],
"question": "What network ports does the Cassandra chart expose, and what are they typically used for?",
"answer": "Ports: 7000, 7001, 7199, 9042, 9160\n7000: Cassandra inter-node cluster communication\n7001: Cassandra SSL inter-node\n7199: JMX monitoring\n9042: CQL native transport (client connections)\n9160: Thrift client API (legacy)",
"answerType": "explanation",
"source": "app-common/apps-provider/addons/charts/state-cluster/cassandra/templates/"
}
]
}

594
eval-report.json Normal file
View File

@@ -0,0 +1,594 @@
{
"timestamp": "2026-03-09T22:32:37.490Z",
"docsDir": "./foxtrot-docs",
"questionsFile": "./eval-questions.json",
"totalQuestions": 33,
"overallScore": 66.7,
"avgAccuracy": 2.76,
"avgCompleteness": 2.79,
"avgPrecision": 4.45,
"notFoundCount": 6,
"notFoundRate": "18.2%",
"byCategory": {
"structural": {
"avg": "31.5",
"count": 4
},
"dependency": {
"avg": "77.2",
"count": 5
},
"contract": {
"avg": "20.0",
"count": 1
},
"configuration": {
"avg": "93.3",
"count": 10
},
"interaction": {
"avg": "56.5",
"count": 2
},
"resource": {
"avg": "81.5",
"count": 4
},
"cross-subsystem": {
"avg": "20.0",
"count": 2
},
"architectural": {
"avg": "66.5",
"count": 2
},
"scenario": {
"avg": "40.0",
"count": 3
}
},
"byDifficulty": {
"easy": {
"avg": "75.5",
"count": 15
},
"medium": {
"avg": "67.0",
"count": 13
},
"hard": {
"avg": "38.6",
"count": 5
}
},
"results": [
{
"id": "struct-001",
"category": "structural",
"difficulty": "easy",
"question": "How many Helm charts are defined across the Foxtrot monorepo?",
"groundTruth": "76",
"llmAnswer": "\n\n27",
"score": {
"accuracy": 0,
"completeness": 0,
"precision": 4,
"notes": "AI significantly undercounted Helm charts (27 vs 76). No hallucinated details, but the core answer is factually incorrect and captures only ~35% of actual charts."
},
"composite": 27,
"notFound": false
},
{
"id": "struct-002",
"category": "structural",
"difficulty": "medium",
"question": "Which 5 Helm charts produce the most Kubernetes resources? List them with their resource counts.",
"groundTruth": "cluster (app-common/apps-provider/addons/charts/common/cluster): 55\ncluster (app-common/charts/cluster): 32\njenkins (app-common/charts/jenkins): 29\nkube-state-metrics (app-common/charts/otel-collector/charts/kube-state-metrics): 27\nfoxtrot-eks (compute-common/templates/foxtrot-eks): 24",
"llmAnswer": "NOT_FOUND\n\nThe documentation does not provide explicit resource counts for the charts. While the \"Resources Generated\" sections list individual resources for each chart, they do not include a total count per chart. To accurately rank the 5 charts producing the most resources, I would need either:\n\n1. Explicit resource count totals in each chart's documentation\n2. A summary table comparing resource counts across charts\n\nThe documentation lists resources by type and template file, but requires manual counting to determine totals, which is not a reliable method for documentation-based answers.",
"score": {
"accuracy": 0,
"completeness": 0,
"precision": 5,
"notes": "AI returned NOT_FOUND instead of the ranked list. While it avoided hallucination by not inventing data, it failed to provide any of the 5 required charts or their resource counts. The ground truth shows this information was available and should have been retrievable."
},
"composite": 33,
"notFound": true
},
{
"id": "struct-003",
"category": "structural",
"difficulty": "easy",
"question": "How many subsystems does the Foxtrot codebase contain?",
"groundTruth": "11",
"llmAnswer": "NOT_FOUND\n\nThe documentation provided contains only one explicit subsystem definition (the \"root\" subsystem). While the root subsystem's description references other domains (app, compute, ipam, runtime), the documentation does not provide a complete inventory or count of all subsystems in the Foxtrot codebase.",
"score": {
"accuracy": 0,
"completeness": 0,
"precision": 5,
"notes": "AI correctly avoided hallucination by stating NOT_FOUND rather than guessing, but failed to provide the ground truth answer of 11 subsystems. The response demonstrates appropriate epistemic humility but does not meet the requirement of answering the factual question."
},
"composite": 33,
"notFound": true
},
{
"id": "struct-004",
"category": "structural",
"difficulty": "easy",
"question": "Which subsystem contains the most files, and how many?",
"groundTruth": "runtime: 16309 files",
"llmAnswer": "ipam-core, 24",
"score": {
"accuracy": 0,
"completeness": 0,
"precision": 5,
"notes": "AI identified wrong subsystem (ipam-core vs runtime) and provided drastically incorrect file count (24 vs 16309). Answer format is correct but all factual content is wrong. No hallucinated extra information, just fundamentally incorrect core data."
},
"composite": 33,
"notFound": false
},
{
"id": "dep-external-dns-001",
"category": "dependency",
"difficulty": "medium",
"question": "What are the dependencies of the \"external-dns\" chart (at app-common/apps-provider/addons/charts/common/external-dns)?",
"groundTruth": "external-dns (1.17.0) [condition: external-dns.enabled]",
"llmAnswer": "\n\nexternal-dns (1.17.0)",
"score": {
"accuracy": 3,
"completeness": 3,
"precision": 5,
"notes": "AI correctly identified the dependency name and version but omitted the condition specification '[condition: external-dns.enabled]' which is part of the ground truth. The core information is accurate but incomplete."
},
"composite": 73,
"notFound": false
},
{
"id": "dep-ingress-nginx-001",
"category": "dependency",
"difficulty": "medium",
"question": "What are the dependencies of the \"ingress-nginx\" chart (at app-common/apps-provider/addons/charts/common/ingress-nginx)?",
"groundTruth": "ingress-nginx (4.12.0) [condition: ingress-nginx.enabled]",
"llmAnswer": "\n\ningress-nginx (4.12.0)",
"score": {
"accuracy": 3,
"completeness": 3,
"precision": 5,
"notes": "AI correctly identified the dependency name and version (ingress-nginx 4.12.0) but omitted the condition specification [condition: ingress-nginx.enabled] that is part of the complete ground truth answer."
},
"composite": 73,
"notFound": false
},
{
"id": "dep-kube-state-metrics-001",
"category": "dependency",
"difficulty": "medium",
"question": "What are the dependencies of the \"kube-state-metrics\" chart (at app-common/apps-provider/addons/charts/common/kube-state-metrics)?",
"groundTruth": "kube-state-metrics (5.26.0) [condition: kube-state-metrics.enabled]",
"llmAnswer": "\n\nkube-state-metrics (5.26.0)",
"score": {
"accuracy": 3,
"completeness": 3,
"precision": 5,
"notes": "AI correctly identified the dependency name and version (kube-state-metrics 5.26.0) but omitted the condition metadata [condition: kube-state-metrics.enabled] present in the ground truth. Core information is accurate with no hallucination, but the answer is incomplete."
},
"composite": 73,
"notFound": false
},
{
"id": "dep-open-telemetry-001",
"category": "dependency",
"difficulty": "medium",
"question": "What are the dependencies of the \"open-telemetry\" chart (at app-common/apps-provider/addons/charts/common/otel-collector)?",
"groundTruth": "opentelemetry-operator (0.26.4) [condition: opentelemetry-operator.enabled]\nk8s-metrics (0.1.0) [condition: k8s-metrics.enabled]\nagent (0.1.1) [condition: agent.enabled]\ncommon (0.1.0) [condition: common.enabled]\ngw-metrics (0.1.0) [condition: gw-metrics.enabled]\ngw-traces (0.1.0) [condition: gw-traces.enabled]\nprom-scraper (0.1.0) [condition: prom-scraper.enabled]",
"llmAnswer": "\n\nopentelemetry-operator (0.26.4)\nk8s-metrics (0.1.0)\nagent (0.1.1)\ncommon (0.1.0)\ngw-metrics (0.1.0)\ngw-traces (0.1.0)\nprom-scraper (0.1.0)",
"score": {
"accuracy": 4,
"completeness": 4,
"precision": 5,
"notes": "AI correctly identified all 7 dependencies with accurate versions, but omitted the conditional metadata ([condition: ...]) that was part of the ground truth specification. Core facts are accurate and complete, but the answer lacks full specification details."
},
"composite": 87,
"notFound": false
},
{
"id": "dep-reltio-prometheus-001",
"category": "dependency",
"difficulty": "medium",
"question": "What are the dependencies of the \"reltio-prometheus\" chart (at app-common/apps-provider/addons/charts/common/prometheus)?",
"groundTruth": "kube-prometheus-stack (69.4.1) [condition: kube-prometheus-stack.enabled]\nprometheus-adapter (4.11.0) [condition: prometheus-adapter.enabled]",
"llmAnswer": "\n\nkube-prometheus-stack (69.4.1)\nprometheus-adapter (4.11.0)",
"score": {
"accuracy": 4,
"completeness": 3,
"precision": 5,
"notes": "AI correctly identified both dependencies with accurate versions (69.4.1 and 4.11.0), but omitted the conditional clauses [condition: kube-prometheus-stack.enabled] and [condition: prometheus-adapter.enabled] that are part of the complete ground truth specification. The core facts are accurate but the answer lacks the full dependency specification."
},
"composite": 80,
"notFound": false
},
{
"id": "contract-001",
"category": "contract",
"difficulty": "medium",
"question": "Which secrets or ConfigMaps are shared across multiple Helm charts? List each with the charts that use it.",
"groundTruth": "logdna-agent-key: cluster\nlogdna-agent-tags: cluster\nvault-secret: ai-app, cleanse-app, pms-app, rdm-app, shared-app\nvault-secret-rdm-tasks: ai-app, cleanse-app, pms-app, rdm-app, shared-app",
"llmAnswer": "vault-secret\ncluster (app-common/apps-provider/addons/charts/common/cluster), argocd-apps, mdm-app, shared-app, ai-app, auth-app, cleanse-app, pms-app, rdm-app\n\nvault-component\nmdm-app, shared-app, ai-app, auth-app, cleanse-app, pms-app, rdm-app\n\ngcrregcred\ncluster (app-common/apps-provider/addons/charts/common/cluster), cluster (app-common/charts/cluster)\n\nlogdna\ncluster (app-common/apps-provider/addons/charts/common/cluster), cluster (app-common/charts/cluster)\n\nlogdna-agent\ncluster (app-common/apps-provider/addons/charts/common/cluster), cluster (app-common/charts/cluster)\n\nsentinelone\ncluster (app-common/apps-provider/addons/charts/common/cluster), cluster (app-common/charts/cluster)\n\ntwistlock-secrets\ntwistlock-defender, twistlock",
"score": {
"accuracy": 1,
"completeness": 1,
"precision": 1,
"notes": "AI found vault-secret but with incorrect chart list (includes cluster, argocd-apps, mdm-app, auth-app not in ground truth). Completely missed vault-secret-rdm-tasks, logdna-agent-key, and logdna-agent-tags. Introduced multiple hallucinated secrets (vault-component, gcrregcred, logdna, logdna-agent, sentinelone, twistlock-secrets) not in ground truth. Only 1 of 4 required items partially present with errors."
},
"composite": 20,
"notFound": false
},
{
"id": "config-mdm-app-domain",
"category": "configuration",
"difficulty": "easy",
"question": "What is the default value of \"domain\" in the mdm-app chart?",
"groundTruth": "cloud.reltio.com",
"llmAnswer": "cloud.reltio.com",
"score": {
"accuracy": 5,
"completeness": 5,
"precision": 5,
"notes": "AI answer matches ground truth exactly. Direct, factual response with no hallucination or missing information."
},
"composite": 100,
"notFound": false
},
{
"id": "config-mdm-app-count",
"category": "configuration",
"difficulty": "easy",
"question": "How many top-level configuration keys does the mdm-app chart (at app-common/charts/mdm-app) expose in its values.yaml?",
"groundTruth": "38",
"llmAnswer": "38",
"score": {
"accuracy": 5,
"completeness": 5,
"precision": 5,
"notes": "AI answer matches ground truth exactly. The response provides the correct count of 38 top-level configuration keys with no extraneous information or hallucination."
},
"composite": 100,
"notFound": false
},
{
"id": "config-ai-app-domain",
"category": "configuration",
"difficulty": "easy",
"question": "What is the default value of \"domain\" in the ai-app chart?",
"groundTruth": "cloud.reltio.com",
"llmAnswer": "cloud.reltio.com",
"score": {
"accuracy": 5,
"completeness": 5,
"precision": 5,
"notes": "AI answer matches ground truth exactly. No hallucination, no missing information, direct and correct response."
},
"composite": 100,
"notFound": false
},
{
"id": "config-ai-app-count",
"category": "configuration",
"difficulty": "easy",
"question": "How many top-level configuration keys does the ai-app chart (at app-common/charts/ai-app) expose in its values.yaml?",
"groundTruth": "34",
"llmAnswer": "34",
"score": {
"accuracy": 5,
"completeness": 5,
"precision": 5,
"notes": "AI answer matches ground truth exactly. Correct count of 34 top-level configuration keys in values.yaml with no extraneous information."
},
"composite": 100,
"notFound": false
},
{
"id": "config-elasticsearch-domain",
"category": "configuration",
"difficulty": "easy",
"question": "What is the default value of \"domain\" in the elasticsearch chart?",
"groundTruth": "etalon.reltio.com",
"llmAnswer": "\n\netalon.reltio.com",
"score": {
"accuracy": 5,
"completeness": 5,
"precision": 5,
"notes": "AI answer matches ground truth exactly with no extraneous information or hallucination"
},
"composite": 100,
"notFound": false
},
{
"id": "config-elasticsearch-count",
"category": "configuration",
"difficulty": "easy",
"question": "How many top-level configuration keys does the elasticsearch chart (at app-common/apps-provider/addons/charts/state-apps/es) expose in its values.yaml?",
"groundTruth": "32",
"llmAnswer": "32",
"score": {
"accuracy": 5,
"completeness": 5,
"precision": 5,
"notes": "AI answer matches ground truth exactly. Provided the correct count of 32 top-level configuration keys with no extraneous information or hallucination."
},
"composite": 100,
"notFound": false
},
{
"id": "config-hazelcast-domain",
"category": "configuration",
"difficulty": "easy",
"question": "What is the default value of \"domain\" in the hazelcast chart?",
"groundTruth": "etalon.reltio.com",
"llmAnswer": "internal.reltio.com",
"score": {
"accuracy": 0,
"completeness": 0,
"precision": 5,
"notes": "AI provided incorrect value (internal.reltio.com instead of etalon.reltio.com). Answer is factually wrong but contains no hallucinated extra information."
},
"composite": 33,
"notFound": false
},
{
"id": "config-hazelcast-count",
"category": "configuration",
"difficulty": "easy",
"question": "How many top-level configuration keys does the hazelcast chart (at app-common/apps-provider/addons/charts/state-apps/hz) expose in its values.yaml?",
"groundTruth": "32",
"llmAnswer": "32",
"score": {
"accuracy": 5,
"completeness": 5,
"precision": 5,
"notes": "AI answer matches ground truth exactly. Correct numerical response to a factual query with no extraneous information."
},
"composite": 100,
"notFound": false
},
{
"id": "config-redis-domain",
"category": "configuration",
"difficulty": "easy",
"question": "What is the default value of \"domain\" in the redis chart?",
"groundTruth": "internal.reltio.com",
"llmAnswer": "internal.reltio.com",
"score": {
"accuracy": 5,
"completeness": 5,
"precision": 5,
"notes": "AI answer matches ground truth exactly. Direct, factual response with no hallucination or missing information."
},
"composite": 100,
"notFound": false
},
{
"id": "config-redis-count",
"category": "configuration",
"difficulty": "easy",
"question": "How many top-level configuration keys does the redis chart (at app-common/charts/redis) expose in its values.yaml?",
"groundTruth": "23",
"llmAnswer": "23",
"score": {
"accuracy": 5,
"completeness": 5,
"precision": 5,
"notes": "AI answer matches ground truth exactly. Provided the correct count of 23 top-level configuration keys with no extraneous information or hallucination."
},
"composite": 100,
"notFound": false
},
{
"id": "interaction-001",
"category": "interaction",
"difficulty": "medium",
"question": "Which Helm charts reference Kubernetes services from other components? List each chart and the service it calls.",
"groundTruth": "hazelcast (app-common/apps-provider/addons/charts/state-apps/hz) → kubernetes\nhazelcast (app-common/charts/hazelcast) → kubernetes\ngw-metrics (app-common/charts/otel-collector/charts/gw-metrics) → argo-rollouts-metrics\nredis (app-common/charts/redis) → headless",
"llmAnswer": "hazelcast (app-common/charts/hazelcast) - kubernetes\nhazelcast (app-common/apps-provider/addons/charts/state-apps/hz) - kubernetes\nredis - headless",
"score": {
"accuracy": 3,
"completeness": 3,
"precision": 5,
"notes": "AI correctly identified 3 of 4 items with accurate paths and service references. Missing the gw-metrics chart entry (app-common/charts/otel-collector/charts/gw-metrics → argo-rollouts-metrics). No hallucinated information."
},
"composite": 73,
"notFound": false
},
{
"id": "interaction-002",
"category": "interaction",
"difficulty": "hard",
"question": "Which network ports are used by multiple Helm charts? List each port and the charts that expose it.",
"groundTruth": "Port 443: cluster, twistlock-defender, eck-operator, twistlock\nPort 8889: gw-metrics, gw-traces, kube-state-metrics\nPort 9200: eck-operator, elasticsearch\nPort 13133: agent, gw-metrics, gw-traces, k8s-metrics, prom-scraper, elasticsearch",
"llmAnswer": "Port 443\n- eck-operator\n- cluster\n- twistlock-defender\n- twistlock\n\nPort 8889\n- kube-state-metrics\n- gw-traces\n\nPort 9200\n- eck-operator\n- elasticsearch (app-common/apps-provider/addons/charts/state-apps/es)\n- elasticsearch (app-common/charts/elasticsearch)\n\nPort 13133\n- gw-traces\n- elasticsearch (app-common/apps-provider/addons/charts/state-apps/es)\n- elasticsearch (app-common/charts/elasticsearch)\n- k8s-metrics\n\nPort 0\n- elasticsearch (app-common/apps-provider/addons/charts/state-apps/es)\n- elasticsearch (app-common/charts/elasticsearch)\n- k8s-metrics",
"score": {
"accuracy": 2,
"completeness": 2,
"precision": 2,
"notes": "AI correctly identifies ports 443 and 9200 with accurate chart listings. However, it omits gw-metrics from port 8889, and critically misses agent, gw-metrics, and prom-scraper from port 13133. Additionally, it hallucinates Port 0 which does not appear in the ground truth. The answer is incomplete and contains factual errors in chart-to-port mappings."
},
"composite": 40,
"notFound": false
},
{
"id": "resource-001",
"category": "resource",
"difficulty": "medium",
"question": "What are the 10 most common Kubernetes resource types generated across all Foxtrot Helm charts?",
"groundTruth": "ClusterRole: 51\nServiceAccount: 49\nSecret: 43\nExternalSecret: 38\nService: 36\nRole: 30\nClusterRoleBinding: 29\nDeployment: 29\nConfigMap: 25\nRoleBinding: 25",
"llmAnswer": "NOT_FOUND\n\nThe documentation provided does not contain a comprehensive analysis or summary table that aggregates resource type counts across all Foxtrot Helm charts. While the documentation lists resources generated by individual charts, it does not provide the aggregated count needed to determine the 10 most common resource types across all Foxtrot charts.\n\nTo answer this question accurately, one would need to manually count occurrences across all documented charts and rank them, which is beyond what the provided documentation explicitly states.",
"score": {
"accuracy": 0,
"completeness": 0,
"precision": 5,
"notes": "AI answered NOT_FOUND instead of providing the ranked list of 10 resource types. While the response contains no hallucinated information, it failed to retrieve or provide any of the correct facts from the ground truth. The answer is factually incorrect and incomplete, though it avoided fabricating data."
},
"composite": 33,
"notFound": true
},
{
"id": "resource-mdm-app-001",
"category": "resource",
"difficulty": "medium",
"question": "What Kubernetes resource types does the mdm-app chart generate? List each type and count.",
"groundTruth": "Secret: 5\nJob: 4\nDeployment: 2\nPersistentVolumeClaim: 2\nService: 2\nExternalSecret: 2\nHorizontalPodAutoscaler: 1\nRollout: 1\nPersistentVolume: 1\nPodDisruptionBudget: 1\nServiceAccount: 1",
"llmAnswer": "\n\nHorizontalPodAutoscaler: 1\nDeployment: 2\nRollout: 1\nJob: 4\nPersistentVolume: 1\nPersistentVolumeClaim: 2\nService: 2\nPodDisruptionBudget: 1\nSecret: 5\nExternalSecret: 2\nServiceAccount: 1",
"score": {
"accuracy": 5,
"completeness": 5,
"precision": 5,
"notes": "AI answer contains all 11 resource types with exact correct counts matching ground truth. Order differs but content is identical."
},
"composite": 100,
"notFound": false
},
{
"id": "resource-cassandra-001",
"category": "resource",
"difficulty": "medium",
"question": "What Kubernetes resource types does the cassandra chart generate? List each type and count.",
"groundTruth": "Service: 2\nStatefulSet: 1\nExternalSecret: 1\nClusterSecretStore: 1\nConfigMap: 1\nSecret: 1\nServiceAccount: 1\nRole: 1\nRoleBinding: 1\nClusterRole: 1\nClusterRoleBinding: 1\nCronJob: 1\nJob: 1",
"llmAnswer": "Service: 2\nStatefulSet: 1\nExternalSecret: 1\nClusterSecretStore: 1\nConfigMap: 1\nSecret: 1\nServiceAccount: 1\nRole: 1\nRoleBinding: 1\nClusterRole: 1\nClusterRoleBinding: 1\nCronJob: 1\nJob: 1",
"score": {
"accuracy": 5,
"completeness": 5,
"precision": 5,
"notes": "AI answer matches ground truth exactly. All 13 resource types listed with correct counts. No omissions or hallucinations."
},
"composite": 100,
"notFound": false
},
{
"id": "resource-jenkins-001",
"category": "resource",
"difficulty": "medium",
"question": "What Kubernetes resource types does the jenkins chart generate? List each type and count.",
"groundTruth": "ConfigMap: 5\nService: 3\nSecret: 3\nIngress: 2\nServiceAccount: 2\nPersistentVolumeClaim: 1\nSecurityGroupPolicy: 1\nPrometheusRule: 1\nBackendConfig: 1\nNetworkPolicy: 1\nPodDisruptionBudget: 1\nPodMonitoring: 1\nRoute: 1\nServiceMonitor: 1\nStatefulSet: 1\nRole: 1\nRoleBinding: 1\nSecretClaim: 1\nPod: 1",
"llmAnswer": "BackendConfig: 1\nConfigMap: 4\nIngress: 2\nNetworkPolicy: 1\nPod: 1\nPodDisruptionBudget: 1\nPodMonitoring: 1\nPersistentVolumeClaim: 1\nPrometheusRule: 1\nRole: 1\nRoleBinding: 1\nRoute: 1\nSecret: 3\nSecretClaim: 1\nService: 3\nServiceAccount: 2\nServiceMonitor: 1\nSecurityGroupPolicy: 1\nStatefulSet: 1",
"score": {
"accuracy": 4,
"completeness": 5,
"precision": 5,
"notes": "AI lists all 19 resource types correctly but miscounts ConfigMap as 4 instead of 5. All other counts and types match ground truth exactly."
},
"composite": 93,
"notFound": false
},
{
"id": "cross-sub-001",
"category": "cross-subsystem",
"difficulty": "hard",
"question": "Which subsystems depend on app-tools, and how heavily (by call+import count)?",
"groundTruth": "ipam-tools: 6\ncompute-tools: 4\napp-common: 1",
"llmAnswer": "\n\nNOT_FOUND\n\nThe provided documentation does not contain information about which subsystems depend on app-tools or their dependency metrics (call+import counts). While the app-tools subsystem documentation describes its public exports and internal structure, there is no cross-reference documentation showing downstream dependencies or usage patterns from other subsystems.",
"score": {
"accuracy": 0,
"completeness": 0,
"precision": 5,
"notes": "AI correctly avoided hallucination by stating information unavailable, but failed to locate or provide the ground truth data showing three subsystems with specific dependency counts (ipam-tools: 6, compute-tools: 4, app-common: 1)."
},
"composite": 33,
"notFound": true
},
{
"id": "cross-sub-002",
"category": "cross-subsystem",
"difficulty": "easy",
"question": "Which subsystems are identified as cross-cutting concerns?",
"groundTruth": "app-tools",
"llmAnswer": "root",
"score": {
"accuracy": 0,
"completeness": 0,
"precision": 1,
"notes": "AI answered 'root' instead of the correct answer 'app-tools'. The response is factually incorrect and fails to identify the actual cross-cutting concern subsystem specified in the ground truth."
},
"composite": 7,
"notFound": false
},
{
"id": "arch-001",
"category": "architectural",
"difficulty": "hard",
"question": "The following subsystems have 0 detected functions and 0 modules: account-common, network-common, network-core. Why might this be the case, and what do they actually contain?",
"groundTruth": "These subsystems primarily contain Helm charts with Go-templated YAML, Terraform HCL, and Crossplane compositions. The code analysis pipeline detects functions/modules from Python, Go, TypeScript, and shell scripts — but Helm templates use Go template syntax ({{ }}) which doesn't produce traditional function/module entities. Their content is captured through the Helm chart extraction phase instead.",
"llmAnswer": "NOT_FOUND\n\nThe documentation provided does not contain subsystem reference pages for account-common, network-common, or network-core. There is only a single reference to account-common in the helm charts section, which shows it contains a Helm template for account creation, but this does not explain why these subsystems register 0 functions and 0 modules, nor does it describe their complete contents.",
"score": {
"accuracy": 0,
"completeness": 0,
"precision": 5,
"notes": "AI responded with 'NOT_FOUND' rather than providing the ground truth explanation. The correct answer requires understanding that Helm templates with Go template syntax ({{ }}) don't produce detectable functions/modules in traditional code analysis pipelines, and that these subsystems' content is captured through Helm chart extraction instead. AI correctly avoided hallucination but failed to locate or infer the actual explanation."
},
"composite": 33,
"notFound": true
},
{
"id": "arch-002",
"category": "architectural",
"difficulty": "easy",
"question": "What is the current version and appVersion of the mdm-app Helm chart?",
"groundTruth": "version: 0.7.4, appVersion: 2025.12-12-develop",
"llmAnswer": "Version: 0.7.4\nApp Version: 2025.12-12-develop",
"score": {
"accuracy": 5,
"completeness": 5,
"precision": 5,
"notes": "AI answer contains both required values with exact matches to ground truth. No hallucination or missing information."
},
"composite": 100,
"notFound": false
},
{
"id": "scenario-001",
"category": "scenario",
"difficulty": "hard",
"question": "If you need to rotate the \"vault-secret\" shared secret, which Helm charts would be affected and need redeployment?",
"groundTruth": "ai-app, cleanse-app, pms-app, rdm-app, shared-app",
"llmAnswer": "\n\nshared-app\nai-app\ncleanse-app\npms-app\nargocd-apps",
"score": {
"accuracy": 3,
"completeness": 3,
"precision": 3,
"notes": "AI answer includes 4 of 5 correct charts (shared-app, ai-app, cleanse-app, pms-app) but misses rdm-app and incorrectly includes argocd-apps, which is not affected by vault-secret rotation."
},
"composite": 60,
"notFound": false
},
{
"id": "scenario-002",
"category": "scenario",
"difficulty": "hard",
"question": "A new engineer needs to deploy the MDM application stack. What charts, configuration values, and external dependencies should they understand first?",
"groundTruth": "Chart: mdm-app (app-common/charts/mdm-app), Version: 0.7.4\nKey values: clusterName, environmentType, arch, cloudProvider, region, vaultEnv, imagePullPolicy, clusterDomain, terminationGracePeriodSeconds, etalonsubDomain (+28 more)\nResources generated: 22 K8s resources\nInteractions: none detected",
"llmAnswer": "\n\nA new engineer deploying the MDM application stack should first understand:\n\n**Core Charts**\nThe deployment orchestrates multiple Helm charts: `mdm-app` (the primary application), `cluster` (cluster-level infrastructure and security agents), `argocd-apps` (GitOps deployment automation), `secret-operator` (External Secrets Operator for credential management), and supporting application charts (`shared-app`, `rdm-app`, `auth-app`, `cleanse-app`, `pms-app`, `ai-app`). Infrastructure charts like `milvus` (vector database), `opentelemetry-operator` (observability), and `external-dns` (DNS automation) provide foundational services.\n\n**Critical Configuration Values**\nCluster identity (`clusterName`, `clusterDomain`, `region`, `cloudProvider`), domain routing (`domain`, `rootDomain`, `etalonsubDomain`), vault integration (`vaultEnv`, vault secret stores), image registry access (`baseRegistryURL`, `imagePullSecrets`, `gcrregcred`), and feature flags (`deployIstio`, `deployLogdna`, `deploySentinelOne`, `deployPrisma`, `efsEnabled`). Storage configuration (`persistent` storage classes, `efsid` for EFS) and workload placement (`nodeSelector`, `tolerations`) determine resource allocation.\n\n**External Dependencies**\nHashiCorp Vault for secrets, External Secrets Operator for credential injection, Istio for service mesh and TLS, LogDNA for centralized logging, SentinelOne and Prisma for security agents, Milvus as a vector database backend, OpenTelemetry for distributed tracing and metrics, cert-manager for certificate lifecycle, ArgoCD for GitOps-driven deployments, and kube-state-metrics for Kubernetes cluster monitoring. Each dependency requires specific configuration values and external secret references to function.",
"score": {
"accuracy": 1,
"completeness": 2,
"precision": 1,
"notes": "AI significantly hallucinated. Ground truth specifies mdm-app chart (v0.7.4) with 22 K8s resources and no interactions. AI describes a complex multi-chart ecosystem with extensive dependencies and interactions not present in ground truth. Covers some correct values (clusterName, region, vaultEnv, clusterDomain) but misses key ones (environmentType, arch, imagePullPolicy, terminationGracePeriodSeconds). Introduces numerous unverified charts, dependencies, and configuration values not in the specification."
},
"composite": 27,
"notFound": false
},
{
"id": "scenario-003",
"category": "scenario",
"difficulty": "medium",
"question": "What network ports does the Cassandra chart expose, and what are they typically used for?",
"groundTruth": "Ports: 7000, 7001, 7199, 9042, 9160\n7000: Cassandra inter-node cluster communication\n7001: Cassandra SSL inter-node\n7199: JMX monitoring\n9042: CQL native transport (client connections)\n9160: Thrift client API (legacy)",
"llmAnswer": "NOT_FOUND\n\nThe documentation lists the Cassandra chart exposes five ports (7000, 7001, 7199, 9042, and 9160) but does not provide information about what these ports are typically used for.",
"score": {
"accuracy": 0,
"completeness": 0,
"precision": 5,
"notes": "AI correctly identified it could not find port usage information in documentation. No hallucination, but no answer provided to the question."
},
"composite": 33,
"notFound": true
}
]
}

275
eval.js Normal file
View File

@@ -0,0 +1,275 @@
/**
* Eval Harness: Doc Quality Scorer
*
* Scores generated documentation against ground-truth questions.
* Two-phase: (1) LLM answers questions using only docs, (2) LLM-as-judge scores accuracy.
*
* Usage: node eval.js <docs-dir> <questions.json> [output.json]
*/
const fs = require('fs');
const path = require('path');
const { callLLM } = require('./prose.js');
/**
 * Collect the Markdown files found beneath a directory.
 *
 * Walks `docsDir` recursively (sub-directories are descended into as they are
 * encountered) and returns one record per file whose name ends in `.md`.
 * A file is left out when adding its content would push the running total of
 * `content.length` above `maxChars`; the walk then carries on, so a smaller
 * file seen later can still be included. Directories that cannot be listed and
 * files that cannot be read are skipped without raising.
 *
 * @param {string} docsDir - Root directory to scan.
 * @param {number} [maxChars=200000] - Upper bound on the summed content length of the returned docs.
 * @returns {Array<{path: string, content: string}>} One entry per accepted file; `path` is relative to `docsDir`.
 */
function loadDocs(docsDir, maxChars = 200000) {
  const collected = [];
  let usedChars = 0;

  // Best-effort directory listing: null means "could not be read".
  const listDir = (dir) => {
    try {
      return fs.readdirSync(dir, { withFileTypes: true });
    } catch {
      return null;
    }
  };

  // Best-effort file read: null means "could not be read".
  const readText = (file) => {
    try {
      return fs.readFileSync(file, 'utf8');
    } catch {
      return null;
    }
  };

  const visit = (dir) => {
    const listing = listDir(dir);
    if (listing === null) return;

    listing.forEach((entry) => {
      const fullPath = path.join(dir, entry.name);
      if (entry.isDirectory()) {
        visit(fullPath);
        return;
      }
      if (!entry.name.endsWith('.md')) return;

      const text = readText(fullPath);
      // Over-budget files are skipped rather than ending the walk.
      if (text === null || usedChars + text.length > maxChars) return;

      collected.push({ path: path.relative(docsDir, fullPath), content: text });
      usedChars += text.length;
    });
  };

  visit(docsDir);
  return collected;
}
/**
 * Phase 1: ask the LLM to answer one question from the documentation only.
 *
 * Builds a prompt containing the answering rules, the question text, its
 * expected answer format and the supplied documentation, then hands it to
 * `callLLM`. `maxTokens` (1024) and `temperature` (0.0) always override
 * whatever `llmOpts` carries.
 *
 * @param {{question: string, answerType: string}} question - Question record; only these two fields are read here.
 * @param {string} docsContext - Documentation text embedded verbatim in the prompt.
 * @param {object} [llmOpts] - Extra options forwarded to `callLLM`.
 * @returns {Promise<*>} Resolves to whatever `callLLM` returns.
 */
async function answerFromDocs(question, docsContext, llmOpts) {
  const rules = [
    'You are evaluating documentation quality. Answer the following question using ONLY the documentation provided below. If the documentation does not contain enough information to answer, say "NOT_FOUND".',
    'Be precise and factual. Match the expected answer format:',
    '- For "exact" answers: give the exact value',
    '- For "list" answers: list each item on its own line',
    '- For "ranked-list" answers: list items in order with counts',
    '- For "explanation" answers: give a concise explanation',
  ];
  const promptLines = [
    ...rules,
    `QUESTION: ${question.question}`,
    `EXPECTED FORMAT: ${question.answerType}`,
    'DOCUMENTATION:',
    `${docsContext}`,
    'Answer:',
  ];
  const prompt = promptLines.join('\n');
  return callLLM(prompt, { ...llmOpts, maxTokens: 1024, temperature: 0.0 });
}
/**
 * Phase 2: LLM-as-judge scores the answer against ground truth.
 *
 * Sends the question, its ground-truth answer and the candidate answer to the
 * judge through `callLLM`, then extracts the span from the first `{` to the
 * last `}` of the reply and parses it as JSON (this tolerates markdown fences
 * around the object).
 *
 * The three scores are coerced to numbers and clamped to the 0–5 range the
 * prompt asks for. If the reply is not a string, contains no parseable JSON
 * object, or any score is missing/non-numeric, a zero-score record whose
 * `notes` starts with "Parse error:" is returned instead of throwing, so one
 * bad judge reply cannot abort the run.
 *
 * @param {{question: string, answerType: string, answer: string}} question - Question record with its ground truth.
 * @param {string} llmAnswer - The answer produced in phase 1.
 * @param {object} [llmOpts] - Extra options forwarded to `callLLM`; `maxTokens` (256) and `temperature` (0.0) are always overridden.
 * @returns {Promise<{accuracy: number, completeness: number, precision: number, notes: *}>}
 *   Parsed judge verdict (any extra keys the judge returned are preserved), or the parse-error record.
 */
async function scoreAnswer(question, llmAnswer, llmOpts) {
  const prompt = `You are a strict evaluator scoring an AI's answer against ground truth.
QUESTION: ${question.question}
EXPECTED ANSWER TYPE: ${question.answerType}
GROUND TRUTH: ${question.answer}
AI ANSWER: ${llmAnswer}
Score the AI answer on these dimensions (0-5 each):
1. ACCURACY: Does the answer contain the correct facts? (0=wrong, 3=partially correct, 5=exactly correct)
2. COMPLETENESS: Does it cover all items in the ground truth? (0=missing everything, 3=partial, 5=complete)
3. PRECISION: Is it free of hallucinated or incorrect extra information? (0=lots of hallucination, 5=no hallucination)
If the AI answered "NOT_FOUND", score ACCURACY=0, COMPLETENESS=0, PRECISION=5 (it didn't hallucinate, it just couldn't find it).
Respond in EXACTLY this JSON format, nothing else:
{"accuracy": N, "completeness": N, "precision": N, "notes": "brief explanation"}`;
  const raw = await callLLM(prompt, { ...llmOpts, maxTokens: 256, temperature: 0.0 });

  // A null/undefined/non-string reply must not make the fallback below throw.
  const text = raw == null ? '' : String(raw);

  // Coerce one judge score to a number in [0, 5]; null signals "not a usable score".
  const toScore = (value) => {
    const n = typeof value === 'number' ? value : Number.parseFloat(value);
    return Number.isFinite(n) ? Math.min(5, Math.max(0, n)) : null;
  };

  try {
    // Extract JSON from response (handle markdown wrapping)
    const jsonMatch = text.match(/\{[\s\S]*\}/);
    if (jsonMatch) {
      const parsed = JSON.parse(jsonMatch[0]);
      const accuracy = toScore(parsed.accuracy);
      const completeness = toScore(parsed.completeness);
      const precision = toScore(parsed.precision);
      if (accuracy !== null && completeness !== null && precision !== null) {
        return { ...parsed, accuracy, completeness, precision };
      }
    }
  } catch {
    // Malformed JSON is expected occasionally; it is reported via the fallback record below.
  }
  return { accuracy: 0, completeness: 0, precision: 0, notes: `Parse error: ${text.substring(0, 100)}` };
}
/**
 * Build a focused doc context for a question (avoid sending every doc to
 * every question).
 *
 * Keywords are the lower-cased question words with leading/trailing
 * punctuation removed, longer than three characters, not in a small stop-word
 * list, and de-duplicated. Each doc is scored by how often each keyword
 * occurs in it (capped at 10 per keyword). Docs are then appended in
 * descending score order while they fit in `maxChars`; a doc that does not fit
 * is skipped so a smaller one can still be used. Once more than three docs
 * have been included, the first zero-score doc ends the selection.
 *
 * @param {{question: string}} question - Question record.
 * @param {{path: string, content: string}[]} docs - Candidate documents.
 * @param {number} [maxChars=50000] - Approximate size budget for the context.
 * @returns {string} Concatenated `--- path ---` sections, or '' if none fit.
 */
function buildContext(question, docs, maxChars = 50000) {
  const stopWords = new Set([
    'what', 'which', 'does', 'that', 'this', 'from', 'with', 'list',
    'them', 'their', 'each', 'many', 'most', 'across',
  ]);

  // Strip punctuation from word edges so "monorepo?" matches "monorepo";
  // interior punctuation (e.g. "eval.js") is kept.
  const keywords = [...new Set(
    question.question.toLowerCase().split(/\s+/)
      .map((w) => w.replace(/^[^\p{L}\p{N}]+|[^\p{L}\p{N}]+$/gu, ''))
      .filter((w) => w.length > 3 && !stopWords.has(w)),
  )];

  // Compile one literal-match regex per keyword up front instead of per doc.
  const patterns = keywords.map(
    (kw) => new RegExp(kw.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'), 'g'),
  );

  // Score each doc by keyword relevance
  const scored = docs.map((d) => {
    const lower = d.content.toLowerCase();
    let score = 0;
    for (const pattern of patterns) {
      const count = (lower.match(pattern) || []).length;
      score += Math.min(count, 10); // cap per keyword
    }
    return { ...d, score };
  });

  // Take top docs up to the character budget
  scored.sort((a, b) => b.score - a.score);
  let context = '';
  let used = 0;
  for (const d of scored) {
    if (d.score === 0 && used > 3) break;
    if (context.length + d.content.length > maxChars) continue;
    context += `\n--- ${d.path} ---\n${d.content}\n`;
    used++;
  }
  return context;
}
/**
 * Run the full evaluation.
 *
 * For every question: build a focused context from the docs, have the LLM
 * answer from that context (phase 1), then have the LLM judge the answer
 * against the ground truth (phase 2). A failure in either phase is recorded
 * for that question (an "ERROR: ..." answer, or an all-zero score) instead of
 * aborting the run. Progress is written to stdout.
 *
 * Answers are coerced to strings and score fields to finite numbers before
 * use, and an empty question list yields zeros rather than NaN.
 *
 * @param {string} docsDir - Directory containing the generated docs.
 * @param {string} questionsPath - JSON file with a top-level `questions` array.
 * @param {object} [llmOpts={}] - Options forwarded to both LLM phases.
 * @returns {Promise<object>} Report with overall/average scores, NOT_FOUND
 *   counts, per-category and per-difficulty averages, and per-question results.
 * @throws {Error} If the questions file cannot be read or parsed, or has no
 *   `questions` array.
 */
async function runEval(docsDir, questionsPath, llmOpts = {}) {
  const questionsData = JSON.parse(fs.readFileSync(questionsPath, 'utf8'));
  const questions = questionsData ? questionsData.questions : undefined;
  if (!Array.isArray(questions)) {
    throw new Error(`No "questions" array found in ${questionsPath}`);
  }
  const docs = loadDocs(docsDir);
  console.log(`Loaded ${docs.length} doc files, ${questions.length} questions`);

  // Coerce a score field to a finite number so totals never become NaN.
  const toScore = (value) => {
    const num = Number(value);
    return Number.isFinite(num) ? num : 0;
  };
  // Division that yields 0 instead of NaN when there is nothing to average.
  const ratio = (numerator, denominator) => (denominator === 0 ? 0 : numerator / denominator);
  // Average composite score per distinct value of `key` (category/difficulty).
  const summarizeBy = (rows, key) => {
    const groups = {};
    for (const r of rows) {
      if (!groups[r[key]]) groups[r[key]] = { total: 0, count: 0 };
      groups[r[key]].total += r.composite;
      groups[r[key]].count++;
    }
    return Object.fromEntries(
      Object.entries(groups).map(([name, s]) => [name, { avg: (s.total / s.count).toFixed(1), count: s.count }]),
    );
  };

  const results = [];
  let totalAccuracy = 0;
  let totalCompleteness = 0;
  let totalPrecision = 0;
  let notFound = 0;

  for (let i = 0; i < questions.length; i++) {
    const q = questions[i];
    process.stdout.write(`[${i + 1}/${questions.length}] ${q.id}...`);

    // Build focused context
    const context = buildContext(q, docs);

    // Phase 1: Answer from docs
    let llmAnswer;
    try {
      llmAnswer = await answerFromDocs(q, context, llmOpts);
    } catch (err) {
      llmAnswer = `ERROR: ${err.message}`;
    }
    // The NOT_FOUND check below needs a string; an absent reply becomes ''.
    if (typeof llmAnswer !== 'string') {
      llmAnswer = llmAnswer == null ? '' : String(llmAnswer);
    }

    // Phase 2: Score
    let rawScore;
    try {
      rawScore = await scoreAnswer(q, llmAnswer, llmOpts);
    } catch (err) {
      rawScore = { accuracy: 0, completeness: 0, precision: 0, notes: `Score error: ${err.message}` };
    }
    const scoreFields = rawScore !== null && typeof rawScore === 'object' ? rawScore : {};
    const score = {
      ...scoreFields,
      accuracy: toScore(scoreFields.accuracy),
      completeness: toScore(scoreFields.completeness),
      precision: toScore(scoreFields.precision),
    };

    const isNotFound = llmAnswer.includes('NOT_FOUND');
    if (isNotFound) notFound++;
    totalAccuracy += score.accuracy;
    totalCompleteness += score.completeness;
    totalPrecision += score.precision;

    // Composite: sum of the three 0-5 scores as a whole-number percentage of 15.
    const composite = ((score.accuracy + score.completeness + score.precision) / 15 * 100).toFixed(0);
    console.log(` ${composite}% (A:${score.accuracy} C:${score.completeness} P:${score.precision})${isNotFound ? ' [NOT_FOUND]' : ''}`);
    results.push({
      id: q.id,
      category: q.category,
      difficulty: q.difficulty,
      question: q.question,
      groundTruth: q.answer,
      llmAnswer,
      score,
      composite: Number(composite),
      notFound: isNotFound,
    });
  }

  // Aggregate scores
  const n = questions.length;
  const avgAccuracy = ratio(totalAccuracy, n).toFixed(2);
  const avgCompleteness = ratio(totalCompleteness, n).toFixed(2);
  const avgPrecision = ratio(totalPrecision, n).toFixed(2);
  const overallScore = (ratio(totalAccuracy + totalCompleteness + totalPrecision, n * 15) * 100).toFixed(1);

  const report = {
    timestamp: new Date().toISOString(),
    docsDir,
    questionsFile: questionsPath,
    totalQuestions: n,
    overallScore: Number(overallScore),
    avgAccuracy: Number(avgAccuracy),
    avgCompleteness: Number(avgCompleteness),
    avgPrecision: Number(avgPrecision),
    notFoundCount: notFound,
    notFoundRate: `${(ratio(notFound, n) * 100).toFixed(1)}%`,
    byCategory: summarizeBy(results, 'category'),
    byDifficulty: summarizeBy(results, 'difficulty'),
    results,
  };
  return report;
}
// CLI entry point: runs only when this file is executed directly.
// Arguments: <docs-dir> <questions.json> [output.json]; the model name comes
// from the LLM_MODEL environment variable.
if (require.main === module) {
  const [, , docsDir, questionsPath, outArg] = process.argv;
  const outPath = outArg || './eval-report.json';

  if (!docsDir || !questionsPath) {
    console.error('Usage: node eval.js <docs-dir> <questions.json> [output.json]');
    process.exit(1);
  }

  const model = process.env.LLM_MODEL || 'claude-haiku-4.5';
  console.log(`Using model: ${model}`);

  const divider = '═'.repeat(60);

  // Print one "name: avg% (count questions)" line per group under a heading.
  const printBreakdown = (heading, groups) => {
    console.log(heading);
    Object.entries(groups).forEach(([name, stats]) => {
      console.log(` ${name}: ${stats.avg}% (${stats.count} questions)`);
    });
  };

  const main = async () => {
    const report = await runEval(docsDir, questionsPath, { model });

    console.log(`\n${divider}`);
    console.log('EVAL REPORT');
    console.log(divider);
    console.log(`Overall Score: ${report.overallScore}%`);
    console.log(`Accuracy: ${report.avgAccuracy}/5 Completeness: ${report.avgCompleteness}/5 Precision: ${report.avgPrecision}/5`);
    console.log(`Not Found: ${report.notFoundCount}/${report.totalQuestions} (${report.notFoundRate})`);

    printBreakdown('\nBy Category:', report.byCategory);
    printBreakdown('\nBy Difficulty:', report.byDifficulty);

    // Worst performers: the five lowest composite scores (copy before sorting).
    const weakest = [...report.results]
      .sort((left, right) => left.composite - right.composite)
      .slice(0, 5);
    console.log('\nWeakest Questions:');
    weakest.forEach((entry) => {
      console.log(` [${entry.id}] ${entry.composite}% — ${entry.question.substring(0, 80)}...`);
    });

    fs.writeFileSync(outPath, JSON.stringify(report, null, 2));
    console.log(`\nFull report: ${outPath}`);
  };

  // Any failure (evaluation, printing, or writing the report) exits non-zero.
  main().catch((err) => {
    console.error('Eval failed:', err);
    process.exit(1);
  });
}

module.exports = { runEval, loadDocs, answerFromDocs, scoreAnswer };