feat: repo-agnostic refactor (BMad spec-test-build loop)
- NEW: repo-profiler.js — deterministic archetype detection (Infra, Frontend, Backend, etc.)
- NEW: extract-dynamic.js — generic extractor replacing hardcoded Foxtrot patterns
- NEW: eval-generator.js — dynamic ground-truth question generation from any repo graph
- NEW: specs/bmad-agnostic-refactor-spec.md — full BMad spec with acceptance criteria
- REFACTORED: prose.js — two-pass LLM synthesis with rich context (shared secrets, ports, service refs)
- REFACTORED: sysdoc.js — wired repo-profiler + extract-dynamic, --legacy escape hatch
- REFACTORED: wiggum-v2.sh — uses eval-generator before benchmarks
- FIXED: graph.js — _edgeSet rebuilt on loadSnapshot() (edge dedup was broken)
- FIXED: graph.js — recursive sortKeys() for deep equality in diffing
- FIXED: prose.js — robust JSON array extraction from LLM output
- FIXED: ratchet.js — syntax validation (node --check) before saving LLM mutations
- FIXED: extract-dynamic.js — centralized state services regex, added console.warn for silent failures
- TESTS: test-eval-generator, test-repo-profiler, test-synthesis-quality + mock fixtures

Eval: 81.5% on Foxtrot (fully repo-agnostic, no hardcoded reference pages)
BMad reviews: Architect B+, Dev Lead B-, TEA B-
This commit is contained in:
412
eval-generator.js
Normal file
412
eval-generator.js
Normal file
@@ -0,0 +1,412 @@
|
||||
/**
|
||||
* Eval Generator: Repo-Agnostic Question Bank
|
||||
*
|
||||
* Generates ground-truth Q&A pairs from graph + Helm data.
|
||||
* Questions target what the docs actually cover: subsystems, charts,
|
||||
* dependencies, interactions, contracts, resource types.
|
||||
*
|
||||
* Usage: node eval-generator.js <snapshot.json> <repo-root> [output.json] [--dry-run]
|
||||
*/
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const GraphStore = require('./graph.js');
|
||||
const { discoverCharts, chartsToGraph } = require('./extract-helm.js');
|
||||
const { buildSubsystems } = require('./subsystem.js');
|
||||
|
||||
// Names handed to discoverCharts() as its ignore set when looking for Helm charts:
// dependency/VCS/virtualenv/cache directories first, then agent/tooling directories.
const HELM_IGNORE = new Set([
  'node_modules',
  '.git',
  'venv',
  '__pycache__',
  '.terraform',
  '_bmad',
  '_bmad-output',
  '.codex',
  '.claude',
  '.cursor',
  '.gemini',
  '.kiro',
  '.agents',
]);
|
||||
|
||||
/**
 * Collect, per interaction target, the distinct chart names that reference it.
 *
 * @param {Array<object>} charts - Chart records; each is read for `chart.name`
 *   and an optional `interactions` array of `{ type, target }` objects.
 * @param {(interaction: object) => boolean} matches - Selects the interactions to group.
 * @returns {Object<string, string[]>} Null-prototype dictionary of
 *   target -> chart names (first-seen order, no repeats).
 */
function groupChartNamesByTarget(charts, matches) {
  // Null prototype: targets come from repository content, so a name such as
  // "toString" or "constructor" must behave as an ordinary key.
  const byTarget = Object.create(null);
  for (const c of charts) {
    for (const interaction of (c.interactions || [])) {
      if (!matches(interaction)) continue;
      const users = byTarget[interaction.target] || (byTarget[interaction.target] = []);
      // Dedupe here so "used by more than one chart" really means distinct charts,
      // not one chart referencing the same target several times.
      if (!users.includes(c.chart.name)) users.push(c.chart.name);
    }
  }
  return byTarget;
}

/**
 * Count resources per `kind` and rank the kinds by descending count.
 *
 * @param {Array<{kind: string}>} resources - Resource records to tally.
 * @returns {Array<[string, number]>} `[kind, count]` pairs, most frequent first;
 *   ties keep first-seen order (Array.prototype.sort is stable).
 */
function rankResourceKinds(resources) {
  const counts = Object.create(null);
  for (const r of resources) {
    counts[r.kind] = (counts[r.kind] || 0) + 1;
  }
  return Object.entries(counts).sort((a, b) => b[1] - a[1]);
}

/**
 * Render `[label, count]` pairs as one `label: count` line each.
 *
 * @param {Array<[string, number]>} pairs
 * @returns {string}
 */
function formatCountLines(pairs) {
  return pairs.map(([label, count]) => `${label}: ${count}`).join('\n');
}

/**
 * Generate ground-truth Q&A pairs for a repository from its graph snapshot
 * and the Helm charts discovered under `srcRoot`.
 *
 * Questions are emitted in a fixed category order (structural, dependency,
 * contract, configuration, interaction, resource, cross-subsystem,
 * architectural, scenario) and numbered `q-001`, `q-002`, ... in that order.
 * A question is only emitted when the data needed to answer it exists.
 *
 * @param {string} snapshotPath - Path handed to `GraphStore.loadSnapshot()`.
 * @param {string} srcRoot - Repository root used for chart discovery and as
 *   the subsystem `srcDir`.
 * @returns {Array<object>} Question objects with the keys `id`, `category`,
 *   `difficulty`, `audience`, `question`, `expected_answer`, `answer`,
 *   `answerType`, `source`, `source_entity` (in that order).
 */
function generateQuestions(snapshotPath, srcRoot) {
  const questions = [];
  let nextIndex = 1;

  // Single place where a question object is assembled, so ids stay sequential
  // and the key order of the emitted JSON stays stable. `answer` defaults to
  // `expected` because most questions use the same text for both.
  const addQuestion = ({
    category, difficulty, audience, question,
    expected, answer = expected, answerType, source, sourceEntity,
  }) => {
    questions.push({
      id: `q-${String(nextIndex++).padStart(3, '0')}`,
      category,
      difficulty,
      audience,
      question,
      expected_answer: expected,
      answer,
      answerType,
      source,
      source_entity: sourceEntity,
    });
  };

  // Load graph
  const graph = GraphStore.loadSnapshot(snapshotPath);

  // Load Helm data
  const charts = discoverCharts(srcRoot, HELM_IGNORE);

  // Merge Helm into graph (same as sysdoc.js)
  if (charts.length > 0) {
    const helmGraph = chartsToGraph(charts, srcRoot);
    for (const e of helmGraph.entities) {
      // Helm entities are attributed to their chart's Chart.yaml.
      const fakePath = e.dir
        ? path.join(srcRoot, e.dir, 'Chart.yaml')
        : path.join(srcRoot, 'Chart.yaml');
      graph.nodes.set(e.id, { ...e, type: e.type || 'Module', _file: fakePath });
      if (!graph.fileIndex.has(fakePath)) graph.fileIndex.set(fakePath, new Set());
      graph.fileIndex.get(fakePath).add(e.id);
    }
    for (const r of helmGraph.relationships) {
      graph.edges.push(r);
    }
  }

  // Build subsystems (srcDir always carries a trailing slash)
  const root = srcRoot || '';
  const subs = buildSubsystems(graph, {
    srcDir: root.endsWith('/') ? root : `${root}/`,
    minTraffic: 3,
    crossCuttingThreshold: 0.6,
  });

  // ─── Category 1: Structural ───

  // Total chart count
  if (charts.length > 0) {
    addQuestion({
      category: 'structural', difficulty: 'easy',
      audience: ['human', 'machine'],
      question: 'How many Helm charts are defined across this repository?',
      expected: String(charts.length),
      answerType: 'exact',
      source: 'Chart.yaml discovery',
      sourceEntity: 'all charts',
    });
  }

  // Subsystem count
  if (subs.subsystems.length > 0) {
    addQuestion({
      category: 'structural', difficulty: 'easy',
      audience: ['human', 'machine'],
      question: 'How many subsystems does this codebase contain?',
      expected: String(subs.subsystems.length),
      answerType: 'exact',
      source: 'subsystem aggregation',
      sourceEntity: 'all subsystems',
    });
  }

  // Top charts by K8s resources
  const topByResources = charts
    .filter(c => c.templates && c.templates.resources)
    .sort((a, b) => b.templates.resources.length - a.templates.resources.length)
    .slice(0, 5);
  if (topByResources.length > 0 && topByResources[0].templates.resources.length > 0) {
    addQuestion({
      category: 'structural', difficulty: 'medium',
      audience: ['human', 'machine'],
      question: 'Which 5 Helm charts produce the most Kubernetes resources? List them with their resource counts.',
      expected: topByResources
        .map(c => `${c.chart.name} (${c.dir}): ${c.templates.resources.length}`)
        .join('\n'),
      answerType: 'ranked-list',
      source: 'template scanning',
      sourceEntity: topByResources.map(c => c.chart.name).join(', '),
    });
  }

  // Largest subsystem
  const [largestSub] = [...subs.subsystems].sort((a, b) => b.files.length - a.files.length);
  if (largestSub) {
    addQuestion({
      category: 'structural', difficulty: 'easy',
      audience: ['human'],
      question: 'Which subsystem contains the most files, and how many?',
      expected: `${largestSub.name}: ${largestSub.files.length} files`,
      answerType: 'exact',
      source: 'subsystem aggregation',
      sourceEntity: largestSub.name,
    });
  }

  // ─── Category 2: Dependencies ───

  const chartsWithDeps = charts.filter(c => c.chart.dependencies && c.chart.dependencies.length > 0);
  for (const c of chartsWithDeps.slice(0, 5)) {
    addQuestion({
      category: 'dependency', difficulty: 'medium',
      audience: ['human', 'machine'],
      question: `What are the dependencies of the "${c.chart.name}" chart (at ${c.dir})?`,
      expected: c.chart.dependencies
        .map(d => `${d.name} (${d.version})${d.condition ? ` [condition: ${d.condition}]` : ''}`)
        .join('\n'),
      answerType: 'list',
      source: `${c.dir}/Chart.yaml`,
      sourceEntity: c.chart.name,
    });
  }

  // ─── Category 3: Contracts (shared secrets/configs) ───

  const configUsers = groupChartNamesByTarget(charts, i => i.type === 'config-ref');
  const sharedSecrets = Object.entries(configUsers).filter(([, users]) => users.length > 1);
  if (sharedSecrets.length > 0) {
    addQuestion({
      category: 'contract', difficulty: 'medium',
      audience: ['human', 'machine'],
      question: 'Which secrets or ConfigMaps are shared across multiple Helm charts? List each with the charts that use it.',
      expected: sharedSecrets.map(([name, users]) => `${name}: ${users.join(', ')}`).join('\n'),
      answerType: 'list',
      source: 'template interaction scanning',
      sourceEntity: sharedSecrets.map(([name]) => name).join(', '),
    });
  }

  // ─── Category 4: Configuration Surface ───

  // Top charts by config surface (most values.yaml keys)
  const chartsByValues = charts
    .filter(c => c.values && c.values.keys && c.values.keys.length > 5)
    .sort((a, b) => b.values.keys.length - a.values.keys.length);

  for (const chart of chartsByValues.slice(0, 5)) {
    addQuestion({
      category: 'configuration', difficulty: 'easy',
      audience: ['machine'],
      question: `How many top-level configuration keys does the ${chart.chart.name} chart (at ${chart.dir}) expose in its values.yaml?`,
      expected: String(chart.values.keys.length),
      answerType: 'exact',
      source: `${chart.dir}/values.yaml`,
      sourceEntity: chart.chart.name,
    });
  }

  // ─── Category 5: Interactions (service-to-service) ───

  const svcRefs = charts.flatMap(c =>
    (c.interactions || [])
      .filter(i => i.type === 'k8s-service')
      .map(i => ({ from: c.chart.name, dir: c.dir, to: i.target })));
  if (svcRefs.length > 0) {
    addQuestion({
      category: 'interaction', difficulty: 'medium',
      audience: ['human', 'machine'],
      question: 'Which Helm charts reference Kubernetes services from other components? List each chart and the service it calls.',
      expected: svcRefs.map(r => `${r.from} (${r.dir}) → ${r.to}`).join('\n'),
      answerType: 'list',
      source: 'template interaction scanning',
      sourceEntity: svcRefs.map(r => r.from).join(', '),
    });
  }

  // Shared ports (a target of '0' is skipped)
  const portUsers = groupChartNamesByTarget(charts, i => i.type === 'port' && i.target !== '0');
  const sharedPorts = Object.entries(portUsers)
    .filter(([, users]) => users.length > 1)
    .sort((a, b) => Number(a[0]) - Number(b[0]));
  if (sharedPorts.length > 0) {
    addQuestion({
      category: 'interaction', difficulty: 'hard',
      audience: ['human', 'machine'],
      question: 'Which network ports are used by multiple Helm charts? List each port and the charts that expose it.',
      expected: sharedPorts.map(([port, users]) => `Port ${port}: ${users.join(', ')}`).join('\n'),
      answerType: 'list',
      source: 'template port scanning',
      sourceEntity: 'shared ports',
    });
  }

  // ─── Category 6: Resource Types ───

  const topKinds = rankResourceKinds(charts.flatMap(c => c.templates?.resources || [])).slice(0, 10);
  if (topKinds.length > 0) {
    addQuestion({
      category: 'resource', difficulty: 'medium',
      audience: ['human', 'machine'],
      question: 'What are the most common Kubernetes resource types generated across all Helm charts?',
      expected: formatCountLines(topKinds),
      answerType: 'ranked-list',
      source: 'template resource scanning',
      sourceEntity: 'all charts',
    });
  }

  // Per-chart resource breakdown for top 3 charts by resource count
  for (const chart of topByResources.slice(0, 3)) {
    addQuestion({
      category: 'resource', difficulty: 'medium',
      audience: ['human', 'machine'],
      question: `What Kubernetes resource types does the ${chart.chart.name} chart generate? List each type and count.`,
      expected: formatCountLines(rankResourceKinds(chart.templates.resources)),
      answerType: 'list',
      source: `${chart.dir}/templates/`,
      sourceEntity: chart.chart.name,
    });
  }

  // ─── Category 7: Cross-Subsystem ───

  if (subs.crossCutting && subs.crossCutting.length > 0) {
    const crossCuttingNames = subs.crossCutting.join(', ');
    addQuestion({
      category: 'cross-subsystem', difficulty: 'easy',
      audience: ['human'],
      question: 'Which subsystems are identified as cross-cutting concerns?',
      expected: crossCuttingNames,
      answerType: 'list',
      source: 'subsystem aggregation',
      sourceEntity: crossCuttingNames,
    });
  }

  // Dependency matrix questions. Matrix keys are read as "<from>→<to>".
  if (subs.dependencyMatrix) {
    const weight = v => v.calls + v.imports;
    const heavyDeps = Object.entries(subs.dependencyMatrix)
      .filter(([, v]) => weight(v) > 3)
      .sort((a, b) => weight(b[1]) - weight(a[1]))
      .slice(0, 5);
    if (heavyDeps.length > 0) {
      // Ask about the target of the single heaviest edge.
      const targetSub = heavyDeps[0][0].split('→')[1];
      const depsForTarget = heavyDeps.filter(([k]) => k.endsWith(`→${targetSub}`));
      if (depsForTarget.length > 0) {
        addQuestion({
          category: 'cross-subsystem', difficulty: 'hard',
          audience: ['human', 'machine'],
          question: `Which subsystems depend on ${targetSub}, and how heavily (by call+import count)?`,
          expected: depsForTarget.map(([k, v]) => `${k.split('→')[0]}: ${weight(v)}`).join('\n'),
          answerType: 'list',
          source: 'dependency matrix',
          sourceEntity: targetSub,
        });
      }
    }
  }

  // ─── Category 8: Architectural ───

  // Empty subsystems (Helm-only)
  const emptySubs = subs.subsystems.filter(s => s.entities.functions === 0 && s.entities.modules === 0);
  if (emptySubs.length > 0) {
    const emptyNames = emptySubs.map(s => s.name).join(', ');
    addQuestion({
      category: 'architectural', difficulty: 'hard',
      audience: ['human'],
      question: `The following subsystems have 0 detected functions and 0 modules: ${emptyNames}. Why might this be the case, and what do they actually contain?`,
      expected: 'These subsystems primarily contain Helm charts with Go-templated YAML, Terraform HCL, and/or Crossplane compositions. The code analysis pipeline detects functions/modules from Python, Go, TypeScript, and shell scripts — but Helm templates use Go template syntax which does not produce traditional function/module entities. Their content is captured through the Helm chart extraction phase instead.',
      answer: 'These subsystems primarily contain Helm charts, Terraform, or Crossplane compositions rather than traditional code.',
      answerType: 'explanation',
      source: 'architectural analysis',
      sourceEntity: emptyNames,
    });
  }

  // Chart version for top chart
  if (charts.length > 0) {
    const topChart = topByResources[0] || charts[0];
    // appVersion may be absent; avoid leaking the literal text "undefined"
    // into a ground-truth answer.
    const appVersion = topChart.chart.appVersion ?? 'not set';
    addQuestion({
      category: 'architectural', difficulty: 'easy',
      audience: ['human', 'machine'],
      question: `What is the current version and appVersion of the ${topChart.chart.name} Helm chart?`,
      expected: `version: ${topChart.chart.version}, appVersion: ${appVersion}`,
      answerType: 'exact',
      source: `${topChart.dir}/Chart.yaml`,
      sourceEntity: topChart.chart.name,
    });
  }

  // ─── Category 9: Scenario-Based ───

  // Secret rotation scenario
  if (sharedSecrets.length > 0) {
    const [secretName, secretUsers] = sharedSecrets[0];
    addQuestion({
      category: 'scenario', difficulty: 'hard',
      audience: ['human'],
      question: `If you need to rotate the "${secretName}" shared secret, which Helm charts would be affected and need redeployment?`,
      expected: secretUsers.join(', '),
      answerType: 'list',
      source: 'template interaction scanning',
      sourceEntity: secretName,
    });
  }

  // Deployment scenario for top chart
  if (topByResources.length > 0) {
    const chart = topByResources[0];
    const headline = `Chart: ${chart.chart.name} (${chart.dir}), Version: ${chart.chart.version}`;
    const valueKeys = chart.values?.keys || [];
    // Only the first 10 value keys are spelled out; the rest are summarised.
    const overflow = valueKeys.length > 10 ? ` (+${valueKeys.length - 10} more)` : '';
    const interactions = (chart.interactions || []).map(i => `${i.type}: ${i.target}`).join(', ');
    addQuestion({
      category: 'scenario', difficulty: 'hard',
      audience: ['human'],
      question: `A new engineer needs to deploy the ${chart.chart.name} application. What charts, configuration values, and external dependencies should they understand first?`,
      expected: [
        headline,
        `Key values: ${valueKeys.slice(0, 10).map(k => k.name).join(', ')}${overflow}`,
        `Resources generated: ${chart.templates.resources.length} K8s resources`,
        `Interactions: ${interactions || 'none detected'}`,
      ].join('\n'),
      answer: headline,
      answerType: 'explanation',
      source: `${chart.dir}`,
      sourceEntity: chart.chart.name,
    });
  }

  return questions;
}
|
||||
|
||||
module.exports = { generateQuestions };

// CLI entry point:
//   node eval-generator.js <snapshot.json> <repo-root> [output.json] [--dry-run]
// Prints a per-category / per-difficulty summary, then either writes the
// question bank to [output.json] or, when no output file is given, prints the
// JSON to stdout. With --dry-run the output file is never written.
if (require.main === module) {
  const cliArgs = process.argv.slice(2);
  // --dry-run is advertised in the usage text, so honour it instead of
  // discarding it together with the other dash-prefixed arguments.
  const dryRun = cliArgs.includes('--dry-run');
  const [snapshotPath, srcRoot, outFile = null] = cliArgs.filter(a => !a.startsWith('-'));

  if (!snapshotPath || !srcRoot) {
    console.error('Usage: node eval-generator.js <snapshot.json> <repo-root> [output.json] [--dry-run]');
    process.exit(1);
  }

  if (!fs.existsSync(snapshotPath)) {
    console.error(`Snapshot not found: ${snapshotPath}`);
    process.exit(1);
  }

  const questions = generateQuestions(snapshotPath, srcRoot);

  // Summary: question count per category (largest first) and per difficulty.
  const tally = (key) => {
    const counts = {};
    for (const q of questions) {
      counts[q[key]] = (counts[q[key]] || 0) + 1;
    }
    return counts;
  };
  const byCategory = tally('category');
  const byDifficulty = tally('difficulty');

  console.log(`Generated ${questions.length} questions:`);
  for (const [cat, count] of Object.entries(byCategory).sort((a, b) => b[1] - a[1])) {
    console.log(` ${cat}: ${count}`);
  }
  console.log(`Difficulty: easy=${byDifficulty.easy || 0}, medium=${byDifficulty.medium || 0}, hard=${byDifficulty.hard || 0}`);

  const result = { generated: new Date().toISOString(), count: questions.length, questions };
  const json = JSON.stringify(result, null, 2);

  if (outFile && dryRun) {
    console.log(`\nDry run: not writing ${outFile}`);
  } else if (outFile) {
    fs.writeFileSync(outFile, json);
    console.log(`\nWritten to ${outFile}`);
  } else {
    console.log(json);
  }
}
|
||||
Reference in New Issue
Block a user