397 lines
14 KiB
JavaScript
397 lines
14 KiB
JavaScript
|
|
/**
 * Quality Ratchet: Automated Doc Improvement Loop
 *
 * BMad-inspired workflow:
 * 1. EVAL: Run question bank against docs, get scores
 * 2. DIAGNOSE: Identify weakest categories and specific failures
 * 3. FIX: LLM proposes concrete code changes to the doc generator
 * 4. APPLY: Apply fixes, regenerate docs
 * 5. RE-EVAL: Score again
 * 6. RATCHET: If passing threshold, add harder questions
 * 7. REPEAT until target score or max iterations
 *
 * Usage: node ratchet.js <foxtrot-root> <snapshot> <docs-dir> [--target=80] [--max-iter=5]
 */
|
||
|
|
|
||
|
|
const fs = require('fs');
|
||
|
|
const path = require('path');
|
||
|
|
const { execSync } = require('child_process');
|
||
|
|
const { callLLM } = require('./prose.js');
|
||
|
|
const { runEval } = require('./eval.js');
|
||
|
|
const { generateQuestions } = require('./eval-questions.js');
|
||
|
|
|
||
|
|
// Directory containing this script. sysdoc.js, extract-helm.js, the eval question
// files and ratchet-history.json are all resolved relative to it.
const PROJECT_DIR = __dirname;
|
||
|
|
|
||
|
|
/**
 * Parse `process.argv` into the options object used by the ratchet loop.
 *
 * Positional arguments, in order: source root, snapshot path, docs directory.
 * Arguments starting with `--` are never counted as positionals, so flags may
 * appear before, between or after them.
 *
 * Flags:
 *   --target=<number>    score threshold (default 80)
 *   --max-iter=<number>  iteration cap (default 5)
 * A flag whose value is empty or not a finite number is ignored (the default is
 * kept) and a warning is written to stderr.
 *
 * The eval and fix models come from the LLM_MODEL and FIX_MODEL environment
 * variables, falling back to built-in defaults.
 *
 * @returns {{srcRoot: (string|undefined), snapshot: (string|undefined),
 *   docsDir: (string|undefined), target: number, maxIter: number,
 *   model: string, fixModel: string}}
 */
function parseArgs() {
  const args = process.argv.slice(2);
  const [srcRoot, snapshot, docsDir] = args.filter((arg) => !arg.startsWith('--'));

  const opts = {
    srcRoot,
    snapshot,
    docsDir,
    target: 80,
    maxIter: 5,
    model: process.env.LLM_MODEL || 'claude-haiku-4.5',
    fixModel: process.env.FIX_MODEL || 'claude-sonnet-4.6',
  };

  // Returns the numeric value of `--name=value`, or `fallback` when the value is
  // missing or not a finite number (Number('') is 0 and Number('abc') is NaN).
  const readNumber = (arg, fallback) => {
    const rawValue = arg.slice(arg.indexOf('=') + 1).trim();
    const value = Number(rawValue);
    if (rawValue === '' || !Number.isFinite(value)) {
      console.error(`Ignoring invalid option "${arg}" (expected a number)`);
      return fallback;
    }
    return value;
  };

  for (const arg of args) {
    if (arg.startsWith('--target=')) opts.target = readNumber(arg, opts.target);
    if (arg.startsWith('--max-iter=')) opts.maxIter = readNumber(arg, opts.maxIter);
  }
  return opts;
}
|
||
|
|
|
||
|
|
/**
 * Diagnose evaluation failures and ask the fix model for a fix plan.
 *
 * Selects up to ten results with `composite < 60` (worst first), formats them
 * together with the report's summary scores into a prompt, and sends it to
 * `callLLM` using `opts.fixModel`. No LLM call is made when nothing scores
 * below 60.
 *
 * @param {object} report - Eval report; reads `results`, `overallScore`,
 *   `byCategory` and `notFoundRate`.
 * @param {object} opts - Ratchet options; only `fixModel` is read.
 * @returns {Promise<{diagnosis: string, fixes: Array<object>}>} The plan. It
 *   always has a string `diagnosis` and an array `fixes`, even when the model's
 *   reply is not valid JSON or omits those fields; any other fields the model
 *   returned are passed through.
 */
async function diagnose(report, opts) {
  // filter() returns a fresh array, so the in-place sort leaves report.results untouched.
  const failures = report.results
    .filter((r) => r.composite < 60)
    .sort((a, b) => a.composite - b.composite)
    .slice(0, 10);

  if (failures.length === 0) return { fixes: [], diagnosis: 'All questions above 60%. No critical failures.' };

  // Answers can be missing on a result; coerce before truncating.
  const excerpt = (value, maxLength) => String(value ?? '').substring(0, maxLength);

  const failureDetails = failures.map((f) =>
    `[${f.id}] Score: ${f.composite}% (A:${f.score?.accuracy} C:${f.score?.completeness} P:${f.score?.precision})
Q: ${f.question}
Expected: ${excerpt(f.groundTruth, 300)}
Got: ${excerpt(f.llmAnswer, 300)}
Judge notes: ${f.score?.notes}
NOT_FOUND: ${f.notFound}`
  ).join('\n\n');

  const prompt = `You are a documentation pipeline engineer. Analyze these evaluation failures and propose CONCRETE fixes to the doc generator code.

## Current Pipeline
The doc generator (sysdoc.js) produces Markdown documentation from:
1. Code analysis graph (subsystems, functions, modules)
2. Helm chart extraction (Chart.yaml, values.yaml, templates)
3. LLM prose generation

## Failures (sorted worst-first)
${failureDetails}

## Score Summary
Overall: ${report.overallScore}%
By category: ${JSON.stringify(report.byCategory)}
NOT_FOUND rate: ${report.notFoundRate}

## Key Source Files
sysdoc.js generates the docs. extract-helm.js extracts Helm data.

## Rules
1. Each fix must be a SPECIFIC change to sysdoc.js or extract-helm.js
2. Focus on information that IS extracted but NOT surfaced in the docs
3. If data is missing from extraction, propose extraction improvements
4. Prioritize fixes that improve multiple questions at once
5. Do NOT propose changes to the eval harness or questions

Respond in this JSON format:
{
  "diagnosis": "2-3 sentence summary of root causes",
  "fixes": [
    {
      "id": "fix-001",
      "file": "sysdoc.js",
      "description": "What to change and why",
      "impact": ["question-id-1", "question-id-2"],
      "expectedImprovement": "+15%",
      "code_hint": "Brief description of the code change needed"
    }
  ]
}`;

  const raw = await callLLM(prompt, { model: opts.fixModel, maxTokens: 2048, temperature: 0.2 });
  const text = typeof raw === 'string' ? raw : String(raw ?? '');

  // The model may wrap the JSON in prose; take the outermost {...} span.
  const jsonMatch = text.match(/\{[\s\S]*\}/);
  if (jsonMatch) {
    try {
      const parsed = JSON.parse(jsonMatch[0]);
      // Normalise the shape so callers can rely on `diagnosis` and `fixes`.
      return {
        ...parsed,
        diagnosis: typeof parsed.diagnosis === 'string' ? parsed.diagnosis : text.substring(0, 500),
        fixes: Array.isArray(parsed.fixes) ? parsed.fixes : [],
      };
    } catch (err) {
      console.log(` ⚠ Could not parse diagnosis JSON: ${err.message}`);
    }
  }

  // Fall back to the raw reply as the diagnosis, with no actionable fixes.
  return { diagnosis: text.substring(0, 500), fixes: [] };
}
|
||
|
|
|
||
|
|
/**
 * Apply one proposed fix: ask the fix model for SEARCH/REPLACE edits against the
 * target file and write the result back to disk.
 *
 * `fix.file` originates from LLM output, so only the two generator files that
 * the diagnose prompt allows (and that the loop's revert step restores) may be
 * edited; anything else is refused.
 *
 * Each SEARCH block is matched literally against the current text and only its
 * first occurrence is replaced. Blocks that do not match are skipped with a
 * warning; the file is written if at least one block applied.
 *
 * @param {object} fix - Fix descriptor; reads `id`, `file`, `description`, `code_hint`.
 * @param {object} opts - Ratchet options; only `fixModel` is read.
 * @returns {Promise<boolean>} true when at least one change was written.
 */
async function applyFix(fix, opts) {
  const EDITABLE_FILES = ['sysdoc.js', 'extract-helm.js'];
  // normalize() folds forms like "./sysdoc.js"; traversal such as "../x" stays outside the list.
  const fileName = path.normalize(String(fix.file ?? ''));
  if (!EDITABLE_FILES.includes(fileName)) {
    console.log(` ⚠ Refusing to modify ${fix.file} for fix ${fix.id}: only ${EDITABLE_FILES.join(', ')} may be edited`);
    return false;
  }

  const filePath = path.join(PROJECT_DIR, fileName);
  const source = fs.readFileSync(filePath, 'utf8');

  const prompt = `You are modifying ${fileName} to improve documentation quality.

CHANGE NEEDED: ${fix.description}
CODE HINT: ${fix.code_hint}

CURRENT SOURCE (${fileName}):
${source}

Generate ONLY the specific code change. Output in this format:
SEARCH:
\`\`\`
exact lines to find
\`\`\`
REPLACE:
\`\`\`
replacement lines
\`\`\`

If multiple changes are needed, repeat the SEARCH/REPLACE blocks.
Be precise — the SEARCH text must match the source exactly.`;

  const raw = await callLLM(prompt, { model: opts.fixModel, maxTokens: 4096, temperature: 0.1 });

  // Parse SEARCH/REPLACE blocks; an optional language tag after the fence is ignored.
  const blockRegex = /SEARCH:\s*```[^\n]*\n([\s\S]*?)```\s*REPLACE:\s*```[^\n]*\n([\s\S]*?)```/g;
  const changes = [];
  let match;
  while ((match = blockRegex.exec(raw)) !== null) {
    changes.push({ search: match[1].trimEnd(), replace: match[2].trimEnd() });
  }

  if (changes.length === 0) {
    console.log(` ⚠ No parseable changes from LLM for fix ${fix.id}`);
    return false;
  }

  let modified = source;
  let applied = 0;
  for (const change of changes) {
    // An empty SEARCH would "match" at offset 0 and prepend text; treat it as not found.
    if (change.search !== '' && modified.includes(change.search)) {
      // A replacer function keeps the new code literal: with a string replacement,
      // sequences such as $&, $$ or $1 in the generated code would be expanded.
      modified = modified.replace(change.search, () => change.replace);
      applied++;
    } else {
      console.log(` ⚠ SEARCH block not found in ${fileName} for fix ${fix.id}`);
    }
  }

  if (applied === 0) return false;

  fs.writeFileSync(filePath, modified);
  console.log(` ✓ Applied ${applied}/${changes.length} changes to ${fileName}`);
  return true;
}
|
||
|
|
|
||
|
|
/**
 * Delete the docs directory and rebuild it by running sysdoc.js with `--prose`.
 *
 * `opts.docsDir` is resolved against PROJECT_DIR, which is also the working
 * directory of the generator process. The directory is removed through the fs
 * API and the generator is started with an argument array, so paths containing
 * spaces or shell metacharacters are passed through literally.
 *
 * @param {object} opts - Ratchet options; reads `docsDir`, `snapshot`, `srcRoot`, `model`.
 * @throws {Error} When `docsDir` is empty or resolves to PROJECT_DIR or the
 *   filesystem root, or when the generator fails or exceeds the 10-minute timeout.
 */
function regenerateDocs(opts) {
  console.log(' Regenerating docs...');

  if (typeof opts.docsDir !== 'string' || opts.docsDir.trim() === '') {
    throw new Error('regenerateDocs: docsDir must be a non-empty path');
  }
  const docsPath = path.resolve(PROJECT_DIR, opts.docsDir);
  // Never recursively delete the project itself or the filesystem root.
  if (docsPath === path.resolve(PROJECT_DIR) || docsPath === path.parse(docsPath).root) {
    throw new Error(`regenerateDocs: refusing to delete docsDir "${opts.docsDir}"`);
  }
  fs.rmSync(docsPath, { recursive: true, force: true });

  // Local require: execFileSync is only needed here.
  const { execFileSync } = require('child_process');
  execFileSync(
    process.execPath,
    ['sysdoc.js', opts.snapshot, opts.srcRoot, opts.docsDir, '--prose'],
    {
      cwd: PROJECT_DIR,
      env: { ...process.env, LLM_MODEL: opts.model },
      timeout: 600000,
      stdio: 'pipe',
    },
  );
}
|
||
|
|
|
||
|
|
/** Generate harder questions based on current performance */
|
||
|
|
function ratchetQuestions(currentQuestions, report, srcRoot, snapshotPath) {
|
||
|
|
// Find categories scoring > 90% — make them harder
|
||
|
|
const easyCategories = Object.entries(report.byCategory)
|
||
|
|
.filter(([, s]) => Number(s.avg) > 90)
|
||
|
|
.map(([cat]) => cat);
|
||
|
|
|
||
|
|
if (easyCategories.length === 0) return currentQuestions;
|
||
|
|
|
||
|
|
console.log(` Ratcheting: categories scoring >90%: ${easyCategories.join(', ')}`);
|
||
|
|
|
||
|
|
// Add more specific questions for high-scoring categories
|
||
|
|
const newQuestions = [...currentQuestions];
|
||
|
|
const { discoverCharts } = require('./extract-helm.js');
|
||
|
|
const charts = discoverCharts(srcRoot, new Set(['node_modules', '.git', 'venv', '__pycache__', '.terraform', '_bmad', '_bmad-output', '.codex', '.claude']));
|
||
|
|
|
||
|
|
if (easyCategories.includes('configuration')) {
|
||
|
|
// Add nested value questions (harder than top-level)
|
||
|
|
for (const chart of charts.filter(c => c.values.keys.length > 20).slice(0, 3)) {
|
||
|
|
const objectKeys = chart.values.keys.filter(k => k.type === 'object');
|
||
|
|
if (objectKeys.length > 0) {
|
||
|
|
newQuestions.push({
|
||
|
|
id: `ratchet-config-${chart.chart.name}-nested`,
|
||
|
|
category: 'configuration',
|
||
|
|
difficulty: 'hard',
|
||
|
|
audience: ['human', 'machine'],
|
||
|
|
question: `In the ${chart.chart.name} chart, which configuration keys are complex objects (not simple values)? List them.`,
|
||
|
|
answer: objectKeys.map(k => k.name).join(', '),
|
||
|
|
answerType: 'list',
|
||
|
|
source: `${chart.dir}/values.yaml`,
|
||
|
|
});
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
if (easyCategories.includes('resource')) {
|
||
|
|
// Add cross-chart resource comparison questions
|
||
|
|
const deployers = charts.filter(c => c.templates.resources.some(r => r.kind === 'Deployment'));
|
||
|
|
if (deployers.length > 0) {
|
||
|
|
newQuestions.push({
|
||
|
|
id: 'ratchet-resource-deployments',
|
||
|
|
category: 'resource',
|
||
|
|
difficulty: 'hard',
|
||
|
|
audience: ['human', 'machine'],
|
||
|
|
question: 'Which Helm charts create Kubernetes Deployments? List all of them.',
|
||
|
|
answer: deployers.map(c => `${c.chart.name} (${c.dir})`).join('\n'),
|
||
|
|
answerType: 'list',
|
||
|
|
source: 'template scanning',
|
||
|
|
});
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
if (easyCategories.includes('dependency')) {
|
||
|
|
// Add transitive dependency questions
|
||
|
|
const withDeps = charts.filter(c => c.chart.dependencies.length > 2);
|
||
|
|
for (const chart of withDeps.slice(0, 2)) {
|
||
|
|
newQuestions.push({
|
||
|
|
id: `ratchet-dep-${chart.chart.name}-conditions`,
|
||
|
|
category: 'dependency',
|
||
|
|
difficulty: 'hard',
|
||
|
|
audience: ['machine'],
|
||
|
|
question: `What are the enable conditions for each dependency of the "${chart.chart.name}" chart?`,
|
||
|
|
answer: chart.chart.dependencies.map(d => `${d.name}: ${d.condition || 'always enabled'}`).join('\n'),
|
||
|
|
answerType: 'list',
|
||
|
|
source: `${chart.dir}/Chart.yaml`,
|
||
|
|
});
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
const added = newQuestions.length - currentQuestions.length;
|
||
|
|
if (added > 0) console.log(` Added ${added} harder questions`);
|
||
|
|
return newQuestions;
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
 * Main ratchet loop: eval → diagnose → fix → regenerate, repeated until the
 * target score is reached (and no harder questions can be added) or
 * `opts.maxIter` iterations have run.
 *
 * Per iteration:
 *  1. Writes the current questions to eval-questions-iter<N>.json and runs `runEval`.
 *  2. If the score meets the target, tries to add harder questions; continues
 *     with them if any were added, otherwise stops.
 *  3. Otherwise calls `diagnose`, applies at most three of its fixes via
 *     `applyFix`, and regenerates the docs. If regeneration fails, sysdoc.js and
 *     extract-helm.js are reverted with `git checkout` and the loop stops.
 *
 * Fix descriptors come from LLM output, so missing fields and per-fix errors
 * are tolerated rather than aborting the run. The history file is written in
 * every case where the loop itself finishes.
 *
 * NOTE(review): fixes applied in the final iteration are never re-evaluated, so
 * `finalScore` is the score of the last eval that ran, not of the final docs.
 *
 * @param {object} opts - Reads `target`, `maxIter`, `model`, `fixModel`,
 *   `srcRoot`, `snapshot`, `docsDir`.
 * @returns {Promise<{history: Array<object>, finalScore: number, passed: boolean}>}
 * @throws {Error} When eval-questions.json has no `questions` array.
 */
async function ratchetLoop(opts) {
  console.log('═'.repeat(60));
  console.log('QUALITY RATCHET — BMad Improvement Loop');
  console.log('═'.repeat(60));
  console.log(`Target: ${opts.target}% Max iterations: ${opts.maxIter}`);
  console.log(`Eval model: ${opts.model} Fix model: ${opts.fixModel}`);
  console.log();

  // Load initial questions
  const questionsPath = path.join(PROJECT_DIR, 'eval-questions.json');
  const questionsData = JSON.parse(fs.readFileSync(questionsPath, 'utf8'));
  let questions = questionsData.questions;
  if (!Array.isArray(questions)) {
    throw new Error(`${questionsPath} does not contain a "questions" array`);
  }

  const history = [];

  for (let iter = 1; iter <= opts.maxIter; iter++) {
    console.log(`\n${'─'.repeat(60)}`);
    console.log(`ITERATION ${iter}/${opts.maxIter}`);
    console.log('─'.repeat(60));

    // Write current questions
    const iterQuestionsPath = path.join(PROJECT_DIR, `eval-questions-iter${iter}.json`);
    fs.writeFileSync(iterQuestionsPath, JSON.stringify({ generated: new Date().toISOString(), count: questions.length, questions }, null, 2));

    // Step 1: Eval
    console.log('\n📊 EVAL');
    const report = await runEval(opts.docsDir, iterQuestionsPath, { model: opts.model });

    console.log(` Score: ${report.overallScore}% (A:${report.avgAccuracy} C:${report.avgCompleteness} P:${report.avgPrecision})`);
    console.log(` NOT_FOUND: ${report.notFoundRate}`);

    history.push({
      iteration: iter,
      score: report.overallScore,
      questions: questions.length,
      notFoundRate: report.notFoundRate,
    });

    // Check if we've hit the target
    if (report.overallScore >= opts.target) {
      console.log(`\n🎯 TARGET REACHED: ${report.overallScore}% >= ${opts.target}%`);

      // Ratchet: make it harder
      const harderQuestions = ratchetQuestions(questions, report, opts.srcRoot, opts.snapshot);
      if (harderQuestions.length > questions.length) {
        console.log(` Ratcheting up: ${questions.length} → ${harderQuestions.length} questions`);
        questions = harderQuestions;
        // Don't break — run another iteration with harder questions
        continue;
      }
      console.log(' No harder questions to add. Pipeline is solid.');
      break;
    }

    // Step 2: Diagnose
    console.log('\n🔍 DIAGNOSE');
    const diagnosis = await diagnose(report, opts);
    // The plan is derived from LLM output; do not assume `fixes` is present.
    const fixes = Array.isArray(diagnosis.fixes) ? diagnosis.fixes : [];
    console.log(` ${diagnosis.diagnosis}`);
    console.log(` Proposed fixes: ${fixes.length}`);

    if (fixes.length === 0) {
      console.log(' No actionable fixes proposed. Stopping.');
      break;
    }

    // Step 3: Apply fixes
    console.log('\n🔧 FIX');
    let anyApplied = false;
    for (const fix of fixes.slice(0, 3)) { // Max 3 fixes per iteration
      const impact = Array.isArray(fix.impact) ? fix.impact.join(', ') : String(fix.impact ?? 'n/a');
      console.log(` [${fix.id}] ${fix.description}`);
      console.log(` Impact: ${impact} | Expected: ${fix.expectedImprovement}`);
      try {
        if (await applyFix(fix, opts)) anyApplied = true;
      } catch (err) {
        // One unusable fix (e.g. a file that cannot be read) must not abort the run.
        console.log(` ⚠ Fix ${fix.id} failed: ${err.message}`);
      }
    }

    if (!anyApplied) {
      console.log(' No fixes could be applied. Stopping.');
      break;
    }

    // Step 4: Regenerate docs
    console.log('\n📝 REGENERATE');
    try {
      regenerateDocs(opts);
    } catch (err) {
      console.log(` ⚠ Doc generation failed: ${err.message}`);
      console.log(' Reverting changes...');
      try {
        execSync(`git checkout -- sysdoc.js extract-helm.js`, { cwd: PROJECT_DIR });
      } catch (revertErr) {
        // Still fall through to the summary so the history file gets written.
        console.log(` ⚠ Revert failed: ${revertErr.message}`);
      }
      break;
    }
  }

  // Final summary
  console.log('\n' + '═'.repeat(60));
  console.log('RATCHET SUMMARY');
  console.log('═'.repeat(60));
  for (const h of history) {
    console.log(` Iter ${h.iteration}: ${h.score}% (${h.questions} questions, NOT_FOUND: ${h.notFoundRate})`);
  }

  const finalScore = history[history.length - 1]?.score || 0;
  const passed = finalScore >= opts.target;
  console.log(`\nFinal: ${finalScore}% ${passed ? '✅ PASS' : '❌ BELOW TARGET'}`);

  // Save history
  const historyPath = path.join(PROJECT_DIR, 'ratchet-history.json');
  fs.writeFileSync(historyPath, JSON.stringify({
    timestamp: new Date().toISOString(),
    target: opts.target,
    history,
    finalScore,
    passed,
  }, null, 2));
  console.log(`History: ${historyPath}`);

  return { history, finalScore, passed };
}
|
||
|
|
|
||
|
|
// CLI entry point: runs only when this file is executed directly, not when it
// is loaded with require(). Exit code is 0 when the ratchet passed, 1 otherwise.
if (require.main === module) {
  const cliOptions = parseArgs();
  const hasAllPositionals = Boolean(cliOptions.srcRoot && cliOptions.snapshot && cliOptions.docsDir);

  if (!hasAllPositionals) {
    console.error('Usage: node ratchet.js <foxtrot-root> <snapshot> <docs-dir> [--target=80] [--max-iter=5]');
    process.exit(1);
  }

  (async () => {
    try {
      const { passed } = await ratchetLoop(cliOptions);
      process.exit(passed ? 0 : 1);
    } catch (err) {
      console.error('Ratchet failed:', err);
      process.exit(1);
    }
  })();
}

module.exports = { ratchetLoop };
|