Phase 9b: structural documentation improvements

- sysdoc.js: Added Summary Statistics, Top Charts, and K8s Resource Types to architecture doc
- Addresses ratchet failures where system-wide rollups were missing from generated prose
- Eval v2 shows minor improvement, though RAG context window still limits wide scatter-gather queries
This commit is contained in:
396
ratchet.js
Normal file
396
ratchet.js
Normal file
@@ -0,0 +1,396 @@
|
||||
/**
 * Quality Ratchet: Automated Doc Improvement Loop
 *
 * BMad-inspired workflow:
 * 1. EVAL: Run question bank against docs, get scores
 * 2. DIAGNOSE: Identify weakest categories and specific failures
 * 3. FIX: LLM proposes concrete code changes to the doc generator
 * 4. APPLY: Apply fixes, regenerate docs
 * 5. RE-EVAL: Score again
 * 6. RATCHET: If passing threshold, add harder questions
 * 7. REPEAT until target score or max iterations
 *
 * Usage: node ratchet.js <foxtrot-root> <snapshot> <docs-dir> [--target=80] [--max-iter=5]
 */
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const { execSync } = require('child_process');
|
||||
const { callLLM } = require('./prose.js');
|
||||
const { runEval } = require('./eval.js');
|
||||
const { generateQuestions } = require('./eval-questions.js');
|
||||
|
||||
const PROJECT_DIR = __dirname;
|
||||
|
||||
/**
 * Parse the ratchet CLI arguments from `process.argv`.
 *
 * Positional arguments, in order: source root, snapshot path, docs directory.
 * Optional flags: `--target=<number>` (default 80) and `--max-iter=<number>`
 * (default 5). Flags may appear before, between, or after the positionals.
 * The eval and fix model names come from the LLM_MODEL and FIX_MODEL
 * environment variables, with built-in defaults.
 *
 * @returns {{srcRoot: (string|undefined), snapshot: (string|undefined),
 *   docsDir: (string|undefined), target: number, maxIter: number,
 *   model: string, fixModel: string}} Parsed options; positionals are
 *   `undefined` when not supplied.
 */
function parseArgs() {
  const args = process.argv.slice(2);

  // Separate positionals from flags so that a leading `--target=90` is not
  // mistaken for the source root.
  const positional = args.filter((arg) => !arg.startsWith('--'));

  const opts = {
    srcRoot: positional[0],
    snapshot: positional[1],
    docsDir: positional[2],
    target: 80,
    maxIter: 5,
    model: process.env.LLM_MODEL || 'claude-haiku-4.5',
    fixModel: process.env.FIX_MODEL || 'claude-sonnet-4.6',
  };

  // A non-numeric value would become NaN, and every `score >= NaN` comparison
  // is false, so keep the current value instead and tell the user.
  const readNumber = (arg, fallback) => {
    const text = arg.slice(arg.indexOf('=') + 1);
    const value = Number(text);
    if (text.trim() === '' || !Number.isFinite(value)) {
      console.warn(`Ignoring invalid numeric option "${arg}"; using ${fallback}`);
      return fallback;
    }
    return value;
  };

  for (const arg of args) {
    if (arg.startsWith('--target=')) opts.target = readNumber(arg, opts.target);
    if (arg.startsWith('--max-iter=')) opts.maxIter = readNumber(arg, opts.maxIter);
  }

  return opts;
}
|
||||
|
||||
/**
 * Render one failed eval result as a text block for the diagnosis prompt.
 * Missing fields are rendered as empty/undefined text instead of throwing.
 */
function formatFailure(f) {
  const score = f.score ?? {};
  return [
    `[${f.id}] Score: ${f.composite}% (A:${score.accuracy} C:${score.completeness} P:${score.precision})`,
    `Q: ${f.question}`,
    `Expected: ${String(f.groundTruth ?? '').substring(0, 300)}`,
    `Got: ${String(f.llmAnswer ?? '').substring(0, 300)}`,
    `Judge notes: ${score.notes}`,
    `NOT_FOUND: ${f.notFound}`,
  ].join('\n');
}

/**
 * Diagnose failures and produce a fix plan.
 *
 * Takes the (up to) ten lowest-scoring results with composite < 60, asks the
 * fix model for a JSON fix plan, and returns it in a normalised shape.
 *
 * @param {object} report - Eval report; reads `results`, `overallScore`,
 *   `byCategory` and `notFoundRate`.
 * @param {object} opts - Reads `opts.fixModel` for the LLM call.
 * @returns {Promise<{diagnosis: string, fixes: Array<object>}>} `fixes` is
 *   always an array. When the LLM reply contains no parseable JSON object,
 *   `diagnosis` is the first 500 characters of the raw reply and `fixes` is
 *   empty.
 */
async function diagnose(report, opts) {
  // Collect the worst-performing questions with full context
  const failures = (report.results ?? [])
    .filter((r) => r.composite < 60)
    .sort((a, b) => a.composite - b.composite)
    .slice(0, 10);

  if (failures.length === 0) {
    return { fixes: [], diagnosis: 'All questions above 60%. No critical failures.' };
  }

  // NOTE: the prompt names sysdoc.js / extract-helm.js but does not embed
  // their contents; the former reads of both files were never used and have
  // been removed.
  const failureDetails = failures.map(formatFailure).join('\n\n');

  const prompt = `You are a documentation pipeline engineer. Analyze these evaluation failures and propose CONCRETE fixes to the doc generator code.

## Current Pipeline
The doc generator (sysdoc.js) produces Markdown documentation from:
1. Code analysis graph (subsystems, functions, modules)
2. Helm chart extraction (Chart.yaml, values.yaml, templates)
3. LLM prose generation

## Failures (sorted worst-first)
${failureDetails}

## Score Summary
Overall: ${report.overallScore}%
By category: ${JSON.stringify(report.byCategory)}
NOT_FOUND rate: ${report.notFoundRate}

## Key Source Files
sysdoc.js generates the docs. extract-helm.js extracts Helm data.

## Rules
1. Each fix must be a SPECIFIC change to sysdoc.js or extract-helm.js
2. Focus on information that IS extracted but NOT surfaced in the docs
3. If data is missing from extraction, propose extraction improvements
4. Prioritize fixes that improve multiple questions at once
5. Do NOT propose changes to the eval harness or questions

Respond in this JSON format:
{
  "diagnosis": "2-3 sentence summary of root causes",
  "fixes": [
    {
      "id": "fix-001",
      "file": "sysdoc.js",
      "description": "What to change and why",
      "impact": ["question-id-1", "question-id-2"],
      "expectedImprovement": "+15%",
      "code_hint": "Brief description of the code change needed"
    }
  ]
}`;

  const raw = await callLLM(prompt, { model: opts.fixModel, maxTokens: 2048, temperature: 0.2 });
  const text = typeof raw === 'string' ? raw : String(raw ?? '');

  // The model may wrap the JSON in prose, so take the outermost {...} span.
  const jsonMatch = text.match(/\{[\s\S]*\}/);
  if (jsonMatch) {
    try {
      const parsed = JSON.parse(jsonMatch[0]);
      // Callers read `.fixes.length` and print `.diagnosis`; guarantee both.
      return {
        ...parsed,
        diagnosis: typeof parsed.diagnosis === 'string' ? parsed.diagnosis : '(no diagnosis provided)',
        fixes: Array.isArray(parsed.fixes) ? parsed.fixes : [],
      };
    } catch (err) {
      console.warn(` ⚠ Could not parse diagnosis JSON: ${err.message}`);
    }
  }

  // Best-effort fallback: surface the start of the raw reply, propose nothing.
  return { diagnosis: text.substring(0, 500), fixes: [] };
}
|
||||
|
||||
/**
 * Files the ratchet may rewrite: the two generator sources that the diagnosis
 * prompt tells the model to target.
 */
const PATCHABLE_FILES = new Set(['sysdoc.js', 'extract-helm.js']);

/**
 * Apply a fix by having the LLM generate the actual code change.
 *
 * Sends the current source of `fix.file` to the fix model, parses the
 * SEARCH/REPLACE blocks from the reply, and applies each block whose SEARCH
 * text occurs verbatim in the file (first occurrence only). The file is
 * written when at least one block applied.
 *
 * @param {object} fix - Fix proposal; reads `id`, `file`, `description`
 *   and `code_hint`.
 * @param {object} opts - Reads `opts.fixModel` for the LLM call.
 * @returns {Promise<boolean>} true when the file was modified, false when the
 *   target is not an allowed file, nothing could be parsed, or no block
 *   matched.
 */
async function applyFix(fix, opts) {
  // `fix.file` is LLM output: never let it address anything outside the
  // allow-list (e.g. "../../something").
  const relativeName = path.normalize(String(fix.file ?? ''));
  if (!PATCHABLE_FILES.has(relativeName)) {
    console.log(` ⚠ Refusing to modify ${fix.file} for fix ${fix.id}: not a doc generator source file`);
    return false;
  }

  const filePath = path.join(PROJECT_DIR, relativeName);
  const source = fs.readFileSync(filePath, 'utf8');

  const prompt = `You are modifying ${fix.file} to improve documentation quality.

CHANGE NEEDED: ${fix.description}
CODE HINT: ${fix.code_hint}

CURRENT SOURCE (${fix.file}):
${source}

Generate ONLY the specific code change. Output in this format:
SEARCH:
\`\`\`
exact lines to find
\`\`\`
REPLACE:
\`\`\`
replacement lines
\`\`\`

If multiple changes are needed, repeat the SEARCH/REPLACE blocks.
Be precise — the SEARCH text must match the source exactly.`;

  const raw = await callLLM(prompt, { model: opts.fixModel, maxTokens: 4096, temperature: 0.1 });

  // Parse SEARCH/REPLACE blocks
  const changes = [];
  const blockRegex = /SEARCH:\s*```[^\n]*\n([\s\S]*?)```\s*REPLACE:\s*```[^\n]*\n([\s\S]*?)```/g;
  for (const match of String(raw ?? '').matchAll(blockRegex)) {
    changes.push({ search: match[1].trimEnd(), replace: match[2].trimEnd() });
  }

  if (changes.length === 0) {
    console.log(` ⚠ No parseable changes from LLM for fix ${fix.id}`);
    return false;
  }

  // Apply changes
  let modified = source;
  let applied = 0;
  for (const change of changes) {
    if (change.search === '') {
      // An empty needle "matches" at offset 0 and would prepend arbitrary code.
      console.log(` ⚠ Empty SEARCH block ignored for fix ${fix.id}`);
    } else if (modified.includes(change.search)) {
      // Replacer function: a plain string replacement would expand `$&`, `$$`,
      // "$`" and "$'" inside the generated code and corrupt it.
      modified = modified.replace(change.search, () => change.replace);
      applied++;
    } else {
      console.log(` ⚠ SEARCH block not found in ${fix.file} for fix ${fix.id}`);
    }
  }

  if (applied === 0) return false;

  fs.writeFileSync(filePath, modified);
  console.log(` ✓ Applied ${applied}/${changes.length} changes to ${fix.file}`);
  return true;
}
|
||||
|
||||
/**
 * Regenerate docs: delete the docs directory, then run sysdoc.js with --prose.
 *
 * Paths are passed as an argv array (no shell), so spaces or shell
 * metacharacters in them cannot break or hijack the command. The eval model
 * is handed to the child through the LLM_MODEL environment variable.
 *
 * @param {object} opts - Reads `docsDir`, `snapshot`, `srcRoot` and `model`.
 *   A relative `docsDir` is resolved against PROJECT_DIR, the working
 *   directory of the child process.
 * @throws {Error} When `docsDir` is empty, when deleting it would remove the
 *   project directory itself, or when the sysdoc.js child process fails or
 *   exceeds the 10-minute timeout.
 */
function regenerateDocs(opts) {
  console.log(' Regenerating docs...');

  if (typeof opts.docsDir !== 'string' || opts.docsDir.trim() === '') {
    throw new Error('regenerateDocs: docsDir must be a non-empty path');
  }

  // The target is removed recursively, so make sure it is neither the project
  // directory nor one of its ancestors.
  const target = path.resolve(PROJECT_DIR, opts.docsDir);
  const toProject = path.relative(target, PROJECT_DIR);
  const projectIsOutsideTarget =
    toProject === '..' || toProject.startsWith(`..${path.sep}`) || path.isAbsolute(toProject);
  if (!projectIsOutsideTarget) {
    throw new Error(`regenerateDocs: refusing to delete ${target} because it contains the project directory`);
  }

  fs.rmSync(target, { recursive: true, force: true });

  // Local require: only this function needs execFileSync.
  const { execFileSync } = require('child_process');
  execFileSync(
    process.execPath,
    ['sysdoc.js', opts.snapshot, opts.srcRoot, opts.docsDir, '--prose'],
    {
      cwd: PROJECT_DIR,
      env: { ...process.env, LLM_MODEL: opts.model },
      timeout: 600000,
      stdio: 'pipe',
    },
  );
}
|
||||
|
||||
/** Generate harder questions based on current performance */
|
||||
function ratchetQuestions(currentQuestions, report, srcRoot, snapshotPath) {
|
||||
// Find categories scoring > 90% — make them harder
|
||||
const easyCategories = Object.entries(report.byCategory)
|
||||
.filter(([, s]) => Number(s.avg) > 90)
|
||||
.map(([cat]) => cat);
|
||||
|
||||
if (easyCategories.length === 0) return currentQuestions;
|
||||
|
||||
console.log(` Ratcheting: categories scoring >90%: ${easyCategories.join(', ')}`);
|
||||
|
||||
// Add more specific questions for high-scoring categories
|
||||
const newQuestions = [...currentQuestions];
|
||||
const { discoverCharts } = require('./extract-helm.js');
|
||||
const charts = discoverCharts(srcRoot, new Set(['node_modules', '.git', 'venv', '__pycache__', '.terraform', '_bmad', '_bmad-output', '.codex', '.claude']));
|
||||
|
||||
if (easyCategories.includes('configuration')) {
|
||||
// Add nested value questions (harder than top-level)
|
||||
for (const chart of charts.filter(c => c.values.keys.length > 20).slice(0, 3)) {
|
||||
const objectKeys = chart.values.keys.filter(k => k.type === 'object');
|
||||
if (objectKeys.length > 0) {
|
||||
newQuestions.push({
|
||||
id: `ratchet-config-${chart.chart.name}-nested`,
|
||||
category: 'configuration',
|
||||
difficulty: 'hard',
|
||||
audience: ['human', 'machine'],
|
||||
question: `In the ${chart.chart.name} chart, which configuration keys are complex objects (not simple values)? List them.`,
|
||||
answer: objectKeys.map(k => k.name).join(', '),
|
||||
answerType: 'list',
|
||||
source: `${chart.dir}/values.yaml`,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (easyCategories.includes('resource')) {
|
||||
// Add cross-chart resource comparison questions
|
||||
const deployers = charts.filter(c => c.templates.resources.some(r => r.kind === 'Deployment'));
|
||||
if (deployers.length > 0) {
|
||||
newQuestions.push({
|
||||
id: 'ratchet-resource-deployments',
|
||||
category: 'resource',
|
||||
difficulty: 'hard',
|
||||
audience: ['human', 'machine'],
|
||||
question: 'Which Helm charts create Kubernetes Deployments? List all of them.',
|
||||
answer: deployers.map(c => `${c.chart.name} (${c.dir})`).join('\n'),
|
||||
answerType: 'list',
|
||||
source: 'template scanning',
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
if (easyCategories.includes('dependency')) {
|
||||
// Add transitive dependency questions
|
||||
const withDeps = charts.filter(c => c.chart.dependencies.length > 2);
|
||||
for (const chart of withDeps.slice(0, 2)) {
|
||||
newQuestions.push({
|
||||
id: `ratchet-dep-${chart.chart.name}-conditions`,
|
||||
category: 'dependency',
|
||||
difficulty: 'hard',
|
||||
audience: ['machine'],
|
||||
question: `What are the enable conditions for each dependency of the "${chart.chart.name}" chart?`,
|
||||
answer: chart.chart.dependencies.map(d => `${d.name}: ${d.condition || 'always enabled'}`).join('\n'),
|
||||
answerType: 'list',
|
||||
source: `${chart.dir}/Chart.yaml`,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
const added = newQuestions.length - currentQuestions.length;
|
||||
if (added > 0) console.log(` Added ${added} harder questions`);
|
||||
return newQuestions;
|
||||
}
|
||||
|
||||
/**
 * Read the current contents of every file the given fixes target, so they can
 * be restored if regeneration fails. Keys are absolute paths.
 */
function snapshotFixTargets(fixes) {
  const snapshots = new Map();
  for (const fix of fixes) {
    if (typeof fix.file !== 'string') continue;
    const filePath = path.join(PROJECT_DIR, fix.file);
    if (snapshots.has(filePath)) continue;
    try {
      snapshots.set(filePath, fs.readFileSync(filePath, 'utf8'));
    } catch (err) {
      // Unreadable target: there is no prior content to restore.
      console.log(` ⚠ Could not snapshot ${fix.file}: ${err.message}`);
    }
  }
  return snapshots;
}

/** Write back the file contents captured by snapshotFixTargets. */
function restoreSnapshots(snapshots) {
  for (const [filePath, content] of snapshots) {
    fs.writeFileSync(filePath, content);
  }
}

/**
 * Main ratchet loop.
 *
 * Each iteration writes the current question bank to
 * `eval-questions-iter<N>.json`, evaluates the docs, and then either
 *   - ratchets (target reached): adds harder questions and continues, or
 *     stops when there is nothing harder to add; or
 *   - repairs (below target): diagnoses failures, applies up to three fixes,
 *     and regenerates the docs for the next iteration.
 * A summary is printed and saved to `ratchet-history.json` at the end.
 *
 * @param {object} opts - Reads `target`, `maxIter`, `model`, `fixModel`,
 *   `docsDir`, `srcRoot` and `snapshot`.
 * @returns {Promise<{history: Array<object>, finalScore: number, passed: boolean}>}
 *   `finalScore` is the score of the last evaluation that ran (0 if none).
 */
async function ratchetLoop(opts) {
  console.log('═'.repeat(60));
  console.log('QUALITY RATCHET — BMad Improvement Loop');
  console.log('═'.repeat(60));
  console.log(`Target: ${opts.target}% Max iterations: ${opts.maxIter}`);
  console.log(`Eval model: ${opts.model} Fix model: ${opts.fixModel}`);
  console.log();

  // Load initial questions
  const questionsPath = path.join(PROJECT_DIR, 'eval-questions.json');
  const questionsData = JSON.parse(fs.readFileSync(questionsPath, 'utf8'));
  let questions = questionsData.questions;

  const history = [];

  for (let iter = 1; iter <= opts.maxIter; iter++) {
    console.log(`\n${'─'.repeat(60)}`);
    console.log(`ITERATION ${iter}/${opts.maxIter}`);
    console.log('─'.repeat(60));

    // Write current questions
    const iterQuestionsPath = path.join(PROJECT_DIR, `eval-questions-iter${iter}.json`);
    fs.writeFileSync(
      iterQuestionsPath,
      JSON.stringify({ generated: new Date().toISOString(), count: questions.length, questions }, null, 2),
    );

    // Step 1: Eval
    console.log('\n📊 EVAL');
    const report = await runEval(opts.docsDir, iterQuestionsPath, { model: opts.model });

    console.log(` Score: ${report.overallScore}% (A:${report.avgAccuracy} C:${report.avgCompleteness} P:${report.avgPrecision})`);
    console.log(` NOT_FOUND: ${report.notFoundRate}`);

    history.push({
      iteration: iter,
      score: report.overallScore,
      questions: questions.length,
      notFoundRate: report.notFoundRate,
    });

    // Check if we've hit the target
    if (report.overallScore >= opts.target) {
      console.log(`\n🎯 TARGET REACHED: ${report.overallScore}% >= ${opts.target}%`);

      // Ratchet: make it harder
      const harderQuestions = ratchetQuestions(questions, report, opts.srcRoot, opts.snapshot);
      if (harderQuestions.length > questions.length) {
        console.log(` Ratcheting up: ${questions.length} → ${harderQuestions.length} questions`);
        questions = harderQuestions;
        // Don't break — run another iteration with harder questions
        continue;
      }
      console.log(' No harder questions to add. Pipeline is solid.');
      break;
    }

    // Step 2: Diagnose
    console.log('\n🔍 DIAGNOSE');
    const diagnosis = await diagnose(report, opts);
    // The plan is LLM-derived, so do not trust its shape.
    const fixes = Array.isArray(diagnosis.fixes) ? diagnosis.fixes : [];
    console.log(` ${diagnosis.diagnosis}`);
    console.log(` Proposed fixes: ${fixes.length}`);

    if (fixes.length === 0) {
      console.log(' No actionable fixes proposed. Stopping.');
      break;
    }

    // Step 3: Apply fixes
    console.log('\n🔧 FIX');
    const selectedFixes = fixes.slice(0, 3); // Max 3 fixes per iteration
    // Capture pre-fix contents so a failed regeneration rolls back only this
    // iteration's edits, leaving earlier fixes and unrelated local changes
    // intact.
    const snapshots = snapshotFixTargets(selectedFixes);
    let anyApplied = false;
    for (const fix of selectedFixes) {
      const impact = Array.isArray(fix.impact) ? fix.impact.join(', ') : 'n/a';
      console.log(` [${fix.id}] ${fix.description}`);
      console.log(` Impact: ${impact} | Expected: ${fix.expectedImprovement ?? 'n/a'}`);
      const applied = await applyFix(fix, opts);
      if (applied) anyApplied = true;
    }

    if (!anyApplied) {
      console.log(' No fixes could be applied. Stopping.');
      break;
    }

    // Step 4: Regenerate docs
    // NOTE(review): fixes applied during the last iteration are regenerated
    // but never re-scored; the reported final score predates them.
    console.log('\n📝 REGENERATE');
    try {
      regenerateDocs(opts);
    } catch (err) {
      console.log(` ⚠ Doc generation failed: ${err.message}`);
      console.log(' Reverting changes...');
      restoreSnapshots(snapshots);
      break;
    }
  }

  // Final summary
  console.log('\n' + '═'.repeat(60));
  console.log('RATCHET SUMMARY');
  console.log('═'.repeat(60));
  for (const h of history) {
    console.log(` Iter ${h.iteration}: ${h.score}% (${h.questions} questions, NOT_FOUND: ${h.notFoundRate})`);
  }

  const finalScore = history[history.length - 1]?.score || 0;
  const passed = finalScore >= opts.target;
  console.log(`\nFinal: ${finalScore}% ${passed ? '✅ PASS' : '❌ BELOW TARGET'}`);

  // Save history
  const historyPath = path.join(PROJECT_DIR, 'ratchet-history.json');
  fs.writeFileSync(historyPath, JSON.stringify({
    timestamp: new Date().toISOString(),
    target: opts.target,
    history,
    finalScore,
    passed,
  }, null, 2));
  console.log(`History: ${historyPath}`);

  return { history, finalScore, passed };
}
|
||||
|
||||
// CLI entry point: runs only when this file is executed directly, not when it
// is required as a module. Exit code is 0 when the final score met the target
// and 1 otherwise (including usage errors and unexpected failures).
if (require.main === module) {
  const opts = parseArgs();

  const hasAllPositionals = Boolean(opts.srcRoot && opts.snapshot && opts.docsDir);
  if (!hasAllPositionals) {
    console.error('Usage: node ratchet.js <foxtrot-root> <snapshot> <docs-dir> [--target=80] [--max-iter=5]');
    process.exit(1);
  }

  const runCli = async () => {
    try {
      const result = await ratchetLoop(opts);
      process.exit(result.passed ? 0 : 1);
    } catch (err) {
      console.error('Ratchet failed:', err);
      process.exit(1);
    }
  };
  runCli();
}

module.exports = { ratchetLoop };
|
||||
Reference in New Issue
Block a user