Phase 9: Doc Evaluation Harness

- eval-questions.js: Generates ground-truth questions from raw source data
- eval.js: LLM-as-judge scoring harness (answers from docs, scores against truth)
- Generated 33 questions covering config, dependencies, resources, and interactions
- Baseline score: 66.7% (configuration 93%, dependencies 77%, structural 31%)
This commit is contained in:
275
eval.js
Normal file
275
eval.js
Normal file
@@ -0,0 +1,275 @@
|
||||
/**
|
||||
* Eval Harness: Doc Quality Scorer
|
||||
*
|
||||
* Scores generated documentation against ground-truth questions.
|
||||
* Two-phase: (1) LLM answers questions using only docs, (2) LLM-as-judge scores accuracy.
|
||||
*
|
||||
* Usage: node eval.js <docs-dir> <questions.json> [output.json]
|
||||
*/
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const { callLLM } = require('./prose.js');
|
||||
|
||||
/**
 * Recursively collect Markdown files under a directory, within a size budget.
 *
 * Entries are visited in name order so that both the returned order and the
 * set of files that fit inside the budget are reproducible; the order in
 * which fs.readdirSync lists a directory is not something to rely on.
 *
 * Unreadable directories and files are skipped (best effort), but a warning
 * is printed so that a mistyped docs directory does not silently evaluate
 * against zero documents.
 *
 * @param {string} docsDir - Root directory to scan.
 * @param {number} [maxChars=200000] - Upper bound on the summed length of the
 *   returned contents. A file that would push the total past the bound is
 *   skipped; smaller files visited later may still be included.
 * @returns {Array<{path: string, content: string}>} One entry per included
 *   file; `path` is relative to docsDir.
 */
function loadDocs(docsDir, maxChars = 200000) {
  const docs = [];
  let totalChars = 0;

  function walk(dir) {
    let entries;
    try {
      entries = fs.readdirSync(dir, { withFileTypes: true });
    } catch (err) {
      console.warn(`loadDocs: skipping unreadable directory ${dir}: ${err.message}`);
      return;
    }

    // Plain code-unit comparison: locale-independent, hence stable everywhere.
    entries.sort((a, b) => (a.name < b.name ? -1 : a.name > b.name ? 1 : 0));

    for (const entry of entries) {
      const fullPath = path.join(dir, entry.name);
      if (entry.isDirectory()) {
        walk(fullPath);
        continue;
      }
      // Accept README.MD and friends as well as the lowercase extension.
      if (!entry.name.toLowerCase().endsWith('.md')) continue;

      let content;
      try {
        content = fs.readFileSync(fullPath, 'utf8');
      } catch (err) {
        console.warn(`loadDocs: skipping unreadable file ${fullPath}: ${err.message}`);
        continue;
      }

      // Over budget: skip this file but keep looking for smaller ones.
      if (totalChars + content.length > maxChars) continue;

      docs.push({ path: path.relative(docsDir, fullPath), content });
      totalChars += content.length;
    }
  }

  walk(docsDir);
  return docs;
}
|
||||
|
||||
/**
 * Phase 1 of the eval: ask the model to answer one question from docs alone.
 *
 * The prompt tells the model to reply "NOT_FOUND" when the documentation is
 * insufficient and to shape its answer according to question.answerType.
 *
 * @param {object} question - Its `question` and `answerType` fields are
 *   interpolated into the prompt.
 * @param {string} docsContext - Documentation text embedded in the prompt.
 * @param {object} llmOpts - Options forwarded to callLLM; maxTokens and
 *   temperature are always overridden here.
 * @returns {Promise<*>} Whatever callLLM resolves to.
 */
async function answerFromDocs(question, docsContext, llmOpts) {
  const promptLines = [
    'You are evaluating documentation quality. Answer the following question using ONLY the documentation provided below. If the documentation does not contain enough information to answer, say "NOT_FOUND".',
    '',
    'Be precise and factual. Match the expected answer format:',
    '- For "exact" answers: give the exact value',
    '- For "list" answers: list each item on its own line',
    '- For "ranked-list" answers: list items in order with counts',
    '- For "explanation" answers: give a concise explanation',
    '',
    `QUESTION: ${question.question}`,
    `EXPECTED FORMAT: ${question.answerType}`,
    '',
    'DOCUMENTATION:',
    `${docsContext}`,
    '',
    'Answer:',
  ];
  const prompt = promptLines.join('\n');

  // Fixed decoding settings win over anything the caller passed in.
  const requestOpts = { ...llmOpts, maxTokens: 1024, temperature: 0 };
  return callLLM(prompt, requestOpts);
}
|
||||
|
||||
/** Coerce one judge-supplied dimension to a finite number clamped to 0..5. */
function clampJudgeScore(value) {
  if (typeof value !== 'number' && typeof value !== 'string') return 0;
  const n = Number(value);
  if (!Number.isFinite(n)) return 0;
  return Math.min(5, Math.max(0, n));
}

/**
 * Find the first parseable JSON object in free-form model output.
 *
 * Starts with the widest candidate (first "{" to last "}") and shrinks the
 * right edge one "}" at a time, so trailing chatter that happens to contain
 * braces does not defeat the parse.
 *
 * @param {string} text - Raw model output.
 * @returns {object|null} The parsed object, or null when none is found.
 */
function extractJudgeJson(text) {
  const start = text.indexOf('{');
  if (start === -1) return null;
  for (let end = text.lastIndexOf('}'); end > start; end = text.lastIndexOf('}', end - 1)) {
    try {
      const parsed = JSON.parse(text.slice(start, end + 1));
      if (parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed)) {
        return parsed;
      }
    } catch {
      // Not valid JSON up to this brace; try a shorter candidate.
    }
  }
  return null;
}

/**
 * Phase 2 of the eval: LLM-as-judge scores an answer against ground truth.
 *
 * The judge is asked for accuracy, completeness and precision on a 0-5 scale.
 * Its reply is parsed defensively: the three dimensions are always returned
 * as numbers within 0..5 (numeric strings are accepted, anything else becomes
 * 0), so callers can sum them safely.
 *
 * @param {object} question - Its `question`, `answerType` and `answer` fields
 *   are interpolated into the prompt. A non-string `answer` is rendered as
 *   JSON rather than via default string conversion.
 * @param {string} llmAnswer - The answer produced in phase 1.
 * @param {object} llmOpts - Options forwarded to callLLM; maxTokens and
 *   temperature are always overridden here.
 * @returns {Promise<{accuracy: number, completeness: number, precision: number, notes: string}>}
 *   Normalized scores (plus any extra fields the judge returned). When no JSON
 *   object can be parsed, all scores are 0 and `notes` starts with
 *   "Parse error:".
 */
async function scoreAnswer(question, llmAnswer, llmOpts) {
  // Structured ground truth would otherwise render as "[object Object]".
  const groundTruth = typeof question.answer === 'string'
    ? question.answer
    : JSON.stringify(question.answer, null, 2);

  const prompt = `You are a strict evaluator scoring an AI's answer against ground truth.

QUESTION: ${question.question}
EXPECTED ANSWER TYPE: ${question.answerType}
GROUND TRUTH: ${groundTruth}
AI ANSWER: ${llmAnswer}

Score the AI answer on these dimensions (0-5 each):
1. ACCURACY: Does the answer contain the correct facts? (0=wrong, 3=partially correct, 5=exactly correct)
2. COMPLETENESS: Does it cover all items in the ground truth? (0=missing everything, 3=partial, 5=complete)
3. PRECISION: Is it free of hallucinated or incorrect extra information? (0=lots of hallucination, 5=no hallucination)

If the AI answered "NOT_FOUND", score ACCURACY=0, COMPLETENESS=0, PRECISION=5 (it didn't hallucinate, it just couldn't find it).

Respond in EXACTLY this JSON format, nothing else:
{"accuracy": N, "completeness": N, "precision": N, "notes": "brief explanation"}`;

  const raw = await callLLM(prompt, { ...llmOpts, maxTokens: 256, temperature: 0.0 });
  // Guard the string operations below against a non-string reply.
  const text = typeof raw === 'string' ? raw : String(raw);

  const parsed = extractJudgeJson(text);
  if (parsed) {
    return {
      ...parsed,
      accuracy: clampJudgeScore(parsed.accuracy),
      completeness: clampJudgeScore(parsed.completeness),
      precision: clampJudgeScore(parsed.precision),
      notes: typeof parsed.notes === 'string' ? parsed.notes : '',
    };
  }

  return { accuracy: 0, completeness: 0, precision: 0, notes: `Parse error: ${text.substring(0, 100)}` };
}
|
||||
|
||||
/** Words too generic to help rank documents for a question. */
const CONTEXT_STOPWORDS = new Set([
  'what', 'which', 'does', 'that', 'this', 'from', 'with', 'list',
  'them', 'their', 'each', 'many', 'most', 'across',
]);

/**
 * Build a focused documentation context for one question, so that the whole
 * doc set does not have to be sent with every prompt.
 *
 * Docs are ranked by how often the question's keywords occur in them (each
 * keyword contributes at most 10) and concatenated, best first, while they
 * fit in the character budget.
 *
 * Keywords are the question's words with leading/trailing punctuation
 * removed ("timeout?" -> "timeout"), longer than three characters, not in the
 * stopword list, and de-duplicated so a repeated word is not counted twice.
 *
 * @param {object} question - Its `question` string supplies the keywords.
 * @param {Array<{path: string, content: string}>} docs - Candidate documents.
 * @param {number} [maxChars=50000] - Approximate budget for the result; a doc
 *   is skipped when its content would push the context past this size (the
 *   short per-doc header is not counted).
 * @returns {string} Concatenated docs, each preceded by a "--- path ---" line.
 */
function buildContext(question, docs, maxChars = 50000) {
  const words = question.question
    .toLowerCase()
    .split(/\s+/)
    // Trim punctuation at the edges only; keep inner characters (e.g. "eval.js").
    .map((w) => w.replace(/^[^\p{L}\p{N}]+|[^\p{L}\p{N}]+$/gu, ''))
    .filter((w) => w.length > 3 && !CONTEXT_STOPWORDS.has(w));
  const keywords = [...new Set(words)];

  const scored = docs.map((d) => {
    const lower = d.content.toLowerCase();
    let score = 0;
    for (const kw of keywords) {
      // Non-overlapping literal occurrence count, capped per keyword.
      const count = lower.split(kw).length - 1;
      score += Math.min(count, 10);
    }
    return { ...d, score };
  });

  // Highest score first; Array.prototype.sort is stable, so ties keep input order.
  scored.sort((a, b) => b.score - a.score);

  let context = '';
  let used = 0;
  for (const d of scored) {
    // Once a few docs are in, stop padding the context with irrelevant ones.
    if (d.score === 0 && used > 3) break;
    if (context.length + d.content.length > maxChars) continue;
    context += `\n--- ${d.path} ---\n${d.content}\n`;
    used++;
  }

  return context;
}
|
||||
|
||||
/** Coerce a score dimension to a finite number, treating anything else as 0. */
function toScoreNumber(value) {
  const n = Number(value);
  return Number.isFinite(n) ? n : 0;
}

/**
 * Average the composite score of results grouped by one of their fields.
 *
 * @param {Array<object>} results - Per-question results with a numeric `composite`.
 * @param {string} field - Result field to group on (e.g. "category").
 * @returns {Object<string, {avg: string, count: number}>} Per-group average
 *   (one decimal, as a string) and question count.
 */
function averageCompositeBy(results, field) {
  const groups = new Map();
  for (const r of results) {
    const key = String(r[field]);
    const group = groups.get(key) || { total: 0, count: 0 };
    group.total += r.composite;
    group.count++;
    groups.set(key, group);
  }
  return Object.fromEntries(
    [...groups].map(([key, g]) => [key, { avg: (g.total / g.count).toFixed(1), count: g.count }]),
  );
}

/**
 * Run the full evaluation: for every question, build a focused context,
 * have the model answer from the docs, then have the judge score the answer.
 *
 * Questions are processed sequentially. A failure in either LLM phase is
 * recorded for that question (as an "ERROR: ..." answer or an all-zero score)
 * instead of aborting the run.
 *
 * @param {string} docsDir - Directory of generated documentation.
 * @param {string} questionsPath - JSON file holding either a `questions`
 *   array or a bare array of question objects.
 * @param {object} [llmOpts={}] - Options forwarded to both LLM phases.
 * @returns {Promise<object>} Report with overall and per-dimension averages,
 *   not-found statistics, per-category and per-difficulty breakdowns, and the
 *   per-question `results`. With zero questions all averages are 0.
 * @throws {Error} If the questions file cannot be read or parsed, or does not
 *   contain a questions array.
 */
async function runEval(docsDir, questionsPath, llmOpts = {}) {
  const questionsData = JSON.parse(fs.readFileSync(questionsPath, 'utf8'));
  const questions = Array.isArray(questionsData)
    ? questionsData
    : questionsData && questionsData.questions;
  if (!Array.isArray(questions)) {
    throw new Error(`${questionsPath}: expected a "questions" array`);
  }
  const docs = loadDocs(docsDir);

  console.log(`Loaded ${docs.length} doc files, ${questions.length} questions`);

  const results = [];
  let totalAccuracy = 0;
  let totalCompleteness = 0;
  let totalPrecision = 0;
  let notFound = 0;

  for (let i = 0; i < questions.length; i++) {
    const q = questions[i];
    process.stdout.write(`[${i + 1}/${questions.length}] ${q.id}...`);

    // Build focused context
    const context = buildContext(q, docs);

    // Phase 1: Answer from docs
    let llmAnswer;
    try {
      llmAnswer = await answerFromDocs(q, context, llmOpts);
    } catch (err) {
      llmAnswer = `ERROR: ${err.message}`;
    }
    // The string checks below must not crash on a non-string reply.
    if (typeof llmAnswer !== 'string') llmAnswer = String(llmAnswer);

    // Phase 2: Score
    let score;
    try {
      score = await scoreAnswer(q, llmAnswer, llmOpts);
    } catch (err) {
      score = { accuracy: 0, completeness: 0, precision: 0, notes: `Score error: ${err.message}` };
    }

    // Sum numbers only: a string such as "5" would otherwise turn the
    // running totals into string concatenation.
    const accuracy = toScoreNumber(score && score.accuracy);
    const completeness = toScoreNumber(score && score.completeness);
    const precision = toScoreNumber(score && score.precision);

    const isNotFound = llmAnswer.includes('NOT_FOUND');
    if (isNotFound) notFound++;

    totalAccuracy += accuracy;
    totalCompleteness += completeness;
    totalPrecision += precision;

    const composite = ((accuracy + completeness + precision) / 15 * 100).toFixed(0);
    console.log(` ${composite}% (A:${accuracy} C:${completeness} P:${precision})${isNotFound ? ' [NOT_FOUND]' : ''}`);

    results.push({
      id: q.id,
      category: q.category,
      difficulty: q.difficulty,
      question: q.question,
      groundTruth: q.answer,
      llmAnswer,
      score,
      composite: Number(composite),
      notFound: isNotFound,
    });
  }

  // Aggregate scores; an empty question set yields zeros rather than NaN.
  const n = questions.length;
  const perQuestion = (total) => (n === 0 ? 0 : total / n);
  const overallPct = n === 0
    ? 0
    : (totalAccuracy + totalCompleteness + totalPrecision) / (n * 15) * 100;

  return {
    timestamp: new Date().toISOString(),
    docsDir,
    questionsFile: questionsPath,
    totalQuestions: n,
    overallScore: Number(overallPct.toFixed(1)),
    avgAccuracy: Number(perQuestion(totalAccuracy).toFixed(2)),
    avgCompleteness: Number(perQuestion(totalCompleteness).toFixed(2)),
    avgPrecision: Number(perQuestion(totalPrecision).toFixed(2)),
    notFoundCount: notFound,
    notFoundRate: (perQuestion(notFound) * 100).toFixed(1) + '%',
    byCategory: averageCompositeBy(results, 'category'),
    byDifficulty: averageCompositeBy(results, 'difficulty'),
    results,
  };
}
|
||||
|
||||
/**
 * Print the human-readable summary of an eval report to stdout: headline
 * scores, per-category and per-difficulty averages, and the five
 * lowest-scoring questions.
 */
function printReport(report) {
  const rule = '═'.repeat(60);

  console.log('\n' + rule);
  console.log('EVAL REPORT');
  console.log(rule);
  console.log(`Overall Score: ${report.overallScore}%`);
  console.log(`Accuracy: ${report.avgAccuracy}/5 Completeness: ${report.avgCompleteness}/5 Precision: ${report.avgPrecision}/5`);
  console.log(`Not Found: ${report.notFoundCount}/${report.totalQuestions} (${report.notFoundRate})`);

  console.log('\nBy Category:');
  Object.entries(report.byCategory).forEach(([cat, s]) => {
    console.log(` ${cat}: ${s.avg}% (${s.count} questions)`);
  });

  console.log('\nBy Difficulty:');
  Object.entries(report.byDifficulty).forEach(([diff, s]) => {
    console.log(` ${diff}: ${s.avg}% (${s.count} questions)`);
  });

  // Worst performers (sort a copy so the report keeps question order).
  const ascending = [...report.results].sort((a, b) => a.composite - b.composite);
  console.log('\nWeakest Questions:');
  ascending.slice(0, 5).forEach((w) => {
    console.log(` [${w.id}] ${w.composite}% — ${w.question.substring(0, 80)}...`);
  });
}

// CLI entry point: node eval.js <docs-dir> <questions.json> [output.json]
if (require.main === module) {
  const [, , docsDir, questionsPath, outArg] = process.argv;
  const outPath = outArg || './eval-report.json';

  if (!docsDir || !questionsPath) {
    console.error('Usage: node eval.js <docs-dir> <questions.json> [output.json]');
    process.exit(1);
  }

  const model = process.env.LLM_MODEL || 'claude-haiku-4.5';
  console.log(`Using model: ${model}`);

  const main = async () => {
    const report = await runEval(docsDir, questionsPath, { model });
    printReport(report);

    fs.writeFileSync(outPath, JSON.stringify(report, null, 2));
    console.log(`\nFull report: ${outPath}`);
  };

  // Any failure (eval, printing, or writing the report) ends with exit code 1.
  main().catch((err) => {
    console.error('Eval failed:', err);
    process.exit(1);
  });
}
|
||||
|
||||
// Programmatic API: the full eval run plus its doc-loading and per-question
// LLM phases. buildContext is not part of the exported surface.
module.exports = { runEval, loadDocs, answerFromDocs, scoreAnswer };
|
||||
Reference in New Issue
Block a user