// Repository viewer header (not part of the program): Files · 276 lines · 9.6 KiB · JavaScript
// Raw | Permalink | Normal View | History

/**
* Eval Harness: Doc Quality Scorer
*
* Scores generated documentation against ground-truth questions.
* Two-phase: (1) LLM answers questions using only docs, (2) LLM-as-judge scores accuracy.
*
* Usage: node eval.js <docs-dir> <questions.json> [output.json]
*/
const fs = require('fs');
const path = require('path');
const { callLLM } = require('./prose.js');
/**
 * Collect the Markdown files found beneath a directory, subject to a total size budget.
 *
 * Directories are descended into as soon as they are encountered, in the order
 * `fs.readdirSync` reports entries. Unreadable directories and files are ignored.
 *
 * @param {string} docsDir - Root directory to scan.
 * @param {number} [maxChars=200000] - Cap on the combined length of all returned contents.
 * @returns {{path: string, content: string}[]} One record per accepted `.md` file, with
 *   `path` expressed relative to `docsDir`.
 */
function loadDocs(docsDir, maxChars = 200000) {
  const collected = [];
  let budgetUsed = 0;

  const visit = (currentDir) => {
    let children;
    try {
      children = fs.readdirSync(currentDir, { withFileTypes: true });
    } catch {
      // Missing or unreadable directory: it simply contributes nothing.
      return;
    }

    children.forEach((child) => {
      const fullPath = path.join(currentDir, child.name);

      if (child.isDirectory()) {
        visit(fullPath);
        return;
      }
      if (!child.name.endsWith('.md')) {
        return;
      }

      try {
        const text = fs.readFileSync(fullPath, 'utf8');
        // A file that would overflow the budget is skipped, but scanning goes on
        // because a later, smaller file may still fit.
        const overBudget = budgetUsed + text.length > maxChars;
        if (overBudget) {
          return;
        }
        collected.push({ path: path.relative(docsDir, fullPath), content: text });
        budgetUsed += text.length;
      } catch {
        // Unreadable file: skip it and keep going.
      }
    });
  };

  visit(docsDir);
  return collected;
}
/**
 * Phase 1: ask the LLM to answer a single question using only the supplied documentation.
 *
 * @param {{question: string, answerType: string}} question - Question record; both fields
 *   are interpolated into the prompt.
 * @param {string} docsContext - Documentation text embedded in the prompt.
 * @param {object} llmOpts - Options forwarded to `callLLM`; `maxTokens` (1024) and
 *   `temperature` (0) always override whatever the caller supplied.
 * @returns {Promise<*>} Whatever `callLLM` resolves to.
 */
async function answerFromDocs(question, docsContext, llmOpts) {
  const promptLines = [
    'You are evaluating documentation quality. Answer the following question using ONLY the documentation provided below. If the documentation does not contain enough information to answer, say "NOT_FOUND".',
    'Be precise and factual. Match the expected answer format:',
    '- For "exact" answers: give the exact value',
    '- For "list" answers: list each item on its own line',
    '- For "ranked-list" answers: list items in order with counts',
    '- For "explanation" answers: give a concise explanation',
    `QUESTION: ${question.question}`,
    `EXPECTED FORMAT: ${question.answerType}`,
    'DOCUMENTATION:',
    // Interpolated (not passed raw) so that join() cannot turn null/undefined into ''.
    `${docsContext}`,
    'Answer:',
  ];
  const prompt = promptLines.join('\n');

  const requestOptions = { ...llmOpts, maxTokens: 1024, temperature: 0.0 };
  return callLLM(prompt, requestOptions);
}
/**
 * Coerce one judge-supplied dimension to a number in the 0-5 range.
 *
 * @param {*} value - Raw field from the judge's JSON.
 * @returns {number|null} The clamped score, or `null` when the value is not a finite
 *   number or numeric string.
 */
function clampJudgeScore(value) {
  const isNumericString = typeof value === 'string' && value.trim() !== '';
  if (typeof value !== 'number' && !isNumericString) return null;
  const num = Number(value);
  if (!Number.isFinite(num)) return null;
  return Math.min(5, Math.max(0, num));
}

/**
 * Extract and validate the judge's verdict from raw LLM output.
 *
 * @param {string} text - Raw judge response, possibly wrapped in markdown fences or prose.
 * @returns {{accuracy: number, completeness: number, precision: number, notes: string}|null}
 *   The validated verdict, or `null` when no usable JSON object with all three scores is found.
 */
function parseJudgeScores(text) {
  // Greedy match from the first "{" to the last "}" tolerates markdown wrapping.
  const jsonMatch = text.match(/\{[\s\S]*\}/);
  if (!jsonMatch) return null;

  let parsed;
  try {
    parsed = JSON.parse(jsonMatch[0]);
  } catch {
    return null;
  }
  if (parsed === null || typeof parsed !== 'object') return null;

  const accuracy = clampJudgeScore(parsed.accuracy);
  const completeness = clampJudgeScore(parsed.completeness);
  const precision = clampJudgeScore(parsed.precision);
  // A verdict missing any dimension is unusable; callers sum these fields.
  if (accuracy === null || completeness === null || precision === null) return null;

  const notes = typeof parsed.notes === 'string' ? parsed.notes : '';
  return { accuracy, completeness, precision, notes };
}

/**
 * Phase 2: LLM-as-judge scores an answer against the ground truth.
 *
 * The judge's reply is validated before being returned: each of `accuracy`,
 * `completeness` and `precision` must be a finite number (or numeric string) and is
 * clamped to 0-5, so downstream arithmetic never sees NaN, strings or out-of-range values.
 *
 * @param {{question: string, answerType: string, answer: *}} question - Question record;
 *   all three fields are interpolated into the judge prompt.
 * @param {string} llmAnswer - The answer produced in phase 1.
 * @param {object} llmOpts - Options forwarded to `callLLM`; `maxTokens` (256) and
 *   `temperature` (0) always override whatever the caller supplied.
 * @returns {Promise<{accuracy: number, completeness: number, precision: number, notes: string}>}
 *   The validated verdict, or all-zero scores with a `Parse error: …` note (first 100
 *   characters of the reply) when the reply cannot be used.
 */
async function scoreAnswer(question, llmAnswer, llmOpts) {
  const prompt = `You are a strict evaluator scoring an AI's answer against ground truth.
QUESTION: ${question.question}
EXPECTED ANSWER TYPE: ${question.answerType}
GROUND TRUTH: ${question.answer}
AI ANSWER: ${llmAnswer}
Score the AI answer on these dimensions (0-5 each):
1. ACCURACY: Does the answer contain the correct facts? (0=wrong, 3=partially correct, 5=exactly correct)
2. COMPLETENESS: Does it cover all items in the ground truth? (0=missing everything, 3=partial, 5=complete)
3. PRECISION: Is it free of hallucinated or incorrect extra information? (0=lots of hallucination, 5=no hallucination)
If the AI answered "NOT_FOUND", score ACCURACY=0, COMPLETENESS=0, PRECISION=5 (it didn't hallucinate, it just couldn't find it).
Respond in EXACTLY this JSON format, nothing else:
{"accuracy": N, "completeness": N, "precision": N, "notes": "brief explanation"}`;

  const raw = await callLLM(prompt, { ...llmOpts, maxTokens: 256, temperature: 0.0 });
  // Normalise once so both parsing and the fallback note are safe for non-string replies.
  const text = raw == null ? '' : String(raw);

  const verdict = parseJudgeScores(text);
  if (verdict) return verdict;

  return { accuracy: 0, completeness: 0, precision: 0, notes: `Parse error: ${text.substring(0, 100)}` };
}
/** Common question words that carry no retrieval signal. */
const CONTEXT_STOP_WORDS = new Set([
  'what', 'which', 'does', 'that', 'this', 'from', 'with',
  'list', 'them', 'their', 'each', 'many', 'most', 'across',
]);

/** Matches runs of non-letter, non-digit characters at either end of a token. */
const EDGE_PUNCTUATION = /^[^\p{L}\p{N}]+|[^\p{L}\p{N}]+$/gu;

/**
 * Count non-overlapping occurrences of `needle` in `haystack`, stopping at `cap`.
 *
 * @param {string} haystack - Text to search.
 * @param {string} needle - Non-empty substring to look for.
 * @param {number} cap - Maximum count to report.
 * @returns {number} Number of occurrences, at most `cap`.
 */
function countOccurrences(haystack, needle, cap) {
  let count = 0;
  let at = haystack.indexOf(needle);
  while (at !== -1 && count < cap) {
    count++;
    at = haystack.indexOf(needle, at + needle.length);
  }
  return count;
}

/**
 * Build a focused documentation context for one question, so that the full corpus is
 * not sent with every prompt.
 *
 * Keywords are the question's lower-cased words with surrounding punctuation removed
 * (so "metrics?" matches "metrics"), longer than three characters, minus stop words,
 * de-duplicated. Each document scores the sum over keywords of its occurrence count,
 * capped at 10 per keyword. Documents are then appended in descending score order.
 *
 * @param {{question: string}} question - Question record; only `question` is read.
 * @param {{path: string, content: string}[]} docs - Candidate documents (not mutated).
 * @param {number} [maxChars=50000] - Approximate cap on the context length; a document
 *   that would push the context past it is skipped in favour of later, smaller ones.
 * @returns {string} Concatenated `--- path ---` sections, or `''` when nothing fits.
 */
function buildContext(question, docs, maxChars = 50000) {
  const tokens = question.question
    .toLowerCase()
    .split(/\s+/)
    .map((word) => word.replace(EDGE_PUNCTUATION, ''))
    .filter((word) => word.length > 3 && !CONTEXT_STOP_WORDS.has(word));
  const keywords = [...new Set(tokens)];

  const scored = docs.map((doc) => {
    const haystack = doc.content.toLowerCase();
    let score = 0;
    for (const keyword of keywords) {
      score += countOccurrences(haystack, keyword, 10); // cap per keyword
    }
    return { ...doc, score };
  });

  // Array.prototype.sort is stable, so equally scored docs keep their input order.
  scored.sort((a, b) => b.score - a.score);

  let context = '';
  let used = 0;
  for (const doc of scored) {
    // Irrelevant docs are only used as padding until more than three docs are included.
    if (doc.score === 0 && used > 3) break;
    if (context.length + doc.content.length > maxChars) continue;
    context += `\n--- ${doc.path} ---\n${doc.content}\n`;
    used++;
  }
  return context;
}
/**
 * Coerce a judge score to a finite number, treating anything unusable as 0 so that a
 * single malformed verdict cannot turn the aggregate totals into NaN.
 *
 * @param {*} value - Raw score field.
 * @returns {number} The numeric score, or 0.
 */
function finiteScoreOrZero(value) {
  const num = typeof value === 'string' && value.trim() !== '' ? Number(value) : value;
  return typeof num === 'number' && Number.isFinite(num) ? num : 0;
}

/**
 * Average the `composite` of results grouped by one of their fields.
 *
 * @param {object[]} results - Per-question result records.
 * @param {string} key - Field to group by (e.g. `'category'`).
 * @returns {Object<string, {avg: string, count: number}>} Per-group average (one decimal,
 *   as a string) and question count.
 */
function groupAverages(results, key) {
  // A Map avoids collisions with Object.prototype names such as "constructor".
  const groups = new Map();
  for (const result of results) {
    const name = String(result[key]);
    const group = groups.get(name) || { total: 0, count: 0 };
    group.total += result.composite;
    group.count++;
    groups.set(name, group);
  }
  return Object.fromEntries(
    [...groups].map(([name, g]) => [name, { avg: (g.total / g.count).toFixed(1), count: g.count }]),
  );
}

/**
 * Run the full evaluation: for every question, build a focused context, have the LLM
 * answer from it, have the LLM judge the answer, then aggregate the scores.
 *
 * Questions are processed sequentially. A failure in either LLM phase is recorded on
 * that question (as an `ERROR: …` answer or an all-zero score) rather than aborting.
 * An empty question list yields zeroed aggregates.
 *
 * @param {string} docsDir - Directory of generated documentation.
 * @param {string} questionsPath - Path to a JSON file of the form `{ "questions": [...] }`.
 * @param {object} [llmOpts={}] - Options forwarded to both LLM phases.
 * @returns {Promise<object>} Report with overall and per-dimension averages, not-found
 *   statistics, `byCategory` / `byDifficulty` breakdowns and the per-question `results`.
 * @throws {Error} If the questions file cannot be read or parsed, or has no `questions` array.
 */
async function runEval(docsDir, questionsPath, llmOpts = {}) {
  const questionsData = JSON.parse(fs.readFileSync(questionsPath, 'utf8'));
  const questions = questionsData ? questionsData.questions : undefined;
  if (!Array.isArray(questions)) {
    throw new Error(`Expected a "questions" array in ${questionsPath}`);
  }

  const docs = loadDocs(docsDir);
  console.log(`Loaded ${docs.length} doc files, ${questions.length} questions`);

  const results = [];
  let totalAccuracy = 0;
  let totalCompleteness = 0;
  let totalPrecision = 0;
  let notFound = 0;

  for (let i = 0; i < questions.length; i++) {
    const q = questions[i];
    process.stdout.write(`[${i + 1}/${questions.length}] ${q.id}...`);

    // Build focused context
    const context = buildContext(q, docs);

    // Phase 1: Answer from docs
    let llmAnswer;
    try {
      const reply = await answerFromDocs(q, context, llmOpts);
      // Keep the answer a string so the NOT_FOUND check below cannot throw.
      llmAnswer = reply == null ? '' : String(reply);
    } catch (err) {
      llmAnswer = `ERROR: ${err.message}`;
    }

    // Phase 2: Score
    let judged;
    try {
      judged = await scoreAnswer(q, llmAnswer, llmOpts);
    } catch (err) {
      judged = { accuracy: 0, completeness: 0, precision: 0, notes: `Score error: ${err.message}` };
    }
    const score = {
      ...judged,
      accuracy: finiteScoreOrZero(judged && judged.accuracy),
      completeness: finiteScoreOrZero(judged && judged.completeness),
      precision: finiteScoreOrZero(judged && judged.precision),
    };

    const isNotFound = llmAnswer.includes('NOT_FOUND');
    if (isNotFound) notFound++;

    totalAccuracy += score.accuracy;
    totalCompleteness += score.completeness;
    totalPrecision += score.precision;

    const composite = ((score.accuracy + score.completeness + score.precision) / 15 * 100).toFixed(0);
    console.log(` ${composite}% (A:${score.accuracy} C:${score.completeness} P:${score.precision})${isNotFound ? ' [NOT_FOUND]' : ''}`);

    results.push({
      id: q.id,
      category: q.category,
      difficulty: q.difficulty,
      question: q.question,
      groundTruth: q.answer,
      llmAnswer,
      score,
      composite: Number(composite),
      notFound: isNotFound,
    });
  }

  // Aggregate scores; guard the divisions so an empty run reports zeros, not NaN.
  const n = questions.length;
  const perQuestion = (total) => (n === 0 ? 0 : total / n);
  const totalPoints = totalAccuracy + totalCompleteness + totalPrecision;
  const overallScore = n === 0 ? 0 : (totalPoints / (n * 15)) * 100;

  return {
    timestamp: new Date().toISOString(),
    docsDir,
    questionsFile: questionsPath,
    totalQuestions: n,
    overallScore: Number(overallScore.toFixed(1)),
    avgAccuracy: Number(perQuestion(totalAccuracy).toFixed(2)),
    avgCompleteness: Number(perQuestion(totalCompleteness).toFixed(2)),
    avgPrecision: Number(perQuestion(totalPrecision).toFixed(2)),
    notFoundCount: notFound,
    notFoundRate: `${(perQuestion(notFound) * 100).toFixed(1)}%`,
    byCategory: groupAverages(results, 'category'),
    byDifficulty: groupAverages(results, 'difficulty'),
    results,
  };
}
// CLI entry point: executes only when this file is run directly, not when it is required.
if (require.main === module) {
  const [, , docsDir, questionsPath, outArg] = process.argv;
  const outPath = outArg || './eval-report.json';

  if (!(docsDir && questionsPath)) {
    console.error('Usage: node eval.js <docs-dir> <questions.json> [output.json]');
    process.exit(1);
  }

  const model = process.env.LLM_MODEL || 'claude-haiku-4.5';
  console.log(`Using model: ${model}`);

  // Print one "label: avg% (count questions)" line per group under a heading.
  const printBreakdown = (heading, groups) => {
    console.log(heading);
    Object.entries(groups).forEach(([label, stats]) => {
      console.log(` ${label}: ${stats.avg}% (${stats.count} questions)`);
    });
  };

  // Run the evaluation, print the summary, then persist the full report as JSON.
  const main = async () => {
    const report = await runEval(docsDir, questionsPath, { model });
    const rule = '═'.repeat(60);

    console.log(`\n${rule}`);
    console.log('EVAL REPORT');
    console.log(rule);
    console.log(`Overall Score: ${report.overallScore}%`);
    console.log(`Accuracy: ${report.avgAccuracy}/5 Completeness: ${report.avgCompleteness}/5 Precision: ${report.avgPrecision}/5`);
    console.log(`Not Found: ${report.notFoundCount}/${report.totalQuestions} (${report.notFoundRate})`);

    printBreakdown('\nBy Category:', report.byCategory);
    printBreakdown('\nBy Difficulty:', report.byDifficulty);

    // Worst performers: the five lowest composites, sorted on a copy to leave the report intact.
    const ascending = [...report.results].sort((a, b) => a.composite - b.composite);
    console.log('\nWeakest Questions:');
    ascending.slice(0, 5).forEach((entry) => {
      console.log(` [${entry.id}] ${entry.composite}% — ${entry.question.substring(0, 80)}...`);
    });

    fs.writeFileSync(outPath, JSON.stringify(report, null, 2));
    console.log(`\nFull report: ${outPath}`);
  };

  main().catch((err) => {
    console.error('Eval failed:', err);
    process.exit(1);
  });
}
module.exports = { runEval, loadDocs, answerFromDocs, scoreAnswer };