/**
 * Eval Harness: Doc Quality Scorer
 *
 * Scores generated documentation against ground-truth questions.
 * Two-phase: (1) LLM answers questions using only docs, (2) LLM-as-judge scores accuracy.
 *
 * Usage: node eval.js <docsDir> <questions.json> [output.json]
 */

const fs = require('fs');
const path = require('path');
const { callLLM } = require('./prose.js');

/** Question words that carry no retrieval signal; dropped when extracting keywords. */
const STOP_WORDS = new Set([
  'what', 'which', 'does', 'that', 'this', 'from', 'with',
  'list', 'them', 'their', 'each', 'many', 'most', 'across',
]);

/** The three dimensions the judge scores, each on a 0..MAX_DIMENSION_SCORE scale. */
const SCORE_DIMENSIONS = ['accuracy', 'completeness', 'precision'];
const MAX_DIMENSION_SCORE = 5;
const MAX_TOTAL_SCORE = SCORE_DIMENSIONS.length * MAX_DIMENSION_SCORE;

/**
 * Recursively collect the .md files found under a directory.
 *
 * Best-effort: directories or files that cannot be read are reported on stderr
 * and skipped. A file that would push the running total past `maxChars` is
 * skipped as well, but later (smaller) files may still be included.
 *
 * @param {string} docsDir - Root directory to scan.
 * @param {number} [maxChars=200000] - Budget for the summed `content.length` of the returned docs.
 * @returns {{path: string, content: string}[]} Loaded docs; `path` is relative to `docsDir`.
 */
function loadDocs(docsDir, maxChars = 200000) {
  const docs = [];
  let totalChars = 0;

  function walk(dir) {
    let entries;
    try {
      entries = fs.readdirSync(dir, { withFileTypes: true });
    } catch (err) {
      console.warn(`Skipping unreadable directory ${dir}: ${err.message}`);
      return;
    }

    for (const entry of entries) {
      const filePath = path.join(dir, entry.name);
      if (entry.isDirectory()) {
        walk(filePath);
        continue;
      }
      if (!entry.name.endsWith('.md')) continue;

      let content;
      try {
        content = fs.readFileSync(filePath, 'utf8');
      } catch (err) {
        console.warn(`Skipping unreadable file ${filePath}: ${err.message}`);
        continue;
      }

      if (totalChars + content.length > maxChars) {
        console.warn(`Skipping ${filePath}: would exceed the ${maxChars}-char docs budget`);
        continue;
      }
      docs.push({ path: path.relative(docsDir, filePath), content });
      totalChars += content.length;
    }
  }

  walk(docsDir);
  return docs;
}

/**
 * Phase 1: have an LLM answer a question using only the supplied docs.
 *
 * @param {{question: string, answerType: string}} question - Question record; only these two fields are read here.
 * @param {string} docsContext - Documentation text embedded in the prompt.
 * @param {object} llmOpts - Options forwarded to `callLLM`; `maxTokens` and `temperature` are overridden.
 * @returns {Promise<*>} Whatever `callLLM` resolves to (presumably the model's answer text — verify against prose.js).
 */
async function answerFromDocs(question, docsContext, llmOpts) {
  // The prompt text is part of the eval's measured behavior. Segments are joined
  // with single spaces (plus one explicit newline), so edit it deliberately.
  const prompt = [
    'You are evaluating documentation quality.',
    'Answer the following question using ONLY the documentation provided below.',
    'If the documentation does not contain enough information to answer, say "NOT_FOUND".',
    'Be precise and factual.',
    '\nMatch the expected answer format:',
    '- For "exact" answers: give the exact value',
    '- For "list" answers: list each item on its own line',
    '- For "ranked-list" answers: list items in order with counts',
    '- For "explanation" answers: give a concise explanation',
    `QUESTION: ${question.question}`,
    `EXPECTED FORMAT: ${question.answerType}`,
    `DOCUMENTATION: ${docsContext}`,
    'Answer:',
  ].join(' ');

  return callLLM(prompt, { ...llmOpts, maxTokens: 1024, temperature: 0.0 });
}

/**
 * Extract and validate the judge's JSON verdict from its raw reply.
 *
 * @param {string} text - Raw judge output, possibly wrapped in markdown or prose.
 * @returns {object|null} The parsed object with every score dimension coerced to a
 *   number clamped to 0..MAX_DIMENSION_SCORE, or null when no usable verdict is present.
 */
function parseJudgeResponse(text) {
  // Greedy match from the first "{" to the last "}" tolerates markdown fences around the JSON.
  const jsonMatch = text.match(/\{[\s\S]*\}/);
  if (!jsonMatch) return null;

  let parsed;
  try {
    parsed = JSON.parse(jsonMatch[0]);
  } catch {
    return null;
  }
  if (parsed === null || typeof parsed !== 'object') return null;

  const normalized = { ...parsed };
  for (const dimension of SCORE_DIMENSIONS) {
    let value = parsed[dimension];
    // Accept numeric strings such as "4", but not empty strings (Number('') is 0).
    if (typeof value === 'string' && value.trim() !== '') value = Number(value);
    if (typeof value !== 'number' || !Number.isFinite(value)) return null;
    normalized[dimension] = Math.min(MAX_DIMENSION_SCORE, Math.max(0, value));
  }
  return normalized;
}

/**
 * Phase 2: LLM-as-judge scores the answer against ground truth.
 *
 * @param {{question: string, answerType: string, answer: *}} question - Question record with its ground truth.
 * @param {string} llmAnswer - The answer produced in phase 1.
 * @param {object} llmOpts - Options forwarded to `callLLM`; `maxTokens` and `temperature` are overridden.
 * @returns {Promise<{accuracy: number, completeness: number, precision: number, notes: string}>}
 *   The judge's verdict with scores in 0..5. When the reply cannot be parsed or a
 *   dimension is missing/non-numeric, all scores are 0 and `notes` starts with "Parse error:".
 */
async function scoreAnswer(question, llmAnswer, llmOpts) {
  // Same rule as in phase 1: the prompt text is behavior; segments are space-joined.
  const prompt = [
    "You are a strict evaluator scoring an AI's answer against ground truth.",
    `QUESTION: ${question.question}`,
    `EXPECTED ANSWER TYPE: ${question.answerType}`,
    `GROUND TRUTH: ${question.answer}`,
    `AI ANSWER: ${llmAnswer}`,
    'Score the AI answer on these dimensions (0-5 each):',
    '1. ACCURACY: Does the answer contain the correct facts? (0=wrong, 3=partially correct, 5=exactly correct)',
    '2. COMPLETENESS: Does it cover all items in the ground truth? (0=missing everything, 3=partial, 5=complete)',
    '3. PRECISION: Is it free of hallucinated or incorrect extra information? (0=lots of hallucination, 5=no hallucination)',
    `If the AI answered "NOT_FOUND", score ACCURACY=0, COMPLETENESS=0, PRECISION=5 (it didn't hallucinate, it just couldn't find it).`,
    '\nRespond in EXACTLY this JSON format, nothing else:',
    '{"accuracy": N, "completeness": N, "precision": N, "notes": "brief explanation"}',
  ].join(' ');

  const raw = await callLLM(prompt, { ...llmOpts, maxTokens: 256, temperature: 0.0 });
  // callLLM's return type is not visible from this file; coerce so string ops below cannot throw.
  const text = typeof raw === 'string' ? raw : String(raw === null || raw === undefined ? '' : raw);

  const verdict = parseJudgeResponse(text);
  if (verdict) return verdict;

  return {
    accuracy: 0,
    completeness: 0,
    precision: 0,
    notes: `Parse error: ${text.substring(0, 100)}`,
  };
}

/**
 * Turn question text into lowercase search keywords.
 *
 * Quotes/brackets are stripped from the start of each word and sentence
 * punctuation from its end, so "tokens?" or "(default)" can match doc text.
 * Words of three characters or fewer and stop words are dropped.
 *
 * @param {string} text - The question text.
 * @returns {string[]} Keywords in order of appearance (duplicates are kept).
 */
function extractKeywords(text) {
  return text
    .toLowerCase()
    .split(/\s+/)
    .map((word) => word.replace(/^["'`(\[{]+|[?!.,;:"'`)\]}]+$/g, ''))
    .filter((word) => word.length > 3 && !STOP_WORDS.has(word));
}

/**
 * Build a focused doc context for a question (avoids sending every doc with every question).
 *
 * Docs are ranked by how often the question's keywords occur in them (each
 * keyword contributes at most 10 per doc) and concatenated, best first, under
 * `--- <path> ---` headers.
 *
 * @param {{question: string}} question - Question record.
 * @param {{path: string, content: string}[]} docs - Docs as returned by `loadDocs`.
 * @param {number} [maxChars=50000] - Approximate size budget for the returned context.
 * @returns {string} The concatenated context (may be empty).
 */
function buildContext(question, docs, maxChars = 50000) {
  const keywords = extractKeywords(question.question);

  const ranked = docs.map((doc) => {
    const lower = doc.content.toLowerCase();
    let score = 0;
    for (const keyword of keywords) {
      // split() yields one more piece than there are non-overlapping occurrences.
      const occurrences = lower.split(keyword).length - 1;
      score += Math.min(occurrences, 10); // cap per keyword
    }
    return { ...doc, score };
  });
  ranked.sort((a, b) => b.score - a.score);

  let context = '';
  let used = 0;
  for (const doc of ranked) {
    // Irrelevant docs are only used as filler until a handful of docs are included.
    if (doc.score === 0 && used > 3) break;
    // Too big for the remaining budget: skip it, a smaller doc further down may still fit.
    if (context.length + doc.content.length > maxChars) continue;
    context += `\n--- ${doc.path} ---\n${doc.content}\n`;
    used++;
  }
  return context;
}

/**
 * Average the composite score of results grouped by one of their fields.
 *
 * @param {object[]} results - Per-question results carrying a numeric `composite`.
 * @param {string} key - Field to group by (e.g. 'category').
 * @returns {Object<string, {avg: string, count: number}>} Per-group average (one decimal, as a string) and size.
 */
function averageCompositeBy(results, key) {
  const groups = new Map();
  for (const result of results) {
    const name = String(result[key]);
    const group = groups.get(name) || { total: 0, count: 0 };
    group.total += result.composite;
    group.count += 1;
    groups.set(name, group);
  }
  return Object.fromEntries(
    [...groups].map(([name, group]) => [
      name,
      { avg: (group.total / group.count).toFixed(1), count: group.count },
    ])
  );
}

/**
 * Run the full evaluation: answer every question from the docs, judge each
 * answer, and aggregate the scores. Progress is printed to stdout.
 *
 * @param {string} docsDir - Directory containing the generated .md documentation.
 * @param {string} questionsPath - JSON file with a top-level `questions` array.
 * @param {object} [llmOpts={}] - Options forwarded to both LLM phases.
 * @returns {Promise<object>} Report with overall/average scores, NOT_FOUND stats,
 *   per-category and per-difficulty breakdowns, and the per-question `results`.
 * @throws {Error} If the questions file cannot be read or parsed, or has no `questions` array.
 */
async function runEval(docsDir, questionsPath, llmOpts = {}) {
  const questionsData = JSON.parse(fs.readFileSync(questionsPath, 'utf8'));
  const questions = questionsData ? questionsData.questions : undefined;
  if (!Array.isArray(questions)) {
    throw new Error(`${questionsPath} must contain a top-level "questions" array`);
  }

  const docs = loadDocs(docsDir);
  console.log(`Loaded ${docs.length} doc files, ${questions.length} questions`);

  const results = [];
  let totalAccuracy = 0;
  let totalCompleteness = 0;
  let totalPrecision = 0;
  let notFound = 0;

  for (let i = 0; i < questions.length; i++) {
    const q = questions[i];
    process.stdout.write(`[${i + 1}/${questions.length}] ${q.id}...`);

    // Build focused context
    const context = buildContext(q, docs);

    // Phase 1: Answer from docs. A failure is recorded as the answer so the run continues.
    let llmAnswer;
    try {
      llmAnswer = await answerFromDocs(q, context, llmOpts);
    } catch (err) {
      llmAnswer = `ERROR: ${err.message}`;
    }
    if (typeof llmAnswer !== 'string') {
      llmAnswer = llmAnswer === null || llmAnswer === undefined ? '' : String(llmAnswer);
    }

    // Phase 2: Score
    let score;
    try {
      score = await scoreAnswer(q, llmAnswer, llmOpts);
    } catch (err) {
      score = { accuracy: 0, completeness: 0, precision: 0, notes: `Score error: ${err.message}` };
    }

    const isNotFound = llmAnswer.includes('NOT_FOUND');
    if (isNotFound) notFound++;

    totalAccuracy += score.accuracy;
    totalCompleteness += score.completeness;
    totalPrecision += score.precision;

    const composite = (
      (score.accuracy + score.completeness + score.precision) / MAX_TOTAL_SCORE * 100
    ).toFixed(0);
    console.log(
      ` ${composite}% (A:${score.accuracy} C:${score.completeness} P:${score.precision})${isNotFound ? ' [NOT_FOUND]' : ''}`
    );

    results.push({
      id: q.id,
      category: q.category,
      difficulty: q.difficulty,
      question: q.question,
      groundTruth: q.answer,
      llmAnswer,
      score,
      composite: Number(composite),
      notFound: isNotFound,
    });
  }

  // Aggregate scores. With zero questions every total is 0, so dividing by 1
  // reports 0 instead of NaN.
  const n = questions.length;
  const divisor = n === 0 ? 1 : n;
  const avgAccuracy = (totalAccuracy / divisor).toFixed(2);
  const avgCompleteness = (totalCompleteness / divisor).toFixed(2);
  const avgPrecision = (totalPrecision / divisor).toFixed(2);
  const overallScore = (
    (totalAccuracy + totalCompleteness + totalPrecision) / (divisor * MAX_TOTAL_SCORE) * 100
  ).toFixed(1);

  const report = {
    timestamp: new Date().toISOString(),
    docsDir,
    questionsFile: questionsPath,
    totalQuestions: n,
    overallScore: Number(overallScore),
    avgAccuracy: Number(avgAccuracy),
    avgCompleteness: Number(avgCompleteness),
    avgPrecision: Number(avgPrecision),
    notFoundCount: notFound,
    notFoundRate: ((notFound / divisor) * 100).toFixed(1) + '%',
    byCategory: averageCompositeBy(results, 'category'),
    byDifficulty: averageCompositeBy(results, 'difficulty'),
    results,
  };

  return report;
}

// CLI entry point: run the eval, print a summary, and write the full report as JSON.
if (require.main === module) {
  const docsDir = process.argv[2];
  const questionsPath = process.argv[3];
  const outPath = process.argv[4] || './eval-report.json';

  if (!docsDir || !questionsPath) {
    console.error('Usage: node eval.js <docsDir> <questions.json> [output.json]');
    process.exit(1);
  }

  const model = process.env.LLM_MODEL || 'claude-haiku-4.5';
  console.log(`Using model: ${model}`);

  (async () => {
    try {
      const report = await runEval(docsDir, questionsPath, { model });

      console.log('\n' + '═'.repeat(60));
      console.log('EVAL REPORT');
      console.log('═'.repeat(60));
      console.log(`Overall Score: ${report.overallScore}%`);
      console.log(`Accuracy: ${report.avgAccuracy}/5 Completeness: ${report.avgCompleteness}/5 Precision: ${report.avgPrecision}/5`);
      console.log(`Not Found: ${report.notFoundCount}/${report.totalQuestions} (${report.notFoundRate})`);

      console.log('\nBy Category:');
      for (const [cat, s] of Object.entries(report.byCategory)) {
        console.log(` ${cat}: ${s.avg}% (${s.count} questions)`);
      }

      console.log('\nBy Difficulty:');
      for (const [diff, s] of Object.entries(report.byDifficulty)) {
        console.log(` ${diff}: ${s.avg}% (${s.count} questions)`);
      }

      // Worst performers (sort a copy so the report keeps question order)
      const worst = [...report.results].sort((a, b) => a.composite - b.composite).slice(0, 5);
      console.log('\nWeakest Questions:');
      for (const w of worst) {
        console.log(` [${w.id}] ${w.composite}% — ${w.question.substring(0, 80)}...`);
      }

      fs.writeFileSync(outPath, JSON.stringify(report, null, 2));
      console.log(`\nFull report: ${outPath}`);
    } catch (err) {
      console.error('Eval failed:', err);
      process.exit(1);
    }
  })();
}

module.exports = { runEval, loadDocs, answerFromDocs, scoreAnswer };