Phase 9: Doc Evaluation Harness

- eval-questions.js: Generates ground-truth questions from raw source data
- eval.js: LLM-as-judge scoring harness (answers from docs, scores against truth)
- Generated 33 questions covering config, dependencies, resources, and interactions
- Baseline score: 66.7% (configuration 93%, dependencies 77%, structural 31%)

2026-03-09 22:32:41 +00:00
parent d9fa087e22
commit b99341e8bc
4 changed files with 1680 additions and 0 deletions
--- a/eval.js
+++ b/eval.js
@@ -0,0 +1,275 @@
+/**
+ * Eval Harness: Doc Quality Scorer
+ * 
+ * Scores generated documentation against ground-truth questions.
+ * Two-phase: (1) LLM answers questions using only docs, (2) LLM-as-judge scores accuracy.
+ * 
+ * Usage: node eval.js <docs-dir> <questions.json> [output.json]
+ */
+
+const fs = require('fs');
+const path = require('path');
+const { callLLM } = require('./prose.js');
+
+/**
+ * Collect Markdown files found under a directory tree.
+ *
+ * Sub-directories are descended into recursively. Only entries whose name ends
+ * in `.md` are read. A file that would push the running character total past
+ * `maxChars` is skipped (later, smaller files may still be accepted).
+ * Directories or files that cannot be read are skipped without raising.
+ *
+ * @param {string} docsDir - Root directory to scan.
+ * @param {number} [maxChars=200000] - Upper bound on the summed content length.
+ * @returns {{path: string, content: string}[]} One entry per accepted file;
+ *   `path` is relative to `docsDir`.
+ */
+function loadDocs(docsDir, maxChars = 200000) {
+  const collected = [];
+  let budgetUsed = 0;
+
+  const visit = (currentDir) => {
+    let children;
+    try {
+      children = fs.readdirSync(currentDir, { withFileTypes: true });
+    } catch {
+      // Unreadable or missing directory: nothing to collect from it.
+      return;
+    }
+
+    children.forEach((child) => {
+      const fullPath = path.join(currentDir, child.name);
+      if (child.isDirectory()) {
+        visit(fullPath);
+        return;
+      }
+      if (!child.name.endsWith('.md')) return;
+
+      let text;
+      try {
+        text = fs.readFileSync(fullPath, 'utf8');
+      } catch {
+        // Unreadable file: skip it and keep scanning.
+        return;
+      }
+
+      // Over budget: leave this file out but keep looking at the rest.
+      if (budgetUsed + text.length > maxChars) return;
+
+      collected.push({ path: path.relative(docsDir, fullPath), content: text });
+      budgetUsed += text.length;
+    });
+  };
+
+  visit(docsDir);
+  return collected;
+}
+
+/**
+ * Phase 1: ask the LLM to answer one question from the supplied docs only.
+ *
+ * The prompt tells the model to reply "NOT_FOUND" when the documentation does
+ * not contain the answer. `maxTokens` (1024) and `temperature` (0.0) always
+ * override whatever `llmOpts` carries.
+ *
+ * @param {{question: string, answerType: string}} question - Question record.
+ * @param {string} docsContext - Documentation text embedded in the prompt.
+ * @param {object} llmOpts - Extra options forwarded to `callLLM`.
+ * @returns {Promise<*>} Whatever `callLLM` resolves to.
+ */
+async function answerFromDocs(question, docsContext, llmOpts) {
+  const { question: questionText, answerType } = question;
+
+  const prompt = `You are evaluating documentation quality. Answer the following question using ONLY the documentation provided below. If the documentation does not contain enough information to answer, say "NOT_FOUND".
+
+Be precise and factual. Match the expected answer format:
+- For "exact" answers: give the exact value
+- For "list" answers: list each item on its own line
+- For "ranked-list" answers: list items in order with counts
+- For "explanation" answers: give a concise explanation
+
+QUESTION: ${questionText}
+EXPECTED FORMAT: ${answerType}
+
+DOCUMENTATION:
+${docsContext}
+
+Answer:`;
+
+  const requestOpts = { ...llmOpts, maxTokens: 1024, temperature: 0.0 };
+  return callLLM(prompt, requestOpts);
+}
+
+/**
+ * Phase 2: ask an LLM judge to grade an answer against the question's ground truth.
+ *
+ * The judge is asked for a JSON object with `accuracy`, `completeness` and
+ * `precision` (0-5 each) plus `notes`. The first `{ ... }` span of the reply is
+ * parsed, so a reply wrapped in a markdown fence is still accepted. The three
+ * scores are coerced to finite numbers and clamped to 0-5, so callers can sum
+ * them safely even when the judge returns strings, omits a field, or goes out
+ * of range. If no JSON can be parsed, an all-zero score is returned with the
+ * start of the raw reply in `notes`.
+ *
+ * @param {{question: string, answerType: string, answer: *}} question - Question with ground truth.
+ * @param {string} llmAnswer - The answer produced in phase 1.
+ * @param {object} llmOpts - Extra options forwarded to `callLLM`;
+ *   `maxTokens` (256) and `temperature` (0.0) are always overridden.
+ * @returns {Promise<{accuracy: number, completeness: number, precision: number, notes: string}>}
+ */
+async function scoreAnswer(question, llmAnswer, llmOpts) {
+  const prompt = `You are a strict evaluator scoring an AI's answer against ground truth.
+
+QUESTION: ${question.question}
+EXPECTED ANSWER TYPE: ${question.answerType}
+GROUND TRUTH: ${question.answer}
+AI ANSWER: ${llmAnswer}
+
+Score the AI answer on these dimensions (0-5 each):
+1. ACCURACY: Does the answer contain the correct facts? (0=wrong, 3=partially correct, 5=exactly correct)
+2. COMPLETENESS: Does it cover all items in the ground truth? (0=missing everything, 3=partial, 5=complete)
+3. PRECISION: Is it free of hallucinated or incorrect extra information? (0=lots of hallucination, 5=no hallucination)
+
+If the AI answered "NOT_FOUND", score ACCURACY=0, COMPLETENESS=0, PRECISION=5 (it didn't hallucinate, it just couldn't find it).
+
+Respond in EXACTLY this JSON format, nothing else:
+{"accuracy": N, "completeness": N, "precision": N, "notes": "brief explanation"}`;
+
+  const raw = await callLLM(prompt, { ...llmOpts, maxTokens: 256, temperature: 0.0 });
+  // A non-string reply must not crash .match()/.substring() below.
+  const text = typeof raw === 'string' ? raw : (raw == null ? '' : String(raw));
+
+  // Coerce one rubric value to a finite number on the 0-5 scale.
+  const toScore = (value) => {
+    const num = Number(value);
+    if (!Number.isFinite(num)) return 0;
+    return Math.min(5, Math.max(0, num));
+  };
+
+  let parsed = null;
+  try {
+    // Extract JSON from response (handle markdown wrapping)
+    const jsonMatch = text.match(/\{[\s\S]*\}/);
+    if (jsonMatch) parsed = JSON.parse(jsonMatch[0]);
+  } catch {
+    // Malformed JSON: fall through to the parse-error result.
+  }
+
+  if (parsed) {
+    return {
+      ...parsed,
+      accuracy: toScore(parsed.accuracy),
+      completeness: toScore(parsed.completeness),
+      precision: toScore(parsed.precision),
+      notes: parsed.notes == null ? '' : String(parsed.notes),
+    };
+  }
+
+  return { accuracy: 0, completeness: 0, precision: 0, notes: `Parse error: ${text.substring(0, 100)}` };
+}
+
+/**
+ * Build a focused documentation context for one question, so that the full
+ * doc set is not sent with every prompt.
+ *
+ * Keywords are the lower-cased, whitespace-separated words of the question
+ * with leading/trailing punctuation removed (so "timeout?" matches "timeout"),
+ * restricted to words longer than three characters that are not stop words,
+ * with duplicates dropped. Each doc scores the number of occurrences of each
+ * keyword, capped at 10 per keyword. Docs are appended in descending score
+ * order; a doc that would push the context past `maxChars` is skipped, and
+ * once more than three docs are included the first zero-score doc ends the scan.
+ *
+ * @param {{question: string}} question - Question record.
+ * @param {{path: string, content: string}[]} docs - Candidate documents.
+ * @param {number} [maxChars=50000] - Approximate cap on the context length.
+ * @returns {string} Concatenated `--- path ---` sections, or '' if nothing fits.
+ */
+function buildContext(question, docs, maxChars = 50000) {
+  const stopWords = new Set(['what', 'which', 'does', 'that', 'this', 'from', 'with', 'list', 'them', 'their', 'each', 'many', 'most', 'across']);
+
+  // Keywords from the question. Punctuation is trimmed only at the edges so
+  // tokens such as "eval.js" or "max-retries" stay intact.
+  const keywords = [...new Set(
+    question.question.toLowerCase().split(/\s+/)
+      .map(w => w.replace(/^[^\p{L}\p{N}_]+|[^\p{L}\p{N}_]+$/gu, ''))
+      .filter(w => w.length > 3 && !stopWords.has(w))
+  )];
+
+  // Compile each keyword once instead of once per document. String#match with
+  // a global regex starts from index 0 on every call, so reuse is safe.
+  const patterns = keywords.map(kw => new RegExp(kw.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'), 'g'));
+
+  // Score each doc by keyword relevance
+  const scored = docs.map(d => {
+    const lower = d.content.toLowerCase();
+    let score = 0;
+    for (const re of patterns) {
+      const hits = lower.match(re);
+      score += Math.min(hits ? hits.length : 0, 10); // cap per keyword
+    }
+    return { ...d, score };
+  });
+
+  // Take top docs up to ~maxChars chars
+  scored.sort((a, b) => b.score - a.score);
+  let context = '';
+  let used = 0;
+  for (const d of scored) {
+    // Always allow a few docs even without keyword hits, then stop at the
+    // first irrelevant one (the list is sorted, so the rest are irrelevant too).
+    if (d.score === 0 && used > 3) break;
+    if (context.length + d.content.length > maxChars) continue;
+    context += `\n--- ${d.path} ---\n${d.content}\n`;
+    used++;
+  }
+
+  return context;
+}
+
+/**
+ * Group results by one field and report the mean composite score per group.
+ *
+ * @param {object[]} results - Per-question result records with a numeric `composite`.
+ * @param {string} key - Field to group on (e.g. 'category').
+ * @returns {Object<string, {avg: string, count: number}>} `avg` is formatted to one decimal.
+ */
+function summarizeBy(results, key) {
+  const buckets = {};
+  for (const r of results) {
+    const name = r[key];
+    if (!buckets[name]) buckets[name] = { total: 0, count: 0 };
+    buckets[name].total += r.composite;
+    buckets[name].count++;
+  }
+  return Object.fromEntries(
+    Object.entries(buckets).map(([name, s]) => [name, { avg: (s.total / s.count).toFixed(1), count: s.count }])
+  );
+}
+
+/**
+ * Run the full evaluation: for every question build a focused context, have
+ * the LLM answer from the docs, have the LLM judge score the answer, then
+ * aggregate the scores.
+ *
+ * Questions are processed sequentially and progress is written to stdout.
+ * A failure in either LLM phase is recorded on that question (an "ERROR: ..."
+ * answer or an all-zero score) instead of aborting the run. Score fields are
+ * coerced to finite numbers before summing, and an empty question list yields
+ * zeroed aggregates rather than NaN.
+ *
+ * @param {string} docsDir - Directory of generated Markdown docs.
+ * @param {string} questionsPath - JSON file shaped like `{ "questions": [...] }`.
+ * @param {object} [llmOpts={}] - Options forwarded to both LLM phases.
+ * @returns {Promise<object>} Report with overall, per-category and
+ *   per-difficulty scores plus the per-question `results`.
+ * @throws {Error} If the file cannot be read/parsed or has no `questions` array.
+ */
+async function runEval(docsDir, questionsPath, llmOpts = {}) {
+  const questionsData = JSON.parse(fs.readFileSync(questionsPath, 'utf8'));
+  const questions = questionsData ? questionsData.questions : undefined;
+  if (!Array.isArray(questions)) {
+    throw new Error(`${questionsPath} must contain a "questions" array`);
+  }
+  const docs = loadDocs(docsDir);
+
+  console.log(`Loaded ${docs.length} doc files, ${questions.length} questions`);
+
+  // Non-numeric values from the judge count as 0 so that `+=` never turns
+  // into string concatenation or NaN.
+  const toNumber = (value) => {
+    const num = Number(value);
+    return Number.isFinite(num) ? num : 0;
+  };
+  // Avoid NaN aggregates when there are no questions.
+  const ratio = (total, count) => (count === 0 ? 0 : total / count);
+
+  const results = [];
+  let totalAccuracy = 0;
+  let totalCompleteness = 0;
+  let totalPrecision = 0;
+  let notFound = 0;
+
+  for (let i = 0; i < questions.length; i++) {
+    const q = questions[i];
+    process.stdout.write(`[${i + 1}/${questions.length}] ${q.id}...`);
+
+    // Build focused context
+    const context = buildContext(q, docs);
+
+    // Phase 1: Answer from docs
+    let llmAnswer;
+    try {
+      llmAnswer = await answerFromDocs(q, context, llmOpts);
+    } catch (err) {
+      llmAnswer = `ERROR: ${err.message}`;
+    }
+    // A non-string reply must not crash the NOT_FOUND check below.
+    if (typeof llmAnswer !== 'string') llmAnswer = llmAnswer == null ? '' : String(llmAnswer);
+
+    // Phase 2: Score
+    let rawScore;
+    try {
+      rawScore = await scoreAnswer(q, llmAnswer, llmOpts);
+    } catch (err) {
+      rawScore = { accuracy: 0, completeness: 0, precision: 0, notes: `Score error: ${err.message}` };
+    }
+    const score = {
+      ...(rawScore && typeof rawScore === 'object' ? rawScore : {}),
+      accuracy: toNumber(rawScore && rawScore.accuracy),
+      completeness: toNumber(rawScore && rawScore.completeness),
+      precision: toNumber(rawScore && rawScore.precision),
+    };
+
+    const isNotFound = llmAnswer.includes('NOT_FOUND');
+    if (isNotFound) notFound++;
+
+    totalAccuracy += score.accuracy;
+    totalCompleteness += score.completeness;
+    totalPrecision += score.precision;
+
+    // Three dimensions at 0-5 each: 15 is the maximum per question.
+    const composite = ((score.accuracy + score.completeness + score.precision) / 15 * 100).toFixed(0);
+    console.log(` ${composite}% (A:${score.accuracy} C:${score.completeness} P:${score.precision})${isNotFound ? ' [NOT_FOUND]' : ''}`);
+
+    results.push({
+      id: q.id,
+      category: q.category,
+      difficulty: q.difficulty,
+      question: q.question,
+      groundTruth: q.answer,
+      llmAnswer,
+      score,
+      composite: Number(composite),
+      notFound: isNotFound,
+    });
+  }
+
+  // Aggregate scores
+  const n = questions.length;
+  const avgAccuracy = ratio(totalAccuracy, n).toFixed(2);
+  const avgCompleteness = ratio(totalCompleteness, n).toFixed(2);
+  const avgPrecision = ratio(totalPrecision, n).toFixed(2);
+  const overallScore = (ratio(totalAccuracy + totalCompleteness + totalPrecision, n * 15) * 100).toFixed(1);
+
+  const report = {
+    timestamp: new Date().toISOString(),
+    docsDir,
+    questionsFile: questionsPath,
+    totalQuestions: n,
+    overallScore: Number(overallScore),
+    avgAccuracy: Number(avgAccuracy),
+    avgCompleteness: Number(avgCompleteness),
+    avgPrecision: Number(avgPrecision),
+    notFoundCount: notFound,
+    notFoundRate: (ratio(notFound, n) * 100).toFixed(1) + '%',
+    byCategory: summarizeBy(results, 'category'),
+    byDifficulty: summarizeBy(results, 'difficulty'),
+    results,
+  };
+
+  return report;
+}
+
+// CLI entry point: runs only when this file is executed directly, not when it
+// is loaded through require().
+if (require.main === module) {
+  const [, , docsDir, questionsPath, outArg] = process.argv;
+  const outPath = outArg || './eval-report.json';
+
+  // Both positional arguments are mandatory; the output path is optional.
+  if (!docsDir || !questionsPath) {
+    console.error('Usage: node eval.js <docs-dir> <questions.json> [output.json]');
+    process.exit(1);
+  }
+
+  const model = process.env.LLM_MODEL || 'claude-haiku-4.5';
+  console.log(`Using model: ${model}`);
+
+  const divider = '═'.repeat(60);
+
+  // Print one "<label>: <avg>% (<count> questions)" line per group under a heading.
+  const printBreakdown = (heading, groups) => {
+    console.log(heading);
+    Object.entries(groups).forEach(([label, s]) => {
+      console.log(`  ${label}: ${s.avg}% (${s.count} questions)`);
+    });
+  };
+
+  // Run the evaluation, print a summary, then write the full report as JSON.
+  const main = async () => {
+    const report = await runEval(docsDir, questionsPath, { model });
+
+    console.log('\n' + divider);
+    console.log('EVAL REPORT');
+    console.log(divider);
+    console.log(`Overall Score: ${report.overallScore}%`);
+    console.log(`Accuracy: ${report.avgAccuracy}/5  Completeness: ${report.avgCompleteness}/5  Precision: ${report.avgPrecision}/5`);
+    console.log(`Not Found: ${report.notFoundCount}/${report.totalQuestions} (${report.notFoundRate})`);
+    printBreakdown('\nBy Category:', report.byCategory);
+    printBreakdown('\nBy Difficulty:', report.byDifficulty);
+
+    // Worst performers: the five lowest composite scores (sorted on a copy).
+    const ascending = [...report.results].sort((a, b) => a.composite - b.composite);
+    console.log('\nWeakest Questions:');
+    ascending.slice(0, 5).forEach((w) => {
+      console.log(`  [${w.id}] ${w.composite}% — ${w.question.substring(0, 80)}...`);
+    });
+
+    fs.writeFileSync(outPath, JSON.stringify(report, null, 2));
+    console.log(`\nFull report: ${outPath}`);
+  };
+
+  main().catch((err) => {
+    console.error('Eval failed:', err);
+    process.exit(1);
+  });
+}
+
+module.exports = { runEval, loadDocs, answerFromDocs, scoreAnswer };