eval.js

/**
 * Eval Harness: Doc Quality Scorer
 * 
 * Scores generated documentation against ground-truth questions.
 * Two-phase: (1) LLM answers questions using only docs, (2) LLM-as-judge scores accuracy.
 * 
 * Usage: node eval.js <docs-dir> <questions.json> [output.json]
 */

const fs = require('fs');
const path = require('path');
const { callLLM } = require('./prose.js');

/**
 * Recursively collect the Markdown (`.md`) files under a directory.
 *
 * Entries are visited in sorted name order. `fs.readdirSync` is not documented
 * to return entries in any particular order, and because of the character
 * budget the visiting order decides which files are kept; sorting makes the
 * result reproducible.
 *
 * Loading is best-effort: an unreadable directory or file is reported with
 * `console.warn` and skipped rather than aborting the whole scan.
 *
 * @param {string} docsDir - Root directory to scan.
 * @param {number} [maxChars=200000] - Total character budget across all
 *   returned docs. A file that would push the running total past the budget is
 *   skipped; smaller files visited later may still be included.
 * @returns {Array<{path: string, content: string}>} One entry per loaded file,
 *   with `path` relative to `docsDir`. Empty when nothing could be read.
 */
function loadDocs(docsDir, maxChars = 200000) {
  const docs = [];
  let totalChars = 0;

  // Plain code-unit comparison: independent of the process locale.
  const byName = (a, b) => (a.name < b.name ? -1 : a.name > b.name ? 1 : 0);

  function walk(dir) {
    let entries;
    try {
      entries = fs.readdirSync(dir, { withFileTypes: true });
    } catch (err) {
      console.warn(`loadDocs: cannot read directory ${dir}: ${err.message}`);
      return;
    }

    entries.sort(byName);

    for (const entry of entries) {
      const fullPath = path.join(dir, entry.name);
      if (entry.isDirectory()) {
        walk(fullPath);
        continue;
      }
      if (!entry.name.endsWith('.md')) continue;

      let content;
      try {
        content = fs.readFileSync(fullPath, 'utf8');
      } catch (err) {
        console.warn(`loadDocs: cannot read file ${fullPath}: ${err.message}`);
        continue;
      }

      // Skip (rather than stop) so later, smaller files can still use the
      // remaining budget.
      if (totalChars + content.length > maxChars) continue;

      docs.push({ path: path.relative(docsDir, fullPath), content });
      totalChars += content.length;
    }
  }

  walk(docsDir);
  return docs;
}

/**
 * Phase 1: ask the LLM to answer a single question from the supplied docs only.
 *
 * The prompt tells the model to reply "NOT_FOUND" when the documentation is
 * insufficient, and to follow the answer format named by `question.answerType`.
 *
 * @param {{question: string, answerType: string}} question - Question record.
 * @param {string} docsContext - Documentation text embedded in the prompt.
 * @param {object} [llmOpts] - Options forwarded to callLLM; `maxTokens` and
 *   `temperature` are always overridden (1024 and 0).
 * @returns {Promise<*>} Whatever callLLM resolves to for this prompt.
 */
async function answerFromDocs(question, docsContext, llmOpts) {
  const formatRules = [
    '- For "exact" answers: give the exact value',
    '- For "list" answers: list each item on its own line',
    '- For "ranked-list" answers: list items in order with counts',
    '- For "explanation" answers: give a concise explanation',
  ];

  // Each array element is one prompt line; empty strings are blank lines.
  const promptLines = [
    'You are evaluating documentation quality. Answer the following question using ONLY the documentation provided below. If the documentation does not contain enough information to answer, say "NOT_FOUND".',
    '',
    'Be precise and factual. Match the expected answer format:',
    ...formatRules,
    '',
    `QUESTION: ${question.question}`,
    `EXPECTED FORMAT: ${question.answerType}`,
    '',
    'DOCUMENTATION:',
    `${docsContext}`,
    '',
    'Answer:',
  ];

  const requestOpts = { ...llmOpts, maxTokens: 1024, temperature: 0.0 };
  return callLLM(promptLines.join('\n'), requestOpts);
}

/**
 * Phase 2: ask an LLM judge to grade an answer against the ground truth.
 *
 * The judge is asked for a JSON object with three 0-5 dimensions plus notes.
 * The reply is validated before being returned: each dimension must be a finite
 * number (numeric strings are accepted) and is clamped to [0, 5]. Callers add
 * these values into running totals, so returning a missing or non-numeric
 * field would turn the aggregates into NaN.
 *
 * @param {{question: string, answerType: string, answer: *}} question -
 *   Question record including the ground-truth `answer`.
 * @param {string} llmAnswer - The answer to grade.
 * @param {object} [llmOpts] - Options forwarded to callLLM; `maxTokens` and
 *   `temperature` are always overridden (256 and 0).
 * @returns {Promise<{accuracy: number, completeness: number, precision: number, notes: string}>}
 *   The validated scores. When no usable JSON object is found, all three
 *   dimensions are 0 and `notes` starts with "Parse error: ".
 */
async function scoreAnswer(question, llmAnswer, llmOpts) {
  const prompt = `You are a strict evaluator scoring an AI's answer against ground truth.

QUESTION: ${question.question}
EXPECTED ANSWER TYPE: ${question.answerType}
GROUND TRUTH: ${question.answer}
AI ANSWER: ${llmAnswer}

Score the AI answer on these dimensions (0-5 each):
1. ACCURACY: Does the answer contain the correct facts? (0=wrong, 3=partially correct, 5=exactly correct)
2. COMPLETENESS: Does it cover all items in the ground truth? (0=missing everything, 3=partial, 5=complete)
3. PRECISION: Is it free of hallucinated or incorrect extra information? (0=lots of hallucination, 5=no hallucination)

If the AI answered "NOT_FOUND", score ACCURACY=0, COMPLETENESS=0, PRECISION=5 (it didn't hallucinate, it just couldn't find it).

Respond in EXACTLY this JSON format, nothing else:
{"accuracy": N, "completeness": N, "precision": N, "notes": "brief explanation"}`;

  const raw = await callLLM(prompt, { ...llmOpts, maxTokens: 256, temperature: 0.0 });

  // Guard against a non-string reply so the fallback below cannot throw.
  const text = typeof raw === 'string' ? raw : String(raw == null ? '' : raw);

  // Try the widest brace span first (tolerates braces inside "notes" and
  // markdown fences around the object); if that is not valid JSON because the
  // judge appended text containing "}", fall back to the first closed object.
  const candidates = [text.match(/\{[\s\S]*\}/), text.match(/\{[\s\S]*?\}/)];

  for (const candidate of candidates) {
    if (!candidate) continue;

    let parsed;
    try {
      parsed = JSON.parse(candidate[0]);
    } catch {
      continue; // not valid JSON; try the next candidate
    }
    if (parsed === null || typeof parsed !== 'object') continue;

    const dims = {};
    let valid = true;
    for (const key of ['accuracy', 'completeness', 'precision']) {
      const value = parsed[key];
      const num = typeof value === 'string' && value.trim() !== '' ? Number(value) : value;
      if (typeof num !== 'number' || !Number.isFinite(num)) {
        valid = false;
        break;
      }
      dims[key] = Math.min(5, Math.max(0, num));
    }
    if (!valid) continue;

    return {
      ...parsed,
      ...dims,
      notes: typeof parsed.notes === 'string' ? parsed.notes : '',
    };
  }

  return { accuracy: 0, completeness: 0, precision: 0, notes: `Parse error: ${text.substring(0, 100)}` };
}

/**
 * Build a focused documentation context for one question, so that the full
 * doc set is not sent with every prompt.
 *
 * Docs are ranked by how often the question's keywords occur in them
 * (case-insensitive substring matches, capped at 10 per keyword), then
 * concatenated in rank order until the character budget is reached.
 *
 * Keywords are the question's whitespace-separated words with leading and
 * trailing punctuation removed, longer than three characters, not in the
 * stop-word list, and de-duplicated. Stripping punctuation matters because
 * questions normally end in "?", which would otherwise stay glued to the last
 * word and prevent it from ever matching.
 *
 * @param {{question: string}} question - Question record.
 * @param {Array<{path: string, content: string}>} docs - Candidate documents.
 * @param {number} [maxChars=50000] - Approximate budget for the returned
 *   context; a doc that would exceed it is skipped.
 * @returns {string} Selected docs, each preceded by a `--- <path> ---` header.
 *   Empty when no doc fits.
 */
function buildContext(question, docs, maxChars = 50000) {
  const stopWords = new Set([
    'what', 'which', 'does', 'that', 'this', 'from', 'with', 'list',
    'them', 'their', 'each', 'many', 'most', 'across',
  ]);
  const edgePunctuation = /^[^\p{L}\p{N}]+|[^\p{L}\p{N}]+$/gu;

  const keywords = [...new Set(
    question.question.toLowerCase().split(/\s+/)
      .map((word) => word.replace(edgePunctuation, ''))
      .filter((word) => word.length > 3 && !stopWords.has(word))
  )];

  // Compile each keyword once instead of once per (doc, keyword) pair.
  // String.prototype.match resets lastIndex for global regexes, so reusing
  // the same RegExp object across docs is safe.
  const patterns = keywords.map(
    (kw) => new RegExp(kw.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'), 'g')
  );

  // Score each doc by keyword relevance.
  const scored = docs.map((doc) => {
    const lower = doc.content.toLowerCase();
    let score = 0;
    for (const pattern of patterns) {
      const count = (lower.match(pattern) || []).length;
      score += Math.min(count, 10); // cap so one repeated term cannot dominate
    }
    return { ...doc, score };
  });

  // Highest-scoring docs first; Array.prototype.sort is stable, so ties keep
  // their input order.
  scored.sort((a, b) => b.score - a.score);

  let context = '';
  let used = 0;
  for (const doc of scored) {
    // Docs with no keyword hits are only used as padding until four docs
    // have been included.
    if (doc.score === 0 && used > 3) break;
    if (context.length + doc.content.length > maxChars) continue;
    context += `\n--- ${doc.path} ---\n${doc.content}\n`;
    used++;
  }

  return context;
}

/**
 * Run the full evaluation: for every question, build a context, have the LLM
 * answer from the docs (phase 1), have the judge score it (phase 2), and
 * aggregate the results into a report.
 *
 * Questions are processed sequentially and progress is printed to stdout.
 *
 * Failure handling per question:
 * - If phase 1 throws, the question is recorded with an all-zero score and an
 *   `ERROR: …` answer, and the judge is NOT called: an infrastructure failure
 *   is not a model answer and must not be graded as one.
 * - If phase 2 throws, the question is recorded with an all-zero score.
 * - Score dimensions are coerced to finite numbers in [0, 5] (anything else
 *   becomes 0) so one malformed judge reply cannot turn the aggregates into NaN.
 *
 * @param {string} docsDir - Directory containing the generated docs.
 * @param {string} questionsPath - Path to a JSON file with a top-level
 *   `questions` array.
 * @param {object} [llmOpts={}] - Options forwarded to both LLM phases.
 * @returns {Promise<object>} Report with overall and per-dimension averages,
 *   NOT_FOUND statistics, `byCategory` / `byDifficulty` breakdowns and the
 *   per-question `results`. With zero questions all averages are 0.
 * @throws {SyntaxError} If the questions file is not valid JSON.
 * @throws {TypeError} If the file has no top-level `questions` array.
 */
async function runEval(docsDir, questionsPath, llmOpts = {}) {
  const questionsData = JSON.parse(fs.readFileSync(questionsPath, 'utf8'));
  const questions = questionsData ? questionsData.questions : undefined;
  if (!Array.isArray(questions)) {
    throw new TypeError(`${questionsPath}: expected a top-level "questions" array`);
  }
  const docs = loadDocs(docsDir);

  console.log(`Loaded ${docs.length} doc files, ${questions.length} questions`);

  // Coerce one judge-reported dimension to a finite number in [0, 5].
  const toDimension = (value) => {
    const num = typeof value === 'number' ? value : Number.parseFloat(value);
    return Number.isFinite(num) ? Math.min(5, Math.max(0, num)) : 0;
  };
  const zeroScore = (notes) => ({ accuracy: 0, completeness: 0, precision: 0, notes });

  const results = [];
  let totalAccuracy = 0;
  let totalCompleteness = 0;
  let totalPrecision = 0;
  let notFound = 0;

  for (let i = 0; i < questions.length; i++) {
    const q = questions[i];
    process.stdout.write(`[${i + 1}/${questions.length}] ${q.id}...`);

    // Build focused context
    const context = buildContext(q, docs);

    // Phase 1: Answer from docs
    let llmAnswer;
    let rawScore;
    let answered = true;
    try {
      const answer = await answerFromDocs(q, context, llmOpts);
      // Normalise to a string so the NOT_FOUND check below cannot throw.
      llmAnswer = typeof answer === 'string' ? answer : String(answer == null ? '' : answer);
    } catch (err) {
      answered = false;
      llmAnswer = `ERROR: ${err.message}`;
      rawScore = zeroScore(`Answer error: ${err.message}`);
    }

    // Phase 2: Score (skipped when phase 1 failed)
    if (answered) {
      try {
        rawScore = await scoreAnswer(q, llmAnswer, llmOpts);
      } catch (err) {
        rawScore = zeroScore(`Score error: ${err.message}`);
      }
    }

    const reported = rawScore || {};
    const score = {
      ...reported,
      accuracy: toDimension(reported.accuracy),
      completeness: toDimension(reported.completeness),
      precision: toDimension(reported.precision),
    };

    const isNotFound = answered && llmAnswer.includes('NOT_FOUND');
    if (isNotFound) notFound++;

    totalAccuracy += score.accuracy;
    totalCompleteness += score.completeness;
    totalPrecision += score.precision;

    const composite = ((score.accuracy + score.completeness + score.precision) / 15 * 100).toFixed(0);
    console.log(` ${composite}% (A:${score.accuracy} C:${score.completeness} P:${score.precision})${isNotFound ? ' [NOT_FOUND]' : ''}`);

    results.push({
      id: q.id,
      category: q.category,
      difficulty: q.difficulty,
      question: q.question,
      groundTruth: q.answer,
      llmAnswer,
      score,
      composite: Number(composite),
      notFound: isNotFound,
    });
  }

  // Aggregate scores; guard the empty case so the report never contains NaN.
  const n = questions.length;
  const perQuestion = (total) => (n === 0 ? 0 : total / n);
  const avgAccuracy = perQuestion(totalAccuracy).toFixed(2);
  const avgCompleteness = perQuestion(totalCompleteness).toFixed(2);
  const avgPrecision = perQuestion(totalPrecision).toFixed(2);
  const overallScore = (perQuestion(totalAccuracy + totalCompleteness + totalPrecision) / 15 * 100).toFixed(1);

  // Average composite per distinct value of `field`. A Map is used for the
  // buckets so group names like "constructor" cannot collide with
  // Object.prototype members.
  const breakdown = (field) => {
    const buckets = new Map();
    for (const r of results) {
      const key = String(r[field]);
      const bucket = buckets.get(key) || { total: 0, count: 0 };
      bucket.total += r.composite;
      bucket.count++;
      buckets.set(key, bucket);
    }
    return Object.fromEntries(
      [...buckets].map(([name, s]) => [name, { avg: (s.total / s.count).toFixed(1), count: s.count }])
    );
  };

  const report = {
    timestamp: new Date().toISOString(),
    docsDir,
    questionsFile: questionsPath,
    totalQuestions: n,
    overallScore: Number(overallScore),
    avgAccuracy: Number(avgAccuracy),
    avgCompleteness: Number(avgCompleteness),
    avgPrecision: Number(avgPrecision),
    notFoundCount: notFound,
    notFoundRate: (perQuestion(notFound) * 100).toFixed(1) + '%',
    byCategory: breakdown('category'),
    byDifficulty: breakdown('difficulty'),
    results,
  };

  return report;
}

// CLI entry point: runs only when this file is executed directly
// (`node eval.js <docs-dir> <questions.json> [output.json]`), not when required.
if (require.main === module) {
  const [, , docsDir, questionsPath, outArg] = process.argv;
  const outPath = outArg || './eval-report.json';

  if (!docsDir || !questionsPath) {
    console.error('Usage: node eval.js <docs-dir> <questions.json> [output.json]');
    process.exit(1);
  }

  const model = process.env.LLM_MODEL || 'claude-haiku-4.5';
  console.log(`Using model: ${model}`);

  // Print one "<name>: <avg>% (<count> questions)" line per group.
  const printBreakdown = (heading, groups) => {
    console.log(heading);
    Object.entries(groups).forEach(([label, stats]) => {
      console.log(`  ${label}: ${stats.avg}% (${stats.count} questions)`);
    });
  };

  // Run the evaluation, print the summary, then persist the full report.
  const main = async () => {
    const report = await runEval(docsDir, questionsPath, { model });
    const rule = '═'.repeat(60);

    console.log('\n' + rule);
    console.log('EVAL REPORT');
    console.log(rule);
    console.log(`Overall Score: ${report.overallScore}%`);
    console.log(`Accuracy: ${report.avgAccuracy}/5  Completeness: ${report.avgCompleteness}/5  Precision: ${report.avgPrecision}/5`);
    console.log(`Not Found: ${report.notFoundCount}/${report.totalQuestions} (${report.notFoundRate})`);

    printBreakdown('\nBy Category:', report.byCategory);
    printBreakdown('\nBy Difficulty:', report.byDifficulty);

    // Five lowest-scoring questions; sort a copy so report.results keeps its order.
    const weakest = [...report.results]
      .sort((left, right) => left.composite - right.composite)
      .slice(0, 5);
    console.log('\nWeakest Questions:');
    weakest.forEach((entry) => {
      console.log(`  [${entry.id}] ${entry.composite}% — ${entry.question.substring(0, 80)}...`);
    });

    fs.writeFileSync(outPath, JSON.stringify(report, null, 2));
    console.log(`\nFull report: ${outPath}`);
  };

  main().catch((err) => {
    console.error('Eval failed:', err);
    process.exit(1);
  });
}

module.exports = { runEval, loadDocs, answerFromDocs, scoreAnswer };
Phase 9: Doc Evaluation Harness

- eval-questions.js: Generates ground-truth questions from raw source data
- eval.js: LLM-as-judge scoring harness (answers from docs, scores against truth)
- Generated 33 questions covering config, dependencies, resources, and interactions
- Baseline score: 66.7% (configuration 93%, dependencies 77%, structural 31%)

2026-03-09 22:32:41 +00:00

			`/**`
			`* Eval Harness: Doc Quality Scorer`
			`*`
			`* Scores generated documentation against ground-truth questions.`
			`* Two-phase: (1) LLM answers questions using only docs, (2) LLM-as-judge scores accuracy.`
			`*`
			`* Usage: node eval.js <docs-dir> <questions.json> [output.json]`
			`*/`

			`const fs = require('fs');`
			`const path = require('path');`
			`const { callLLM } = require('./prose.js');`

			`/** Recursively read all .md files from a directory into a single context string */`
			`function loadDocs(docsDir, maxChars = 200000) {`
			`const docs = [];`
			`let totalChars = 0;`

			`function walk(dir) {`
			`let entries;`
			`try { entries = fs.readdirSync(dir, { withFileTypes: true }); } catch { return; }`
			`for (const e of entries) {`
			`const fp = path.join(dir, e.name);`
			`if (e.isDirectory()) { walk(fp); continue; }`
			`if (!e.name.endsWith('.md')) continue;`
			`try {`
			`const content = fs.readFileSync(fp, 'utf8');`
			`if (totalChars + content.length > maxChars) continue;`
			`docs.push({ path: path.relative(docsDir, fp), content });`
			`totalChars += content.length;`
			`} catch {}`
			`}`
			`}`

			`walk(docsDir);`
			`return docs;`
			`}`

			`/** Phase 1: Have an LLM answer a question using only the docs */`
			`async function answerFromDocs(question, docsContext, llmOpts) {`
			const prompt = `You are evaluating documentation quality. Answer the following question using ONLY the documentation provided below. If the documentation does not contain enough information to answer, say "NOT_FOUND".

			`Be precise and factual. Match the expected answer format:`
			`- For "exact" answers: give the exact value`
			`- For "list" answers: list each item on its own line`
			`- For "ranked-list" answers: list items in order with counts`
			`- For "explanation" answers: give a concise explanation`

			`QUESTION: ${question.question}`
			`EXPECTED FORMAT: ${question.answerType}`

			`DOCUMENTATION:`
			`${docsContext}`

			Answer:`;

			`return callLLM(prompt, { ...llmOpts, maxTokens: 1024, temperature: 0.0 });`
			`}`

			`/** Phase 2: LLM-as-judge scores the answer against ground truth */`
			`async function scoreAnswer(question, llmAnswer, llmOpts) {`
			const prompt = `You are a strict evaluator scoring an AI's answer against ground truth.

			`QUESTION: ${question.question}`
			`EXPECTED ANSWER TYPE: ${question.answerType}`
			`GROUND TRUTH: ${question.answer}`
			`AI ANSWER: ${llmAnswer}`

			`Score the AI answer on these dimensions (0-5 each):`
			`1. ACCURACY: Does the answer contain the correct facts? (0=wrong, 3=partially correct, 5=exactly correct)`
			`2. COMPLETENESS: Does it cover all items in the ground truth? (0=missing everything, 3=partial, 5=complete)`
			`3. PRECISION: Is it free of hallucinated or incorrect extra information? (0=lots of hallucination, 5=no hallucination)`

			`If the AI answered "NOT_FOUND", score ACCURACY=0, COMPLETENESS=0, PRECISION=5 (it didn't hallucinate, it just couldn't find it).`

			`Respond in EXACTLY this JSON format, nothing else:`
			{"accuracy": N, "completeness": N, "precision": N, "notes": "brief explanation"}`;

			`const raw = await callLLM(prompt, { ...llmOpts, maxTokens: 256, temperature: 0.0 });`

			`try {`
			`// Extract JSON from response (handle markdown wrapping)`
			`const jsonMatch = raw.match(/\{[\s\S]*\}/);`
			`if (jsonMatch) return JSON.parse(jsonMatch[0]);`
			`} catch {}`

			return { accuracy: 0, completeness: 0, precision: 0, notes: `Parse error: ${raw.substring(0, 100)}` };
			`}`

			`/** Build a focused doc context for a question (avoid sending 200K to every question) */`
			`function buildContext(question, docs) {`
			`// Keywords from the question`
			`const keywords = question.question.toLowerCase().split(/\s+/)`
			`.filter(w => w.length > 3)`
			`.filter(w => !['what', 'which', 'does', 'that', 'this', 'from', 'with', 'list', 'them', 'their', 'each', 'many', 'most', 'across'].includes(w));`

			`// Score each doc by keyword relevance`
			`const scored = docs.map(d => {`
			`const lower = d.content.toLowerCase();`
			`let score = 0;`
			`for (const kw of keywords) {`
			`const escaped = kw.replace(/[.*+?^${}()\|[\]\\]/g, '\\$&');`
			`const count = (lower.match(new RegExp(escaped, 'g')) \|\| []).length;`
			`score += Math.min(count, 10); // cap per keyword`
			`}`
			`return { ...d, score };`
			`});`

			`// Take top docs up to ~50K chars`
			`scored.sort((a, b) => b.score - a.score);`
			`let context = '';`
			`let used = 0;`
			`for (const d of scored) {`
			`if (d.score === 0 && used > 3) break;`
			`if (context.length + d.content.length > 50000) continue;`
			context += `\n--- ${d.path} ---\n${d.content}\n`;
			`used++;`
			`}`

			`return context;`
			`}`

			`/** Run the full evaluation */`
			`async function runEval(docsDir, questionsPath, llmOpts = {}) {`
			`const questionsData = JSON.parse(fs.readFileSync(questionsPath, 'utf8'));`
			`const questions = questionsData.questions;`
			`const docs = loadDocs(docsDir);`

			console.log(`Loaded ${docs.length} doc files, ${questions.length} questions`);

			`const results = [];`
			`let totalAccuracy = 0, totalCompleteness = 0, totalPrecision = 0;`
			`let notFound = 0;`

			`for (let i = 0; i < questions.length; i++) {`
			`const q = questions[i];`
			process.stdout.write(`[${i + 1}/${questions.length}] ${q.id}...`);

			`// Build focused context`
			`const context = buildContext(q, docs);`

			`// Phase 1: Answer from docs`
			`let llmAnswer;`
			`try {`
			`llmAnswer = await answerFromDocs(q, context, llmOpts);`
			`} catch (err) {`
			llmAnswer = `ERROR: ${err.message}`;
			`}`

			`// Phase 2: Score`
			`let score;`
			`try {`
			`score = await scoreAnswer(q, llmAnswer, llmOpts);`
			`} catch (err) {`
			score = { accuracy: 0, completeness: 0, precision: 0, notes: `Score error: ${err.message}` };
			`}`

			`const isNotFound = llmAnswer.includes('NOT_FOUND');`
			`if (isNotFound) notFound++;`

			`totalAccuracy += score.accuracy;`
			`totalCompleteness += score.completeness;`
			`totalPrecision += score.precision;`

			`const composite = ((score.accuracy + score.completeness + score.precision) / 15 * 100).toFixed(0);`
			console.log(` ${composite}% (A:${score.accuracy} C:${score.completeness} P:${score.precision})${isNotFound ? ' [NOT_FOUND]' : ''}`);

			`results.push({`
			`id: q.id,`
			`category: q.category,`
			`difficulty: q.difficulty,`
			`question: q.question,`
			`groundTruth: q.answer,`
			`llmAnswer,`
			`score,`
			`composite: Number(composite),`
			`notFound: isNotFound,`
			`});`
			`}`

			`// Aggregate scores`
			`const n = questions.length;`
			`const avgAccuracy = (totalAccuracy / n).toFixed(2);`
			`const avgCompleteness = (totalCompleteness / n).toFixed(2);`
			`const avgPrecision = (totalPrecision / n).toFixed(2);`
			`const overallScore = ((totalAccuracy + totalCompleteness + totalPrecision) / (n * 15) * 100).toFixed(1);`

			`// Category breakdown`
			`const catScores = {};`
			`for (const r of results) {`
			`if (!catScores[r.category]) catScores[r.category] = { total: 0, count: 0 };`
			`catScores[r.category].total += r.composite;`
			`catScores[r.category].count++;`
			`}`

			`// Difficulty breakdown`
			`const diffScores = {};`
			`for (const r of results) {`
			`if (!diffScores[r.difficulty]) diffScores[r.difficulty] = { total: 0, count: 0 };`
			`diffScores[r.difficulty].total += r.composite;`
			`diffScores[r.difficulty].count++;`
			`}`

			`const report = {`
			`timestamp: new Date().toISOString(),`
			`docsDir,`
			`questionsFile: questionsPath,`
			`totalQuestions: n,`
			`overallScore: Number(overallScore),`
			`avgAccuracy: Number(avgAccuracy),`
			`avgCompleteness: Number(avgCompleteness),`
			`avgPrecision: Number(avgPrecision),`
			`notFoundCount: notFound,`
			`notFoundRate: ((notFound / n) * 100).toFixed(1) + '%',`
			`byCategory: Object.fromEntries(`
			`Object.entries(catScores).map(([cat, s]) => [cat, { avg: (s.total / s.count).toFixed(1), count: s.count }])`
			`),`
			`byDifficulty: Object.fromEntries(`
			`Object.entries(diffScores).map(([diff, s]) => [diff, { avg: (s.total / s.count).toFixed(1), count: s.count }])`
			`),`
			`results,`
			`};`

			`return report;`
			`}`

			`if (require.main === module) {`
			`const docsDir = process.argv[2];`
			`const questionsPath = process.argv[3];`
			`const outPath = process.argv[4] \|\| './eval-report.json';`

			`if (!docsDir \|\| !questionsPath) {`
			`console.error('Usage: node eval.js <docs-dir> <questions.json> [output.json]');`
			`process.exit(1);`
			`}`

			`const model = process.env.LLM_MODEL \|\| 'claude-haiku-4.5';`
			console.log(`Using model: ${model}`);

			`(async () => {`
			`try {`
			`const report = await runEval(docsDir, questionsPath, { model });`

			`console.log('\n' + '═'.repeat(60));`
			`console.log('EVAL REPORT');`
			`console.log('═'.repeat(60));`
			console.log(`Overall Score: ${report.overallScore}%`);
			console.log(`Accuracy: ${report.avgAccuracy}/5 Completeness: ${report.avgCompleteness}/5 Precision: ${report.avgPrecision}/5`);
			console.log(`Not Found: ${report.notFoundCount}/${report.totalQuestions} (${report.notFoundRate})`);
			`console.log('\nBy Category:');`
			`for (const [cat, s] of Object.entries(report.byCategory)) {`
			console.log(` ${cat}: ${s.avg}% (${s.count} questions)`);
			`}`
			`console.log('\nBy Difficulty:');`
			`for (const [diff, s] of Object.entries(report.byDifficulty)) {`
			console.log(` ${diff}: ${s.avg}% (${s.count} questions)`);
			`}`

			`// Worst performers`
			`const worst = [...report.results].sort((a, b) => a.composite - b.composite).slice(0, 5);`
			`console.log('\nWeakest Questions:');`
			`for (const w of worst) {`
			console.log(` [${w.id}] ${w.composite}% — ${w.question.substring(0, 80)}...`);
			`}`

			`fs.writeFileSync(outPath, JSON.stringify(report, null, 2));`
			console.log(`\nFull report: ${outPath}`);
			`} catch (err) {`
			`console.error('Eval failed:', err);`
			`process.exit(1);`
			`}`
			`})();`
			`}`

			`module.exports = { runEval, loadDocs, answerFromDocs, scoreAnswer };`