Phase 9c: Split eval into Agent (file-browsing) and Human (readability) tracks

Agent eval: 54.3% (22 questions, 40.9% NOT_FOUND)
Human eval: 63.9% (28 questions, 17.9% NOT_FOUND)

Key findings:
- Agent navigation is the bottleneck (2.09/5) — long path-based filenames hurt discoverability
- Human findability is decent (3.46/5), but dependency questions fail (0%) because chart docs for wrapper charts don't surface their sub-chart deps
- Both tracks show strong precision (4.4+/5) — very low hallucination
- Resources (91%) and interactions (95%) score well for humans
- Configuration and contracts are solid across both tracks
2026-03-09 23:55:54 +00:00
parent 0cc4abcb0f
commit 304f0a9e9f
4 changed files with 2050 additions and 0 deletions
--- a/eval-human.js
+++ b/eval-human.js
@@ -0,0 +1,272 @@
+/**
+ * Eval Track 2: Human Readability Benchmark
+ * 
+ * Tests whether the docs are useful for a human engineer.
+ * Feeds the relevant doc page(s) to the LLM (the contracts index is truncated
+ * to 30K characters and the total context is capped at 60K) and scores on:
+ * - Clarity: Is the prose understandable?
+ * - Actionability: Could you act on this information?
+ * - Findability: Would a reader naturally look in these files for the answer?
+ * - Completeness: Is the answer fully covered?
+ * 
+ * Usage: node eval-human.js <docs-dir> <questions.json> [output.json]
+ */
+
+const fs = require('fs');
+const path = require('path');
+const { callLLM } = require('./prose.js');
+
+/**
+ * Find the most relevant doc files for a human-audience question.
+ *
+ * Selection is keyword-driven on the lower-cased question text:
+ * - the architecture overview is always included when it exists;
+ * - Helm-ish keywords pull in the Helm index;
+ * - each known chart name mentioned pulls in every chart doc whose file name
+ *   contains that name;
+ * - subsystem/dependency keywords pull in every subsystem doc;
+ * - contract-ish keywords pull in the contracts index (first 30,000 chars).
+ *
+ * Each doc is included at most once, even when several keywords select it,
+ * and directory entries that are not regular files are skipped.
+ *
+ * @param {{question: string}} question - Question record; only `question.question` is read.
+ * @param {string} docsDir - Root directory of the docs tree.
+ * @returns {Array<{path: string, content: string}>} Selected docs, with paths
+ *   relative to `docsDir`, in selection order. Docs that would push the running
+ *   total of content characters past 60,000 are dropped.
+ */
+function findRelevantDocs(question, docsDir) {
+  const chartNames = ['mdm-app', 'ai-app', 'elasticsearch', 'hazelcast', 'cassandra', 'kong', 'redis', 'jenkins', 'otel-collector', 'twistlock', 'cluster'];
+  const docs = [];
+  const seen = new Set();
+
+  // Read one doc (path relative to docsDir) at most once; missing paths and
+  // non-files (e.g. nested directories returned by readdirSync) are ignored.
+  const addDoc = (relPath, maxChars) => {
+    if (seen.has(relPath)) return;
+    const absPath = path.join(docsDir, relPath);
+    if (!fs.existsSync(absPath) || !fs.statSync(absPath).isFile()) return;
+    seen.add(relPath);
+    const content = fs.readFileSync(absPath, 'utf8');
+    docs.push({ path: relPath, content: maxChars === undefined ? content : content.substring(0, maxChars) });
+  };
+
+  // List a docs sub-directory, or nothing when it does not exist.
+  const listDir = (relDir) => {
+    const absDir = path.join(docsDir, relDir);
+    return fs.existsSync(absDir) ? fs.readdirSync(absDir) : [];
+  };
+
+  // A missing question text simply matches no keyword.
+  const q = String((question && question.question) || '').toLowerCase();
+  const mentions = (...words) => words.some((word) => q.includes(word));
+
+  // Always include the architecture overview
+  addDoc('reference/system-architecture.md');
+
+  // Helm-related: include helm index
+  if (mentions('helm', 'chart', 'secret', 'port', 'deploy', 'service')) {
+    addDoc('reference/helm/index.md');
+  }
+
+  // Specific chart docs mentioned in the question. The directory is listed
+  // once, and only when at least one chart name is actually mentioned.
+  const mentionedCharts = chartNames.filter((name) => q.includes(name));
+  if (mentionedCharts.length > 0) {
+    const chartFiles = listDir('reference/helm/charts');
+    for (const name of mentionedCharts) {
+      for (const f of chartFiles) {
+        if (f.includes(name)) addDoc(`reference/helm/charts/${f}`);
+      }
+    }
+  }
+
+  // Subsystem-related
+  if (mentions('subsystem', 'cross-cutting', 'depend')) {
+    for (const f of listDir('reference/subsystems')) {
+      addDoc(`reference/subsystems/${f}`);
+    }
+  }
+
+  // Contracts (truncated because the index can be massive)
+  if (mentions('contract', 'interface', 'secret', 'configmap')) {
+    addDoc('reference/contracts/index.md', 30000);
+  }
+
+  // Cap total context at 60K: a doc that does not fit is skipped, but later,
+  // smaller docs may still be included.
+  let total = 0;
+  return docs.filter((d) => {
+    if (total + d.content.length > 60000) return false;
+    total += d.content.length;
+    return true;
+  });
+}
+
+/**
+ * Ask the LLM to answer a question while role-playing a newly joined engineer
+ * reading the supplied docs.
+ *
+ * @param {{question: string, answerType: string}} question - Question record.
+ * @param {Array<{path: string, content: string}>} docs - Docs to place in the prompt.
+ * @param {object} llmOpts - Options forwarded to callLLM; maxTokens and temperature are overridden.
+ * @returns {Promise<*>} Whatever callLLM resolves to for the reading prompt.
+ */
+async function humanRead(question, docs, llmOpts) {
+  // Each doc becomes a section introduced by a "=== path ===" banner.
+  const renderSection = (doc) => `\n=== ${doc.path} ===\n${doc.content}`;
+  const sections = docs.map((doc) => renderSection(doc));
+  const context = sections.join('\n');
+
+  const readerOpts = { ...llmOpts, maxTokens: 1024, temperature: 0.1 };
+
+  const prompt = `You are a new infrastructure engineer who just joined the Foxtrot team. You've been given documentation to read. Answer the following question as if you're reading these docs for the first time.
+
+DOCUMENTATION:
+${context}
+
+QUESTION: ${question.question}
+
+Instructions:
+- Read the docs carefully, as a human would
+- If the answer requires combining info from multiple sections, do so
+- If the docs don't cover this, say "NOT_FOUND"
+- Match the expected format: ${question.answerType}
+
+Answer:`;
+
+  return callLLM(prompt, readerOpts);
+}
+
+/**
+ * Score a reader's answer with human-focused criteria.
+ *
+ * Sends a grading prompt to the LLM and parses the first `{...}` span of the
+ * reply as JSON. The four dimensions (clarity, actionability, completeness,
+ * findability) are always returned as finite numbers clamped to 0-5: numeric
+ * strings are converted, and missing or non-numeric values become 0. Any other
+ * fields in the reply (e.g. `notes`) are passed through unchanged.
+ *
+ * @param {{question: string, answer: string}} question - Question record with its ground-truth answer.
+ * @param {string} answer - The reader's answer being graded.
+ * @param {Array<{path: string}>} docsUsed - Docs that were shown to the reader; only paths are used.
+ * @param {object} llmOpts - Options forwarded to callLLM; maxTokens and temperature are overridden.
+ * @returns {Promise<{clarity: number, actionability: number, completeness: number, findability: number, notes: string}>}
+ *   The scores, or all zeros with a "Parse error: ..." note when the reply holds no parseable JSON object.
+ */
+async function scoreHuman(question, answer, docsUsed, llmOpts) {
+  const prompt = `You are evaluating documentation quality from a HUMAN reader's perspective.
+
+QUESTION: ${question.question}
+GROUND TRUTH: ${question.answer}
+READER'S ANSWER: ${answer}
+DOCS CONSULTED: ${docsUsed.map(d => d.path).join(', ')}
+
+Score on these human-centric dimensions (0-5 each):
+1. CLARITY: Was the information presented clearly enough for a human to extract the answer? (0=confusing/buried, 5=immediately obvious)
+2. ACTIONABILITY: Could an engineer act on this information? (0=useless, 5=ready to execute)
+3. COMPLETENESS: Did the docs contain ALL the information needed? (0=missing, 5=fully covered)
+4. FINDABILITY: Based on the doc paths, would a human naturally look in these files? (0=buried in wrong place, 5=exactly where you'd expect)
+
+If the reader answered "NOT_FOUND", score CLARITY=0, ACTIONABILITY=0, COMPLETENESS=0, FINDABILITY=0.
+
+Respond in EXACTLY this JSON format:
+{"clarity": N, "actionability": N, "completeness": N, "findability": N, "notes": "brief explanation"}`;
+
+  const reply = await callLLM(prompt, { ...llmOpts, maxTokens: 256, temperature: 0.0 });
+  const raw = typeof reply === 'string' ? reply : String(reply);
+
+  try {
+    const jsonMatch = raw.match(/\{[\s\S]*\}/);
+    if (jsonMatch) {
+      const score = { ...JSON.parse(jsonMatch[0]) };
+      // Callers do arithmetic on these fields, so never let a missing,
+      // non-numeric, or out-of-range value from the model leak through.
+      for (const dim of ['clarity', 'actionability', 'completeness', 'findability']) {
+        const value = Number(score[dim]);
+        score[dim] = Number.isFinite(value) ? Math.min(5, Math.max(0, value)) : 0;
+      }
+      return score;
+    }
+  } catch {
+    // Malformed JSON is an expected model failure mode; fall through to the
+    // zero-score result below, which records the start of the raw reply.
+  }
+  return { clarity: 0, actionability: 0, completeness: 0, findability: 0, notes: `Parse error: ${raw.substring(0, 100)}` };
+}
+
+/**
+ * Run the human-readability eval over every question whose `audience`
+ * includes "human".
+ *
+ * For each question: select docs, have the LLM answer as a human reader,
+ * have the LLM grade that answer, then accumulate per-dimension totals and a
+ * 0-100 composite. Failures of either LLM step are recorded on the result
+ * (an "ERROR: ..." answer, or a zero score with a "Score error: ..." note)
+ * instead of aborting the run.
+ *
+ * Score dimensions are coerced to finite numbers (missing/non-numeric -> 0)
+ * before any arithmetic, so composites and averages are never NaN. With zero
+ * matching questions, all averages and rates are reported as 0.
+ *
+ * @param {string} docsDir - Root directory of the docs tree.
+ * @param {string} questionsPath - Path to a JSON file shaped like `{ "questions": [...] }`.
+ * @param {object} [llmOpts={}] - Options forwarded to the LLM helpers.
+ * @returns {Promise<object>} Report with overall/average scores (formatted
+ *   strings), NOT_FOUND counts, per-category and per-difficulty averages, and
+ *   the per-question results.
+ * @throws {Error} When the questions file cannot be read or parsed, or has no `questions` array.
+ */
+async function runHumanEval(docsDir, questionsPath, llmOpts = {}) {
+  const dimensions = ['clarity', 'actionability', 'completeness', 'findability'];
+  const maxPoints = dimensions.length * 5;
+
+  const questionsData = JSON.parse(fs.readFileSync(questionsPath, 'utf8'));
+  if (!questionsData || !Array.isArray(questionsData.questions)) {
+    throw new Error(`${questionsPath} must contain a "questions" array`);
+  }
+  // Questions with no usable `audience` are simply not part of this track.
+  const questions = questionsData.questions.filter((q) => {
+    const audience = q && q.audience;
+    return (Array.isArray(audience) || typeof audience === 'string') && audience.includes('human');
+  });
+
+  console.log(`Human Eval: ${questions.length} human-audience questions`);
+
+  const results = [];
+  const totals = { clarity: 0, actionability: 0, completeness: 0, findability: 0 };
+  let notFound = 0;
+
+  for (let i = 0; i < questions.length; i++) {
+    const q = questions[i];
+    process.stdout.write(`[${i + 1}/${questions.length}] ${q.id}...`);
+
+    const docs = findRelevantDocs(q, docsDir);
+
+    let answer;
+    try {
+      const reply = await humanRead(q, docs, llmOpts);
+      // The NOT_FOUND check below needs a string.
+      answer = typeof reply === 'string' ? reply : String(reply);
+    } catch (err) {
+      answer = `ERROR: ${err.message}`;
+    }
+
+    let rawScore;
+    try {
+      rawScore = await scoreHuman(q, answer, docs, llmOpts);
+    } catch (err) {
+      rawScore = { clarity: 0, actionability: 0, completeness: 0, findability: 0, notes: `Score error: ${err.message}` };
+    }
+
+    // Normalize once so totals, the composite and the stored score all agree.
+    const score = { ...rawScore };
+    let points = 0;
+    for (const k of dimensions) {
+      const value = Number(score[k]);
+      score[k] = Number.isFinite(value) ? value : 0;
+      totals[k] += score[k];
+      points += score[k];
+    }
+
+    const isNotFound = answer.includes('NOT_FOUND');
+    if (isNotFound) notFound++;
+
+    const composite = Number((points / maxPoints * 100).toFixed(0));
+    console.log(` ${composite}% (Cl:${score.clarity} Ac:${score.actionability} Co:${score.completeness} Fi:${score.findability}) docs:${docs.length}${isNotFound ? ' [NOT_FOUND]' : ''}`);
+
+    results.push({
+      id: q.id,
+      category: q.category,
+      difficulty: q.difficulty,
+      question: q.question,
+      groundTruth: q.answer,
+      humanAnswer: answer,
+      docsUsed: docs.map(d => d.path),
+      score,
+      composite,
+      notFound: isNotFound,
+    });
+  }
+
+  const n = questions.length;
+  // Guard every per-question average against an empty question set.
+  const perQuestion = (sum) => (n > 0 ? sum / n : 0);
+  const totalPoints = Object.values(totals).reduce((a, b) => a + b, 0);
+
+  const report = {
+    evalType: 'human',
+    timestamp: new Date().toISOString(),
+    docsDir,
+    totalQuestions: n,
+    overallScore: (perQuestion(totalPoints) / maxPoints * 100).toFixed(1),
+    avgClarity: perQuestion(totals.clarity).toFixed(2),
+    avgActionability: perQuestion(totals.actionability).toFixed(2),
+    avgCompleteness: perQuestion(totals.completeness).toFixed(2),
+    avgFindability: perQuestion(totals.findability).toFixed(2),
+    notFoundCount: notFound,
+    notFoundRate: (perQuestion(notFound) * 100).toFixed(1) + '%',
+    byCategory: {},
+    byDifficulty: {},
+    results,
+  };
+
+  // First pass: sum composites per category / difficulty.
+  for (const r of results) {
+    for (const groupKey of ['category', 'difficulty']) {
+      const group = groupKey === 'category' ? report.byCategory : report.byDifficulty;
+      const key = r[groupKey];
+      if (!group[key]) group[key] = { total: 0, count: 0 };
+      group[key].total += r.composite;
+      group[key].count++;
+    }
+  }
+  // Second pass: replace the running sums with formatted averages.
+  for (const group of [report.byCategory, report.byDifficulty]) {
+    for (const [k, v] of Object.entries(group)) {
+      group[k] = { avg: (v.total / v.count).toFixed(1), count: v.count };
+    }
+  }
+
+  return report;
+}
+
+// CLI entry point: runs only when this file is executed directly, not when it
+// is loaded with require().
+if (require.main === module) {
+  const [, , docsDir, questionsPath, outArg] = process.argv;
+  const outPath = outArg || './eval-human-report.json';
+
+  if (!docsDir || !questionsPath) {
+    console.error('Usage: node eval-human.js <docs-dir> <questions.json> [output.json]');
+    process.exit(1);
+  }
+
+  const model = process.env.LLM_MODEL || 'claude-haiku-4.5';
+  console.log(`Using model: ${model}`);
+
+  // Print one "label: avg% (count questions)" line per entry of a grouped summary.
+  const printGroup = (heading, group) => {
+    console.log(heading);
+    Object.entries(group).forEach(([label, stats]) => {
+      console.log(`  ${label}: ${stats.avg}% (${stats.count} questions)`);
+    });
+  };
+
+  // Run the eval, print the summary, then persist the full report as JSON.
+  const main = async () => {
+    const report = await runHumanEval(docsDir, questionsPath, { model });
+    const rule = '═'.repeat(60);
+
+    console.log('\n' + rule);
+    console.log('HUMAN EVAL REPORT');
+    console.log(rule);
+    console.log(`Overall Score: ${report.overallScore}%`);
+    console.log(`Clarity: ${report.avgClarity}/5  Actionability: ${report.avgActionability}/5  Completeness: ${report.avgCompleteness}/5  Findability: ${report.avgFindability}/5`);
+    console.log(`Not Found: ${report.notFoundCount}/${report.totalQuestions} (${report.notFoundRate})`);
+
+    printGroup('\nBy Category:', report.byCategory);
+    printGroup('\nBy Difficulty:', report.byDifficulty);
+
+    // Five lowest composites; sort a copy so the report keeps question order.
+    const byCompositeAsc = report.results.slice().sort((a, b) => a.composite - b.composite);
+    console.log('\nWeakest:');
+    byCompositeAsc.slice(0, 5).forEach((entry) => {
+      console.log(`  [${entry.id}] ${entry.composite}% — ${entry.question.substring(0, 70)}...`);
+    });
+
+    fs.writeFileSync(outPath, JSON.stringify(report, null, 2));
+    console.log(`\nFull report: ${outPath}`);
+  };
+
+  main().catch((err) => {
+    console.error('Human eval failed:', err);
+    process.exit(1);
+  });
+}
+
+module.exports = { runHumanEval };