eval-human.js

/**
 * Eval Track 2: Human Readability Benchmark
 * 
 * Tests whether the docs are useful for a human engineer.
 * Feeds the FULL relevant doc page(s) to the LLM and scores on:
 * - Clarity: Is the prose understandable?
 * - Actionability: Could you act on this information?
 * - Findability: Would a reader naturally look in these doc files for the answer?
 * - Completeness: Is the answer fully covered?
 * 
 * Usage: node eval-human.js <docs-dir> <questions.json> [output.json]
 */

const fs = require('fs');
const path = require('path');
const { callLLM } = require('./prose.js');

/**
 * Find the most relevant doc files for a human-audience question.
 *
 * Selection is keyword-driven: the lower-cased question text is checked for
 * substrings, and each match pulls in a fixed doc (or directory of docs)
 * under `docsDir`. The architecture overview is always tried first. Each
 * file is included at most once, and only regular files are read, so
 * sub-directories inside a scanned directory are skipped.
 *
 * @param {{question: string}} question - Question record; only `question.question` is read.
 * @param {string} docsDir - Root directory containing the `reference/` tree.
 * @param {number} [maxChars=60000] - Cap on the summed `content.length` of the returned docs.
 * @returns {Array<{path: string, content: string}>} Docs in selection order;
 *   `path` is relative to `docsDir` and uses forward slashes.
 */
function findRelevantDocs(question, docsDir, maxChars = 60000) {
  const docs = [];
  const seenPaths = new Set();

  // A missing question text degrades to "architecture overview only" instead of throwing.
  const q = String((question && question.question) || '').toLowerCase();
  const mentionsAny = (keywords) => keywords.some((kw) => q.includes(kw));

  // Push a doc once, and only if it is a regular file.
  const addDoc = (relPath, limit = Infinity) => {
    if (seenPaths.has(relPath)) return;
    const absPath = path.join(docsDir, relPath);
    if (!fs.existsSync(absPath) || !fs.statSync(absPath).isFile()) return;
    seenPaths.add(relPath);
    const content = fs.readFileSync(absPath, 'utf8');
    docs.push({ path: relPath, content: content.substring(0, limit) });
  };

  // List a directory's entries, or nothing when it is absent / not a directory.
  const listDir = (relDir) => {
    const absDir = path.join(docsDir, relDir);
    if (!fs.existsSync(absDir) || !fs.statSync(absDir).isDirectory()) return [];
    return fs.readdirSync(absDir);
  };

  // Always include the architecture overview
  addDoc('reference/system-architecture.md');

  // Helm-related: include helm index.
  // NOTE(review): these are plain substring checks, so 'port' also matches
  // words such as "support" or "import" — confirm that looseness is intended.
  if (mentionsAny(['helm', 'chart', 'secret', 'port', 'deploy', 'service'])) {
    addDoc('reference/helm/index.md');
  }

  // Find specific chart docs mentioned in the question
  const chartNames = ['mdm-app', 'ai-app', 'elasticsearch', 'hazelcast', 'cassandra', 'kong', 'redis', 'jenkins', 'otel-collector', 'twistlock', 'cluster'];
  const mentionedCharts = chartNames.filter((name) => q.includes(name));
  if (mentionedCharts.length > 0) {
    // Read the directory once rather than once per mentioned chart.
    const chartFiles = listDir('reference/helm/charts');
    for (const name of mentionedCharts) {
      for (const f of chartFiles) {
        // A file such as "mdm-app-cluster.md" can match several names;
        // addDoc drops the repeats.
        if (f.includes(name)) addDoc(`reference/helm/charts/${f}`);
      }
    }
  }

  // Subsystem-related: include every file in the subsystems directory
  if (mentionsAny(['subsystem', 'cross-cutting', 'depend'])) {
    for (const f of listDir('reference/subsystems')) {
      addDoc(`reference/subsystems/${f}`);
    }
  }

  // Contracts: truncated to 30000 chars because the index can be massive
  if (mentionsAny(['contract', 'interface', 'secret', 'configmap'])) {
    addDoc('reference/contracts/index.md', 30000);
  }

  // Cap total context: a doc that would overflow the budget is skipped,
  // but later (smaller) docs may still fit.
  let total = 0;
  return docs.filter((d) => {
    if (total + d.content.length > maxChars) return false;
    total += d.content.length;
    return true;
  });
}

/**
 * Ask the LLM to play a newly-joined engineer who answers one question using
 * only the supplied docs.
 *
 * @param {{question: string, answerType: string}} question - Question text and expected answer format.
 * @param {Array<{path: string, content: string}>} docs - Docs to embed in the prompt, in order.
 * @param {object} [llmOpts] - Options forwarded to callLLM; maxTokens and temperature are overridden here.
 * @returns {Promise<*>} Whatever callLLM resolves to for the assembled prompt.
 */
async function humanRead(question, docs, llmOpts) {
  // Each doc becomes a "=== path ===" header followed by its content.
  const sections = [];
  for (const doc of docs) {
    sections.push(`\n=== ${doc.path} ===\n${doc.content}`);
  }
  const context = sections.join('\n');

  const promptLines = [
    "You are a new infrastructure engineer who just joined the Foxtrot team. You've been given documentation to read. Answer the following question as if you're reading these docs for the first time.",
    '',
    'DOCUMENTATION:',
    context,
    '',
    `QUESTION: ${question.question}`,
    '',
    'Instructions:',
    '- Read the docs carefully, as a human would',
    '- If the answer requires combining info from multiple sections, do so',
    `- If the docs don't cover this, say "NOT_FOUND"`,
    `- Match the expected format: ${question.answerType}`,
    '',
    'Answer:',
  ];
  const prompt = promptLines.join('\n');

  // Caller options first so the fixed token budget and temperature win.
  const options = Object.assign({}, llmOpts, { maxTokens: 1024, temperature: 0.1 });
  return callLLM(prompt, options);
}

/**
 * Score a reader's answer with human-focused criteria.
 *
 * Sends the question, ground truth, reader's answer and consulted doc paths
 * to the LLM and asks for a JSON verdict. The four dimensions are always
 * returned as finite numbers clamped to 0-5, so callers can sum and average
 * them safely even when the model replies with strings, omits a key, or goes
 * out of range. An unparseable reply yields an all-zero score whose `notes`
 * carries the first 100 characters of the reply.
 *
 * @param {{question: string, answer: string}} question - Question text and ground-truth answer.
 * @param {string} answer - The reader's answer being evaluated.
 * @param {Array<{path: string}>} docsUsed - Docs that were given to the reader; only `path` is used.
 * @param {object} [llmOpts] - Options forwarded to callLLM; maxTokens and temperature are overridden here.
 * @returns {Promise<{clarity: number, actionability: number, completeness: number, findability: number, notes: string}>}
 */
async function scoreHuman(question, answer, docsUsed, llmOpts) {
  const prompt = `You are evaluating documentation quality from a HUMAN reader's perspective.

QUESTION: ${question.question}
GROUND TRUTH: ${question.answer}
READER'S ANSWER: ${answer}
DOCS CONSULTED: ${docsUsed.map(d => d.path).join(', ')}

Score on these human-centric dimensions (0-5 each):
1. CLARITY: Was the information presented clearly enough for a human to extract the answer? (0=confusing/buried, 5=immediately obvious)
2. ACTIONABILITY: Could an engineer act on this information? (0=useless, 5=ready to execute)
3. COMPLETENESS: Did the docs contain ALL the information needed? (0=missing, 5=fully covered)
4. FINDABILITY: Based on the doc paths, would a human naturally look in these files? (0=buried in wrong place, 5=exactly where you'd expect)

If the reader answered "NOT_FOUND", score CLARITY=0, ACTIONABILITY=0, COMPLETENESS=0, FINDABILITY=0.

Respond in EXACTLY this JSON format:
{"clarity": N, "actionability": N, "completeness": N, "findability": N, "notes": "brief explanation"}`;

  const dimensions = ['clarity', 'actionability', 'completeness', 'findability'];

  const reply = await callLLM(prompt, { ...llmOpts, maxTokens: 256, temperature: 0.0 });
  // Coerce up front so neither the regex match nor the fallback note can throw.
  const raw = typeof reply === 'string' ? reply : String(reply == null ? '' : reply);

  // Models often wrap the JSON in prose; take everything from the first "{" to the last "}".
  const jsonMatch = raw.match(/\{[\s\S]*\}/);
  if (jsonMatch) {
    try {
      const parsed = JSON.parse(jsonMatch[0]);
      if (parsed && typeof parsed === 'object') {
        // Keep any extra keys the model sent, but normalize the ones callers rely on.
        const score = { ...parsed };
        for (const dim of dimensions) {
          const value = Number(parsed[dim]);
          score[dim] = Number.isFinite(value) ? Math.min(5, Math.max(0, value)) : 0;
        }
        score.notes = parsed.notes == null ? '' : String(parsed.notes);
        return score;
      }
    } catch {
      // Malformed JSON from the model is an expected outcome, not a crash:
      // fall through to the zeroed score, which records the reply in `notes`.
    }
  }
  return { clarity: 0, actionability: 0, completeness: 0, findability: 0, notes: `Parse error: ${raw.substring(0, 100)}` };
}

/**
 * Run the human eval over every question whose `audience` includes 'human'.
 *
 * For each question: pick docs, have the LLM answer as a reader, have the
 * LLM score that answer, then aggregate. A failure while answering is
 * recorded as an "ERROR: ..." answer and a failure while scoring as an
 * all-zero score, so one bad call does not abort the run. Per-dimension
 * scores are coerced to finite numbers in 0-5 before any arithmetic, and
 * all averages are reported as 0 when no question matches.
 *
 * @param {string} docsDir - Docs root passed to findRelevantDocs.
 * @param {string} questionsPath - Path to a JSON file with a top-level `questions` array.
 * @param {object} [llmOpts={}] - Options forwarded to humanRead and scoreHuman.
 * @returns {Promise<object>} Report with overall/per-dimension averages (as
 *   fixed-point strings), not-found stats, byCategory/byDifficulty
 *   breakdowns and the per-question `results`.
 * @throws {SyntaxError} When the questions file is not valid JSON.
 * @throws {TypeError} When the questions file has no `questions` array.
 */
async function runHumanEval(docsDir, questionsPath, llmOpts = {}) {
  const scoreKeys = ['clarity', 'actionability', 'completeness', 'findability'];
  const maxPerQuestion = scoreKeys.length * 5; // four dimensions, 0-5 each

  // Coerce anything the scorer returned into a finite number within 0-5.
  const toScore = (value) => {
    const num = Number(value);
    return Number.isFinite(num) ? Math.min(5, Math.max(0, num)) : 0;
  };
  // Division that reports 0 rather than NaN when there is nothing to average.
  const ratio = (numerator, denominator) => (denominator > 0 ? numerator / denominator : 0);

  const questionsData = JSON.parse(fs.readFileSync(questionsPath, 'utf8'));
  if (!questionsData || !Array.isArray(questionsData.questions)) {
    throw new TypeError(`Expected ${questionsPath} to contain a "questions" array`);
  }
  // Questions without an audience are treated as not human-facing.
  const questions = questionsData.questions.filter(q =>
    Boolean(q && q.audience && typeof q.audience.includes === 'function' && q.audience.includes('human'))
  );

  console.log(`Human Eval: ${questions.length} human-audience questions`);

  const results = [];
  const totals = { clarity: 0, actionability: 0, completeness: 0, findability: 0 };
  let notFound = 0;

  for (let i = 0; i < questions.length; i++) {
    const q = questions[i];
    process.stdout.write(`[${i + 1}/${questions.length}] ${q.id}...`);

    const docs = findRelevantDocs(q, docsDir);

    let answer;
    try {
      answer = await humanRead(q, docs, llmOpts);
    } catch (err) {
      answer = `ERROR: ${err.message}`;
    }
    // Guard the NOT_FOUND check below against a non-string LLM result.
    if (typeof answer !== 'string') answer = answer == null ? '' : String(answer);

    let rawScore;
    try {
      rawScore = await scoreHuman(q, answer, docs, llmOpts);
    } catch (err) {
      rawScore = { clarity: 0, actionability: 0, completeness: 0, findability: 0, notes: `Score error: ${err.message}` };
    }
    // Normalize once so totals, composite and the stored score all agree.
    const score = { ...(rawScore || {}) };
    for (const k of scoreKeys) score[k] = toScore(score[k]);

    const isNotFound = answer.includes('NOT_FOUND');
    if (isNotFound) notFound++;

    let questionSum = 0;
    for (const k of scoreKeys) {
      totals[k] += score[k];
      questionSum += score[k];
    }

    const composite = (questionSum / maxPerQuestion * 100).toFixed(0);
    console.log(` ${composite}% (Cl:${score.clarity} Ac:${score.actionability} Co:${score.completeness} Fi:${score.findability}) docs:${docs.length}${isNotFound ? ' [NOT_FOUND]' : ''}`);

    results.push({
      id: q.id,
      category: q.category,
      difficulty: q.difficulty,
      question: q.question,
      groundTruth: q.answer,
      humanAnswer: answer,
      docsUsed: docs.map(d => d.path),
      score,
      composite: Number(composite),
      notFound: isNotFound,
    });
  }

  const n = questions.length;
  const grandTotal = Object.values(totals).reduce((a, b) => a + b, 0);
  const report = {
    evalType: 'human',
    timestamp: new Date().toISOString(),
    docsDir,
    totalQuestions: n,
    overallScore: (ratio(grandTotal, n * maxPerQuestion) * 100).toFixed(1),
    avgClarity: ratio(totals.clarity, n).toFixed(2),
    avgActionability: ratio(totals.actionability, n).toFixed(2),
    avgCompleteness: ratio(totals.completeness, n).toFixed(2),
    avgFindability: ratio(totals.findability, n).toFixed(2),
    notFoundCount: notFound,
    notFoundRate: (ratio(notFound, n) * 100).toFixed(1) + '%',
    byCategory: {},
    byDifficulty: {},
    results,
  };

  // First pass accumulates composite totals per group...
  for (const r of results) {
    for (const groupKey of ['category', 'difficulty']) {
      const group = groupKey === 'category' ? report.byCategory : report.byDifficulty;
      const key = r[groupKey];
      if (!group[key]) group[key] = { total: 0, count: 0 };
      group[key].total += r.composite;
      group[key].count++;
    }
  }
  // ...second pass replaces each accumulator with its average.
  for (const group of [report.byCategory, report.byDifficulty]) {
    for (const [k, v] of Object.entries(group)) {
      group[k] = { avg: (v.total / v.count).toFixed(1), count: v.count };
    }
  }

  return report;
}

// CLI entry point: runs only when this file is executed directly, not when it is required.
if (require.main === module) {
  const [, , docsDir, questionsPath, outArg] = process.argv;
  const outPath = outArg || './eval-human-report.json';

  if (!docsDir || !questionsPath) {
    console.error('Usage: node eval-human.js <docs-dir> <questions.json> [output.json]');
    process.exit(1);
  }

  // Model is chosen via the LLM_MODEL environment variable, with a built-in default.
  const model = process.env.LLM_MODEL || 'claude-haiku-4.5';
  console.log(`Using model: ${model}`);

  const rule = '═'.repeat(60);

  // Print one "label: avg% (count questions)" line per group under a heading.
  const printBreakdown = (heading, groups) => {
    console.log(heading);
    Object.entries(groups).forEach(([label, stats]) => {
      console.log(`  ${label}: ${stats.avg}% (${stats.count} questions)`);
    });
  };

  const main = async () => {
    try {
      const report = await runHumanEval(docsDir, questionsPath, { model });

      console.log('\n' + rule);
      console.log('HUMAN EVAL REPORT');
      console.log(rule);
      console.log(`Overall Score: ${report.overallScore}%`);
      console.log(`Clarity: ${report.avgClarity}/5  Actionability: ${report.avgActionability}/5  Completeness: ${report.avgCompleteness}/5  Findability: ${report.avgFindability}/5`);
      console.log(`Not Found: ${report.notFoundCount}/${report.totalQuestions} (${report.notFoundRate})`);
      printBreakdown('\nBy Category:', report.byCategory);
      printBreakdown('\nBy Difficulty:', report.byDifficulty);

      // Five lowest composites, sorted on a copy so report.results keeps its order.
      const weakest = report.results
        .slice()
        .sort((left, right) => left.composite - right.composite)
        .slice(0, 5);
      console.log('\nWeakest:');
      weakest.forEach((entry) => {
        console.log(`  [${entry.id}] ${entry.composite}% — ${entry.question.substring(0, 70)}...`);
      });

      fs.writeFileSync(outPath, JSON.stringify(report, null, 2));
      console.log(`\nFull report: ${outPath}`);
    } catch (err) {
      console.error('Human eval failed:', err);
      process.exit(1);
    }
  };

  main();
}

// Public API: only the eval runner is exported; the other functions in this file are not.
module.exports = { runHumanEval };
Phase 9c: Split eval into Agent (file-browsing) and Human (readability) tracks

Agent eval: 54.3% (22 questions, 40.9% NOT_FOUND)
Human eval: 63.9% (28 questions, 17.9% NOT_FOUND)

Key findings:
- Agent navigation is the bottleneck (2.09/5) — long path-based filenames hurt discoverability
- Human findability is decent (3.46/5), but dependency questions fail (0%) because chart docs for wrapper charts don't surface their sub-chart deps
- Both tracks show strong precision (4.4+/5) — very low hallucination
- Resources (91%) and interactions (95%) score great for humans
- Configuration and contracts are solid across both tracks

2026-03-09 23:55:54 +00:00
			`/**`
			`* Eval Track 2: Human Readability Benchmark`
			`*`
			`* Tests whether the docs are useful for a human engineer.`
			`* Feeds the FULL relevant doc page(s) to the LLM and scores on:`
			`* - Clarity: Is the prose understandable?`
			`* - Actionability: Could you act on this information?`
			`* - Navigation: Does the doc structure guide you to the answer?`
			`* - Completeness: Is the answer fully covered?`
			`*`
			`* Usage: node eval-human.js <docs-dir> <questions.json> [output.json]`
			`*/`

			`const fs = require('fs');`
			`const path = require('path');`
			`const { callLLM } = require('./prose.js');`

			`/** Find the most relevant doc files for a human-audience question */`
			`function findRelevantDocs(question, docsDir) {`
			`const docs = [];`

			`// Always include the architecture overview`
			`const archPath = path.join(docsDir, 'reference/system-architecture.md');`
			`if (fs.existsSync(archPath)) {`
			`docs.push({ path: 'reference/system-architecture.md', content: fs.readFileSync(archPath, 'utf8') });`
			`}`

			`// Category-specific doc selection`
			`const q = question.question.toLowerCase();`

			`// Helm-related: include helm index`
			`if (q.includes('helm') \|\| q.includes('chart') \|\| q.includes('secret') \|\| q.includes('port') \|\| q.includes('deploy') \|\| q.includes('service')) {`
			`const helmIndex = path.join(docsDir, 'reference/helm/index.md');`
			`if (fs.existsSync(helmIndex)) {`
			`docs.push({ path: 'reference/helm/index.md', content: fs.readFileSync(helmIndex, 'utf8') });`
			`}`
			`}`

			`// Find specific chart docs mentioned in the question`
			`const chartNames = ['mdm-app', 'ai-app', 'elasticsearch', 'hazelcast', 'cassandra', 'kong', 'redis', 'jenkins', 'otel-collector', 'twistlock', 'cluster'];`
			`for (const name of chartNames) {`
			`if (q.includes(name)) {`
			`// Find matching chart doc(s)`
			`const chartsDir = path.join(docsDir, 'reference/helm/charts');`
			`if (fs.existsSync(chartsDir)) {`
			`for (const f of fs.readdirSync(chartsDir)) {`
			`if (f.includes(name)) {`
			`const content = fs.readFileSync(path.join(chartsDir, f), 'utf8');`
			docs.push({ path: `reference/helm/charts/${f}`, content });
			`}`
			`}`
			`}`
			`}`
			`}`

			`// Subsystem-related`
			`if (q.includes('subsystem') \|\| q.includes('cross-cutting') \|\| q.includes('depend')) {`
			`const subsDir = path.join(docsDir, 'reference/subsystems');`
			`if (fs.existsSync(subsDir)) {`
			`for (const f of fs.readdirSync(subsDir)) {`
			`const content = fs.readFileSync(path.join(subsDir, f), 'utf8');`
			docs.push({ path: `reference/subsystems/${f}`, content });
			`}`
			`}`
			`}`

			`// Contracts`
			`if (q.includes('contract') \|\| q.includes('interface') \|\| q.includes('secret') \|\| q.includes('configmap')) {`
			`const contractsPath = path.join(docsDir, 'reference/contracts/index.md');`
			`if (fs.existsSync(contractsPath)) {`
			`const content = fs.readFileSync(contractsPath, 'utf8');`
			`// Truncate if massive`
			`docs.push({ path: 'reference/contracts/index.md', content: content.substring(0, 30000) });`
			`}`
			`}`

			`// Cap total context at 60K`
			`let total = 0;`
			`return docs.filter(d => {`
			`if (total + d.content.length > 60000) return false;`
			`total += d.content.length;`
			`return true;`
			`});`
			`}`

			`/** Have an LLM simulate a human reading the docs */`
			`async function humanRead(question, docs, llmOpts) {`
			const context = docs.map(d => `\n=== ${d.path} ===\n${d.content}`).join('\n');

			const prompt = `You are a new infrastructure engineer who just joined the Foxtrot team. You've been given documentation to read. Answer the following question as if you're reading these docs for the first time.

			`DOCUMENTATION:`
			`${context}`

			`QUESTION: ${question.question}`

			`Instructions:`
			`- Read the docs carefully, as a human would`
			`- If the answer requires combining info from multiple sections, do so`
			`- If the docs don't cover this, say "NOT_FOUND"`
			`- Match the expected format: ${question.answerType}`

			Answer:`;

			`return callLLM(prompt, { ...llmOpts, maxTokens: 1024, temperature: 0.1 });`
			`}`

			`/** Score with human-focused criteria */`
			`async function scoreHuman(question, answer, docsUsed, llmOpts) {`
			const prompt = `You are evaluating documentation quality from a HUMAN reader's perspective.

			`QUESTION: ${question.question}`
			`GROUND TRUTH: ${question.answer}`
			`READER'S ANSWER: ${answer}`
			`DOCS CONSULTED: ${docsUsed.map(d => d.path).join(', ')}`

			`Score on these human-centric dimensions (0-5 each):`
			`1. CLARITY: Was the information presented clearly enough for a human to extract the answer? (0=confusing/buried, 5=immediately obvious)`
			`2. ACTIONABILITY: Could an engineer act on this information? (0=useless, 5=ready to execute)`
			`3. COMPLETENESS: Did the docs contain ALL the information needed? (0=missing, 5=fully covered)`
			`4. FINDABILITY: Based on the doc paths, would a human naturally look in these files? (0=buried in wrong place, 5=exactly where you'd expect)`

			`If the reader answered "NOT_FOUND", score CLARITY=0, ACTIONABILITY=0, COMPLETENESS=0, FINDABILITY=0.`

			`Respond in EXACTLY this JSON format:`
			{"clarity": N, "actionability": N, "completeness": N, "findability": N, "notes": "brief explanation"}`;

			`const raw = await callLLM(prompt, { ...llmOpts, maxTokens: 256, temperature: 0.0 });`
			`try {`
			`const jsonMatch = raw.match(/\{[\s\S]*\}/);`
			`if (jsonMatch) return JSON.parse(jsonMatch[0]);`
			`} catch {}`
			return { clarity: 0, actionability: 0, completeness: 0, findability: 0, notes: `Parse error: ${raw.substring(0, 100)}` };
			`}`

			`/** Run the human eval */`
			`async function runHumanEval(docsDir, questionsPath, llmOpts = {}) {`
			`const questionsData = JSON.parse(fs.readFileSync(questionsPath, 'utf8'));`
			`const questions = questionsData.questions.filter(q => q.audience.includes('human'));`

			console.log(`Human Eval: ${questions.length} human-audience questions`);

			`const results = [];`
			`let totals = { clarity: 0, actionability: 0, completeness: 0, findability: 0 };`
			`let notFound = 0;`

			`for (let i = 0; i < questions.length; i++) {`
			`const q = questions[i];`
			process.stdout.write(`[${i + 1}/${questions.length}] ${q.id}...`);

			`const docs = findRelevantDocs(q, docsDir);`

			`let answer;`
			`try {`
			`answer = await humanRead(q, docs, llmOpts);`
			`} catch (err) {`
			answer = `ERROR: ${err.message}`;
			`}`

			`let score;`
			`try {`
			`score = await scoreHuman(q, answer, docs, llmOpts);`
			`} catch (err) {`
			score = { clarity: 0, actionability: 0, completeness: 0, findability: 0, notes: `Score error: ${err.message}` };
			`}`

			`const isNotFound = answer.includes('NOT_FOUND');`
			`if (isNotFound) notFound++;`

			`for (const k of Object.keys(totals)) totals[k] += (score[k] \|\| 0);`

			`const composite = ((score.clarity + score.actionability + score.completeness + score.findability) / 20 * 100).toFixed(0);`
			console.log(` ${composite}% (Cl:${score.clarity} Ac:${score.actionability} Co:${score.completeness} Fi:${score.findability}) docs:${docs.length}${isNotFound ? ' [NOT_FOUND]' : ''}`);

			`results.push({`
			`id: q.id,`
			`category: q.category,`
			`difficulty: q.difficulty,`
			`question: q.question,`
			`groundTruth: q.answer,`
			`humanAnswer: answer,`
			`docsUsed: docs.map(d => d.path),`
			`score,`
			`composite: Number(composite),`
			`notFound: isNotFound,`
			`});`
			`}`

			`const n = questions.length;`
			`const report = {`
			`evalType: 'human',`
			`timestamp: new Date().toISOString(),`
			`docsDir,`
			`totalQuestions: n,`
			`overallScore: ((Object.values(totals).reduce((a, b) => a + b, 0)) / (n * 20) * 100).toFixed(1),`
			`avgClarity: (totals.clarity / n).toFixed(2),`
			`avgActionability: (totals.actionability / n).toFixed(2),`
			`avgCompleteness: (totals.completeness / n).toFixed(2),`
			`avgFindability: (totals.findability / n).toFixed(2),`
			`notFoundCount: notFound,`
			`notFoundRate: ((notFound / n) * 100).toFixed(1) + '%',`
			`byCategory: {},`
			`byDifficulty: {},`
			`results,`
			`};`

			`for (const r of results) {`
			`for (const groupKey of ['category', 'difficulty']) {`
			`const group = groupKey === 'category' ? report.byCategory : report.byDifficulty;`
			`const key = r[groupKey];`
			`if (!group[key]) group[key] = { total: 0, count: 0 };`
			`group[key].total += r.composite;`
			`group[key].count++;`
			`}`
			`}`
			`for (const group of [report.byCategory, report.byDifficulty]) {`
			`for (const [k, v] of Object.entries(group)) {`
			`group[k] = { avg: (v.total / v.count).toFixed(1), count: v.count };`
			`}`
			`}`

			`return report;`
			`}`

			`if (require.main === module) {`
			`const docsDir = process.argv[2];`
			`const questionsPath = process.argv[3];`
			`const outPath = process.argv[4] \|\| './eval-human-report.json';`

			`if (!docsDir \|\| !questionsPath) {`
			`console.error('Usage: node eval-human.js <docs-dir> <questions.json> [output.json]');`
			`process.exit(1);`
			`}`

			`const model = process.env.LLM_MODEL \|\| 'claude-haiku-4.5';`
			console.log(`Using model: ${model}`);

			`(async () => {`
			`try {`
			`const report = await runHumanEval(docsDir, questionsPath, { model });`

			`console.log('\n' + '═'.repeat(60));`
			`console.log('HUMAN EVAL REPORT');`
			`console.log('═'.repeat(60));`
			console.log(`Overall Score: ${report.overallScore}%`);
			console.log(`Clarity: ${report.avgClarity}/5 Actionability: ${report.avgActionability}/5 Completeness: ${report.avgCompleteness}/5 Findability: ${report.avgFindability}/5`);
			console.log(`Not Found: ${report.notFoundCount}/${report.totalQuestions} (${report.notFoundRate})`);
			`console.log('\nBy Category:');`
			`for (const [cat, s] of Object.entries(report.byCategory)) {`
			console.log(` ${cat}: ${s.avg}% (${s.count} questions)`);
			`}`
			`console.log('\nBy Difficulty:');`
			`for (const [diff, s] of Object.entries(report.byDifficulty)) {`
			console.log(` ${diff}: ${s.avg}% (${s.count} questions)`);
			`}`

			`const worst = [...report.results].sort((a, b) => a.composite - b.composite).slice(0, 5);`
			`console.log('\nWeakest:');`
			`for (const w of worst) {`
			console.log(` [${w.id}] ${w.composite}% — ${w.question.substring(0, 70)}...`);
			`}`

			`fs.writeFileSync(outPath, JSON.stringify(report, null, 2));`
			console.log(`\nFull report: ${outPath}`);
			`} catch (err) {`
			`console.error('Human eval failed:', err);`
			`process.exit(1);`
			`}`
			`})();`
			`}`

			`module.exports = { runHumanEval };`