eval-agent.js

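/**
 * Eval Track 1: Agent File-Browsing Benchmark
 *
 * Simulates an agent with read access to the docs directory. For each
 * question the agent is shown the directory tree (3 levels deep), picks up
 * to 5 files to read, and answers from those files only. This is a single
 * plan-then-read pass: cross-references inside the files are not followed.
 * Tests whether the doc STRUCTURE is navigable by an AI agent.
 *
 * The model is taken from the LLM_MODEL environment variable
 * (default: claude-haiku-4.5).
 *
 * Usage: node eval-agent.js <docs-dir> <questions.json> [output.json]
 */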

const fs = require('fs');
const path = require('path');
const { callLLM } = require('./prose.js');

/**
 * Turn one line of the planning response into a candidate relative path.
 *
 * Removes a leading list marker ("-", "*", "•", "1.", "2)"), any backticks,
 * and a trailing " (1.2K)" / " (3 items)" annotation. The tree shown to the
 * model carries those annotations, and a model that echoes a tree line
 * verbatim would otherwise name a path that does not exist on disk.
 *
 * @param {string} line - One raw line of model output.
 * @returns {string} The cleaned path (possibly empty).
 */
function cleanPlannedPath(line) {
  return line
    .trim()
    // Numbered markers require trailing whitespace so "2024.notes.md" survives.
    .replace(/^(?:[-*•]\s*|\d+[.)]\s+)/, '')
    .replace(/`/g, '')
    .replace(/\s+\((?:[\d.]+K|\d+ items)\)$/, '')
    .trim();
}

/**
 * Read a regular file located inside docsDir, truncated to maxChars.
 *
 * @param {string} docsDir - Root of the documentation tree.
 * @param {string} relPath - Path relative to docsDir (a leading "/" is
 *   treated as relative to docsDir, as path.join does).
 * @param {number} maxChars - Maximum number of characters to return before
 *   the truncation marker is appended.
 * @returns {string|null} File content, or null when the path escapes
 *   docsDir, is not a regular file, or cannot be read.
 */
function readDocFile(docsDir, relPath, maxChars) {
  const root = path.resolve(docsDir);
  const absPath = path.join(root, relPath);
  const relative = path.relative(root, absPath);
  const escapesRoot =
    relative === '' ||
    relative === '..' ||
    relative.startsWith(`..${path.sep}`) ||
    path.isAbsolute(relative);
  // The path comes from model output: never read outside the docs tree.
  if (escapesRoot) return null;

  try {
    if (!fs.statSync(absPath).isFile()) return null;
    const content = fs.readFileSync(absPath, 'utf8');
    return content.length > maxChars
      ? `${content.substring(0, maxChars)}\n... (truncated)`
      : content;
  } catch {
    // Best effort: a missing or unreadable file is simply not part of the context.
    return null;
  }
}

/**
 * Simulate an agent browsing the doc tree with file tools.
 *
 * Shows the model the directory tree, asks which files to read, loads those
 * files (falling back to two fixed index files when none could be read), and
 * asks the model to answer from the loaded content only.
 *
 * @param {{question: string, answerType: string}} question - Question record.
 * @param {string} docsDir - Root of the documentation tree.
 * @param {object} llmOpts - Options forwarded to callLLM (maxTokens and
 *   temperature are overridden per call).
 * @returns {Promise<{answer: *, filesRead: string[], plannedFiles: string[]}>}
 *   The model's answer, the files actually loaded, and the cleaned,
 *   de-duplicated file list the model asked for.
 */
async function agentBrowse(question, docsDir, llmOpts) {
  const maxPlannedFiles = 5;
  const maxFileChars = 15000;

  // Step 1: Agent sees the directory tree
  const tree = buildTree(docsDir, '', 3);

  // Step 2: Agent picks which files to read based on the question + tree
  const planPrompt = `You are an AI agent with access to a documentation directory. You need to answer a question by browsing the file tree and reading specific files.

FILE TREE:
${tree}

QUESTION: ${question.question}
EXPECTED FORMAT: ${question.answerType}

Based on the file tree, which files should you read to answer this question? List up to 5 file paths (most relevant first). Think about:
- Index files that might have summary tables
- Specific chart/subsystem docs that match the question topic
- Architecture overview docs for system-wide questions

Respond with ONLY the file paths, one per line. No explanation.`;

  const planRaw = await callLLM(planPrompt, { ...llmOpts, maxTokens: 512, temperature: 0.0 });

  // Parse file paths from the plan; de-duplicate so a repeated path is not
  // loaded (and counted against the budget) twice.
  const planText = planRaw == null ? '' : String(planRaw);
  const candidates = planText
    .split('\n')
    .map(cleanPlannedPath)
    .filter((p) => p.length > 0 && !p.startsWith('#'));
  const plannedFiles = [...new Set(candidates)].slice(0, maxPlannedFiles);

  // Step 3: Read the planned files
  let context = '';
  const filesRead = [];
  const load = (relPath) => {
    const content = readDocFile(docsDir, relPath, maxFileChars);
    if (content === null) return;
    context += `\n=== ${relPath} ===\n${content}\n`;
    filesRead.push(relPath);
  };
  plannedFiles.forEach(load);

  // Step 4: If nothing from the plan could be read, fall back to the main index files
  if (filesRead.length === 0) {
    ['reference/system-architecture.md', 'reference/helm/index.md'].forEach(load);
  }

  // Step 5: Agent answers from the files it read
  const answerPrompt = `You are an AI agent that has browsed a documentation directory to answer a question. Here are the files you read:

${context}

QUESTION: ${question.question}
EXPECTED FORMAT: ${question.answerType}

Answer the question using ONLY the information from the files above. If you can't find the answer, say "NOT_FOUND".
Be precise and match the expected format.

Answer:`;

  const answer = await callLLM(answerPrompt, { ...llmOpts, maxTokens: 1024, temperature: 0.0 });

  return { answer, filesRead, plannedFiles };
}

/**
 * Build a directory tree string, one entry per line.
 *
 * Directories are rendered as "rel/path/ (N items)" and files as
 * "rel/path (S.SK)", where S is the size in KiB. Entries are sorted by name
 * and dot-entries are skipped; N counts only the entries that would be
 * listed, so it agrees with what the reader of the tree can see.
 *
 * @param {string} dir - Directory to list.
 * @param {string} prefix - Relative path prepended to every entry ('' at the root).
 * @param {number} maxDepth - Number of levels to descend; <= 0 yields ''.
 * @returns {string} Newline-joined listing, or '' when dir cannot be read.
 */
function buildTree(dir, prefix, maxDepth) {
  if (maxDepth <= 0) return '';

  let entries;
  try {
    entries = fs.readdirSync(dir, { withFileTypes: true });
  } catch {
    return '';
  }

  const isVisible = (name) => !name.startsWith('.');
  const visible = entries
    .filter((entry) => isVisible(entry.name))
    .sort((a, b) => a.name.localeCompare(b.name));

  const lines = [];
  for (const entry of visible) {
    const fullPath = path.join(dir, entry.name);
    const relPath = prefix ? `${prefix}/${entry.name}` : entry.name;
    try {
      if (entry.isDirectory()) {
        const childCount = fs.readdirSync(fullPath).filter(isVisible).length;
        lines.push(`${relPath}/ (${childCount} items)`);
        const subtree = buildTree(fullPath, relPath, maxDepth - 1);
        if (subtree) lines.push(subtree);
      } else {
        const { size } = fs.statSync(fullPath);
        lines.push(`${relPath} (${(size / 1024).toFixed(1)}K)`);
      }
    } catch {
      // Broken symlink or unreadable entry: leave it out rather than abort
      // the whole listing, since it could not be read by the agent anyway.
    }
  }
  return lines.join('\n');
}

/**
 * Coerce one judge-supplied dimension into a finite number in [0, 5].
 * Missing or non-numeric values become 0 so downstream sums stay numeric.
 *
 * @param {*} value - Raw value taken from the judge's JSON.
 * @returns {number} A number between 0 and 5 inclusive.
 */
function clampDimensionScore(value) {
  const num = Number(value);
  if (!Number.isFinite(num)) return 0;
  return Math.min(5, Math.max(0, num));
}

/**
 * Score an answer using LLM-as-judge (same as eval.js).
 *
 * Sends the question, ground truth and answer to the judge model and parses
 * the first-"{"-to-last-"}" span of the reply as JSON.
 *
 * @param {{question: string, answerType: string, answer: string}} question -
 *   Question record including the ground-truth answer.
 * @param {string} llmAnswer - The answer being evaluated.
 * @param {object} llmOpts - Options forwarded to callLLM (maxTokens and
 *   temperature are overridden).
 * @returns {Promise<{accuracy: number, completeness: number, precision: number,
 *   navigation: number, notes: string}>} Scores normalized to numbers in
 *   [0, 5]; all zeros with a "Parse error: ..." note when the reply contains
 *   no parseable JSON object.
 */
async function scoreAnswer(question, llmAnswer, llmOpts) {
  const prompt = `You are a strict evaluator scoring an AI agent's answer against ground truth.

QUESTION: ${question.question}
EXPECTED ANSWER TYPE: ${question.answerType}
GROUND TRUTH: ${question.answer}
AI ANSWER: ${llmAnswer}

Score on these dimensions (0-5 each):
1. ACCURACY: Does the answer contain the correct facts?
2. COMPLETENESS: Does it cover all items in the ground truth?
3. PRECISION: Is it free of hallucinated or incorrect extra information?
4. NAVIGATION: Did the agent demonstrate it could find the right information? (0=couldn't find anything, 5=went straight to the right file)

If the AI answered "NOT_FOUND", score ACCURACY=0, COMPLETENESS=0, PRECISION=5, NAVIGATION=0.

Respond in EXACTLY this JSON format:
{"accuracy": N, "completeness": N, "precision": N, "navigation": N, "notes": "brief explanation"}`;

  const raw = await callLLM(prompt, { ...llmOpts, maxTokens: 256, temperature: 0.0 });
  const rawText = raw == null ? '' : String(raw);

  const jsonMatch = rawText.match(/\{[\s\S]*\}/);
  if (jsonMatch) {
    try {
      const parsed = JSON.parse(jsonMatch[0]);
      // Keep any extra keys the judge returned, but guarantee the four
      // dimensions are in-range numbers so callers can sum them safely.
      return {
        ...parsed,
        accuracy: clampDimensionScore(parsed.accuracy),
        completeness: clampDimensionScore(parsed.completeness),
        precision: clampDimensionScore(parsed.precision),
        navigation: clampDimensionScore(parsed.navigation),
        notes: parsed.notes == null ? '' : String(parsed.notes),
      };
    } catch {
      // Malformed JSON: fall through to the zero-score result below.
    }
  }
  return {
    accuracy: 0,
    completeness: 0,
    precision: 0,
    navigation: 0,
    notes: `Parse error: ${rawText.substring(0, 100)}`,
  };
}

/**
 * Run the agent eval over every question whose audience includes 'machine'.
 *
 * For each question the agent browses the docs (agentBrowse) and the answer
 * is judged (scoreAnswer). A failure in either step is recorded on that
 * question (as an "ERROR: ..." answer or a zero score) instead of aborting
 * the run. Progress is printed to stdout as the run proceeds.
 *
 * @param {string} docsDir - Root of the documentation tree.
 * @param {string} questionsPath - JSON file with a top-level "questions" array.
 * @param {object} [llmOpts={}] - Options forwarded to the LLM calls.
 * @returns {Promise<object>} Report with overall and per-dimension averages
 *   (formatted strings), NOT_FOUND statistics, per-category and
 *   per-difficulty averages, and the per-question results.
 * @throws {Error} If the questions file is unreadable, is not valid JSON, or
 *   has no "questions" array.
 */
async function runAgentEval(docsDir, questionsPath, llmOpts = {}) {
  const dimensions = ['accuracy', 'completeness', 'precision', 'navigation'];
  const maxPerDimension = 5;
  const maxComposite = dimensions.length * maxPerDimension;

  // Judge output is model-generated: force each dimension to a number in range
  // so sums never turn into NaN or string concatenation.
  const toScore = (value) => {
    const num = Number(value);
    return Number.isFinite(num) ? Math.min(maxPerDimension, Math.max(0, num)) : 0;
  };
  // With zero questions every average is reported as 0 rather than NaN.
  const safeDivide = (num, den) => (den === 0 ? 0 : num / den);

  const questionsData = JSON.parse(fs.readFileSync(questionsPath, 'utf8'));
  if (!questionsData || !Array.isArray(questionsData.questions)) {
    throw new Error(`${questionsPath} must contain a top-level "questions" array`);
  }
  const questions = questionsData.questions.filter((q) => {
    if (!q) return false;
    const { audience } = q;
    const searchable = Array.isArray(audience) || typeof audience === 'string';
    return searchable && audience.includes('machine');
  });

  console.log(`Agent Eval: ${questions.length} machine-audience questions`);

  const results = [];
  const totals = { accuracy: 0, completeness: 0, precision: 0, navigation: 0 };
  let notFound = 0;

  for (let i = 0; i < questions.length; i++) {
    const q = questions[i];
    process.stdout.write(`[${i + 1}/${questions.length}] ${q.id}...`);

    let browseResult;
    try {
      browseResult = await agentBrowse(q, docsDir, llmOpts);
    } catch (err) {
      browseResult = { answer: `ERROR: ${err.message}`, filesRead: [], plannedFiles: [] };
    }
    const answer = typeof browseResult.answer === 'string'
      ? browseResult.answer
      : String(browseResult.answer);

    let rawScore;
    try {
      rawScore = (await scoreAnswer(q, answer, llmOpts)) || {};
    } catch (err) {
      rawScore = { accuracy: 0, completeness: 0, precision: 0, navigation: 0, notes: `Score error: ${err.message}` };
    }
    const score = { ...rawScore };
    let questionTotal = 0;
    for (const dim of dimensions) {
      score[dim] = toScore(rawScore[dim]);
      totals[dim] += score[dim];
      questionTotal += score[dim];
    }

    const isNotFound = answer.includes('NOT_FOUND');
    if (isNotFound) notFound++;

    const composite = (questionTotal / maxComposite * 100).toFixed(0);
    console.log(` ${composite}% (A:${score.accuracy} C:${score.completeness} P:${score.precision} N:${score.navigation}) files:${browseResult.filesRead.length}${isNotFound ? ' [NOT_FOUND]' : ''}`);

    results.push({
      id: q.id,
      category: q.category,
      difficulty: q.difficulty,
      question: q.question,
      groundTruth: q.answer,
      llmAnswer: answer,
      filesRead: browseResult.filesRead,
      plannedFiles: browseResult.plannedFiles,
      score,
      composite: Number(composite),
      notFound: isNotFound,
    });
  }

  const n = questions.length;
  const grandTotal = dimensions.reduce((sum, dim) => sum + totals[dim], 0);
  const report = {
    evalType: 'agent',
    timestamp: new Date().toISOString(),
    docsDir,
    totalQuestions: n,
    overallScore: (safeDivide(grandTotal, n * maxComposite) * 100).toFixed(1),
    avgAccuracy: safeDivide(totals.accuracy, n).toFixed(2),
    avgCompleteness: safeDivide(totals.completeness, n).toFixed(2),
    avgPrecision: safeDivide(totals.precision, n).toFixed(2),
    avgNavigation: safeDivide(totals.navigation, n).toFixed(2),
    notFoundCount: notFound,
    notFoundRate: (safeDivide(notFound, n) * 100).toFixed(1) + '%',
    byCategory: {},
    byDifficulty: {},
    results,
  };

  // Aggregate composite scores by category and by difficulty
  const groupings = [
    ['category', report.byCategory],
    ['difficulty', report.byDifficulty],
  ];
  for (const [field, bucket] of groupings) {
    const sums = {};
    for (const r of results) {
      const key = r[field];
      if (!sums[key]) sums[key] = { total: 0, count: 0 };
      sums[key].total += r.composite;
      sums[key].count++;
    }
    for (const [key, { total, count }] of Object.entries(sums)) {
      bucket[key] = { avg: (total / count).toFixed(1), count };
    }
  }

  return report;
}

// CLI entry point: only runs when this file is executed directly with node.
if (require.main === module) {
  const [, , docsDir, questionsPath, outputArg] = process.argv;
  const outPath = outputArg || './eval-agent-report.json';

  if (!(docsDir && questionsPath)) {
    console.error('Usage: node eval-agent.js <docs-dir> <questions.json> [output.json]');
    process.exit(1);
  }

  const model = process.env.LLM_MODEL || 'claude-haiku-4.5';
  console.log(`Using model: ${model}`);

  // Print one heading followed by an "avg% (count questions)" line per group.
  const printGroup = (heading, group) => {
    console.log(heading);
    Object.entries(group).forEach(([label, stats]) => {
      console.log(`  ${label}: ${stats.avg}% (${stats.count} questions)`);
    });
  };

  // Run the eval, print the summary, then write the full report as JSON.
  const main = async () => {
    const report = await runAgentEval(docsDir, questionsPath, { model });
    const rule = '═'.repeat(60);

    console.log('\n' + rule);
    console.log('AGENT EVAL REPORT');
    console.log(rule);
    console.log(`Overall Score: ${report.overallScore}%`);
    console.log(`Accuracy: ${report.avgAccuracy}/5  Completeness: ${report.avgCompleteness}/5  Precision: ${report.avgPrecision}/5  Navigation: ${report.avgNavigation}/5`);
    console.log(`Not Found: ${report.notFoundCount}/${report.totalQuestions} (${report.notFoundRate})`);

    printGroup('\nBy Category:', report.byCategory);
    printGroup('\nBy Difficulty:', report.byDifficulty);

    // Five lowest composite scores, sorted on a copy so report.results keeps its order.
    const weakest = report.results
      .slice()
      .sort((a, b) => a.composite - b.composite)
      .slice(0, 5);
    console.log('\nWeakest:');
    weakest.forEach((w) => {
      console.log(`  [${w.id}] ${w.composite}% — ${w.question.substring(0, 70)}... (read: ${w.filesRead.join(', ') || 'none'})`);
    });

    fs.writeFileSync(outPath, JSON.stringify(report, null, 2));
    console.log(`\nFull report: ${outPath}`);
  };

  main().catch((err) => {
    console.error('Agent eval failed:', err);
    process.exit(1);
  });
}

module.exports = { runAgentEval };
Phase 9c: Split eval into Agent (file-browsing) and Human (readability) tracks

Agent eval: 54.3% (22 questions, 40.9% NOT_FOUND)
Human eval: 63.9% (28 questions, 17.9% NOT_FOUND)

Key findings:
- Agent navigation is the bottleneck (2.09/5) — long path-based filenames hurt discoverability
- Human findability is decent (3.46/5), but dependency questions fail (0%) because chart docs for wrapper charts don't surface their sub-chart deps
- Both tracks show strong precision (4.4+/5) — very low hallucination
- Resources (91%) and interactions (95%) score great for humans
- Configuration and contracts are solid across both tracks

2026-03-09 23:55:54 +00:00
			`/**`
			`* Eval Track 1: Agent File-Browsing Benchmark`
			`*`
			`* Spawns a sub-agent with file access to the docs directory.`
			`* The agent navigates the tree, reads files, follows cross-references.`
			`* Tests whether the doc STRUCTURE is navigable by an AI agent.`
			`*`
			`* Usage: node eval-agent.js <docs-dir> <questions.json> [output.json]`
			`*/`

			`const fs = require('fs');`
			`const path = require('path');`
			`const { callLLM } = require('./prose.js');`

			`/** Simulate an agent browsing the doc tree with file tools */`
			`async function agentBrowse(question, docsDir, llmOpts) {`
			`// Step 1: Agent sees the directory tree`
			`const tree = buildTree(docsDir, '', 3);`

			`// Step 2: Agent picks which files to read based on the question + tree`
			const planPrompt = `You are an AI agent with access to a documentation directory. You need to answer a question by browsing the file tree and reading specific files.

			`FILE TREE:`
			`${tree}`

			`QUESTION: ${question.question}`
			`EXPECTED FORMAT: ${question.answerType}`

			`Based on the file tree, which files should you read to answer this question? List up to 5 file paths (most relevant first). Think about:`
			`- Index files that might have summary tables`
			`- Specific chart/subsystem docs that match the question topic`
			`- Architecture overview docs for system-wide questions`

			Respond with ONLY the file paths, one per line. No explanation.`;

			`const planRaw = await callLLM(planPrompt, { ...llmOpts, maxTokens: 512, temperature: 0.0 });`

			`// Parse file paths from plan`
			`const plannedFiles = planRaw.split('\n')`
			.map(l => l.trim().replace(/^[-•]\s/, '').replace(/`/g, ''))
			`.filter(l => l.length > 0 && !l.startsWith('#'))`
			`.slice(0, 5);`

			`// Step 3: Read the planned files`
			`let context = '';`
			`const filesRead = [];`
			`for (const relPath of plannedFiles) {`
			`const absPath = path.join(docsDir, relPath);`
			`if (fs.existsSync(absPath)) {`
			`try {`
			`const content = fs.readFileSync(absPath, 'utf8');`
			`// Cap per file at 15K chars`
			`const truncated = content.length > 15000 ? content.substring(0, 15000) + '\n... (truncated)' : content;`
			context += `\n=== ${relPath} ===\n${truncated}\n`;
			`filesRead.push(relPath);`
			`} catch {}`
			`}`
			`}`

			`// Step 4: If the agent found nothing useful, let it try a second pass`
			`if (filesRead.length === 0) {`
			`// Fallback: read the main index files`
			`const fallbacks = ['reference/system-architecture.md', 'reference/helm/index.md'];`
			`for (const fb of fallbacks) {`
			`const absPath = path.join(docsDir, fb);`
			`if (fs.existsSync(absPath)) {`
			`const content = fs.readFileSync(absPath, 'utf8');`
			context += `\n=== ${fb} ===\n${content.substring(0, 15000)}\n`;
			`filesRead.push(fb);`
			`}`
			`}`
			`}`

			`// Step 5: Agent answers from the files it read`
			const answerPrompt = `You are an AI agent that has browsed a documentation directory to answer a question. Here are the files you read:

			`${context}`

			`QUESTION: ${question.question}`
			`EXPECTED FORMAT: ${question.answerType}`

			`Answer the question using ONLY the information from the files above. If you can't find the answer, say "NOT_FOUND".`
			`Be precise and match the expected format.`

			Answer:`;

			`const answer = await callLLM(answerPrompt, { ...llmOpts, maxTokens: 1024, temperature: 0.0 });`

			`return { answer, filesRead, plannedFiles };`
			`}`

			`/** Build a directory tree string */`
			`function buildTree(dir, prefix, maxDepth) {`
			`if (maxDepth <= 0) return '';`
			`const lines = [];`
			`let entries;`
			`try { entries = fs.readdirSync(dir, { withFileTypes: true }).sort((a, b) => a.name.localeCompare(b.name)); } catch { return ''; }`

			`for (const e of entries) {`
			`if (e.name.startsWith('.')) continue;`
			const relPath = prefix ? `${prefix}/${e.name}` : e.name;
			`if (e.isDirectory()) {`
			`const childCount = fs.readdirSync(path.join(dir, e.name)).length;`
			lines.push(`${relPath}/ (${childCount} items)`);
			`if (maxDepth > 1) {`
			`lines.push(buildTree(path.join(dir, e.name), relPath, maxDepth - 1));`
			`}`
			`} else {`
			`const size = fs.statSync(path.join(dir, e.name)).size;`
			lines.push(`${relPath} (${(size / 1024).toFixed(1)}K)`);
			`}`
			`}`
			`return lines.filter(l => l).join('\n');`
			`}`

			`/** Score using LLM-as-judge (same as eval.js) */`
			`async function scoreAnswer(question, llmAnswer, llmOpts) {`
			const prompt = `You are a strict evaluator scoring an AI agent's answer against ground truth.

			`QUESTION: ${question.question}`
			`EXPECTED ANSWER TYPE: ${question.answerType}`
			`GROUND TRUTH: ${question.answer}`
			`AI ANSWER: ${llmAnswer}`

			`Score on these dimensions (0-5 each):`
			`1. ACCURACY: Does the answer contain the correct facts?`
			`2. COMPLETENESS: Does it cover all items in the ground truth?`
			`3. PRECISION: Is it free of hallucinated or incorrect extra information?`
			`4. NAVIGATION: Did the agent demonstrate it could find the right information? (0=couldn't find anything, 5=went straight to the right file)`

			`If the AI answered "NOT_FOUND", score ACCURACY=0, COMPLETENESS=0, PRECISION=5, NAVIGATION=0.`

			`Respond in EXACTLY this JSON format:`
			{"accuracy": N, "completeness": N, "precision": N, "navigation": N, "notes": "brief explanation"}`;

			`const raw = await callLLM(prompt, { ...llmOpts, maxTokens: 256, temperature: 0.0 });`
			`try {`
			`const jsonMatch = raw.match(/\{[\s\S]*\}/);`
			`if (jsonMatch) return JSON.parse(jsonMatch[0]);`
			`} catch {}`
			return { accuracy: 0, completeness: 0, precision: 0, navigation: 0, notes: `Parse error: ${raw.substring(0, 100)}` };
			`}`

			`/** Run the agent eval */`
			`async function runAgentEval(docsDir, questionsPath, llmOpts = {}) {`
			`const questionsData = JSON.parse(fs.readFileSync(questionsPath, 'utf8'));`
			`const questions = questionsData.questions.filter(q => q.audience.includes('machine'));`

			console.log(`Agent Eval: ${questions.length} machine-audience questions`);

			`const results = [];`
			`let totals = { accuracy: 0, completeness: 0, precision: 0, navigation: 0 };`
			`let notFound = 0;`

			`for (let i = 0; i < questions.length; i++) {`
			`const q = questions[i];`
			process.stdout.write(`[${i + 1}/${questions.length}] ${q.id}...`);

			`let browseResult;`
			`try {`
			`browseResult = await agentBrowse(q, docsDir, llmOpts);`
			`} catch (err) {`
			browseResult = { answer: `ERROR: ${err.message}`, filesRead: [], plannedFiles: [] };
			`}`

			`let score;`
			`try {`
			`score = await scoreAnswer(q, browseResult.answer, llmOpts);`
			`} catch (err) {`
			score = { accuracy: 0, completeness: 0, precision: 0, navigation: 0, notes: `Score error: ${err.message}` };
			`}`

			`const isNotFound = browseResult.answer.includes('NOT_FOUND');`
			`if (isNotFound) notFound++;`

			`for (const k of Object.keys(totals)) totals[k] += score[k];`

			`const composite = ((score.accuracy + score.completeness + score.precision + score.navigation) / 20 * 100).toFixed(0);`
			console.log(` ${composite}% (A:${score.accuracy} C:${score.completeness} P:${score.precision} N:${score.navigation}) files:${browseResult.filesRead.length}${isNotFound ? ' [NOT_FOUND]' : ''}`);

			`results.push({`
			`id: q.id,`
			`category: q.category,`
			`difficulty: q.difficulty,`
			`question: q.question,`
			`groundTruth: q.answer,`
			`llmAnswer: browseResult.answer,`
			`filesRead: browseResult.filesRead,`
			`plannedFiles: browseResult.plannedFiles,`
			`score,`
			`composite: Number(composite),`
			`notFound: isNotFound,`
			`});`
			`}`

			`const n = questions.length;`
			`const report = {`
			`evalType: 'agent',`
			`timestamp: new Date().toISOString(),`
			`docsDir,`
			`totalQuestions: n,`
			`overallScore: ((Object.values(totals).reduce((a, b) => a + b, 0)) / (n * 20) * 100).toFixed(1),`
			`avgAccuracy: (totals.accuracy / n).toFixed(2),`
			`avgCompleteness: (totals.completeness / n).toFixed(2),`
			`avgPrecision: (totals.precision / n).toFixed(2),`
			`avgNavigation: (totals.navigation / n).toFixed(2),`
			`notFoundCount: notFound,`
			`notFoundRate: ((notFound / n) * 100).toFixed(1) + '%',`
			`byCategory: {},`
			`byDifficulty: {},`
			`results,`
			`};`

			`// Aggregate by category and difficulty`
			`for (const r of results) {`
			`for (const groupKey of ['category', 'difficulty']) {`
			`const group = groupKey === 'category' ? report.byCategory : report.byDifficulty;`
			`const key = r[groupKey];`
			`if (!group[key]) group[key] = { total: 0, count: 0 };`
			`group[key].total += r.composite;`
			`group[key].count++;`
			`}`
			`}`
			`for (const group of [report.byCategory, report.byDifficulty]) {`
			`for (const [k, v] of Object.entries(group)) {`
			`group[k] = { avg: (v.total / v.count).toFixed(1), count: v.count };`
			`}`
			`}`

			`return report;`
			`}`

			`if (require.main === module) {`
			`const docsDir = process.argv[2];`
			`const questionsPath = process.argv[3];`
			`const outPath = process.argv[4] \|\| './eval-agent-report.json';`

			`if (!docsDir \|\| !questionsPath) {`
			`console.error('Usage: node eval-agent.js <docs-dir> <questions.json> [output.json]');`
			`process.exit(1);`
			`}`

			`const model = process.env.LLM_MODEL \|\| 'claude-haiku-4.5';`
			console.log(`Using model: ${model}`);

			`(async () => {`
			`try {`
			`const report = await runAgentEval(docsDir, questionsPath, { model });`

			`console.log('\n' + '═'.repeat(60));`
			`console.log('AGENT EVAL REPORT');`
			`console.log('═'.repeat(60));`
			console.log(`Overall Score: ${report.overallScore}%`);
			console.log(`Accuracy: ${report.avgAccuracy}/5 Completeness: ${report.avgCompleteness}/5 Precision: ${report.avgPrecision}/5 Navigation: ${report.avgNavigation}/5`);
			console.log(`Not Found: ${report.notFoundCount}/${report.totalQuestions} (${report.notFoundRate})`);
			`console.log('\nBy Category:');`
			`for (const [cat, s] of Object.entries(report.byCategory)) {`
			console.log(` ${cat}: ${s.avg}% (${s.count} questions)`);
			`}`
			`console.log('\nBy Difficulty:');`
			`for (const [diff, s] of Object.entries(report.byDifficulty)) {`
			console.log(` ${diff}: ${s.avg}% (${s.count} questions)`);
			`}`

			`const worst = [...report.results].sort((a, b) => a.composite - b.composite).slice(0, 5);`
			`console.log('\nWeakest:');`
			`for (const w of worst) {`
			console.log(` [${w.id}] ${w.composite}% — ${w.question.substring(0, 70)}... (read: ${w.filesRead.join(', ') \|\| 'none'})`);
			`}`

			`fs.writeFileSync(outPath, JSON.stringify(report, null, 2));`
			console.log(`\nFull report: ${outPath}`);
			`} catch (err) {`
			`console.error('Agent eval failed:', err);`
			`process.exit(1);`
			`}`
			`})();`
			`}`

			`module.exports = { runAgentEval };`