// eval-agent.js

/**
 * Eval Track 1: Agent File-Browsing Benchmark
 * 
 * Spawns a sub-agent with file access to the docs directory.
 * The agent navigates the tree, reads files, follows cross-references.
 * Tests whether the doc STRUCTURE is navigable by an AI agent.
 * 
 * Usage: node eval-agent.js <docs-dir> <questions.json> [output.json]
 */

const fs = require('fs');
const path = require('path');
const { callLLM } = require('./prose.js');

/** Maximum number of characters of one file that is placed into the answer prompt. */
const MAX_FILE_CHARS = 30000;

/** Maximum number of files the agent may plan to read for one question. */
const MAX_PLANNED_FILES = 5;

/**
 * Remove every `<think>…</think>` block from a model response and trim the remainder.
 *
 * @param {string} text - Raw model output.
 * @returns {string} The output without think blocks.
 */
function stripThinkBlocks(text) {
  return text.replace(/<think>[\s\S]*?<\/think>/g, '').trim();
}

/**
 * Turn the planning response into a list of relative file paths.
 *
 * Besides bullets and backticks, this also removes list numbering ("1. ") and the
 * "(1.2K)" / "(3 items)" annotations that the file tree prints after each entry,
 * because a model that echoes tree lines verbatim would otherwise never hit a file.
 *
 * @param {string} planRaw - Raw response to the planning prompt.
 * @returns {string[]} At most MAX_PLANNED_FILES cleaned paths, in the order given.
 */
function parsePlannedFiles(planRaw) {
  return stripThinkBlocks(planRaw)
    .split('\n')
    .map((line) => line
      .trim()
      .replace(/^[-*•]\s*/, '')
      .replace(/^\d+[.)]\s+/, '')
      .replace(/`/g, '')
      .replace(/\s+\((?:\d+(?:\.\d+)?K|\d+ items)\)$/, '')
      .trim())
    .filter((line) => line.length > 0 && !line.startsWith('#'))
    .slice(0, MAX_PLANNED_FILES);
}

/**
 * Read one documentation file, refusing anything that resolves outside docsDir.
 *
 * @param {string} docsDir - Root of the documentation tree.
 * @param {string} relPath - Path relative to docsDir (comes from model output, so untrusted).
 * @returns {string|null} File content capped at MAX_FILE_CHARS, or null when the path
 *   escapes docsDir, is not a regular file, or cannot be read.
 */
function readDocFile(docsDir, relPath) {
  const root = path.resolve(docsDir);
  // path.join (not resolve) so a leading "/" in relPath stays relative to the docs root.
  const absPath = path.join(root, relPath);
  const fromRoot = path.relative(root, absPath);
  const escapesRoot = fromRoot === '..' || fromRoot.startsWith(`..${path.sep}`) || path.isAbsolute(fromRoot);
  if (fromRoot === '' || escapesRoot) return null;
  // NOTE(review): symlinks inside docsDir that point elsewhere are still followed.

  try {
    if (!fs.statSync(absPath).isFile()) return null;
    const content = fs.readFileSync(absPath, 'utf8');
    return content.length > MAX_FILE_CHARS
      ? content.substring(0, MAX_FILE_CHARS) + '\n... (truncated)'
      : content;
  } catch {
    // Best effort: a missing or unreadable file simply counts as "not read".
    return null;
  }
}

/**
 * Simulate an agent browsing the doc tree with file tools.
 *
 * Flow: show the directory tree to the model, let it pick up to five files, read
 * those files from disk, fall back to two fixed overview files when none of the
 * picks could be read, then ask the model to answer from the collected text.
 *
 * @param {{question: string, answerType: string}} question - Fields interpolated into both prompts.
 * @param {string} docsDir - Root directory of the documentation.
 * @param {object} llmOpts - Options forwarded to callLLM; maxTokens and temperature are overridden here.
 * @returns {Promise<{answer: *, filesRead: string[], plannedFiles: string[]}>}
 *   The model's answer (think blocks removed when it is a string), the files actually
 *   read, and the cleaned list of files the model asked for.
 */
async function agentBrowse(question, docsDir, llmOpts) {
  // Step 1: Agent sees the directory tree
  const tree = buildTree(docsDir, '', 4);
  
  // Step 2: Agent picks which files to read based on the question + tree
  const planPrompt = `You are an AI agent with access to a documentation directory. You need to answer a question by browsing the file tree and reading specific files.

FILE TREE:
${tree}

QUESTION: ${question.question}
EXPECTED FORMAT: ${question.answerType}

CRITICAL RULES FOR FILE SELECTION:
1. If you are unsure where to find the information, you MUST ALWAYS include "reference/index.md" as it contains a keyword mapping to all files.
2. If the question is about dependencies, check "reference/dependencies.md".
3. Think about index files that might have summary tables, and architecture overviews.
4. If you know the exact subsystem, list its operations.md or index.md.

Based on the file tree, which files should you read to answer this question? List up to 5 file paths (most relevant first).
Respond with ONLY the file paths, one per line. No explanation.`;

  const planRaw = await callLLM(planPrompt, { ...llmOpts, maxTokens: 512, temperature: 0.0 });
  const plannedFiles = parsePlannedFiles(planRaw);

  // Step 3: Read the planned files (each file at most once)
  let context = '';
  const filesRead = [];
  const addToContext = (relPath) => {
    if (filesRead.includes(relPath)) return;
    const content = readDocFile(docsDir, relPath);
    if (content === null) return;
    context += `\n=== ${relPath} ===\n${content}\n`;
    filesRead.push(relPath);
  };
  plannedFiles.forEach(addToContext);

  // Step 4: If none of the planned files could be read, fall back to the main overview files
  if (filesRead.length === 0) {
    ['reference/system-architecture.md', 'reference/helm/index.md'].forEach(addToContext);
  }

  // Step 5: Agent answers from the files it read
  const answerPrompt = `You are an AI agent that has browsed a documentation directory to answer a question. Here are the files you read:

${context}

QUESTION: ${question.question}
EXPECTED FORMAT: ${question.answerType}

Answer the question using ONLY the information from the files above. If you can't find the answer, say "NOT_FOUND".
Be precise and match the expected format.

Answer:`;

  const answerRaw = await callLLM(answerPrompt, { ...llmOpts, maxTokens: 1024, temperature: 0.0 });
  // Reasoning text must not reach the judge or the NOT_FOUND substring check done by the caller.
  const answer = typeof answerRaw === 'string' ? stripThinkBlocks(answerRaw) : answerRaw;
  
  return { answer, filesRead, plannedFiles };
}

/**
 * Build a directory tree string.
 *
 * Produces one line per visible entry, sorted by name within each directory:
 * `path/ (N items)` for directories and `path (S.SK)` for everything else, where
 * the size is taken from fs.statSync. Entries whose name starts with "." are not
 * listed (they are still included in a parent's item count).
 *
 * @param {string} dir - Directory to list.
 * @param {string} prefix - Relative path prepended to each entry ('' at the root).
 * @param {number} maxDepth - Number of directory levels to descend; <= 0 yields ''.
 * @returns {string} Newline-joined listing, or '' when dir cannot be read.
 */
function buildTree(dir, prefix, maxDepth) {
  if (maxDepth <= 0) return '';

  let entries;
  try {
    entries = fs.readdirSync(dir, { withFileTypes: true });
  } catch {
    return '';
  }
  entries.sort((a, b) => a.name.localeCompare(b.name));

  const lines = [];
  for (const entry of entries) {
    if (entry.name.startsWith('.')) continue;

    const relPath = prefix ? `${prefix}/${entry.name}` : entry.name;
    const absPath = path.join(dir, entry.name);
    try {
      if (entry.isDirectory()) {
        const childCount = fs.readdirSync(absPath).length;
        lines.push(`${relPath}/ (${childCount} items)`);
        if (maxDepth > 1) {
          lines.push(buildTree(absPath, relPath, maxDepth - 1));
        }
      } else {
        const { size } = fs.statSync(absPath);
        lines.push(`${relPath} (${(size / 1024).toFixed(1)}K)`);
      }
    } catch {
      // A dangling symlink or unreadable subdirectory must not abort the whole
      // listing; the entry is left out because it could not be read anyway.
    }
  }

  // Recursion returns '' for empty or unreadable subdirectories; drop those blanks.
  return lines.filter((line) => line).join('\n');
}

/**
 * Coerce a parsed judge response into a well-formed score object.
 *
 * Each of the four dimensions becomes a finite number clamped to 0..5 (numeric
 * strings are accepted; anything else counts as 0) so that callers can sum them
 * safely. Extra keys in the judge's JSON are kept as-is.
 *
 * @param {object} parsed - Object parsed from the judge's JSON.
 * @returns {object} Score with numeric accuracy/completeness/precision/navigation and string notes.
 */
function normalizeJudgeScore(parsed) {
  const toDimension = (value) => {
    const num = typeof value === 'string' && value.trim() !== '' ? Number(value) : value;
    if (typeof num !== 'number' || !Number.isFinite(num)) return 0;
    return Math.min(5, Math.max(0, num));
  };
  return {
    ...parsed,
    accuracy: toDimension(parsed.accuracy),
    completeness: toDimension(parsed.completeness),
    precision: toDimension(parsed.precision),
    navigation: toDimension(parsed.navigation),
    notes: parsed.notes == null ? '' : String(parsed.notes),
  };
}

/**
 * Score using LLM-as-judge.
 *
 * Sends the question, ground truth and the agent's answer to the model and expects
 * a JSON object with four 0-5 dimensions plus notes.
 *
 * @param {{question: string, answerType: string, answer: *}} question - Question with ground truth.
 * @param {*} llmAnswer - The agent's answer, interpolated into the prompt.
 * @param {object} llmOpts - Options forwarded to callLLM; maxTokens and temperature are overridden here.
 * @returns {Promise<{accuracy: number, completeness: number, precision: number, navigation: number, notes: string}>}
 *   Normalized scores, or all zeros with a "Parse error" note when no JSON object could be parsed.
 */
async function scoreAnswer(question, llmAnswer, llmOpts) {
  const prompt = `You are a strict evaluator scoring an AI agent's answer against ground truth.

QUESTION: ${question.question}
EXPECTED ANSWER TYPE: ${question.answerType}
GROUND TRUTH: ${question.answer}
AI ANSWER: ${llmAnswer}

Score on these dimensions (0-5 each):
1. ACCURACY: Does the answer contain the correct facts?
2. COMPLETENESS: Does it cover all items in the ground truth?
3. PRECISION: Is it free of hallucinated or incorrect extra information?
4. NAVIGATION: Did the agent demonstrate it could find the right information? (0=couldn't find anything, 5=went straight to the right file)

If the AI answered "NOT_FOUND", score ACCURACY=0, COMPLETENESS=0, PRECISION=5, NAVIGATION=0.

Respond in EXACTLY this JSON format:
{"accuracy": N, "completeness": N, "precision": N, "navigation": N, "notes": "brief explanation"}`;

  const raw = await callLLM(prompt, { ...llmOpts, maxTokens: 256, temperature: 0.0 });
  try {
    // Think blocks may contain braces that would corrupt the greedy match below.
    const visible = raw.replace(/<think>[\s\S]*?<\/think>/g, '');
    const jsonMatch = visible.match(/\{[\s\S]*\}/);
    if (jsonMatch) return normalizeJudgeScore(JSON.parse(jsonMatch[0]));
  } catch {
    // Malformed JSON falls through to the zero score below, which records the raw text.
  }
  return { accuracy: 0, completeness: 0, precision: 0, navigation: 0, notes: `Parse error: ${raw.substring(0, 100)}` };
}

/**
 * Run the agent eval.
 *
 * Loads the question set, runs agentBrowse and scoreAnswer for each question in
 * order (one at a time), logs a progress line per question and aggregates the
 * scores into a report. A failure in either step is recorded on that question
 * (as an "ERROR: …" answer or an all-zero score) instead of aborting the run.
 *
 * @param {string} docsDir - Root directory of the documentation under test.
 * @param {string} questionsPath - JSON file containing an object with a `questions` array.
 * @param {object} [llmOpts={}] - Options passed through to agentBrowse and scoreAnswer.
 * @returns {Promise<object>} Report with overall/average scores (formatted strings),
 *   not-found statistics, per-category and per-difficulty averages, and per-question results.
 * @throws {SyntaxError} When the questions file is not valid JSON.
 * @throws {TypeError} When the parsed file has no `questions` array.
 */
async function runAgentEval(docsDir, questionsPath, llmOpts = {}) {
  const questionsData = JSON.parse(fs.readFileSync(questionsPath, 'utf8'));
  if (!questionsData || !Array.isArray(questionsData.questions)) {
    throw new TypeError(`Expected ${questionsPath} to contain an object with a "questions" array`);
  }

  // NOTE(review): the previous audience predicate ended in `|| true`, so it never
  // excluded anything. Every question is still evaluated here; if only
  // machine-audience questions should run, reinstate the filter deliberately.
  const questions = questionsData.questions;

  console.log(`Agent Eval: ${questions.length} questions`);

  const dimensions = ['accuracy', 'completeness', 'precision', 'navigation'];
  // Judge output is model-generated: tolerate numeric strings and missing fields.
  const toNumber = (value) => {
    const num = Number(value);
    return Number.isFinite(num) ? num : 0;
  };
  // Keeps an empty question set from turning every average into NaN.
  const ratio = (numerator, denominator) => (denominator === 0 ? 0 : numerator / denominator);

  const results = [];
  const totals = { accuracy: 0, completeness: 0, precision: 0, navigation: 0 };
  let notFound = 0;

  for (let i = 0; i < questions.length; i++) {
    const q = questions[i];
    process.stdout.write(`[${i + 1}/${questions.length}] ${q.id}...`);
    
    let browseResult;
    try {
      browseResult = await agentBrowse(q, docsDir, llmOpts);
    } catch (err) {
      browseResult = { answer: `ERROR: ${err.message}`, filesRead: [], plannedFiles: [] };
    }
    
    let rawScore;
    try {
      rawScore = await scoreAnswer(q, browseResult.answer, llmOpts);
    } catch (err) {
      rawScore = { accuracy: 0, completeness: 0, precision: 0, navigation: 0, notes: `Score error: ${err.message}` };
    }

    const score = { ...rawScore };
    for (const k of dimensions) {
      score[k] = toNumber(score[k]);
      totals[k] += score[k];
    }
    
    const isNotFound = String(browseResult.answer).includes('NOT_FOUND');
    if (isNotFound) notFound++;
    
    const composite = ((score.accuracy + score.completeness + score.precision + score.navigation) / 20 * 100).toFixed(0);
    console.log(` ${composite}% (A:${score.accuracy} C:${score.completeness} P:${score.precision} N:${score.navigation}) files:${browseResult.filesRead.length}${isNotFound ? ' [NOT_FOUND]' : ''}`);
    
    results.push({
      id: q.id,
      category: q.category,
      difficulty: q.difficulty,
      question: q.question,
      groundTruth: q.answer,
      llmAnswer: browseResult.answer,
      filesRead: browseResult.filesRead,
      plannedFiles: browseResult.plannedFiles,
      score,
      composite: Number(composite),
      notFound: isNotFound,
    });
  }

  const n = questions.length;
  const totalPoints = Object.values(totals).reduce((a, b) => a + b, 0);
  const report = {
    evalType: 'agent',
    timestamp: new Date().toISOString(),
    docsDir,
    totalQuestions: n,
    overallScore: (ratio(totalPoints, n * 20) * 100).toFixed(1),
    avgAccuracy: ratio(totals.accuracy, n).toFixed(2),
    avgCompleteness: ratio(totals.completeness, n).toFixed(2),
    avgPrecision: ratio(totals.precision, n).toFixed(2),
    avgNavigation: ratio(totals.navigation, n).toFixed(2),
    notFoundCount: notFound,
    notFoundRate: (ratio(notFound, n) * 100).toFixed(1) + '%',
    byCategory: {},
    byDifficulty: {},
    results,
  };

  // Aggregate by category and difficulty: first sum composites, then convert to averages
  for (const r of results) {
    for (const groupKey of ['category', 'difficulty']) {
      const group = groupKey === 'category' ? report.byCategory : report.byDifficulty;
      const key = r[groupKey];
      if (!group[key]) group[key] = { total: 0, count: 0 };
      group[key].total += r.composite;
      group[key].count++;
    }
  }
  for (const group of [report.byCategory, report.byDifficulty]) {
    for (const [k, v] of Object.entries(group)) {
      group[k] = { avg: (v.total / v.count).toFixed(1), count: v.count };
    }
  }

  return report;
}

// CLI entry point: runs only when this file is executed directly, not when required.
if (require.main === module) {
  const [, , docsDir, questionsPath, outArg] = process.argv;
  const outPath = outArg || './eval-agent-report.json';

  if (!docsDir || !questionsPath) {
    console.error('Usage: node eval-agent.js <docs-dir> <questions.json> [output.json]');
    process.exit(1);
  }

  const model = process.env.LLM_MODEL || 'claude-haiku-4.5';
  console.log(`Using model: ${model}`);

  const rule = '═'.repeat(60);

  // Prints one "By …" section: a heading followed by one line per group.
  const printGroups = (heading, groups) => {
    console.log(heading);
    Object.entries(groups).forEach(([name, stats]) => {
      console.log(`  ${name}: ${stats.avg}% (${stats.count} questions)`);
    });
  };

  // Prints the human-readable summary of a finished report to stdout.
  const printSummary = (report) => {
    console.log(`\n${rule}`);
    console.log('AGENT EVAL REPORT');
    console.log(rule);
    console.log(`Overall Score: ${report.overallScore}%`);
    console.log(`Accuracy: ${report.avgAccuracy}/5  Completeness: ${report.avgCompleteness}/5  Precision: ${report.avgPrecision}/5  Navigation: ${report.avgNavigation}/5`);
    console.log(`Not Found: ${report.notFoundCount}/${report.totalQuestions} (${report.notFoundRate})`);
    printGroups('\nBy Category:', report.byCategory);
    printGroups('\nBy Difficulty:', report.byDifficulty);

    // Five lowest composites, sorted on a copy so report.results keeps its order.
    const weakest = [...report.results]
      .sort((a, b) => a.composite - b.composite)
      .slice(0, 5);
    console.log('\nWeakest:');
    weakest.forEach((w) => {
      console.log(`  [${w.id}] ${w.composite}% — ${w.question.substring(0, 70)}... (read: ${w.filesRead.join(', ') || 'none'})`);
    });
  };

  runAgentEval(docsDir, questionsPath, { model })
    .then((report) => {
      printSummary(report);
      fs.writeFileSync(outPath, JSON.stringify(report, null, 2));
      console.log(`\nFull report: ${outPath}`);
    })
    .catch((err) => {
      // Covers failures of the eval itself as well as printing/writing the report.
      console.error('Agent eval failed:', err);
      process.exit(1);
    });
}

module.exports = { runAgentEval };