/**
 * Eval Track 1: Agent File-Browsing Benchmark
 *
 * Simulates a sub-agent with file access to the docs directory: the model is
 * shown the directory tree, picks the files it wants to read, and then answers
 * from the contents of those files. An LLM judge scores each answer.
 * Tests whether the doc STRUCTURE is navigable by an AI agent.
 *
 * Usage: node eval-agent.js <docsDir> <questions.json> [output.json]
 */

const fs = require('fs');
const path = require('path');
const { callLLM } = require('./prose.js');

/** Maximum number of files the agent may read per question. */
const MAX_PLANNED_FILES = 5;

/** Per-file character cap (30K chars, to allow reading the full index). */
const MAX_FILE_CHARS = 30000;

/** Depth of the directory tree shown to the agent. */
const TREE_DEPTH = 4;

/** Index files read when none of the planned files could be read. */
const FALLBACK_FILES = ['reference/system-architecture.md', 'reference/helm/index.md'];

/** Score dimensions returned by the judge, each on a 0-5 scale. */
const SCORE_DIMENSIONS = ['accuracy', 'completeness', 'precision', 'navigation'];

/**
 * Extract up to MAX_PLANNED_FILES distinct file paths from the planning reply.
 *
 * Strips list markers, backticks and the size/item-count annotations that
 * buildTree() appends to each entry, because the model frequently echoes tree
 * lines verbatim and an annotated path would never match a real file.
 *
 * @param {*} planRaw - Raw planning reply; non-string values are coerced.
 * @returns {string[]} Cleaned relative paths, in the order given.
 */
function parsePlannedFiles(planRaw) {
  const seen = new Set();
  const files = [];
  const text = planRaw == null ? '' : String(planRaw);

  for (const rawLine of text.split('\n')) {
    const cleaned = rawLine
      .trim()
      .replace(/^[-*•]\s*/, '')
      .replace(/^\d+[.)]\s+/, '')
      .replace(/`/g, '')
      .replace(/\s+\((?:\d+(?:\.\d+)?K|\d+ items)\)$/, '')
      .trim();

    if (cleaned.length === 0 || cleaned.startsWith('#') || seen.has(cleaned)) continue;

    seen.add(cleaned);
    files.push(cleaned);
    if (files.length === MAX_PLANNED_FILES) break;
  }

  return files;
}

/**
 * Read a file addressed relative to docsDir.
 *
 * The path comes from model output, so it is rejected when it resolves to
 * docsDir itself or to anything outside it (for example via "../"). Missing
 * paths, directories and unreadable files are treated as "not available"
 * rather than as errors: a bad guess by the agent must not abort the question.
 *
 * @param {string} docsDir - Root of the documentation tree.
 * @param {string} relPath - Path relative to docsDir.
 * @returns {string|null} UTF-8 file content, or null when it cannot be read.
 */
function readDocFile(docsDir, relPath) {
  const root = path.resolve(docsDir);
  // path.join (not path.resolve) keeps a leading "/" in relPath relative to root.
  const absPath = path.resolve(path.join(root, relPath));
  const fromRoot = path.relative(root, absPath);
  const escapesRoot =
    fromRoot === '' ||
    fromRoot === '..' ||
    fromRoot.startsWith(`..${path.sep}`) ||
    path.isAbsolute(fromRoot);
  if (escapesRoot) return null;

  try {
    if (!fs.statSync(absPath).isFile()) return null;
    return fs.readFileSync(absPath, 'utf8');
  } catch {
    return null;
  }
}

/** Cap content at MAX_FILE_CHARS, marking the cut so the model knows text is missing. */
function truncateContent(content) {
  if (content.length <= MAX_FILE_CHARS) return content;
  return content.substring(0, MAX_FILE_CHARS) + '\n... (truncated)';
}

/**
 * Read each listed file that is available and format it as a prompt section.
 *
 * @param {string} docsDir - Root of the documentation tree.
 * @param {string[]} relPaths - Candidate paths relative to docsDir.
 * @returns {{context: string, filesRead: string[]}} Concatenated sections and
 *   the subset of relPaths that were actually read.
 */
function readIntoContext(docsDir, relPaths) {
  const sections = [];
  const filesRead = [];

  for (const relPath of relPaths) {
    const content = readDocFile(docsDir, relPath);
    if (content === null) continue;
    sections.push(`\n=== ${relPath} ===\n${truncateContent(content)}\n`);
    filesRead.push(relPath);
  }

  return { context: sections.join(''), filesRead };
}

/**
 * Simulate an agent browsing the doc tree with file tools.
 *
 * Makes two callLLM requests: one to choose files from the directory tree and
 * one to answer from the contents of the files that could be read.
 *
 * @param {{question: string, answerType: string}} question - Question record.
 * @param {string} docsDir - Root of the documentation tree.
 * @param {object} llmOpts - Options forwarded to callLLM (maxTokens and
 *   temperature are overridden per call).
 * @returns {Promise<{answer: *, filesRead: string[], plannedFiles: string[]}>}
 *   The answer as returned by callLLM, the files actually read, and the files
 *   the agent asked for.
 */
async function agentBrowse(question, docsDir, llmOpts) {
  // Step 1: Agent sees the directory tree
  const tree = buildTree(docsDir, '', TREE_DEPTH);

  // Step 2: Agent picks which files to read based on the question + tree
  const planPrompt = `You are an AI agent with access to a documentation directory. You need to answer a question by browsing the file tree and reading specific files.

FILE TREE:
${tree}

QUESTION: ${question.question}
EXPECTED FORMAT: ${question.answerType}

Based on the file tree, which files should you read to answer this question?
List up to 5 file paths (most relevant first). Think about:
- Index files that might have summary tables
- Specific chart/subsystem docs that match the question topic
- Architecture overview docs for system-wide questions

Respond with ONLY the file paths, one per line. No explanation.`;

  const planRaw = await callLLM(planPrompt, { ...llmOpts, maxTokens: 512, temperature: 0.0 });
  const plannedFiles = parsePlannedFiles(planRaw);

  // Step 3: Read the planned files
  let { context, filesRead } = readIntoContext(docsDir, plannedFiles);

  // Step 4: If none of the planned files could be read, fall back to the main index files
  if (filesRead.length === 0) {
    ({ context, filesRead } = readIntoContext(docsDir, FALLBACK_FILES));
  }

  // Step 5: Agent answers from the files it read
  const answerPrompt = `You are an AI agent that has browsed a documentation directory to answer a question. Here are the files you read:
${context}

QUESTION: ${question.question}
EXPECTED FORMAT: ${question.answerType}

Answer the question using ONLY the information from the files above. If you can't find the answer, say "NOT_FOUND".
Be precise and match the expected format.

Answer:`;

  const answer = await callLLM(answerPrompt, { ...llmOpts, maxTokens: 1024, temperature: 0.0 });

  return { answer, filesRead, plannedFiles };
}

/**
 * Build a directory tree string.
 *
 * Emits one line per entry: "path/ (N items)" for directories and
 * "path (S.SK)" for everything else, sorted by name. Entries whose name starts
 * with "." are omitted. Entries that cannot be inspected (unreadable
 * directory, dangling symlink) are skipped instead of failing the whole tree.
 *
 * @param {string} dir - Directory to list.
 * @param {string} prefix - Relative path of dir from the tree root ('' at the root).
 * @param {number} maxDepth - Number of levels to list; <= 0 yields ''.
 * @returns {string} Newline-joined listing, or '' when dir cannot be read.
 */
function buildTree(dir, prefix, maxDepth) {
  if (maxDepth <= 0) return '';

  let entries;
  try {
    entries = fs
      .readdirSync(dir, { withFileTypes: true })
      .sort((a, b) => a.name.localeCompare(b.name));
  } catch {
    return '';
  }

  const lines = [];
  for (const entry of entries) {
    if (entry.name.startsWith('.')) continue;

    const relPath = prefix ? `${prefix}/${entry.name}` : entry.name;
    const fullPath = path.join(dir, entry.name);

    try {
      if (entry.isDirectory()) {
        const childCount = fs.readdirSync(fullPath).length;
        lines.push(`${relPath}/ (${childCount} items)`);
        if (maxDepth > 1) {
          lines.push(buildTree(fullPath, relPath, maxDepth - 1));
        }
      } else {
        const size = fs.statSync(fullPath).size;
        lines.push(`${relPath} (${(size / 1024).toFixed(1)}K)`);
      }
    } catch {
      // The agent could not read this entry either, so leave it out of the tree.
    }
  }

  // Recursion into an empty or unreadable directory yields '', which is dropped here.
  return lines.filter((line) => line).join('\n');
}

/** Coerce a judge-supplied value to a finite number in [0, 5]; anything else becomes 0. */
function clampScore(value) {
  const num = Number(value);
  if (!Number.isFinite(num)) return 0;
  return Math.min(5, Math.max(0, num));
}

/**
 * Score using LLM-as-judge (same as eval.js).
 *
 * The first {...} span of the reply is parsed as JSON. Every score dimension
 * is then coerced into the 0-5 range so that a missing or malformed field
 * cannot turn the caller's aggregate totals into NaN.
 *
 * @param {{question: string, answerType: string, answer: *}} question - Question
 *   record including the ground-truth answer.
 * @param {*} llmAnswer - The agent's answer, interpolated into the prompt.
 * @param {object} llmOpts - Options forwarded to callLLM.
 * @returns {Promise<object>} Object with numeric accuracy, completeness,
 *   precision and navigation fields plus notes; all zeros with a
 *   "Parse error" note when the reply contains no parseable JSON object.
 */
async function scoreAnswer(question, llmAnswer, llmOpts) {
  const prompt = `You are a strict evaluator scoring an AI agent's answer against ground truth.

QUESTION: ${question.question}
EXPECTED ANSWER TYPE: ${question.answerType}
GROUND TRUTH: ${question.answer}
AI ANSWER: ${llmAnswer}

Score on these dimensions (0-5 each):
1. ACCURACY: Does the answer contain the correct facts?
2. COMPLETENESS: Does it cover all items in the ground truth?
3. PRECISION: Is it free of hallucinated or incorrect extra information?
4. NAVIGATION: Did the agent demonstrate it could find the right information? (0=couldn't find anything, 5=went straight to the right file)

If the AI answered "NOT_FOUND", score ACCURACY=0, COMPLETENESS=0, PRECISION=5, NAVIGATION=0.

Respond in EXACTLY this JSON format:
{"accuracy": N, "completeness": N, "precision": N, "navigation": N, "notes": "brief explanation"}`;

  const raw = await callLLM(prompt, { ...llmOpts, maxTokens: 256, temperature: 0.0 });
  const rawText = raw == null ? '' : String(raw);

  try {
    const jsonMatch = rawText.match(/\{[\s\S]*\}/);
    if (jsonMatch) {
      const parsed = JSON.parse(jsonMatch[0]);
      const score = { ...parsed };
      for (const dim of SCORE_DIMENSIONS) score[dim] = clampScore(parsed[dim]);
      return score;
    }
  } catch {
    // Malformed JSON falls through to the zero score below.
  }

  return {
    accuracy: 0,
    completeness: 0,
    precision: 0,
    navigation: 0,
    notes: `Parse error: ${rawText.substring(0, 100)}`,
  };
}

/** Average composite score and count per distinct value of results[i][groupKey]. */
function aggregateBy(results, groupKey) {
  // A Map keeps keys such as "constructor" or "__proto__" from colliding with Object.prototype.
  const groups = new Map();
  for (const r of results) {
    const bucket = groups.get(r[groupKey]) || { total: 0, count: 0 };
    bucket.total += r.composite;
    bucket.count += 1;
    groups.set(r[groupKey], bucket);
  }
  return Object.fromEntries(
    [...groups].map(([key, { total, count }]) => [
      key,
      { avg: (total / count).toFixed(1), count },
    ]),
  );
}

/**
 * Run the agent eval.
 *
 * Loads the questions file, keeps the questions whose audience includes
 * 'machine', runs agentBrowse() and scoreAnswer() on each one sequentially,
 * and aggregates the scores. A failure while browsing or scoring a single
 * question is recorded in that question's result and does not stop the run.
 *
 * @param {string} docsDir - Root of the documentation tree.
 * @param {string} questionsPath - JSON file holding a `questions` array; each
 *   question is read for id, category, difficulty, question, answer,
 *   answerType and audience.
 * @param {object} [llmOpts={}] - Options forwarded to callLLM.
 * @returns {Promise<object>} Report with overall and per-dimension averages
 *   (formatted strings), NOT_FOUND statistics, byCategory / byDifficulty
 *   breakdowns and per-question results. All averages are zero when no
 *   question matches.
 * @throws {Error} When the questions file cannot be read or parsed, or has no
 *   `questions` array.
 */
async function runAgentEval(docsDir, questionsPath, llmOpts = {}) {
  const questionsData = JSON.parse(fs.readFileSync(questionsPath, 'utf8'));
  if (!questionsData || !Array.isArray(questionsData.questions)) {
    throw new Error(`No "questions" array found in ${questionsPath}`);
  }

  const questions = questionsData.questions.filter(
    (q) =>
      (typeof q.audience === 'string' || Array.isArray(q.audience)) &&
      q.audience.includes('machine'),
  );
  console.log(`Agent Eval: ${questions.length} machine-audience questions`);

  const results = [];
  const totals = { accuracy: 0, completeness: 0, precision: 0, navigation: 0 };
  let notFound = 0;

  for (let i = 0; i < questions.length; i++) {
    const q = questions[i];
    process.stdout.write(`[${i + 1}/${questions.length}] ${q.id}...`);

    let browseResult;
    try {
      browseResult = await agentBrowse(q, docsDir, llmOpts);
    } catch (err) {
      browseResult = { answer: `ERROR: ${err.message}`, filesRead: [], plannedFiles: [] };
    }

    let score;
    try {
      score = await scoreAnswer(q, browseResult.answer, llmOpts);
    } catch (err) {
      score = {
        accuracy: 0,
        completeness: 0,
        precision: 0,
        navigation: 0,
        notes: `Score error: ${err.message}`,
      };
    }

    const isNotFound = String(browseResult.answer).includes('NOT_FOUND');
    if (isNotFound) notFound++;

    for (const k of Object.keys(totals)) totals[k] += score[k];

    const composite = (
      ((score.accuracy + score.completeness + score.precision + score.navigation) / 20) *
      100
    ).toFixed(0);
    console.log(` ${composite}% (A:${score.accuracy} C:${score.completeness} P:${score.precision} N:${score.navigation}) files:${browseResult.filesRead.length}${isNotFound ? ' [NOT_FOUND]' : ''}`);

    results.push({
      id: q.id,
      category: q.category,
      difficulty: q.difficulty,
      question: q.question,
      groundTruth: q.answer,
      llmAnswer: browseResult.answer,
      filesRead: browseResult.filesRead,
      plannedFiles: browseResult.plannedFiles,
      score,
      composite: Number(composite),
      notFound: isNotFound,
    });
  }

  const n = questions.length;
  // With zero questions every ratio is reported as 0 instead of NaN.
  const perQuestion = (sum) => (n === 0 ? 0 : sum / n);
  const totalPoints = Object.values(totals).reduce((a, b) => a + b, 0);

  return {
    evalType: 'agent',
    timestamp: new Date().toISOString(),
    docsDir,
    totalQuestions: n,
    overallScore: ((perQuestion(totalPoints) / 20) * 100).toFixed(1),
    avgAccuracy: perQuestion(totals.accuracy).toFixed(2),
    avgCompleteness: perQuestion(totals.completeness).toFixed(2),
    avgPrecision: perQuestion(totals.precision).toFixed(2),
    avgNavigation: perQuestion(totals.navigation).toFixed(2),
    notFoundCount: notFound,
    notFoundRate: (perQuestion(notFound) * 100).toFixed(1) + '%',
    byCategory: aggregateBy(results, 'category'),
    byDifficulty: aggregateBy(results, 'difficulty'),
    results,
  };
}

/** CLI entry point: run the eval, print a summary and write the JSON report. */
async function main() {
  const docsDir = process.argv[2];
  const questionsPath = process.argv[3];
  const outPath = process.argv[4] || './eval-agent-report.json';

  if (!docsDir || !questionsPath) {
    console.error('Usage: node eval-agent.js <docsDir> <questions.json> [output.json]');
    process.exit(1);
  }

  const model = process.env.LLM_MODEL || 'claude-haiku-4.5';
  console.log(`Using model: ${model}`);

  const report = await runAgentEval(docsDir, questionsPath, { model });

  console.log('\n' + '═'.repeat(60));
  console.log('AGENT EVAL REPORT');
  console.log('═'.repeat(60));
  console.log(`Overall Score: ${report.overallScore}%`);
  console.log(`Accuracy: ${report.avgAccuracy}/5 Completeness: ${report.avgCompleteness}/5 Precision: ${report.avgPrecision}/5 Navigation: ${report.avgNavigation}/5`);
  console.log(`Not Found: ${report.notFoundCount}/${report.totalQuestions} (${report.notFoundRate})`);

  console.log('\nBy Category:');
  for (const [cat, s] of Object.entries(report.byCategory)) {
    console.log(` ${cat}: ${s.avg}% (${s.count} questions)`);
  }

  console.log('\nBy Difficulty:');
  for (const [diff, s] of Object.entries(report.byDifficulty)) {
    console.log(` ${diff}: ${s.avg}% (${s.count} questions)`);
  }

  const worst = [...report.results].sort((a, b) => a.composite - b.composite).slice(0, 5);
  console.log('\nWeakest:');
  for (const w of worst) {
    console.log(` [${w.id}] ${w.composite}% — ${w.question.substring(0, 70)}... (read: ${w.filesRead.join(', ') || 'none'})`);
  }

  fs.writeFileSync(outPath, JSON.stringify(report, null, 2));
  console.log(`\nFull report: ${outPath}`);
}

if (require.main === module) {
  main().catch((err) => {
    console.error('Agent eval failed:', err);
    process.exit(1);
  });
}

module.exports = { runAgentEval };