281 lines
10 KiB
JavaScript
281 lines
10 KiB
JavaScript
|
|
/**
 * Eval Track 1: Agent File-Browsing Benchmark
 *
 * Spawns a sub-agent with file access to the docs directory.
 * The agent navigates the tree, reads files, follows cross-references.
 * Tests whether the doc STRUCTURE is navigable by an AI agent.
 *
 * Usage: node eval-agent.js <docs-dir> <questions.json> [output.json]
 */
|
||
|
|
|
||
|
|
const fs = require('fs');
|
||
|
|
const path = require('path');
|
||
|
|
const { callLLM } = require('./prose.js');
|
||
|
|
|
||
|
|
/** Maximum number of planned paths taken from the model's reply. */
const MAX_PLANNED_FILES = 5;

/** Per-file character cap applied before a file is added to the answer prompt. */
const MAX_FILE_CHARS = 15000;

/** Index files tried when none of the planned paths could be read. */
const FALLBACK_FILES = ['reference/system-architecture.md', 'reference/helm/index.md'];

/**
 * Turn the planning reply into a list of candidate relative paths.
 *
 * Besides bullets and backticks, this strips numbered-list prefixes and the
 * " (1.2K)" / " (3 items)" annotations that buildTree attaches to every entry,
 * because the model tends to copy tree lines verbatim.
 *
 * @param {*} planRaw - Reply from the planning call; non-strings are coerced.
 * @returns {string[]} At most MAX_PLANNED_FILES cleaned paths, in reply order.
 */
function parsePlannedFiles(planRaw) {
  const text = planRaw == null ? '' : String(planRaw);
  return text.split('\n')
    .map((line) => line
      .trim()
      .replace(/^[-*•]\s*/, '')
      .replace(/^\d+[.)]\s+/, '')
      .replace(/`/g, '')
      .replace(/\s*\((?:\d+(?:\.\d+)?K|\d+ items)\)\s*$/, '')
      .trim())
    .filter((line) => line.length > 0 && !line.startsWith('#'))
    .slice(0, MAX_PLANNED_FILES);
}

/**
 * Read one documentation file, refusing anything outside docsDir.
 *
 * @param {string} docsDir - Root of the documentation tree.
 * @param {string} relPath - Path relative to docsDir (model-supplied, untrusted).
 * @param {number} [maxChars=MAX_FILE_CHARS] - Character cap for the returned text.
 * @returns {string|null} File text (truncated with a marker when over the cap),
 *   or null when the path escapes docsDir, is not a regular file, or cannot be read.
 */
function readDocFile(docsDir, relPath, maxChars = MAX_FILE_CHARS) {
  const root = path.resolve(docsDir);
  // path.join (not resolve) keeps the original leniency for "/reference/x.md":
  // a leading slash is still interpreted relative to the docs root.
  const absPath = path.join(root, relPath);
  const rel = path.relative(root, absPath);
  const escapesRoot = rel === '' || rel === '..' || rel.startsWith(`..${path.sep}`) || path.isAbsolute(rel);
  if (escapesRoot) return null;

  try {
    // Directories exist but cannot be read as files; skip them explicitly.
    if (!fs.statSync(absPath).isFile()) return null;
    const content = fs.readFileSync(absPath, 'utf8');
    return content.length > maxChars
      ? `${content.substring(0, maxChars)}\n... (truncated)`
      : content;
  } catch {
    // Best effort: a missing or unreadable file simply is not part of the context.
    return null;
  }
}

/**
 * Simulate an agent browsing the doc tree with file tools.
 *
 * Flow: show the model the directory tree, let it pick up to five files,
 * read the ones that exist inside docsDir (falling back to the main index
 * files when none could be read), then ask the model to answer from that
 * content only.
 *
 * @param {{question: string, answerType: string}} question - Question record.
 * @param {string} docsDir - Root of the documentation tree.
 * @param {object} llmOpts - Options forwarded to callLLM; maxTokens and
 *   temperature are overridden per call.
 * @returns {Promise<{answer: *, filesRead: string[], plannedFiles: string[]}>}
 *   The model's answer (whatever callLLM resolves to), the files actually
 *   included in the context, and the paths the model asked for.
 */
async function agentBrowse(question, docsDir, llmOpts) {
  // Step 1: Agent sees the directory tree
  const tree = buildTree(docsDir, '', 3);

  // Step 2: Agent picks which files to read based on the question + tree
  const planPrompt = `You are an AI agent with access to a documentation directory. You need to answer a question by browsing the file tree and reading specific files.

FILE TREE:
${tree}

QUESTION: ${question.question}
EXPECTED FORMAT: ${question.answerType}

Based on the file tree, which files should you read to answer this question? List up to 5 file paths (most relevant first). Think about:
- Index files that might have summary tables
- Specific chart/subsystem docs that match the question topic
- Architecture overview docs for system-wide questions

Respond with ONLY the file paths, one per line. No explanation.`;

  const planRaw = await callLLM(planPrompt, { ...llmOpts, maxTokens: 512, temperature: 0.0 });
  const plannedFiles = parsePlannedFiles(planRaw);

  // Step 3: Read the planned files
  let context = '';
  const filesRead = [];
  const addToContext = (relPath) => {
    // The model sometimes lists a file twice; reading it again only wastes context.
    if (filesRead.includes(relPath)) return;
    const content = readDocFile(docsDir, relPath);
    if (content === null) return;
    context += `\n=== ${relPath} ===\n${content}\n`;
    filesRead.push(relPath);
  };
  for (const relPath of plannedFiles) addToContext(relPath);

  // Step 4: If none of the planned files could be read, fall back to the main index files
  if (filesRead.length === 0) {
    for (const fallback of FALLBACK_FILES) addToContext(fallback);
  }

  // Step 5: Agent answers from the files it read
  const answerPrompt = `You are an AI agent that has browsed a documentation directory to answer a question. Here are the files you read:

${context}

QUESTION: ${question.question}
EXPECTED FORMAT: ${question.answerType}

Answer the question using ONLY the information from the files above. If you can't find the answer, say "NOT_FOUND".
Be precise and match the expected format.

Answer:`;

  const answer = await callLLM(answerPrompt, { ...llmOpts, maxTokens: 1024, temperature: 0.0 });

  return { answer, filesRead, plannedFiles };
}
|
||
|
|
|
||
|
|
/**
 * Build a directory tree string.
 *
 * Produces one line per entry, depth-first, sorted by name within each
 * directory. Directories are rendered as "rel/path/ (N items)" and files as
 * "rel/path (S.SK)" where S.S is the size in KiB. Entries whose name starts
 * with "." are skipped, and N counts only the entries that would be listed.
 *
 * Entries that cannot be inspected (an unreadable directory, a dangling
 * symlink) are omitted instead of aborting the whole listing.
 *
 * @param {string} dir - Directory to list.
 * @param {string} prefix - Relative path prepended to entry names ('' at the root).
 * @param {number} maxDepth - Levels to descend; 1 lists only dir's own entries.
 * @returns {string} Newline-joined listing, or '' when maxDepth <= 0 or dir cannot be read.
 */
function buildTree(dir, prefix, maxDepth) {
  if (maxDepth <= 0) return '';

  const isVisible = (name) => !name.startsWith('.');

  let entries;
  try {
    entries = fs.readdirSync(dir, { withFileTypes: true })
      .filter((entry) => isVisible(entry.name))
      .sort((a, b) => a.name.localeCompare(b.name));
  } catch {
    return '';
  }

  const lines = [];
  for (const entry of entries) {
    const relPath = prefix ? `${prefix}/${entry.name}` : entry.name;
    const absPath = path.join(dir, entry.name);

    if (entry.isDirectory()) {
      let childCount;
      try {
        childCount = fs.readdirSync(absPath).filter(isVisible).length;
      } catch {
        // Unreadable directory: nothing inside it could be browsed anyway.
        continue;
      }
      lines.push(`${relPath}/ (${childCount} items)`);
      if (maxDepth > 1) {
        lines.push(buildTree(absPath, relPath, maxDepth - 1));
      }
      continue;
    }

    let size;
    try {
      size = fs.statSync(absPath).size;
    } catch {
      // statSync follows symlinks, so a dangling link lands here; skip it.
      continue;
    }
    lines.push(`${relPath} (${(size / 1024).toFixed(1)}K)`);
  }

  // Empty sub-listings come back as '' and must not leave blank lines behind.
  return lines.filter((line) => line).join('\n');
}
|
||
|
|
|
||
|
|
/** Dimensions the judge is asked to score, each on a 0-5 scale. */
const JUDGE_DIMENSIONS = ['accuracy', 'completeness', 'precision', 'navigation'];

/**
 * Coerce one judge-reported value to a finite number within [0, 5].
 * Missing or non-numeric values become 0 so downstream sums never turn into NaN.
 */
function clampJudgeScore(value) {
  const num = Number(value);
  if (!Number.isFinite(num)) return 0;
  return Math.min(5, Math.max(0, num));
}

/**
 * Extract the judge's JSON verdict from free-form reply text.
 *
 * Tries the widest "{ ... }" span first, then the first brace-free object,
 * which rescues replies where extra text containing braces follows the verdict.
 *
 * @param {string} text - Raw judge reply.
 * @returns {object|null} Parsed verdict object, or null when none parses.
 */
function parseJudgeReply(text) {
  const patterns = [/\{[\s\S]*\}/, /\{[^{}]*\}/];
  for (const pattern of patterns) {
    const match = text.match(pattern);
    if (!match) continue;
    try {
      const parsed = JSON.parse(match[0]);
      if (parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed)) return parsed;
    } catch {
      // This candidate is not valid JSON; fall through to the next pattern.
    }
  }
  return null;
}

/**
 * Score using LLM-as-judge (same as eval.js).
 *
 * @param {{question: string, answerType: string, answer: *}} question - Question
 *   record including the ground-truth answer.
 * @param {*} llmAnswer - The agent's answer, interpolated into the judge prompt.
 * @param {object} llmOpts - Options forwarded to callLLM; maxTokens and
 *   temperature are overridden.
 * @returns {Promise<{accuracy: number, completeness: number, precision: number,
 *   navigation: number, notes: string}>} Judge verdict with every dimension
 *   guaranteed to be a number in [0, 5]; any extra fields the judge returned
 *   are passed through. All dimensions are 0, with a "Parse error" note, when
 *   the reply contains no parseable JSON object.
 */
async function scoreAnswer(question, llmAnswer, llmOpts) {
  const prompt = `You are a strict evaluator scoring an AI agent's answer against ground truth.

QUESTION: ${question.question}
EXPECTED ANSWER TYPE: ${question.answerType}
GROUND TRUTH: ${question.answer}
AI ANSWER: ${llmAnswer}

Score on these dimensions (0-5 each):
1. ACCURACY: Does the answer contain the correct facts?
2. COMPLETENESS: Does it cover all items in the ground truth?
3. PRECISION: Is it free of hallucinated or incorrect extra information?
4. NAVIGATION: Did the agent demonstrate it could find the right information? (0=couldn't find anything, 5=went straight to the right file)

If the AI answered "NOT_FOUND", score ACCURACY=0, COMPLETENESS=0, PRECISION=5, NAVIGATION=0.

Respond in EXACTLY this JSON format:
{"accuracy": N, "completeness": N, "precision": N, "navigation": N, "notes": "brief explanation"}`;

  const raw = await callLLM(prompt, { ...llmOpts, maxTokens: 256, temperature: 0.0 });
  // Guard against a non-string reply so the fallback below cannot itself throw.
  const text = raw == null ? '' : String(raw);

  const verdict = parseJudgeReply(text);
  if (verdict) {
    const normalized = { ...verdict };
    for (const dim of JUDGE_DIMENSIONS) {
      normalized[dim] = clampJudgeScore(verdict[dim]);
    }
    return normalized;
  }

  return { accuracy: 0, completeness: 0, precision: 0, navigation: 0, notes: `Parse error: ${text.substring(0, 100)}` };
}
|
||
|
|
|
||
|
|
/**
 * Run the agent eval.
 *
 * Loads the questions file, keeps the questions whose `audience` includes
 * 'machine', and for each one (sequentially) lets the agent browse the docs
 * and has the judge score the answer. Failures in either step are recorded
 * on that question rather than aborting the run.
 *
 * @param {string} docsDir - Root of the documentation tree to browse.
 * @param {string} questionsPath - JSON file with a top-level `questions` array.
 * @param {object} [llmOpts={}] - Options forwarded to both LLM steps.
 * @returns {Promise<object>} Report with overall and per-dimension averages
 *   (formatted strings), NOT_FOUND statistics, per-category and per-difficulty
 *   composite averages, and the per-question results. With zero matching
 *   questions every average is reported as zero instead of NaN.
 * @throws {Error} When the questions file cannot be read or parsed, or has no
 *   `questions` array.
 */
async function runAgentEval(docsDir, questionsPath, llmOpts = {}) {
  const dimensions = ['accuracy', 'completeness', 'precision', 'navigation'];
  // A judge reply may omit a dimension or report it as a string; never let that become NaN.
  const toNumber = (value) => {
    const num = Number(value);
    return Number.isFinite(num) ? num : 0;
  };

  const questionsData = JSON.parse(fs.readFileSync(questionsPath, 'utf8'));
  if (!questionsData || !Array.isArray(questionsData.questions)) {
    throw new Error(`Expected a "questions" array in ${questionsPath}`);
  }
  const questions = questionsData.questions.filter((q) => {
    const audience = q && q.audience;
    // Questions without an audience are skipped rather than crashing the run.
    return (typeof audience === 'string' || Array.isArray(audience)) && audience.includes('machine');
  });

  console.log(`Agent Eval: ${questions.length} machine-audience questions`);

  const results = [];
  const totals = { accuracy: 0, completeness: 0, precision: 0, navigation: 0 };
  let notFound = 0;

  for (const [index, q] of questions.entries()) {
    process.stdout.write(`[${index + 1}/${questions.length}] ${q.id}...`);

    let browseResult;
    try {
      browseResult = await agentBrowse(q, docsDir, llmOpts);
    } catch (err) {
      browseResult = { answer: `ERROR: ${err.message}`, filesRead: [], plannedFiles: [] };
    }

    let rawScore;
    try {
      rawScore = await scoreAnswer(q, browseResult.answer, llmOpts);
    } catch (err) {
      rawScore = { accuracy: 0, completeness: 0, precision: 0, navigation: 0, notes: `Score error: ${err.message}` };
    }

    const score = { ...rawScore };
    for (const dim of dimensions) {
      score[dim] = toNumber(score[dim]);
      totals[dim] += score[dim];
    }

    // The answer is whatever the LLM call resolved to; coerce before searching it.
    const isNotFound = String(browseResult.answer).includes('NOT_FOUND');
    if (isNotFound) notFound++;

    const composite = ((score.accuracy + score.completeness + score.precision + score.navigation) / 20 * 100).toFixed(0);
    console.log(` ${composite}% (A:${score.accuracy} C:${score.completeness} P:${score.precision} N:${score.navigation}) files:${browseResult.filesRead.length}${isNotFound ? ' [NOT_FOUND]' : ''}`);

    results.push({
      id: q.id,
      category: q.category,
      difficulty: q.difficulty,
      question: q.question,
      groundTruth: q.answer,
      llmAnswer: browseResult.answer,
      filesRead: browseResult.filesRead,
      plannedFiles: browseResult.plannedFiles,
      score,
      composite: Number(composite),
      notFound: isNotFound,
    });
  }

  const n = questions.length;
  // Guarded division: an empty run reports zeros, not NaN.
  const ratio = (numerator, denominator) => (denominator === 0 ? 0 : numerator / denominator);
  const totalPoints = Object.values(totals).reduce((a, b) => a + b, 0);

  // Average composite per distinct value of `field`, in first-seen order.
  const summarizeBy = (field) => {
    const buckets = {};
    for (const r of results) {
      const key = r[field];
      if (!buckets[key]) buckets[key] = { total: 0, count: 0 };
      buckets[key].total += r.composite;
      buckets[key].count++;
    }
    const summary = {};
    for (const [key, bucket] of Object.entries(buckets)) {
      summary[key] = { avg: (bucket.total / bucket.count).toFixed(1), count: bucket.count };
    }
    return summary;
  };

  return {
    evalType: 'agent',
    timestamp: new Date().toISOString(),
    docsDir,
    totalQuestions: n,
    overallScore: (ratio(totalPoints, n * 20) * 100).toFixed(1),
    avgAccuracy: ratio(totals.accuracy, n).toFixed(2),
    avgCompleteness: ratio(totals.completeness, n).toFixed(2),
    avgPrecision: ratio(totals.precision, n).toFixed(2),
    avgNavigation: ratio(totals.navigation, n).toFixed(2),
    notFoundCount: notFound,
    notFoundRate: (ratio(notFound, n) * 100).toFixed(1) + '%',
    byCategory: summarizeBy('category'),
    byDifficulty: summarizeBy('difficulty'),
    results,
  };
}
|
||
|
|
|
||
|
|
// CLI entry point: runs the eval, prints a summary to stdout and writes the
// full report as JSON. Skipped when this file is loaded as a module.
if (require.main === module) {
  const [docsDir, questionsPath, outArg] = process.argv.slice(2);
  const outPath = outArg || './eval-agent-report.json';

  if (!docsDir || !questionsPath) {
    console.error('Usage: node eval-agent.js <docs-dir> <questions.json> [output.json]');
    process.exit(1);
  }

  const model = process.env.LLM_MODEL || 'claude-haiku-4.5';
  console.log(`Using model: ${model}`);

  // Print one "label: avg% (count questions)" line per group under a heading.
  const printBreakdown = (heading, groups) => {
    console.log(heading);
    Object.entries(groups).forEach(([label, stats]) => {
      console.log(` ${label}: ${stats.avg}% (${stats.count} questions)`);
    });
  };

  runAgentEval(docsDir, questionsPath, { model })
    .then((report) => {
      const rule = '═'.repeat(60);
      console.log('\n' + rule);
      console.log('AGENT EVAL REPORT');
      console.log(rule);
      console.log(`Overall Score: ${report.overallScore}%`);
      console.log(`Accuracy: ${report.avgAccuracy}/5 Completeness: ${report.avgCompleteness}/5 Precision: ${report.avgPrecision}/5 Navigation: ${report.avgNavigation}/5`);
      console.log(`Not Found: ${report.notFoundCount}/${report.totalQuestions} (${report.notFoundRate})`);

      printBreakdown('\nBy Category:', report.byCategory);
      printBreakdown('\nBy Difficulty:', report.byDifficulty);

      // Five lowest composites, lowest first; sort a copy so the report keeps run order.
      const weakest = report.results.slice();
      weakest.sort((a, b) => a.composite - b.composite);
      console.log('\nWeakest:');
      weakest.slice(0, 5).forEach((entry) => {
        console.log(` [${entry.id}] ${entry.composite}% — ${entry.question.substring(0, 70)}... (read: ${entry.filesRead.join(', ') || 'none'})`);
      });

      fs.writeFileSync(outPath, JSON.stringify(report, null, 2));
      console.log(`\nFull report: ${outPath}`);
    })
    .catch((err) => {
      console.error('Agent eval failed:', err);
      process.exit(1);
    });
}

module.exports = { runAgentEval };
|