Phase 9c: Split eval into Agent (file-browsing) and Human (readability) tracks

Agent eval: 54.3% (22 questions, 40.9% NOT_FOUND)
Human eval: 63.9% (28 questions, 17.9% NOT_FOUND)

Key findings:
- Agent navigation is the bottleneck (2.09/5) — long path-based filenames hurt discoverability
- Human findability is decent (3.46/5) but dependency questions fail (0%) because chart docs for wrapper charts don't surface their sub-chart deps
- Both tracks show strong precision (4.4+/5) — very low hallucination
- Resources (91%) and interactions (95%) score great for humans
- Configuration and contracts are solid across both tracks
This commit is contained in:
Jarvis Prime
2026-03-09 23:55:54 +00:00
parent 0cc4abcb0f
commit 304f0a9e9f
4 changed files with 2050 additions and 0 deletions

272
eval-human.js Normal file
View File

@@ -0,0 +1,272 @@
/**
* Eval Track 2: Human Readability Benchmark
*
* Tests whether the docs are useful for a human engineer.
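* Feeds the relevant doc page(s) to the LLM (the contracts index is truncated to 30,000 characters and the total context is capped at 60,000 characters) and scores on: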
* - Clarity: Is the prose understandable?
* - Actionability: Could you act on this information?
* - Findability: Would a human naturally look in these files for the answer?
* - Completeness: Is the answer fully covered?
*
* Usage: node eval-human.js <docs-dir> <questions.json> [output.json]
*/
const fs = require('fs');
const path = require('path');
const { callLLM } = require('./prose.js');
/**
 * Find the most relevant doc files for a human-audience question.
 *
 * Selection is keyword-driven on the lower-cased question text:
 *  - the architecture overview is always included when present;
 *  - the helm index for helm/chart/secret/port/deploy/service questions;
 *  - every chart doc whose file name contains a chart name mentioned in the question;
 *  - every file under reference/subsystems for subsystem/cross-cutting/depend questions;
 *  - the (truncated) contracts index for contract/interface/secret/configmap questions.
 *
 * Each file is included at most once, non-regular files (e.g. sub-directories)
 * are skipped, and directory listings are sorted so the selection does not
 * depend on platform-specific readdir order.
 *
 * @param {{question: string}} question - Question record; only `question` is read here.
 * @param {string} docsDir - Root directory of the generated docs.
 * @returns {{path: string, content: string}[]} Docs in selection order, limited
 *   to 60,000 characters of content in total. A doc that would overflow the
 *   budget is skipped; later, smaller docs may still be included.
 */
function findRelevantDocs(question, docsDir) {
  const MAX_CONTRACTS_CHARS = 30000;
  const MAX_TOTAL_CHARS = 60000;
  const CHART_NAMES = ['mdm-app', 'ai-app', 'elasticsearch', 'hazelcast', 'cassandra', 'kong', 'redis', 'jenkins', 'otel-collector', 'twistlock', 'cluster'];

  const q = question.question.toLowerCase();
  const mentionsAny = (...terms) => terms.some((term) => q.includes(term));

  const docs = [];
  const seen = new Set();

  // Add one doc by its docs-relative path; silently skips duplicates,
  // missing paths, and anything that is not a regular file.
  const addFile = (relPath, maxChars) => {
    if (seen.has(relPath)) return;
    const absPath = path.join(docsDir, relPath);
    if (!fs.existsSync(absPath) || !fs.statSync(absPath).isFile()) return;
    seen.add(relPath);
    const content = fs.readFileSync(absPath, 'utf8');
    docs.push({
      path: relPath,
      content: maxChars === undefined ? content : content.substring(0, maxChars),
    });
  };

  // List a docs-relative directory in a stable order ([] when it is absent).
  const listDir = (relDir) => {
    const absDir = path.join(docsDir, relDir);
    if (!fs.existsSync(absDir) || !fs.statSync(absDir).isDirectory()) return [];
    return fs.readdirSync(absDir).sort();
  };

  // Always include the architecture overview
  addFile('reference/system-architecture.md');

  // Helm-related: include helm index
  if (mentionsAny('helm', 'chart', 'secret', 'port', 'deploy', 'service')) {
    addFile('reference/helm/index.md');
  }

  // Specific chart docs mentioned in the question. A file such as
  // "redis-cluster.md" matches both "redis" and "cluster"; `seen` keeps it
  // from being added (and counted against the budget) twice.
  const mentionedCharts = CHART_NAMES.filter((name) => q.includes(name));
  if (mentionedCharts.length > 0) {
    const chartFiles = listDir('reference/helm/charts');
    for (const name of mentionedCharts) {
      for (const file of chartFiles) {
        if (file.includes(name)) addFile(`reference/helm/charts/${file}`);
      }
    }
  }

  // Subsystem-related
  if (mentionsAny('subsystem', 'cross-cutting', 'depend')) {
    for (const file of listDir('reference/subsystems')) {
      addFile(`reference/subsystems/${file}`);
    }
  }

  // Contracts (truncated because the index can be very large)
  if (mentionsAny('contract', 'interface', 'secret', 'configmap')) {
    addFile('reference/contracts/index.md', MAX_CONTRACTS_CHARS);
  }

  // Cap total context size
  let total = 0;
  return docs.filter((doc) => {
    if (total + doc.content.length > MAX_TOTAL_CHARS) return false;
    total += doc.content.length;
    return true;
  });
}
/**
 * Have an LLM simulate a human reading the docs.
 *
 * Every doc is rendered as a "=== path ===" header followed by its content,
 * the sections are embedded in a fixed new-engineer prompt, and the prompt is
 * sent through callLLM with maxTokens 1024 and temperature 0.1 (these two
 * settings override any same-named keys in llmOpts).
 *
 * @param {{question: string, answerType: string}} question - Question record.
 * @param {{path: string, content: string}[]} docs - Docs to present to the reader.
 * @param {object} llmOpts - Extra options forwarded to callLLM.
 * @returns {Promise<*>} Whatever callLLM resolves to (the reader's answer).
 */
async function humanRead(question, docs, llmOpts) {
  const sections = [];
  for (const doc of docs) {
    sections.push(`\n=== ${doc.path} ===\n${doc.content}`);
  }
  const context = sections.join('\n');

  const promptLines = [
    "You are a new infrastructure engineer who just joined the Foxtrot team. You've been given documentation to read. Answer the following question as if you're reading these docs for the first time.",
    'DOCUMENTATION:',
    `${context}`,
    `QUESTION: ${question.question}`,
    'Instructions:',
    '- Read the docs carefully, as a human would',
    '- If the answer requires combining info from multiple sections, do so',
    `- If the docs don't cover this, say "NOT_FOUND"`,
    `- Match the expected format: ${question.answerType}`,
    'Answer:',
  ];

  const requestOpts = { ...llmOpts, maxTokens: 1024, temperature: 0.1 };
  return callLLM(promptLines.join('\n'), requestOpts);
}
/**
 * Score an answer with human-focused criteria.
 *
 * Sends a grading prompt through callLLM (maxTokens 256, temperature 0),
 * extracts the first "{ ... }" span from the reply and parses it as JSON.
 * The four dimensions are then normalised: each is coerced to a number,
 * anything missing or non-numeric becomes 0, and values are clamped to the
 * 0-5 range the prompt asks for, so callers can do arithmetic on them safely.
 *
 * @param {{question: string, answer: string}} question - Question record with its ground truth.
 * @param {string} answer - The simulated reader's answer.
 * @param {{path: string}[]} docsUsed - Docs that were shown to the reader (only paths are used).
 * @param {object} llmOpts - Extra options forwarded to callLLM.
 * @returns {Promise<{clarity: number, actionability: number, completeness: number, findability: number, notes: string}>}
 *   The normalised score. When the reply contains no parseable JSON object,
 *   all dimensions are 0 and `notes` carries the first 100 characters of the reply.
 */
async function scoreHuman(question, answer, docsUsed, llmOpts) {
  const DIMENSIONS = ['clarity', 'actionability', 'completeness', 'findability'];
  // The continuation lines of this template literal are flush-left on
  // purpose: any indentation would become part of the prompt text.
  const prompt = `You are evaluating documentation quality from a HUMAN reader's perspective.
QUESTION: ${question.question}
GROUND TRUTH: ${question.answer}
READER'S ANSWER: ${answer}
DOCS CONSULTED: ${docsUsed.map(d => d.path).join(', ')}
Score on these human-centric dimensions (0-5 each):
1. CLARITY: Was the information presented clearly enough for a human to extract the answer? (0=confusing/buried, 5=immediately obvious)
2. ACTIONABILITY: Could an engineer act on this information? (0=useless, 5=ready to execute)
3. COMPLETENESS: Did the docs contain ALL the information needed? (0=missing, 5=fully covered)
4. FINDABILITY: Based on the doc paths, would a human naturally look in these files? (0=buried in wrong place, 5=exactly where you'd expect)
If the reader answered "NOT_FOUND", score CLARITY=0, ACTIONABILITY=0, COMPLETENESS=0, FINDABILITY=0.
Respond in EXACTLY this JSON format:
{"clarity": N, "actionability": N, "completeness": N, "findability": N, "notes": "brief explanation"}`;

  const reply = await callLLM(prompt, { ...llmOpts, maxTokens: 256, temperature: 0.0 });
  // Tolerate a null/undefined/non-string reply instead of crashing below.
  const raw = reply == null ? '' : String(reply);

  let parsed = null;
  const jsonMatch = raw.match(/\{[\s\S]*\}/);
  if (jsonMatch) {
    try {
      parsed = JSON.parse(jsonMatch[0]);
    } catch (err) {
      // Malformed JSON is an expected failure mode of the grader; anything
      // else is a real bug and must not be hidden.
      if (!(err instanceof SyntaxError)) throw err;
    }
  }

  if (parsed === null || typeof parsed !== 'object') {
    return { clarity: 0, actionability: 0, completeness: 0, findability: 0, notes: `Parse error: ${raw.substring(0, 100)}` };
  }

  const score = { ...parsed };
  for (const dim of DIMENSIONS) {
    const value = Number(parsed[dim]);
    score[dim] = Number.isFinite(value) ? Math.min(5, Math.max(0, value)) : 0;
  }
  if (typeof score.notes !== 'string') {
    score.notes = score.notes == null ? '' : String(score.notes);
  }
  return score;
}
/**
 * Run the human eval.
 *
 * Loads the questions file, keeps the questions whose `audience` includes
 * 'human', and for each one (sequentially): selects docs with
 * findRelevantDocs, obtains an answer with humanRead, and grades it with
 * scoreHuman. Failures of either LLM step are recorded per question (as an
 * "ERROR: ..." answer or an all-zero score) rather than aborting the run.
 *
 * Per-dimension points are coerced to finite numbers (anything else counts as
 * 0) before they feed the composite and the totals, so one malformed score
 * cannot turn the aggregates into NaN. With zero matching questions the
 * aggregate fields report zeros instead of "NaN".
 *
 * @param {string} docsDir - Root directory of the docs under evaluation.
 * @param {string} questionsPath - Path to a JSON file with a top-level `questions` array.
 * @param {object} [llmOpts={}] - Options forwarded to humanRead and scoreHuman.
 * @returns {Promise<object>} Report with overall/average scores (formatted as
 *   strings), not-found statistics, per-category and per-difficulty averages,
 *   and the per-question results.
 * @throws {Error} If the questions file cannot be read or parsed, or has no `questions` array.
 */
async function runHumanEval(docsDir, questionsPath, llmOpts = {}) {
  const DIMENSIONS = ['clarity', 'actionability', 'completeness', 'findability'];
  const MAX_POINTS = DIMENSIONS.length * 5; // four dimensions, 0-5 each

  const questionsData = JSON.parse(fs.readFileSync(questionsPath, 'utf8'));
  if (!questionsData || !Array.isArray(questionsData.questions)) {
    throw new Error(`Expected a top-level "questions" array in ${questionsPath}`);
  }
  // Questions without a usable `audience` are skipped instead of crashing the filter.
  const questions = questionsData.questions.filter((q) => {
    const audience = q ? q.audience : undefined;
    return (Array.isArray(audience) || typeof audience === 'string') && audience.includes('human');
  });
  console.log(`Human Eval: ${questions.length} human-audience questions`);

  const results = [];
  const totals = { clarity: 0, actionability: 0, completeness: 0, findability: 0 };
  let notFound = 0;

  for (let i = 0; i < questions.length; i++) {
    const q = questions[i];
    process.stdout.write(`[${i + 1}/${questions.length}] ${q.id}...`);
    const docs = findRelevantDocs(q, docsDir);

    let answer;
    try {
      const reply = await humanRead(q, docs, llmOpts);
      // Normalise to a string so the NOT_FOUND check below cannot throw.
      answer = reply == null ? '' : String(reply);
    } catch (err) {
      answer = `ERROR: ${err.message}`;
    }

    let score;
    try {
      score = await scoreHuman(q, answer, docs, llmOpts);
    } catch (err) {
      score = { clarity: 0, actionability: 0, completeness: 0, findability: 0, notes: `Score error: ${err.message}` };
    }

    const isNotFound = answer.includes('NOT_FOUND');
    if (isNotFound) notFound++;

    // Use the same sanitised points for totals, composite and the log line.
    const rawScore = score && typeof score === 'object' ? score : {};
    const points = {};
    let earned = 0;
    for (const dim of DIMENSIONS) {
      const value = Number(rawScore[dim]);
      points[dim] = Number.isFinite(value) ? value : 0;
      totals[dim] += points[dim];
      earned += points[dim];
    }
    const composite = (earned / MAX_POINTS * 100).toFixed(0);
    console.log(` ${composite}% (Cl:${points.clarity} Ac:${points.actionability} Co:${points.completeness} Fi:${points.findability}) docs:${docs.length}${isNotFound ? ' [NOT_FOUND]' : ''}`);

    results.push({
      id: q.id,
      category: q.category,
      difficulty: q.difficulty,
      question: q.question,
      groundTruth: q.answer,
      humanAnswer: answer,
      docsUsed: docs.map((d) => d.path),
      score,
      composite: Number(composite),
      notFound: isNotFound,
    });
  }

  const n = questions.length;
  const totalPoints = DIMENSIONS.reduce((sum, dim) => sum + totals[dim], 0);
  // Guard every per-question ratio against n === 0.
  const average = (sum) => (n === 0 ? 0 : sum / n).toFixed(2);

  // Average composite per distinct value of `field` across all results.
  const summarizeBy = (field) => {
    const buckets = new Map();
    for (const r of results) {
      const bucket = buckets.get(r[field]) || { total: 0, count: 0 };
      bucket.total += r.composite;
      bucket.count++;
      buckets.set(r[field], bucket);
    }
    const summary = {};
    for (const [key, bucket] of buckets) {
      summary[key] = { avg: (bucket.total / bucket.count).toFixed(1), count: bucket.count };
    }
    return summary;
  };

  return {
    evalType: 'human',
    timestamp: new Date().toISOString(),
    docsDir,
    totalQuestions: n,
    overallScore: (n === 0 ? 0 : totalPoints / (n * MAX_POINTS) * 100).toFixed(1),
    avgClarity: average(totals.clarity),
    avgActionability: average(totals.actionability),
    avgCompleteness: average(totals.completeness),
    avgFindability: average(totals.findability),
    notFoundCount: notFound,
    notFoundRate: (n === 0 ? 0 : (notFound / n) * 100).toFixed(1) + '%',
    byCategory: summarizeBy('category'),
    byDifficulty: summarizeBy('difficulty'),
    results,
  };
}
// CLI entry point: only runs when this file is executed directly, not when
// it is loaded via require().
if (require.main === module) {
  // argv[0] is the node binary and argv[1] is this script.
  const [, , docsDir, questionsPath, outArg] = process.argv;
  const outPath = outArg || './eval-human-report.json';

  if (!docsDir || !questionsPath) {
    console.error('Usage: node eval-human.js <docs-dir> <questions.json> [output.json]');
    process.exit(1);
  }

  const model = process.env.LLM_MODEL || 'claude-haiku-4.5';
  console.log(`Using model: ${model}`);

  const rule = '═'.repeat(60);

  // Print one "By ..." section: a heading followed by one line per group.
  const printBreakdown = (heading, groups) => {
    console.log(heading);
    Object.entries(groups).forEach(([label, stats]) => {
      console.log(` ${label}: ${stats.avg}% (${stats.count} questions)`);
    });
  };

  // Run the eval, print the summary to stdout, then write the full JSON report.
  const main = async () => {
    const report = await runHumanEval(docsDir, questionsPath, { model });

    console.log(`\n${rule}`);
    console.log('HUMAN EVAL REPORT');
    console.log(rule);
    console.log(`Overall Score: ${report.overallScore}%`);
    console.log(`Clarity: ${report.avgClarity}/5 Actionability: ${report.avgActionability}/5 Completeness: ${report.avgCompleteness}/5 Findability: ${report.avgFindability}/5`);
    console.log(`Not Found: ${report.notFoundCount}/${report.totalQuestions} (${report.notFoundRate})`);

    printBreakdown('\nBy Category:', report.byCategory);
    printBreakdown('\nBy Difficulty:', report.byDifficulty);

    // Five lowest composites; sort a copy so report.results keeps its order.
    const weakest = report.results
      .slice()
      .sort((left, right) => left.composite - right.composite)
      .slice(0, 5);
    console.log('\nWeakest:');
    weakest.forEach((entry) => {
      console.log(` [${entry.id}] ${entry.composite}% — ${entry.question.substring(0, 70)}...`);
    });

    fs.writeFileSync(outPath, JSON.stringify(report, null, 2));
    console.log(`\nFull report: ${outPath}`);
  };

  // Any failure is reported and turned into a non-zero exit code.
  main().catch((err) => {
    console.error('Human eval failed:', err);
    process.exit(1);
  });
}
module.exports = { runHumanEval };