/**
 * Eval Track 2: Human Readability Benchmark
 *
 * Tests whether the docs are useful for a human engineer.
 * Feeds the FULL relevant doc page(s) to the LLM and scores on:
 *   - Clarity:       Is the prose understandable?
 *   - Actionability: Could you act on this information?
 *   - Completeness:  Is the answer fully covered?
 *   - Findability:   Does the doc structure guide you to the answer?
 *
 * Usage: node eval-human.js <docs-dir> <questions.json> [output.json]
 */

const fs = require('fs');
const path = require('path');
const { callLLM } = require('./prose.js');

/** Score dimensions, in the order they are reported. */
const SCORE_DIMENSIONS = ['clarity', 'actionability', 'completeness', 'findability'];

/** Each dimension is scored 0..MAX_DIMENSION_SCORE by the scoring prompt. */
const MAX_DIMENSION_SCORE = 5;

/** Highest possible sum of all dimensions for one question (4 * 5 = 20). */
const MAX_TOTAL_SCORE = SCORE_DIMENSIONS.length * MAX_DIMENSION_SCORE;

/** Upper bound (in characters) on the combined doc content sent to the reader. */
const MAX_CONTEXT_CHARS = 60000;

/** The contracts index is truncated to this many characters before inclusion. */
const MAX_CONTRACTS_CHARS = 30000;

/** Question keywords that pull in the Helm index page. */
const HELM_KEYWORDS = ['helm', 'chart', 'secret', 'port', 'deploy', 'service'];

/** Question keywords that pull in every subsystem page. */
const SUBSYSTEM_KEYWORDS = ['subsystem', 'cross-cutting', 'depend'];

/** Question keywords that pull in the contracts index. */
const CONTRACT_KEYWORDS = ['contract', 'interface', 'secret', 'configmap'];

/** Chart names; a chart doc is included when its file name contains a name the question mentions. */
const CHART_NAMES = [
  'mdm-app',
  'ai-app',
  'elasticsearch',
  'hazelcast',
  'cassandra',
  'kong',
  'redis',
  'jenkins',
  'otel-collector',
  'twistlock',
  'cluster',
];

/** True when `text` contains at least one of `keywords` as a substring. */
function mentionsAny(text, keywords) {
  return keywords.some((keyword) => text.includes(keyword));
}

/** True when `filePath` exists and is a regular file (symlinks are followed). */
function isRegularFile(filePath) {
  try {
    return fs.statSync(filePath).isFile();
  } catch (err) {
    // Missing path, broken symlink, permission problem: treat as "not a usable doc".
    return false;
  }
}

/** Names of the regular files directly inside `dir`; empty when `dir` is not a directory. */
function listFileNames(dir) {
  let entries;
  try {
    entries = fs.readdirSync(dir);
  } catch (err) {
    return [];
  }
  // Skip sub-directories: reading them as files would throw EISDIR.
  return entries.filter((name) => isRegularFile(path.join(dir, name)));
}

/**
 * Find the most relevant doc files for a human-audience question.
 *
 * Selection is keyword driven: the architecture overview is always included,
 * then the Helm index, per-chart pages, subsystem pages and the contracts
 * index are added when the lower-cased question text mentions a matching
 * keyword. Each file is included at most once.
 *
 * @param {{question: string}} question - Question record; only `question` is read.
 * @param {string} docsDir - Root directory of the documentation tree.
 * @returns {Array<{path: string, content: string}>} Docs in selection order whose
 *   combined content length stays within MAX_CONTEXT_CHARS. A doc that would
 *   overflow the budget is skipped; later, smaller docs may still be kept.
 */
function findRelevantDocs(question, docsDir) {
  const docs = [];
  const seenPaths = new Set();

  // `relPath` always uses forward slashes; it doubles as the label shown to the LLM.
  const addDoc = (relPath, maxChars) => {
    if (seenPaths.has(relPath)) return;
    const absPath = path.join(docsDir, relPath);
    if (!isRegularFile(absPath)) return;
    seenPaths.add(relPath);
    const content = fs.readFileSync(absPath, 'utf8');
    docs.push({
      path: relPath,
      content: maxChars === undefined ? content : content.substring(0, maxChars),
    });
  };

  // Always include the architecture overview.
  addDoc('reference/system-architecture.md');

  const q = String(question.question || '').toLowerCase();

  // Helm-related: include the helm index.
  if (mentionsAny(q, HELM_KEYWORDS)) {
    addDoc('reference/helm/index.md');
  }

  // Specific chart docs mentioned in the question.
  const mentionedCharts = CHART_NAMES.filter((name) => q.includes(name));
  if (mentionedCharts.length > 0) {
    // Read the directory once instead of once per mentioned chart.
    const chartFiles = listFileNames(path.join(docsDir, 'reference/helm/charts'));
    for (const name of mentionedCharts) {
      for (const file of chartFiles) {
        // A file matching several mentioned names is only added once (see addDoc).
        if (file.includes(name)) addDoc(`reference/helm/charts/${file}`);
      }
    }
  }

  // Subsystem-related: include every subsystem page.
  if (mentionsAny(q, SUBSYSTEM_KEYWORDS)) {
    for (const file of listFileNames(path.join(docsDir, 'reference/subsystems'))) {
      addDoc(`reference/subsystems/${file}`);
    }
  }

  // Contracts: the index can be massive, so it is truncated.
  if (mentionsAny(q, CONTRACT_KEYWORDS)) {
    addDoc('reference/contracts/index.md', MAX_CONTRACTS_CHARS);
  }

  // Cap total context size.
  let totalChars = 0;
  return docs.filter((doc) => {
    if (totalChars + doc.content.length > MAX_CONTEXT_CHARS) return false;
    totalChars += doc.content.length;
    return true;
  });
}

/**
 * Have an LLM simulate a human reading the docs and answering the question.
 *
 * @param {{question: string, answerType: string}} question - Question record.
 * @param {Array<{path: string, content: string}>} docs - Docs to put in the prompt.
 * @param {object} llmOpts - Options forwarded to callLLM; `maxTokens` and
 *   `temperature` are overridden here.
 * @returns {Promise<*>} Whatever callLLM resolves to (presumably the answer
 *   text; callLLM is defined in ./prose.js and not visible here).
 */
async function humanRead(question, docs, llmOpts) {
  const context = docs.map((d) => `\n=== ${d.path} ===\n${d.content}`).join('\n');

  const prompt = `You are a new infrastructure engineer who just joined the Foxtrot team. You've been given documentation to read. Answer the following question as if you're reading these docs for the first time.

DOCUMENTATION:
${context}

QUESTION: ${question.question}

Instructions:
- Read the docs carefully, as a human would
- If the answer requires combining info from multiple sections, do so
- If the docs don't cover this, say "NOT_FOUND"
- Match the expected format: ${question.answerType}

Answer:`;

  return callLLM(prompt, { ...llmOpts, maxTokens: 1024, temperature: 0.1 });
}

/** Coerce one dimension value to a finite number clamped to 0..MAX_DIMENSION_SCORE. */
function toDimensionScore(value) {
  const n = Number(value);
  if (!Number.isFinite(n)) return 0;
  return Math.min(MAX_DIMENSION_SCORE, Math.max(0, n));
}

/**
 * Build a well-formed score object from parsed LLM output.
 * Extra fields are preserved; the four dimensions are always numeric.
 */
function normalizeScore(parsed) {
  const score = { ...parsed };
  for (const dim of SCORE_DIMENSIONS) {
    score[dim] = toDimensionScore(parsed[dim]);
  }
  score.notes = typeof parsed.notes === 'string' ? parsed.notes : '';
  return score;
}

/** Extract the first parseable JSON object from `text`, or null when there is none. */
function extractJsonObject(text) {
  // Greedy first (handles a "}" inside the notes string), then lazy (handles
  // stray braces the model may emit after the JSON object).
  const patterns = [/\{[\s\S]*\}/, /\{[\s\S]*?\}/];
  for (const pattern of patterns) {
    const match = text.match(pattern);
    if (!match) continue;
    try {
      const parsed = JSON.parse(match[0]);
      if (parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed)) {
        return parsed;
      }
    } catch (err) {
      // Not valid JSON with this pattern; fall through to the next candidate.
    }
  }
  return null;
}

/**
 * Score the reader's answer with human-focused criteria.
 *
 * @param {{question: string, answer: string}} question - Question with ground truth.
 * @param {string} answer - The reader's answer produced by humanRead.
 * @param {Array<{path: string}>} docsUsed - Docs that were shown to the reader.
 * @param {object} llmOpts - Options forwarded to callLLM; `maxTokens` and
 *   `temperature` are overridden here.
 * @returns {Promise<{clarity: number, actionability: number, completeness: number,
 *   findability: number, notes: string}>} Numeric scores in 0..5. When the
 *   model output cannot be parsed, all scores are 0 and `notes` starts with
 *   "Parse error:".
 */
async function scoreHuman(question, answer, docsUsed, llmOpts) {
  const prompt = `You are evaluating documentation quality from a HUMAN reader's perspective.

QUESTION: ${question.question}
GROUND TRUTH: ${question.answer}
READER'S ANSWER: ${answer}
DOCS CONSULTED: ${docsUsed.map((d) => d.path).join(', ')}

Score on these human-centric dimensions (0-5 each):
1. CLARITY: Was the information presented clearly enough for a human to extract the answer? (0=confusing/buried, 5=immediately obvious)
2. ACTIONABILITY: Could an engineer act on this information? (0=useless, 5=ready to execute)
3. COMPLETENESS: Did the docs contain ALL the information needed? (0=missing, 5=fully covered)
4. FINDABILITY: Based on the doc paths, would a human naturally look in these files? (0=buried in wrong place, 5=exactly where you'd expect)

If the reader answered "NOT_FOUND", score CLARITY=0, ACTIONABILITY=0, COMPLETENESS=0, FINDABILITY=0.

Respond in EXACTLY this JSON format:
{"clarity": N, "actionability": N, "completeness": N, "findability": N, "notes": "brief explanation"}`;

  const raw = await callLLM(prompt, { ...llmOpts, maxTokens: 256, temperature: 0.0 });
  // Guard against a non-string result so .match/.substring cannot throw.
  const rawText = typeof raw === 'string' ? raw : String(raw);

  const parsed = extractJsonObject(rawText);
  if (parsed) return normalizeScore(parsed);

  return {
    clarity: 0,
    actionability: 0,
    completeness: 0,
    findability: 0,
    notes: `Parse error: ${rawText.substring(0, 100)}`,
  };
}

/** `numerator / denominator` as a percentage string; "0" based when the denominator is 0. */
function toPercent(numerator, denominator, digits) {
  const value = denominator > 0 ? (numerator / denominator) * 100 : 0;
  return value.toFixed(digits);
}

/** True when the question record is tagged for the human audience. */
function isHumanQuestion(q) {
  if (!q) return false;
  const { audience } = q;
  // `audience` may be an array of tags or a plain string; both support includes().
  return (Array.isArray(audience) || typeof audience === 'string') && audience.includes('human');
}

/**
 * Run the human eval over every human-audience question.
 *
 * Questions are processed sequentially: for each one the relevant docs are
 * selected, the reader LLM answers, and the scorer LLM grades the answer.
 * A failure of either LLM call is recorded on that question (answer prefixed
 * with "ERROR:" or a zero score with a "Score error:" note) and the run
 * continues.
 *
 * @param {string} docsDir - Root directory of the documentation tree.
 * @param {string} questionsPath - Path to a JSON file with a `questions` array.
 * @param {object} [llmOpts={}] - Options forwarded to callLLM.
 * @returns {Promise<object>} Report with overall/average scores (as fixed-point
 *   strings), not-found statistics, per-category and per-difficulty averages,
 *   and the per-question `results`.
 * @throws {Error} When the questions file cannot be read or parsed, or has no
 *   `questions` array.
 */
async function runHumanEval(docsDir, questionsPath, llmOpts = {}) {
  const questionsData = JSON.parse(fs.readFileSync(questionsPath, 'utf8'));
  if (!questionsData || !Array.isArray(questionsData.questions)) {
    throw new Error(`Questions file ${questionsPath} must contain a "questions" array`);
  }
  const questions = questionsData.questions.filter(isHumanQuestion);

  console.log(`Human Eval: ${questions.length} human-audience questions`);

  const results = [];
  const totals = { clarity: 0, actionability: 0, completeness: 0, findability: 0 };
  let notFound = 0;

  for (let i = 0; i < questions.length; i++) {
    const q = questions[i];
    process.stdout.write(`[${i + 1}/${questions.length}] ${q.id}...`);

    const docs = findRelevantDocs(q, docsDir);

    let answer;
    try {
      // Coerce so the NOT_FOUND check below cannot throw on a non-string result.
      answer = String(await humanRead(q, docs, llmOpts));
    } catch (err) {
      answer = `ERROR: ${err.message}`;
    }

    let score;
    try {
      score = await scoreHuman(q, answer, docs, llmOpts);
    } catch (err) {
      score = {
        clarity: 0,
        actionability: 0,
        completeness: 0,
        findability: 0,
        notes: `Score error: ${err.message}`,
      };
    }

    const isNotFound = answer.includes('NOT_FOUND');
    if (isNotFound) notFound++;

    // Sum through toDimensionScore so totals and composite always agree and
    // can never become NaN or a concatenated string.
    let questionTotal = 0;
    for (const dim of SCORE_DIMENSIONS) {
      const value = toDimensionScore(score[dim]);
      totals[dim] += value;
      questionTotal += value;
    }
    const composite = toPercent(questionTotal, MAX_TOTAL_SCORE, 0);

    console.log(
      ` ${composite}% (Cl:${score.clarity} Ac:${score.actionability} Co:${score.completeness} Fi:${score.findability}) docs:${docs.length}${isNotFound ? ' [NOT_FOUND]' : ''}`
    );

    results.push({
      id: q.id,
      category: q.category,
      difficulty: q.difficulty,
      question: q.question,
      groundTruth: q.answer,
      humanAnswer: answer,
      docsUsed: docs.map((d) => d.path),
      score,
      composite: Number(composite),
      notFound: isNotFound,
    });
  }

  const n = questions.length;
  const grandTotal = Object.values(totals).reduce((a, b) => a + b, 0);
  // With no questions, report zeros rather than "NaN" strings.
  const average = (sum) => (n > 0 ? sum / n : 0).toFixed(2);

  const report = {
    evalType: 'human',
    timestamp: new Date().toISOString(),
    docsDir,
    totalQuestions: n,
    overallScore: toPercent(grandTotal, n * MAX_TOTAL_SCORE, 1),
    avgClarity: average(totals.clarity),
    avgActionability: average(totals.actionability),
    avgCompleteness: average(totals.completeness),
    avgFindability: average(totals.findability),
    notFoundCount: notFound,
    notFoundRate: `${toPercent(notFound, n, 1)}%`,
    byCategory: {},
    byDifficulty: {},
    results,
  };

  // Accumulate composite totals per category and per difficulty.
  for (const r of results) {
    for (const groupKey of ['category', 'difficulty']) {
      const group = groupKey === 'category' ? report.byCategory : report.byDifficulty;
      const key = r[groupKey];
      if (!group[key]) group[key] = { total: 0, count: 0 };
      group[key].total += r.composite;
      group[key].count++;
    }
  }

  // Replace the running totals with the published { avg, count } shape.
  for (const group of [report.byCategory, report.byDifficulty]) {
    for (const [k, v] of Object.entries(group)) {
      group[k] = { avg: (v.total / v.count).toFixed(1), count: v.count };
    }
  }

  return report;
}

/** CLI entry point: run the eval, print a summary, and write the JSON report. */
async function main() {
  const docsDir = process.argv[2];
  const questionsPath = process.argv[3];
  const outPath = process.argv[4] || './eval-human-report.json';

  if (!docsDir || !questionsPath) {
    console.error('Usage: node eval-human.js <docs-dir> <questions.json> [output.json]');
    process.exit(1);
  }

  const model = process.env.LLM_MODEL || 'claude-haiku-4.5';
  console.log(`Using model: ${model}`);

  try {
    const report = await runHumanEval(docsDir, questionsPath, { model });

    console.log('\n' + '═'.repeat(60));
    console.log('HUMAN EVAL REPORT');
    console.log('═'.repeat(60));
    console.log(`Overall Score: ${report.overallScore}%`);
    console.log(
      `Clarity: ${report.avgClarity}/5 Actionability: ${report.avgActionability}/5 Completeness: ${report.avgCompleteness}/5 Findability: ${report.avgFindability}/5`
    );
    console.log(`Not Found: ${report.notFoundCount}/${report.totalQuestions} (${report.notFoundRate})`);

    console.log('\nBy Category:');
    for (const [cat, s] of Object.entries(report.byCategory)) {
      console.log(`  ${cat}: ${s.avg}% (${s.count} questions)`);
    }

    console.log('\nBy Difficulty:');
    for (const [diff, s] of Object.entries(report.byDifficulty)) {
      console.log(`  ${diff}: ${s.avg}% (${s.count} questions)`);
    }

    // Copy before sorting so the report keeps its original question order.
    const worst = [...report.results].sort((a, b) => a.composite - b.composite).slice(0, 5);
    console.log('\nWeakest:');
    for (const w of worst) {
      console.log(`  [${w.id}] ${w.composite}% — ${String(w.question).substring(0, 70)}...`);
    }

    fs.writeFileSync(outPath, JSON.stringify(report, null, 2));
    console.log(`\nFull report: ${outPath}`);
  } catch (err) {
    console.error('Human eval failed:', err);
    process.exit(1);
  }
}

if (require.main === module) {
  main();
}

module.exports = { runHumanEval };