/**
 * Eval Track 2: Human Readability Benchmark
 *
 * Tests whether the docs are useful for a human engineer.
 * Feeds the FULL relevant doc page(s) to the LLM and scores on:
 *   - Clarity: Is the prose understandable?
 *   - Actionability: Could you act on this information?
 *   - Findability (navigation): Does the doc structure guide you to the answer?
 *   - Completeness: Is the answer fully covered?
 *
 * Usage: node eval-human.js <docs-dir> <questions.json> [output.json]
 */

const fs = require('fs');
const path = require('path');
const { callLLM } = require('./prose.js');

/**
 * Find the most relevant doc files for a human-audience question.
 *
 * Selection is keyword-driven: the lower-cased question text is matched
 * against fixed keyword lists, and every match pulls in a page (or a directory
 * of pages) below `docsDir`. The architecture overview is always attempted
 * first. Paths that do not exist, or that are not regular files, are skipped.
 *
 * @param {{question: string}} question - Question record; only
 *   `question.question` is read. A missing value is treated as an empty string.
 * @param {string} docsDir - Root directory of the docs tree.
 * @returns {Array<{path: string, content: string}>} Selected docs in selection
 *   order, each included at most once, with the combined content length capped
 *   at 60,000 characters.
 */
function findRelevantDocs(question, docsDir) {
  const MAX_TOTAL_CHARS = 60000;
  const MAX_CONTRACTS_CHARS = 30000;
  const CHART_NAMES = ['mdm-app', 'ai-app', 'elasticsearch', 'hazelcast', 'cassandra', 'kong', 'redis', 'jenkins', 'otel-collector', 'twistlock', 'cluster'];

  const docs = [];
  // Relative paths already queued. Without this, a file such as
  // `redis-cluster.md` is added once per matching chart name.
  const seen = new Set();

  const isFile = (absPath) => fs.existsSync(absPath) && fs.statSync(absPath).isFile();
  const isDirectory = (absPath) => fs.existsSync(absPath) && fs.statSync(absPath).isDirectory();

  // Queue one doc (path relative to docsDir), optionally truncating its content.
  const addDoc = (relPath, maxChars) => {
    if (seen.has(relPath)) return;
    const absPath = path.join(docsDir, relPath);
    // Skips missing paths and sub-directories (readFileSync would throw EISDIR).
    if (!isFile(absPath)) return;
    seen.add(relPath);
    const content = fs.readFileSync(absPath, 'utf8');
    docs.push({
      path: relPath,
      content: maxChars === undefined ? content : content.substring(0, maxChars),
    });
  };

  // Entry names of a directory below docsDir, or [] when it is absent.
  const listDir = (relDir) => {
    const absDir = path.join(docsDir, relDir);
    return isDirectory(absDir) ? fs.readdirSync(absDir) : [];
  };

  const q = String((question && question.question) || '').toLowerCase();
  const mentions = (...keywords) => keywords.some((kw) => q.includes(kw));

  // Always include the architecture overview
  addDoc('reference/system-architecture.md');

  // Helm-related: include helm index
  if (mentions('helm', 'chart', 'secret', 'port', 'deploy', 'service')) {
    addDoc('reference/helm/index.md');
  }

  // Specific chart docs: every file whose name contains a chart name that the
  // question mentions. The directory is listed once, and only when needed.
  const mentionedCharts = CHART_NAMES.filter((name) => q.includes(name));
  if (mentionedCharts.length > 0) {
    const chartFiles = listDir('reference/helm/charts');
    for (const name of mentionedCharts) {
      for (const f of chartFiles) {
        if (f.includes(name)) addDoc(`reference/helm/charts/${f}`);
      }
    }
  }

  // Subsystem-related: include every file in the subsystems directory
  if (mentions('subsystem', 'cross-cutting', 'depend')) {
    for (const f of listDir('reference/subsystems')) {
      addDoc(`reference/subsystems/${f}`);
    }
  }

  // Contracts: the index is truncated because it can be massive
  if (mentions('contract', 'interface', 'secret', 'configmap')) {
    addDoc('reference/contracts/index.md', MAX_CONTRACTS_CHARS);
  }

  // Cap total context. A doc that would overflow the budget is dropped, but
  // later (smaller) docs are still considered.
  let total = 0;
  return docs.filter((d) => {
    if (total + d.content.length > MAX_TOTAL_CHARS) return false;
    total += d.content.length;
    return true;
  });
}

/**
 * Have an LLM simulate a human reading the docs.
 *
 * Builds a single prompt containing every selected doc (each preceded by a
 * `=== <path> ===` banner) followed by the question, then forwards it to
 * `callLLM`.
 *
 * @param {{question: string, answerType?: string}} question - Question record.
 *   `answerType` is added to the instructions only when it is set, so a
 *   missing value no longer shows up as the literal text "undefined".
 * @param {Array<{path: string, content: string}>} docs - Docs to present.
 * @param {object} llmOpts - Options forwarded to `callLLM`; `maxTokens` and
 *   `temperature` are always overridden here.
 * @returns {Promise<*>} Whatever `callLLM` resolves to for this prompt.
 */
async function humanRead(question, docs, llmOpts) {
  const context = docs
    .map((d) => `\n=== ${d.path} ===\n${d.content}`)
    .join('\n');

  const formatHint = question.answerType
    ? `\n- Match the expected format: ${question.answerType}`
    : '';

  const prompt = `You are a new infrastructure engineer who just joined the Foxtrot team. You've been given documentation to read. Answer the following question as if you're reading these docs for the first time.

DOCUMENTATION:
${context}

QUESTION: ${question.question}

Instructions:
- Read the docs carefully, as a human would
- If the answer requires combining info from multiple sections, do so
- If the docs don't cover this, say "NOT_FOUND"${formatHint}

Answer:`;

  return callLLM(prompt, { ...llmOpts, maxTokens: 1024, temperature: 0.1 });
}

/**
 * Score with human-focused criteria.
 *
 * Asks the LLM to grade the reader's answer on four 0-5 dimensions and parses
 * the first `{...}` span of the reply as JSON.
 *
 * @param {{question: string, answer: string}} question - Question record; its
 *   `answer` field is presented to the grader as the ground truth.
 * @param {string} answer - The answer produced by the simulated reader.
 * @param {Array<{path: string}>} docsUsed - Docs the reader was given; only the
 *   paths are shown to the grader.
 * @param {object} llmOpts - Options forwarded to `callLLM`; `maxTokens` and
 *   `temperature` are always overridden here.
 * @returns {Promise<{clarity: number, actionability: number, completeness: number, findability: number, notes: *}>}
 *   The parsed score. Each dimension is coerced to a number and clamped to
 *   0-5 (non-numeric or missing values become 0). If the reply contains no
 *   parseable JSON object, all dimensions are 0 and `notes` carries the first
 *   100 characters of the reply.
 */
async function scoreHuman(question, answer, docsUsed, llmOpts) {
  const DIMENSIONS = ['clarity', 'actionability', 'completeness', 'findability'];
  const MAX_SCORE = 5;

  const prompt = `You are evaluating documentation quality from a HUMAN reader's perspective.

QUESTION: ${question.question}
GROUND TRUTH: ${question.answer}
READER'S ANSWER: ${answer}
DOCS CONSULTED: ${docsUsed.map((d) => d.path).join(', ')}

Score on these human-centric dimensions (0-5 each):
1. CLARITY: Was the information presented clearly enough for a human to extract the answer? (0=confusing/buried, 5=immediately obvious)
2. ACTIONABILITY: Could an engineer act on this information? (0=useless, 5=ready to execute)
3. COMPLETENESS: Did the docs contain ALL the information needed? (0=missing, 5=fully covered)
4. FINDABILITY: Based on the doc paths, would a human naturally look in these files? (0=buried in wrong place, 5=exactly where you'd expect)

If the reader answered "NOT_FOUND", score CLARITY=0, ACTIONABILITY=0, COMPLETENESS=0, FINDABILITY=0.

Respond in EXACTLY this JSON format:
{"clarity": N, "actionability": N, "completeness": N, "findability": N, "notes": "brief explanation"}`;

  const raw = await callLLM(prompt, { ...llmOpts, maxTokens: 256, temperature: 0.0 });
  // Guard against a non-string reply so the fallback path below cannot throw.
  const text = typeof raw === 'string' ? raw : String(raw);

  const parseFailure = () => ({
    clarity: 0,
    actionability: 0,
    completeness: 0,
    findability: 0,
    notes: `Parse error: ${text.substring(0, 100)}`,
  });

  const jsonMatch = text.match(/\{[\s\S]*\}/);
  if (!jsonMatch) return parseFailure();

  let parsed;
  try {
    parsed = JSON.parse(jsonMatch[0]);
  } catch {
    // Malformed JSON is an expected outcome of free-form LLM output.
    return parseFailure();
  }

  // The grader sometimes returns "4" instead of 4, omits a field, or goes out
  // of range; normalise here so callers can do arithmetic on the result.
  const normalized = { ...parsed };
  for (const dim of DIMENSIONS) {
    const value = Number(parsed[dim]);
    normalized[dim] = Number.isFinite(value) ? Math.min(MAX_SCORE, Math.max(0, value)) : 0;
  }
  return normalized;
}

/**
 * Run the human eval.
 *
 * Loads the questions file, keeps the questions whose `audience` includes
 * `'human'`, and for each one: selects docs, asks the simulated reader for an
 * answer, has that answer scored, and prints a one-line progress summary.
 * Questions are processed sequentially.
 *
 * @param {string} docsDir - Root directory of the docs tree.
 * @param {string} questionsPath - Path to a JSON file shaped like
 *   `{ "questions": [...] }`.
 * @param {object} [llmOpts={}] - Options forwarded to the reader and scorer.
 * @returns {Promise<object>} Report with overall and per-dimension averages
 *   (formatted strings), NOT_FOUND statistics, per-category and per-difficulty
 *   averages, and the per-question `results`. With zero matching questions all
 *   averages are reported as zero rather than NaN.
 * @throws {TypeError} If the questions file has no `questions` array.
 * @throws {Error} If the questions file cannot be read or is not valid JSON.
 */
async function runHumanEval(docsDir, questionsPath, llmOpts = {}) {
  const DIMENSIONS = ['clarity', 'actionability', 'completeness', 'findability'];
  const MAX_POINTS = 20; // four dimensions, 0-5 each

  // Scores come from LLM output: tolerate strings ("4") and missing fields.
  const toNumber = (value) => {
    const n = Number(value);
    return Number.isFinite(n) ? n : 0;
  };

  const questionsData = JSON.parse(fs.readFileSync(questionsPath, 'utf8'));
  if (!questionsData || !Array.isArray(questionsData.questions)) {
    throw new TypeError(`${questionsPath}: expected a top-level "questions" array`);
  }
  // A question without an `audience` field is simply not a human question.
  const questions = questionsData.questions.filter((q) => (q.audience || []).includes('human'));

  console.log(`Human Eval: ${questions.length} human-audience questions`);

  const results = [];
  const totals = { clarity: 0, actionability: 0, completeness: 0, findability: 0 };
  let notFound = 0;

  for (const [i, q] of questions.entries()) {
    process.stdout.write(`[${i + 1}/${questions.length}] ${q.id}...`);

    const docs = findRelevantDocs(q, docsDir);

    // A failed LLM call is recorded as the answer so the run keeps going.
    let answer;
    try {
      const reply = await humanRead(q, docs, llmOpts);
      answer = typeof reply === 'string' ? reply : String(reply);
    } catch (err) {
      answer = `ERROR: ${err.message}`;
    }

    let score;
    try {
      score = await scoreHuman(q, answer, docs, llmOpts);
    } catch (err) {
      score = { clarity: 0, actionability: 0, completeness: 0, findability: 0, notes: `Score error: ${err.message}` };
    }
    score = { ...score };
    for (const k of DIMENSIONS) score[k] = toNumber(score[k]);

    const isNotFound = answer.includes('NOT_FOUND');
    if (isNotFound) notFound++;

    for (const k of DIMENSIONS) totals[k] += score[k];

    const points = DIMENSIONS.reduce((sum, k) => sum + score[k], 0);
    const composite = Number((points / MAX_POINTS * 100).toFixed(0));
    console.log(` ${composite}% (Cl:${score.clarity} Ac:${score.actionability} Co:${score.completeness} Fi:${score.findability}) docs:${docs.length}${isNotFound ? ' [NOT_FOUND]' : ''}`);

    results.push({
      id: q.id,
      category: q.category,
      difficulty: q.difficulty,
      question: q.question,
      groundTruth: q.answer,
      humanAnswer: answer,
      docsUsed: docs.map((d) => d.path),
      score,
      composite,
      notFound: isNotFound,
    });
  }

  const n = questions.length;
  // Avoid 0/0 when no question targets the human audience.
  const perQuestion = (total) => (n === 0 ? 0 : total / n);
  const grandTotal = Object.values(totals).reduce((a, b) => a + b, 0);

  // Average composite per distinct value of `field`. A Map is used while
  // accumulating so keys such as "constructor" cannot collide with
  // Object.prototype members.
  const summarizeBy = (field) => {
    const buckets = new Map();
    for (const r of results) {
      const key = String(r[field]);
      const bucket = buckets.get(key) || { total: 0, count: 0 };
      bucket.total += r.composite;
      bucket.count += 1;
      buckets.set(key, bucket);
    }
    return Object.fromEntries(
      [...buckets].map(([key, b]) => [key, { avg: (b.total / b.count).toFixed(1), count: b.count }])
    );
  };

  return {
    evalType: 'human',
    timestamp: new Date().toISOString(),
    docsDir,
    totalQuestions: n,
    overallScore: (n === 0 ? 0 : grandTotal / (n * MAX_POINTS) * 100).toFixed(1),
    avgClarity: perQuestion(totals.clarity).toFixed(2),
    avgActionability: perQuestion(totals.actionability).toFixed(2),
    avgCompleteness: perQuestion(totals.completeness).toFixed(2),
    avgFindability: perQuestion(totals.findability).toFixed(2),
    notFoundCount: notFound,
    notFoundRate: `${(perQuestion(notFound) * 100).toFixed(1)}%`,
    byCategory: summarizeBy('category'),
    byDifficulty: summarizeBy('difficulty'),
    results,
  };
}

// CLI entry point: runs only when this file is executed directly
// (`node eval-human.js ...`), not when it is required as a module.
if (require.main === module) {
  const [, , docsDir, questionsPath, outArg] = process.argv;
  const outPath = outArg || './eval-human-report.json';

  if (!docsDir || !questionsPath) {
    console.error('Usage: node eval-human.js <docs-dir> <questions.json> [output.json]');
    process.exit(1);
  }

  const model = process.env.LLM_MODEL || 'claude-haiku-4.5';
  console.log(`Using model: ${model}`);

  const PREVIEW_CHARS = 70;
  const WEAKEST_COUNT = 5;

  // Print one "By <group>" section of the summary.
  const printGroup = (title, group) => {
    console.log(`\n${title}:`);
    for (const [name, s] of Object.entries(group)) {
      console.log(` ${name}: ${s.avg}% (${s.count} questions)`);
    }
  };

  const main = async () => {
    const report = await runHumanEval(docsDir, questionsPath, { model });
    const rule = '═'.repeat(60);

    console.log(`\n${rule}`);
    console.log('HUMAN EVAL REPORT');
    console.log(rule);
    console.log(`Overall Score: ${report.overallScore}%`);
    console.log(`Clarity: ${report.avgClarity}/5 Actionability: ${report.avgActionability}/5 Completeness: ${report.avgCompleteness}/5 Findability: ${report.avgFindability}/5`);
    console.log(`Not Found: ${report.notFoundCount}/${report.totalQuestions} (${report.notFoundRate})`);

    printGroup('By Category', report.byCategory);
    printGroup('By Difficulty', report.byDifficulty);

    // Copy before sorting so the order of report.results written to disk is
    // left untouched.
    const worst = [...report.results]
      .sort((a, b) => a.composite - b.composite)
      .slice(0, WEAKEST_COUNT);
    console.log('\nWeakest:');
    for (const w of worst) {
      const text = String(w.question);
      // Only mark the preview as truncated when something was actually cut.
      const preview = text.length > PREVIEW_CHARS ? `${text.substring(0, PREVIEW_CHARS)}...` : text;
      console.log(` [${w.id}] ${w.composite}% — ${preview}`);
    }

    fs.writeFileSync(outPath, JSON.stringify(report, null, 2));
    console.log(`\nFull report: ${outPath}`);
  };

  main().catch((err) => {
    console.error('Human eval failed:', err);
    process.exit(1);
  });
}

module.exports = { runHumanEval };