2026-03-09 23:55:54 +00:00
/**
 * Eval Track 1: Agent File-Browsing Benchmark
 *
 * Spawns a sub-agent with file access to the docs directory.
 * The agent navigates the tree, reads files, follows cross-references.
 * Tests whether the doc STRUCTURE is navigable by an AI agent.
 *
 * Usage: node eval-agent.js <docs-dir> <questions.json> [output.json]
 */
const fs = require ( 'fs' ) ;
const path = require ( 'path' ) ;
const { callLLM } = require ( './prose.js' ) ;
/**
 * Simulate an agent browsing the doc tree with file tools.
 *
 * Makes two LLM calls: the first shows the directory tree and asks which
 * files to read; the second supplies the contents of those files and asks
 * for the answer.
 *
 * @param {{question: string, answerType: string}} question - Benchmark
 *   question; only `question` and `answerType` are read here.
 * @param {string} docsDir - Root of the documentation tree.
 * @param {object} llmOpts - Options forwarded to `callLLM`; `maxTokens` and
 *   `temperature` are overridden for each call.
 * @returns {Promise<{answer: *, filesRead: string[], plannedFiles: string[]}>}
 *   `answer` is whatever the second `callLLM` call returned, `plannedFiles`
 *   are the paths parsed from the plan, `filesRead` the ones actually loaded.
 */
async function agentBrowse(question, docsDir, llmOpts) {
  const MAX_PLANNED_FILES = 5;
  // Cap per file at 30K chars to allow reading the full index.
  const MAX_FILE_CHARS = 30000;
  const FALLBACK_FILES = ['reference/system-architecture.md', 'reference/helm/index.md'];

  // Step 1: Agent sees the directory tree
  const tree = buildTree(docsDir, '', 4);

  // Step 2: Agent picks which files to read based on the question + tree
  const planPrompt = [
    'You are an AI agent with access to a documentation directory. You need to answer a question by browsing the file tree and reading specific files.',
    'FILE TREE:',
    `${tree}`,
    `QUESTION: ${question.question}`,
    `EXPECTED FORMAT: ${question.answerType}`,
    `Based on the file tree, which files should you read to answer this question? List up to ${MAX_PLANNED_FILES} file paths (most relevant first). Think about:`,
    '- Index files that might have summary tables',
    '- Specific chart/subsystem docs that match the question topic',
    '- Architecture overview docs for system-wide questions',
    'Respond with ONLY the file paths, one per line. No explanation.',
  ].join('\n');
  const planRaw = await callLLM(planPrompt, { ...llmOpts, maxTokens: 512, temperature: 0.0 });

  // Parse file paths from plan: strip bullets, "1." style numbering and backticks.
  const plannedFiles = String(planRaw)
    .split('\n')
    .map((line) => line
      .trim()
      .replace(/^[-*•]\s*/, '')
      .replace(/^\d+[.)]\s+/, '')
      .replace(/`/g, ''))
    .filter((line) => line.length > 0 && !line.startsWith('#'))
    .slice(0, MAX_PLANNED_FILES);

  const rootDir = path.resolve(docsDir);
  const filesRead = [];
  let context = '';

  // Returns the (possibly truncated) file text, or null when the path is
  // outside docsDir, is not a regular file, or cannot be read.
  const readDoc = (relPath) => {
    const absPath = path.join(rootDir, relPath);
    const fromRoot = path.relative(rootDir, absPath);
    // The plan comes from model output, so never follow it out of docsDir.
    const escapesRoot = fromRoot === '..'
      || fromRoot.startsWith(`..${path.sep}`)
      || path.isAbsolute(fromRoot);
    if (fromRoot === '' || escapesRoot) return null;
    try {
      if (!fs.statSync(absPath).isFile()) return null;
      const content = fs.readFileSync(absPath, 'utf8');
      return content.length > MAX_FILE_CHARS
        ? `${content.substring(0, MAX_FILE_CHARS)}\n... (truncated)`
        : content;
    } catch {
      // Best effort: a hallucinated or unreadable path is simply skipped.
      return null;
    }
  };

  const addToContext = (relPath) => {
    if (filesRead.includes(relPath)) return; // do not spend context on duplicates
    const content = readDoc(relPath);
    if (content === null) return;
    context += `\n=== ${relPath} ===\n${content}\n`;
    filesRead.push(relPath);
  };

  // Step 3: Read the planned files
  for (const relPath of plannedFiles) addToContext(relPath);

  // Step 4: If none of the planned files could be read, fall back to the main index files
  if (filesRead.length === 0) {
    for (const fallback of FALLBACK_FILES) addToContext(fallback);
  }

  // Step 5: Agent answers from the files it read
  const answerPrompt = [
    'You are an AI agent that has browsed a documentation directory to answer a question. Here are the files you read:',
    `${context}`,
    `QUESTION: ${question.question}`,
    `EXPECTED FORMAT: ${question.answerType}`,
    `Answer the question using ONLY the information from the files above. If you can't find the answer, say "NOT_FOUND".`,
    'Be precise and match the expected format.',
    'Answer:',
  ].join('\n');
  const answer = await callLLM(answerPrompt, { ...llmOpts, maxTokens: 1024, temperature: 0.0 });

  return { answer, filesRead, plannedFiles };
}
/**
 * Build a directory tree string.
 *
 * Produces one line per entry, depth-first, with entries sorted by name
 * (`localeCompare`). Entries whose name starts with "." are skipped.
 * Directories are rendered as `<path>/ (<n> items)` where `n` counts the
 * visible (non-dot) children; other entries as `<path> (<size>K)` with the
 * size in units of 1024 bytes, one decimal place.
 *
 * @param {string} dir - Directory to list.
 * @param {string} prefix - Path prepended (with "/") to every entry name; use
 *   '' for the root so paths are relative to `dir`.
 * @param {number} maxDepth - Number of directory levels to list; values <= 0
 *   yield an empty string.
 * @returns {string} Newline-joined listing, or '' when `dir` cannot be read.
 */
function buildTree(dir, prefix, maxDepth) {
  if (maxDepth <= 0) return '';

  let entries;
  try {
    entries = fs.readdirSync(dir, { withFileTypes: true });
  } catch {
    return '';
  }

  const isVisible = (name) => !name.startsWith('.');
  const visibleEntries = entries
    .filter((entry) => isVisible(entry.name))
    .sort((a, b) => a.name.localeCompare(b.name));

  const lines = [];
  for (const entry of visibleEntries) {
    const relPath = prefix ? `${prefix}/${entry.name}` : entry.name;
    const absPath = path.join(dir, entry.name);

    if (entry.isDirectory()) {
      let childCount = 0;
      try {
        // Count only what a recursive listing would show.
        childCount = fs.readdirSync(absPath).filter(isVisible).length;
      } catch {
        // Unreadable directory: still list it, with no visible children.
      }
      lines.push(`${relPath}/ (${childCount} items)`);
      if (maxDepth > 1) {
        const subtree = buildTree(absPath, relPath, maxDepth - 1);
        if (subtree) lines.push(subtree);
      }
    } else {
      let sizeLabel = '?';
      try {
        sizeLabel = `${(fs.statSync(absPath).size / 1024).toFixed(1)}K`;
      } catch {
        // e.g. a dangling symlink: keep the entry, size unknown.
      }
      lines.push(`${relPath} (${sizeLabel})`);
    }
  }
  return lines.join('\n');
}
/**
 * Score using LLM-as-judge (same as eval.js).
 *
 * Asks the LLM to grade `llmAnswer` against the question's ground truth on
 * four 0-5 dimensions and parses the first `{...}` span of the reply as JSON.
 *
 * @param {{question: string, answerType: string, answer: *}} question -
 *   Benchmark question with its ground-truth `answer`.
 * @param {*} llmAnswer - The agent's answer to grade.
 * @param {object} llmOpts - Options forwarded to `callLLM`; `maxTokens` and
 *   `temperature` are overridden.
 * @returns {Promise<{accuracy: number, completeness: number, precision: number, navigation: number, notes: string}>}
 *   Each dimension is a finite number clamped to 0-5. If the reply contains
 *   no parseable JSON object, all dimensions are 0 and `notes` carries the
 *   first 100 characters of the reply.
 */
async function scoreAnswer(question, llmAnswer, llmOpts) {
  const DIMENSIONS = ['accuracy', 'completeness', 'precision', 'navigation'];
  const MAX_SCORE = 5;

  // Objects would otherwise render as "[object Object]" in the prompt.
  const asText = (value) => (
    value !== null && typeof value === 'object' ? JSON.stringify(value) : String(value)
  );

  const prompt = [
    `You are a strict evaluator scoring an AI agent's answer against ground truth.`,
    `QUESTION: ${question.question}`,
    `EXPECTED ANSWER TYPE: ${question.answerType}`,
    `GROUND TRUTH: ${asText(question.answer)}`,
    `AI ANSWER: ${asText(llmAnswer)}`,
    `Score on these dimensions (0-${MAX_SCORE} each):`,
    '1. ACCURACY: Does the answer contain the correct facts?',
    '2. COMPLETENESS: Does it cover all items in the ground truth?',
    '3. PRECISION: Is it free of hallucinated or incorrect extra information?',
    `4. NAVIGATION: Did the agent demonstrate it could find the right information? (0 = couldn't find anything, 5 = went straight to the right file)`,
    'If the AI answered "NOT_FOUND", score ACCURACY=0, COMPLETENESS=0, PRECISION=5, NAVIGATION=0.',
    'Respond in EXACTLY this JSON format:',
    '{"accuracy": N, "completeness": N, "precision": N, "navigation": N, "notes": "brief explanation"}',
  ].join('\n');

  const raw = String(await callLLM(prompt, { ...llmOpts, maxTokens: 256, temperature: 0.0 }));

  let parsed = null;
  try {
    const jsonMatch = raw.match(/\{[\s\S]*\}/);
    if (jsonMatch) parsed = JSON.parse(jsonMatch[0]);
  } catch {
    // Malformed JSON from the judge is reported through the fallback below.
  }

  if (parsed === null || typeof parsed !== 'object') {
    return {
      accuracy: 0,
      completeness: 0,
      precision: 0,
      navigation: 0,
      notes: `Parse error: ${raw.substring(0, 100)}`,
    };
  }

  // Callers sum these fields, so guarantee finite numbers within range.
  const result = { ...parsed };
  for (const dim of DIMENSIONS) {
    const value = Number(parsed[dim]);
    result[dim] = Number.isFinite(value) ? Math.min(MAX_SCORE, Math.max(0, value)) : 0;
  }
  result.notes = typeof parsed.notes === 'string' ? parsed.notes : '';
  return result;
}
/**
 * Run the agent eval.
 *
 * Loads the questions file, runs `agentBrowse` then `scoreAnswer` for each
 * question in order, prints one progress line per question, and returns an
 * aggregate report. A failure in either step for one question is recorded in
 * that question's result and does not abort the run.
 *
 * @param {string} docsDir - Documentation root handed to `agentBrowse`.
 * @param {string} questionsPath - Path to a JSON file with a `questions` array.
 * @param {object} [llmOpts={}] - Options forwarded to both LLM steps.
 * @returns {Promise<object>} Report with overall and per-dimension averages
 *   (formatted strings), not-found statistics, `byCategory` / `byDifficulty`
 *   averages of the per-question composite, and the per-question `results`.
 * @throws {Error} If the file cannot be read or parsed, or has no `questions` array.
 */
async function runAgentEval(docsDir, questionsPath, llmOpts = {}) {
  const DIMENSIONS = ['accuracy', 'completeness', 'precision', 'navigation'];
  const MAX_PER_QUESTION = DIMENSIONS.length * 5; // four dimensions, 0-5 each

  const questionsData = JSON.parse(fs.readFileSync(questionsPath, 'utf8'));
  if (!questionsData || !Array.isArray(questionsData.questions)) {
    throw new Error(`${questionsPath} does not contain a "questions" array`);
  }
  // Every question is evaluated regardless of its `audience` field.
  const questions = questionsData.questions;
  console.log(`Agent Eval: ${questions.length} questions`);

  // A judge reply with missing or non-numeric fields must not turn the totals into NaN.
  const toNumber = (value) => {
    const n = Number(value);
    return Number.isFinite(n) ? n : 0;
  };
  // Keeps an empty question set from producing NaN averages.
  const ratio = (numerator, denominator) => (denominator > 0 ? numerator / denominator : 0);

  const results = [];
  const totals = { accuracy: 0, completeness: 0, precision: 0, navigation: 0 };
  let notFound = 0;

  for (let i = 0; i < questions.length; i++) {
    const q = questions[i];
    process.stdout.write(`[${i + 1}/${questions.length}] ${q.id}... `);

    let browseResult;
    try {
      browseResult = await agentBrowse(q, docsDir, llmOpts);
    } catch (err) {
      browseResult = { answer: `ERROR: ${err.message}`, filesRead: [], plannedFiles: [] };
    }

    let score;
    try {
      score = await scoreAnswer(q, browseResult.answer, llmOpts);
    } catch (err) {
      score = { accuracy: 0, completeness: 0, precision: 0, navigation: 0, notes: `Score error: ${err.message}` };
    }

    const isNotFound = String(browseResult.answer).includes('NOT_FOUND');
    if (isNotFound) notFound++;

    const points = {};
    let questionTotal = 0;
    for (const dim of DIMENSIONS) {
      points[dim] = toNumber(score ? score[dim] : undefined);
      totals[dim] += points[dim];
      questionTotal += points[dim];
    }
    const composite = (questionTotal / MAX_PER_QUESTION * 100).toFixed(0);
    console.log(`${composite}% (A:${points.accuracy} C:${points.completeness} P:${points.precision} N:${points.navigation}) files:${browseResult.filesRead.length}${isNotFound ? ' [NOT_FOUND]' : ''}`);

    results.push({
      id: q.id,
      category: q.category,
      difficulty: q.difficulty,
      question: q.question,
      groundTruth: q.answer,
      llmAnswer: browseResult.answer,
      filesRead: browseResult.filesRead,
      plannedFiles: browseResult.plannedFiles,
      score,
      composite: Number(composite),
      notFound: isNotFound,
    });
  }

  // Average composite per distinct value of `field`. A Map is used so that
  // values like "constructor" cannot collide with inherited object keys.
  const summarizeBy = (field) => {
    const buckets = new Map();
    for (const r of results) {
      const key = String(r[field]);
      const bucket = buckets.get(key) || { total: 0, count: 0 };
      bucket.total += r.composite;
      bucket.count += 1;
      buckets.set(key, bucket);
    }
    return Object.fromEntries(
      [...buckets].map(([key, { total, count }]) => [key, { avg: (total / count).toFixed(1), count }]),
    );
  };

  const n = questions.length;
  const grandTotal = DIMENSIONS.reduce((sum, dim) => sum + totals[dim], 0);

  return {
    evalType: 'agent',
    timestamp: new Date().toISOString(),
    docsDir,
    totalQuestions: n,
    overallScore: (ratio(grandTotal, n * MAX_PER_QUESTION) * 100).toFixed(1),
    avgAccuracy: ratio(totals.accuracy, n).toFixed(2),
    avgCompleteness: ratio(totals.completeness, n).toFixed(2),
    avgPrecision: ratio(totals.precision, n).toFixed(2),
    avgNavigation: ratio(totals.navigation, n).toFixed(2),
    notFoundCount: notFound,
    notFoundRate: `${(ratio(notFound, n) * 100).toFixed(1)}%`,
    byCategory: summarizeBy('category'),
    byDifficulty: summarizeBy('difficulty'),
    results,
  };
}
// CLI entry point: node eval-agent.js <docs-dir> <questions.json> [output.json]
// Runs the eval, prints a summary to stdout and writes the full report as JSON.
if (require.main === module) {
  const docsDir = process.argv[2];
  const questionsPath = process.argv[3];
  const outPath = process.argv[4] || './eval-agent-report.json';

  if (!docsDir || !questionsPath) {
    console.error('Usage: node eval-agent.js <docs-dir> <questions.json> [output.json]');
    process.exit(1);
  }

  // The model can be overridden through the LLM_MODEL environment variable.
  const model = process.env.LLM_MODEL || 'claude-haiku-4.5';
  console.log(`Using model: ${model}`);

  (async () => {
    try {
      const report = await runAgentEval(docsDir, questionsPath, { model });
      const rule = '═'.repeat(60);

      console.log(`\n${rule}`);
      console.log('AGENT EVAL REPORT');
      console.log(rule);
      console.log(`Overall Score: ${report.overallScore}%`);
      console.log(`Accuracy: ${report.avgAccuracy}/5 Completeness: ${report.avgCompleteness}/5 Precision: ${report.avgPrecision}/5 Navigation: ${report.avgNavigation}/5`);
      console.log(`Not Found: ${report.notFoundCount}/${report.totalQuestions} (${report.notFoundRate})`);

      console.log('\nBy Category:');
      for (const [cat, s] of Object.entries(report.byCategory)) {
        console.log(`${cat}: ${s.avg}% (${s.count} questions)`);
      }

      console.log('\nBy Difficulty:');
      for (const [diff, s] of Object.entries(report.byDifficulty)) {
        console.log(`${diff}: ${s.avg}% (${s.count} questions)`);
      }

      // Copy before sorting so the order of report.results written to disk is unchanged.
      const worst = [...report.results].sort((a, b) => a.composite - b.composite).slice(0, 5);
      console.log('\nWeakest:');
      for (const w of worst) {
        console.log(`[${w.id}] ${w.composite}% — ${w.question.substring(0, 70)}... (read: ${w.filesRead.join(', ') || 'none'})`);
      }

      fs.writeFileSync(outPath, JSON.stringify(report, null, 2));
      console.log(`\nFull report: ${outPath}`);
    } catch (err) {
      console.error('Agent eval failed:', err);
      process.exit(1);
    }
  })();
}

module.exports = { runAgentEval };