feat: confluence benchmark, pattern extractor, agent KB, UX spec

- extract-patterns.js: mines layered arch, ArgoCD appsets, cloud regions,
  CIDR allocations, naming conventions, sync waves, tech stack from code
- agent-kb.js: token-efficient JSON rendering of same doc tree
- eval-confluence-ref-questions.json: 32 reference-only benchmark questions
- wiggum-v2.sh: Ralph Wiggum loop targeting confluence baseline (77.8%)
- docs/human-ux-spec.md: BMad UX designer spec for human doc structure
- Eval results: V2 at 28.7% vs confluence 77.8% baseline
- Hub/spoke ownership now correctly extracted (95% on that question)
- Naming conventions, regions, CIDRs surfaced in system-architecture.md
This commit is contained in:
Jarvis Prime
2026-03-10 14:20:35 +00:00
parent 049609a358
commit 0265ec7a60
844 changed files with 2129910 additions and 30 deletions

333
extract-patterns.js Normal file
View File

@@ -0,0 +1,333 @@
/**
* extract-patterns.js — Mine architectural patterns from code artifacts.
*
* Extracts:
* - Layered architecture from repo/dir naming conventions
* - Hub/spoke model from ArgoCD ApplicationSet configs
* - Cloud regions from terraform configs + values.yaml
* - CIDR allocations from terraform variables
* - Naming conventions from scripts + terraform
* - Sync-wave ordering from Helm template annotations
* - Release/deployment patterns from CI configs + scripts (NOTE: this module does not currently define an extractor for these)
* - Tech stack from Helm values.yaml images + package.json dependencies
*/
const fs = require('fs');
const path = require('path');
/**
 * Prefix rules that map a top-level directory name to an architectural layer.
 *
 * Each row is `[prefix regex, layer label, sort order]` and is expanded into a
 * `{ pattern, layer, order }` record. `order` decides where the layer appears
 * in the output of `extractLayers` (ascending).
 */
const LAYER_PATTERNS = [
  [/^app[-_]/, 'Application', 1],
  [/^compute[-_]/, 'Compute', 2],
  [/^network[-_]/, 'Network', 3],
  [/^account[-_]/, 'Account', 4],
  [/^control[-_]/, 'Control Plane', 5],
  [/^runtime/, 'Runtime (shared)', 0],
  [/^ipam[-_]/, 'IPAM', 3.5],
  [/^skills/, 'Skills/Tooling', 6],
  [/^docs/, 'Documentation', 7],
].map(([pattern, layer, order]) => ({ pattern, layer, order }));

/**
 * Group the visible top-level directories of `srcRoot` by architectural layer.
 *
 * @param {string} srcRoot - Directory whose immediate sub-directories are classified.
 * @returns {Array<{layer: string, repos: string[], order: number}>} One entry per
 *   layer that matched at least one directory, sorted by ascending `order`.
 */
function extractLayers(srcRoot) {
  const grouped = new Map();

  for (const entry of fs.readdirSync(srcRoot, { withFileTypes: true })) {
    // Only real directories count, and dot-directories are ignored.
    if (!entry.isDirectory() || entry.name.startsWith('.')) continue;

    // The first rule whose prefix matches decides the layer.
    const rule = LAYER_PATTERNS.find(({ pattern }) => pattern.test(entry.name));
    if (rule === undefined) continue;

    const bucket = grouped.get(rule.layer);
    if (bucket) {
      bucket.repos.push(entry.name);
    } else {
      grouped.set(rule.layer, { order: rule.order, repos: [entry.name] });
    }
  }

  const result = [];
  for (const [layer, { repos, order }] of grouped) {
    result.push({ layer, repos, order });
  }
  return result.sort((left, right) => left.order - right.order);
}
/**
 * Locate ArgoCD ApplicationSet manifests and summarize them for hub/spoke analysis.
 *
 * Walks `srcRoot` recursively (skipping vendored/tooling directories) and reads
 * every file named `*-appset.yaml` or `*-appset.yml`. Fields are pulled out with
 * first-match regexes rather than a YAML parser, so each value is simply the
 * first occurrence of that key anywhere in the file.
 *
 * A manifest is classified as `hub` when its path (relative to `srcRoot`)
 * contains "hub" or "control-plane"; everything else is `spoke`. The "hub"
 * inside "github" is deliberately not counted.
 *
 * @param {string} srcRoot - Root directory to scan.
 * @returns {Array<{name: string, namespace: string, repoURL: string,
 *   targetRevision: string, destServer: string, file: string,
 *   location: string, repoName: string}>} One record per manifest in
 *   directory-walk order; `file` is relative to `srcRoot`.
 */
function extractArgoCDAppSets(srcRoot) {
  const skipDirs = new Set(['node_modules', '.git', 'venv', '.terraform', '__pycache__']);
  const appsets = [];

  // First capture group of `re` in `text`, or undefined when there is no match.
  const firstCapture = (text, re) => text.match(re)?.[1];

  // Read one manifest and build its summary record.
  const summarize = (full, fileName) => {
    const content = fs.readFileSync(full, 'utf8');
    const relPath = path.relative(srcRoot, full);
    const repoURL = firstCapture(content, /repoURL:\s*['"]?([^\s'"]+)/) || '';

    // Path-based heuristic. The lookbehind keeps "github" (e.g. a
    // `github-runners/` directory) from being mistaken for a hub location.
    const isHub = /(?<!git)hub/.test(relPath) || relPath.includes('control-plane');

    return {
      name: firstCapture(content, /name:\s*['"]?([^\s'"]+)/) || fileName,
      namespace: firstCapture(content, /namespace:\s*['"]?([^\s'"]+)/) || '',
      repoURL,
      targetRevision: firstCapture(content, /targetRevision:\s*['"]?([^\s'"]+)/) || '',
      destServer: firstCapture(content, /server:\s*['"]?([^\s'"]+)/) || '',
      file: relPath,
      location: isHub ? 'hub' : 'spoke',
      // Last URL path segment without a trailing ".git"; falls back to the raw URL.
      repoName: firstCapture(repoURL, /\/([^/]+?)(?:\.git)?$/) || repoURL,
    };
  };

  const walkDir = (dir) => {
    let entries;
    try {
      entries = fs.readdirSync(dir, { withFileTypes: true });
    } catch {
      // Best-effort scan: an unreadable directory is skipped, not fatal.
      return;
    }
    for (const entry of entries) {
      const full = path.join(dir, entry.name);
      if (entry.isDirectory()) {
        if (!skipDirs.has(entry.name)) walkDir(full);
      } else if (entry.name.endsWith('-appset.yaml') || entry.name.endsWith('-appset.yml')) {
        try {
          appsets.push(summarize(full, entry.name));
        } catch {
          // Best-effort scan: an unreadable manifest is skipped.
        }
      }
    }
  };

  walkDir(srcRoot);
  return appsets;
}
/**
 * Collect the cloud regions mentioned in Terraform files and Helm values.
 *
 * Scans every `*.tf` file and every `values.yaml` under `srcRoot` (skipping
 * vendored/tooling directories) for a fixed list of well-known AWS, Azure and
 * GCP region names. Regions outside those lists are not reported.
 *
 * Matching is textual, so the patterns are guarded against the known cases
 * where a listed region is a substring of a different real region:
 *  - Azure `centralus` inside `northcentralus` / `southcentralus` / `westcentralus`
 *  - GCP `europe-west1` inside `europe-west10` / `europe-west12` (and similar)
 * Suffixes such as availability zones (`us-east-1a`, `us-central1-a`) still
 * count as a mention of the region.
 *
 * @param {string} srcRoot - Root directory to scan.
 * @returns {{aws: string[], azure: string[], gcp: string[]}} De-duplicated,
 *   alphabetically sorted region names per provider.
 */
function extractCloudRegions(srcRoot) {
  const skipDirs = new Set(['node_modules', '.git', 'venv', '.terraform', '__pycache__']);

  const awsRe = /us-east-[12]|us-west-[12]|eu-west-[123]|eu-central-[12]|ap-southeast-[12]|ap-northeast-[123]/g;
  // Lookbehind: do not report "centralus" for north/south/west-centralus.
  const azureRe = /(?:(?<!north|south|west)centralus|eastus[2]?|westus[23]?|westeurope|northeurope|southeastasia|australiaeast)/g;
  // Lookahead: do not report e.g. "europe-west1" for "europe-west12".
  const gcpRe = /(?:us-central1|us-east[14]|us-west[14]|europe-west[1-6]|asia-east[12]|asia-southeast[12])(?!\d)/g;

  const regions = { aws: new Set(), azure: new Set(), gcp: new Set() };

  // Add every match of `re` in `content` to `bucket`.
  const collect = (content, re, bucket) => {
    for (const [region] of content.matchAll(re)) bucket.add(region);
  };

  const walkDir = (dir) => {
    let entries;
    try {
      entries = fs.readdirSync(dir, { withFileTypes: true });
    } catch {
      // Best-effort scan: an unreadable directory is skipped, not fatal.
      return;
    }
    for (const entry of entries) {
      const full = path.join(dir, entry.name);
      if (entry.isDirectory()) {
        if (!skipDirs.has(entry.name)) walkDir(full);
      } else if (entry.name.endsWith('.tf') || entry.name === 'values.yaml') {
        // `variables.tf` is already covered by the `.tf` suffix check.
        try {
          const content = fs.readFileSync(full, 'utf8');
          collect(content, awsRe, regions.aws);
          collect(content, azureRe, regions.azure);
          collect(content, gcpRe, regions.gcp);
        } catch {
          // Best-effort scan: an unreadable file is skipped.
        }
      }
    }
  };

  walkDir(srcRoot);
  return {
    aws: [...regions.aws].sort(),
    azure: [...regions.azure].sort(),
    gcp: [...regions.gcp].sort(),
  };
}
/**
 * Collect CIDR blocks mentioned in Terraform sources and variable files.
 *
 * Scans every `*.tf` and `*.tfvars` file under `srcRoot` (skipping
 * vendored/tooling directories). A CIDR is recorded when, on the same line,
 * it is preceded by one of the keywords `cidr`, `CIDR`, `subnet` or `network`.
 * Every CIDR that follows a keyword on the line is recorded, so list values
 * such as `cidr_blocks = ["10.0.0.0/16", "10.1.0.0/16"]` yield both entries.
 *
 * The `context` of a reference is the line text from the governing keyword up
 * to and including the CIDR, truncated to 100 characters. The governing
 * keyword is the first keyword found after the previous CIDR on the line, or
 * the previous CIDR's keyword when none appears in between.
 *
 * Octet and prefix-length ranges are not validated.
 *
 * @param {string} srcRoot - Root directory to scan.
 * @returns {Array<{cidr: string, refs: Array<{context: string, file: string}>}>}
 *   One entry per distinct CIDR in first-seen order, with every place it was
 *   found. `file` is relative to `srcRoot`.
 */
function extractCIDRAllocations(srcRoot) {
  const skipDirs = new Set(['node_modules', '.git', 'venv', '.terraform', '__pycache__']);
  const keywordRe = /cidr|CIDR|subnet|network/;
  const cidrRe = /\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\/\d{1,2}/g;
  const lineBreakRe = /\r\n|[\n\r\u2028\u2029]/;

  // cidr -> [{ context, file }], insertion-ordered so output order is first-seen.
  const refsByCidr = new Map();

  // Record every keyword-governed CIDR on a single line.
  const scanLine = (line, file) => {
    let contextStart = -1; // index of the keyword governing the current CIDR
    let cursor = 0; // end of the previous CIDR on this line

    for (const hit of line.matchAll(cidrRe)) {
      // A keyword between the previous CIDR and this one starts a new context.
      const keywordAt = line.slice(cursor, hit.index).search(keywordRe);
      if (keywordAt !== -1) contextStart = cursor + keywordAt;
      cursor = hit.index + hit[0].length;

      // CIDRs that appear before any keyword on the line are ignored.
      if (contextStart === -1) continue;

      const context = line.slice(contextStart, cursor).trim().substring(0, 100);
      const refs = refsByCidr.get(hit[0]);
      if (refs) refs.push({ context, file });
      else refsByCidr.set(hit[0], [{ context, file }]);
    }
  };

  const walkDir = (dir) => {
    let entries;
    try {
      entries = fs.readdirSync(dir, { withFileTypes: true });
    } catch {
      // Best-effort scan: an unreadable directory is skipped, not fatal.
      return;
    }
    for (const entry of entries) {
      const full = path.join(dir, entry.name);
      if (entry.isDirectory()) {
        if (!skipDirs.has(entry.name)) walkDir(full);
      } else if (entry.name.endsWith('.tf') || entry.name.endsWith('.tfvars')) {
        try {
          const content = fs.readFileSync(full, 'utf8');
          const relPath = path.relative(srcRoot, full);
          for (const line of content.split(lineBreakRe)) scanLine(line, relPath);
        } catch {
          // Best-effort scan: an unreadable file is skipped.
        }
      }
    }
  };

  walkDir(srcRoot);
  return [...refsByCidr].map(([cidr, refs]) => ({ cidr, refs }));
}
/**
 * Collect lines that spell out a resource naming convention.
 *
 * Scans `*.sh`, `*.py` and `*.tf` files under `srcRoot` (skipping
 * vendored/tooling directories). A trimmed line is kept when either
 *  - it mentions "convention", "naming" or "format" (case-insensitive) and
 *    contains one of the placeholders {phase}, {region}, {cloud}, {index},
 *    {env}; or
 *  - it contains a cloud-prefixed template such as `aws-{...}-{...}`
 *    (prefixes: aws, azr, gcp).
 *
 * @param {string} srcRoot - Root directory to scan.
 * @returns {Array<{pattern: string, file: string}>} Distinct lines (first 200
 *   characters each) in discovery order; `file` is the first file, relative
 *   to `srcRoot`, in which the line was seen.
 */
function extractNamingConventions(srcRoot) {
  const ignoredDirs = ['node_modules', '.git', 'venv', '.terraform', '__pycache__'];
  const scannedSuffixes = ['.sh', '.py', '.tf'];
  const seenPatterns = new Set();
  const found = [];

  // Keep only the first occurrence of each (truncated) line.
  const remember = (text, file) => {
    const pattern = text.substring(0, 200);
    if (seenPatterns.has(pattern)) return;
    seenPatterns.add(pattern);
    found.push({ pattern, file });
  };

  const scanFile = (full) => {
    const relPath = path.relative(srcRoot, full);
    for (const rawLine of fs.readFileSync(full, 'utf8').split('\n')) {
      const text = rawLine.trim();
      // A convention keyword together with a known placeholder.
      const describesConvention =
        /convention|naming|format/i.test(text) &&
        /\{(phase|region|cloud|index|env)\}/i.test(text);
      // An explicit example like "aws-{phase}-{region-code}-{index}-vpc".
      const looksLikeTemplate = /(?:aws|azr|gcp)-\{.*\}-\{.*\}/.test(text);
      if (describesConvention || looksLikeTemplate) remember(text, relPath);
    }
  };

  const visit = (dir) => {
    let entries;
    try {
      entries = fs.readdirSync(dir, { withFileTypes: true });
    } catch {
      // Unreadable directory: skip it.
      return;
    }
    for (const entry of entries) {
      const full = path.join(dir, entry.name);
      if (entry.isDirectory()) {
        if (!ignoredDirs.includes(entry.name)) visit(full);
        continue;
      }
      if (!scannedSuffixes.some((suffix) => entry.name.endsWith(suffix))) continue;
      try {
        scanFile(full);
      } catch {
        // Unreadable file: skip it.
      }
    }
  };

  visit(srcRoot);
  return found;
}
/**
 * Collect ArgoCD sync-wave annotations from YAML manifests and Helm templates.
 *
 * Scans every `*.yaml` / `*.yml` file under `srcRoot` (skipping
 * vendored/tooling directories). Each file is split on YAML document
 * separators (`---`) and every document is inspected on its own, so a
 * multi-resource file contributes one entry per annotated resource and the
 * `kind` / `name` reported always come from the same document as the
 * annotation. Values are extracted with first-match regexes, not a YAML
 * parser.
 *
 * @param {string} srcRoot - Root directory to scan.
 * @returns {Array<{wave: number, resources: Array<{kind: string, name: string,
 *   file: string}>}>} One entry per distinct wave number, sorted ascending.
 *   `kind` falls back to 'Unknown' and `name` to the file name when the
 *   document has no such key. `file` is relative to `srcRoot`.
 */
function extractSyncWaves(srcRoot) {
  const skipDirs = new Set(['node_modules', '.git', 'venv', '.terraform', '__pycache__']);
  // A separator line: `---`, optionally followed by spaces and/or a comment.
  const docSeparatorRe = /^---[ \t]*(?:#.*)?$/m;
  const resourcesByWave = new Map();

  // Inspect one YAML document; record it if it carries a sync-wave annotation.
  const scanDocument = (doc, relPath, fileName) => {
    const waveMatch = doc.match(/sync-wave:\s*["']?(-?\d+)["']?/);
    if (!waveMatch) return;

    const wave = Number.parseInt(waveMatch[1], 10);
    const kind = doc.match(/kind:\s*(\w+)/)?.[1] || 'Unknown';
    const name = doc.match(/name:\s*['"]?([^\s'"]+)/)?.[1] || fileName;

    const bucket = resourcesByWave.get(wave);
    if (bucket) bucket.push({ kind, name, file: relPath });
    else resourcesByWave.set(wave, [{ kind, name, file: relPath }]);
  };

  const walkDir = (dir) => {
    let entries;
    try {
      entries = fs.readdirSync(dir, { withFileTypes: true });
    } catch {
      // Best-effort scan: an unreadable directory is skipped, not fatal.
      return;
    }
    for (const entry of entries) {
      const full = path.join(dir, entry.name);
      if (entry.isDirectory()) {
        if (!skipDirs.has(entry.name)) walkDir(full);
      } else if (entry.name.endsWith('.yaml') || entry.name.endsWith('.yml')) {
        try {
          const content = fs.readFileSync(full, 'utf8');
          const relPath = path.relative(srcRoot, full);
          for (const doc of content.split(docSeparatorRe)) {
            scanDocument(doc, relPath, entry.name);
          }
        } catch {
          // Best-effort scan: an unreadable file is skipped.
        }
      }
    }
  };

  walkDir(srcRoot);
  return [...resourcesByWave]
    .sort(([waveA], [waveB]) => waveA - waveB)
    .map(([wave, resources]) => ({ wave, resources }));
}
/**
 * Summarize the tech stack from Helm values files and package.json manifests.
 *
 * Under `srcRoot` (skipping vendored/tooling directories):
 *  - every `values.yaml` is scanned for `image: <ref>` and
 *    `repository: <ref>` values written on the same line as the key.
 *    Templated values (containing `{{` or `__helm`) and comment markers are
 *    ignored, and a `repository:` value is only kept when it contains a `/`.
 *    Requiring the value on the key's own line matters for the common Helm
 *    layout where `image:` is a map: the nested `repository:` key must not be
 *    recorded as if it were the image.
 *  - every `package.json` contributes its `dependencies` entries.
 *
 * @param {string} srcRoot - Root directory to scan.
 * @returns {{containerImages: string[], npmPackages: Array<{name: string,
 *   count: number, versions: string[]}>}} `containerImages` is de-duplicated
 *   and sorted. `npmPackages` holds at most the 30 dependencies referenced by
 *   the most package.json files, with the distinct version ranges seen.
 */
function extractTechStack(srcRoot) {
  const skipDirs = new Set(['node_modules', '.git', 'venv', '.terraform', '__pycache__']);
  const images = new Set();
  // A Map (not a plain object) so dependency names such as "constructor"
  // cannot collide with Object.prototype members.
  const packages = new Map();

  // True for a concrete reference: not empty, not a comment, not templated.
  const isLiteralRef = (value) =>
    value !== '' && !value.startsWith('#') && !value.includes('{{') && !value.includes('__helm');

  const scanValues = (full) => {
    const content = fs.readFileSync(full, 'utf8');
    // `[ \t]*` (not `\s*`) keeps the value on the same line as its key.
    for (const [, img] of content.matchAll(/image:[ \t]*['"]?([^\s'"]+)/g)) {
      if (isLiteralRef(img)) images.add(img);
    }
    for (const [, repo] of content.matchAll(/repository:[ \t]*['"]?([^\s'"]+)/g)) {
      if (isLiteralRef(repo) && repo.includes('/')) images.add(repo);
    }
  };

  const scanPackageJson = (full) => {
    const pkg = JSON.parse(fs.readFileSync(full, 'utf8'));
    const relPath = path.relative(srcRoot, full);
    if (!pkg.dependencies) return;
    for (const [name, version] of Object.entries(pkg.dependencies)) {
      const refs = packages.get(name);
      if (refs) refs.push({ version, file: relPath });
      else packages.set(name, [{ version, file: relPath }]);
    }
  };

  const walkDir = (dir) => {
    let entries;
    try {
      entries = fs.readdirSync(dir, { withFileTypes: true });
    } catch {
      // Best-effort scan: an unreadable directory is skipped, not fatal.
      return;
    }
    for (const entry of entries) {
      const full = path.join(dir, entry.name);
      if (entry.isDirectory()) {
        if (!skipDirs.has(entry.name)) walkDir(full);
        continue;
      }
      try {
        if (entry.name === 'values.yaml') scanValues(full);
        else if (entry.name === 'package.json') scanPackageJson(full);
      } catch {
        // Best-effort scan: unreadable or malformed files are skipped.
      }
    }
  };

  walkDir(srcRoot);

  const npmPackages = [...packages]
    .sort(([, refsA], [, refsB]) => refsB.length - refsA.length)
    .slice(0, 30)
    .map(([name, refs]) => ({
      name,
      count: refs.length,
      versions: [...new Set(refs.map((ref) => ref.version))],
    }));

  return { containerImages: [...images].sort(), npmPackages };
}
/**
 * Run every pattern extractor against `srcRoot` and bundle the results.
 *
 * A one-line progress summary is written to stdout after each extractor
 * finishes.
 *
 * @param {string} srcRoot - Root directory handed to each extractor.
 * @returns {{layers: Array, appsets: Array, regions: Object, cidrs: Array,
 *   naming: Array, syncWaves: Array, techStack: Object}} The individual
 *   extractor results keyed by pattern type.
 */
function extractAllPatterns(srcRoot) {
  const result = {};
  console.log('Extracting architectural patterns...');

  result.layers = extractLayers(srcRoot);
  console.log(` Layers: ${result.layers.length}`);

  result.appsets = extractArgoCDAppSets(srcRoot);
  console.log(` ApplicationSets: ${result.appsets.length}`);

  result.regions = extractCloudRegions(srcRoot);
  const { aws, azure, gcp } = result.regions;
  console.log(` Regions: AWS=${aws.length} Azure=${azure.length} GCP=${gcp.length}`);

  result.cidrs = extractCIDRAllocations(srcRoot);
  console.log(` CIDR allocations: ${result.cidrs.length}`);

  result.naming = extractNamingConventions(srcRoot);
  console.log(` Naming conventions: ${result.naming.length}`);

  result.syncWaves = extractSyncWaves(srcRoot);
  console.log(` Sync waves: ${result.syncWaves.length} distinct waves`);

  result.techStack = extractTechStack(srcRoot);
  const { containerImages, npmPackages } = result.techStack;
  console.log(` Container images: ${containerImages.length}, NPM packages: ${npmPackages.length}`);

  return result;
}
module.exports = { extractAllPatterns, extractLayers, extractArgoCDAppSets, extractCloudRegions, extractCIDRAllocations, extractNamingConventions, extractSyncWaves, extractTechStack };