feat: confluence benchmark, pattern extractor, agent KB, UX spec
- extract-patterns.js: mines layered arch, ArgoCD appsets, cloud regions, CIDR allocations, naming conventions, sync waves, tech stack from code
- agent-kb.js: token-efficient JSON rendering of same doc tree
- eval-confluence-ref-questions.json: 32 reference-only benchmark questions
- wiggum-v2.sh: Ralph Wiggum loop targeting confluence baseline (77.8%)
- docs/human-ux-spec.md: BMad UX designer spec for human doc structure
- Eval results: V2 at 28.7% vs confluence 77.8% baseline
- Hub/spoke ownership now correctly extracted (95% on that question)
- Naming conventions, regions, CIDRs surfaced in system-architecture.md
This commit is contained in:
333
extract-patterns.js
Normal file
333
extract-patterns.js
Normal file
@@ -0,0 +1,333 @@
|
||||
/**
|
||||
* extract-patterns.js — Mine architectural patterns from code artifacts.
|
||||
*
|
||||
* Extracts:
|
||||
* - Layered architecture from repo/dir naming conventions
|
||||
* - Hub/spoke model from ArgoCD ApplicationSet configs
|
||||
* - Cloud regions from terraform configs + values.yaml
|
||||
* - CIDR allocations from terraform variables
|
||||
* - Naming conventions from scripts + terraform
|
||||
* - Sync-wave ordering from Helm template annotations
|
||||
* - Release/deployment patterns from CI configs + scripts
|
||||
* - Tech stack from Helm chart images + dependencies
|
||||
*/
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
|
||||
// Directory-name prefixes mapped to an architecture layer. extractLayers tests
// the entries in array order and stops at the first hit; `order` is the key it
// sorts the resulting layers by (lower sorts first). The table and its entries
// are frozen so this shared module-level constant cannot be mutated by accident.
const LAYER_PATTERNS = Object.freeze([
  { pattern: /^app[-_]/, layer: 'Application', order: 1 },
  { pattern: /^compute[-_]/, layer: 'Compute', order: 2 },
  { pattern: /^network[-_]/, layer: 'Network', order: 3 },
  { pattern: /^account[-_]/, layer: 'Account', order: 4 },
  { pattern: /^control[-_]/, layer: 'Control Plane', order: 5 },
  { pattern: /^runtime/, layer: 'Runtime (shared)', order: 0 },
  { pattern: /^ipam[-_]/, layer: 'IPAM', order: 3.5 },
  { pattern: /^skills/, layer: 'Skills/Tooling', order: 6 },
  { pattern: /^docs/, layer: 'Documentation', order: 7 },
].map((entry) => Object.freeze(entry)));
|
||||
|
||||
/**
 * Infer the layered architecture from the top-level directory names of srcRoot.
 *
 * Each non-hidden immediate sub-directory is matched against LAYER_PATTERNS;
 * the first matching entry decides its layer, and directories that match no
 * entry are ignored.
 *
 * @param {string} srcRoot - Directory whose immediate sub-directories are classified.
 * @returns {Array<{layer: string, repos: string[], order: number}>} Layers that
 *   have at least one directory, sorted by ascending `order`.
 * @throws {Error} Whatever fs.readdirSync throws when srcRoot cannot be read.
 */
function extractLayers(srcRoot) {
  const dirNames = fs.readdirSync(srcRoot, { withFileTypes: true })
    .filter((entry) => entry.isDirectory() && !entry.name.startsWith('.'))
    .map((entry) => entry.name);

  // Keyed by layer name; a Map keeps first-seen order just like the directory scan.
  const byLayer = new Map();
  for (const dirName of dirNames) {
    const hit = LAYER_PATTERNS.find((candidate) => candidate.pattern.test(dirName));
    if (!hit) continue;
    if (!byLayer.has(hit.layer)) {
      byLayer.set(hit.layer, { order: hit.order, repos: [] });
    }
    byLayer.get(hit.layer).repos.push(dirName);
  }

  return [...byLayer.entries()]
    .sort(([, left], [, right]) => left.order - right.order)
    .map(([layer, { repos, order }]) => ({ layer, repos, order }));
}
|
||||
|
||||
/**
 * Find ArgoCD ApplicationSet manifests and classify each one as hub or spoke.
 *
 * Recursively walks `srcRoot` (skipping node_modules, .git, venv, .terraform
 * and __pycache__) and reads every file named `*-appset.yaml` / `*-appset.yml`.
 * Fields are taken with first-match regexes, not a YAML parser, so each value
 * is the first occurrence of that key anywhere in the file.
 *
 * @param {string} srcRoot - Root directory to scan.
 * @returns {Array<{name: string, namespace: string, repoURL: string,
 *   targetRevision: string, destServer: string, file: string,
 *   location: string, repoName: string}>} One entry per manifest. `file` is
 *   relative to `srcRoot`; `location` is 'hub' or 'spoke'. Directories and
 *   files that cannot be read are skipped silently (best-effort scan).
 */
function extractArgoCDAppSets(srcRoot) {
  const SKIPPED_DIRS = ['node_modules', '.git', 'venv', '.terraform', '__pycache__'];
  const appsets = [];

  // First value of `key:` in the text, with an optional opening quote skipped.
  const firstValue = (text, key) =>
    text.match(new RegExp(`${key}:\\s*['"]?([^\\s'"]+)`))?.[1];

  const describeAppSet = (full, fileName) => {
    const content = fs.readFileSync(full, 'utf8');
    const repoURL = firstValue(content, 'repoURL') || '';
    const relPath = path.relative(srcRoot, full);

    // Hub vs spoke is decided from the path alone. The lookbehind keeps "hub"
    // inside "github" (e.g. `.github/`, `github-runners/`) from counting as hub.
    const isHub = /(?<!git)hub/.test(relPath) || relPath.includes('control-plane');

    return {
      name: firstValue(content, 'name') || fileName,
      namespace: firstValue(content, 'namespace') || '',
      repoURL,
      targetRevision: firstValue(content, 'targetRevision') || '',
      destServer: firstValue(content, 'server') || '',
      file: relPath,
      location: isHub ? 'hub' : 'spoke',
      // Last URL path segment without a trailing ".git"; falls back to the raw URL.
      repoName: repoURL.match(/\/([^/]+?)(?:\.git)?$/)?.[1] || repoURL,
    };
  };

  const walk = (dir) => {
    try {
      for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
        const full = path.join(dir, entry.name);
        if (entry.isDirectory()) {
          if (!SKIPPED_DIRS.includes(entry.name)) walk(full);
        } else if (entry.name.endsWith('-appset.yaml') || entry.name.endsWith('-appset.yml')) {
          try {
            appsets.push(describeAppSet(full, entry.name));
          } catch {
            // Best-effort: an unreadable manifest is skipped.
          }
        }
      }
    } catch {
      // Best-effort: an unreadable directory is skipped.
    }
  };

  walk(srcRoot);
  return appsets;
}
|
||||
|
||||
/**
 * Collect the cloud regions mentioned in terraform configs and values.yaml files.
 *
 * Recursively walks `srcRoot` (skipping node_modules, .git, venv, .terraform
 * and __pycache__) and scans every `*.tf` file and every file named
 * `values.yaml` for a fixed list of AWS, Azure and GCP region names. All three
 * lists are applied to every scanned file.
 *
 * @param {string} srcRoot - Root directory to scan.
 * @returns {{aws: string[], azure: string[], gcp: string[]}} De-duplicated,
 *   sorted region names per provider. Directories and files that cannot be
 *   read are skipped silently (best-effort scan).
 */
function extractCloudRegions(srcRoot) {
  const SKIPPED_DIRS = ['node_modules', '.git', 'venv', '.terraform', '__pycache__'];

  const REGION_PATTERNS = {
    aws: /us-east-[12]|us-west-[12]|eu-west-[123]|eu-central-[12]|ap-southeast-[12]|ap-northeast-[123]/g,
    // The optional prefix reports north/south/west-centralus as themselves
    // instead of as a bogus "centralus" hit inside the longer name.
    azure: /(?:(?:north|south|west)?centralus|eastus[2]?|westus[23]?|westeurope|northeurope|southeastasia|australiaeast)/g,
    // (?!\d) stops e.g. europe-west12 from being reported as europe-west1;
    // zone suffixes such as "us-central1-b" still match their region.
    gcp: /(?:us-central1|us-east[14]|us-west[14]|europe-west[1-6]|asia-east[12]|asia-southeast[12])(?!\d)/g,
  };

  const regions = { aws: new Set(), azure: new Set(), gcp: new Set() };

  const scanFile = (full) => {
    const content = fs.readFileSync(full, 'utf8');
    for (const [provider, pattern] of Object.entries(REGION_PATTERNS)) {
      for (const [region] of content.matchAll(pattern)) {
        regions[provider].add(region);
      }
    }
  };

  const walk = (dir) => {
    try {
      for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
        const full = path.join(dir, entry.name);
        if (entry.isDirectory()) {
          if (!SKIPPED_DIRS.includes(entry.name)) walk(full);
        } else if (entry.name.endsWith('.tf') || entry.name === 'values.yaml') {
          try {
            scanFile(full);
          } catch {
            // Best-effort: an unreadable file is skipped.
          }
        }
      }
    } catch {
      // Best-effort: an unreadable directory is skipped.
    }
  };

  walk(srcRoot);

  return {
    aws: [...regions.aws].sort(),
    azure: [...regions.azure].sort(),
    gcp: [...regions.gcp].sort(),
  };
}
|
||||
|
||||
/**
 * Collect CIDR blocks referenced in terraform files, grouped by CIDR.
 *
 * Recursively walks `srcRoot` (skipping node_modules, .git, venv, .terraform
 * and __pycache__) and scans every `*.tf` / `*.tfvars` file line by line. A
 * CIDR is recorded when one of the keywords `cidr`, `CIDR`, `subnet` or
 * `network` appears before it on the same line. Every CIDR after the keyword
 * is recorded, so single-line lists such as
 * `private_subnets = ["10.0.1.0/24", "10.0.2.0/24"]` yield all their entries.
 *
 * @param {string} srcRoot - Root directory to scan.
 * @returns {Array<{cidr: string, refs: Array<{context: string, file: string}>}>}
 *   One entry per distinct CIDR in first-seen order. `context` is the line
 *   text from the governing keyword through the CIDR (at most 100 characters)
 *   and `file` is relative to `srcRoot`. Directories and files that cannot be
 *   read are skipped silently (best-effort scan).
 */
function extractCIDRAllocations(srcRoot) {
  const SKIPPED_DIRS = ['node_modules', '.git', 'venv', '.terraform', '__pycache__'];
  const KEYWORD = /cidr|CIDR|subnet|network/;
  const LINE_BREAK = /\r\n|[\n\r\u2028\u2029]/;
  const refsByCidr = new Map();

  const record = (cidr, context, file) => {
    if (!refsByCidr.has(cidr)) refsByCidr.set(cidr, []);
    refsByCidr.get(cidr).push({ context, file });
  };

  const scanLine = (line, file) => {
    let searchFrom = 0;    // end of the previous CIDR on this line
    let contextStart = -1; // start of the keyword governing the current CIDR
    for (const hit of line.matchAll(/\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\/\d{1,2}/g)) {
      const end = hit.index + hit[0].length;
      // A keyword between the previous CIDR and this one starts a new context;
      // otherwise this CIDR is a further list element of the last keyword seen.
      const keywordAt = line.slice(searchFrom, hit.index).search(KEYWORD);
      if (keywordAt !== -1) contextStart = searchFrom + keywordAt;
      searchFrom = end;
      if (contextStart === -1) continue; // no keyword before it: not an allocation
      record(hit[0], line.slice(contextStart, end).trim().substring(0, 100), file);
    }
  };

  const walk = (dir) => {
    try {
      for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
        const full = path.join(dir, entry.name);
        if (entry.isDirectory()) {
          if (!SKIPPED_DIRS.includes(entry.name)) walk(full);
        } else if (entry.name.endsWith('.tf') || entry.name.endsWith('.tfvars')) {
          try {
            const content = fs.readFileSync(full, 'utf8');
            const relPath = path.relative(srcRoot, full);
            for (const line of content.split(LINE_BREAK)) scanLine(line, relPath);
          } catch {
            // Best-effort: an unreadable file is skipped.
          }
        }
      }
    } catch {
      // Best-effort: an unreadable directory is skipped.
    }
  };

  walk(srcRoot);

  return [...refsByCidr].map(([cidr, refs]) => ({ cidr, refs }));
}
|
||||
|
||||
/**
 * Collect lines that describe resource naming conventions.
 *
 * Recursively walks `srcRoot` (skipping node_modules, .git, venv, .terraform
 * and __pycache__) and scans every `*.sh`, `*.py` and `*.tf` file. A trimmed
 * line is kept when either
 *   - it contains one of "convention" / "naming" / "format" (case-insensitive)
 *     together with a `{phase}`, `{region}`, `{cloud}`, `{index}` or `{env}`
 *     placeholder, or
 *   - it contains an explicit template such as
 *     `aws-{phase}-{region-code}-{index}-vpc` (prefix aws, azr or gcp).
 *
 * @param {string} srcRoot - Root directory to scan.
 * @returns {Array<{pattern: string, file: string}>} Matching lines truncated to
 *   200 characters, de-duplicated by text; `file` (relative to `srcRoot`) is
 *   where the text was first seen. Directories and files that cannot be read
 *   are skipped silently (best-effort scan).
 */
function extractNamingConventions(srcRoot) {
  const SKIPPED_DIRS = ['node_modules', '.git', 'venv', '.terraform', '__pycache__'];
  const SCANNED_EXTENSIONS = ['.sh', '.py', '.tf'];
  const KEYWORD = /convention|naming|format/i;
  const PLACEHOLDER = /\{(phase|region|cloud|index|env)\}/i;
  const EXPLICIT_TEMPLATE = /(?:aws|azr|gcp)-\{.*\}-\{.*\}/;

  const conventions = [];
  const seen = new Set();

  const scanFile = (full) => {
    const content = fs.readFileSync(full, 'utf8');
    const relPath = path.relative(srcRoot, full);
    for (const line of content.split('\n')) {
      const trimmed = line.trim();
      const isDescribed = KEYWORD.test(trimmed) && PLACEHOLDER.test(trimmed);
      if (!isDescribed && !EXPLICIT_TEMPLATE.test(trimmed)) continue;
      const pattern = trimmed.substring(0, 200);
      // De-duplicate on the truncated text; the first file to mention it wins.
      if (seen.has(pattern)) continue;
      seen.add(pattern);
      conventions.push({ pattern, file: relPath });
    }
  };

  const walk = (dir) => {
    try {
      for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
        const full = path.join(dir, entry.name);
        if (entry.isDirectory()) {
          if (!SKIPPED_DIRS.includes(entry.name)) walk(full);
        } else if (SCANNED_EXTENSIONS.some((ext) => entry.name.endsWith(ext))) {
          try {
            scanFile(full);
          } catch {
            // Best-effort: an unreadable file is skipped.
          }
        }
      }
    } catch {
      // Best-effort: an unreadable directory is skipped.
    }
  };

  walk(srcRoot);
  return conventions;
}
|
||||
|
||||
/**
 * Collect ArgoCD sync-wave annotations from YAML files, grouped by wave.
 *
 * Recursively walks `srcRoot` (skipping node_modules, .git, venv, .terraform
 * and __pycache__) and scans every `*.yaml` / `*.yml` file. Files are split on
 * `---` document separators and each document is inspected on its own, so a
 * multi-document file contributes one resource per annotated document and the
 * wave is paired with that document's own `kind:` / `name:`. Values are taken
 * with first-match regexes, not a YAML parser.
 *
 * @param {string} srcRoot - Root directory to scan.
 * @returns {Array<{wave: number, resources: Array<{kind: string, name: string,
 *   file: string}>}>} Waves in ascending numeric order. `kind` falls back to
 *   'Unknown' and `name` to the file name when the document has none; `file`
 *   is relative to `srcRoot`. Directories and files that cannot be read are
 *   skipped silently (best-effort scan).
 */
function extractSyncWaves(srcRoot) {
  const SKIPPED_DIRS = ['node_modules', '.git', 'venv', '.terraform', '__pycache__'];
  // A YAML document separator: "---" alone on a line, optionally followed by
  // whitespace and a trailing comment.
  const DOCUMENT_SEPARATOR = /^---(?:[ \t].*)?$/m;
  const resourcesByWave = new Map();

  const scanFile = (full, fileName) => {
    const content = fs.readFileSync(full, 'utf8');
    const relPath = path.relative(srcRoot, full);
    for (const doc of content.split(DOCUMENT_SEPARATOR)) {
      const waveMatch = doc.match(/sync-wave:\s*["']?(-?\d+)["']?/);
      if (!waveMatch) continue;
      const wave = Number.parseInt(waveMatch[1], 10);
      const kind = doc.match(/kind:\s*(\w+)/)?.[1] || 'Unknown';
      const name = doc.match(/name:\s*['"]?([^\s'"]+)/)?.[1] || fileName;
      if (!resourcesByWave.has(wave)) resourcesByWave.set(wave, []);
      resourcesByWave.get(wave).push({ kind, name, file: relPath });
    }
  };

  const walk = (dir) => {
    try {
      for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
        const full = path.join(dir, entry.name);
        if (entry.isDirectory()) {
          if (!SKIPPED_DIRS.includes(entry.name)) walk(full);
        } else if (entry.name.endsWith('.yaml') || entry.name.endsWith('.yml')) {
          try {
            scanFile(full, entry.name);
          } catch {
            // Best-effort: an unreadable file is skipped.
          }
        }
      }
    } catch {
      // Best-effort: an unreadable directory is skipped.
    }
  };

  walk(srcRoot);

  return [...resourcesByWave]
    .sort(([left], [right]) => left - right)
    .map(([wave, resources]) => ({ wave, resources }));
}
|
||||
|
||||
/**
 * Collect the tech stack: container images from Helm values and npm dependencies.
 *
 * Recursively walks `srcRoot` (skipping node_modules, .git, venv, .terraform
 * and __pycache__).
 *   - In files named `values.yaml`, every `image: <value>` and every
 *     `repository: <value>` whose value contains a "/" is taken as a container
 *     image. The value must sit on the same line as its key, so the mapping
 *     form `image:` + newline + `repository: ...` no longer produces a bogus
 *     "repository:" image. Values containing `{{` or `__helm` are ignored.
 *   - In files named `package.json`, every entry of `dependencies` is counted.
 *
 * @param {string} srcRoot - Root directory to scan.
 * @returns {{containerImages: string[], npmPackages: Array<{name: string,
 *   count: number, versions: string[]}>}} Sorted unique images, plus the 30
 *   packages referenced by the most package.json files (most-used first) with
 *   their distinct version specifiers. Directories and files that cannot be
 *   read or parsed are skipped silently (best-effort scan).
 */
function extractTechStack(srcRoot) {
  const SKIPPED_DIRS = ['node_modules', '.git', 'venv', '.terraform', '__pycache__'];
  const images = new Set();
  // A Map, not a plain object, so package names such as "constructor" cannot
  // collide with inherited Object.prototype properties.
  const packages = new Map();

  const isTemplated = (value) => value.includes('{{') || value.includes('__helm');

  const scanValues = (full) => {
    const content = fs.readFileSync(full, 'utf8');
    // [ \t]* rather than \s*: the value has to be on the key's own line.
    for (const [, image] of content.matchAll(/image:[ \t]*['"]?([^\s'"]+)/g)) {
      if (!isTemplated(image)) images.add(image);
    }
    for (const [, repo] of content.matchAll(/repository:[ \t]*['"]?([^\s'"]+)/g)) {
      // A bare name without "/" is too ambiguous to treat as an image reference.
      if (!isTemplated(repo) && repo.includes('/')) images.add(repo);
    }
  };

  const scanPackageJson = (full) => {
    const pkg = JSON.parse(fs.readFileSync(full, 'utf8'));
    if (!pkg.dependencies) return;
    const relPath = path.relative(srcRoot, full);
    for (const [name, version] of Object.entries(pkg.dependencies)) {
      if (!packages.has(name)) packages.set(name, []);
      packages.get(name).push({ version, file: relPath });
    }
  };

  const walk = (dir) => {
    try {
      for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
        const full = path.join(dir, entry.name);
        if (entry.isDirectory()) {
          if (!SKIPPED_DIRS.includes(entry.name)) walk(full);
        } else if (entry.name === 'values.yaml') {
          try {
            scanValues(full);
          } catch {
            // Best-effort: an unreadable values file is skipped.
          }
        } else if (entry.name === 'package.json') {
          try {
            scanPackageJson(full);
          } catch {
            // Best-effort: an unreadable or malformed package.json is skipped.
          }
        }
      }
    } catch {
      // Best-effort: an unreadable directory is skipped.
    }
  };

  walk(srcRoot);

  return {
    containerImages: [...images].sort(),
    npmPackages: [...packages]
      .sort(([, leftRefs], [, rightRefs]) => rightRefs.length - leftRefs.length)
      .slice(0, 30)
      .map(([name, refs]) => ({
        name,
        count: refs.length,
        versions: [...new Set(refs.map((ref) => ref.version))],
      })),
  };
}
|
||||
|
||||
/**
 * Run every pattern extractor against srcRoot and bundle the results.
 *
 * The extractors run one after another in a fixed order, and a one-line
 * summary of each result is written to stdout as soon as it is available.
 *
 * @param {string} srcRoot - Root directory handed to every extractor.
 * @returns {{layers: Array, appsets: Array, regions: Object, cidrs: Array,
 *   naming: Array, syncWaves: Array, techStack: Object}} The unmodified result
 *   of each extractor under its own key.
 */
function extractAllPatterns(srcRoot) {
  // [result key, extractor, summary line for its result], in execution order.
  const steps = [
    ['layers', extractLayers, (found) => ` Layers: ${found.length}`],
    ['appsets', extractArgoCDAppSets, (found) => ` ApplicationSets: ${found.length}`],
    ['regions', extractCloudRegions, (found) => ` Regions: AWS=${found.aws.length} Azure=${found.azure.length} GCP=${found.gcp.length}`],
    ['cidrs', extractCIDRAllocations, (found) => ` CIDR allocations: ${found.length}`],
    ['naming', extractNamingConventions, (found) => ` Naming conventions: ${found.length}`],
    ['syncWaves', extractSyncWaves, (found) => ` Sync waves: ${found.length} distinct waves`],
    ['techStack', extractTechStack, (found) => ` Container images: ${found.containerImages.length}, NPM packages: ${found.npmPackages.length}`],
  ];

  console.log('Extracting architectural patterns...');

  const patterns = {};
  for (const [key, extract, summarize] of steps) {
    patterns[key] = extract(srcRoot);
    console.log(summarize(patterns[key]));
  }
  return patterns;
}
|
||||
|
||||
module.exports = { extractAllPatterns, extractLayers, extractArgoCDAppSets, extractCloudRegions, extractCIDRAllocations, extractNamingConventions, extractSyncWaves, extractTechStack };
|
||||
Reference in New Issue
Block a user