feat: confluence benchmark, pattern extractor, agent KB, UX spec

- extract-patterns.js: mines layered arch, ArgoCD appsets, cloud regions, CIDR allocations, naming conventions, sync waves, tech stack from code
- agent-kb.js: token-efficient JSON rendering of same doc tree
- eval-confluence-ref-questions.json: 32 reference-only benchmark questions
- wiggum-v2.sh: Ralph Wiggum loop targeting confluence baseline (77.8%)
- docs/human-ux-spec.md: BMad UX designer spec for human doc structure
- Eval results: V2 at 28.7% vs confluence 77.8% baseline
- Hub/spoke ownership now correctly extracted (95% on that question)
- Naming conventions, regions, CIDRs surfaced in system-architecture.md
2026-03-10 14:20:35 +00:00
parent 049609a358
commit 0265ec7a60
844 changed files with 2129910 additions and 30 deletions
--- a/extract-patterns.js
+++ b/extract-patterns.js
@@ -0,0 +1,333 @@
+/**
+ * extract-patterns.js — Mine architectural patterns from code artifacts.
+ * 
+ * Extracts:
+ * - Layered architecture from repo/dir naming conventions
+ * - Hub/spoke model from ArgoCD ApplicationSet configs
+ * - Cloud regions from terraform configs + values.yaml
+ * - CIDR allocations from terraform variables
+ * - Naming conventions from scripts + terraform
+ * - Sync-wave ordering from Helm template annotations
+ * - Release/deployment patterns from CI configs + scripts (not implemented by any extractor in this file)
+ * - Tech stack from Helm chart images + dependencies
+ */
+
+const fs = require('fs');
+const path = require('path');
+
+// Directory-name prefixes recognised as architecture layers, written as
+// [prefix regex, layer label, sort order] rows and expanded into
+// { pattern, layer, order } records. Row order matters to consumers that
+// stop at the first matching regex.
+const LAYER_PATTERNS = [
+  [/^app[-_]/, 'Application', 1],
+  [/^compute[-_]/, 'Compute', 2],
+  [/^network[-_]/, 'Network', 3],
+  [/^account[-_]/, 'Account', 4],
+  [/^control[-_]/, 'Control Plane', 5],
+  [/^runtime/, 'Runtime (shared)', 0],
+  [/^ipam[-_]/, 'IPAM', 3.5],
+  [/^skills/, 'Skills/Tooling', 6],
+  [/^docs/, 'Documentation', 7],
+].map(([pattern, layer, order]) => ({ pattern, layer, order }));
+
+/**
+ * Group the immediate subdirectories of srcRoot into architecture layers.
+ *
+ * Hidden directories (names starting with '.') are ignored, as are
+ * directories that match no LAYER_PATTERNS entry. A directory is assigned to
+ * the first entry whose regex matches its name.
+ *
+ * @param {string} srcRoot - Directory whose children are classified.
+ * @returns {Array<{layer: string, repos: string[], order: number}>} Layers
+ *   sorted by ascending order.
+ * @throws {Error} Whatever fs.readdirSync throws when srcRoot cannot be read.
+ */
+function extractLayers(srcRoot) {
+  const grouped = new Map();
+
+  for (const entry of fs.readdirSync(srcRoot, { withFileTypes: true })) {
+    if (!entry.isDirectory() || entry.name.startsWith('.')) continue;
+
+    const hit = LAYER_PATTERNS.find(({ pattern }) => pattern.test(entry.name));
+    if (hit === undefined) continue;
+
+    if (!grouped.has(hit.layer)) {
+      grouped.set(hit.layer, { layer: hit.layer, repos: [], order: hit.order });
+    }
+    grouped.get(hit.layer).repos.push(entry.name);
+  }
+
+  return [...grouped.values()].sort((left, right) => left.order - right.order);
+}
+
+/**
+ * Extract ArgoCD ApplicationSet configs to infer hub/spoke ownership.
+ *
+ * Recursively scans srcRoot for files named *-appset.yaml / *-appset.yml and
+ * reads a handful of fields out of each with regexes. The YAML is not parsed:
+ * for every key only the first line-leading occurrence in the file is used.
+ *
+ * @param {string} srcRoot - Directory to scan.
+ * @returns {Array<{name: string, namespace: string, repoURL: string,
+ *   targetRevision: string, destServer: string, file: string,
+ *   location: string, repoName: string}>} One record per appset file, in
+ *   directory-walk order. `file` is relative to srcRoot and `location` is
+ *   'hub' or 'spoke'. Unreadable directories and files are skipped.
+ */
+function extractArgoCDAppSets(srcRoot) {
+  const SKIPPED_DIRS = ['node_modules', '.git', 'venv', '.terraform', '__pycache__'];
+
+  // "hub" must not be the tail of a longer word: a bare substring test also
+  // fires on "github"/"dockerhub" and labels e.g. `.github/*-appset.yaml` hub.
+  const HUB_IN_PATH = /(?<![A-Za-z])hub/;
+
+  // First same-line scalar for `key:`. Anchoring to the start of a line keeps
+  // `name:` from matching inside keys such as `hostname:`, and `[ \t]*` (not
+  // `\s*`) keeps an empty value from swallowing the key on the next line.
+  const firstValue = (content, key) =>
+    content.match(new RegExp(`^[ \\t-]*${key}:[ \\t]*['"]?([^\\s'"]+)`, 'm'))?.[1];
+
+  const appsets = [];
+
+  const walkDir = (dir) => {
+    let entries;
+    try {
+      entries = fs.readdirSync(dir, { withFileTypes: true });
+    } catch {
+      return; // best-effort scan: unreadable directories are skipped
+    }
+
+    for (const entry of entries) {
+      const full = path.join(dir, entry.name);
+      if (entry.isDirectory()) {
+        if (!SKIPPED_DIRS.includes(entry.name)) walkDir(full);
+        continue;
+      }
+      if (!entry.name.endsWith('-appset.yaml') && !entry.name.endsWith('-appset.yml')) continue;
+
+      let content;
+      try {
+        content = fs.readFileSync(full, 'utf8');
+      } catch {
+        continue; // best-effort scan: unreadable files are skipped
+      }
+
+      const relPath = path.relative(srcRoot, full);
+      const repoURL = firstValue(content, 'repoURL') || '';
+      const isHub = HUB_IN_PATH.test(relPath) || relPath.includes('control-plane');
+
+      appsets.push({
+        name: firstValue(content, 'name') || entry.name,
+        namespace: firstValue(content, 'namespace') || '',
+        repoURL,
+        targetRevision: firstValue(content, 'targetRevision') || '',
+        destServer: firstValue(content, 'server') || '',
+        file: relPath,
+        location: isHub ? 'hub' : 'spoke',
+        // Last URL path segment without a trailing ".git"
+        repoName: repoURL.match(/\/([^/]+?)(?:\.git)?$/)?.[1] || repoURL,
+      });
+    }
+  };
+
+  walkDir(srcRoot);
+  return appsets;
+}
+
+/**
+ * Extract cloud regions from terraform configs and values.yaml files.
+ *
+ * Recursively scans srcRoot and searches every *.tf file and every file named
+ * values.yaml for a fixed list of AWS, Azure and GCP region names. Matching is
+ * textual: any occurrence in the file counts, not only region/location fields.
+ *
+ * @param {string} srcRoot - Directory to scan.
+ * @returns {{aws: string[], azure: string[], gcp: string[]}} Distinct region
+ *   names found per provider, each list sorted. Unreadable directories and
+ *   files are skipped.
+ */
+function extractCloudRegions(srcRoot) {
+  const SKIPPED_DIRS = ['node_modules', '.git', 'venv', '.terraform', '__pycache__'];
+
+  const REGION_PATTERNS = {
+    aws: /us-east-[12]|us-west-[12]|eu-west-[123]|eu-central-[12]|ap-southeast-[12]|ap-northeast-[123]/g,
+    // The lookbehind stops `centralus` being read out of the distinct regions
+    // northcentralus / southcentralus / westcentralus.
+    azure: /(?<!north|south|west)centralus|eastus2?|westus[23]?|westeurope|northeurope|southeastasia|australiaeast/g,
+    // (?!\d) stops e.g. `europe-west1` being read out of europe-west10/12.
+    gcp: /(?:us-central1|us-east[14]|us-west[14]|europe-west[1-6]|asia-east[12]|asia-southeast[12])(?!\d)/g,
+  };
+
+  const regions = { aws: new Set(), azure: new Set(), gcp: new Set() };
+
+  const walkDir = (dir) => {
+    let entries;
+    try {
+      entries = fs.readdirSync(dir, { withFileTypes: true });
+    } catch {
+      return; // best-effort scan: unreadable directories are skipped
+    }
+
+    for (const entry of entries) {
+      const full = path.join(dir, entry.name);
+      if (entry.isDirectory()) {
+        if (!SKIPPED_DIRS.includes(entry.name)) walkDir(full);
+        continue;
+      }
+      // variables.tf needs no separate check: it already ends with ".tf".
+      if (!entry.name.endsWith('.tf') && entry.name !== 'values.yaml') continue;
+
+      let content;
+      try {
+        content = fs.readFileSync(full, 'utf8');
+      } catch {
+        continue; // best-effort scan: unreadable files are skipped
+      }
+
+      for (const [provider, pattern] of Object.entries(REGION_PATTERNS)) {
+        for (const [region] of content.matchAll(pattern)) {
+          regions[provider].add(region);
+        }
+      }
+    }
+  };
+
+  walkDir(srcRoot);
+
+  return {
+    aws: [...regions.aws].sort(),
+    azure: [...regions.azure].sort(),
+    gcp: [...regions.gcp].sort(),
+  };
+}
+
+/**
+ * Extract CIDR allocations from terraform variables and configs.
+ *
+ * Recursively scans srcRoot for *.tf and *.tfvars files. On every line that
+ * contains one of the keywords cidr / CIDR / subnet / network, each valid
+ * IPv4 CIDR that appears after the first keyword is recorded. Scanning the
+ * whole remainder of the line means a one-line list such as
+ * `subnet_cidrs = ["10.0.1.0/24", "10.0.2.0/24"]` yields both blocks.
+ *
+ * @param {string} srcRoot - Directory to scan.
+ * @returns {Array<{cidr: string, refs: Array<{context: string, file: string}>}>}
+ *   One entry per distinct CIDR in first-seen order. `context` is the line
+ *   text from the keyword through the CIDR (at most 100 characters) and
+ *   `file` is relative to srcRoot. Unreadable directories and files are
+ *   skipped.
+ */
+function extractCIDRAllocations(srcRoot) {
+  const SKIPPED_DIRS = ['node_modules', '.git', 'venv', '.terraform', '__pycache__'];
+  const KEYWORD = /cidr|CIDR|subnet|network/;
+  const CIDR = /(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})\/(\d{1,2})/g;
+
+  // Reject look-alikes such as "999.1.1.1/40" that cannot be IPv4 CIDRs.
+  const isValidIPv4Cidr = ([, a, b, c, d, prefix]) =>
+    [a, b, c, d].every((octet) => Number(octet) <= 255) && Number(prefix) <= 32;
+
+  // Map keeps first-seen order and doubles as the de-duplication index.
+  const refsByCidr = new Map();
+
+  const scanFile = (full) => {
+    let content;
+    try {
+      content = fs.readFileSync(full, 'utf8');
+    } catch {
+      return; // best-effort scan: unreadable files are skipped
+    }
+    const relPath = path.relative(srcRoot, full);
+
+    for (const line of content.split(/\r?\n/)) {
+      const keywordAt = line.search(KEYWORD);
+      if (keywordAt === -1) continue;
+
+      // Only CIDRs that follow the keyword count, so start at the keyword.
+      const tail = line.slice(keywordAt);
+      for (const hit of tail.matchAll(CIDR)) {
+        if (!isValidIPv4Cidr(hit)) continue;
+        const cidr = hit[0];
+        const context = tail.slice(0, hit.index + cidr.length).substring(0, 100);
+        if (!refsByCidr.has(cidr)) refsByCidr.set(cidr, []);
+        refsByCidr.get(cidr).push({ context, file: relPath });
+      }
+    }
+  };
+
+  const walkDir = (dir) => {
+    let entries;
+    try {
+      entries = fs.readdirSync(dir, { withFileTypes: true });
+    } catch {
+      return; // best-effort scan: unreadable directories are skipped
+    }
+    for (const entry of entries) {
+      const full = path.join(dir, entry.name);
+      if (entry.isDirectory()) {
+        if (!SKIPPED_DIRS.includes(entry.name)) walkDir(full);
+      } else if (entry.name.endsWith('.tf') || entry.name.endsWith('.tfvars')) {
+        scanFile(full);
+      }
+    }
+  };
+
+  walkDir(srcRoot);
+
+  return [...refsByCidr].map(([cidr, refs]) => ({ cidr, refs }));
+}
+
+/**
+ * Collect lines from *.sh, *.py and *.tf files that look like naming rules.
+ *
+ * A trimmed line qualifies when it either
+ *   - mentions convention/naming/format (any case) and contains one of the
+ *     placeholders {phase} {region} {cloud} {index} {env} (any case), or
+ *   - contains a provider-prefixed template such as
+ *     "aws-{phase}-{region-code}-{index}-vpc" (aws/azr/gcp prefix).
+ *
+ * @param {string} srcRoot - Directory to scan recursively.
+ * @returns {Array<{pattern: string, file: string}>} Qualifying lines cut to
+ *   200 characters, de-duplicated on that text; `file` (relative to srcRoot)
+ *   is where the text was first seen. Unreadable directories and files are
+ *   skipped.
+ */
+function extractNamingConventions(srcRoot) {
+  const IGNORED_DIRS = new Set(['node_modules', '.git', 'venv', '.terraform', '__pycache__']);
+  const SCANNED_SUFFIXES = ['.sh', '.py', '.tf'];
+  const KEYWORD = /convention|naming|format/i;
+  const PLACEHOLDER = /\{(phase|region|cloud|index|env)\}/i;
+  const PROVIDER_TEMPLATE = /(?:aws|azr|gcp)-\{.*\}-\{.*\}/;
+
+  // Keyed by the truncated line text so only the first sighting is kept.
+  const firstSeen = new Map();
+
+  function harvest(filePath) {
+    let text;
+    try {
+      text = fs.readFileSync(filePath, 'utf8');
+    } catch {
+      return;
+    }
+    const file = path.relative(srcRoot, filePath);
+
+    for (const rawLine of text.split('\n')) {
+      const candidate = rawLine.trim();
+      const describesConvention = KEYWORD.test(candidate) && PLACEHOLDER.test(candidate);
+      if (!describesConvention && !PROVIDER_TEMPLATE.test(candidate)) continue;
+
+      const pattern = candidate.substring(0, 200);
+      if (!firstSeen.has(pattern)) firstSeen.set(pattern, { pattern, file });
+    }
+  }
+
+  function visit(dir) {
+    let children;
+    try {
+      children = fs.readdirSync(dir, { withFileTypes: true });
+    } catch {
+      return;
+    }
+    for (const child of children) {
+      const childPath = path.join(dir, child.name);
+      if (child.isDirectory()) {
+        if (!IGNORED_DIRS.has(child.name)) visit(childPath);
+      } else if (SCANNED_SUFFIXES.some((suffix) => child.name.endsWith(suffix))) {
+        harvest(childPath);
+      }
+    }
+  }
+
+  visit(srcRoot);
+  return [...firstSeen.values()];
+}
+
+/**
+ * Extract sync-wave ordering from Helm templates.
+ *
+ * Recursively scans srcRoot for *.yaml / *.yml files. Each file is split on
+ * YAML document separators (`---`) and every document carrying a
+ * `sync-wave:` value is recorded on its own, so the kind and name always come
+ * from the same document as the annotation and multi-document files
+ * contribute all of their waves. Fields are read with regexes (first match
+ * within the document); the YAML is not parsed.
+ *
+ * @param {string} srcRoot - Directory to scan.
+ * @returns {Array<{wave: number, resources: Array<{kind: string,
+ *   name: string, file: string}>}>} Waves in ascending numeric order. `kind`
+ *   falls back to 'Unknown' and `name` to the file name; `file` is relative
+ *   to srcRoot. Unreadable directories and files are skipped.
+ */
+function extractSyncWaves(srcRoot) {
+  const SKIPPED_DIRS = ['node_modules', '.git', 'venv', '.terraform', '__pycache__'];
+  // A line holding only `---`, optionally followed by blanks and a comment.
+  const DOC_SEPARATOR = /^---[ \t]*(?:#.*)?$/m;
+
+  const resourcesByWave = new Map();
+
+  const scanFile = (full, fileName) => {
+    let content;
+    try {
+      content = fs.readFileSync(full, 'utf8');
+    } catch {
+      return; // best-effort scan: unreadable files are skipped
+    }
+    const relPath = path.relative(srcRoot, full);
+
+    for (const doc of content.split(DOC_SEPARATOR)) {
+      const waveMatch = doc.match(/sync-wave:\s*["']?(-?\d+)["']?/);
+      if (!waveMatch) continue;
+
+      const wave = Number.parseInt(waveMatch[1], 10);
+      const kind = doc.match(/kind:\s*(\w+)/)?.[1] || 'Unknown';
+      const name = doc.match(/name:\s*['"]?([^\s'"]+)/)?.[1] || fileName;
+
+      if (!resourcesByWave.has(wave)) resourcesByWave.set(wave, []);
+      resourcesByWave.get(wave).push({ kind, name, file: relPath });
+    }
+  };
+
+  const walkDir = (dir) => {
+    let entries;
+    try {
+      entries = fs.readdirSync(dir, { withFileTypes: true });
+    } catch {
+      return; // best-effort scan: unreadable directories are skipped
+    }
+    for (const entry of entries) {
+      const full = path.join(dir, entry.name);
+      if (entry.isDirectory()) {
+        if (!SKIPPED_DIRS.includes(entry.name)) walkDir(full);
+      } else if (entry.name.endsWith('.yaml') || entry.name.endsWith('.yml')) {
+        scanFile(full, entry.name);
+      }
+    }
+  };
+
+  walkDir(srcRoot);
+
+  return [...resourcesByWave]
+    .sort(([waveA], [waveB]) => waveA - waveB)
+    .map(([wave, resources]) => ({ wave, resources }));
+}
+
+/**
+ * Extract tech stack from Helm chart images and package.json.
+ *
+ * Recursively scans srcRoot. From every values.yaml it collects same-line
+ * `image:` values plus `repository:` values that contain a '/', skipping
+ * templated references (those containing '{{' or '__helm'). From every
+ * package.json it collects the `dependencies` map.
+ *
+ * @param {string} srcRoot - Directory to scan.
+ * @returns {{containerImages: string[], npmPackages: Array<{name: string,
+ *   count: number, versions: string[]}>}} `containerImages` is sorted and
+ *   de-duplicated. `npmPackages` holds the 30 dependencies declared by the
+ *   most package.json files (ties broken by name so the cut-off does not
+ *   depend on directory order), each with its distinct version ranges.
+ *   Unreadable or unparsable files are skipped.
+ */
+function extractTechStack(srcRoot) {
+  const SKIPPED_DIRS = ['node_modules', '.git', 'venv', '.terraform', '__pycache__'];
+
+  // `[ \t]*` rather than `\s*`: the value must sit on the key's own line.
+  // With `\s*` the Helm map form ("image:" newline "repository: ...")
+  // recorded the literal text "repository:" as a container image.
+  const IMAGE = /image:[ \t]*['"]?([^\s'"]+)/g;
+  const REPOSITORY = /repository:[ \t]*['"]?([^\s'"]+)/g;
+  const isTemplated = (ref) => ref.includes('{{') || ref.includes('__helm');
+
+  const images = new Set();
+  // Map, not a plain object: a dependency named like an Object.prototype
+  // member (e.g. "constructor") must not collide with the inherited property.
+  const packages = new Map();
+
+  const scanValues = (full) => {
+    let content;
+    try {
+      content = fs.readFileSync(full, 'utf8');
+    } catch {
+      return; // best-effort scan: unreadable files are skipped
+    }
+    for (const [, image] of content.matchAll(IMAGE)) {
+      if (!isTemplated(image)) images.add(image);
+    }
+    for (const [, repo] of content.matchAll(REPOSITORY)) {
+      if (!isTemplated(repo) && repo.includes('/')) images.add(repo);
+    }
+  };
+
+  const scanPackageJson = (full) => {
+    let pkg;
+    try {
+      pkg = JSON.parse(fs.readFileSync(full, 'utf8'));
+    } catch {
+      return; // best-effort scan: unreadable or invalid JSON is skipped
+    }
+    if (!pkg?.dependencies) return;
+
+    const relPath = path.relative(srcRoot, full);
+    for (const [name, version] of Object.entries(pkg.dependencies)) {
+      if (!packages.has(name)) packages.set(name, []);
+      packages.get(name).push({ version, file: relPath });
+    }
+  };
+
+  const walkDir = (dir) => {
+    let entries;
+    try {
+      entries = fs.readdirSync(dir, { withFileTypes: true });
+    } catch {
+      return; // best-effort scan: unreadable directories are skipped
+    }
+    for (const entry of entries) {
+      const full = path.join(dir, entry.name);
+      if (entry.isDirectory()) {
+        if (!SKIPPED_DIRS.includes(entry.name)) walkDir(full);
+      } else if (entry.name === 'values.yaml') {
+        scanValues(full);
+      } else if (entry.name === 'package.json') {
+        scanPackageJson(full);
+      }
+    }
+  };
+
+  walkDir(srcRoot);
+
+  const byUsageThenName = ([nameA, refsA], [nameB, refsB]) => {
+    if (refsA.length !== refsB.length) return refsB.length - refsA.length;
+    if (nameA === nameB) return 0;
+    return nameA < nameB ? -1 : 1;
+  };
+
+  return {
+    containerImages: [...images].sort(),
+    npmPackages: [...packages]
+      .sort(byUsageThenName)
+      .slice(0, 30)
+      .map(([name, refs]) => ({
+        name,
+        count: refs.length,
+        versions: [...new Set(refs.map((ref) => ref.version))],
+      })),
+  };
+}
+
+/**
+ * Run every extractor against srcRoot, logging a one-line summary after each.
+ *
+ * @param {string} srcRoot - Root directory handed to each extractor.
+ * @returns {{layers, appsets, regions, cidrs, naming, syncWaves, techStack}}
+ *   The individual extractor results, one property per extractor.
+ */
+function extractAllPatterns(srcRoot) {
+  const report = (line) => console.log(line);
+  const found = {};
+
+  report('Extracting architectural patterns...');
+
+  found.layers = extractLayers(srcRoot);
+  report(`  Layers: ${found.layers.length}`);
+
+  found.appsets = extractArgoCDAppSets(srcRoot);
+  report(`  ApplicationSets: ${found.appsets.length}`);
+
+  found.regions = extractCloudRegions(srcRoot);
+  report(`  Regions: AWS=${found.regions.aws.length} Azure=${found.regions.azure.length} GCP=${found.regions.gcp.length}`);
+
+  found.cidrs = extractCIDRAllocations(srcRoot);
+  report(`  CIDR allocations: ${found.cidrs.length}`);
+
+  found.naming = extractNamingConventions(srcRoot);
+  report(`  Naming conventions: ${found.naming.length}`);
+
+  found.syncWaves = extractSyncWaves(srcRoot);
+  report(`  Sync waves: ${found.syncWaves.length} distinct waves`);
+
+  found.techStack = extractTechStack(srcRoot);
+  report(`  Container images: ${found.techStack.containerImages.length}, NPM packages: ${found.techStack.npmPackages.length}`);
+
+  return found;
+}
+
+// Public API: the aggregate runner followed by each individual extractor.
+module.exports = {
+  extractAllPatterns,
+  extractLayers,
+  extractArgoCDAppSets,
+  extractCloudRegions,
+  extractCIDRAllocations,
+  extractNamingConventions,
+  extractSyncWaves,
+  extractTechStack,
+};