extract-patterns.js

/**
 * extract-patterns.js — Mine architectural patterns from code artifacts.
 * 
 * Extracts:
 * - Layered architecture from repo/dir naming conventions
 * - Hub/spoke model from ArgoCD ApplicationSet configs
 * - Cloud regions from terraform configs + values.yaml
 * - CIDR allocations from terraform variables
 * - Naming conventions from scripts + terraform
 * - Sync-wave ordering from Helm template annotations
 * - Release/deployment patterns from CI configs + scripts
 * - Tech stack from Helm chart images + dependencies
 */

const fs = require('fs');
const path = require('path');

// Directory-name prefixes used to classify a top-level directory into a layer.
// Rows are [name regex, layer label, sort key]; they are expanded into
// { pattern, layer, order } objects. Row order matters to consumers that stop
// at the first matching pattern, so keep the sequence as listed.
const LAYER_PATTERNS = [
  [/^app[-_]/, 'Application', 1],
  [/^compute[-_]/, 'Compute', 2],
  [/^network[-_]/, 'Network', 3],
  [/^account[-_]/, 'Account', 4],
  [/^control[-_]/, 'Control Plane', 5],
  [/^runtime/, 'Runtime (shared)', 0],
  [/^ipam[-_]/, 'IPAM', 3.5],
  [/^skills/, 'Skills/Tooling', 6],
  [/^docs/, 'Documentation', 7],
].map(([pattern, layer, order]) => ({ pattern, layer, order }));

/**
 * Group the immediate sub-directories of `srcRoot` into architecture layers.
 *
 * Hidden directories (names starting with '.') are skipped. Every other
 * directory is assigned to the first LAYER_PATTERNS entry whose regex matches
 * its name; directories that match no pattern are left out of the result.
 *
 * @param {string} srcRoot - Directory whose direct children are inspected.
 * @returns {Array<{layer: string, repos: string[], order: number}>}
 *   One entry per layer that had at least one directory, ascending by `order`.
 * @throws Propagates whatever `fs.readdirSync` throws when `srcRoot` cannot be
 *   read; no error is caught here.
 */
function extractLayers(srcRoot) {
  const grouped = {};

  for (const entry of fs.readdirSync(srcRoot, { withFileTypes: true })) {
    if (!entry.isDirectory() || entry.name.startsWith('.')) continue;

    // First matching pattern wins.
    const hit = LAYER_PATTERNS.find(({ pattern }) => pattern.test(entry.name));
    if (!hit) continue;

    if (!grouped[hit.layer]) grouped[hit.layer] = { order: hit.order, repos: [] };
    grouped[hit.layer].repos.push(entry.name);
  }

  const result = [];
  for (const [layer, { repos, order }] of Object.entries(grouped)) {
    result.push({ layer, repos, order });
  }
  result.sort((x, y) => x.order - y.order);
  return result;
}

/**
 * Find ArgoCD ApplicationSet manifests and summarise each one.
 *
 * Walks `srcRoot` recursively (skipping vendored, VCS and cache directories)
 * and reads every file named `*-appset.yaml` or `*-appset.yml`. Fields are
 * pulled out with first-match regexes rather than a YAML parser, so each value
 * is simply the first occurrence of that key anywhere in the file.
 *
 * A manifest is labelled `hub` when its path (relative to `srcRoot`) contains
 * "hub"/"hubs" as a standalone word or the substring "control-plane";
 * everything else is labelled `spoke`. The word check prevents names such as
 * "github-runners" from being mistaken for hub content.
 *
 * Extraction is best-effort: unreadable directories and files are skipped.
 *
 * @param {string} srcRoot - Root directory to scan.
 * @returns {Array<{name: string, namespace: string, repoURL: string,
 *   targetRevision: string, destServer: string, file: string,
 *   location: 'hub'|'spoke', repoName: string}>}
 *   One record per manifest; missing fields are '' (`name` falls back to the
 *   file name, `repoName` to the full repoURL).
 */
function extractArgoCDAppSets(srcRoot) {
  const SKIPPED_DIRS = ['node_modules', '.git', 'venv', '.terraform', '__pycache__'];
  // "hub" must not be glued to other letters: "hub/", "hub-01", "argocd_hub"
  // and "hubCluster" qualify, while "github" and "hubspot" do not.
  const HUB_WORD = /(?<![A-Za-z])hubs?(?![a-z])/;
  const appsets = [];

  const summarise = (full, fileName) => {
    const content = fs.readFileSync(full, 'utf8');
    const relPath = path.relative(srcRoot, full);
    const repoURL = content.match(/repoURL:\s*['"]?([^\s'"]+)/)?.[1] || '';
    const isHub = HUB_WORD.test(relPath) || relPath.includes('control-plane');

    return {
      name: content.match(/name:\s*['"]?([^\s'"]+)/)?.[1] || fileName,
      namespace: content.match(/namespace:\s*['"]?([^\s'"]+)/)?.[1] || '',
      repoURL,
      targetRevision: content.match(/targetRevision:\s*['"]?([^\s'"]+)/)?.[1] || '',
      destServer: content.match(/server:\s*['"]?([^\s'"]+)/)?.[1] || '',
      file: relPath,
      location: isHub ? 'hub' : 'spoke',
      // Last URL path segment, without a trailing ".git".
      repoName: repoURL.match(/\/([^/]+?)(?:\.git)?$/)?.[1] || repoURL,
    };
  };

  const walkDir = (dir) => {
    let entries;
    try {
      entries = fs.readdirSync(dir, { withFileTypes: true });
    } catch {
      return; // Unreadable directory: skip it, keep scanning the rest.
    }

    for (const entry of entries) {
      const full = path.join(dir, entry.name);
      if (entry.isDirectory()) {
        if (!SKIPPED_DIRS.includes(entry.name)) walkDir(full);
      } else if (entry.name.endsWith('-appset.yaml') || entry.name.endsWith('-appset.yml')) {
        try {
          appsets.push(summarise(full, entry.name));
        } catch {
          // Unreadable manifest: skip it, keep scanning the rest.
        }
      }
    }
  };

  walkDir(srcRoot);
  return appsets;
}

/**
 * Collect the cloud region identifiers mentioned in Terraform files and
 * Helm `values.yaml` files under `srcRoot`.
 *
 * Every `*.tf` file and every file named exactly `values.yaml` is scanned with
 * a fixed list of region names per provider; regions outside those lists are
 * not reported. Matching is textual, so a region is found wherever it appears
 * (inside a zone name such as `us-east-1a`, a resource name, a comment, ...).
 *
 * Two guards keep longer region names from being reported as shorter ones:
 *  - Azure `centralus` is ignored when it is the tail of `northcentralus`,
 *    `southcentralus` or `westcentralus`.
 *  - GCP regions must not be followed by another digit, so `europe-west12`
 *    is not reported as `europe-west1`.
 *
 * Extraction is best-effort: unreadable directories and files are skipped.
 *
 * @param {string} srcRoot - Root directory to scan.
 * @returns {{aws: string[], azure: string[], gcp: string[]}}
 *   De-duplicated, lexicographically sorted region names per provider.
 */
function extractCloudRegions(srcRoot) {
  const SKIPPED_DIRS = ['node_modules', '.git', 'venv', '.terraform', '__pycache__'];
  const REGION_PATTERNS = {
    aws: /us-east-[12]|us-west-[12]|eu-west-[123]|eu-central-[12]|ap-southeast-[12]|ap-northeast-[123]/g,
    azure: /(?:(?<!north|south|west)centralus|eastus[2]?|westus[23]?|westeurope|northeurope|southeastasia|australiaeast)/g,
    gcp: /(?:us-central1|us-east[14]|us-west[14]|europe-west[1-6]|asia-east[12]|asia-southeast[12])(?!\d)/g,
  };
  const regions = { aws: new Set(), azure: new Set(), gcp: new Set() };

  const scanFile = (full) => {
    const content = fs.readFileSync(full, 'utf8');
    for (const [provider, pattern] of Object.entries(REGION_PATTERNS)) {
      // String.prototype.match with a /g regex always starts from index 0,
      // so sharing the regex objects across files is safe.
      for (const region of content.match(pattern) ?? []) regions[provider].add(region);
    }
  };

  const walkDir = (dir) => {
    let entries;
    try {
      entries = fs.readdirSync(dir, { withFileTypes: true });
    } catch {
      return; // Unreadable directory: skip it, keep scanning the rest.
    }

    for (const entry of entries) {
      const full = path.join(dir, entry.name);
      if (entry.isDirectory()) {
        if (!SKIPPED_DIRS.includes(entry.name)) walkDir(full);
      } else if (entry.name.endsWith('.tf') || entry.name === 'values.yaml') {
        try {
          scanFile(full);
        } catch {
          // Unreadable file: skip it, keep scanning the rest.
        }
      }
    }
  };

  walkDir(srcRoot);

  return {
    aws: [...regions.aws].sort(),
    azure: [...regions.azure].sort(),
    gcp: [...regions.gcp].sort(),
  };
}

/**
 * Collect IPv4 CIDR literals from Terraform sources under `srcRoot`.
 *
 * Scans every `*.tf` and `*.tfvars` file line by line. Each distinct CIDR on a
 * line produces one reference, described by the first non-empty candidate of:
 *   1. the text after the first `#` on the same line,
 *   2. the nearest `#` comment line within the three lines above it,
 *   3. the trimmed line itself.
 * The pattern only checks the digit layout (`d.d.d.d/d`); octet and prefix
 * ranges are not validated.
 *
 * Extraction is best-effort: unreadable directories and files are skipped.
 *
 * @param {string} srcRoot - Root directory to scan.
 * @returns {Array<{cidr: string, refs: Array<{cidr: string, context: string, file: string}>}>}
 *   One entry per distinct CIDR in first-seen order. Within `refs`, contexts
 *   that look like prose (no `=` or `"`) are listed before code-like ones.
 */
function extractCIDRAllocations(srcRoot) {
  const SKIPPED_DIRS = ['node_modules', '.git', 'venv', '.terraform', '__pycache__'];
  const CIDR_PATTERN = /\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\/\d{1,2}/g;
  const LOOKBACK_LINES = 3;
  const found = [];

  // Pick the most specific human-readable description for the line at `index`.
  const describeLine = (lines, index) => {
    const line = lines[index];
    const hashAt = line.indexOf('#');
    if (hashAt !== -1) {
      const inline = line.substring(hashAt + 1).trim();
      if (inline) return inline;
    }

    // Walk upwards so the closest comment wins over an older, unrelated one.
    const earliest = Math.max(0, index - LOOKBACK_LINES);
    for (let j = index - 1; j >= earliest; j--) {
      const candidate = lines[j].trim();
      if (!candidate.startsWith('#')) continue;
      // Strip the marker from the trimmed text so indented comments work too.
      const text = candidate.replace(/^#\s*/, '').trim();
      if (text) return text;
    }

    return line.trim();
  };

  const scanFile = (full) => {
    const lines = fs.readFileSync(full, 'utf8').split('\n');
    const file = path.relative(srcRoot, full);

    lines.forEach((line, index) => {
      // A single line may carry several CIDRs, e.g. a list of subnets.
      const matches = line.match(CIDR_PATTERN);
      if (!matches) return;
      const context = describeLine(lines, index);
      for (const cidr of new Set(matches)) found.push({ cidr, context, file });
    });
  };

  const walkDir = (dir) => {
    let entries;
    try {
      entries = fs.readdirSync(dir, { withFileTypes: true });
    } catch {
      return; // Unreadable directory: skip it, keep scanning the rest.
    }

    for (const entry of entries) {
      const full = path.join(dir, entry.name);
      if (entry.isDirectory()) {
        if (!SKIPPED_DIRS.includes(entry.name)) walkDir(full);
      } else if (entry.name.endsWith('.tf') || entry.name.endsWith('.tfvars')) {
        try {
          scanFile(full);
        } catch {
          // Unreadable file: skip it, keep scanning the rest.
        }
      }
    }
  };

  walkDir(srcRoot);

  const refsByCidr = new Map();
  for (const ref of found) {
    if (!refsByCidr.has(ref.cidr)) refsByCidr.set(ref.cidr, []);
    refsByCidr.get(ref.cidr).push(ref);
  }

  const looksLikeCode = (context) => context.includes('=') || context.includes('"');

  return [...refsByCidr].map(([cidr, refs]) => {
    // Stable sort: prose descriptions first, code-like contexts last.
    refs.sort((a, b) => Number(looksLikeCode(a.context)) - Number(looksLikeCode(b.context)));
    return { cidr, refs };
  });
}

/**
 * Collect lines that describe resource naming conventions.
 *
 * Scans `*.sh`, `*.py` and `*.tf` files under `srcRoot` (vendored, VCS and
 * cache directories are skipped) and keeps a trimmed line when either
 *   - it mentions "convention", "naming" or "format" (any case) AND contains a
 *     placeholder such as `{phase}`, `{region}`, `{cloud}`, `{index}`, `{env}`, or
 *   - it contains a cloud-prefixed template like `aws-{...}-{...}`.
 * Each kept line is cut to 200 characters. Only the first occurrence of a
 * given (truncated) line is reported, together with the file it was seen in.
 *
 * Unreadable directories and files are skipped without reporting an error.
 *
 * @param {string} srcRoot - Root directory to scan.
 * @returns {Array<{pattern: string, file: string}>} Unique lines in discovery order.
 */
function extractNamingConventions(srcRoot) {
  const ignoredDirs = ['node_modules', '.git', 'venv', '.terraform', '__pycache__'];
  const scannedSuffixes = ['.sh', '.py', '.tf'];
  const alreadyReported = new Set();
  const results = [];

  // Record a line unless an identical (truncated) one was recorded before.
  const remember = (text, file) => {
    const pattern = text.substring(0, 200);
    if (alreadyReported.has(pattern)) return;
    alreadyReported.add(pattern);
    results.push({ pattern, file });
  };

  const inspectFile = (absolute) => {
    let text;
    try {
      text = fs.readFileSync(absolute, 'utf8');
    } catch {
      return;
    }
    const file = path.relative(srcRoot, absolute);

    text.split('\n').forEach((rawLine) => {
      const candidate = rawLine.trim();
      const hasKeyword = /convention|naming|format/i.test(candidate);
      const hasPlaceholder = /\{(phase|region|cloud|index|env)\}/i.test(candidate);
      const hasCloudTemplate = /(?:aws|azr|gcp)-\{.*\}-\{.*\}/.test(candidate);
      if ((hasKeyword && hasPlaceholder) || hasCloudTemplate) remember(candidate, file);
    });
  };

  const visit = (folder) => {
    let children;
    try {
      children = fs.readdirSync(folder, { withFileTypes: true });
    } catch {
      return;
    }

    for (const child of children) {
      const absolute = path.join(folder, child.name);
      if (child.isDirectory()) {
        if (ignoredDirs.includes(child.name)) continue;
        visit(absolute);
        continue;
      }
      if (scannedSuffixes.some((suffix) => child.name.endsWith(suffix))) inspectFile(absolute);
    }
  };

  visit(srcRoot);
  return results;
}

/**
 * Group resources by their `sync-wave` annotation value.
 *
 * Scans every `*.yaml` / `*.yml` file under `srcRoot` (vendored, VCS and cache
 * directories are skipped). Files are split on `---` separator lines and each
 * YAML document is inspected on its own, so a wave number is always paired
 * with the `kind` and `name` found in the same document. Documents without a
 * `sync-wave:` entry are ignored.
 *
 * Fields are read with first-match regexes rather than a YAML parser: `kind`
 * and `name` are the first such keys in the document (`kind` falls back to
 * 'Unknown', `name` to the file name).
 *
 * Extraction is best-effort: unreadable directories and files are skipped.
 *
 * @param {string} srcRoot - Root directory to scan.
 * @returns {Array<{wave: number, resources: Array<{kind: string, name: string, file: string}>}>}
 *   One entry per distinct wave, ascending by wave number.
 */
function extractSyncWaves(srcRoot) {
  const SKIPPED_DIRS = ['node_modules', '.git', 'venv', '.terraform', '__pycache__'];
  const DOCUMENT_SEPARATOR = /^---[ \t]*$/m;
  const resourcesByWave = new Map();

  const scanFile = (full, fileName) => {
    const content = fs.readFileSync(full, 'utf8');
    const file = path.relative(srcRoot, full);

    for (const doc of content.split(DOCUMENT_SEPARATOR)) {
      const waveMatch = doc.match(/sync-wave:\s*["']?(-?\d+)["']?/);
      if (!waveMatch) continue;

      const wave = Number.parseInt(waveMatch[1], 10);
      const kind = doc.match(/kind:\s*(\w+)/)?.[1] || 'Unknown';
      const name = doc.match(/name:\s*['"]?([^\s'"]+)/)?.[1] || fileName;

      if (!resourcesByWave.has(wave)) resourcesByWave.set(wave, []);
      resourcesByWave.get(wave).push({ kind, name, file });
    }
  };

  const walkDir = (dir) => {
    let entries;
    try {
      entries = fs.readdirSync(dir, { withFileTypes: true });
    } catch {
      return; // Unreadable directory: skip it, keep scanning the rest.
    }

    for (const entry of entries) {
      const full = path.join(dir, entry.name);
      if (entry.isDirectory()) {
        if (!SKIPPED_DIRS.includes(entry.name)) walkDir(full);
      } else if (entry.name.endsWith('.yaml') || entry.name.endsWith('.yml')) {
        try {
          scanFile(full, entry.name);
        } catch {
          // Unreadable file: skip it, keep scanning the rest.
        }
      }
    }
  };

  walkDir(srcRoot);

  return [...resourcesByWave]
    .sort(([waveA], [waveB]) => waveA - waveB)
    .map(([wave, resources]) => ({ wave, resources }));
}

/**
 * Summarise the tech stack: container images referenced by Helm values files
 * and npm dependencies declared in `package.json` files.
 *
 * Under `srcRoot` (vendored, VCS and cache directories are skipped):
 *  - every file named `values.yaml` is searched for `image: <ref>` and
 *    `repository: <ref>` entries. The value must sit on the same line as the
 *    key, so the nested form `image:` + newline + `repository: x` yields `x`
 *    rather than the literal text "repository:". Values containing `{{` or
 *    `__helm` are ignored, and `repository` values are kept only when they
 *    contain a `/`.
 *  - every `package.json` contributes the entries of its `dependencies` map.
 *
 * Extraction is best-effort: unreadable or unparsable files are skipped.
 *
 * @param {string} srcRoot - Root directory to scan.
 * @returns {{containerImages: string[],
 *   npmPackages: Array<{name: string, count: number, versions: string[]}>}}
 *   `containerImages` is de-duplicated and sorted. `npmPackages` holds at most
 *   30 packages, most frequently declared first, each with the distinct
 *   version ranges seen.
 */
function extractTechStack(srcRoot) {
  const SKIPPED_DIRS = ['node_modules', '.git', 'venv', '.terraform', '__pycache__'];
  // [ \t]* instead of \s* keeps the value on the key's own line.
  const IMAGE_ENTRY = /image:[ \t]*['"]?([^\s'"]+)/g;
  const REPOSITORY_ENTRY = /repository:[ \t]*['"]?([^\s'"]+)/g;
  const MAX_PACKAGES = 30;

  const images = new Set();
  // A Map (not a plain object) so dependency names such as "constructor" or
  // "__proto__" cannot collide with Object.prototype members.
  const packages = new Map();

  const isLiteralRef = (ref) => !ref.includes('{{') && !ref.includes('__helm');

  const scanValues = (full) => {
    const content = fs.readFileSync(full, 'utf8');
    for (const [, image] of content.matchAll(IMAGE_ENTRY)) {
      if (isLiteralRef(image)) images.add(image);
    }
    for (const [, repo] of content.matchAll(REPOSITORY_ENTRY)) {
      if (isLiteralRef(repo) && repo.includes('/')) images.add(repo);
    }
  };

  const scanPackageJson = (full) => {
    const pkg = JSON.parse(fs.readFileSync(full, 'utf8'));
    if (!pkg?.dependencies) return;
    const file = path.relative(srcRoot, full);
    for (const [name, version] of Object.entries(pkg.dependencies)) {
      if (!packages.has(name)) packages.set(name, []);
      packages.get(name).push({ version, file });
    }
  };

  const walkDir = (dir) => {
    let entries;
    try {
      entries = fs.readdirSync(dir, { withFileTypes: true });
    } catch {
      return; // Unreadable directory: skip it, keep scanning the rest.
    }

    for (const entry of entries) {
      const full = path.join(dir, entry.name);
      if (entry.isDirectory()) {
        if (!SKIPPED_DIRS.includes(entry.name)) walkDir(full);
        continue;
      }
      try {
        if (entry.name === 'values.yaml') scanValues(full);
        else if (entry.name === 'package.json') scanPackageJson(full);
      } catch {
        // Unreadable or malformed file: skip it, keep scanning the rest.
      }
    }
  };

  walkDir(srcRoot);

  return {
    containerImages: [...images].sort(),
    npmPackages: [...packages]
      .sort(([, refsA], [, refsB]) => refsB.length - refsA.length)
      .slice(0, MAX_PACKAGES)
      .map(([name, refs]) => ({
        name,
        count: refs.length,
        versions: [...new Set(refs.map((ref) => ref.version))],
      })),
  };
}

/**
 * Execute every pattern extractor against `srcRoot`, printing a one-line
 * summary after each, and bundle the results into a single object.
 *
 * @param {string} srcRoot - Root directory handed to each extractor.
 * @returns {{layers: *, appsets: *, regions: *, cidrs: *, naming: *, syncWaves: *, techStack: *}}
 *   The raw return value of each extractor under its own key.
 */
function extractAllPatterns(srcRoot) {
  const report = (message) => console.log(message);
  const patterns = {};

  report('Extracting architectural patterns...');

  patterns.layers = extractLayers(srcRoot);
  report(`  Layers: ${patterns.layers.length}`);

  patterns.appsets = extractArgoCDAppSets(srcRoot);
  report(`  ApplicationSets: ${patterns.appsets.length}`);

  patterns.regions = extractCloudRegions(srcRoot);
  const { aws, azure, gcp } = patterns.regions;
  report(`  Regions: AWS=${aws.length} Azure=${azure.length} GCP=${gcp.length}`);

  patterns.cidrs = extractCIDRAllocations(srcRoot);
  report(`  CIDR allocations: ${patterns.cidrs.length}`);

  patterns.naming = extractNamingConventions(srcRoot);
  report(`  Naming conventions: ${patterns.naming.length}`);

  patterns.syncWaves = extractSyncWaves(srcRoot);
  report(`  Sync waves: ${patterns.syncWaves.length} distinct waves`);

  patterns.techStack = extractTechStack(srcRoot);
  const { containerImages, npmPackages } = patterns.techStack;
  report(`  Container images: ${containerImages.length}, NPM packages: ${npmPackages.length}`);

  return patterns;
}

// Public API: the combined runner plus each individual extractor.
module.exports = { extractAllPatterns, extractLayers, extractArgoCDAppSets, extractCloudRegions, extractCIDRAllocations, extractNamingConventions, extractSyncWaves, extractTechStack };
feat: confluence benchmark, pattern extractor, agent KB, UX spec - extract-patterns.js: mines layered arch, ArgoCD appsets, cloud regions, CIDR allocations, naming conventions, sync waves, tech stack from code - agent-kb.js: token-efficient JSON rendering of same doc tree - eval-confluence-ref-questions.json: 32 reference-only benchmark questions - wiggum-v2.sh: Ralph Wiggum loop targeting confluence baseline (77.8%) - docs/human-ux-spec.md: BMad UX designer spec for human doc structure - Eval results: V2 at 28.7% vs confluence 77.8% baseline - Hub/spoke ownership now correctly extracted (95% on that question) - Naming conventions, regions, CIDRs surfaced in system-architecture.md 2026-03-10 14:20:35 +00:00			`/**`
			`* extract-patterns.js — Mine architectural patterns from code artifacts.`
			`*`
			`* Extracts:`
			`* - Layered architecture from repo/dir naming conventions`
			`* - Hub/spoke model from ArgoCD ApplicationSet configs`
			`* - Cloud regions from terraform configs + values.yaml`
			`* - CIDR allocations from terraform variables`
			`* - Naming conventions from scripts + terraform`
			`* - Sync-wave ordering from Helm template annotations`
			`* - Release/deployment patterns from CI configs + scripts`
			`* - Tech stack from Helm chart images + dependencies`
			`*/`

			`const fs = require('fs');`
			`const path = require('path');`

			`const LAYER_PATTERNS = [`
			`{ pattern: /^app[-_]/, layer: 'Application', order: 1 },`
			`{ pattern: /^compute[-_]/, layer: 'Compute', order: 2 },`
			`{ pattern: /^network[-_]/, layer: 'Network', order: 3 },`
			`{ pattern: /^account[-_]/, layer: 'Account', order: 4 },`
			`{ pattern: /^control[-_]/, layer: 'Control Plane', order: 5 },`
			`{ pattern: /^runtime/, layer: 'Runtime (shared)', order: 0 },`
			`{ pattern: /^ipam[-_]/, layer: 'IPAM', order: 3.5 },`
			`{ pattern: /^skills/, layer: 'Skills/Tooling', order: 6 },`
			`{ pattern: /^docs/, layer: 'Documentation', order: 7 },`
			`];`

			`/**`
			`* Infer layered architecture from top-level directory names.`
			`*/`
			`function extractLayers(srcRoot) {`
			`const dirs = fs.readdirSync(srcRoot, { withFileTypes: true })`
			`.filter(d => d.isDirectory() && !d.name.startsWith('.'))`
			`.map(d => d.name);`

			`const layers = {};`
			`for (const dir of dirs) {`
			`for (const lp of LAYER_PATTERNS) {`
			`if (lp.pattern.test(dir)) {`
			`if (!layers[lp.layer]) layers[lp.layer] = { order: lp.order, repos: [] };`
			`layers[lp.layer].repos.push(dir);`
			`break;`
			`}`
			`}`
			`}`

			`return Object.entries(layers)`
			`.sort((a, b) => a[1].order - b[1].order)`
			`.map(([name, info]) => ({ layer: name, repos: info.repos, order: info.order }));`
			`}`

			`/**`
			`* Extract ArgoCD ApplicationSet configs to infer hub/spoke ownership.`
			`*/`
			`function extractArgoCDAppSets(srcRoot) {`
			`const appsets = [];`
			`const walkDir = (dir) => {`
			`try {`
			`for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {`
			`const full = path.join(dir, entry.name);`
			`if (entry.isDirectory()) {`
			`if (['node_modules', '.git', 'venv', '.terraform', '__pycache__'].includes(entry.name)) continue;`
			`walkDir(full);`
			`} else if (entry.name.endsWith('-appset.yaml') \|\| entry.name.endsWith('-appset.yml')) {`
			`try {`
			`const content = fs.readFileSync(full, 'utf8');`
			`const name = content.match(/name:\s*['"]?([^\s'"]+)/)?.[1] \|\| entry.name;`
			`const namespace = content.match(/namespace:\s*['"]?([^\s'"]+)/)?.[1] \|\| '';`
			`const repoURL = content.match(/repoURL:\s*['"]?([^\s'"]+)/)?.[1] \|\| '';`
			`const targetRevision = content.match(/targetRevision:\s*['"]?([^\s'"]+)/)?.[1] \|\| '';`
			`const destServer = content.match(/server:\s*['"]?([^\s'"]+)/)?.[1] \|\| '';`
			`const relPath = path.relative(srcRoot, full);`

			`// Determine if hub or spoke based on path`
			`const isHub = relPath.includes('hub') \|\| relPath.includes('control-plane');`

			`appsets.push({`
			`name, namespace, repoURL, targetRevision, destServer,`
			`file: relPath,`
			`location: isHub ? 'hub' : 'spoke',`
			`repoName: repoURL.match(/\/([^/]+?)(?:\.git)?$/)?.[1] \|\| repoURL,`
			`});`
			`} catch {}`
			`}`
			`}`
			`} catch {}`
			`};`
			`walkDir(srcRoot);`
			`return appsets;`
			`}`

			`/**`
			`* Extract cloud regions from terraform configs and values.yaml files.`
			`*/`
			`function extractCloudRegions(srcRoot) {`
			`const regions = { aws: new Set(), azure: new Set(), gcp: new Set() };`

			`const walkDir = (dir) => {`
			`try {`
			`for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {`
			`const full = path.join(dir, entry.name);`
			`if (entry.isDirectory()) {`
			`if (['node_modules', '.git', 'venv', '.terraform', '__pycache__'].includes(entry.name)) continue;`
			`walkDir(full);`
			`} else if (entry.name.endsWith('.tf') \|\| entry.name === 'values.yaml' \|\| entry.name === 'variables.tf') {`
			`try {`
			`const content = fs.readFileSync(full, 'utf8');`
			`// AWS regions`
			`const awsMatches = content.match(/us-east-[12]\|us-west-[12]\|eu-west-[123]\|eu-central-[12]\|ap-southeast-[12]\|ap-northeast-[123]/g);`
			`if (awsMatches) awsMatches.forEach(r => regions.aws.add(r));`
			`// Azure regions`
			`const azureMatches = content.match(/(?:centralus\|eastus[2]?\|westus[23]?\|westeurope\|northeurope\|southeastasia\|australiaeast)/g);`
			`if (azureMatches) azureMatches.forEach(r => regions.azure.add(r));`
			`// GCP regions`
			`const gcpMatches = content.match(/us-central1\|us-east[14]\|us-west[14]\|europe-west[1-6]\|asia-east[12]\|asia-southeast[12]/g);`
			`if (gcpMatches) gcpMatches.forEach(r => regions.gcp.add(r));`
			`} catch {}`
			`}`
			`}`
			`} catch {}`
			`};`
			`walkDir(srcRoot);`

			`return {`
			`aws: [...regions.aws].sort(),`
			`azure: [...regions.azure].sort(),`
			`gcp: [...regions.gcp].sort(),`
			`};`
			`}`

			`/**`
			`* Extract CIDR allocations from terraform variables and configs.`
			`*/`
			`function extractCIDRAllocations(srcRoot) {`
			`const cidrs = [];`

			`const walkDir = (dir) => {`
			`try {`
			`for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {`
			`const full = path.join(dir, entry.name);`
			`if (entry.isDirectory()) {`
			`if (['node_modules', '.git', 'venv', '.terraform', '__pycache__'].includes(entry.name)) continue;`
			`walkDir(full);`
			`} else if (entry.name.endsWith('.tf') \|\| entry.name.endsWith('.tfvars')) {`
			`try {`
			`const content = fs.readFileSync(full, 'utf8');`
			`const relPath = path.relative(srcRoot, full);`
Add deep extractors, reference pages, keyword index; eval 53.3% - extract-deep.js: mines addon versions, TF configs, script params, helm values, state services - generate-reference-pages.js: creates operations.md, configuration.md, network-architecture.md - reference/index.md: keyword-rich topic-to-file routing table - Enriched CIDR extractor with inline comment capture - Eval progression: 28.7% -> 33.4% -> 46.7% -> 52.5% -> 53.3% - NOT_FOUND: 25 -> 20 -> 16 -> 10 -> 11 - Top scores: config-region-code 95%, argo-gen-params 95%, multiple 100%s - Remaining gap: agent planner (haiku) doesn't consistently follow index routing 2026-03-10 19:01:21 +00:00
			`const lines = content.split('\n');`
			`for (let i = 0; i < lines.length; i++) {`
			`const line = lines[i];`
			`const cidrMatch = line.match(/(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\/\d{1,2})/);`
			`if (cidrMatch) {`
			`const cidr = cidrMatch[1];`
			`let context = line.includes('#') ? line.substring(line.indexOf('#') + 1).trim() : '';`
			`if (!context) {`
			`for (let j = Math.max(0, i - 3); j < i; j++) {`
			`if (lines[j].trim().startsWith('#')) {`
			`context = lines[j].replace(/^#\s*/, '').trim();`
			`break;`
			`}`
			`}`
			`}`
			`if (!context) context = line.trim();`

			`cidrs.push({ cidr, context, file: relPath });`
feat: confluence benchmark, pattern extractor, agent KB, UX spec - extract-patterns.js: mines layered arch, ArgoCD appsets, cloud regions, CIDR allocations, naming conventions, sync waves, tech stack from code - agent-kb.js: token-efficient JSON rendering of same doc tree - eval-confluence-ref-questions.json: 32 reference-only benchmark questions - wiggum-v2.sh: Ralph Wiggum loop targeting confluence baseline (77.8%) - docs/human-ux-spec.md: BMad UX designer spec for human doc structure - Eval results: V2 at 28.7% vs confluence 77.8% baseline - Hub/spoke ownership now correctly extracted (95% on that question) - Naming conventions, regions, CIDRs surfaced in system-architecture.md 2026-03-10 14:20:35 +00:00			`}`
			`}`
			`} catch {}`
			`}`
			`}`
			`} catch {}`
			`};`
			`walkDir(srcRoot);`

			`const unique = {};`
			`for (const c of cidrs) {`
			`if (!unique[c.cidr]) unique[c.cidr] = [];`
Add deep extractors, reference pages, keyword index; eval 53.3% - extract-deep.js: mines addon versions, TF configs, script params, helm values, state services - generate-reference-pages.js: creates operations.md, configuration.md, network-architecture.md - reference/index.md: keyword-rich topic-to-file routing table - Enriched CIDR extractor with inline comment capture - Eval progression: 28.7% -> 33.4% -> 46.7% -> 52.5% -> 53.3% - NOT_FOUND: 25 -> 20 -> 16 -> 10 -> 11 - Top scores: config-region-code 95%, argo-gen-params 95%, multiple 100%s - Remaining gap: agent planner (haiku) doesn't consistently follow index routing 2026-03-10 19:01:21 +00:00			`unique[c.cidr].push(c);`
feat: confluence benchmark, pattern extractor, agent KB, UX spec - extract-patterns.js: mines layered arch, ArgoCD appsets, cloud regions, CIDR allocations, naming conventions, sync waves, tech stack from code - agent-kb.js: token-efficient JSON rendering of same doc tree - eval-confluence-ref-questions.json: 32 reference-only benchmark questions - wiggum-v2.sh: Ralph Wiggum loop targeting confluence baseline (77.8%) - docs/human-ux-spec.md: BMad UX designer spec for human doc structure - Eval results: V2 at 28.7% vs confluence 77.8% baseline - Hub/spoke ownership now correctly extracted (95% on that question) - Naming conventions, regions, CIDRs surfaced in system-architecture.md 2026-03-10 14:20:35 +00:00			`}`
Add deep extractors, reference pages, keyword index; eval 53.3% - extract-deep.js: mines addon versions, TF configs, script params, helm values, state services - generate-reference-pages.js: creates operations.md, configuration.md, network-architecture.md - reference/index.md: keyword-rich topic-to-file routing table - Enriched CIDR extractor with inline comment capture - Eval progression: 28.7% -> 33.4% -> 46.7% -> 52.5% -> 53.3% - NOT_FOUND: 25 -> 20 -> 16 -> 10 -> 11 - Top scores: config-region-code 95%, argo-gen-params 95%, multiple 100%s - Remaining gap: agent planner (haiku) doesn't consistently follow index routing 2026-03-10 19:01:21 +00:00
			`return Object.entries(unique).map(([cidr, refs]) => {`
			`refs.sort((a, b) => {`
			`const aIsCode = a.context.includes('=') \|\| a.context.includes('"');`
			`const bIsCode = b.context.includes('=') \|\| b.context.includes('"');`
			`if (!aIsCode && bIsCode) return -1;`
			`if (aIsCode && !bIsCode) return 1;`
			`return 0;`
			`});`
			`return { cidr, refs };`
			`});`
feat: confluence benchmark, pattern extractor, agent KB, UX spec - extract-patterns.js: mines layered arch, ArgoCD appsets, cloud regions, CIDR allocations, naming conventions, sync waves, tech stack from code - agent-kb.js: token-efficient JSON rendering of same doc tree - eval-confluence-ref-questions.json: 32 reference-only benchmark questions - wiggum-v2.sh: Ralph Wiggum loop targeting confluence baseline (77.8%) - docs/human-ux-spec.md: BMad UX designer spec for human doc structure - Eval results: V2 at 28.7% vs confluence 77.8% baseline - Hub/spoke ownership now correctly extracted (95% on that question) - Naming conventions, regions, CIDRs surfaced in system-architecture.md 2026-03-10 14:20:35 +00:00			`}`

			`/**`
			`* Extract naming conventions from scripts and terraform.`
			`*/`
			`function extractNamingConventions(srcRoot) {`
			`const conventions = [];`

			`const walkDir = (dir) => {`
			`try {`
			`for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {`
			`const full = path.join(dir, entry.name);`
			`if (entry.isDirectory()) {`
			`if (['node_modules', '.git', 'venv', '.terraform', '__pycache__'].includes(entry.name)) continue;`
			`walkDir(full);`
			`} else if (entry.name.endsWith('.sh') \|\| entry.name.endsWith('.py') \|\| entry.name.endsWith('.tf')) {`
			`try {`
			`const content = fs.readFileSync(full, 'utf8');`
			`const relPath = path.relative(srcRoot, full);`
			`// Only match lines that explicitly describe naming conventions with template patterns`
			`const lines = content.split('\n');`
			`for (const line of lines) {`
			`const trimmed = line.trim();`
			`// Must contain a template-like pattern AND a convention keyword`
			`if (trimmed.match(/convention\|naming\|format/i) && trimmed.match(/\{(phase\|region\|cloud\|index\|env)\}/i)) {`
			`conventions.push({ pattern: trimmed.substring(0, 200), file: relPath });`
			`}`
			`// Also match explicit naming examples like "aws-{phase}-{region-code}-{index}-vpc"`
			`if (trimmed.match(/(?:aws\|azr\|gcp)-\{.\}-\{.\}/)) {`
			`conventions.push({ pattern: trimmed.substring(0, 200), file: relPath });`
			`}`
			`}`
			`} catch {}`
			`}`
			`}`
			`} catch {}`
			`};`
			`walkDir(srcRoot);`

			`// Deduplicate`
			`const seen = new Set();`
			`return conventions.filter(c => {`
			`const key = c.pattern;`
			`if (seen.has(key)) return false;`
			`seen.add(key);`
			`return true;`
			`});`
			`}`

			`/**`
			`* Extract sync-wave ordering from Helm templates.`
			`*/`
			`function extractSyncWaves(srcRoot) {`
			`const waves = {};`

			`const walkDir = (dir) => {`
			`try {`
			`for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {`
			`const full = path.join(dir, entry.name);`
			`if (entry.isDirectory()) {`
			`if (['node_modules', '.git', 'venv', '.terraform', '__pycache__'].includes(entry.name)) continue;`
			`walkDir(full);`
			`} else if (entry.name.endsWith('.yaml') \|\| entry.name.endsWith('.yml')) {`
			`try {`
			`const content = fs.readFileSync(full, 'utf8');`
			`const waveMatch = content.match(/sync-wave:\s*["']?(-?\d+)["']?/);`
			`if (waveMatch) {`
			`const wave = parseInt(waveMatch[1]);`
			`const kind = content.match(/kind:\s*(\w+)/)?.[1] \|\| 'Unknown';`
			`const name = content.match(/name:\s*['"]?([^\s'"]+)/)?.[1] \|\| entry.name;`
			`const relPath = path.relative(srcRoot, full);`
			`if (!waves[wave]) waves[wave] = [];`
			`waves[wave].push({ kind, name, file: relPath });`
			`}`
			`} catch {}`
			`}`
			`}`
			`} catch {}`
			`};`
			`walkDir(srcRoot);`

			`return Object.entries(waves)`
			`.sort((a, b) => Number(a[0]) - Number(b[0]))`
			`.map(([wave, resources]) => ({ wave: Number(wave), resources }));`
			`}`

			`/**`
			`* Extract tech stack from Helm chart images and package.json.`
			`*/`
			`function extractTechStack(srcRoot) {`
			`const images = new Set();`
			`const packages = {};`

			`const walkDir = (dir) => {`
			`try {`
			`for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {`
			`const full = path.join(dir, entry.name);`
			`if (entry.isDirectory()) {`
			`if (['node_modules', '.git', 'venv', '.terraform', '__pycache__'].includes(entry.name)) continue;`
			`walkDir(full);`
			`} else if (entry.name === 'values.yaml') {`
			`try {`
			`const content = fs.readFileSync(full, 'utf8');`
			`const imgMatches = content.match(/image:\s*['"]?([^\s'"]+)/g);`
			`if (imgMatches) imgMatches.forEach(m => {`
			`const img = m.replace(/image:\s*['"]?/, '').replace(/['"]$/, '');`
			`if (img && !img.includes('{{') && !img.includes('__helm')) images.add(img);`
			`});`
			`const repoMatches = content.match(/repository:\s*['"]?([^\s'"]+)/g);`
			`if (repoMatches) repoMatches.forEach(m => {`
			`const repo = m.replace(/repository:\s*['"]?/, '').replace(/['"]$/, '');`
			`if (repo && !repo.includes('{{') && !repo.includes('__helm') && repo.includes('/')) images.add(repo);`
			`});`
			`} catch {}`
			`} else if (entry.name === 'package.json') {`
			`try {`
			`const pkg = JSON.parse(fs.readFileSync(full, 'utf8'));`
			`const relPath = path.relative(srcRoot, full);`
			`if (pkg.dependencies) {`
			`for (const [name, ver] of Object.entries(pkg.dependencies)) {`
			`if (!packages[name]) packages[name] = [];`
			`packages[name].push({ version: ver, file: relPath });`
			`}`
			`}`
			`} catch {}`
			`}`
			`}`
			`} catch {}`
			`};`
			`walkDir(srcRoot);`

			`return {`
			`containerImages: [...images].sort(),`
			`npmPackages: Object.entries(packages).sort((a, b) => b[1].length - a[1].length).slice(0, 30)`
			`.map(([name, refs]) => ({ name, count: refs.length, versions: [...new Set(refs.map(r => r.version))] })),`
			`};`
			`}`

			`/**`
			`* Run all pattern extractors and return a unified result.`
			`*/`
			`function extractAllPatterns(srcRoot) {`
			`console.log('Extracting architectural patterns...');`
			`const layers = extractLayers(srcRoot);`
			console.log(` Layers: ${layers.length}`);
			`const appsets = extractArgoCDAppSets(srcRoot);`
			console.log(` ApplicationSets: ${appsets.length}`);
			`const regions = extractCloudRegions(srcRoot);`
			console.log(` Regions: AWS=${regions.aws.length} Azure=${regions.azure.length} GCP=${regions.gcp.length}`);
			`const cidrs = extractCIDRAllocations(srcRoot);`
			console.log(` CIDR allocations: ${cidrs.length}`);
			`const naming = extractNamingConventions(srcRoot);`
			console.log(` Naming conventions: ${naming.length}`);
			`const syncWaves = extractSyncWaves(srcRoot);`
			console.log(` Sync waves: ${syncWaves.length} distinct waves`);
			`const techStack = extractTechStack(srcRoot);`
			console.log(` Container images: ${techStack.containerImages.length}, NPM packages: ${techStack.npmPackages.length}`);

			`return { layers, appsets, regions, cidrs, naming, syncWaves, techStack };`
			`}`

			`module.exports = { extractAllPatterns, extractLayers, extractArgoCDAppSets, extractCloudRegions, extractCIDRAllocations, extractNamingConventions, extractSyncWaves, extractTechStack };`