extract-deep.js

/**
 * extract-deep.js — Deep extraction of specific config values, script parameters,
 * and operational details that the high-level extractors miss.
 * 
 * Targets the specific data points that Confluence reference docs contain
 * but our generated docs don't surface.
 */

const fs = require('fs');
const path = require('path');

// Entry names that directory traversal skips outright (checked by name only,
// via IGNORE_DIRS.has(entry.name) in walk()).
const IGNORE_DIRS = new Set([
  'node_modules',
  '.git',
  'venv',
  '.terraform',
  '__pycache__',
  '_bmad',
  '_bmad-output',
  '.codex',
  '.claude',
  '.cursor',
  '.gemini',
  '.kiro',
  '.agents',
  'dist',
  'build',
  'coverage',
]);

/**
 * Recursively collect files under `dir` whose base name satisfies `filter`.
 *
 * Any entry whose name is in IGNORE_DIRS is skipped, whether it is a directory
 * or a file. A directory that cannot be listed (missing, not a directory,
 * permission denied, ...) contributes nothing, so the walk stays best-effort.
 * Only the directory listing is guarded: an exception thrown by `filter` is a
 * caller bug and propagates instead of being silently discarded.
 *
 * @param {string} dir - Directory to scan.
 * @param {(name: string) => boolean} filter - Called with each file's base name.
 * @param {string[]} [results=[]] - Accumulator that matches are appended to.
 * @returns {string[]} `results`, containing the joined path of every match.
 */
function walk(dir, filter, results = []) {
  let entries;
  try {
    entries = fs.readdirSync(dir, { withFileTypes: true });
  } catch {
    // Unreadable directory: keep whatever has been collected so far.
    return results;
  }

  for (const entry of entries) {
    if (IGNORE_DIRS.has(entry.name)) continue;
    const full = path.join(dir, entry.name);
    if (entry.isDirectory()) {
      walk(full, filter, results);
    } else if (filter(entry.name)) {
      results.push(full);
    }
  }
  return results;
}

/**
 * Extract EKS/GKE/AKS addon versions from values.yaml files.
 *
 * Every line that looks like a `name:` key (optionally a list item) is paired
 * with the first `version:` found on the following four lines. The search
 * stops early when another `name:` key at the same or a shallower indentation
 * is reached, so an entry without a version does not take the version that
 * belongs to the next entry.
 *
 * @param {string} srcRoot - Root directory to search for values.yaml files.
 * @returns {{name: string, version: string, file: string}[]} One record per
 *   name/version pair; `file` is the path relative to `srcRoot`.
 */
function extractAddonVersions(srcRoot) {
  const namePattern = /^\s*-?\s*name:\s*["']?([^\s"']+)/;
  const versionPattern = /version:\s*["']?([^\s"']+)/;
  // A version must appear within this many lines after its `name:` line.
  const lookaheadLines = 4;

  const addons = [];

  for (const file of walk(srcRoot, f => f === 'values.yaml')) {
    const relPath = path.relative(srcRoot, file);

    let content;
    try {
      content = fs.readFileSync(file, 'utf8');
    } catch (err) {
      // Best-effort: report the unreadable file and carry on with the rest.
      console.warn(`extractAddonVersions: skipping ${relPath}: ${err.message}`);
      continue;
    }

    const lines = content.split('\n');
    for (let i = 0; i < lines.length; i++) {
      const nameMatch = lines[i].match(namePattern);
      if (!nameMatch) continue;

      // Column of the `name:` key; used to recognise sibling/outer entries.
      const nameColumn = lines[i].indexOf('name:');
      const limit = Math.min(i + lookaheadLines + 1, lines.length);

      for (let j = i + 1; j < limit; j++) {
        const startsNextEntry =
          namePattern.test(lines[j]) && lines[j].indexOf('name:') <= nameColumn;
        if (startsNextEntry) break;

        const verMatch = lines[j].match(versionPattern);
        if (verMatch) {
          addons.push({
            name: nameMatch[1],
            version: verMatch[1],
            file: relPath,
          });
          break;
        }
      }
    }
  }

  return addons;
}

/**
 * Extract Terraform resource configurations (RDS, VPC, etc.) with specific values.
 *
 * For each .tf file, every pattern below is tried once and the first match in
 * the file is recorded. Lines that are entirely a `#` or `//` comment are
 * dropped before matching, so a commented-out assignment cannot shadow the
 * active one. Block comments (`/* ... *\/`) are intentionally left alone:
 * the same character sequence appears inside ordinary strings such as ARNs.
 *
 * @param {string} srcRoot - Root directory to search for .tf files.
 * @returns {{key: string, value: string, file: string}[]} Matches in file
 *   order, then pattern order; `file` is relative to `srcRoot`.
 */
function extractTerraformConfigs(srcRoot) {
  // Built once per call; none of the regexes carry state between uses.
  const patterns = [
    { key: 'vpc_cidr', regex: /vpc_cidr\s*=\s*"([^"]+)"/ },
    { key: 'source_ranges', regex: /source_ranges\s*=\s*\[\s*"([^"]+)"\s*\]/ },
    { key: 'backup_retention_period', regex: /backup_retention_period\s*=\s*(\d+)/ },
    { key: 'backup_window', regex: /backup_window\s*=\s*"([^"]+)"/ },
    { key: 'engine_version', regex: /engine_version\s*=\s*"([^"]+)"/ },
    { key: 'instance_class', regex: /instance_class\s*=\s*"([^"]+)"/ },
    { key: 'allocated_storage', regex: /allocated_storage\s*=\s*(\d+)/ },
    { key: 'multi_az', regex: /multi_az\s*=\s*(true|false)/ },
    { key: 'deletion_protection', regex: /deletion_protection\s*=\s*(true|false)/ },
    { key: 'node_count', regex: /(?:node_count|desired_size|min_size|max_size)\s*=\s*(\d+)/ },
    { key: 'machine_type', regex: /(?:machine_type|instance_type|vm_size)\s*=\s*"([^"]+)"/ },
  ];
  const isLineComment = line => /^\s*(?:#|\/\/)/.test(line);

  const configs = [];

  for (const file of walk(srcRoot, f => f.endsWith('.tf'))) {
    const relPath = path.relative(srcRoot, file);

    let content;
    try {
      content = fs.readFileSync(file, 'utf8');
    } catch (err) {
      // Best-effort: report the unreadable file and carry on with the rest.
      console.warn(`extractTerraformConfigs: skipping ${relPath}: ${err.message}`);
      continue;
    }

    // Keep line breaks so patterns that span lines (source_ranges) still work.
    const activeText = content
      .split('\n')
      .filter(line => !isLineComment(line))
      .join('\n');

    for (const { key, regex } of patterns) {
      const match = activeText.match(regex);
      if (match) {
        configs.push({ key, value: match[1], file: relPath });
      }
    }
  }

  return configs;
}

/**
 * Extract script parameters (timeouts, retries, poll intervals).
 *
 * Scans .sh and .py files line by line for two kinds of assignment:
 *
 *  1. A quoted string (optionally wrapped in `ipaddress.IPv4Network(...)`)
 *     whose value is an IPv4 or IPv6 CIDR. These are recorded with the fixed
 *     comment 'CIDR Allocation'. Other strings that merely contain a slash
 *     (paths, URLs) are ignored.
 *  2. An upper-case variable at the start of the line assigned a number, with
 *     an optional trailing `# comment`. Decimals are captured whole; dotted
 *     versions such as `1.2.3` are skipped rather than truncated.
 *
 * Variable names may contain digits and are always captured in full.
 *
 * @param {string} srcRoot - Root directory to search for scripts.
 * @returns {{name: string, value: string, comment: string, file: string}[]}
 *   Records in file/line order; `file` is relative to `srcRoot`.
 */
function extractScriptParams(srcRoot) {
  // The leading non-identifier guard keeps the name from starting mid-word.
  const cidrAssignment =
    /(?:^|[^A-Za-z0-9_])([A-Za-z_][A-Za-z0-9_]*)\s*=\s*(?:ipaddress\.IPv4Network\()?['"]([^'"]+)['"]\)?/;
  const ipv4Cidr = /^(?:\d{1,3}\.){3}\d{1,3}\/\d{1,2}$/;
  // IPv6 needs at least two colons, which also rules out drive-letter paths.
  const ipv6Cidr = /^[0-9A-Fa-f]*:[0-9A-Fa-f:.]*:[0-9A-Fa-f:.]*\/\d{1,3}$/;
  // The lookahead rejects a number that continues with more digits or dots.
  const numericAssignment =
    /^([A-Z_][A-Z0-9_]*)\s*=\s*(\d+(?:\.\d+)?)(?![.\d])\s*(?:#\s*(.+))?/;

  const params = [];

  for (const file of walk(srcRoot, f => f.endsWith('.sh') || f.endsWith('.py'))) {
    const relPath = path.relative(srcRoot, file);

    let content;
    try {
      content = fs.readFileSync(file, 'utf8');
    } catch (err) {
      // Best-effort: report the unreadable file and carry on with the rest.
      console.warn(`extractScriptParams: skipping ${relPath}: ${err.message}`);
      continue;
    }

    for (const line of content.split('\n')) {
      // e.g. self.aws_block = ipaddress.IPv4Network('10.192.0.0/10')
      const cidrMatch = line.match(cidrAssignment);
      if (cidrMatch && (ipv4Cidr.test(cidrMatch[2]) || ipv6Cidr.test(cidrMatch[2]))) {
        params.push({
          name: cidrMatch[1],
          value: cidrMatch[2],
          comment: 'CIDR Allocation',
          file: relPath,
        });
      }

      // e.g. MAX_RETRIES=5 # attempts before giving up
      const numMatch = line.match(numericAssignment);
      if (numMatch) {
        params.push({
          name: numMatch[1],
          value: numMatch[2],
          // trim() drops the '\r' left behind when the file uses CRLF endings.
          comment: (numMatch[3] || '').trim(),
          file: relPath,
        });
      }
    }
  }

  return params;
}

/**
 * Extract Helm template specific values (product IDs, OU IDs, etc.).
 *
 * For each .yaml/.yml file, every pattern below is tried once and the first
 * match in the file is recorded. Lines that are entirely a `#` comment are
 * dropped before matching, so commented-out examples are not reported as
 * configuration. Helm template comments (`{{/* ... *\/}}`) are not stripped.
 *
 * The combined list is then de-duplicated on `key:value`, keeping the first
 * occurrence, so a value shared by several files is reported once with the
 * first file it was seen in.
 *
 * @param {string} srcRoot - Root directory to search for YAML files.
 * @returns {{key: string, value: string, file: string}[]} Unique key/value
 *   records; `file` is relative to `srcRoot`.
 */
function extractHelmTemplateValues(srcRoot) {
  // Built once per call; none of the regexes carry state between uses.
  const patterns = [
    { key: 'product_id', regex: /(?:product[_-]?id|productId)\s*[:=]\s*"?([a-z]+-[a-z0-9]+)"?/i },
    { key: 'ou_id', regex: /(?:ou[_-]?id|organizationalUnit)\s*[:=]\s*"?(ou-[a-z0-9-]+)"?/i },
    // (?!\d) stops a longer digit run from being cut down to its first 12 digits.
    { key: 'account_id', regex: /(?:account[_-]?id|accountId)\s*[:=]\s*"?(\d{12})(?!\d)"?/ },
    { key: 'host_project', regex: /(?:hostProject|host_project)\s*[:=]\s*"?([a-z][-a-z0-9]+)"?/ },
    { key: 'shared_vpc', regex: /sharedVpc[\s\S]*?enabled:\s*(true|false)/m },
  ];
  const isLineComment = line => /^\s*#/.test(line);

  const values = [];

  for (const file of walk(srcRoot, f => f.endsWith('.yaml') || f.endsWith('.yml'))) {
    const relPath = path.relative(srcRoot, file);

    let content;
    try {
      content = fs.readFileSync(file, 'utf8');
    } catch (err) {
      // Best-effort: report the unreadable file and carry on with the rest.
      console.warn(`extractHelmTemplateValues: skipping ${relPath}: ${err.message}`);
      continue;
    }

    // Keep line breaks so the multi-line shared_vpc pattern still works.
    const activeText = content
      .split('\n')
      .filter(line => !isLineComment(line))
      .join('\n');

    for (const { key, regex } of patterns) {
      const match = activeText.match(regex);
      if (match) {
        values.push({ key, value: match[1], file: relPath });
      }
    }
  }

  // Deduplicate on key + value, keeping the first record seen.
  const seen = new Set();
  return values.filter(v => {
    const k = `${v.key}:${v.value}`;
    if (seen.has(k)) return false;
    seen.add(k);
    return true;
  });
}

/**
 * Extract state management services from Helm chart names.
 *
 * Reads each Chart.yaml, takes the chart's top-level `name:` key and reports
 * the chart when that name contains one of the known stateful service names
 * (case-insensitive substring match). Only keys that start in column 0 are
 * considered, so `name:` / `version:` entries nested under `dependencies:` or
 * `maintainers:` are not mistaken for the chart's own fields.
 *
 * The reported version is whichever top-level `version:` or `appVersion:`
 * key comes first in the file, or null when neither is present. Surrounding
 * quotes and trailing `# comments` are removed from both name and version.
 *
 * @param {string} srcRoot - Root directory to search for Chart.yaml files.
 * @returns {{name: string, version: (string|null), path: string}[]} One record
 *   per matching chart; `path` is the chart directory relative to `srcRoot`.
 */
function extractStateServices(srcRoot) {
  const stateCharts = ['elasticsearch', 'hazelcast', 'redis', 'milvus', 'cassandra', 'kafka', 'rabbitmq', 'postgresql', 'mysql', 'mongodb'];

  // Reduce a raw YAML scalar to its text: unwrap quotes, else drop a trailing comment.
  const readScalar = raw => {
    const text = raw.trim();
    const quoted = text.match(/^(["'])(.*?)\1/);
    if (quoted) return quoted[2];
    return text.replace(/\s+#.*$/, '');
  };

  const found = [];

  for (const file of walk(srcRoot, f => f === 'Chart.yaml')) {
    const relPath = path.relative(srcRoot, file);

    let content;
    try {
      content = fs.readFileSync(file, 'utf8');
    } catch (err) {
      // Best-effort: report the unreadable file and carry on with the rest.
      console.warn(`extractStateServices: skipping ${relPath}: ${err.message}`);
      continue;
    }

    // [ \t]* (not \s*) so an empty value cannot swallow the following line.
    const nameMatch = content.match(/^name:[ \t]*(.+)/m);
    if (!nameMatch) continue;

    const name = readScalar(nameMatch[1]);
    const lowered = name.toLowerCase();
    if (!stateCharts.some(s => lowered.includes(s))) continue;

    const versionMatch = content.match(/^(?:appVersion|version):[ \t]*(.+)/m);
    found.push({
      name,
      version: versionMatch ? readScalar(versionMatch[1]) : null,
      path: path.dirname(relPath),
    });
  }

  return found;
}

/**
 * Run every deep extractor against `srcRoot`, in a fixed order, printing a
 * banner first and then one indented count line after each extractor returns.
 *
 * @param {string} srcRoot - Root directory handed to each extractor.
 * @returns {{addons: Array, tfConfigs: Array, scriptParams: Array,
 *   helmValues: Array, stateServices: Array}} The raw result of each extractor.
 */
function extractDeep(srcRoot) {
  // Invoke one extractor, log how many items it produced, pass the list on.
  const runStage = (label, extractor) => {
    const items = extractor(srcRoot);
    console.log(`  ${label}: ${items.length}`);
    return items;
  };

  console.log('Running deep extraction...');

  const addons = runStage('Addon versions', extractAddonVersions);
  const tfConfigs = runStage('TF configs', extractTerraformConfigs);
  const scriptParams = runStage('Script params', extractScriptParams);
  const helmValues = runStage('Helm template values', extractHelmTemplateValues);
  const stateServices = runStage('State services', extractStateServices);

  return { addons, tfConfigs, scriptParams, helmValues, stateServices };
}

module.exports = { extractDeep, extractAddonVersions, extractTerraformConfigs, extractScriptParams, extractHelmTemplateValues, extractStateServices };
Add deep extractors, reference pages, keyword index; eval 53.3% - extract-deep.js: mines addon versions, TF configs, script params, helm values, state services - generate-reference-pages.js: creates operations.md, configuration.md, network-architecture.md - reference/index.md: keyword-rich topic-to-file routing table - Enriched CIDR extractor with inline comment capture - Eval progression: 28.7% -> 33.4% -> 46.7% -> 52.5% -> 53.3% - NOT_FOUND: 25 -> 20 -> 16 -> 10 -> 11 - Top scores: config-region-code 95%, argo-gen-params 95%, multiple 100%s - Remaining gap: agent planner (haiku) doesn't consistently follow index routing 2026-03-10 19:01:21 +00:00			`/**`
			`* extract-deep.js — Deep extraction of specific config values, script parameters,`
			`* and operational details that the high-level extractors miss.`
			`*`
			`* Targets the specific data points that Confluence reference docs contain`
			`* but our generated docs don't surface.`
			`*/`

			`const fs = require('fs');`
			`const path = require('path');`

			`const IGNORE_DIRS = new Set([`
			`'node_modules', '.git', 'venv', '.terraform', '__pycache__',`
			`'_bmad', '_bmad-output', '.codex', '.claude', '.cursor', '.gemini',`
			`'.kiro', '.agents', 'dist', 'build', 'coverage'`
			`]);`

			`function walk(dir, filter, results = []) {`
			`try {`
			`for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {`
			`if (IGNORE_DIRS.has(entry.name)) continue;`
			`const full = path.join(dir, entry.name);`
			`if (entry.isDirectory()) {`
			`walk(full, filter, results);`
			`} else if (filter(entry.name)) {`
			`results.push(full);`
			`}`
			`}`
			`} catch {}`
			`return results;`
			`}`

			`/**`
			`* Extract EKS/GKE/AKS addon versions from values.yaml files.`
			`*/`
			`function extractAddonVersions(srcRoot) {`
			`const addons = [];`
			`const files = walk(srcRoot, f => f === 'values.yaml');`

			`for (const file of files) {`
			`try {`
			`const content = fs.readFileSync(file, 'utf8');`
			`const relPath = path.relative(srcRoot, file);`

			`// Match addon blocks with name + version`
			`const lines = content.split('\n');`
			`for (let i = 0; i < lines.length; i++) {`
			`const nameMatch = lines[i].match(/^\s-?\sname:\s*["']?([^\s"']+)/);`
			`if (nameMatch) {`
			`// Look for version on next few lines`
			`for (let j = i + 1; j < Math.min(i + 5, lines.length); j++) {`
			`const verMatch = lines[j].match(/version:\s*["']?([^\s"']+)/);`
			`if (verMatch) {`
			`addons.push({`
			`name: nameMatch[1],`
			`version: verMatch[1],`
			`file: relPath,`
			`});`
			`break;`
			`}`
			`}`
			`}`
			`}`
			`} catch {}`
			`}`

			`return addons;`
			`}`

			`/**`
			`* Extract Terraform resource configurations (RDS, VPC, etc.) with specific values.`
			`*/`
			`function extractTerraformConfigs(srcRoot) {`
			`const configs = [];`
			`const files = walk(srcRoot, f => f.endsWith('.tf'));`

			`for (const file of files) {`
			`try {`
			`const content = fs.readFileSync(file, 'utf8');`
			`const relPath = path.relative(srcRoot, file);`

			`// Extract key config values`
			`const patterns = [`
feat: repo-agnostic refactor (BMad spec-test-build loop) - NEW: repo-profiler.js — deterministic archetype detection (Infra, Frontend, Backend, etc.) - NEW: extract-dynamic.js — generic extractor replacing hardcoded Foxtrot patterns - NEW: eval-generator.js — dynamic ground-truth question generation from any repo graph - NEW: specs/bmad-agnostic-refactor-spec.md — full BMad spec with acceptance criteria - REFACTORED: prose.js — two-pass LLM synthesis with rich context (shared secrets, ports, service refs) - REFACTORED: sysdoc.js — wired repo-profiler + extract-dynamic, --legacy escape hatch - REFACTORED: wiggum-v2.sh — uses eval-generator before benchmarks - FIXED: graph.js — _edgeSet rebuilt on loadSnapshot() (edge dedup was broken) - FIXED: graph.js — recursive sortKeys() for deep equality in diffing - FIXED: prose.js — robust JSON array extraction from LLM output - FIXED: ratchet.js — syntax validation (node --check) before saving LLM mutations - FIXED: extract-dynamic.js — centralized state services regex, added console.warn for silent failures - TESTS: test-eval-generator, test-repo-profiler, test-synthesis-quality + mock fixtures Eval: 81.5% on Foxtrot (fully repo-agnostic, no hardcoded reference pages) BMad reviews: Architect B+, Dev Lead B-, TEA B- 2026-03-11 14:40:31 +00:00			`{ key: 'vpc_cidr', regex: /vpc_cidr\s=\s"([^"]+)"/ },`
			`{ key: 'source_ranges', regex: /source_ranges\s=\s\[\s"([^"]+)"\s\]/ },`
Add deep extractors, reference pages, keyword index; eval 53.3% - extract-deep.js: mines addon versions, TF configs, script params, helm values, state services - generate-reference-pages.js: creates operations.md, configuration.md, network-architecture.md - reference/index.md: keyword-rich topic-to-file routing table - Enriched CIDR extractor with inline comment capture - Eval progression: 28.7% -> 33.4% -> 46.7% -> 52.5% -> 53.3% - NOT_FOUND: 25 -> 20 -> 16 -> 10 -> 11 - Top scores: config-region-code 95%, argo-gen-params 95%, multiple 100%s - Remaining gap: agent planner (haiku) doesn't consistently follow index routing 2026-03-10 19:01:21 +00:00			`{ key: 'backup_retention_period', regex: /backup_retention_period\s=\s(\d+)/ },`
			`{ key: 'backup_window', regex: /backup_window\s=\s"([^"]+)"/ },`
			`{ key: 'engine_version', regex: /engine_version\s=\s"([^"]+)"/ },`
			`{ key: 'instance_class', regex: /instance_class\s=\s"([^"]+)"/ },`
			`{ key: 'allocated_storage', regex: /allocated_storage\s=\s(\d+)/ },`
			`{ key: 'multi_az', regex: /multi_az\s=\s(true\|false)/ },`
			`{ key: 'deletion_protection', regex: /deletion_protection\s=\s(true\|false)/ },`
			`{ key: 'node_count', regex: /(?:node_count\|desired_size\|min_size\|max_size)\s=\s(\d+)/ },`
			`{ key: 'machine_type', regex: /(?:machine_type\|instance_type\|vm_size)\s=\s"([^"]+)"/ },`
			`];`

			`for (const p of patterns) {`
			`const match = content.match(p.regex);`
			`if (match) {`
			`configs.push({ key: p.key, value: match[1], file: relPath });`
			`}`
			`}`
			`} catch {}`
			`}`

			`return configs;`
			`}`

			`/**`
			`* Extract script parameters (timeouts, retries, poll intervals).`
			`*/`
			`function extractScriptParams(srcRoot) {`
			`const params = [];`
			`const files = walk(srcRoot, f => f.endsWith('.sh') \|\| f.endsWith('.py'));`

			`for (const file of files) {`
			`try {`
			`const content = fs.readFileSync(file, 'utf8');`
			`const relPath = path.relative(srcRoot, file);`

			`const lines = content.split('\n');`
			`for (const line of lines) {`
feat: repo-agnostic refactor (BMad spec-test-build loop) - NEW: repo-profiler.js — deterministic archetype detection (Infra, Frontend, Backend, etc.) - NEW: extract-dynamic.js — generic extractor replacing hardcoded Foxtrot patterns - NEW: eval-generator.js — dynamic ground-truth question generation from any repo graph - NEW: specs/bmad-agnostic-refactor-spec.md — full BMad spec with acceptance criteria - REFACTORED: prose.js — two-pass LLM synthesis with rich context (shared secrets, ports, service refs) - REFACTORED: sysdoc.js — wired repo-profiler + extract-dynamic, --legacy escape hatch - REFACTORED: wiggum-v2.sh — uses eval-generator before benchmarks - FIXED: graph.js — _edgeSet rebuilt on loadSnapshot() (edge dedup was broken) - FIXED: graph.js — recursive sortKeys() for deep equality in diffing - FIXED: prose.js — robust JSON array extraction from LLM output - FIXED: ratchet.js — syntax validation (node --check) before saving LLM mutations - FIXED: extract-dynamic.js — centralized state services regex, added console.warn for silent failures - TESTS: test-eval-generator, test-repo-profiler, test-synthesis-quality + mock fixtures Eval: 81.5% on Foxtrot (fully repo-agnostic, no hardcoded reference pages) BMad reviews: Architect B+, Dev Lead B-, TEA B- 2026-03-11 14:40:31 +00:00			`// Match Python self.aws_block = ipaddress.IPv4Network('10.192.0.0/10')`
			`const pyCidrMatch = line.match(/(?:self\.)?([a-zA-Z_]+)\s=\s(?:ipaddress\.IPv4Network\()?['"]([^'"]+)['"]\)?/);`
			`if (pyCidrMatch && pyCidrMatch[2].includes('/')) {`
			`params.push({`
			`name: pyCidrMatch[1],`
			`value: pyCidrMatch[2],`
			`comment: 'CIDR Allocation',`
			`file: relPath,`
			`});`
			`}`

			`// Match bash variable assignments with numeric values and comments`
Add deep extractors, reference pages, keyword index; eval 53.3% - extract-deep.js: mines addon versions, TF configs, script params, helm values, state services - generate-reference-pages.js: creates operations.md, configuration.md, network-architecture.md - reference/index.md: keyword-rich topic-to-file routing table - Enriched CIDR extractor with inline comment capture - Eval progression: 28.7% -> 33.4% -> 46.7% -> 52.5% -> 53.3% - NOT_FOUND: 25 -> 20 -> 16 -> 10 -> 11 - Top scores: config-region-code 95%, argo-gen-params 95%, multiple 100%s - Remaining gap: agent planner (haiku) doesn't consistently follow index routing 2026-03-10 19:01:21 +00:00			`const match = line.match(/^([A-Z_]+)\s=\s(\d+)\s(?:#\s(.+))?/);`
			`if (match) {`
			`params.push({`
			`name: match[1],`
			`value: match[2],`
			`comment: match[3] \|\| '',`
			`file: relPath,`
			`});`
			`}`
			`}`
			`} catch {}`
			`}`

			`return params;`
			`}`

			`/**`
			`* Extract Helm template specific values (product IDs, OU IDs, etc.).`
			`*/`
			`function extractHelmTemplateValues(srcRoot) {`
			`const values = [];`
			`const files = walk(srcRoot, f => f.endsWith('.yaml') \|\| f.endsWith('.yml'));`

			`for (const file of files) {`
			`try {`
			`const content = fs.readFileSync(file, 'utf8');`
			`const relPath = path.relative(srcRoot, file);`

			`// Extract specific identifiers`
			`const patterns = [`
			`{ key: 'product_id', regex: /(?:product[_-]?id\|productId)\s[:=]\s"?([a-z]+-[a-z0-9]+)"?/i },`
			`{ key: 'ou_id', regex: /(?:ou[_-]?id\|organizationalUnit)\s[:=]\s"?(ou-[a-z0-9-]+)"?/i },`
			`{ key: 'account_id', regex: /(?:account[_-]?id\|accountId)\s[:=]\s"?(\d{12})"?/ },`
			`{ key: 'host_project', regex: /(?:hostProject\|host_project)\s[:=]\s"?([a-z][-a-z0-9]+)"?/ },`
			`{ key: 'shared_vpc', regex: /sharedVpc[\s\S]?enabled:\s(true\|false)/m },`
			`];`

			`for (const p of patterns) {`
			`const match = content.match(p.regex);`
			`if (match) {`
			`values.push({ key: p.key, value: match[1], file: relPath });`
			`}`
			`}`
			`} catch {}`
			`}`

			`// Deduplicate`
			`const seen = new Set();`
			`return values.filter(v => {`
			const k = `${v.key}:${v.value}`;
			`if (seen.has(k)) return false;`
			`seen.add(k);`
			`return true;`
			`});`
			`}`

			`/**`
			`* Extract state management services from Helm chart names.`
			`*/`
			`function extractStateServices(srcRoot) {`
			`const stateCharts = ['elasticsearch', 'hazelcast', 'redis', 'milvus', 'cassandra', 'kafka', 'rabbitmq', 'postgresql', 'mysql', 'mongodb'];`
			`const found = [];`

			`const files = walk(srcRoot, f => f === 'Chart.yaml');`
			`for (const file of files) {`
			`try {`
			`const content = fs.readFileSync(file, 'utf8');`
			`const nameMatch = content.match(/name:\s*(.+)/);`
			`if (nameMatch) {`
			`const name = nameMatch[1].trim();`
			`if (stateCharts.some(s => name.toLowerCase().includes(s))) {`
			`const relPath = path.relative(srcRoot, file);`
			`const versionMatch = content.match(/(?:appVersion\|version):\s*(.+)/);`
			`found.push({`
			`name,`
			`version: versionMatch ? versionMatch[1].trim() : null,`
			`path: path.dirname(relPath),`
			`});`
			`}`
			`}`
			`} catch {}`
			`}`

			`return found;`
			`}`

			`/**`
			`* Run all deep extractors.`
			`*/`
			`function extractDeep(srcRoot) {`
			`console.log('Running deep extraction...');`
			`const addons = extractAddonVersions(srcRoot);`
			console.log(` Addon versions: ${addons.length}`);
			`const tfConfigs = extractTerraformConfigs(srcRoot);`
			console.log(` TF configs: ${tfConfigs.length}`);
			`const scriptParams = extractScriptParams(srcRoot);`
			console.log(` Script params: ${scriptParams.length}`);
			`const helmValues = extractHelmTemplateValues(srcRoot);`
			console.log(` Helm template values: ${helmValues.length}`);
			`const stateServices = extractStateServices(srcRoot);`
			console.log(` State services: ${stateServices.length}`);

			`return { addons, tfConfigs, scriptParams, helmValues, stateServices };`
			`}`

			`module.exports = { extractDeep, extractAddonVersions, extractTerraformConfigs, extractScriptParams, extractHelmTemplateValues, extractStateServices };`