pipeline.js

const fs = require('fs');
const path = require('path');
const GraphStore = require('./graph.js');
const { extract } = require('./extract.js');
const { semanticDiff, formatSummary } = require('./semantic-diff.js');

/**
 * Developer Intelligence Pipeline v2 - Pipeline Orchestrator
 * Batch extraction, incremental diffing, and benchmarking.
 * No external dependencies.
 */

// File extensions (including the leading dot) that discovery accepts.
// Matched against path.extname() of each file name.
const SUPPORTED_EXTS = new Set([
  // TypeScript / JavaScript
  '.ts',
  '.tsx',
  '.js',
  '.jsx',
  // Other programming and shell languages
  '.py',
  '.java',
  '.go',
  '.sh',
  '.bash',
  // Configuration formats
  '.yaml',
  '.yml',
  '.tf',
  '.hcl',
]);

// Entry names that discovery skips outright. The check is made on the bare
// entry name before its type is inspected, so it applies at every depth.
const IGNORE_DIRS = new Set([
  // Dependencies, VCS metadata, build output and caches
  'node_modules',
  '.git',
  'dist',
  'build',
  '__pycache__',
  '.next',
  '.turbo',
  'coverage',
  '.nyc_output',
  'vendor',
  'venv',
  // Dot-directories, presumably belonging to editor/agent tooling
  '.codex',
  '.claude',
  '.cursor',
  '.gemini',
  '.kiro',
  '.agents',
  // Remaining project-specific exclusions
  '_bmad',
  '_bmad-output',
  '.terraform',
  'skills',
  '.ci',
  'docs',
  '.workdir',
]);

// Directory that contains this module file.
const SCRIPT_DIR = __dirname;
// Path to the extract.js file that sits next to this module.
// NOTE(review): nothing in the visible part of this file reads EXTRACT_JS
// (extraction goes through the imported extract() function) — confirm whether
// it is still needed.
const EXTRACT_JS = path.join(SCRIPT_DIR, 'extract.js');

/**
 * Recursively discover supported files.
 *
 * Walks `dir` depth-first in readdir order. Any entry whose name is in
 * IGNORE_DIRS is skipped; regular files are kept when their extension is in
 * SUPPORTED_EXTS. Directories that cannot be read are skipped silently
 * (best-effort), so an unreadable or missing root yields an empty array.
 *
 * @param {string} dir - Directory to start from.
 * @returns {string[]} Paths of matching files, each built by joining `dir`
 *   with the relative location of the file.
 */
function discoverFiles(dir) {
  // One shared accumulator for the whole walk. Spreading each subtree's result
  // into push() passes every path as a call argument, which can exceed the
  // engine's argument limit (RangeError) on very large trees and re-copies the
  // same paths once per directory level.
  const results = [];

  const walk = (currentDir) => {
    let entries;
    try {
      entries = fs.readdirSync(currentDir, { withFileTypes: true });
    } catch {
      // Deliberate best-effort: an unreadable directory contributes nothing.
      return;
    }

    for (const entry of entries) {
      if (IGNORE_DIRS.has(entry.name)) continue;
      const fullPath = path.join(currentDir, entry.name);
      if (entry.isDirectory()) {
        walk(fullPath);
      } else if (entry.isFile() && SUPPORTED_EXTS.has(path.extname(entry.name))) {
        results.push(fullPath);
      }
    }
  };

  walk(dir);
  return results;
}

/**
 * Extract a single file using in-process extract(), no subprocess.
 *
 * Best-effort wrapper: if extract() throws, the failure is reported on stderr
 * (so a failed file can be identified) and null is returned instead of
 * propagating the exception. Callers treat a null result as a failed file.
 *
 * @param {string} filePath - Path of the file to extract.
 * @param {string} repoRoot - Repository root, passed through to extract().
 * @returns {object|null} Whatever extract() returned, or null if it threw.
 */
function extractFile(filePath, repoRoot) {
  try {
    return extract(filePath, repoRoot);
  } catch (err) {
    // Keep the batch going, but do not lose the reason for the failure.
    console.warn(`Extraction failed for ${filePath}: ${err?.message ?? err}`);
    return null;
  }
}

/**
 * Batch extract all files, build graph, save snapshot.
 *
 * Discovers supported files under `repoRoot`, extracts each one, builds a
 * graph from the successful results, and writes two files into `outputDir`
 * (created if missing): `snapshot.json` (via GraphStore.saveSnapshot) and
 * `stats.json`. A result that is null or carries a truthy `error` property is
 * counted as a failure and left out of the graph.
 *
 * @param {string} repoRoot - Root directory to scan.
 * @param {string} outputDir - Directory that receives snapshot.json and stats.json.
 * @returns {{graph: object, snapshotPath: string, stats: object}} The built
 *   graph, the snapshot path, and the stats object that was written.
 */
function batchExtract(repoRoot, outputDir) {
  const files = discoverFiles(repoRoot);
  console.log(`Discovered ${files.length} supported files in ${repoRoot}`);

  fs.mkdirSync(outputDir, { recursive: true });

  const results = [];
  let errors = 0;
  const startTime = Date.now();

  for (let i = 0; i < files.length; i++) {
    const result = extractFile(files[i], repoRoot);
    if (result && !result.error) {
      results.push(result);
    } else {
      errors++;
    }
    // Progress line every 100 files.
    if ((i + 1) % 100 === 0) {
      console.log(`  Extracted ${i + 1}/${files.length}...`);
    }
  }

  const extractTime = Date.now() - startTime;
  console.log(`Extraction complete: ${results.length} succeeded, ${errors} failed (${extractTime}ms)`);

  const graph = GraphStore.buildGraph(results);
  const snapshotPath = path.join(outputDir, 'snapshot.json');
  GraphStore.saveSnapshot(graph, snapshotPath);
  console.log(`Graph: ${graph.nodes.size} nodes, ${graph.edges.length} edges. Saved to ${snapshotPath}`);

  // Save stats
  const stats = {
    repoRoot,
    filesDiscovered: files.length,
    filesExtracted: results.length,
    errors,
    nodes: graph.nodes.size,
    edges: graph.edges.length,
    extractionTimeMs: extractTime,
    // Guard the empty case: 0 / 0 is NaN, which JSON.stringify writes as null.
    avgTimePerFileMs: files.length > 0 ? Math.round(extractTime / files.length) : 0,
    timestamp: new Date().toISOString(),
  };
  fs.writeFileSync(path.join(outputDir, 'stats.json'), JSON.stringify(stats, null, 2));
  console.log(`Stats saved. Avg ${stats.avgTimePerFileMs}ms/file`);

  return { graph, snapshotPath, stats };
}

/**
 * Incremental run: extract files, diff against previous snapshot.
 *
 * Extracts `files` (or every discovered file under `repoRoot` when `files` is
 * falsy), builds a new graph and saves it as `snapshot.json` in `outputDir`.
 * When `prevSnapshotPath` points at an existing file, the old graph is loaded,
 * diffed against the new one, the summary is printed, and `diff.json` is
 * written to `outputDir`. A supplied but missing snapshot is reported on
 * stderr and the diff is skipped.
 *
 * @param {string} repoRoot - Repository root passed to extraction/discovery.
 * @param {string[]|null} files - Explicit file list, or a falsy value to discover.
 * @param {string|null} prevSnapshotPath - Previous snapshot to diff against, if any.
 * @param {string} outputDir - Directory for snapshot.json / diff.json (created if missing).
 * @returns {{newSnapshotPath: string, errors: number, diffPath: string|null}}
 *   Path of the new snapshot, number of files that failed extraction, and the
 *   path of diff.json (null when no diff was produced).
 */
function incrementalRun(repoRoot, files, prevSnapshotPath, outputDir) {
  fs.mkdirSync(outputDir, { recursive: true });

  const filesToExtract = files || discoverFiles(repoRoot);
  console.log(`Extracting ${filesToExtract.length} files...`);

  const results = [];
  let errors = 0;

  for (const f of filesToExtract) {
    const result = extractFile(f, repoRoot);
    if (result && !result.error) {
      results.push(result);
    } else {
      errors++;
    }
  }
  // Surface the failure count; a graph built from partial results would
  // otherwise look like a real change in the diff with no explanation.
  console.log(`Extraction complete: ${results.length} succeeded, ${errors} failed`);

  const newGraph = GraphStore.buildGraph(results);
  const newSnapshotPath = path.join(outputDir, 'snapshot.json');
  GraphStore.saveSnapshot(newGraph, newSnapshotPath);
  console.log(`New graph: ${newGraph.nodes.size} nodes, ${newGraph.edges.length} edges`);

  let diffPath = null;
  if (prevSnapshotPath && fs.existsSync(prevSnapshotPath)) {
    const oldGraph = GraphStore.loadSnapshot(prevSnapshotPath);
    const diff = semanticDiff(oldGraph, newGraph);
    console.log(formatSummary(diff));

    diffPath = path.join(outputDir, 'diff.json');
    fs.writeFileSync(diffPath, JSON.stringify({
      score: diff.score,
      severity: diff.severity,
      stats: diff.stats,
      categorized: diff.categorized,
    }, null, 2));
  } else if (prevSnapshotPath) {
    console.warn(`Previous snapshot not found at ${prevSnapshotPath}; skipping diff`);
  }

  return { newSnapshotPath, errors, diffPath };
}

/**
 * Benchmark: extract N random files, report timing.
 *
 * Discovers supported files under `repoRoot`, draws a uniform random sample of
 * up to `sampleCount` of them, extracts each while timing it with Date.now(),
 * and prints a report (totals, average, P50/P95, five slowest files).
 *
 * @param {string} repoRoot - Root directory to scan.
 * @param {number} [sampleCount=10] - Number of files to sample. Values that
 *   are negative or not a number sample nothing; values above the number of
 *   discovered files sample every file.
 * @returns {{totalFiles: number, sampled: number, errors: number,
 *   totalEntities: number, totalRelationships: number, totalTime: number,
 *   avgTime: number, p50: number, p95: number}} Summary; times are in ms.
 */
function benchmark(repoRoot, sampleCount = 10) {
  const allFiles = discoverFiles(repoRoot);
  console.log(`Total supported files: ${allFiles.length}`);

  // Clamp the request to [0, allFiles.length]. A negative count passed straight
  // to slice(0, n) would select "all but the last |n|" files.
  const requested = Math.floor(Number(sampleCount));
  const sampleSize = Number.isNaN(requested)
    ? 0
    : Math.max(0, Math.min(requested, allFiles.length));

  // Shuffle and pick N: partial Fisher-Yates. Sorting with a random comparator
  // is biased and violates the comparator consistency that sort() requires.
  const pool = [...allFiles];
  for (let i = 0; i < sampleSize; i++) {
    const j = i + Math.floor(Math.random() * (pool.length - i));
    [pool[i], pool[j]] = [pool[j], pool[i]];
  }
  const samples = pool.slice(0, sampleSize);
  console.log(`Benchmarking ${samples.length} files...\n`);

  const timings = [];
  let totalEntities = 0;
  let totalRelationships = 0;
  let errors = 0;

  for (const file of samples) {
    const start = Date.now();
    const result = extractFile(file, repoRoot);
    const elapsed = Date.now() - start;
    const relativeFile = path.relative(repoRoot, file);

    if (result && !result.error) {
      const entities = result.entities.length;
      const relationships = result.relationships.length;
      timings.push({ file: relativeFile, timeMs: elapsed, entities, relationships });
      totalEntities += entities;
      totalRelationships += relationships;
    } else {
      errors++;
      timings.push({ file: relativeFile, timeMs: elapsed, entities: 0, relationships: 0, error: true });
    }
  }

  // Sort by time descending
  timings.sort((a, b) => b.timeMs - a.timeMs);

  const totalTime = timings.reduce((s, t) => s + t.timeMs, 0);
  // Avoid 0 / 0 = NaN when nothing was sampled.
  const avgTime = timings.length > 0 ? Math.round(totalTime / timings.length) : 0;
  // The list is slowest-first, so the entry 5% of the way in is the P95 and
  // the entry halfway in is the P50.
  const p50 = timings[Math.floor(timings.length * 0.5)]?.timeMs || 0;
  const p95 = timings[Math.floor(timings.length * 0.05)]?.timeMs || 0;

  console.log('=== V2 Pipeline Benchmark ===');
  console.log(`Repo: ${repoRoot}`);
  console.log(`Files sampled: ${samples.length} / ${allFiles.length}`);
  console.log(`Errors: ${errors}`);
  console.log(`Total entities: ${totalEntities}`);
  console.log(`Total relationships: ${totalRelationships}`);
  console.log(`Total time: ${totalTime}ms`);
  console.log(`Avg time/file: ${avgTime}ms`);
  console.log(`P50: ${p50}ms | P95: ${p95}ms`);
  console.log('');
  console.log('Top 5 slowest:');
  for (const t of timings.slice(0, 5)) {
    console.log(`  ${t.timeMs}ms  ${t.file} (${t.entities}E/${t.relationships}R)${t.error ? ' ERROR' : ''}`);
  }

  return { totalFiles: allFiles.length, sampled: samples.length, errors, totalEntities, totalRelationships, totalTime, avgTime, p50, p95 };
}

// --- CLI ---
// Runs only when this file is executed directly (not when it is required).
// Usage:
//   node pipeline.js batch <repo-root> --output <dir>
//   node pipeline.js run <repo-root> [--snapshot <prev.json>] [--output <dir>]
//   node pipeline.js benchmark <repo-root> --samples <N>
if (require.main === module) {
  const args = process.argv.slice(2);
  const [command, repoRoot] = args;
  const DEFAULT_OUTPUT_DIR = '/tmp/pipeline-output';

  const USAGE = {
    batch: 'Usage: node pipeline.js batch <repo-root> --output <dir>',
    run: 'Usage: node pipeline.js run <repo-root> [--snapshot <prev.json>] [--output <dir>]',
    benchmark: 'Usage: node pipeline.js benchmark <repo-root> --samples <N>',
  };

  // Print a message on stderr and terminate with a failure status.
  const fail = (message) => {
    console.error(message);
    process.exit(1);
  };

  // Return the argument following `flag`, or `fallback` when the flag is
  // absent. A flag that is present without a value is a usage error; without
  // this check the value would be undefined (or the next flag) and fail later
  // with an unrelated error.
  const flagValue = (flag, fallback) => {
    const idx = args.indexOf(flag);
    if (idx < 0) return fallback;
    const value = args[idx + 1];
    if (value === undefined || value.startsWith('--')) {
      fail(`Missing value for ${flag}\n${USAGE[command]}`);
    }
    return value;
  };

  if (!Object.keys(USAGE).includes(command)) {
    fail('Unknown command. Available: batch, run, benchmark');
  }
  // A flag in the <repo-root> position means the positional argument is missing.
  if (!repoRoot || repoRoot.startsWith('--')) {
    fail(USAGE[command]);
  }

  if (command === 'batch') {
    batchExtract(repoRoot, flagValue('--output', DEFAULT_OUTPUT_DIR));

  } else if (command === 'run') {
    const prevSnapshot = flagValue('--snapshot', null);
    const outputDir = flagValue('--output', DEFAULT_OUTPUT_DIR);
    incrementalRun(repoRoot, null, prevSnapshot, outputDir);

  } else {
    // command === 'benchmark'
    const sampleCount = Number.parseInt(flagValue('--samples', '10'), 10);
    if (!Number.isInteger(sampleCount) || sampleCount < 1) {
      fail(`--samples must be a positive integer\n${USAGE.benchmark}`);
    }
    benchmark(repoRoot, sampleCount);
  }
}

module.exports = { discoverFiles, extractFile, batchExtract, incrementalRun, benchmark };
Dev Intel Pipeline v2 — multi-language semantic graph extractor Phase 1: extract.js — tree-sitter AST parser (TS/JS/Python/Go/Java/Bash) + config parsers (YAML/HCL) Phase 2: graph.js — in-memory directed graph store with build/query/diff CLI Phase 3: namespace.js — cross-repo namespace registry with 3-tier resolution Phase 4: semantic-diff.js — categorized diffs with impact scoring (0-100) Phase 5: pipeline.js — batch extraction, incremental diffing, benchmarking Benchmark: 4,325 files, 21,646 nodes, 133,979 edges in 67s (15ms/file) BMad SPA reviews: all phases GO 2026-03-09 05:29:29 +00:00			`const fs = require('fs');`
			`const path = require('path');`
			`const GraphStore = require('./graph.js');`
			`const { extract } = require('./extract.js');`
			`const { semanticDiff, formatSummary } = require('./semantic-diff.js');`

			`/**`
			`* Developer Intelligence Pipeline v2 - Pipeline Orchestrator`
			`* Batch extraction, incremental diffing, and benchmarking.`
			`* No external dependencies.`
			`*/`

			`const SUPPORTED_EXTS = new Set([`
			`'.ts', '.tsx', '.js', '.jsx', '.py', '.java', '.go', '.sh', '.bash',`
			`'.yaml', '.yml', '.tf', '.hcl',`
			`]);`

			`const IGNORE_DIRS = new Set([`
			`'node_modules', '.git', 'dist', 'build', '__pycache__', '.next',`
feat: confluence benchmark, pattern extractor, agent KB, UX spec - extract-patterns.js: mines layered arch, ArgoCD appsets, cloud regions, CIDR allocations, naming conventions, sync waves, tech stack from code - agent-kb.js: token-efficient JSON rendering of same doc tree - eval-confluence-ref-questions.json: 32 reference-only benchmark questions - wiggum-v2.sh: Ralph Wiggum loop targeting confluence baseline (77.8%) - docs/human-ux-spec.md: BMad UX designer spec for human doc structure - Eval results: V2 at 28.7% vs confluence 77.8% baseline - Hub/spoke ownership now correctly extracted (95% on that question) - Naming conventions, regions, CIDRs surfaced in system-architecture.md 2026-03-10 14:20:35 +00:00			`'.turbo', 'coverage', '.nyc_output', 'vendor', 'venv',`
			`'.codex', '.claude', '.cursor', '.gemini', '.kiro', '.agents',`
			`'_bmad', '_bmad-output', '.terraform', 'skills', '.ci', 'docs', '.workdir',`
Dev Intel Pipeline v2 — multi-language semantic graph extractor Phase 1: extract.js — tree-sitter AST parser (TS/JS/Python/Go/Java/Bash) + config parsers (YAML/HCL) Phase 2: graph.js — in-memory directed graph store with build/query/diff CLI Phase 3: namespace.js — cross-repo namespace registry with 3-tier resolution Phase 4: semantic-diff.js — categorized diffs with impact scoring (0-100) Phase 5: pipeline.js — batch extraction, incremental diffing, benchmarking Benchmark: 4,325 files, 21,646 nodes, 133,979 edges in 67s (15ms/file) BMad SPA reviews: all phases GO 2026-03-09 05:29:29 +00:00			`]);`

			`const SCRIPT_DIR = __dirname;`
			`const EXTRACT_JS = path.join(SCRIPT_DIR, 'extract.js');`

			`/**`
			`* Recursively discover supported files.`
			`*/`
			`function discoverFiles(dir) {`
			`const results = [];`
			`let entries;`
			`try {`
			`entries = fs.readdirSync(dir, { withFileTypes: true });`
			`} catch { return results; }`

			`for (const entry of entries) {`
			`if (IGNORE_DIRS.has(entry.name)) continue;`
			`const fullPath = path.join(dir, entry.name);`
			`if (entry.isDirectory()) {`
			`results.push(...discoverFiles(fullPath));`
			`} else if (entry.isFile() && SUPPORTED_EXTS.has(path.extname(entry.name))) {`
			`results.push(fullPath);`
			`}`
			`}`
			`return results;`
			`}`

			`/**`
			`* Extract a single file using in-process extract(), no subprocess.`
			`*/`
			`function extractFile(filePath, repoRoot) {`
			`try {`
			`return extract(filePath, repoRoot);`
			`} catch (err) {`
			`return null;`
			`}`
			`}`

			`/**`
			`* Batch extract all files, build graph, save snapshot.`
			`*/`
			`function batchExtract(repoRoot, outputDir) {`
			`const files = discoverFiles(repoRoot);`
			console.log(`Discovered ${files.length} supported files in ${repoRoot}`);

			`fs.mkdirSync(outputDir, { recursive: true });`

			`const results = [];`
			`let errors = 0;`
			`const startTime = Date.now();`

			`for (let i = 0; i < files.length; i++) {`
			`const result = extractFile(files[i], repoRoot);`
			`if (result && !result.error) {`
			`results.push(result);`
			`} else {`
			`errors++;`
			`}`
			`if ((i + 1) % 100 === 0) {`
			console.log(` Extracted ${i + 1}/${files.length}...`);
			`}`
			`}`

			`const extractTime = Date.now() - startTime;`
			console.log(`Extraction complete: ${results.length} succeeded, ${errors} failed (${extractTime}ms)`);

			`const graph = GraphStore.buildGraph(results);`
			`const snapshotPath = path.join(outputDir, 'snapshot.json');`
			`GraphStore.saveSnapshot(graph, snapshotPath);`
			console.log(`Graph: ${graph.nodes.size} nodes, ${graph.edges.length} edges. Saved to ${snapshotPath}`);

			`// Save stats`
			`const stats = {`
			`repoRoot,`
			`filesDiscovered: files.length,`
			`filesExtracted: results.length,`
			`errors,`
			`nodes: graph.nodes.size,`
			`edges: graph.edges.length,`
			`extractionTimeMs: extractTime,`
			`avgTimePerFileMs: Math.round(extractTime / files.length),`
			`timestamp: new Date().toISOString(),`
			`};`
			`fs.writeFileSync(path.join(outputDir, 'stats.json'), JSON.stringify(stats, null, 2));`
			console.log(`Stats saved. Avg ${stats.avgTimePerFileMs}ms/file`);

			`return { graph, snapshotPath, stats };`
			`}`

			`/**`
			`* Incremental run: extract files, diff against previous snapshot.`
			`*/`
			`function incrementalRun(repoRoot, files, prevSnapshotPath, outputDir) {`
			`fs.mkdirSync(outputDir, { recursive: true });`

			`const filesToExtract = files \|\| discoverFiles(repoRoot);`
			console.log(`Extracting ${filesToExtract.length} files...`);

			`const results = [];`
			`let errors = 0;`

			`for (const f of filesToExtract) {`
			`const result = extractFile(f, repoRoot);`
			`if (result && !result.error) {`
			`results.push(result);`
			`} else {`
			`errors++;`
			`}`
			`}`

			`const newGraph = GraphStore.buildGraph(results);`
			`const newSnapshotPath = path.join(outputDir, 'snapshot.json');`
			`GraphStore.saveSnapshot(newGraph, newSnapshotPath);`
			console.log(`New graph: ${newGraph.nodes.size} nodes, ${newGraph.edges.length} edges`);

			`if (prevSnapshotPath && fs.existsSync(prevSnapshotPath)) {`
			`const oldGraph = GraphStore.loadSnapshot(prevSnapshotPath);`
			`const diff = semanticDiff(oldGraph, newGraph);`
			`console.log(formatSummary(diff));`

			`fs.writeFileSync(path.join(outputDir, 'diff.json'), JSON.stringify({`
			`score: diff.score,`
			`severity: diff.severity,`
			`stats: diff.stats,`
			`categorized: diff.categorized,`
			`}, null, 2));`
			`}`

			`return { newSnapshotPath };`
			`}`

			`/**`
			`* Benchmark: extract N random files, report timing.`
			`*/`
			`function benchmark(repoRoot, sampleCount) {`
			`const allFiles = discoverFiles(repoRoot);`
			console.log(`Total supported files: ${allFiles.length}`);

			`// Shuffle and pick N`
			`const shuffled = allFiles.sort(() => Math.random() - 0.5);`
			`const samples = shuffled.slice(0, Math.min(sampleCount, allFiles.length));`
			console.log(`Benchmarking ${samples.length} files...\n`);

			`const timings = [];`
			`let totalEntities = 0;`
			`let totalRelationships = 0;`
			`let errors = 0;`

			`for (const file of samples) {`
			`const start = Date.now();`
			`const result = extractFile(file, repoRoot);`
			`const elapsed = Date.now() - start;`

			`if (result && !result.error) {`
			`timings.push({ file: path.relative(repoRoot, file), timeMs: elapsed, entities: result.entities.length, relationships: result.relationships.length });`
			`totalEntities += result.entities.length;`
			`totalRelationships += result.relationships.length;`
			`} else {`
			`errors++;`
			`timings.push({ file: path.relative(repoRoot, file), timeMs: elapsed, entities: 0, relationships: 0, error: true });`
			`}`
			`}`

			`// Sort by time descending`
			`timings.sort((a, b) => b.timeMs - a.timeMs);`

			`const totalTime = timings.reduce((s, t) => s + t.timeMs, 0);`
			`const avgTime = Math.round(totalTime / timings.length);`
			`const p50 = timings[Math.floor(timings.length * 0.5)]?.timeMs \|\| 0;`
			`const p95 = timings[Math.floor(timings.length * 0.05)]?.timeMs \|\| 0;`

			`console.log('=== V2 Pipeline Benchmark ===');`
			console.log(`Repo: ${repoRoot}`);
			console.log(`Files sampled: ${samples.length} / ${allFiles.length}`);
			console.log(`Errors: ${errors}`);
			console.log(`Total entities: ${totalEntities}`);
			console.log(`Total relationships: ${totalRelationships}`);
			console.log(`Total time: ${totalTime}ms`);
			console.log(`Avg time/file: ${avgTime}ms`);
			console.log(`P50: ${p50}ms \| P95: ${p95}ms`);
			`console.log('');`
			`console.log('Top 5 slowest:');`
			`for (const t of timings.slice(0, 5)) {`
			console.log(` ${t.timeMs}ms ${t.file} (${t.entities}E/${t.relationships}R)${t.error ? ' ERROR' : ''}`);
			`}`

			`return { totalFiles: allFiles.length, sampled: samples.length, errors, totalEntities, totalRelationships, totalTime, avgTime, p50, p95 };`
			`}`

			`// --- CLI ---`
			`if (require.main === module) {`
			`const args = process.argv.slice(2);`
			`const command = args[0];`

			`if (command === 'batch') {`
			`const repoRoot = args[1];`
			`const outputIdx = args.indexOf('--output');`
			`const outputDir = outputIdx >= 0 ? args[outputIdx + 1] : '/tmp/pipeline-output';`

			`if (!repoRoot) {`
			`console.error('Usage: node pipeline.js batch <repo-root> --output <dir>');`
			`process.exit(1);`
			`}`
			`batchExtract(repoRoot, outputDir);`

			`} else if (command === 'run') {`
			`const repoRoot = args[1];`
			`const snapshotIdx = args.indexOf('--snapshot');`
			`const prevSnapshot = snapshotIdx >= 0 ? args[snapshotIdx + 1] : null;`
			`const outputIdx = args.indexOf('--output');`
			`const outputDir = outputIdx >= 0 ? args[outputIdx + 1] : '/tmp/pipeline-output';`

			`if (!repoRoot) {`
			`console.error('Usage: node pipeline.js run <repo-root> [--snapshot <prev.json>] [--output <dir>]');`
			`process.exit(1);`
			`}`
			`incrementalRun(repoRoot, null, prevSnapshot, outputDir);`

			`} else if (command === 'benchmark') {`
			`const repoRoot = args[1];`
			`const samplesIdx = args.indexOf('--samples');`
			`const sampleCount = samplesIdx >= 0 ? parseInt(args[samplesIdx + 1], 10) : 10;`

			`if (!repoRoot) {`
			`console.error('Usage: node pipeline.js benchmark <repo-root> --samples <N>');`
			`process.exit(1);`
			`}`
			`benchmark(repoRoot, sampleCount);`

			`} else {`
			`console.error('Unknown command. Available: batch, run, benchmark');`
			`process.exit(1);`
			`}`
			`}`

			`module.exports = { discoverFiles, extractFile, batchExtract, incrementalRun, benchmark };`