From 4221ab4d7626fb6acef26f512439b91048357166 Mon Sep 17 00:00:00 2001
From: Jarvis Prime <bgalura+jarvis@gmail.com>
Date: Mon, 9 Mar 2026 06:20:54 +0000
Subject: [PATCH] Phase 6: LLM doc generation + Phase 7 system-docs spec

---
 docgen.js                 | 295 ++++++++++++++++++++++++++++++++++++++
 specs/system-docs-spec.md | 228 +++++++++++++++++++++++++++++
 2 files changed, 523 insertions(+)
 create mode 100644 docgen.js
 create mode 100644 specs/system-docs-spec.md

diff --git a/docgen.js b/docgen.js
new file mode 100644
index 0000000..e420727
--- /dev/null
+++ b/docgen.js
@@ -0,0 +1,295 @@
+const fs = require('fs');
+const path = require('path');
+const http = require('http');
+const https = require('https');
+const GraphStore = require('./graph.js');
+const { semanticDiff, formatSummary } = require('./semantic-diff.js');
+
+/**
+ * Developer Intelligence Pipeline v2 - Phase 6: LLM Doc Generation
+ * Uses semantic diff context to generate targeted, intelligent documentation.
+ * Supports Ollama and OpenAI-compatible APIs.
+ */
+
+const LLM_URL = process.env.LLM_URL || 'http://192.168.86.172:11434'; // base URL; endpoint paths are resolved against it
+const LLM_MODEL = process.env.LLM_MODEL || 'qwen2.5:7b'; // model name sent in every request body
+const LLM_BACKEND = process.env.LLM_BACKEND || 'ollama'; // 'ollama'; any other value selects the OpenAI-compatible API
+
+/**
+ * Call LLM API (Ollama or OpenAI-compatible).
+ */
+function callLLM(prompt, maxTokens = 1024) {
+  return new Promise((resolve, reject) => {
+    let url, body, headers;
+
+    if (LLM_BACKEND === 'ollama') {
+      url = new URL('/api/generate', LLM_URL);
+      body = JSON.stringify({
+        model: LLM_MODEL,
+        prompt,
+        stream: false,
+        options: { num_predict: maxTokens, temperature: 0.3 },
+      });
+      headers = { 'Content-Type': 'application/json' };
+    } else {
+      url = new URL('/v1/chat/completions', LLM_URL);
+      body = JSON.stringify({
+        model: LLM_MODEL,
+        messages: [{ role: 'user', content: prompt }],
+        max_tokens: maxTokens,
+        temperature: 0.3,
+      });
+      headers = {
+        'Content-Type': 'application/json',
+        'Authorization': `Bearer ${process.env.OPENAI_API_KEY || 'not-needed'}`,
+      };
+    }
+
+    const client = url.protocol === 'https:' ? https : http;
+    const req = client.request(url, { method: 'POST', headers }, (res) => {
+      let data = '';
+      res.on('data', (chunk) => data += chunk);
+      res.on('end', () => {
+        try {
+          const parsed = JSON.parse(data);
+          if (LLM_BACKEND === 'ollama') {
+            resolve(parsed.response || '');
+          } else {
+            resolve(parsed.choices?.[0]?.message?.content || '');
+          }
+        } catch (e) {
+          reject(new Error(`LLM parse error: ${e.message}`));
+        }
+      });
+    });
+    req.on('error', reject);
+    req.setTimeout(60000, () => { req.destroy(); reject(new Error('LLM timeout')); });
+    req.write(body);
+    req.end();
+  });
+}
+
+/**
+ * Generate documentation for a single entity using graph context.
+ */
+async function generateEntityDoc(entityId, graph, sourceCode) {
+  const entity = graph.nodes.get(entityId);
+  if (!entity) return null;
+
+  // Gather context: callers, callees, container
+  const outgoing = graph.edges.filter(e => e.source === entityId);
+  const incoming = graph.edges.filter(e => e.target === entityId);
+
+  const calls = outgoing.filter(e => e.type === 'CALLS').map(e => e.target);
+  const calledBy = incoming.filter(e => e.type === 'CALLS').map(e => e.source);
+  const containedBy = incoming.filter(e => e.type === 'CONTAINS').map(e => e.source);
+
+  // Extract source snippet if available
+  let snippet = '';
+  if (sourceCode && entity.line_range) {
+    const lines = sourceCode.split('\n');
+    const [start, end] = entity.line_range;
+    snippet = lines.slice(start - 1, Math.min(end, start + 50)).join('\n');
+    if (end - start > 50) snippet += '\n// ... truncated';
+  }
+
+  const prompt = `You are a senior engineer writing concise documentation.
+
+Describe what this ${entity.type.toLowerCase()} does in 2-3 sentences. Be specific about domain logic, not syntax.
+
+Entity: ${entity.name} (${entity.type}, ${entity.kind})
+Visibility: ${entity.visibility}
+Location: ${entityId}
+${containedBy.length > 0 ? `Part of: ${containedBy.join(', ')}` : ''}
+${calls.length > 0 ? `Calls: ${calls.slice(0, 10).join(', ')}${calls.length > 10 ? ` (+${calls.length - 10} more)` : ''}` : ''}
+${calledBy.length > 0 ? `Called by: ${calledBy.slice(0, 10).join(', ')}${calledBy.length > 10 ? ` (+${calledBy.length - 10} more)` : ''}` : ''}
+
+${snippet ? `Source:\n\`\`\`\n${snippet}\n\`\`\`` : ''}
+
+Documentation:`;
+
+  return callLLM(prompt);
+}
+
+/**
+ * Generate a change summary using semantic diff context.
+ */
+async function generateDiffDoc(diff) {
+  const summary = formatSummary(diff);
+
+  const prompt = `You are a senior engineer writing a changelog entry for a code review.
+
+Given this semantic diff, write a concise 3-5 sentence summary suitable for a PR description or release note. Focus on:
+- What changed and why it matters
+- Breaking changes that need attention
+- Impact on downstream consumers
+
+Semantic Diff:
+${summary}
+
+Changelog entry:`;
+
+  return callLLM(prompt, 512);
+}
+
+/**
+ * Generate docs for all public entities in a file.
+ */
+async function generateFileDocs(filePath, graph, repoRoot) {
+  const entityIds = graph.fileIndex.get(filePath);
+  if (!entityIds) return [];
+
+  let sourceCode = '';
+  try {
+    sourceCode = fs.readFileSync(filePath, 'utf8');
+  } catch {}
+
+  const docs = [];
+  for (const id of entityIds) {
+    const entity = graph.nodes.get(id);
+    if (!entity || entity.visibility !== 'public' || entity.type === 'Dependency') continue;
+
+    try {
+      const doc = await generateEntityDoc(id, graph, sourceCode);
+      if (doc) {
+        docs.push({ entityId: id, name: entity.name, type: entity.type, doc });
+      }
+    } catch (err) {
+      console.error(`  Failed to generate doc for ${id}: ${err.message}`);
+    }
+  }
+
+  return docs;
+}
+
+/**
+ * Batch generate docs for changed entities in a diff.
+ */
+async function generateDiffDocs(diff, oldGraph, newGraph, repoRoot) {
+  const results = { changeSummary: '', entityDocs: [] };
+
+  // Generate overall change summary
+  try {
+    results.changeSummary = await generateDiffDoc(diff);
+  } catch (err) {
+    console.error(`Failed to generate change summary: ${err.message}`);
+  }
+
+  // Generate docs for new/modified public entities
+  const entitiesToDoc = [];
+  for (const item of diff.categorized.significant) {
+    if (item.entity && item.entity.visibility === 'public') {
+      entitiesToDoc.push(item.entity.id);
+    }
+    if (item.new && item.new.visibility === 'public') {
+      entitiesToDoc.push(item.new.id);
+    }
+  }
+
+  for (const id of entitiesToDoc) {
+    const entity = newGraph.nodes.get(id);
+    if (!entity || entity.type === 'Dependency') continue;
+
+    let sourceCode = '';
+    if (entity._file) {
+      try { sourceCode = fs.readFileSync(entity._file, 'utf8'); } catch {}
+    }
+
+    try {
+      const doc = await generateEntityDoc(id, newGraph, sourceCode);
+      if (doc) {
+        results.entityDocs.push({ entityId: id, name: entity.name, type: entity.type, doc });
+      }
+    } catch (err) {
+      console.error(`  Failed to generate doc for ${id}: ${err.message}`);
+    }
+  }
+
+  return results;
+}
+
+// --- CLI ---
+if (require.main === module) {
+  const args = process.argv.slice(2);
+  const command = args[0];
+
+  if (command === 'entity') {
+    const snapshotPath = args[1];
+    const entityId = args[2];
+    const filePath = args[3]; // optional source file
+
+    if (!snapshotPath || !entityId) {
+      console.error('Usage: node docgen.js entity <snapshot.json> <entityId> [source-file]');
+      process.exit(1);
+    }
+
+    const graph = GraphStore.loadSnapshot(snapshotPath);
+    let source = '';
+    if (filePath) {
+      try { source = fs.readFileSync(filePath, 'utf8'); } catch {}
+    }
+
+    generateEntityDoc(entityId, graph, source).then(doc => {
+      console.log(doc);
+    }).catch(err => {
+      console.error(err.message);
+      process.exit(1);
+    });
+
+  } else if (command === 'diff') {
+    const oldPath = args[1];
+    const newPath = args[2];
+
+    if (!oldPath || !newPath) {
+      console.error('Usage: node docgen.js diff <old-snapshot.json> <new-snapshot.json>');
+      process.exit(1);
+    }
+
+    const oldGraph = GraphStore.loadSnapshot(oldPath);
+    const newGraph = GraphStore.loadSnapshot(newPath);
+    const diff = semanticDiff(oldGraph, newGraph);
+
+    generateDiffDocs(diff, oldGraph, newGraph).then(results => {
+      console.log('=== Change Summary ===');
+      console.log(results.changeSummary);
+      console.log('');
+      if (results.entityDocs.length > 0) {
+        console.log('=== Entity Documentation ===');
+        for (const d of results.entityDocs) {
+          console.log(`\n[${d.type}] ${d.name} (${d.entityId})`);
+          console.log(d.doc);
+        }
+      }
+    }).catch(err => {
+      console.error(err.message);
+      process.exit(1);
+    });
+
+  } else if (command === 'file') {
+    const snapshotPath = args[1];
+    const filePath = args[2];
+
+    if (!snapshotPath || !filePath) {
+      console.error('Usage: node docgen.js file <snapshot.json> <source-file>');
+      process.exit(1);
+    }
+
+    const graph = GraphStore.loadSnapshot(snapshotPath);
+    generateFileDocs(filePath, graph).then(docs => {
+      for (const d of docs) {
+        console.log(`\n[${d.type}] ${d.name} (${d.entityId})`);
+        console.log(d.doc);
+      }
+      if (docs.length === 0) console.log('No public entities found in file.');
+    }).catch(err => {
+      console.error(err.message);
+      process.exit(1);
+    });
+
+  } else {
+    console.error('Unknown command. Available: entity, diff, file');
+    process.exit(1);
+  }
+}
+
+module.exports = { generateEntityDoc, generateDiffDoc, generateFileDocs, generateDiffDocs, callLLM }; // programmatic API; the CLI block runs only when this file is executed directly
diff --git a/specs/system-docs-spec.md b/specs/system-docs-spec.md
new file mode 100644
index 0000000..ae84d0c
--- /dev/null
+++ b/specs/system-docs-spec.md
@@ -0,0 +1,228 @@
+# Dev Intel Pipeline v2 — Phase 7: System-Level Documentation Generation
+
+- **Status:** DRAFT
+- **Author:** Max (AI) + Brian (Human)
+- **Date:** 2026-03-09
+- **Depends on:** Phases 1-6 (extract, graph, namespace, semantic-diff, pipeline, docgen)
+
+---
+
+## Problem Statement
+
+The V2 pipeline generates accurate file-level documentation ("this module exports X, depends on Y, calls Z"). But real platform documentation — like the Foxtrot Confluence docs — operates at the *system level*: subsystem architecture, cross-subsystem data flows, configuration contracts, deployment pipelines, and layered dependency narratives.
+
+File-level docs are reference material. System-level docs are what engineers actually read to understand how things work.
+
+## Goal
+
+Extend the V2 pipeline to generate Foxtrot-quality system documentation from the code knowledge graph, organized in the Divio documentation framework (Tutorials, How-To, Reference, Explanation).
+
+## Success Criteria
+
+| Metric | Target |
+|--------|--------|
+| Subsystem detection accuracy | ≥90% of modules correctly clustered |
+| Cross-subsystem dependency completeness | ≥85% of actual inter-subsystem edges captured |
+| Contract extraction recall | ≥80% of exported interfaces/types extracted |
+| Generated doc structure | Matches Divio 4-category template |
+| Incremental update precision | Only subsystems touched by semantic diff get regenerated |
+| LLM cost per full generation | ≤$2 (using local Ollama for drafting) |
+
+## Architecture
+
+### 7A: Subsystem Aggregator (`subsystem.js`)
+
+**Purpose:** Group file-level entities into logical subsystems and compute inter-subsystem relationships.
+
+**Clustering Strategy (tiered):**
+
+1. **Directory-based (default):** Each top-level directory under `src/` is one subsystem (e.g. `gateway/`, `agents/`, `cli/`, `telegram/`). Simple, deterministic, zero-config.
+
+2. **Config-driven (override):** Optional `subsystems.yaml` that maps directories to named subsystems with human labels and grouping overrides.
+   ```yaml
+   subsystems:
+     - name: Gateway
+       label: "Session & Request Gateway"
+       paths: ["gateway/", "routing/"]
+     - name: Agents
+       label: "AI Agent Runtime"
+       paths: ["agents/", "auto-reply/"]
+     - name: Channels
+       label: "Channel Adapters"
+       paths: ["telegram/", "discord/", "slack/", "signal/", "whatsapp/"]
+   ```
+
+3. **Graph-based (future):** Community detection (Louvain/label propagation) on the CALLS+IMPORTS graph to find natural clusters. Useful for repos without clean directory boundaries.
+
+**Output:**
+```json
+{
+  "subsystems": [
+    {
+      "name": "gateway",
+      "label": "Session & Request Gateway",
+      "files": ["gateway/session-utils.ts", "gateway/server.ts", ...],
+      "entities": { "functions": 142, "classes": 3, "modules": 28 },
+      "publicExports": ["deriveSessionTitle", "loadSessionEntry", ...],
+      "internalDeps": [{"from": "gateway", "to": "agents", "edges": 89, "type": "CALLS"}],
+      "externalDeps": ["commander", "node:fs", "node:path"]
+    }
+  ],
+  "dependencyMatrix": {
+    "gateway→agents": { "calls": 89, "imports": 34 },
+    "agents→config": { "calls": 156, "imports": 120 },
+    ...
+  }
+}
+```
+
+### 7B: Contract Extractor (`contracts.js`)
+
+**Purpose:** Extract TypeScript interfaces, type aliases, enums, and config schemas as first-class graph entities.
+
+**What to extract:**
+- `interface Foo { ... }` → entity type `Interface`, with fields as properties
+- `type Foo = { ... }` → entity type `TypeAlias`
+- `enum Foo { ... }` → entity type `Enum`, with members
+- Exported `const` objects used as config defaults → entity type `ConfigContract`
+- YAML schema keys (from config files) → entity type `ConfigSchema`
+
+**Relationships:**
+- `IMPLEMENTS` — class → interface
+- `ACCEPTS` — function parameter → interface/type (function signature contracts)
+- `RETURNS` — function → return type
+- `EXTENDS` — interface → interface
+
+**Why this matters:**
+Foxtrot docs define explicit contracts: "`accountCreation` expects `reltioCustomerId: string`". Without extracting interfaces/types, we can't generate contract documentation. The LLM has to guess from function bodies, which is unreliable.
+
+### 7C: Flow Tracer (`flow.js`)
+
+**Purpose:** Given an entry point, walk the call graph across subsystem boundaries and produce a sequenced narrative of the data flow.
+
+**Algorithm:**
+1. Start at entry point entity (e.g., `telegram/bot-handlers.ts:onMessage`)
+2. BFS/DFS through CALLS edges, recording subsystem transitions
+3. At each subsystem boundary crossing, record: source subsystem → target subsystem, via which function call
+4. Prune: stop at depth N (configurable, default 5), skip test files, skip utility functions below a connectivity threshold
+5. Output: ordered list of subsystem hops with the specific function calls that cross boundaries
+
+**Output:**
+```json
+{
+  "entryPoint": "telegram/bot-handlers.ts:onMessage",
+  "flow": [
+    { "subsystem": "telegram", "function": "onMessage", "action": "receives incoming message" },
+    { "subsystem": "routing", "function": "routeInbound", "action": "routes to session handler", "crossedVia": "CALLS" },
+    { "subsystem": "gateway", "function": "handleSession", "action": "loads session state", "crossedVia": "CALLS" },
+    { "subsystem": "agents", "function": "runAgent", "action": "executes AI agent turn", "crossedVia": "CALLS" }
+  ]
+}
+```
+
+**LLM narration:** Feed the flow trace + source snippets at each hop to the LLM. Ask it to write a prose narrative: "When a Telegram message arrives, the bot handler dispatches it to the routing layer, which resolves the session key and..."
+
+### 7D: Hierarchical Doc Generator (`sysdoc.js`)
+
+**Purpose:** Orchestrate 7A-7C to produce a complete documentation site in Divio structure.
+
+**Output structure:**
+```
+docs/
+├── tutorials/
+│   └── (not auto-generated — requires human curation)
+├── how-to/
+│   └── (generated from flow traces of common operations)
+├── reference/
+│   ├── system-architecture.md      ← from subsystem aggregator + dependency matrix
+│   ├── subsystems/
+│   │   ├── gateway.md              ← per-subsystem: purpose, exports, deps, key modules
+│   │   ├── agents.md
+│   │   └── ...
+│   ├── contracts/
+│   │   ├── session-types.md        ← from contract extractor
+│   │   └── ...
+│   └── modules/
+│       └── (existing file-level docs from Phase 6)
+└── explanation/
+    ├── architecture-patterns.md    ← from dependency matrix analysis
+    ├── data-flows.md               ← from flow tracer
+    └── design-decisions.md         ← (requires human input or commit history analysis)
+```
+
+**Generation pipeline:**
+1. Run subsystem aggregator → subsystem map + dependency matrix
+2. Run contract extractor → interface/type entities added to graph
+3. Run flow tracer on configured entry points → flow narratives
+4. For each subsystem: generate reference doc (LLM with subsystem context)
+5. Generate system architecture overview (LLM with full dependency matrix)
+6. Generate data flow explanations (LLM with flow traces)
+
+**Incremental updates:**
+- Semantic diff identifies changed files
+- Map changed files → affected subsystems
+- Only regenerate docs for affected subsystems
+- System architecture overview regenerated only if dependency matrix changed
+
+### Template System
+
+Each doc type has a Markdown template with slots:
+
+```markdown
+# {{subsystem.label}}
+
+## Purpose
+{{llm_generated_purpose}}
+
+## Key Modules
+{{for module in subsystem.topModules}}
+- `{{module.name}}` — {{module.doc}}
+{{endfor}}
+
+## Public API
+{{for export in subsystem.publicExports}}
+- `{{export.name}}({{export.params}})` → `{{export.returnType}}`
+{{endfor}}
+
+## Dependencies
+{{dependency_table}}
+
+## Data Flows
+{{for flow in subsystem.flows}}
+### {{flow.name}}
+{{flow.narrative}}
+{{endfor}}
+```
+
+## Implementation Phases
+
+| Phase | Module | Effort | Depends On |
+|-------|--------|--------|------------|
+| 7A | `subsystem.js` | 1 day | graph.js |
+| 7B | `contracts.js` | 1-2 days | extract.js (new tree-sitter queries) |
+| 7C | `flow.js` | 1 day | graph.js, subsystem.js |
+| 7D | `sysdoc.js` | 1-2 days | 7A, 7B, 7C, docgen.js |
+
+- **Critical path:** 7A → 7C → 7D (the flow tracer needs subsystem boundaries)
+- **Parallel:** 7B can run in parallel with 7A/7C
+
+## Constraints
+
+- No new external dependencies (same as Phases 1-5)
+- LLM calls only for prose generation — all structural analysis is deterministic
+- tree-sitter@0.21.1 compatibility maintained
+- Templates are Markdown with simple mustache-style slots (no template engine dependency — string replacement)
+- Must work on OpenClaw codebase (4,325 files) as primary benchmark
+- Foxtrot repos are not available in this environment — design must work from any repo's graph snapshot
+
+## Open Questions
+
+1. **Tutorials:** Should we attempt to auto-generate tutorials from flow traces, or leave that as human-only? Foxtrot tutorials are task-oriented ("Create your first VPC"), which requires domain knowledge the graph doesn't have.
+
+2. **Design decisions:** Can we infer design decisions from commit history + semantic diffs? ("We switched from X to Y in v2026.3.1 because...") Or is this always human-authored?
+
+3. **Cross-repo:** For Foxtrot's 14-repo setup, do we generate one unified doc site or per-repo docs with cross-links? The namespace registry (Phase 3) handles entity linking, but the doc generator needs to know the boundary.
+
+4. **Diagram generation:** Should we auto-generate Mermaid diagrams from the dependency matrix and flow traces? (We have the mermaid-renderer skill.)
+
+5. **Config contract depth:** How deep do we go on YAML/HCL config extraction? Just top-level keys, or full schema with types and defaults?