Files
dev-intel-poc/ingest.py
Jarvis Prime af2e54b5f3 Initial POC: Developer Intelligence knowledge graph
- SQLite backend with file/repo/relationship entities
- tree-sitter Go AST parser for deterministic import detection
- Ollama doc generation with concurrent batch processing
- MCP server (FastMCP) for Claude Code integration
- Merge simulation with staleness cascade
- Lazy refresh for stale relationship and repo docs
- CLAUDE.md for agent context
2026-03-04 04:25:14 +00:00

150 lines
5.3 KiB
Python

"""Main ingestion script. Clone repo, parse, generate docs, load SQLite."""
import os
import subprocess
import time
import json
from pathlib import Path
from parser import parse_go_file, filter_imports, get_repo_module, resolve_import_to_file
from docgen import generate_file_doc, generate_repo_doc, generate_docs_batch
from db import GraphDB
# Repository to ingest; override with the TARGET_REPO env var.
# Defaults to the Echo web framework (Go), which the POC was built against.
TARGET_REPO = os.environ.get("TARGET_REPO", "https://github.com/labstack/echo.git")
# Local checkout location; override with REPO_DIR. Defaults to
# repos/target next to this script.
REPO_DIR = os.environ.get("REPO_DIR", os.path.join(os.path.dirname(__file__), "repos", "target"))
def clone_repo():
    """Ensure the target repository is checked out locally.

    Pulls the latest changes when REPO_DIR already contains a git checkout;
    otherwise shallow-clones TARGET_REPO into it.

    Returns:
        The full HEAD commit hash of the local checkout as a string.
    """
    checkout = Path(REPO_DIR)
    # A .git directory can only exist if the checkout directory does, so one
    # test covers both conditions.
    if (checkout / ".git").exists():
        print(f"Repo already cloned at {REPO_DIR}, pulling latest...")
        subprocess.run(["git", "-C", REPO_DIR, "pull"], check=True)
    else:
        print(f"Cloning {TARGET_REPO}...")
        checkout.parent.mkdir(parents=True, exist_ok=True)
        subprocess.run(["git", "clone", "--depth=1", TARGET_REPO, REPO_DIR], check=True)
    head = subprocess.run(
        ["git", "-C", REPO_DIR, "rev-parse", "HEAD"],
        capture_output=True, text=True, check=True,
    )
    return head.stdout.strip()
def discover_go_files(repo_dir=None) -> dict[str, str]:
    """Collect non-test, non-vendored Go source files under the repository.

    Args:
        repo_dir: Directory to scan. Defaults to the module-level REPO_DIR,
            preserving the original zero-argument call.

    Returns:
        Mapping of repo-relative POSIX-style paths (always '/'-separated, so
        downstream edge resolution can rely on them) to file contents.
        Vendored code (any directory component named exactly "vendor") and
        Go test files (*_test.go) are skipped, as are unreadable files.
    """
    root = Path(repo_dir if repo_dir is not None else REPO_DIR)
    files: dict[str, str] = {}
    for gofile in root.rglob("*.go"):
        rel_path = gofile.relative_to(root)
        # Component-exact checks: the old substring test ("vendor/" in rel)
        # wrongly skipped unrelated directories such as "thevendor/", and
        # "_test.go" anywhere in the path could match a directory name.
        if "vendor" in rel_path.parts[:-1] or gofile.name.endswith("_test.go"):
            continue
        rel = rel_path.as_posix()
        try:
            files[rel] = gofile.read_text(errors="replace")
        except OSError as e:
            # Best-effort: report and keep going (broken symlinks, permissions).
            print(f" Skipping {rel}: {e}")
    return files
def _parse_files(go_files: dict[str, str], repo_module: str):
    """Parse each file's AST and filter its imports; returns (parsed, imports)."""
    parsed = {}
    all_imports = {}
    for rel_path, content in go_files.items():
        info = parse_go_file(rel_path, content, repo_module)
        parsed[rel_path] = info
        all_imports[rel_path] = filter_imports(info.imports, repo_module)
    return parsed, all_imports


def _create_import_edges(db, all_imports: dict[str, list], go_files: dict[str, str],
                         repo_module: str) -> int:
    """Resolve first-party imports to concrete files and store import edges.

    A Go import names a package (directory), so each resolved import links the
    importing file to every .go file in the target directory. Returns the
    number of edges created.
    """
    edges_created = 0
    for from_file, imports in all_imports.items():
        for imp in imports:
            if not imp.startswith(repo_module):
                continue  # not first-party; no node for it in the graph
            target_dir = resolve_import_to_file(imp, repo_module, go_files)
            if not target_dir:
                continue
            # Find actual files in that directory
            for fpath in go_files:
                fdir = str(Path(fpath).parent)
                if fdir == target_dir or fdir.endswith("/" + target_dir):
                    db.create_relationship(from_file, fpath)
                    edges_created += 1
    return edges_created


def _collect_entry_files(go_files: dict[str, str]) -> list[tuple[str, str]]:
    """Pick well-known entry-point files to ground the repo-level doc.

    NOTE(review): candidates are Echo-specific filenames; other target repos
    will simply match fewer (or zero) of them.
    """
    entry_candidates = ["echo.go", "router.go", "context.go", "group.go", "middleware.go"]
    entry_files = []
    for candidate in entry_candidates:
        for path, content in go_files.items():
            if path.endswith(candidate):
                entry_files.append((path, content))
                break  # first match per candidate only
    return entry_files


def run():
    """Run the full ingestion pipeline and print a summary.

    Steps: clone the target repo, discover Go sources, parse ASTs with
    tree-sitter, generate per-file docs via Ollama, load nodes and import
    edges into SQLite, then generate a repo-level doc. The DB connection is
    closed even if a step raises.
    """
    start = time.time()
    print("=" * 60)
    print("Developer Intelligence POC — Ingestion")
    print("=" * 60)

    # Step 1: Clone
    print("\n[1/6] Cloning repository...")
    commit = clone_repo()
    # Derive a short repo name from the clone URL, e.g. ".../echo.git" -> "echo".
    repo_name = TARGET_REPO.rstrip("/").split("/")[-1].replace(".git", "")
    print(f" Repo: {repo_name}, commit: {commit[:8]}")

    # Step 2: Discover files
    print("\n[2/6] Discovering Go files...")
    go_files = discover_go_files()
    print(f" Found {len(go_files)} Go files (excluding tests and vendor)")

    # Step 3: Parse AST
    print("\n[3/6] Parsing AST (tree-sitter)...")
    repo_module = get_repo_module(REPO_DIR)
    print(f" Module: {repo_module}")
    parsed, all_imports = _parse_files(go_files, repo_module)
    total_imports = sum(len(v) for v in all_imports.values())
    first_party = sum(1 for v in all_imports.values() for i in v if i.startswith(repo_module))
    print(f" Parsed {len(parsed)} files, {total_imports} filtered imports ({first_party} first-party)")

    # Step 4: Generate file docs
    print("\n[4/6] Generating file documentation (Ollama)...")
    file_items = list(go_files.items())
    file_docs = generate_docs_batch(file_items, generate_file_doc)
    # generate_docs_batch returns docs in input order; pair them back up.
    file_doc_map = {path: doc for (path, _), doc in zip(file_items, file_docs)}

    # Step 5: Load into SQLite
    print("\n[5/6] Loading into database...")
    db = GraphDB()
    try:
        for rel_path in go_files:
            info = parsed[rel_path]
            db.create_file(
                path=rel_path,
                repo=repo_name,
                language="go",
                documentation=file_doc_map.get(rel_path, ""),
                functions=info.functions,
                commit=commit[:8],
            )
        edges_created = _create_import_edges(db, all_imports, go_files, repo_module)
        print(f" Created {len(go_files)} file nodes, {edges_created} import edges")

        # Step 6: Generate repo-level doc
        print("\n[6/6] Generating repo-level documentation...")
        readme_path = Path(REPO_DIR) / "README.md"
        readme = readme_path.read_text(errors="replace") if readme_path.exists() else ""
        repo_doc = generate_repo_doc(readme, _collect_entry_files(go_files))
        db.create_repo(name=repo_name, url=TARGET_REPO, language="go", documentation=repo_doc)

        stats = db.get_stats()
        elapsed = time.time() - start
        print("\n" + "=" * 60)
        print("Ingestion complete!")
        print(f" Files: {stats['files']}")
        print(f" Relationships: {stats['relationships']}")
        print(f" Time: {elapsed:.1f}s ({elapsed/60:.1f}m)")
        print(f" Database: {os.path.abspath(db.conn.execute('PRAGMA database_list').fetchone()[2])}")
        print("=" * 60)
    finally:
        # Close the connection even when a step above fails (original leaked it).
        db.close()
# Script entry point: run the full ingestion pipeline when executed directly.
if __name__ == "__main__":
    run()