Initial POC: Developer Intelligence knowledge graph
- SQLite backend with file/repo/relationship entities
- tree-sitter Go AST parser for deterministic import detection
- Ollama doc generation with concurrent batch processing
- MCP server (FastMCP) for Claude Code integration
- Merge simulation with staleness cascade
- Lazy refresh for stale relationship and repo docs
- CLAUDE.md for agent context
This commit is contained in:
149
ingest.py
Normal file
149
ingest.py
Normal file
@@ -0,0 +1,149 @@
|
||||
"""Main ingestion script. Clone repo, parse, generate docs, load SQLite."""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import time
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from parser import parse_go_file, filter_imports, get_repo_module, resolve_import_to_file
|
||||
from docgen import generate_file_doc, generate_repo_doc, generate_docs_batch
|
||||
from db import GraphDB
|
||||
|
||||
# Repository to ingest; override with the TARGET_REPO env var.
TARGET_REPO = os.environ.get("TARGET_REPO", "https://github.com/labstack/echo.git")
# Local checkout location; defaults to ./repos/target next to this script.
REPO_DIR = os.environ.get("REPO_DIR", os.path.join(os.path.dirname(__file__), "repos", "target"))
||||
def clone_repo():
    """Ensure the target repo exists locally, then return its HEAD commit sha.

    Pulls if a working copy is already present; otherwise performs a fresh
    shallow clone of TARGET_REPO into REPO_DIR.
    """
    repo_path = Path(REPO_DIR)
    if repo_path.exists() and (repo_path / ".git").exists():
        # Existing checkout: just fast-forward it.
        print(f"Repo already cloned at {REPO_DIR}, pulling latest...")
        subprocess.run(["git", "-C", REPO_DIR, "pull"], check=True)
    else:
        # Shallow clone keeps the download small for a POC.
        print(f"Cloning {TARGET_REPO}...")
        repo_path.parent.mkdir(parents=True, exist_ok=True)
        subprocess.run(["git", "clone", "--depth=1", TARGET_REPO, REPO_DIR], check=True)

    head = subprocess.run(
        ["git", "-C", REPO_DIR, "rev-parse", "HEAD"],
        capture_output=True, text=True, check=True,
    )
    return head.stdout.strip()
|
||||
|
||||
|
||||
def discover_go_files() -> dict[str, str]:
    """Collect Go sources under REPO_DIR, keyed by POSIX-style relative path.

    Vendored dependencies and Go test files are excluded. Unreadable files
    are skipped with a warning rather than aborting the whole ingest.

    Returns:
        Mapping of relative file path (forward slashes) -> file contents.
    """
    files: dict[str, str] = {}
    repo = Path(REPO_DIR)
    for gofile in repo.rglob("*.go"):
        rel_path = gofile.relative_to(repo)
        # Normalize to forward slashes so keys are stable across platforms
        # (the original substring checks broke on Windows backslash paths).
        rel = rel_path.as_posix()
        # Exclude vendored code by exact directory component — the old
        # `"vendor/" in rel` also matched e.g. "myvendor/". Exclude test
        # files by filename suffix — the old `"_test.go" in rel` matched
        # the substring anywhere in the path.
        if "vendor" in rel_path.parts or gofile.name.endswith("_test.go"):
            continue
        try:
            files[rel] = gofile.read_text(errors="replace")
        except Exception as e:
            # Best-effort: one unreadable file should not kill ingestion.
            print(f" Skipping {rel}: {e}")
    return files
|
||||
|
||||
|
||||
def run() -> None:
    """Run the full ingestion pipeline end to end.

    Steps: clone the target repo, discover Go sources, parse them with
    tree-sitter, generate per-file docs via Ollama, load nodes/edges into
    SQLite, and finish with a repo-level doc. Progress is printed per step.
    """
    start = time.time()
    print("=" * 60)
    print("Developer Intelligence POC — Ingestion")
    print("=" * 60)

    # Step 1: Clone (or update) the repo and capture the HEAD commit.
    print("\n[1/6] Cloning repository...")
    commit = clone_repo()
    # Derive a short repo name from the clone URL, e.g. ".../echo.git" -> "echo".
    repo_name = TARGET_REPO.rstrip("/").split("/")[-1].replace(".git", "")
    print(f" Repo: {repo_name}, commit: {commit[:8]}")

    # Step 2: Discover files (tests and vendored code excluded).
    print("\n[2/6] Discovering Go files...")
    go_files = discover_go_files()
    print(f" Found {len(go_files)} Go files (excluding tests and vendor)")

    # Step 3: Parse each file's AST and collect its filtered import list.
    print("\n[3/6] Parsing AST (tree-sitter)...")
    # Module path from go.mod; used to tell first-party imports from external ones.
    repo_module = get_repo_module(REPO_DIR)
    print(f" Module: {repo_module}")

    parsed = {}       # rel_path -> parse result (functions, imports, ...)
    all_imports = {}  # rel_path -> imports kept by filter_imports
    for rel_path, content in go_files.items():
        info = parse_go_file(rel_path, content, repo_module)
        parsed[rel_path] = info
        filtered = filter_imports(info.imports, repo_module)
        all_imports[rel_path] = filtered

    total_imports = sum(len(v) for v in all_imports.values())
    # First-party = imports whose path starts with the repo's own module path.
    first_party = sum(1 for v in all_imports.values() for i in v if i.startswith(repo_module))
    print(f" Parsed {len(parsed)} files, {total_imports} filtered imports ({first_party} first-party)")

    # Step 4: Generate per-file documentation concurrently via Ollama.
    print("\n[4/6] Generating file documentation (Ollama)...")
    file_items = [(path, content) for path, content in go_files.items()]
    # generate_docs_batch preserves input order, so docs align by index.
    file_docs = generate_docs_batch(file_items, generate_file_doc)
    file_doc_map = {file_items[i][0]: file_docs[i] for i in range(len(file_items))}

    # Step 5: Load file nodes, then import edges, into SQLite.
    print("\n[5/6] Loading into database...")
    db = GraphDB()

    for rel_path, content in go_files.items():
        info = parsed[rel_path]
        doc = file_doc_map.get(rel_path, "")
        db.create_file(
            path=rel_path,
            repo=repo_name,
            language="go",
            documentation=doc,
            functions=info.functions,
            commit=commit[:8],  # short sha, matching the banner above
        )

    edges_created = 0
    for from_file, imports in all_imports.items():
        for imp in imports:
            # Only first-party imports become graph edges.
            if not imp.startswith(repo_module):
                continue
            # Map the import path to a directory within the repo.
            # NOTE(review): despite the name, this appears to return a
            # directory, not a file — confirm against parser.py.
            target_dir = resolve_import_to_file(imp, repo_module, go_files)
            if target_dir:
                # Link to every .go file living in that directory (a Go
                # import targets a package, i.e. all files in the dir).
                for fpath in go_files:
                    fdir = str(Path(fpath).parent)
                    if fdir == target_dir or fdir.endswith("/" + target_dir):
                        db.create_relationship(from_file, fpath)
                        edges_created += 1

    print(f" Created {len(go_files)} file nodes, {edges_created} import edges")

    # Step 6: Repo-level doc from the README plus a few representative files.
    print("\n[6/6] Generating repo-level documentation...")
    readme_path = Path(REPO_DIR) / "README.md"
    readme = readme_path.read_text(errors="replace") if readme_path.exists() else ""

    # Entry-point candidates are echo-specific filenames; first match per
    # candidate wins (inner break).
    entry_candidates = ["echo.go", "router.go", "context.go", "group.go", "middleware.go"]
    entry_files = []
    for candidate in entry_candidates:
        for path, content in go_files.items():
            if path.endswith(candidate):
                entry_files.append((path, content))
                break

    repo_doc = generate_repo_doc(readme, entry_files)
    db.create_repo(name=repo_name, url=TARGET_REPO, language="go", documentation=repo_doc)

    # Final summary: counts from the DB plus wall-clock time and the DB file
    # path as reported by SQLite itself (PRAGMA database_list row: seq, name, file).
    stats = db.get_stats()
    elapsed = time.time() - start
    print("\n" + "=" * 60)
    print("Ingestion complete!")
    print(f" Files: {stats['files']}")
    print(f" Relationships: {stats['relationships']}")
    print(f" Time: {elapsed:.1f}s ({elapsed/60:.1f}m)")
    print(f" Database: {os.path.abspath(db.conn.execute('PRAGMA database_list').fetchone()[2])}")
    print("=" * 60)

    db.close()


if __name__ == "__main__":
    run()
|
||||
Reference in New Issue
Block a user