// Source: dev-intel-v2/extract.js (JavaScript, 807 lines, 28 KiB)

const fs = require('fs');
const path = require('path');
const Parser = require('tree-sitter');
const jsYaml = require('js-yaml');
// --- Language Grammars (tree-sitter for code only) ---
// Compiled tree-sitter grammars keyed by the language ids that EXT_MAP produces.
// The TypeScript package is the only one read through named properties
// (`typescript` and `tsx`); every other package's export is used directly.
// There is deliberately no entry for yaml/hcl: extract() hands those files to
// extract-config.js before a grammar is ever looked up.
const typescriptGrammars = require('tree-sitter-typescript');
const GRAMMARS = {
  typescript: typescriptGrammars.typescript,
  tsx: typescriptGrammars.tsx,
  javascript: require('tree-sitter-javascript'),
  python: require('tree-sitter-python'),
  java: require('tree-sitter-java'),
  go: require('tree-sitter-go'),
  bash: require('tree-sitter-bash'),
};
const { extractYaml, extractHcl } = require('./extract-config.js');
// File extension (including the leading dot) -> language id.
// Declared as language/extension groups and flattened into a lookup table.
const EXT_MAP = Object.fromEntries(
  [
    ['typescript', ['.ts']],
    ['tsx', ['.tsx']],
    ['javascript', ['.js', '.jsx']],
    ['python', ['.py']],
    ['java', ['.java']],
    ['go', ['.go']],
    ['bash', ['.sh', '.bash']],
    ['yaml', ['.yaml', '.yml']],
    ['hcl', ['.tf', '.hcl']],
    // KCL has no tree-sitter grammar; parse as YAML (structural approximation)
    ['yaml', ['.kcl']],
  ].flatMap(([language, extensions]) => extensions.map((extension) => [extension, language])),
);
// --- Language Adapters ---
// Each adapter defines node types for that language's AST
// Returns a fresh adapter in which every syntax hook is switched off (empty
// node-type lists, null field names). Each language below overrides only the
// hooks it actually has, so an omitted key means "not applicable".
function blankAdapter(overrides = {}) {
  return {
    classNodes: [],
    functionNodes: [],
    arrowFuncParent: null,
    methodNodes: [],
    fieldNodes: [],
    importNodes: [],
    requireFunc: null,
    exportWrapper: null,
    varDecl: [],
    callExpr: null,
    funcField: null,
    nameField: null,
    bodyField: null,
    sourceField: null,
    valueField: null,
    arrowTypes: [],
    accessModifier: null,
    heritage: null,
    implementsClause: null,
    ...overrides,
  };
}

// Per-language table of AST node-type and field names consulted by extract().
const ADAPTERS = {
  typescript: blankAdapter({
    classNodes: ['class_declaration'],
    functionNodes: ['function_declaration'],
    arrowFuncParent: 'lexical_declaration',
    methodNodes: ['method_definition'],
    fieldNodes: ['public_field_definition'],
    importNodes: ['import_statement'],
    requireFunc: 'require',
    exportWrapper: 'export_statement',
    varDecl: ['lexical_declaration', 'variable_declaration'],
    callExpr: 'call_expression',
    funcField: 'function',
    nameField: 'name',
    bodyField: 'body',
    sourceField: 'source',
    valueField: 'value',
    arrowTypes: ['arrow_function', 'function'],
    accessModifier: 'accessibility_modifier',
    heritage: 'class_heritage',
    implementsClause: 'implements_clause',
  }),
  python: blankAdapter({
    classNodes: ['class_definition'],
    functionNodes: ['function_definition'],
    // methodNodes stays empty: methods are function_definition inside class
    importNodes: ['import_statement', 'import_from_statement'],
    varDecl: ['assignment', 'augmented_assignment'],
    callExpr: 'call',
    funcField: 'function',
    nameField: 'name',
    bodyField: 'body',
    valueField: 'right',
    arrowTypes: ['lambda'],
  }),
  java: blankAdapter({
    classNodes: ['class_declaration', 'interface_declaration', 'enum_declaration'],
    functionNodes: ['method_declaration', 'constructor_declaration'],
    methodNodes: ['method_declaration', 'constructor_declaration'],
    fieldNodes: ['field_declaration'],
    importNodes: ['import_declaration'],
    varDecl: ['local_variable_declaration', 'field_declaration'],
    callExpr: 'method_invocation',
    funcField: 'name',
    nameField: 'name',
    bodyField: 'body',
    arrowTypes: ['lambda_expression'],
    accessModifier: 'modifiers',
    heritage: 'superclass',
    implementsClause: 'super_interfaces',
  }),
  go: blankAdapter({
    classNodes: ['type_declaration'], // struct types
    functionNodes: ['function_declaration', 'method_declaration'],
    methodNodes: ['method_declaration'],
    importNodes: ['import_declaration'],
    varDecl: ['var_declaration', 'short_var_declaration', 'const_declaration'],
    callExpr: 'call_expression',
    funcField: 'function',
    nameField: 'name',
    bodyField: 'body',
    sourceField: 'path',
    arrowTypes: ['func_literal'],
  }),
  yaml: blankAdapter(),
  hcl: blankAdapter({
    callExpr: 'function_call',
    bodyField: 'body',
  }),
};
// Alias adapters
// tsx and javascript share the TypeScript adapter object itself (same
// reference, not a copy). Bash only has functions, variable assignments and
// commands, so every class/import/export hook is left off.
Object.assign(ADAPTERS, {
  tsx: ADAPTERS.typescript,
  javascript: ADAPTERS.typescript,
  bash: {
    classNodes: [],
    functionNodes: ['function_definition'],
    arrowFuncParent: null,
    methodNodes: [],
    fieldNodes: [],
    importNodes: [],
    requireFunc: null,
    exportWrapper: null,
    varDecl: ['variable_assignment'],
    callExpr: 'command',
    funcField: 'name',
    nameField: 'name',
    bodyField: 'body',
    sourceField: null,
    valueField: null,
    arrowTypes: [],
    accessModifier: null,
    heritage: null,
    implementsClause: null,
  },
});
// --- Core Extractor ---
/**
 * Parses one file and returns the entities and relationships found in it.
 *
 * YAML and HCL files are delegated to extractYaml / extractHcl; every other
 * supported language is parsed with tree-sitter and walked by the nested
 * helpers defined below.
 *
 * NOTE: an unsupported extension, or a language without a grammar/adapter,
 * terminates the whole process via process.exit(1) rather than throwing.
 *
 * @param {string} filePath - Path of the file to read and parse.
 * @param {string} repoRoot - Directory that module ids are made relative to.
 * @returns {object} For tree-sitter languages: { file, language, entities,
 *   relationships }, plus an `error` message when reading or parsing failed
 *   (entities and relationships are then empty). For yaml/hcl: whatever
 *   extractYaml / extractHcl return.
 */
function extract(filePath, repoRoot) {
// The language is chosen purely from the file extension.
const ext = path.extname(filePath);
const lang = EXT_MAP[ext];
if (!lang) {
console.error(`Unsupported extension: ${ext}`);
process.exit(1);
}
// Config formats never reach the tree-sitter path below.
if (lang === 'yaml') return extractYaml(filePath, repoRoot);
if (lang === 'hcl') return extractHcl(filePath, repoRoot);
const grammar = GRAMMARS[lang];
const adapter = ADAPTERS[lang];
if (!grammar || !adapter) {
console.error(`No grammar/adapter for: ${lang}`);
process.exit(1);
}
const parser = new Parser();
parser.setLanguage(grammar);
// Read and parse failures are logged and reported through the `error` field
// of an otherwise empty result instead of being thrown.
let sourceCode;
try {
sourceCode = fs.readFileSync(filePath, 'utf8');
} catch (err) {
console.error(`Failed to read ${filePath}: ${err.message}`);
return { file: filePath, language: lang, entities: [], relationships: [], error: err.message };
}
let tree;
try {
tree = parser.parse(sourceCode);
} catch (err) {
console.error(`Failed to parse ${filePath}: ${err.message}`);
return { file: filePath, language: lang, entities: [], relationships: [], error: err.message };
}
// The repo-relative path doubles as the id of the module entity.
const relPath = path.relative(repoRoot, filePath);
const moduleId = relPath;
// Accumulators filled by the nested helpers (via addEntity / addRel).
const entities = [];
const relationships = [];
// Returns the slice of the parsed source text that `node` spans.
function getText(node) {
  const { startIndex, endIndex } = node;
  return sourceCode.substring(startIndex, endIndex);
}
// Converts a node's zero-based start/end rows into a 1-based
// [firstLine, lastLine] pair.
function lineRange(node) {
  const firstLine = node.startPosition.row + 1;
  const lastLine = node.endPosition.row + 1;
  return [firstLine, lastLine];
}
// Decides whether a declaration is visible outside its module.
//
// - Languages with an export wrapper (JS/TS): true when the node sits directly
//   under an export statement, or when its name is assigned through
//   `module.exports.<name>`, `exports.<name>`, or appears as an identifier on
//   the right-hand side of a top-level `module.exports = ...`.
// - Go: true when the name starts with an uppercase letter.
// - Java: true when the `modifiers` child contains `public`.
// - Everything else (Python, Bash): always true.
function isExported(node) {
  if (adapter.exportWrapper) {
    // ES module export: `export function foo() {}` / `export class Foo {}`.
    if (node.parent && node.parent.type === adapter.exportWrapper) return true;

    const nameNode = node.childForFieldName('name');
    if (!nameNode) return false;
    const name = getText(nameNode);

    // CommonJS: only top-level assignment statements are considered.
    for (const statement of tree.rootNode.children) {
      if (statement.type !== 'expression_statement') continue;
      const expr = statement.children[0];
      if (!expr || expr.type !== 'assignment_expression') continue;
      const left = expr.childForFieldName('left');
      if (!left) continue;

      const leftText = getText(left);
      // module.exports.foo = ... or exports.foo = ...
      if (leftText === `module.exports.${name}` || leftText === `exports.${name}`) return true;
      if (leftText !== 'module.exports') continue;

      // module.exports = { foo, bar } or module.exports = foo
      const right = expr.childForFieldName('right');
      if (!right) continue;
      // Compare whole identifiers, not substrings: a function called `a` must
      // not count as exported just because `{ extract }` contains the letter.
      const identifiers = getText(right).split(/[^\p{L}\p{N}_$]+/u);
      if (identifiers.includes(name)) return true;
    }
    return false;
  }

  // Go: only a leading uppercase letter exports a name. A plain
  // toUpperCase() comparison would also accept `_helper`, because `_` has
  // no case.
  if (lang === 'go') {
    const nameNode = node.childForFieldName('name');
    if (nameNode) return /^\p{Lu}/u.test(getText(nameNode));
  }

  // Java: check modifiers
  if (lang === 'java') {
    const mods = node.children.find((c) => c.type === 'modifiers');
    if (mods) return getText(mods).includes('public');
    return false;
  }

  return true; // Python: everything is public
}
// Ids already present in `entities`; mirrors the Set-based dedupe in addRel so
// that inserting an entity does not rescan the whole array every time.
const _entityIds = new Set();
// Appends `e` to the result unless an entity with the same id was already
// added. The first entity registered for an id wins.
function addEntity(e) {
  if (_entityIds.has(e.id)) return;
  _entityIds.add(e.id);
  entities.push(e);
}
// Keys of the relationships recorded so far, one per (type, source, target).
const seenRelationshipKeys = new Set();
// Appends `r` to the result unless an identical type/source/target triple was
// already recorded.
function addRel(r) {
  const key = `${r.type}:${r.source}->${r.target}`;
  if (seenRelationshipKeys.has(key)) return;
  seenRelationshipKeys.add(key);
  relationships.push(r);
}
// --- Import Extraction ---
// Records the dependencies declared by `node`, if it is an import of any kind
// (ES import, CommonJS require, Python/Java/Go import, Bash source).
//
// Returns true when the node was consumed as an import, so the caller should
// not descend into it; false otherwise.
function extractImports(node) {
  // Registers one Dependency entity plus the module -> dependency IMPORTS edge.
  function recordDependency(depName, kind, originNode) {
    const depId = `dep:${depName}`;
    addEntity({ id: depId, type: 'Dependency', name: depName, kind, visibility: 'internal', line_range: lineRange(originNode) });
    addRel({ type: 'IMPORTS', source: moduleId, target: depId });
  }

  // Relative JS/TS specifiers are resolved against the importing file's
  // directory; bare package names are kept untouched.
  function resolveJsSpecifier(specifier) {
    if (!specifier.startsWith('.')) return specifier;
    return path.posix.normalize(path.posix.join(path.dirname(relPath), specifier));
  }

  if (adapter.importNodes.includes(node.type)) {
    if (lang === 'typescript' || lang === 'tsx' || lang === 'javascript') {
      const sourceNode = node.childForFieldName('source');
      if (sourceNode) {
        recordDependency(resolveJsSpecifier(getText(sourceNode).replace(/['"]/g, '')), 'import', node);
      }
      return true;
    }

    if (lang === 'python') {
      let depNames = [];
      if (node.type === 'import_statement') {
        // `import a, b as c` declares one dependency per listed module; for
        // an aliased entry only the module name (not `b as c`) is recorded.
        for (const child of node.namedChildren) {
          if (child.type === 'dotted_name') {
            depNames.push(getText(child));
          } else if (child.type === 'aliased_import') {
            depNames.push(getText(child.childForFieldName('name') || child));
          }
        }
      }
      if (depNames.length === 0) {
        // `from X import Y` (module_name), or any shape not matched above.
        const modNode = node.childForFieldName('module_name')
          || node.childForFieldName('name')
          || node.children.find((c) => c.type === 'dotted_name');
        depNames = [modNode ? getText(modNode) : 'unknown'];
      }
      for (const depName of depNames) recordDependency(depName, 'import', node);
      return true;
    }

    if (lang === 'java') {
      // import com.foo.Bar;
      const scopedId = node.children.find((c) => c.type === 'scoped_identifier');
      if (scopedId) recordDependency(getText(scopedId), 'import', node);
      return true;
    }

    if (lang === 'go') {
      // import "fmt" or import ( "fmt" "os" )
      for (const child of node.namedChildren) {
        if (child.type !== 'import_spec' && child.type !== 'import_spec_list') continue;
        const specs = child.type === 'import_spec_list' ? child.namedChildren : [child];
        for (const spec of specs) {
          const pathNode = spec.childForFieldName('path');
          if (pathNode) recordDependency(getText(pathNode).replace(/"/g, ''), 'import', spec);
        }
      }
      return true;
    }
  }

  // CommonJS require() for JS/TS. Every declarator is inspected so that
  // `const a = require('a'), b = require('b')` yields both dependencies.
  if (adapter.requireFunc && (node.type === 'lexical_declaration' || node.type === 'variable_declaration')) {
    let foundRequire = false;
    for (const child of node.children) {
      if (child.type !== 'variable_declarator') continue;
      const value = child.childForFieldName('value');
      if (!value || value.type !== 'call_expression') continue;
      const func = value.childForFieldName('function');
      if (!func || getText(func) !== adapter.requireFunc) continue;
      const args = value.childForFieldName('arguments');
      if (!args || args.namedChildCount === 0) continue;
      const arg = args.namedChildren[0];
      // Only literal specifiers can be resolved statically.
      if (arg.type !== 'string') continue;
      recordDependency(resolveJsSpecifier(getText(arg).replace(/['"]/g, '')), 'require', node);
      foundRequire = true;
    }
    if (foundRequire) return true;
  }

  // Bash: `source ./utils.sh` and its POSIX spelling `. ./utils.sh` -> IMPORTS
  if (lang === 'bash' && node.type === 'command') {
    const cmd = node.namedChildren[0];
    const cmdName = cmd ? getText(cmd) : null;
    if (cmdName === 'source' || cmdName === '.') {
      const arg = node.namedChildren[1];
      if (arg) {
        recordDependency(getText(arg), 'import', node);
        return true;
      }
    }
  }
  return false;
}
// --- Class Extraction ---
// Registers a Class entity for `node` under `parentId`, plus one IMPLEMENTS
// relationship per base class / implemented interface that can be read from
// the declaration. (Both `extends` and `implements` are recorded as
// IMPLEMENTS.)
//
// Returns the new entity id, or null when the node has no `name` field.
function extractClass(node, parentId) {
  const nameNode = node.childForFieldName('name');
  if (!nameNode) return null;
  const name = getText(nameNode);
  const id = `${parentId}:${name}`;
  const exported = isExported(node);

  let kind = 'class';
  if (lang === 'go') kind = 'struct';
  if (node.type === 'interface_declaration') kind = 'interface';
  if (node.type === 'enum_declaration') kind = 'enum';

  addEntity({ id, type: 'Class', name, kind, visibility: exported ? 'public' : 'internal', line_range: lineRange(node) });
  addRel({ type: 'CONTAINS', source: parentId, target: id });

  const addImplements = (target) => addRel({ type: 'IMPLEMENTS', source: id, target });
  // A `*type_list` wrapper holds the individual types; anything else is a
  // single type already.
  const expandTypeNodes = (n) => (/type_list$/.test(n.type) ? n.namedChildren : [n]);

  // Implements/extends
  if (adapter.heritage) {
    const heritage = node.children.filter((c) => c.type === adapter.heritage);
    for (const h of heritage) {
      for (const child of h.namedChildren) {
        if (adapter.implementsClause && child.type === adapter.implementsClause) {
          for (const impl of child.namedChildren) addImplements(getText(impl));
        } else if (child.type === 'extends_clause') {
          // TypeScript wraps the base expression (and its optional type
          // arguments) in an extends_clause; record the base only, not the
          // literal text "extends Base<T>".
          for (const base of child.namedChildren) {
            if (base.type !== 'type_arguments') addImplements(getText(base));
          }
        } else {
          addImplements(getText(child));
        }
      }
    }
  }

  // Java: superclass and super_interfaces
  if (lang === 'java') {
    const superclass = node.childForFieldName('superclass');
    if (superclass) addImplements(getText(superclass).replace(/^extends\s+/, ''));
    const superInterfaces = node.childForFieldName('interfaces');
    if (superInterfaces) {
      // `implements A, B` must yield two targets, not the single text "A, B".
      for (const entry of superInterfaces.namedChildren) {
        for (const iface of expandTypeNodes(entry)) addImplements(getText(iface));
      }
    }
  }

  // Python: bases
  if (lang === 'python') {
    const argList = node.childForFieldName('superclasses');
    if (argList) {
      for (const base of argList.namedChildren) {
        // `metaclass=Meta` and other keyword arguments are not base classes.
        if (base.type === 'keyword_argument') continue;
        addImplements(getText(base));
      }
    }
  }
  return id;
}
// --- Method Extraction ---
// Registers a Function entity of kind 'method' for `node` under `parentId`.
//
// Visibility defaults to 'public' and is narrowed by:
//  - the language's access-modifier child (private / protected), when the
//    adapter names one;
//  - a leading `#` in the name (JS/TS private members);
//  - Python underscore conventions: `_x` is protected, `__x` is private, but
//    dunder names such as `__init__` stay public because they are protocol
//    hooks, not hidden members.
//
// Returns the new entity id, or null when the node has no `name` field.
function extractMethod(node, parentId) {
  const nameNode = node.childForFieldName('name');
  if (!nameNode) return null;
  const name = getText(nameNode);
  const id = `${parentId}:${name}`;

  let visibility = 'public';
  if (adapter.accessModifier) {
    // Only the first modifier child is consulted.
    const modifier = node.children.find((c) => c.type === adapter.accessModifier);
    if (modifier) {
      const modText = getText(modifier);
      if (modText.includes('private')) visibility = 'private';
      else if (modText.includes('protected')) visibility = 'protected';
    }
  }

  if (name.startsWith('#')) visibility = 'private';

  if (lang === 'python' && name.startsWith('_')) {
    const isDunder = name.length > 4 && name.startsWith('__') && name.endsWith('__');
    if (!isDunder) visibility = name.startsWith('__') ? 'private' : 'protected';
  }

  addEntity({ id, type: 'Function', name, kind: 'method', visibility, line_range: lineRange(node) });
  addRel({ type: 'CONTAINS', source: parentId, target: id });
  return id;
}
// --- Function Extraction ---
// Registers Function entities for a function declaration, or for the
// arrow/function-valued declarators of a JS/TS variable declaration.
//
// Returns the id of the (first) function registered, or null when `node`
// declares no function.
function extractFunction(node, parentId) {
  const exported = isExported(node);

  if (adapter.functionNodes.includes(node.type)) {
    const nameNode = node.childForFieldName('name');
    if (!nameNode) return null;
    const name = getText(nameNode);
    const id = `${parentId}:${name}`;
    let visibility = exported ? 'public' : 'internal';
    // Go: visibility follows the case of the first letter alone.
    if (lang === 'go') visibility = /^\p{Lu}/u.test(name) ? 'public' : 'internal';
    addEntity({ id, type: 'Function', name, kind: 'function', visibility, line_range: lineRange(node) });
    addRel({ type: 'CONTAINS', source: parentId, target: id });
    return id;
  }

  // JS/TS arrow functions
  if (adapter.arrowFuncParent && node.type === adapter.arrowFuncParent) {
    let firstId = null;
    for (const child of node.children) {
      if (child.type !== 'variable_declarator') continue;
      const value = child.childForFieldName('value');
      if (!value || !adapter.arrowTypes.includes(value.type)) continue;
      const nameNode = child.childForFieldName('name');
      if (!nameNode) continue;
      const name = getText(nameNode);
      const id = `${parentId}:${name}`;
      // The declaration itself carries no `name` field, so a name-based export
      // (`module.exports = { foo }`) can only be detected on the declarator.
      const isPublic = exported || isExported(child);
      addEntity({ id, type: 'Function', name, kind: 'function', visibility: isPublic ? 'public' : 'internal', line_range: lineRange(node) });
      addRel({ type: 'CONTAINS', source: parentId, target: id });
      // Keep going: `const a = () => {}, b = () => {}` declares two functions.
      if (firstId === null) firstId = id;
    }
    return firstId;
  }
  return null;
}
// --- Class Field (arrow method vs property) ---
// Treats a class field as a method when its initializer is a function value
// (e.g. `handler = () => {}`); plain data properties are skipped.
// Returns the method id from extractMethod, or null when nothing was recorded.
function extractClassField(node, parentId) {
  const isFieldNode = adapter.fieldNodes.includes(node.type);
  if (!isFieldNode || !node.childForFieldName('name')) return null;

  const initializer = node.childForFieldName('value');
  const isFunctionValued = Boolean(initializer) && adapter.arrowTypes.includes(initializer.type);
  return isFunctionValued ? extractMethod(node, parentId) : null;
}
// --- Call Extraction ---
// Records a CALLS relationship from `parentId` to the callee when `node` is a
// call expression in the current language. The callee is kept as source text
// (for Java: `receiver.method` when a receiver is present). Calls to the
// adapter's require function are ignored; those are handled as imports.
function extractCalls(node, parentId) {
  if (!adapter.callExpr || node.type !== adapter.callExpr) return;

  const textOrNull = (n) => (n ? getText(n) : null);
  let callee;
  switch (lang) {
    case 'java': {
      const methodName = node.childForFieldName('name');
      const receiver = node.childForFieldName('object');
      callee = receiver ? `${getText(receiver)}.${getText(methodName)}` : textOrNull(methodName);
      break;
    }
    case 'python':
      callee = textOrNull(node.childForFieldName('function'));
      break;
    case 'bash':
      // The command name is the first named child.
      callee = textOrNull(node.namedChildren[0]);
      break;
    default:
      callee = textOrNull(node.childForFieldName(adapter.funcField));
  }

  if (!callee) return;
  if (adapter.requireFunc && callee === adapter.requireFunc) return;
  addRel({ type: 'CALLS', source: parentId, target: callee });
}
// --- YAML/HCL Config Extraction ---
// Handles config-language roots: registers a Config entity for the file plus
// one Config entity per top-level YAML key or HCL block.
//
// Returns true when the language is yaml or hcl (the caller then stops
// walking), false otherwise.
//
// NOTE(review): extract() returns through extractYaml / extractHcl before any
// tree is walked, so within this file this path looks unreachable. Confirm
// before relying on it.
function extractConfig(node) {
  if (lang === 'yaml') {
    addEntity({ id: moduleId, type: 'Config', name: relPath, kind: 'yaml-config', visibility: 'public', line_range: lineRange(node) });
    // Extract top-level keys as config entries
    if (node.type === 'stream') {
      for (const doc of node.namedChildren) {
        if (doc.type !== 'document') continue;
        const block = doc.namedChildren[0];
        if (!block || block.type !== 'block_node') continue;
        const mapping = block.namedChildren[0];
        if (!mapping || mapping.type !== 'block_mapping') continue;
        for (const pair of mapping.namedChildren) {
          if (pair.type !== 'block_mapping_pair') continue;
          const key = pair.childForFieldName('key');
          if (!key) continue;
          const keyName = getText(key);
          const keyId = `${moduleId}:${keyName}`;
          addEntity({ id: keyId, type: 'Config', name: keyName, kind: 'yaml-key', visibility: 'public', line_range: lineRange(pair) });
          addRel({ type: 'CONTAINS', source: moduleId, target: keyId });
        }
      }
    }
    return true;
  }

  if (lang === 'hcl') {
    addEntity({ id: moduleId, type: 'Config', name: relPath, kind: 'terraform', visibility: 'public', line_range: lineRange(node) });
    // Extract top-level blocks (resource, data, variable, output, module, provider)
    for (const child of node.namedChildren) {
      if (child.type !== 'block') continue;
      const blockType = child.namedChildren[0]; // e.g., "resource"
      // Labels are the identifiers/strings AFTER the block type. Including the
      // first child here would repeat the type
      // ("resource.resource.aws_s3_bucket.logs").
      const labels = child.namedChildren
        .slice(1)
        .filter((c) => c.type === 'string_lit' || c.type === 'identifier');
      const blockName = labels.map((l) => getText(l).replace(/"/g, '')).join('.');
      const fullName = blockType ? `${getText(blockType)}.${blockName}` : blockName;
      const blockId = `${moduleId}:${fullName}`;
      addEntity({ id: blockId, type: 'Config', name: fullName, kind: 'hcl-block', visibility: 'public', line_range: lineRange(child) });
      addRel({ type: 'CONTAINS', source: moduleId, target: blockId });
    }
    return true;
  }
  return false;
}
// --- Main Walker ---
// Recursive dispatcher over the syntax tree. Each node is matched against the
// current adapter in a fixed order (root, export wrapper, import, class,
// Go method, method, class field, Python function, function) and the first
// match decides what is recorded and whether the walk descends further.
// Nodes that match nothing are scanned for calls and then recursed into.
//
// `parentId` is the id of the enclosing entity; root nodes ignore it and use
// the module id instead.
function walk(node, parentId) {
  const ROOT_NODE_TYPES = ['program', 'source_file', 'stream', 'compilation_unit', 'module'];
  if (ROOT_NODE_TYPES.includes(node.type)) {
    // Config files (YAML/HCL)
    if (extractConfig(node)) return;
    // Code files
    addEntity({ id: moduleId, type: 'Module', name: relPath, kind: 'module', visibility: 'public', line_range: lineRange(node) });
    for (const child of node.children) {
      walk(child, moduleId);
    }
    return;
  }

  // Export wrapper (JS/TS): look through it at the wrapped declaration.
  if (adapter.exportWrapper && node.type === adapter.exportWrapper) {
    for (const child of node.children) {
      if (child.type !== 'export' && child.type !== 'default') {
        walk(child, parentId);
      }
    }
    return;
  }

  // Imports
  if (extractImports(node)) return;

  // Classes
  if (adapter.classNodes.includes(node.type)) {
    if (lang === 'go') {
      // A Go type_declaration has no name of its own: the name and the
      // underlying type live on its type_spec children, and a grouped
      // `type ( ... )` holds several of them.
      for (const spec of node.namedChildren) {
        if (spec.type !== 'type_spec') continue;
        const classId = extractClass(spec, parentId);
        if (!classId) continue;
        const structBody = spec.childForFieldName('type');
        if (structBody) {
          for (const field of structBody.namedChildren) {
            walk(field, classId);
          }
        }
      }
      return;
    }
    const classId = extractClass(node, parentId);
    if (classId) {
      const body = node.childForFieldName('body');
      if (body) {
        for (const child of body.namedChildren || body.children) {
          walk(child, classId);
        }
      }
    }
    return;
  }

  // Go: method_declaration (receiver-based). This must run BEFORE the generic
  // method branch: Go's adapter lists method_declaration in methodNodes, so
  // the generic branch would otherwise claim the node and attach the method
  // to the module instead of its receiver type.
  if (lang === 'go' && node.type === 'method_declaration') {
    const nameNode = node.childForFieldName('name');
    const receiver = node.childForFieldName('receiver');
    if (nameNode) {
      const name = getText(nameNode);
      if (!name || name.length === 0) return;
      let receiverType = parentId;
      if (receiver) {
        for (const param of receiver.namedChildren) {
          const typeNode = param.childForFieldName('type');
          if (!typeNode) continue;
          // Strip pointer (*) and generic brackets safely
          const typeName = getText(typeNode).replace(/^\*+/, '').replace(/\[.*\]$/, '').trim();
          if (typeName.length > 0) {
            receiverType = `${moduleId}:${typeName}`;
          }
        }
      }
      const id = `${receiverType}:${name}`;
      const visibility = /^\p{Lu}/u.test(name) ? 'public' : 'internal';
      addEntity({ id, type: 'Function', name, kind: 'method', visibility, line_range: lineRange(node) });
      addRel({ type: 'CONTAINS', source: receiverType, target: id });
      const body = node.childForFieldName('body');
      if (body) walkBody(body, id);
    }
    return;
  }

  // Methods (inside class body)
  if (adapter.methodNodes.includes(node.type)) {
    const methodId = extractMethod(node, parentId);
    if (methodId) {
      const body = node.childForFieldName('body');
      if (body) walkBody(body, methodId);
    }
    return;
  }

  // Class fields (arrow methods vs properties)
  if (adapter.fieldNodes.includes(node.type)) {
    const methodId = extractClassField(node, parentId);
    if (methodId) {
      const value = node.childForFieldName('value');
      if (value) {
        const body = value.childForFieldName('body');
        if (body) walkBody(body, methodId);
      }
    }
    return;
  }

  // Python: function_definition can be top-level or method (inside class).
  // A parent id other than the module id that contains ':' is a class id.
  if (lang === 'python' && node.type === 'function_definition') {
    const insideClass = parentId && parentId.includes(':') && parentId !== moduleId;
    const ownerId = insideClass ? extractMethod(node, parentId) : extractFunction(node, parentId);
    if (ownerId) {
      const body = node.childForFieldName('body');
      if (body) walkBody(body, ownerId);
    }
    return;
  }

  // Functions (top-level)
  const isArrowParent = Boolean(adapter.arrowFuncParent) && node.type === adapter.arrowFuncParent;
  if (adapter.functionNodes.includes(node.type) || isArrowParent) {
    const funcId = extractFunction(node, parentId);
    if (funcId) {
      const body = isArrowParent
        ? node // For lexical_declaration, walk the whole thing
        : node.childForFieldName('body');
      if (body) walkBody(body, funcId);
      return;
    }
    // Module-level variable (JS/TS only): not a function, nothing to record.
    if (parentId === moduleId && isArrowParent) {
      return;
    }
  }

  // Java: package_declaration
  if (lang === 'java' && node.type === 'package_declaration') return;

  // Top-level calls
  extractCalls(node, parentId);
  for (const child of node.children) {
    walk(child, parentId);
  }
}
// Walk function/method bodies for CALLS only
// Visits `node` and all of its descendants in document (pre-)order, letting
// extractCalls attribute every call it finds to `parentId`. Nothing else is
// extracted from bodies.
function walkBody(node, parentId) {
  if (!node) return;
  const pending = [node];
  while (pending.length > 0) {
    const current = pending.pop();
    if (!current) continue;
    extractCalls(current, parentId);
    // Push in reverse so the leftmost child is popped, and visited, first.
    const kids = current.children;
    for (let i = kids.length - 1; i >= 0; i -= 1) {
      pending.push(kids[i]);
    }
  }
}
// Start at the root without a parent id: walk() substitutes the module id for
// the root node types it recognises.
walk(tree.rootNode);
// Successful result: same shape as the failure results above, minus `error`.
return { file: filePath, language: lang, entities, relationships };
}
// --- CLI ---
// Command-line entry point: `node extract.js <file> [repo-root]`.
// Runs only when this file is executed directly and prints the extraction
// result as pretty-printed JSON on stdout.
if (require.main === module) {
  const [, , filePath] = process.argv;
  // An omitted (or empty) repo root falls back to /app/src.
  const repoRoot = process.argv[3] || '/app/src';
  if (!filePath) {
    console.error("Usage: node extract.js <file> [repo-root]");
    process.exit(1);
  }
  console.log(JSON.stringify(extract(filePath, repoRoot), null, 2));
}
module.exports = { extract };