diff --git a/CHANGELOG.md b/CHANGELOG.md index 143f8e92f..c003d1dde 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,9 @@ and adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +### New Features + +- CodeGraph now indexes **CFML** (`.cfc`, `.cfm`, `.cfs`) — both the classic tag-based style (`<cfcomponent>`/`<cffunction>`) and modern bare-script `component { ... }` syntax, including `extends`/`implements`, embedded `<cfscript>` blocks (at any nesting depth, including inside control-flow tags), call edges, and calls embedded in `#hash#` expressions inside `<cfquery>` SQL bodies. ## [1.1.3] - 2026-06-29 diff --git a/README.md b/README.md index 7b6c33932..cf1f3b24d 100644 --- a/README.md +++ b/README.md @@ -244,7 +244,7 @@ The reliable, universal payoff is **surgical context and speed**: CodeGraph coll | **Full-Text Search** | Find code by name instantly across your entire codebase, powered by FTS5 | | **Impact Analysis** | Trace callers, callees, and the full impact radius of any symbol before making changes | | **Always Fresh** | File watcher uses native OS events (FSEvents/inotify/ReadDirectoryChangesW) with debounced auto-sync — the graph stays current as you code, zero config | -| **20+ Languages** | TypeScript, JavaScript, Python, Go, Rust, Java, C#, PHP, Ruby, C, C++, Objective-C, Swift, Kotlin, Scala, Dart, Lua, Luau, R, Svelte, Vue, Astro, Liquid, Pascal/Delphi | +| **20+ Languages** | TypeScript, JavaScript, Python, Go, Rust, Java, C#, PHP, Ruby, C, C++, Objective-C, Swift, Kotlin, Scala, Dart, Lua, Luau, R, CFML, Svelte, Vue, Astro, Liquid, Pascal/Delphi | | **Framework-aware Routes** | Recognizes web-framework routing files and links URL patterns to their handlers across 17 frameworks | | **Mixed iOS / React Native / Expo** | Closes cross-language flows that static parsing misses: Swift ↔ ObjC bridging, React Native legacy bridge + TurboModules + Fabric view components, native → JS event emitters, Expo Modules | | **100% Local** | No data leaves your machine. 
No API keys. No external services. SQLite database only | @@ -714,6 +714,7 @@ is written): | Lua | `.lua` | Full support (functions, methods with receivers, local variables, `require` imports, call edges) | | R | `.R` `.r` | Full support (functions in every assignment form, S4/R5/R6 classes with methods, `library`/`require` imports, `source()` file references, call edges) | | Luau | `.luau` | Full support (everything in Lua, plus `type`/`export type` aliases, typed signatures, and Roblox instance-path `require`) | +| CFML | `.cfc`, `.cfm`, `.cfs` | Full support (tag-based ``/`` and bare-script `component { ... }` styles, `extends`/`implements`, embedded `` delegation, call edges) | ## Measured cross-file coverage diff --git a/__tests__/extraction.test.ts b/__tests__/extraction.test.ts index 927ecfef1..7aa2d6340 100644 --- a/__tests__/extraction.test.ts +++ b/__tests__/extraction.test.ts @@ -7505,3 +7505,252 @@ GeomPoint <- ggproto("GeomPoint", Geom, }); }); }); + +// ============================================================================= +// CFML (ColdFusion Markup Language — .cfc/.cfm tag-based and bare-script, .cfs) +// ============================================================================= + +describe('CFML Extraction', () => { + describe('Language detection', () => { + it('should detect .cfc/.cfm as cfml and .cfs as cfscript', () => { + expect(detectLanguage('Service.cfc')).toBe('cfml'); + expect(detectLanguage('index.cfm')).toBe('cfml'); + expect(detectLanguage('Helper.cfs')).toBe('cfscript'); + }); + + it('should report cfml and cfscript as supported', () => { + expect(isLanguageSupported('cfml')).toBe(true); + expect(isLanguageSupported('cfscript')).toBe(true); + expect(getSupportedLanguages()).toContain('cfml'); + expect(getSupportedLanguages()).toContain('cfscript'); + }); + }); + + describe('Bare-script .cfc (component { ... 
})', () => { + const code = ` +component extends="BaseService" implements="IService" { + + property name="name" type="string"; + + function init(required string name) { + variables.name = arguments.name; + return this; + } + + public string function getName() { + return variables.name; + } + + private void function logSomething(required string msg) { + writeLog(text=msg); + } +} +`; + + it('should name the component from the file name (the grammar has no name field)', () => { + const result = extractFromSource('SampleService.cfc', code); + const cls = result.nodes.find((n) => n.kind === 'class'); + expect(cls).toBeDefined(); + expect(cls?.name).toBe('SampleService'); + expect(cls?.language).toBe('cfml'); + }); + + it('should extract methods with visibility and contains edges to the class', () => { + const result = extractFromSource('SampleService.cfc', code); + const cls = result.nodes.find((n) => n.kind === 'class'); + const methods = result.nodes.filter((n) => n.kind === 'method'); + expect(methods.map((m) => m.name)).toEqual( + expect.arrayContaining(['init', 'getName', 'logSomething']) + ); + const logSomething = methods.find((m) => m.name === 'logSomething'); + expect(logSomething?.visibility).toBe('private'); + const containsLog = result.edges.find( + (e) => e.source === cls?.id && e.target === logSomething?.id && e.kind === 'contains' + ); + expect(containsLog).toBeDefined(); + }); + + it('should extract extends/implements as unresolved references from the class', () => { + const result = extractFromSource('SampleService.cfc', code); + const cls = result.nodes.find((n) => n.kind === 'class'); + const extendsRef = result.unresolvedReferences.find((r) => r.referenceKind === 'extends'); + expect(extendsRef?.referenceName).toBe('BaseService'); + expect(extendsRef?.fromNodeId).toBe(cls?.id); + const implRef = result.unresolvedReferences.find((r) => r.referenceKind === 'implements'); + expect(implRef?.referenceName).toBe('IService'); + 
expect(implRef?.fromNodeId).toBe(cls?.id); + }); + }); + + describe('Standalone .cfs (pure CFScript)', () => { + it('should also name an anonymous component from the file name', () => { + const code = ` +component { + function ping() { + return "pong"; + } +} +`; + const result = extractFromSource('Sample.cfs', code); + const cls = result.nodes.find((n) => n.kind === 'class'); + expect(cls).toBeDefined(); + expect(cls?.name).toBe('Sample'); + expect(cls?.language).toBe('cfscript'); + }); + + it('should extract top-level imports with no enclosing component', () => { + const code = ` +import com.foo.Bar; +import foo.cfm; +`; + const result = extractFromSource('Includes.cfs', code); + const imports = result.nodes.filter((n) => n.kind === 'import').map((n) => n.name); + expect(imports).toContain('com.foo.Bar'); + expect(imports).toContain('foo.cfm'); + }); + }); + + describe('Tag-based .cfc (/)', () => { + const code = ` +\t +\t\t +\t +\t +\t\t +\t\t\tvar x = helper(); +\t\t\tanotherCall(x); +\t\t +\t + +`; + + it('should extract the component name from the cfcomponent tag attribute', () => { + const result = extractFromSource('TagStyle.cfc', code); + const cls = result.nodes.find((n) => n.kind === 'class'); + expect(cls?.name).toBe('TagStyle'); + expect(cls?.language).toBe('cfml'); + }); + + it('should extract cffunction tags as methods with access-derived visibility', () => { + const result = extractFromSource('TagStyle.cfc', code); + const methods = result.nodes.filter((n) => n.kind === 'method'); + expect(methods.map((m) => m.name)).toEqual(expect.arrayContaining(['getName', 'doWork'])); + const getName = methods.find((m) => m.name === 'getName'); + expect(getName?.visibility).toBe('public'); + expect(getName?.returnType).toBe('string'); + const doWork = methods.find((m) => m.name === 'doWork'); + expect(doWork?.visibility).toBe('private'); + }); + + it('should not double-extract symbols from the component body (implicit-end-tag walk)', () => { + const result = 
extractFromSource('TagStyle.cfc', code); + const methods = result.nodes.filter((n) => n.kind === 'method' && n.name === 'getName'); + expect(methods).toHaveLength(1); + const doWorkMethods = result.nodes.filter((n) => n.kind === 'method' && n.name === 'doWork'); + expect(doWorkMethods).toHaveLength(1); + }); + + it('should delegate tag bodies to the cfscript grammar and attribute calls to the enclosing method', () => { + const result = extractFromSource('TagStyle.cfc', code); + const doWork = result.nodes.find((n) => n.kind === 'method' && n.name === 'doWork'); + const helperCall = result.unresolvedReferences.find( + (r) => r.referenceKind === 'calls' && r.referenceName === 'helper' + ); + expect(helperCall?.fromNodeId).toBe(doWork?.id); + }); + + it('should produce exactly one correctly-ranged file node, not a leaked snippet-scoped one', () => { + const result = extractFromSource('TagStyle.cfc', code); + const fileNodes = result.nodes.filter((n) => n.kind === 'file'); + expect(fileNodes).toHaveLength(1); + expect(fileNodes[0].startLine).toBe(1); + const cls = result.nodes.find((n) => n.kind === 'class'); + const containsClass = result.edges.find( + (e) => e.source === fileNodes[0].id && e.target === cls?.id && e.kind === 'contains' + ); + expect(containsClass).toBeDefined(); + }); + }); + + describe('Top-level cffunction with no enclosing cfcomponent (.cfm template)', () => { + it('should extract as a top-level function contained by the file', () => { + const code = ` +\t + +`; + const result = extractFromSource('helper.cfm', code); + const fn = result.nodes.find((n) => n.kind === 'function' && n.name === 'helper'); + expect(fn).toBeDefined(); + const fileNode = result.nodes.find((n) => n.kind === 'file'); + const containsFn = result.edges.find( + (e) => e.source === fileNode?.id && e.target === fn?.id && e.kind === 'contains' + ); + expect(containsFn).toBeDefined(); + }); + }); + + describe(' nested inside control-flow tags (//)', () => { + it('should delegate a 
body nested inside within a ', () => { + const code = ` + + + + helper(); + + + + +`; + const result = extractFromSource('Nested.cfc', code); + const doStuff = result.nodes.find((n) => n.kind === 'method' && n.name === 'doStuff'); + expect(doStuff).toBeDefined(); + const helperCall = result.unresolvedReferences.find( + (r) => r.referenceKind === 'calls' && r.referenceName === 'helper' + ); + expect(helperCall?.fromNodeId).toBe(doStuff?.id); + }); + + it('should delegate a body nested inside at top-level component scope', () => { + const code = ` + + + topLevelHelper(); + + + +`; + const result = extractFromSource('Nested2.cfc', code); + const cls = result.nodes.find((n) => n.kind === 'class'); + expect(cls).toBeDefined(); + const helperCall = result.unresolvedReferences.find( + (r) => r.referenceKind === 'calls' && r.referenceName === 'topLevelHelper' + ); + expect(helperCall?.fromNodeId).toBe(cls?.id); + }); + }); + + describe(' SQL bodies (cfquery grammar)', () => { + it('should extract a call expression embedded in a #hash# inside the SQL body', () => { + const code = ` + + + SELECT id, name FROM users WHERE owner = #getCurrentUser().getId()# + + + + +`; + const result = extractFromSource('Query.cfc', code); + const getUsers = result.nodes.find((n) => n.kind === 'method' && n.name === 'getUsers'); + expect(getUsers).toBeDefined(); + const getCurrentUserCall = result.unresolvedReferences.find( + (r) => r.referenceKind === 'calls' && r.referenceName === 'getCurrentUser' + ); + expect(getCurrentUserCall?.fromNodeId).toBe(getUsers?.id); + const getIdCall = result.unresolvedReferences.find( + (r) => r.referenceKind === 'calls' && r.referenceName === 'getId' + ); + expect(getIdCall?.fromNodeId).toBe(getUsers?.id); + }); + }); +}); diff --git a/src/extraction/cfml-extractor.ts b/src/extraction/cfml-extractor.ts new file mode 100644 index 000000000..4173ff169 --- /dev/null +++ b/src/extraction/cfml-extractor.ts @@ -0,0 +1,454 @@ +import type { Node as SyntaxNode } from 
/**
 * CfmlExtractor - extracts code relationships from CFML source (.cfc/.cfm/.cfs).
 *
 * tree-sitter-cfml ships two related grammars: `cfml` (tag-based: cfcomponent /
 * cffunction tags and HTML) and `cfscript` (bare-script `component { ... }`).
 * The `cfml` grammar leaves script content as an opaque blob that is only
 * re-parsed at the editor/highlighting layer, so this extractor replicates
 * that injection itself:
 *
 *  - a file whose first real token is not `<` is delegated wholesale to the
 *    cfscript grammar;
 *  - otherwise the file is walked tag-by-tag with the cfml grammar, and every
 *    cfscript / cfquery tag body is re-parsed with the matching grammar.
 */
export class CfmlExtractor {
  private filePath: string;
  private source: string;
  private language: Language;
  private nodes: Node[] = [];
  private edges: Edge[] = [];
  private unresolvedReferences: UnresolvedReference[] = [];
  private errors: ExtractionError[] = [];

  /**
   * @param filePath Path of the file being indexed; also the source of the
   *   implicit component name.
   * @param source Full text of the file.
   * @param language Detected language of the file (`'cfml'` for `.cfc`/`.cfm`,
   *   `'cfscript'` for `.cfs`). It only controls the language tag stamped onto
   *   emitted nodes/references; dialect switching happens internally.
   */
  constructor(filePath: string, source: string, language: Language = 'cfml') {
    this.filePath = filePath;
    this.source = source;
    this.language = language;
  }

  /**
   * Runs the extraction and returns everything collected.
   * Exceptions thrown while extracting are converted into a `parse_error`
   * entry in `errors` rather than propagated.
   */
  extract(): ExtractionResult {
    const startTime = Date.now();

    try {
      if (isBareScriptCfml(this.source)) {
        this.extractBareScript();
      } else {
        this.extractTagBased();
      }
    } catch (error) {
      this.errors.push({
        message: `CFML extraction error: ${error instanceof Error ? error.message : String(error)}`,
        severity: 'error',
        code: 'parse_error',
      });
    }

    return {
      nodes: this.nodes,
      edges: this.edges,
      unresolvedReferences: this.unresolvedReferences,
      errors: this.errors,
      durationMs: Date.now() - startTime,
    };
  }

  /** Modern bare-script file: delegate the whole source to the cfscript grammar. */
  private extractBareScript(): void {
    const result = new TreeSitterExtractor(this.filePath, this.source, 'cfscript').extract();

    // cfscript's `component`/`interface` node has no `name` field — a CFC's
    // name is implicit from its file name — so the generic extractor can only
    // emit a placeholder name, which is replaced here.
    const componentName = this.componentNameFromPath();
    for (const node of result.nodes) {
      node.language = this.language;
      const isTypeLike = node.kind === 'class' || node.kind === 'interface';
      if (isTypeLike && this.isPlaceholderName(node.name)) {
        node.name = componentName;
        node.qualifiedName = `${this.filePath}::${componentName}`;
      }
      this.nodes.push(node);
    }
    this.edges.push(...result.edges);
    for (const ref of result.unresolvedReferences) {
      ref.language = this.language;
      this.unresolvedReferences.push(ref);
    }
    this.errors.push(...result.errors);
  }

  /**
   * True when `name` is the stand-in the generic extractor gives to an unnamed
   * declaration.
   * NOTE(review): the exact placeholder is not visible from this file; both the
   * empty string and '<anonymous>' are accepted — confirm against
   * TreeSitterExtractor and drop whichever is not used.
   */
  private isPlaceholderName(name: string): boolean {
    return name === '' || name === '<anonymous>';
  }

  /** Legacy tag-based CFML: parse with the cfml grammar and walk the tag tree. */
  private extractTagBased(): void {
    const parser = getParser('cfml');
    if (!parser) {
      this.errors.push({
        message: 'cfml grammar not loaded',
        severity: 'error',
        code: 'unsupported_language',
      });
      return;
    }

    const tree = parser.parse(this.source);
    if (!tree) {
      this.errors.push({
        message: 'Failed to parse CFML source',
        severity: 'error',
        code: 'parse_error',
      });
      return;
    }

    const fileNode = this.createFileNode();
    this.walkProgram(tree.rootNode, fileNode.id);
  }

  /**
   * Builds and registers the `kind: 'file'` node spanning the whole source.
   * Only the tag-based path needs this: the bare-script path inherits the file
   * node produced by the delegated TreeSitterExtractor.
   */
  private createFileNode(): Node {
    const lines = this.source.split('\n');
    const fileNode: Node = {
      id: generateNodeId(this.filePath, 'file', this.filePath, 1),
      kind: 'file',
      name: this.filePath.split(/[/\\]/).pop() || this.filePath,
      qualifiedName: this.filePath,
      filePath: this.filePath,
      language: this.language,
      startLine: 1,
      endLine: lines.length,
      startColumn: 0,
      endColumn: lines[lines.length - 1]?.length || 0,
      updatedAt: Date.now(),
    };
    this.nodes.push(fileNode);
    return fileNode;
  }

  /**
   * Walks the children of `program` with a single forward cursor.
   * `extractComponent` consumes a variable run of FOLLOWING siblings as the
   * component body, so the walk resumes after whatever it last consumed instead
   * of revisiting those siblings as bogus top-level symbols.
   */
  private walkProgram(root: SyntaxNode, fileNodeId: string): void {
    let child: SyntaxNode | null = root.namedChild(0);
    while (child) {
      if (child.type === 'cf_component_open_tag') {
        child = this.extractComponent(child, fileNodeId).nextSibling;
        continue;
      }
      // Anything else at top level (including a cffunction outside any
      // cfcomponent wrapper, legal in a .cfm template) is owned by the file.
      this.dispatchBodyNode(child, undefined, fileNodeId);
      child = child.nextSibling;
    }
  }

  /**
   * Routes one body-level syntax node to the matching handler.
   *
   * @param parentClassId Enclosing class id, or `undefined` at file level;
   *   decides whether a function tag becomes a `method` or a `function`.
   * @param containerId Node that owns whatever is extracted.
   */
  private dispatchBodyNode(
    node: SyntaxNode,
    parentClassId: string | undefined,
    containerId: string | undefined
  ): void {
    switch (node.type) {
      case 'cf_function_tag':
        this.extractFunctionTag(node, parentClassId, containerId);
        break;
      case 'cf_script_tag':
        this.delegateScriptTag(node, containerId);
        break;
      case 'cf_query_tag':
        this.delegateQueryTag(node, containerId);
        break;
      default:
        this.delegateNestedTags(node, containerId);
    }
  }

  /**
   * Extracts a tag-based component as a `class` node.
   *
   * The grammar's implicit-end-tag scanner places the component body (function
   * tags, script tags, ...) as the open tag's FOLLOWING siblings in `program`,
   * not as nested children, so this walks forward to the matching
   * `cf_component_close_tag`.
   *
   * @returns The last sibling consumed (the close tag when present), so the
   *   caller can resume after it.
   */
  private extractComponent(openTag: SyntaxNode, containerId: string | undefined): SyntaxNode {
    // `||` rather than `??`: a present-but-empty name attribute must also fall
    // back to the file-derived name instead of producing a class named ''.
    const name = this.tagAttr(openTag, 'name') || this.componentNameFromPath();
    const line = openTag.startPosition.row + 1;
    const column = openTag.startPosition.column;

    const classNode: Node = {
      id: generateNodeId(this.filePath, 'class', name, line),
      kind: 'class',
      name,
      qualifiedName: `${this.filePath}::${name}`,
      filePath: this.filePath,
      language: this.language,
      startLine: line,
      endLine: line, // extended below once the end of the body is known
      startColumn: column,
      endColumn: openTag.endPosition.column,
      isExported: true,
      updatedAt: Date.now(),
    };
    this.nodes.push(classNode);
    if (containerId) {
      this.edges.push({ source: containerId, target: classNode.id, kind: 'contains' });
    }

    const extendsName = this.tagAttr(openTag, 'extends');
    if (extendsName) {
      this.unresolvedReferences.push({
        fromNodeId: classNode.id,
        referenceName: extendsName,
        referenceKind: 'extends',
        filePath: this.filePath,
        line,
        column,
        language: this.language,
      });
    }

    // `implements` is a comma-separated list; emit one reference per interface.
    const implementsAttr = this.tagAttr(openTag, 'implements');
    if (implementsAttr) {
      for (const iface of implementsAttr.split(',').map((s) => s.trim()).filter(Boolean)) {
        this.unresolvedReferences.push({
          fromNodeId: classNode.id,
          referenceName: iface,
          referenceKind: 'implements',
          filePath: this.filePath,
          line,
          column,
          language: this.language,
        });
      }
    }

    // Consume siblings up to and including the close tag (or to the end of the
    // program when the close tag is missing).
    let lastNode: SyntaxNode = openTag;
    for (let sibling = openTag.nextSibling; sibling; sibling = sibling.nextSibling) {
      lastNode = sibling;
      if (sibling.type === 'cf_component_close_tag') break;
      this.dispatchBodyNode(sibling, classNode.id, classNode.id);
    }

    // Keep the range self-consistent: the end column must belong to the same
    // node the end line was taken from.
    classNode.endLine = lastNode.endPosition.row + 1;
    classNode.endColumn = lastNode.endPosition.column;
    return lastNode;
  }

  /**
   * Extracts one function tag.
   *
   * `parentClassId` decides `method` vs top-level `function`; `containerId` is
   * the source of the `contains` edge (the class when inside one, otherwise the
   * file node) — kept separate so a top-level function still gets a containment
   * edge without being classified as a method of the file.
   * Tags without a `name` attribute are skipped.
   */
  private extractFunctionTag(
    tag: SyntaxNode,
    parentClassId: string | undefined,
    containerId: string | undefined
  ): void {
    const name = this.tagAttr(tag, 'name');
    if (!name) return;

    const kind = parentClassId ? 'method' : 'function';
    const fnNode: Node = {
      id: generateNodeId(this.filePath, kind, name, tag.startPosition.row + 1),
      kind,
      name,
      qualifiedName: `${this.filePath}::${name}`,
      filePath: this.filePath,
      language: this.language,
      startLine: tag.startPosition.row + 1,
      endLine: tag.endPosition.row + 1,
      startColumn: tag.startPosition.column,
      endColumn: tag.endPosition.column,
      visibility: this.visibilityFromAccess(this.tagAttr(tag, 'access')),
      returnType: this.tagAttr(tag, 'returntype'),
      updatedAt: Date.now(),
    };
    this.nodes.push(fnNode);

    if (containerId) {
      this.edges.push({ source: containerId, target: fnNode.id, kind: 'contains' });
    }

    // Script/query bodies inside the function, at any depth, belong to it.
    this.delegateNestedTags(tag, fnNode.id);
  }

  /**
   * Maps a function tag's `access` attribute to a visibility.
   * CFML attribute values are case-insensitive, so the value is normalised
   * before comparison (`access="Private"` is private, not public).
   * `package` maps to `internal`; any other non-empty value (`public`,
   * `remote`, ...) maps to `public`; a missing/empty attribute yields
   * `undefined`.
   */
  private visibilityFromAccess(access: string | undefined): 'public' | 'private' | 'internal' | undefined {
    const normalized = access?.trim().toLowerCase();
    if (!normalized) return undefined;
    if (normalized === 'private') return 'private';
    if (normalized === 'package') return 'internal';
    return 'public';
  }

  /**
   * Recursively delegates every script/query tag found in `node`'s subtree —
   * e.g. one nested inside control-flow tags, which (unlike a component's body)
   * ARE ordinary children, just possibly several levels deep. Does not descend
   * into a nested `cf_function_tag`: that is a separate scope.
   */
  private delegateNestedTags(node: SyntaxNode, containerId: string | undefined): void {
    for (let i = 0; i < node.namedChildCount; i++) {
      const child = node.namedChild(i);
      if (!child || child.type === 'cf_function_tag') continue;
      if (child.type === 'cf_script_tag') {
        this.delegateScriptTag(child, containerId);
      } else if (child.type === 'cf_query_tag') {
        this.delegateQueryTag(child, containerId);
      } else {
        this.delegateNestedTags(child, containerId);
      }
    }
  }

  /**
   * Re-parses a script tag's body with the cfscript grammar and merges the
   * result, rebasing snippet-relative positions onto the real file.
   *
   * NOTE(review): node ids come from the inner extractor and were computed from
   * snippet-relative lines; whether two script blocks in one file can produce
   * colliding ids depends on generateNodeId — confirm.
   */
  private delegateScriptTag(scriptTag: SyntaxNode, parentId: string | undefined): void {
    const content = scriptTag.namedChildren.find((c: SyntaxNode) => c.type === 'cf_script_content');
    if (!content) return;

    const inner = this.source.substring(content.startIndex, content.endIndex);
    const lineOffset = content.startPosition.row;
    const columnOffset = content.startPosition.column;

    const result = new TreeSitterExtractor(this.filePath, inner, 'cfscript').extract();

    // The inner extractor synthesizes its own snippet-scoped file node; it is
    // dropped (with every edge touching it) because this file already owns one
    // correctly ranged file node.
    const innerFileNodeId = result.nodes.find((n) => n.kind === 'file')?.id;

    // Symbols already contained by another snippet symbol keep that container;
    // only snippet-top-level symbols are re-parented to `parentId`, so no node
    // ends up with two containers.
    const nestedIds = new Set<string>();
    for (const edge of result.edges) {
      if (edge.kind === 'contains' && edge.source !== innerFileNodeId) nestedIds.add(edge.target);
    }

    for (const node of result.nodes) {
      if (node.kind === 'file') continue;
      // Columns are snippet-relative; only positions on the snippet's first
      // line are shifted by where the content starts within its file line.
      if (node.startLine === 1) node.startColumn += columnOffset;
      if (node.endLine === 1) node.endColumn += columnOffset;
      node.startLine += lineOffset;
      node.endLine += lineOffset;
      node.language = this.language;
      this.nodes.push(node);
      if (parentId && !nestedIds.has(node.id)) {
        this.edges.push({ source: parentId, target: node.id, kind: 'contains' });
      }
    }

    for (const edge of result.edges) {
      if (edge.source === innerFileNodeId || edge.target === innerFileNodeId) continue;
      if (edge.line) edge.line += lineOffset;
      this.edges.push(edge);
    }

    this.absorbDelegatedRefsAndErrors(result, innerFileNodeId, parentId, lineOffset, columnOffset);
  }

  /**
   * Re-parses a query tag's SQL body with the `cfquery` grammar.
   * `#hash#` expressions inside the SQL are real CFML calls, so without this
   * they would be dropped as opaque SQL text. Only references and errors are
   * merged — unlike `delegateScriptTag`, no nodes or edges are taken over.
   */
  private delegateQueryTag(queryTag: SyntaxNode, parentId: string | undefined): void {
    const content = queryTag.namedChildren.find((c: SyntaxNode) => c.type === 'cf_query_content');
    if (!content) return;

    const sql = this.source.substring(content.startIndex, content.endIndex);
    const result = new TreeSitterExtractor(this.filePath, sql, 'cfquery').extract();
    const innerFileNodeId = result.nodes.find((n) => n.kind === 'file')?.id;

    this.absorbDelegatedRefsAndErrors(
      result,
      innerFileNodeId,
      parentId,
      content.startPosition.row,
      content.startPosition.column
    );
  }

  /**
   * Merges references and errors of a delegated snippet extraction.
   * Positions are rebased to file coordinates, and references attributed to
   * the discarded snippet file node (or to nothing) are redirected to
   * `parentId` when one is given.
   */
  private absorbDelegatedRefsAndErrors(
    result: ExtractionResult,
    innerFileNodeId: string | undefined,
    parentId: string | undefined,
    lineOffset: number,
    columnOffset: number
  ): void {
    for (const ref of result.unresolvedReferences) {
      if (ref.line === 1) ref.column += columnOffset;
      ref.line += lineOffset;
      ref.filePath = this.filePath;
      ref.language = this.language;
      const unowned = !ref.fromNodeId || ref.fromNodeId === innerFileNodeId;
      if (unowned && parentId) ref.fromNodeId = parentId;
      this.unresolvedReferences.push(ref);
    }
    for (const error of result.errors) {
      if (error.line) error.line += lineOffset;
      this.errors.push(error);
    }
  }

  /**
   * Reads an attribute value from a tag node.
   * Attributes are looked up among the tag's direct `cf_attribute` children and
   * those wrapped in a `cf_tag_attributes` child; names match
   * case-insensitively.
   *
   * @returns The quoted value's text, `''` when the attribute is present but
   *   has no quoted value, or `undefined` when the attribute is absent.
   */
  private tagAttr(tag: SyntaxNode, attrName: string): string | undefined {
    const attrs: SyntaxNode[] = [];
    for (let i = 0; i < tag.namedChildCount; i++) {
      const child = tag.namedChild(i);
      if (!child) continue;
      if (child.type === 'cf_attribute') {
        attrs.push(child);
      } else if (child.type === 'cf_tag_attributes') {
        for (let j = 0; j < child.namedChildCount; j++) {
          const inner = child.namedChild(j);
          if (inner?.type === 'cf_attribute') attrs.push(inner);
        }
      }
    }

    const wanted = attrName.toLowerCase();
    for (const attr of attrs) {
      const nameNode = attr.namedChildren.find((c: SyntaxNode) => c.type === 'cf_attribute_name');
      if (!nameNode) continue;
      const attrText = this.source.substring(nameNode.startIndex, nameNode.endIndex);
      if (attrText.toLowerCase() !== wanted) continue;

      const valueWrapper = attr.namedChildren.find((c: SyntaxNode) => c.type === 'quoted_cf_attribute_value');
      const valueNode = valueWrapper?.namedChildren.find((c: SyntaxNode) => c.type === 'attribute_value');
      return valueNode ? this.source.substring(valueNode.startIndex, valueNode.endIndex) : '';
    }
    return undefined;
  }

  /** Component name implied by the file name: base name minus a CFML extension. */
  private componentNameFromPath(): string {
    const fileName = this.filePath.split(/[/\\]/).pop() || this.filePath;
    return fileName.replace(/\.(cfc|cfm|cfs)$/i, '');
  }
}
/**
 * Sniffs whether CFML source is bare-script (`component { ... }`, modern style)
 * or tag-based (CFML tags / HTML).
 *
 * Leading whitespace, a byte-order mark, `//` line comments and block comments
 * are skipped to find the first real token: tag-based files start with `<`,
 * script-based files do not. The BOM matters because a file decoded as UTF-8
 * keeps U+FEFF as its first character; treating it as the first token would
 * misroute a tag-based file to the cfscript grammar.
 *
 * @param source Full text of the file.
 * @returns `true` for bare-script source, and for empty / whitespace-only /
 *   comment-only source; `false` when the first real token is `<`.
 */
export function isBareScriptCfml(source: string): boolean {
  // `\s` already covers U+FEFF in ECMAScript; it is spelled out for clarity.
  const skippable = /[\s\uFEFF]/;
  const len = source.length;
  let i = 0;

  while (i < len) {
    const ch = source.charAt(i);
    const next = source.charAt(i + 1);

    if (skippable.test(ch)) {
      i++;
    } else if (ch === '/' && next === '/') {
      // Line comment: resume after the newline, or stop at end of input.
      const newline = source.indexOf('\n', i);
      i = newline === -1 ? len : newline + 1;
    } else if (ch === '/' && next === '*') {
      // Block comment: an unterminated one swallows the rest of the input.
      const close = source.indexOf('*/', i + 2);
      i = close === -1 ? len : close + 2;
    } else {
      return ch !== '<';
    }
  }

  // Nothing but whitespace/comments — treat as script (extraction is a no-op either way).
  return true;
}
/**
 * Extraction config for the `cfquery` grammar (SQL bodies of query tags).
 *
 * `#hash#` expressions inside the SQL text are real CFML expressions, which the
 * grammar parses structurally, so a call such as `#getCurrentUser().getId()#`
 * in a WHERE clause is a genuine call edge. The surrounding SQL keywords and
 * identifiers are not symbols CodeGraph models, which is why every
 * declaration category below is empty and only calls are mapped.
 */
export const cfqueryExtractor: LanguageExtractor = {
  // The only thing extracted from SQL bodies: calls inside #hash# expressions.
  callTypes: ['call_expression'],

  // No declarations of any kind are taken from SQL text.
  functionTypes: [],
  methodTypes: [],
  classTypes: [],
  interfaceTypes: [],
  structTypes: [],
  enumTypes: [],
  typeAliasTypes: [],
  variableTypes: [],
  importTypes: [],

  // Field names required by the shared extractor contract.
  nameField: 'name',
  bodyField: 'body',
  paramsField: 'parameters',
};
/**
 * Resolves the visibility declared by a CFML access modifier
 * (`public` / `private` / `package` / `remote`) on a function declaration.
 *
 * Scans the node's direct children for an `access_type` token. The token text
 * is compared case-insensitively because CFML keywords are case-insensitive
 * (`Private function foo()` is private).
 *
 * @param node The function declaration node to inspect.
 * @returns `'public'` for `public` and `remote`, `'private'` for `private`,
 *   `'internal'` for `package`, or `undefined` when no recognised modifier is
 *   present.
 */
function cfmlVisibility(node: SyntaxNode): 'public' | 'private' | 'protected' | 'internal' | undefined {
  for (let i = 0; i < node.childCount; i++) {
    const child = node.child(i);
    if (child?.type !== 'access_type') continue;

    switch (child.text.toLowerCase()) {
      case 'public':
      case 'remote': // remotely callable implies publicly visible
        return 'public';
      case 'private':
        return 'private';
      case 'package':
        return 'internal';
      // Unrecognised text: keep scanning the remaining children.
    }
  }
  return undefined;
}
{ moduleName, signature: importText } : null; + } + + // `import com.foo.Bar;` (dotted path) or `import "java:java.util.ArrayList";` (string form) + const sourceNode = getChildByField(node, 'source'); + if (!sourceNode) return null; + + let moduleName: string; + if (sourceNode.type === 'import_path') { + moduleName = sourceNode.namedChildren + .map((c: SyntaxNode) => getNodeText(c, source)) + .join('.'); + } else { + moduleName = getNodeText(sourceNode, source).replace(/^["']|["']$/g, ''); + } + return moduleName ? { moduleName, signature: importText } : null; + }, +}; diff --git a/src/extraction/languages/index.ts b/src/extraction/languages/index.ts index 9d4a949a5..7b0bacbd9 100644 --- a/src/extraction/languages/index.ts +++ b/src/extraction/languages/index.ts @@ -27,6 +27,8 @@ import { luaExtractor } from './lua'; import { rExtractor } from './r'; import { luauExtractor } from './luau'; import { objcExtractor } from './objc'; +import { cfscriptExtractor } from './cfscript'; +import { cfqueryExtractor } from './cfquery'; export const EXTRACTORS: Partial> = { typescript: typescriptExtractor, @@ -51,4 +53,6 @@ export const EXTRACTORS: Partial> = { r: rExtractor, luau: luauExtractor, objc: objcExtractor, + cfscript: cfscriptExtractor, + cfquery: cfqueryExtractor, }; diff --git a/src/extraction/tree-sitter.ts b/src/extraction/tree-sitter.ts index 36e43cd82..0d422d771 100644 --- a/src/extraction/tree-sitter.ts +++ b/src/extraction/tree-sitter.ts @@ -29,6 +29,7 @@ import { AstroExtractor } from './astro-extractor'; import { DfmExtractor } from './dfm-extractor'; import { VueExtractor } from './vue-extractor'; import { MyBatisExtractor } from './mybatis-extractor'; +import { CfmlExtractor } from './cfml-extractor'; import { getAllFrameworkResolvers, getApplicableFrameworks, @@ -2392,7 +2393,7 @@ export class TreeSitterExtractor { // Extract variable declarators based on language if (this.language === 'typescript' || this.language === 'javascript' || - this.language === 
'tsx' || this.language === 'jsx') { + this.language === 'tsx' || this.language === 'jsx' || this.language === 'cfscript') { // Handle lexical_declaration and variable_declaration // These contain one or more variable_declarator children for (let i = 0; i < node.namedChildCount; i++) { @@ -4724,6 +4725,34 @@ export class TreeSitterExtractor { if (child.type === 'field_declaration_list' || child.type === 'class_heritage') { this.extractInheritance(child, classId); } + + // CFML cfscript `component extends="Base" implements="IFoo,IBar" { ... }` + // (also covers `interface extends="IBase" { ... }`, which reuses the same + // component_attribute shape). Attributes are generic name=value pairs — + // (identifier label, expression value) — not a dedicated extends_clause, + // so filter by the label text. `implements` is a comma-separated list. + if (child.type === 'component_attribute' && node.type === 'component') { + const label = child.namedChildren.find((c: SyntaxNode) => c.type === 'identifier'); + const value = child.namedChildren.find((c: SyntaxNode) => c.type !== 'identifier'); + if (label && value) { + const labelText = getNodeText(label, this.source).toLowerCase(); + if (labelText === 'extends' || labelText === 'implements') { + const valueText = getNodeText(value, this.source).replace(/^["']|["']$/g, ''); + const names = labelText === 'implements' + ? valueText.split(',').map((s) => s.trim()).filter(Boolean) + : [valueText.trim()].filter(Boolean); + for (const name of names) { + this.unresolvedReferences.push({ + fromNodeId: classId, + referenceName: name, + referenceKind: labelText === 'implements' ? 'implements' : 'extends', + line: value.startPosition.row + 1, + column: value.startPosition.column, + }); + } + } + } + } } } @@ -5732,6 +5761,16 @@ export function extractFromSource( // file node so the watcher tracks it without emitting symbols. 
const extractor = new MyBatisExtractor(filePath, source); result = extractor.extract(); + } else if (detectedLanguage === 'cfml' || detectedLanguage === 'cfscript') { + // Custom extractor for CFML (.cfc/.cfm) — dialect-switches between the + // tag-based cfml grammar and the bare-script cfscript grammar. Standalone + // `.cfs` files (language 'cfscript') are always pure script (never `<`-led), + // so routing them through here too gets them the same anonymous-component + // filename fallback as a bare-script `.cfc` — without it a `.cfs` whose + // `component { ... }` declares no name (the grammar has no `name` field; + // CFML never spells one in source) stays anonymous. + const extractor = new CfmlExtractor(filePath, source, detectedLanguage); + result = extractor.extract(); } else if (isFileLevelOnlyLanguage(detectedLanguage)) { // No symbol extraction at this stage — files are tracked at the file-record // level only. Framework extractors (Drupal routing yml, Spring `@Value` diff --git a/src/extraction/wasm/tree-sitter-cfml.wasm b/src/extraction/wasm/tree-sitter-cfml.wasm new file mode 100755 index 000000000..f488a2290 Binary files /dev/null and b/src/extraction/wasm/tree-sitter-cfml.wasm differ diff --git a/src/extraction/wasm/tree-sitter-cfquery.wasm b/src/extraction/wasm/tree-sitter-cfquery.wasm new file mode 100755 index 000000000..48f8c313d Binary files /dev/null and b/src/extraction/wasm/tree-sitter-cfquery.wasm differ diff --git a/src/extraction/wasm/tree-sitter-cfscript.wasm b/src/extraction/wasm/tree-sitter-cfscript.wasm new file mode 100755 index 000000000..c50aa75a5 Binary files /dev/null and b/src/extraction/wasm/tree-sitter-cfscript.wasm differ diff --git a/src/types.ts b/src/types.ts index a3122bf9a..9d7713bc3 100644 --- a/src/types.ts +++ b/src/types.ts @@ -95,6 +95,9 @@ export const LANGUAGES = [ 'twig', 'xml', 'properties', + 'cfml', + 'cfscript', + 'cfquery', 'unknown', ] as const;