diff --git a/CHANGELOG.md b/CHANGELOG.md index abc341c09..4105ff906 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,9 @@ and adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +### Fixes + +- C++ `class Foo;` forward declarations are no longer indexed as classes, so a heavily used type is no longer buried under phantom copies of itself. A forward declaration is just a promise that a type exists — it has no body, members, or base classes — but CodeGraph still minted a full `class` node for each one. In a large C++/Unreal-Engine codebase a hot class such as `APXCharacter` is forward-declared (`class APXCharacter;`) at the top of dozens of headers, so the graph ended up with dozens of bodiless `APXCharacter` nodes competing with the single real definition; `codegraph_explore` then returned a spray of forward-declaration sites — and picked one as the blast-radius representative — while the actual definition (with its members and callers) was crowded out of the results. Bodiless class specifiers are now skipped, exactly as bodiless structs (#831) and enums already were, so only the real definition is indexed. The skip is gated to C/C++, where a bodiless class is always a forward declaration; languages in which a bodiless class is a complete definition (Kotlin `class Empty`, Scala) are unaffected. Thanks @luoyxy for the report and fix. ## [1.1.6] - 2026-06-30 diff --git a/__tests__/cpp-forward-decl.test.ts b/__tests__/cpp-forward-decl.test.ts new file mode 100644 index 000000000..1f5c35e81 --- /dev/null +++ b/__tests__/cpp-forward-decl.test.ts @@ -0,0 +1,51 @@ +/** + * C++ forward-declaration extraction. + * + * A `class Foo;` forward declaration parses as a bodiless `class_specifier`. 
+ * It is NOT a definition, so it must not mint a `class` node — otherwise every + * forward decl repeated across dozens of headers creates a phantom `class Foo` + * that competes with, and in `codegraph_explore` results MASKS, the single real + * definition (structs and enums already skip their bodiless forms). Languages + * where a bodiless class IS a definition (Kotlin `class Empty`, Scala) must be + * unaffected — the skip is gated on the C/C++ extractor's `skipBodilessClass`. + */ +import { describe, it, expect, beforeAll } from 'vitest'; +import { extractFromSource } from '../src/extraction'; +import { initGrammars, loadAllGrammars } from '../src/extraction/grammars'; + +beforeAll(async () => { + await initGrammars(); + await loadAllGrammars(); +}); + +describe('C++ forward-declaration handling', () => { + it('does NOT emit phantom class nodes for forward declarations', () => { + const res = extractFromSource('Fwd.h', `class APXCharacter;\nclass UFoo;\n`, 'cpp', []); + expect(res.nodes.filter((n) => n.kind === 'class').length).toBe(0); + }); + + it('still emits a class node for a real definition', () => { + const res = extractFromSource('Bar.h', `class Bar {\npublic:\n void doThing();\n};\n`, 'cpp', []); + expect(res.nodes.filter((n) => n.kind === 'class').map((c) => c.name)).toContain('Bar'); + }); + + it('keeps only the real definition when a fwd decl precedes it', () => { + const src = `class APXCharacter;\n\nclass APXCharacter {\npublic:\n void run() {}\n};\n`; + const res = extractFromSource('Mix.h', src, 'cpp', []); + const chars = res.nodes.filter((n) => n.kind === 'class' && n.name === 'APXCharacter'); + expect(chars.length).toBe(1); + // the surviving node is the definition — its inline member method is extracted + expect(res.nodes.filter((n) => n.kind === 'method' && n.name === 'run').length).toBe(1); + }); + + it('templated forward declaration is skipped too', () => { + const res = extractFromSource('T.h', `template <typename T> class TFoo;\n`, 'cpp', []); +
expect(res.nodes.filter((n) => n.kind === 'class').length).toBe(0); + }); + + it('Kotlin bodiless class remains a real definition (no regression)', () => { + const res = extractFromSource('K.kt', `class Empty\nclass WithBody { fun f() {} }\n`, 'kotlin', []); + const names = res.nodes.filter((n) => n.kind === 'class').map((c) => c.name); + expect(names).toEqual(expect.arrayContaining(['Empty', 'WithBody'])); + }); +}); diff --git a/src/extraction/languages/c-cpp.ts b/src/extraction/languages/c-cpp.ts index aa5f90ea9..48e282f16 100644 --- a/src/extraction/languages/c-cpp.ts +++ b/src/extraction/languages/c-cpp.ts @@ -243,6 +243,11 @@ export const cppExtractor: LanguageExtractor = { // Recover macro-annotated class/struct definitions (`class MYMODULE_API Foo : Base`) // that tree-sitter otherwise misparses into a phantom function (#1061/#946). preParse: blankCppExportMacros, + // A bodiless `class_specifier` in C++ is a forward declaration (`class Foo;`) + // or an elaborated type reference, never a definition — skip it so repeated + // forward decls across headers don't mint phantom class nodes that mask the + // real definition (matches the bodiless skip structs/enums already get). + skipBodilessClass: true, functionTypes: ['function_definition'], classTypes: ['class_specifier'], methodTypes: ['function_definition'], diff --git a/src/extraction/tree-sitter-types.ts b/src/extraction/tree-sitter-types.ts index b65ee09c9..c5dac014f 100644 --- a/src/extraction/tree-sitter-types.ts +++ b/src/extraction/tree-sitter-types.ts @@ -165,6 +165,18 @@ export interface LanguageExtractor { methodsAreTopLevel?: boolean; /** NodeKind to use for interface-like declarations (Rust: 'trait'). Default: 'interface' */ interfaceKind?: NodeKind; + /** + * When true, a class node with no body is a forward declaration / elaborated + * type reference — NOT a definition — and is skipped, mirroring the bodiless + * skip already applied to structs (#831) and enums. 
In C++ a `class Foo;` + * forward declaration parses as a bodiless `class_specifier`; repeated across + * dozens of headers it mints one phantom `class Foo` node per header that + * competes with — and in `codegraph_explore` results MASKS — the single real + * definition (the phantom, bodiless nodes crowd out the one that carries the + * members and callers). Off by default because some languages (Kotlin `class + * Empty`, Scala) treat a bodiless class as a complete definition. C/C++-only. + */ + skipBodilessClass?: boolean; // --- New hooks --- diff --git a/src/extraction/tree-sitter.ts b/src/extraction/tree-sitter.ts index 36e43cd82..f5cd449d3 100644 --- a/src/extraction/tree-sitter.ts +++ b/src/extraction/tree-sitter.ts @@ -1528,6 +1528,18 @@ export class TreeSitterExtractor { private extractClass(node: SyntaxNode, kind: NodeKind = 'class'): void { if (!this.extractor) return; + // Resolve the body once up front so a bodiless class can be skipped before a + // node is minted (mirrors extractStruct/extractEnum). In C++ a `class Foo;` + // forward declaration is a bodiless `class_specifier`; without this guard + // every such forward decl — repeated across dozens of headers — mints a + // phantom `class Foo` node that competes with and MASKS the single real + // definition in query results (the real one carries the members/callers). + // Gated on `skipBodilessClass` (C/C++ only) because Kotlin/Scala treat a + // bodiless class as a complete definition. + let body = this.extractor.resolveBody?.(node, this.extractor.bodyField) + ?? 
getChildByField(node, this.extractor.bodyField); + if (!body && this.extractor.skipBodilessClass) return; + const name = extractName(node, this.source, this.extractor); const docstring = getPrecedingDocstring(node, this.source); const visibility = this.extractor.getVisibility?.(node); @@ -1551,8 +1563,6 @@ export class TreeSitterExtractor { // Push to stack and visit body this.nodeStack.push(classNode.id); - let body = this.extractor.resolveBody?.(node, this.extractor.bodyField) - ?? getChildByField(node, this.extractor.bodyField); if (!body) body = node; // Visit all children for methods and properties