/**
 * Unit tests for `resolveMaxFileSize` (file: `__tests__/resolve-max-file-size.test.ts`).
 *
 * Documented contract being exercised: files larger than 1 MB are skipped by
 * default; the threshold (in bytes) can be overridden with the
 * `CODEGRAPH_MAX_FILE_SIZE` environment variable — e.g.
 * `CODEGRAPH_MAX_FILE_SIZE=5242880` for 5 MB — and invalid or non-positive
 * values fall back to the 1 MB default.
 */
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import { resolveMaxFileSize } from '../src/extraction';

const DEFAULT = 1024 * 1024;
const ENV = 'CODEGRAPH_MAX_FILE_SIZE';

describe('resolveMaxFileSize', () => {
  // Value the variable had before each test, so it can be put back afterwards
  // instead of being unconditionally wiped from the process environment.
  let inherited: string | undefined;

  beforeEach(() => {
    inherited = process.env[ENV];
    // Every test starts from "unset", regardless of the caller's shell.
    delete process.env[ENV];
  });

  afterEach(() => {
    if (inherited === undefined) {
      delete process.env[ENV];
    } else {
      process.env[ENV] = inherited;
    }
  });

  it('falls back to the 1 MB default when the env var is unset', () => {
    expect(resolveMaxFileSize()).toBe(DEFAULT);
  });

  it('honours a valid positive override', () => {
    process.env[ENV] = String(5 * 1024 * 1024);
    expect(resolveMaxFileSize()).toBe(5 * 1024 * 1024);
  });

  it('floors fractional byte counts', () => {
    process.env[ENV] = '2097152.9';
    expect(resolveMaxFileSize()).toBe(2 * 1024 * 1024);
  });

  it.each(['', 'not-a-number', '0', '-1', 'NaN', 'Infinity'])(
    'falls back to the default for invalid value %j',
    (raw) => {
      process.env[ENV] = raw;
      expect(resolveMaxFileSize()).toBe(DEFAULT);
    },
  );
});
/**
 * Default upper bound, in bytes, on the size of a file that gets indexed.
 * Vendored blobs and similar oversized files blow the WASM heap and the
 * worker-recycle budget for no useful symbols; 1 MB covers essentially all
 * hand-written source.
 */
const DEFAULT_MAX_FILE_SIZE = 1024 * 1024;

/**
 * Resolve the max-file-size threshold (bytes), allowing an override via the
 * `CODEGRAPH_MAX_FILE_SIZE` environment variable for repos with legitimately
 * large hand-written sources.
 *
 * The raw value is converted with `Number()` and rounded down to a whole byte
 * count. The override is honoured only when that whole count is a finite
 * number of at least 1 byte; otherwise the 1 MB default is used.
 *
 * @returns A positive integer number of bytes.
 */
export function resolveMaxFileSize(): number {
  const raw = process.env.CODEGRAPH_MAX_FILE_SIZE;
  if (raw === undefined || raw === '') return DEFAULT_MAX_FILE_SIZE;

  const parsed = Number(raw);
  if (!Number.isFinite(parsed)) return DEFAULT_MAX_FILE_SIZE;

  // Floor BEFORE the positivity check: a value such as "0.5" is > 0 but floors
  // to 0, which would otherwise be returned as a zero-byte threshold.
  const bytes = Math.floor(parsed);
  return bytes >= 1 ? bytes : DEFAULT_MAX_FILE_SIZE;
}

/** Effective threshold, resolved once when this module is evaluated. */
const MAX_FILE_SIZE = resolveMaxFileSize();