diff --git a/AGENTS.md b/AGENTS.md index 7e2eac62a..080c1128d 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -998,7 +998,10 @@ mock.module("./some-module", () => ({ * **Consola chosen as CLI logger with Sentry createConsolaReporter integration**: Consola is the CLI logger with Sentry \`createConsolaReporter\` integration. Two reporters: FancyReporter (stderr) + Sentry structured logs. Level via \`SENTRY\_LOG\_LEVEL\`. \`buildCommand\` injects hidden \`--log-level\`/\`--verbose\` flags. \`withTag()\` creates independent instances; \`setLogLevel()\` propagates via registry. All user-facing output must use consola, not raw stderr. \`HandlerContext\` intentionally omits stderr. -* **DSN cache invalidation uses two-level mtime tracking (sourceMtimes + dirMtimes)**: \*\*DSN cache invalidation uses two-level mtime tracking\*\*: \`sourceMtimes\` (DSN-bearing files, catches in-place edits) + \`dirMtimes\` (every walked dir, catches new files) + root mtime fast-path + 24h TTL. Dropping either map is a correctness regression. Walker emits mtimes via \`onDirectoryVisit\` hook + \`recordMtimes\` option. \*\*Error-path invariant\*\*: \`scanDirectory\` catch block MUST return empty \`dirMtimes: {}\` — NOT partial map from \`onDirectoryVisit\` callback, which would silently bless unvisited dirs. Empty forces full rescan. \`ConfigError\` re-throws instead. \`scan.walk.dsnParity\` bench op imports \`dsnScanOptions()\` from \`dsn/scan-options.ts\` to keep \`scan/\` policy-free. +* **DSN cache invalidation uses two-level mtime tracking (sourceMtimes + dirMtimes)**: \*\*DSN cache invalidation uses two-level mtime tracking\*\*: \`sourceMtimes\` (DSN-bearing files, catches in-place edits) + \`dirMtimes\` (every walked dir, catches new files) + root mtime fast-path + 24h TTL. Dropping either map is a correctness regression. 
Walker emits mtimes via \`onDirectoryVisit\` hook + \`recordMtimes\` option; DSN scanner uses \`grepFiles({pattern: DSN\_PATTERN, recordMtimes: true, onDirectoryVisit})\` as full-scan pipeline (~20% faster than old walkFiles path). \`scanCodeForFirstDsn\` stays on direct walker loop — worker-pool init cost (~20ms) dominates for single-DSN. \*\*sourceMtimes invariant\*\*: \`processMatch\` must record mtime for EVERY file containing a host-validated DSN, not just deduplicated new ones — track via \`fileHadValidDsn\` flag independent of \`seen.has(raw)\`. \*\*Error-path invariant\*\*: \`scanDirectory\` catch MUST return empty \`dirMtimes: {}\`, NOT partial map from callback (would silently bless unvisited dirs). \`ConfigError\` re-throws instead. + + +* **Grep worker pool: binary-transferable matches + streaming dispatch in src/lib/scan/**: \*\*Grep worker pool\*\* (\`src/lib/scan/worker-pool.ts\` + \`grep-worker.js\`): lazy singleton pool (size \`min(8, max(2, availableParallelism()))\`). Matches encoded as \`Uint32Array\` quads \`\[pathIdx, lineNum, lineOffset, lineLength]\` + single \`linePool\` string, transferred via \`postMessage(msg, \[ints.buffer])\` — cut grep pipeline ~40% by dropping structuredClone. Worker source lives in real \`grep-worker.js\` (plain JS, lintable), imported via \`with { type: "text" }\` → fed to \`Blob\` + \`URL.createObjectURL\`. \`new Worker(new URL('./w.ts', import.meta.url))\` HANGS in \`bun build --compile\` binaries (\`import.meta.url\` → \`/$bunfs/root/binary\`); string paths are CWD-relative and brittle. Worker source must be self-contained (no user imports); \`require('node:fs')\` works inside. \*\*FIFO handler queue per worker\*\* — fresh \`addEventListener\` per dispatch fires on every message, resolving wrong requests and causing hangs. \*\*Dead-worker guard\*\*: \`PooledWorker.alive\` flipped on \`error\`. Disable via \`SENTRY\_SCAN\_DISABLE\_WORKERS=1\`. 
* **Input validation layer: src/lib/input-validation.ts guards CLI arg parsing**: CLI arg input validation (\`src/lib/input-validation.ts\`): Four validators — \`rejectControlChars\` (ASCII < 0x20), \`rejectPreEncoded\` (%XX), \`validateResourceId\` (rejects ?, #, %, whitespace), \`validateEndpoint\` (rejects \`..\` traversal). Applied in \`parseSlashOrgProject\`, \`parseOrgProjectArg\`, \`parseIssueArg\`, \`normalizeEndpoint\`. NOT applied in \`parseSlashSeparatedArg\` for plain IDs (may contain structural separators). Env vars and DB cache values are trusted. @@ -1007,13 +1010,13 @@ mock.module("./some-module", () => ({ * **Magic @ selectors resolve issues dynamically via sort-based list API queries**: Magic @ selectors resolve issues dynamically: \`@latest\`, \`@most\_frequent\` in \`parseIssueArg\` detected before \`validateResourceId\` (@ not in forbidden charset). \`SELECTOR\_MAP\` provides case-insensitive matching. \`resolveSelector\` maps to \`IssueSort\` values, calls \`listIssuesPaginated\` with \`perPage: 1\`, \`query: 'is:unresolved'\`. Supports org-prefixed: \`sentry/@latest\`. Unrecognized \`@\`-prefixed strings fall through. \`ParsedIssueArg\` union includes \`{ type: 'selector' }\`. -* **Sentry SDK uses @sentry/node-core/light instead of @sentry/bun to avoid OTel overhead**: Sentry SDK uses @sentry/node-core/light to avoid OTel overhead: Avoids \`@sentry/bun\` which loads full OpenTelemetry (~150ms, 24MB). \`@sentry/core\` barrel patched via \`bun patch\` to remove ~32 unused exports. Key gotcha: \`LightNodeClient\` hardcodes \`runtime: { name: 'node' }\` AFTER spreading options — fix by patching \`client.getOptions().runtime\` post-init (mutable ref). Transport falls back to Node \`http\` instead of native \`fetch\`. Upstream: getsentry/sentry-javascript#19885, #19886. 
+* **Sentry SDK uses @sentry/node-core/light instead of @sentry/bun to avoid OTel overhead**: Sentry SDK uses \`@sentry/node-core/light\` instead of \`@sentry/bun\` to avoid OpenTelemetry overhead (~150ms, 24MB). \`@sentry/core\` barrel patched via \`bun patch\` to remove ~32 unused exports. Gotcha: \`LightNodeClient\` hardcodes \`runtime: { name: 'node' }\` AFTER spreading options — fix by patching \`client.getOptions().runtime\` post-init (mutable ref). Transport uses Node \`http\` instead of native \`fetch\`. Upstream: getsentry/sentry-javascript#19885, #19886. -* **src/lib/scan/ module: policy-free file walker with IgnoreStack**: \*\*src/lib/scan/ module: policy-free walker + grep/glob engine.\*\* Files: \`walker.ts\` (DFS async gen; \`minDepth\`/\`maxDepth\` cap DIRECTORY DESCENT not file yield; \`timeBudgetMs\`, \`AbortSignal\`, \`onDirectoryVisit\`, \`recordMtimes\`), \`ignore.ts\` (IgnoreStack two-tier: \`#rootIg\`+\`#nestedByRelDir\`; fast-path ~0.27µs, slow-path ~1.93µs), \`binary.ts\` (8KB NUL sniff + ext fast-path), \`concurrent.ts\`, \`regex.ts\` (translates \`(?i)\`/\`(?im)\`/\`(?U)\` → JS flags), \`grep.ts\`/\`glob.ts\` (picomatch \`{dot:true}\`). DSN policy lives in \`src/lib/dsn/scan-options.ts\`. Gotchas: (1) \`classifyFile\` short-circuits when \`cfg.extensions\` set. (2) \`buildWalkOptions\` MUST forward \`descentHook\`+\`alwaysSkipDirs\`+\`followSymlinks\`+\`recordMtimes\`+\`clock\`. (3) NEVER \`pLimit.clearQueue()\` (hangs); use \`onResult\` returning \`{done:true}\`. (4) \`path.join\`/\`relative\`/\`extname\` ~10× slower than manual string ops in hot walker loop — use \`abs.slice(cwdPrefixLen)\` and \`lastIndexOf('.')+slice+toLowerCase\`; on Windows use \`path.sep\`. +* **src/lib/scan/ module: policy-free file walker with IgnoreStack**: \`src/lib/scan/\` — policy-free walker + grep/glob engine. 
Files: \`walker.ts\` (DFS async gen; \`minDepth\`/\`maxDepth\` cap DESCENT not yield; \`timeBudgetMs\`, \`AbortSignal\`, \`onDirectoryVisit\`, \`recordMtimes\`), \`ignore.ts\` (IgnoreStack two-tier \`#rootIg\`+\`#nestedByRelDir\`), \`binary.ts\` (8KB NUL sniff + ext fast-path), \`regex.ts\` (\`(?i)\`/\`(?im)\`/\`(?U)\` → JS flags), \`grep.ts\`/\`glob.ts\` (picomatch \`{dot:true}\`). DSN policy lives in \`src/lib/dsn/scan-options.ts\`. \`GrepOptions\` supports \`recordMtimes\` + \`onDirectoryVisit\` passthroughs — each \`GrepMatch\` gets optional \`mtime\` from walker's \`entry.mtime\`. Gotchas: (1) \`classifyFile\` short-circuits when \`cfg.extensions\` set. (2) \`buildWalkOptions\` MUST forward \`descentHook\`+\`alwaysSkipDirs\`+\`followSymlinks\`+\`recordMtimes\`+\`clock\`. (3) NEVER \`pLimit.clearQueue()\` (hangs); use \`onResult\` returning \`{done:true}\`. (4) \`path.join\`/\`relative\`/\`extname\` ~10× slower than manual string ops in hot loop — use \`abs.slice(cwdPrefixLen)\` and \`lastIndexOf('.')+slice+toLowerCase\`. (5) Scan module trusts \`opts.path\`; sandbox enforcement is caller's responsibility (e.g. init-wizard adapters must call \`safePath\` from \`init/tools/shared.ts\`). -* **Telemetry opt-out is env-var-only — no persistent preference or DO\_NOT\_TRACK**: Telemetry opt-out priority: (1) \`SENTRY\_CLI\_NO\_TELEMETRY=1\`, (2) \`DO\_NOT\_TRACK=1\`, (3) \`metadata.defaults.telemetry\`, (4) default on. DB read try/catch wrapped (runs before DB init). Schema v13 merged \`defaults\` table into \`metadata\` KV with keys \`defaults.{org,project,telemetry,url}\`; getters/setters in \`src/lib/db/defaults.ts\`. \`sentry cli defaults\` uses variadic \`\[key, value?]\`: no args → show all; 1 arg → show key; 2 args → set; \`--clear\` without args → clear all (guarded); \`--clear key\` → clear specific. Canonical keys: \`org\`/\`organization\`, \`project\`, \`telemetry\`, \`url\`. \`computeTelemetryEffective()\` returns resolved source for display. 
+* **Telemetry opt-out is env-var-only — no persistent preference or DO\_NOT\_TRACK**: Telemetry opt-out priority: (1) \`SENTRY\_CLI\_NO\_TELEMETRY=1\`, (2) \`DO\_NOT\_TRACK=1\`, (3) \`metadata.defaults.telemetry\`, (4) default on. DB read try/catch wrapped (runs before DB init). Schema v13 merged \`defaults\` table into \`metadata\` KV with keys \`defaults.{org,project,telemetry,url}\`; getters/setters in \`src/lib/db/defaults.ts\`. \`sentry cli defaults\` uses variadic \`\[key, value?]\`: no args → show all; 1 arg → show key; 2 args → set; \`--clear\` without args → clear all (guarded); \`--clear key\` → clear specific. \`computeTelemetryEffective()\` returns resolved source for display. * **Zod schema on OutputConfig enables self-documenting JSON fields in help and SKILL.md**: Zod schema on OutputConfig enables self-documenting JSON fields: List commands register \`schema?: ZodType\` on \`OutputConfig\\`. \`extractSchemaFields()\` produces \`SchemaFieldInfo\[]\` from Zod shapes. \`buildFieldsFlag()\` enriches \`--fields\` brief; \`enrichDocsWithSchema()\` appends fields to \`fullDescription\`. Schema exposed as \`\_\_jsonSchema\` on built commands — \`introspect.ts\` reads it into \`CommandInfo.jsonFields\`, \`help.ts\` and \`generate-skill.ts\` render it. For \`buildOrgListCommand\`/\`dispatchOrgScopedList\`, pass \`schema\` via \`OrgListConfig\`. @@ -1026,36 +1029,36 @@ mock.module("./some-module", () => ({ * **Sentry-derived terminal color palette tuned for dual-background contrast**: Terminal color palette tuned for dual-background contrast: 10-color chart palette derived from Sentry's categorical hues (\`static/app/utils/theme/scraps/tokens/color.tsx\`), adjusted to mid-luminance for ≥3:1 contrast on both dark and light backgrounds. Adjustments: orange #FF9838→#C06F20, green #67C800→#3D8F09, yellow #FFD00E→#9E8B18, purple #5D3EB2→#8B6AC8, indigo #50219C→#7B50D0; blurple/pink/magenta unchanged; teal #228A83 added. 
Hex preferred over ANSI 16-color for guaranteed contrast. - -* **Walker uses sync readdir + statSync — readdirSync p95=24µs is fine to block on**: Walker perf: sync readdir + statSync chosen over async. readdirSync p50=11µs, p95=24µs — below setTimeout(0) (~4ms). 10k-file walk: async serial 105-167ms → sync 43-45ms. statSync slightly faster than Bun.file().size. CLI is single-threaded; async overhead unjustified. Walker remains async-generator externally; internals sync. Workers rejected: bootstrap 50ms/worker for scan imports, structuredClone dominates many-match workloads (215k × ~100B = 20MB), cancellation hard (only \`worker.terminate()\`), single-bundle polyfill story ugly. 4-core ceiling 1.7-2.3× eaten by cloning. Walker is the bottleneck, not regex — gitignore layer adds ~135ms overhead in \`ignore\` package. - ### Gotcha * **AuthError constructor takes reason first, message second**: \`AuthError(reason: AuthErrorReason, message?: string)\` where \`AuthErrorReason\` is \`"not\_authenticated" | "expired" | "invalid"\`. Easy to accidentally swap args as \`new AuthError("Token expired", "expired")\` — the string \`"Token expired"\` gets assigned as \`reason\` (invalid enum value). Tests aren't type-checked (tsconfig excludes them), so TypeScript won't catch this. Correct: \`new AuthError("expired", "Token expired")\`. Default messages exist for each reason, so the second arg is often unnecessary. + +* **esbuild doesn't support \`with { type: "text" }\` — needs a plugin**: esbuild errors on Bun's \`import x from "./f.js" with { type: "text" }\` attribute ("Importing with a type attribute of \\"text\\" is not supported"). Our build pipeline is esbuild → bin.js → Bun compile, so esbuild sees the import first. Fix: \`script/text-import-plugin.ts\` — tiny esbuild plugin that intercepts imports with \`args.with?.type === "text"\` (via \`onResolve\`/\`onLoad\`), reads the file, and emits \`export default "<file contents>"\` (JSON-stringified). 
Shared between \`script/build.ts\` (CLI binary) and \`script/bundle.ts\` (npm library). Bun runtime (dev, \`bun test\`) handles the attribute natively, so no plugin needed there. TypeScript also doesn't model the attribute — cast \`import x from "./f.js" with {...}\` through \`as unknown as string\`. + -* **mapFilesConcurrent skips null but not empty arrays — callers must return null for no-op**: Two off-by-one gotchas in scan/formatters: (1) \`mapFilesConcurrent\` in \`src/lib/scan/concurrent.ts\` filters \`null\` but NOT empty arrays — pushes and fires \`onResult\` for each. Stream variant \`mapFilesConcurrentStream\` filters both. Callers like \`processEntry\` in \`src/lib/dsn/code-scanner.ts\` must return \`null\` (not \`\[]\`) for no-op files — on 10k-file walks where ~99% have no DSNs, \`\[]\` fires ~9900 wasted callbacks. (2) \`collectGlob\`/\`collectGrep\` must NOT forward \`maxResults\` to the underlying iterator — collector drains uncapped, breaks when \`files.length >= maxResults\`, sets \`truncated=true\` on overshoot. Forwarding makes iterator stop at exactly N without \`truncated\`. (3) \`filterFields\` in \`src/lib/formatters/json.ts\` dot-notation ambiguous for keys containing dots — property tests must use \`\[a-zA-Z0-9\_]\` charset. +* **mapFilesConcurrent skips null but not empty arrays — callers must return null for no-op**: Scan/formatter off-by-ones: (1) \`mapFilesConcurrent\` in \`src/lib/scan/concurrent.ts\` filters \`null\` but NOT empty arrays — fires \`onResult\` per empty result. Callers like \`processEntry\` in \`dsn/code-scanner.ts\` must return \`null\` (not \`\[]\`) for no-op files; on 10k-file walks ~99% yield empties. Stream variant filters both. (2) \`collectGlob\`/\`collectGrep\` must NOT forward \`maxResults\` to the underlying iterator — collector drains uncapped, sets \`truncated=true\` on overshoot. Forwarding makes iterator stop at exactly N without \`truncated\`. 
(3) \`filterFields\` in \`formatters/json.ts\` uses dot-notation — property tests must use \`\[a-zA-Z0-9\_]\` charset to avoid ambiguous keys with dots. -* **process.stdin.isTTY unreliable in Bun — use isatty(0) and backfill for clack**: \`process.stdin.isTTY\` unreliable in Bun — use \`isatty(0)\`: Bun's single-file binary can leave \`process.stdin.isTTY === undefined\` on TTY fds inherited via redirects like \`exec … \ * **Spinner stdout/stderr collision: log messages inside withProgress appear on spinner line**: Spinner stdout/stderr collision: \`withProgress\` in \`src/lib/polling.ts\` writes to stdout with \`\r\x1b\[K\` (no newline); consola writes to stderr. Any \`log.info()\` called \*\*inside\*\* the callback appears on the spinner line because stderr doesn't know stdout's carriage-return position. Fix: propagate data out via return value, call \`log.info()\` \*\*after\*\* \`withProgress\` completes (finally block clears the line). Affected \`downloadBinaryToTemp\` in \`upgrade.ts\`. -* **Stricli rejects unknown flags — pre-parsed global flags must be consumed from argv**: Stricli flag parsing traps: (1) Unknown \`--flag\`s rejected — global flags parsed in \`bin.ts\` MUST be spliced from argv (check \`--flag value\` AND \`--flag=value\`). (2) \`FLAG\_NAME\_PATTERN\` requires 2+ chars after \`--\`; single-char flags like \`--x\` silently become positionals — use aliases (\`-x\` → longer name). Bit dashboard widget \`--x\`/\`--y\`. (3) Generated SKILL.md filters hidden flags: \`FlagDef\` includes \`hidden?: boolean\`; \`extractFlags\` propagates so \`generateCommandDoc\` filters hidden flags alongside \`help\`/\`helpAll\`. Hidden flags (\`--log-level\`, \`--verbose\`) appear globally in \`docs/src/content/docs/commands/index.md\` Global Options. 
+* **Stricli rejects unknown flags — pre-parsed global flags must be consumed from argv**: Stricli flag parsing traps: (1) Unknown \`--flag\`s rejected — global flags parsed in \`bin.ts\` MUST be spliced from argv (check both \`--flag value\` and \`--flag=value\`). (2) \`FLAG\_NAME\_PATTERN\` requires 2+ chars after \`--\`; single-char flags like \`--x\` silently become positionals — use aliases (\`-x\` → longer name). Bit \`dashboard widget --x\`/\`--y\`. (3) \`FlagDef.hidden\` is propagated by \`extractFlags\` so \`generateCommandDoc\` filters hidden flags alongside \`help\`/\`helpAll\`; hidden \`--log-level\`/\`--verbose\` appear only in global options docs. * **test:unit glob only picks up test/lib, test/commands, test/types, test/isolated**: \`bun test\` script globs in \`package.json\` are narrow: \`test:unit\` = \`test/lib test/commands test/types\`, \`test:isolated\` = \`test/isolated\`, \`test:e2e\` = \`test/e2e\`. Tests placed under \`test/fixtures/\`, \`test/scripts/\`, or \`test/script/\` are NOT picked up by any standard CI script despite being valid Bun test files. Place new tests under \`test/lib/\/\` to inherit coverage. Note: \`test/script/\` (singular) exists with script tests but is also outside the globs. -* **Whole-buffer matchAll slower than split+test when aggregated over many files**: grep perf tradeoffs in \`src/lib/scan/grep.ts::readAndGrep\`: (1) Whole-buffer \`regex.exec\` is 12× faster on single files but ~1.6× SLOWER than \`split("\n") + regex.test\` aggregated over 10k files / 215k matches — match-emission cost dominates. 160× faster-than-rg headline comes entirely from \`mapFilesConcurrent.onResult\` early-exit at maxResults=100. (2) Literal prefilter is FILE-LEVEL gate only (cheap \`indexOf\` → skip file if absent), then whole-buffer \`regex.exec\` on survivors. Per-line-verify broke cross-newline patterns and Unicode length-changing \`toLowerCase\` (Turkish \`İ\`→\`i̇\`). 
Extractor handles multi-char escapes (\`\x41\`,\`\u0041\`,\`\cA\`,\`\k\\`,\`\p{L}\`) via \`escapeSequenceLength\`. \`hasTopLevelAlternation\`+\`skipGroup\` must call \`skipCharacterClass\` (PCRE \`\[]abc]\` ≠ JS). Extract from \`regex.source\` not user input. (3) \`GrepOptions.multiline\` defaults to \`true\` (rg/git-grep semantics) — thread through \`PerFileOptions\`; \`compilePattern\`'s \`multiline\` default stays \`false\`. +* **Whole-buffer matchAll slower than split+test when aggregated over many files**: grep perf traps in \`src/lib/scan/grep.ts::readAndGrep\`: (1) Whole-buffer \`regex.exec\` is 12× faster per-file but ~1.6× SLOWER than \`split('\n')+regex.test\` aggregated over 10k files — match-emission dominates. Early-exit at \`maxResults\` via \`mapFilesConcurrent.onResult\` is what wins. (2) Literal prefilter is FILE-LEVEL gate only (\`indexOf\` → skip), then whole-buffer \`regex.exec\`. Per-line-verify broke cross-newline patterns and Unicode length-changing \`toLowerCase\` (Turkish \`İ\`→\`i̇\`). Extractor handles multi-char escapes via \`escapeSequenceLength\`; \`hasTopLevelAlternation\`+\`skipGroup\` must call \`skipCharacterClass\` (PCRE \`\[]abc]\` ≠ JS). (3) \`GrepOptions.multiline\` defaults to \`true\` (rg/git-grep); \`compilePattern\`'s default stays \`false\`. (4) Wake-latch race: \`let notify=null; await new Promise(r=>notify=r)\` loses signals if producer calls \`wakeConsumer()\` before assignment — use a latched \`pendingWake\` flag. ### Pattern -* **Bench harness with concurrency sweep + deterministic fixtures**: Bench harness (not in CI): \`script/bench.ts\` + \`script/bench-sweep.ts\`. Scripts: \`bun run bench\`, \`bench:save\` (\`.bench/baseline.json\`), \`bench:compare\` (fails >20% regression), \`bench:sweep\` (concurrency tuning). Flags: \`--size\`, \`--op\`, \`--repo \\` (refuses \`--save-baseline\`), \`--runs\`, \`--warmup\`, \`--json\`. 
Fixtures in \`test/fixtures/bench/\` use xorshift32-seeded generator for byte-identical trees in \`os.tmpdir()/sentry-cli-bench/\`. Baselines gitignored. Op labels match Sentry span names. \`withBenchDb(fn)\` scopes \`SENTRY\_CONFIG\_DIR\`; \`clearDsnDetectionCache()\` between cold iterations. \*\*CONCURRENCY\_LIMIT\*\*: sweep showed knee=availableParallelism for walker-fed workloads; pure I/O wants 16-32. Walker dominates; kept at \`min(16, max(2, availableParallelism()))\`. +* **Bench harness with concurrency sweep + deterministic fixtures**: Bench harness (not in CI): \`script/bench.ts\` + \`script/bench-sweep.ts\`. Scripts: \`bun run bench\`, \`bench:save\` (\`.bench/baseline.json\`), \`bench:compare\` (fails >20% regression), \`bench:sweep\` (concurrency tuning). Flags: \`--size\`, \`--op\`, \`--repo \\` (refuses \`--save-baseline\`), \`--runs\`, \`--warmup\`, \`--json\`. Fixtures in \`test/fixtures/bench/\` use xorshift32-seeded generator for byte-identical trees in \`os.tmpdir()/sentry-cli-bench/\`. Op labels match Sentry span names. \`withBenchDb(fn)\` scopes \`SENTRY\_CONFIG\_DIR\`; \`clearDsnDetectionCache()\` between cold iterations. \*\*CONCURRENCY\_LIMIT\*\*: sweep showed knee=availableParallelism for walker-fed workloads; pure I/O wants 16-32. Kept at \`min(16, max(2, availableParallelism()))\`. * **Event view cross-org fallback chain: project → org → all orgs**: Event view cross-org fallback chain: \`fetchEventWithContext\` in \`src/commands/event/view.ts\` tries (1) \`getEvent(org, project, eventId)\`, (2) \`resolveEventInOrg(org, eventId)\`, (3) \`findEventAcrossOrgs(eventId, { excludeOrgs })\`. Extracted into \`tryEventFallbacks()\` for Biome complexity limit. \`excludeOrgs\` only set when same-org search returned null (not on transient error). Both catch blocks re-throw \`AuthError\` while swallowing transient errors. Warning distinguishes same-org/different-project from different-org via \`crossOrg.org === org\`. 
@@ -1063,12 +1066,9 @@ mock.module("./some-module", () => ({ * **Extract shared pipeline setup between streaming and collecting variants of same operation**: When a module exposes both a streaming (\`AsyncGenerator\`) and collecting (\`Promise<...>\`) variant of the same pipeline (e.g., \`grepFiles\` + \`collectGrep\` in \`src/lib/scan/grep.ts\`), extract a shared setup helper (e.g., \`setupGrepPipeline\`) that handles pattern compilation, matcher compilation, option defaulting, walker construction, and filter wiring. Otherwise divergence between the two setup paths becomes a silent correctness bug. The collecting variant typically adds a \`+1 probe\` pattern over the streaming variant's \`maxResults\` to distinguish 'exactly N' from 'N+ available'. - -* **Init-wizard grep/glob adapters must sandbox search.path via safePath before passing to scan**: Init-wizard grep/glob tools in \`src/lib/init/tools/{grep,glob}.ts\` are thin adapters over \`collectGrep\`/\`collectGlob\` from \`src/lib/scan/\`. The scan module trusts \`opts.path\` — per \`src/lib/scan/types.ts\`, sandbox enforcement is the adapter's responsibility. Adapters MUST call \`safePath(payload.cwd, search.path)\` from \`init/tools/shared.ts\` before forwarding; skipping this regresses the \`rejects grep paths outside the init sandbox\` test. \`safePath\` returns an absolute path + walks ancestors with \`realpathSync\` to block symlink escapes. Two-layer defense: \`validateToolSandbox(payload, context.directory)\` in the registry pre-checks \`payload.cwd\` is inside project root; \`safePath\` guards per-tool sub-paths. Wire contract (GrepPayload/GlobPayload in \`init/types.ts\`) is minimal: pattern/path?/include? for grep, patterns\[]/path?/maxResults? for glob — no caseInsensitive/fixedStrings/etc on the wire today. - * **PR review workflow: Cursor Bugbot + Seer + human cycle**: PR review workflow (Cursor Bugbot + Seer + human): \`gh pr checks <pr> --json state,link\` + \`gh run view --log-failed\` for CI. 
Unresolved threads via \`gh api graphql\` with \`reviewThreads\` query filtering \`isResolved:false\`+\`isMinimized:false\`. After fixes: reply via \`gh api repos/.../pulls/comments/\/replies\` then resolve via \`resolveReviewThread\` mutation. Bots auto-resolve on detected fix. Statuses: \`UNSTABLE\`=non-blocking bots running, \`BLOCKED\`=required CI pending, \`CLEAN\`+\`MERGEABLE\`=ready. Repo is squash-merge only. Expect 4-6 rounds on subtle regex/Unicode PRs. Bugbot findings usually real but occasionally assume PCRE/Python semantics (e.g. \`\[]abc]\` is class-with-\`]\` in PCRE but empty class in JS) — verify with reproduction. Dispatch self-review subagent between rounds. - -* **URL resolution chain and persistent default integration point**: CLI init sequence in \`startCli()\` (\`cli.ts\`): (1) fast-path \`\_\_complete\` dispatch, (2) \`preloadProjectContext(cwd)\` finds project root + \`.sentryclirc\`, calls \`applySentryCliRcEnvShim()\`, (3) persistent defaults applied, (4) \`runCli(args)\` with \`withTelemetry()\`. Heavy modules use \`await import()\` for fast completion. \`preloadProjectContext\` is try/catch wrapped. Sentry URL resolution priority: \`SENTRY\_HOST\` > \`SENTRY\_URL\` > \`.sentryclirc \[defaults] url\` > metadata \`defaults.url\` > \`sentry.io\`. Persistent URL default written to \`env.SENTRY\_URL\` only if both \`SENTRY\_HOST\` and \`SENTRY\_URL\` unset. + +* **Worker bodies as real .js files via text-import, not inline strings**: Prefer real \`.js\` files imported as text over inline template-literal worker sources. Inline strings are invisible to Biome/TypeScript — lint bugs (\`noIncrementDecrement\`, \`useBlockStatements\`, \`useTemplate\`, unused vars) go undetected. Pattern: write worker as plain JS (\`grep-worker.js\`, CJS-style \`require("node:fs")\` to avoid top-level-await parsing), import via \`with { type: "text" }\`, feed to \`Blob\` + \`URL.createObjectURL\`. 
Works in \`bun run\`, \`bun test\`, and compiled binaries (with the esbuild text-import plugin). Trade-off: no TS types in the worker body — keep message protocol types on main-thread side (\`worker-pool.ts\`). Must be self-contained (no user imports — worker module registry won't have them). diff --git a/script/build.ts b/script/build.ts index fe6737ecf..03e9b3a98 100644 --- a/script/build.ts +++ b/script/build.ts @@ -41,6 +41,7 @@ import { build as esbuild } from "esbuild"; import pkg from "../package.json"; import { uploadSourcemaps } from "../src/lib/api/sourcemaps.js"; import { injectDebugId, PLACEHOLDER_DEBUG_ID } from "./debug-id.js"; +import { textImportPlugin } from "./text-import-plugin.js"; const gzipAsync = promisify(gzip); @@ -134,6 +135,7 @@ async function bundleJs(): Promise { "process.env.NODE_ENV": JSON.stringify("production"), __SENTRY_DEBUG_ID__: JSON.stringify(PLACEHOLDER_DEBUG_ID), }, + plugins: [textImportPlugin], }); const output = result.metafile?.outputs[BUNDLE_JS]; diff --git a/script/bundle.ts b/script/bundle.ts index 20c863e15..a7795e2c1 100644 --- a/script/bundle.ts +++ b/script/bundle.ts @@ -4,6 +4,7 @@ import { build, type Plugin } from "esbuild"; import pkg from "../package.json"; import { uploadSourcemaps } from "../src/lib/api/sourcemaps.js"; import { injectDebugId, PLACEHOLDER_DEBUG_ID } from "./debug-id.js"; +import { textImportPlugin } from "./text-import-plugin.js"; const VERSION = pkg.version; const SENTRY_CLIENT_ID = process.env.SENTRY_CLIENT_ID ?? 
""; @@ -179,7 +180,11 @@ const sentrySourcemapPlugin: Plugin = { }; // Always inject debug IDs (even without auth token); upload is gated inside the plugin -const plugins: Plugin[] = [bunSqlitePlugin, sentrySourcemapPlugin]; +const plugins: Plugin[] = [ + bunSqlitePlugin, + sentrySourcemapPlugin, + textImportPlugin, +]; if (process.env.SENTRY_AUTH_TOKEN) { console.log(" Sentry auth token found, source maps will be uploaded"); diff --git a/script/text-import-plugin.ts b/script/text-import-plugin.ts new file mode 100644 index 000000000..b61b14b0c --- /dev/null +++ b/script/text-import-plugin.ts @@ -0,0 +1,51 @@ +/** + * esbuild plugin that polyfills Bun's `with { type: "text" }` import + * attribute. + * + * esbuild doesn't natively support the `text` import attribute (only + * `json`), but Bun does. Our CLI code uses it to load the grep worker + * source as a string at bundle time (see + * `src/lib/scan/worker-pool.ts`). Without this plugin, esbuild errors + * with `Importing with a type attribute of "text" is not supported` + * on any file that imports a sibling `.js` as text. + * + * The plugin intercepts imports whose `with` attribute matches + * `{ type: "text" }`, reads the file from disk, and emits it as a JS + * module that default-exports the file's contents as a string. + * Runtime behavior matches Bun's native handling, so the same source + * works in dev (via `bun run`) and in compiled binaries (esbuild + + * `bun build --compile` two-step). + * + * Used by both `script/build.ts` (single-file executable) and + * `script/bundle.ts` (CJS library bundle for npm). + */ + +import { readFileSync } from "node:fs"; +import { resolve as resolvePath } from "node:path"; +import type { Plugin } from "esbuild"; + +const TEXT_IMPORT_NS = "text-import"; +/** Match-any filter for esbuild's plugin API. Hoisted for top-level-regex lint. 
*/ +const ANY_FILTER = /.*/; + +export const textImportPlugin: Plugin = { + name: "text-import", + setup(build) { + build.onResolve({ filter: ANY_FILTER }, (args) => { + if (args.with?.type !== "text") { + return null; + } + return { + path: resolvePath(args.resolveDir, args.path), + namespace: TEXT_IMPORT_NS, + }; + }); + build.onLoad({ filter: ANY_FILTER, namespace: TEXT_IMPORT_NS }, (args) => { + const content = readFileSync(args.path, "utf-8"); + return { + contents: `export default ${JSON.stringify(content)};`, + loader: "js", + }; + }); + }, +}; diff --git a/src/lib/dsn/code-scanner.ts b/src/lib/dsn/code-scanner.ts index 24b6f8516..606e05cab 100644 --- a/src/lib/dsn/code-scanner.ts +++ b/src/lib/dsn/code-scanner.ts @@ -4,45 +4,61 @@ * This module owns the DSN-specific policy (URL regex, comment-line * filtering, host validation, package-path inference, stop-on-first * semantics). All file walking, `.gitignore` handling, extension - * filtering, and bounded concurrency are delegated to the shared - * `src/lib/scan/` module. + * filtering, bounded concurrency, AND worker-pool dispatch are + * delegated to the shared `src/lib/scan/` module via `collectGrep`. * * Flow: - * 1. `scanDirectory(cwd, stopOnFirst)` calls `walkFiles` with the - * DSN preset (`dsnScanOptions()`), passing `recordMtimes` and an - * `onDirectoryVisit` hook so the cache-invalidation map is - * populated in one traversal. - * 2. Each yielded file is read + passed through `extractDsnsFromContent` - * via `mapFilesConcurrent`. Per-file `ConfigError` re-throws up - * to the caller; all other errors are logged at debug level and - * the file is skipped. - * 3. `onResult` in `mapFilesConcurrent` dedups into a shared Map - * and raises the early-exit flag on first unique DSN when - * `stopOnFirst: true`. + * 1. 
`scanDirectory(cwd, stopOnFirst)` calls `collectGrep` with the + * DSN pattern and preset (`dsnScanOptions()`), plus + * `recordMtimes: true` and an `onDirectoryVisit` hook so the + * cache-invalidation maps are populated in one traversal. + * 2. `collectGrep` dispatches per-file work to the worker pool (when + * available) or a concurrent-async fallback. Each yielded + * `GrepMatch` represents one line containing a DSN URL; the + * grep engine handles the file-level literal gate (`http`) for + * free, so we skip files that can't possibly match before any + * regex runs. + * 3. Main thread post-filters each match: + * - Skip commented lines (language-aware comment prefixes) + * - Re-run `DSN_PATTERN` on `match.line` to recover all DSNs + * (grep emits one match per line regardless of how many + * hits the line contains — rare for DSNs but the contract + * predates this refactor) + * - Validate host (`isValidDsnHost`) + * - Dedup on raw DSN string + * - Early-exit on first unique DSN when `stopOnFirst: true` + * - Build `DetectedDsn` with inferred package path + * 4. `sourceMtimes` records mtime per file that contributed a + * validated DSN; `dirMtimes` records mtime per visited dir via + * the hook. Both are used by `src/lib/db/dsn-cache.ts` for + * cache invalidation. * * Behavior change landed in PR 3: the walker's `nestedGitignore: true` * default (via `dsnScanOptions()`) means nested `.gitignore` files are * now honored. Pre-PR-3 code only read the project-root `.gitignore`. * This is a correctness improvement matching git's cumulative semantics; * DSNs in files covered by a subdir `.gitignore` are no longer detected. + * + * Behavior change landed in PR 6 (this one): the DSN scanner now shares + * the grep pipeline and gets worker-pool parallelism for free. + * End-to-end time on the 10k-file fixture drops from ~330ms → ~200ms. 
+ * Correctness is unchanged — `extractDsnsFromContent` is still + * exported for `src/lib/dsn/detector.ts::isDsnStillPresent` (the + * cache-verify fast path for a single file) and internally we still + * go through the same comment/host-validation filter. */ import path from "node:path"; import { DEFAULT_SENTRY_HOST, getConfiguredSentryUrl } from "../constants.js"; import { ConfigError } from "../errors.js"; import { logger } from "../logger.js"; -import { - mapFilesConcurrent, - normalizePath, - type WalkEntry, - walkFiles, -} from "../scan/index.js"; +import { collectGrep, normalizePath, walkFiles } from "../scan/index.js"; import { withTracingSpan } from "../telemetry.js"; import { createDetectedDsn, inferPackagePath, parseDsn } from "./parser.js"; import { DSN_MAX_DEPTH, dsnScanOptions } from "./scan-options.js"; import type { DetectedDsn } from "./types.js"; -/** Scoped logger for DSN code scanning */ +/** Scoped logger for DSN code scanning. */ const log = logger.withTag("dsn-scan"); /** @@ -202,21 +218,79 @@ export function extractFirstDsnFromContent(content: string): string | null { * mtimes for cache invalidation. */ export function scanCodeForDsns(cwd: string): Promise { - return scanDirectory(cwd, false); + return scanDirectory(cwd); } /** * Scan a directory and return the first DSN found. * * Optimized for the common case of single-project repositories. - * Stops scanning as soon as a valid DSN is found (propagates via - * `mapFilesConcurrent`'s shared early-exit flag). + * This path deliberately avoids the worker pool — spawning workers + * for a stopOnFirst scan adds ~20ms of startup cost that dwarfs the + * actual work (most scans find the DSN in the first few files). + * + * Walks files one at a time on the main thread, reading each with + * `Bun.file().text()` and passing through `extractFirstDsnFromContent` + * which benefits from the `/http/i` literal fast path — ~99% of + * files skip the full regex entirely. 
+ * + * The full-scan variant `scanCodeForDsns` takes the opposite + * tradeoff: it uses workers + parallel file reads, which is a net + * win when we need to inspect every file. */ -export async function scanCodeForFirstDsn( - cwd: string -): Promise { - const { dsns } = await scanDirectory(cwd, true); - return dsns[0] ?? null; +export function scanCodeForFirstDsn(cwd: string): Promise { + return withTracingSpan( + "scanCodeForFirstDsn", + "dsn.detect.code", + // biome-ignore lint/complexity/noExcessiveCognitiveComplexity: walker loop + read + extract + validate + span-status error branch is inherently branchy + async (span) => { + let filesScanned = 0; + try { + for await (const entry of walkFiles({ + cwd, + ...dsnScanOptions(), + })) { + filesScanned += 1; + let content: string; + try { + content = await Bun.file(entry.absolutePath).text(); + } catch { + continue; + } + const raw = extractFirstDsnFromContent(content); + if (raw === null) { + continue; + } + const detected = createDetectedDsn( + raw, + "code", + entry.relativePath, + inferPackagePath(entry.relativePath) + ); + if (detected !== null) { + span.setAttribute("dsn.files_scanned", filesScanned); + span.setAttribute("dsn.dsns_found", 1); + return detected; + } + } + span.setAttribute("dsn.files_scanned", filesScanned); + span.setAttribute("dsn.dsns_found", 0); + return null; + } catch (error) { + if (error instanceof ConfigError) { + throw error; + } + span.setStatus({ code: 2, message: "Directory scan failed" }); + log.debug(`scanCodeForFirstDsn failed: ${String(error)}`); + return null; + } + }, + { + "dsn.scan_dir": cwd, + "dsn.stop_on_first": true, + "dsn.max_depth": DSN_MAX_DEPTH, + } + ); } /** @@ -281,27 +355,23 @@ function isValidDsnHost(dsn: string): boolean { ); } -/** - * Bundle of per-scan mutable state. Collecting these into one record - * keeps the per-file processor's arity under Biome's 4-param ceiling. 
- */ -type ScanDirectoryState = { - cwd: string; - stopOnFirst: boolean; - seen: Map; - sourceMtimes: Record; - filesScanned: { count: number }; -}; - /** * Main scan implementation. Wraps the pipeline in a traced span so * production dashboards + the `scanCodeForDsns` bench op stay in * sync. Attribute names match the pre-PR-3 scanner byte-for-byte. + * + * Delegates the heavy lifting to `collectGrep`: + * - Walker config (depth, gitignore, skip dirs) from `dsnScanOptions()`. + * - `DSN_PATTERN` as the grep pattern — the engine's literal + * prefilter uses `http` as a file-level gate, identical to the + * pre-refactor `HTTP_SCHEME_PROBE` regex. + * - `recordMtimes: true` + `onDirectoryVisit` populate the two + * cache-invalidation maps in one traversal. + * - Worker pool handles per-file read + regex in parallel; main + * thread post-filters each `GrepMatch` for comments and host + * validation. */ -function scanDirectory( - cwd: string, - stopOnFirst: boolean -): Promise { +function scanDirectory(cwd: string): Promise { return withTracingSpan( "scanCodeForDsns", "dsn.detect.code", @@ -309,60 +379,54 @@ function scanDirectory( const sourceMtimes: Record = {}; const dirMtimes: Record = {}; const seen = new Map(); - // Mutable counters threaded through the state object so the - // per-file callback can update them without capturing. - const filesScanned = { count: 0 }; - let filesCollected = 0; - - const state: ScanDirectoryState = { - cwd, - stopOnFirst, - seen, - sourceMtimes, - filesScanned, - }; + // Dedup set for mtime recording: one entry per file that + // contributed a validated DSN. Used to skip redundant mtime + // writes in `processMatch` — `grepFiles` emits one match per + // matching line, so a file with 3 DSN-containing lines would + // otherwise trigger 3 mtime writes for the same path. NOT + // used for telemetry (see `stats.filesRead` below). 
+ const filesSeenForMtime = new Set(); try { - // Walker yields every text file under cwd that passes the DSN - // preset (depth 3 with monorepo reset, full DSN skip list, - // nested .gitignore honored). We tap the iterator to count - // collected files for telemetry, then feed the tapped stream - // through mapFilesConcurrent for bounded parallel reads. - const walkSource = walkFiles({ + const { matches, stats } = await collectGrep({ cwd, + pattern: DSN_PATTERN, ...dsnScanOptions(), recordMtimes: true, onDirectoryVisit: (absDir, mtimeMs) => { const rel = normalizePath(path.relative(cwd, absDir)) || "."; dirMtimes[rel] = mtimeMs; }, - }); - const tapped = tapWalker(walkSource, () => { - filesCollected += 1; + // Disable line-text truncation. The default (2000 chars) is + // fine for interactive grep output, but would silently drop + // DSNs that appear past column ~1900 on long minified lines. + // `processMatch` re-runs `DSN_PATTERN` on `match.line`, so + // a `…` suffix introduced by truncation would make the + // regex fail at the pattern-terminating `/\d+`. The + // per-DSN memory cost is bounded by the walker's + // `maxFileSize` (256 KB) and the `seen` dedup map. + maxLineLength: Number.POSITIVE_INFINITY, }); - await mapFilesConcurrent( - tapped, - (entry) => processEntry(entry, state), - { - onResult: (detected) => { - let firstUnique = false; - for (const dsn of detected) { - if (!seen.has(dsn.raw)) { - seen.set(dsn.raw, dsn); - if (stopOnFirst) { - firstUnique = true; - } - } - } - return firstUnique ? { done: true } : undefined; - }, - } - ); + for (const match of matches) { + processMatch(match, { + seen, + sourceMtimes, + filesSeenForMtime, + }); + } - span.setAttribute("dsn.files_collected", filesCollected); + // `stats.filesRead` is the count of files actually read and + // tested by the grep pipeline — matches the walker-yielded + // file count after the walker's own filters (extension, + // gitignore, etc.). 
Consistent with `scanCodeForFirstDsn` + // which counts every file it walks. `files_collected` is a + // legacy attribute: alias to the same count so telemetry + // consumers don't need to know about the historical + // distinction. + span.setAttribute("dsn.files_collected", stats.filesRead); span.setAttributes({ - "dsn.files_scanned": filesScanned.count, + "dsn.files_scanned": stats.filesRead, "dsn.dsns_found": seen.size, }); @@ -372,7 +436,6 @@ function scanDirectory( dirMtimes, }; } catch (error) { - // ConfigError is a user-facing misconfiguration — surface it. if (error instanceof ConfigError) { throw error; } @@ -389,83 +452,90 @@ function scanDirectory( }, { "dsn.scan_dir": cwd, - "dsn.stop_on_first": stopOnFirst, + "dsn.stop_on_first": false, "dsn.max_depth": DSN_MAX_DEPTH, } ); } /** - * Per-file worker: read the file, extract DSNs via the DSN-specific - * content pipeline, wrap each raw match in a `DetectedDsn`. Returns - * `null` when the file contributes nothing (no raw matches OR every - * match got rejected by host validation OR the file was unreadable) - * so `mapFilesConcurrent` skips the push + onResult callback entirely - * — important on large walks where most files have zero DSNs. + * Per-match processing context. Collected into one record so the + * hot loop's callback stays under Biome's cognitive-complexity + * ceiling and the caller can mutate `seen` / `sourceMtimes` / + * `filesSeenForMtime` by reference. + */ +type MatchProcessingContext = { + seen: Map; + sourceMtimes: Record; + filesSeenForMtime: Set; +}; + +/** + * Process one `GrepMatch` from the DSN scan: * - * Re-throws `ConfigError` so `scanDirectory`'s outer try/catch can - * propagate user-facing misconfig. All other fs errors are logged at - * debug level and treated as "skip this file silently". + * 1. Skip commented lines. + * 2. Extract all DSNs on the line via `extractDsnsOnLine` (rare + * multi-DSN lines preserved from pre-refactor behavior). + * 3. 
For each DSN: validate via `createDetectedDsn`, record + * `fileHadValidDsn` for mtime tracking, dedup into `seen`. + * 4. If this file contributed at least one validated DSN, record + * its mtime in `sourceMtimes` (first match wins since subsequent + * matches on the same file have the same mtime). */ -async function processEntry( - entry: WalkEntry, - state: ScanDirectoryState -): Promise { - state.filesScanned.count += 1; - try { - const content = await Bun.file(entry.absolutePath).text(); - const raws = extractDsnsFromContent( - content, - state.stopOnFirst ? 1 : undefined - ); - if (raws.length === 0) { - // Return null so `mapFilesConcurrent` skips the push + the - // `onResult` callback entirely. On a typical 10k-file walk - // where ~99% of files contain no DSN, returning `[]` would - // fire ~9900 no-op `onResult` calls and push that many empty - // arrays into the (otherwise unused) results list. `null` is - // the documented "skip this file" signal. - return null; +function processMatch( + match: { + path: string; + absolutePath: string; + line: string; + mtime?: number; + }, + ctx: MatchProcessingContext +): void { + if (isCommentedLine(match.line.trim())) { + return; + } + const dsns = extractDsnsOnLine(match.line); + if (dsns.length === 0) { + return; + } + const packagePath = inferPackagePath(match.path); + let fileHadValidDsn = false; + for (const raw of dsns) { + const detected = createDetectedDsn(raw, "code", match.path, packagePath); + if (detected === null) { + continue; } - const packagePath = inferPackagePath(entry.relativePath); - const detected = raws - .map((raw) => - createDetectedDsn(raw, "code", entry.relativePath, packagePath) - ) - .filter((d): d is DetectedDsn => d !== null); - // Only record mtime when at least one DSN was accepted (matches - // pre-PR-3 behavior — the cache only tracks files it cares about). 
- if (detected.length === 0) { - // All raw matches got rejected by host validation (e.g., a - // self-hosted repo scanning a file with a saas.sentry.io URL). - // Skip the push + onResult — same signal as "no DSNs found". - return null; + fileHadValidDsn = true; + if (!ctx.seen.has(raw)) { + ctx.seen.set(raw, detected); } - state.sourceMtimes[entry.relativePath] = entry.mtime; - return detected; - } catch (error) { - if (error instanceof ConfigError) { - throw error; + } + if (fileHadValidDsn && !ctx.filesSeenForMtime.has(match.absolutePath)) { + ctx.filesSeenForMtime.add(match.absolutePath); + if (match.mtime !== undefined) { + ctx.sourceMtimes[match.path] = match.mtime; } - // ENOENT / EACCES / malformed content — the pre-PR-3 scanner - // matched these with a single `log.debug(...)` and returned - // empty. Preserve that behavior exactly (null = skip silently). - log.debug(`Cannot read file: ${entry.relativePath}`); - return null; } } /** - * Pass-through async generator that invokes `onEach` once per entry - * before yielding. Lets `scanDirectory` count collected files without - * forking the walker's output iterator. + * Extract DSN URLs from a single line's text. Called by + * `scanDirectory` on each matching line yielded by `grepFiles`. + * Runs the full `DSN_PATTERN` match + host validation, mirroring + * `extractDsnsFromContent` but without the whole-file literal gate + * (grep already handles that). + * + * Most lines have zero or one DSN; this loop handles the rare case + * of two URLs on one line (a preserved behavior of the pre-refactor + * scanner's `content.matchAll`). 
*/ -async function* tapWalker( - source: AsyncIterable, - onEach: () => void -): AsyncGenerator { - for await (const entry of source) { - onEach(); - yield entry; +function extractDsnsOnLine(line: string): string[] { + const dsns: string[] = []; + for (const m of line.matchAll(DSN_PATTERN)) { + const raw = m[0]; + if (!dsns.includes(raw) && isValidDsnHost(raw)) { + dsns.push(raw); + } } + return dsns; } diff --git a/src/lib/scan/grep-worker.js b/src/lib/scan/grep-worker.js new file mode 100644 index 000000000..8d6f90735 --- /dev/null +++ b/src/lib/scan/grep-worker.js @@ -0,0 +1,153 @@ +/** + * Grep worker body. Imported as raw text (`with { type: "text" }`) + * by `worker-pool.ts` and handed to a `Blob` / `URL.createObjectURL` + * at runtime to spawn workers. + * + * ## Why a plain `.js` file, not a `.ts` one + * + * This file is NEVER executed as a module via `import` — it's always + * loaded as the body of a `new Worker()`. So we need its contents to + * be valid JS verbatim (no TS syntax). Keeping it `.js` avoids a + * transpile step and makes the flow obvious: the file you see is + * exactly what the worker engine parses. + * + * ## Why a separate file and not a string in TS + * + * Linting, syntax validation, and IDE tooling all work on real files. + * A previous iteration inlined this as a template literal inside a + * TS source — but the string was invisible to lint/format/search. + * `with { type: "text" }` gives us the best of both: the worker is a + * first-class file at dev time, and a string constant at runtime. + * + * ## Why not `new Worker(url)` with a TS file entrypoint + * + * Bun's `bun build --compile` docs describe passing the worker TS + * file as an additional entrypoint. We tested that with our + * esbuild→Bun compile two-step and all three documented forms + * (`new Worker("./path.ts")`, `new Worker(new URL(...))`, + * `new Worker(URL.href)`). 
The URL forms fail in compiled binaries + * because `import.meta.url` resolves to the binary path — the + * bundler doesn't rewrite URLs at compile time. The plain-string + * form resolves relative to the project root that `bun build` ran + * from, which works in compiled mode but requires `bun run` / `bun + * test` to also be invoked from the project root — brittle. + * + * The Blob-URL + text-import approach has no such fragility and + * works identically in dev, `bun test`, and compiled binaries. + * + * ## Self-contained + * + * - No imports from local modules (they wouldn't be available in + * the worker's module registry anyway). + * - Uses `require("node:fs")` (CJS-style) which Bun exposes to + * workers; avoids the top-level await concerns of ESM imports. + * + * ## Protocol + * + * Main → Worker: `{ paths, patternSource, flags, maxLineLength, + * maxMatchesPerFile, literal }` + * + * Worker → Main: + * - Once on startup: `{ type: "ready" }` + * - Per request: `{ type: "result", ints: Uint32Array, linePool: string }` + * — `ints.buffer` is transferred (zero-copy); `linePool` is cloned. + * + * ## Match encoding + * + * Each match is 4 consecutive `u32`s in `ints`: + * [0] pathIdx index into the input `paths` array + * [1] lineNum 1-based line number + * [2] lineOffset character offset into `linePool` + * [3] lineLength character length of the line (post-truncation) + * + * Structured-clone of `GrepMatch[]` for 215k-match workloads costs + * ~200ms because the per-match path/line strings get deep-copied. + * The binary-packed form + single shared `linePool` string drops + * that to ~2-3ms. 
+ */ + +const { readFileSync } = require("node:fs"); + +// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: hot regex loop with literal gate + line counter + line-bound extraction + per-file cap is inherently branchy +self.onmessage = (event) => { + const { + paths, + patternSource, + flags, + maxLineLength, + maxMatchesPerFile, + literal, + } = event.data; + const regex = new RegExp(patternSource, flags); + // Packed match data: 4 u32s per match — [pathIdx, lineNum, lineOffset, lineLength] + const ints = []; + let linePool = ""; + + for (let pathIdx = 0; pathIdx < paths.length; pathIdx += 1) { + const p = paths[pathIdx]; + let content; + try { + content = readFileSync(p, "utf-8"); + } catch { + continue; + } + + // File-level literal gate (matches main-thread readAndGrep behavior). + if (literal !== null) { + const isCaseInsensitive = regex.flags.includes("i"); + const haystack = isCaseInsensitive ? content.toLowerCase() : content; + if (haystack.indexOf(literal) === -1) { + continue; + } + } + + regex.lastIndex = 0; + let m = regex.exec(content); + let lineNum = 1; + let cursor = 0; + let fileMatches = 0; + + while (m !== null) { + const matchIndex = m.index; + // Advance line counter to match position via indexOf hops. + let nl = content.indexOf("\n", cursor); + while (nl !== -1 && nl < matchIndex) { + lineNum += 1; + nl = content.indexOf("\n", nl + 1); + } + cursor = matchIndex; + + const lineStart = content.lastIndexOf("\n", matchIndex) + 1; + const lineEndRaw = content.indexOf("\n", matchIndex); + const lineEnd = lineEndRaw === -1 ? 
content.length : lineEndRaw; + let line = content.slice(lineStart, lineEnd); + if (line.length > maxLineLength) { + line = `${line.slice(0, maxLineLength - 1)}\u2026`; + } + + // Append line text to pool, record offset+length + const lineOffset = linePool.length; + linePool += line; + ints.push(pathIdx, lineNum, lineOffset, line.length); + + fileMatches += 1; + if (fileMatches >= maxMatchesPerFile) { + break; + } + if (lineEndRaw === -1) { + break; + } + regex.lastIndex = lineEnd + 1; + m = regex.exec(content); + } + } + + // Pack into Uint32Array with transferable backing buffer. + const packed = new Uint32Array(ints); + self.postMessage( + { type: "result", ints: packed, linePool }, + { transfer: [packed.buffer] } + ); +}; + +self.postMessage({ type: "ready" }); diff --git a/src/lib/scan/grep.ts b/src/lib/scan/grep.ts index 313cb0c9c..15d98667b 100644 --- a/src/lib/scan/grep.ts +++ b/src/lib/scan/grep.ts @@ -69,6 +69,11 @@ import type { WalkOptions, } from "./types.js"; import { walkFiles } from "./walker.js"; +import { + decodeWorkerMatches, + getWorkerPool, + isWorkerSupported, +} from "./worker-pool.js"; /** Default line-truncation length for the init-wizard wire shape. */ const DEFAULT_MAX_LINE_LENGTH = 2000; @@ -170,6 +175,13 @@ type PerFileOptions = { * silent misses under per-line verify. */ literal: string | null; + /** + * When true, each emitted `GrepMatch` carries `mtime` (the + * source file's floored `mtimeMs`). The walker produces it; the + * grep path just forwards it. Incurs one extra stat per file at + * the walker layer. + */ + recordMtimes: boolean; }; /** @@ -347,7 +359,7 @@ function buildMatch(ctx: MatchContext, bounds: LineBounds): GrepMatch { rawLine.length > ctx.opts.maxLineLength ? `${rawLine.slice(0, ctx.opts.maxLineLength - 1)}…` : rawLine; - return { + const match: GrepMatch = { path: ctx.opts.pathPrefix ? 
joinPosix(ctx.opts.pathPrefix, ctx.entry.relativePath) : ctx.entry.relativePath, @@ -355,6 +367,10 @@ function buildMatch(ctx: MatchContext, bounds: LineBounds): GrepMatch { lineNum: bounds.lineNum, line, }; + if (ctx.opts.recordMtimes) { + match.mtime = ctx.entry.mtime; + } + return match; } /** @@ -374,6 +390,8 @@ function buildWalkOptions(opts: GrepOptions, root: string): WalkOptions { maxFileSize: opts.maxFileSize, descentHook: opts.descentHook, followSymlinks: opts.followSymlinks, + recordMtimes: opts.recordMtimes, + onDirectoryVisit: opts.onDirectoryVisit, signal: opts.signal, timeBudgetMs: opts.timeBudgetMs, }; @@ -393,15 +411,68 @@ type GrepPipelineOptions = { }; /** - * The internal pipeline. Separated from the public entry point so - * `collectGrep` can share it. + * Batch size for worker dispatch. Each batch is a chunk of N file + * paths the worker will read + grep in one go. Benched on + * synthetic/large (10k files): 100 = 420ms, 200 = 275ms (streamed), + * 400 = 290ms. The plateau is around 200 — too small and postMessage + * overhead dominates; too large and we lose per-worker parallelism + * because the walker finishes yielding before the first worker + * returns. + */ +const WORKER_BATCH_SIZE = 200; + +/** + * The internal pipeline. Separated from the public entry points so + * `grepFiles` and `collectGrep` share it. + * + * Two execution strategies: * - * `async *` is intentional even though we `yield*` into the - * `for await` — the whole pipeline is async and callers expect - * an `AsyncGenerator`. + * 1. **Worker pool** (default when the runtime supports `new Worker` + * + `Blob` + `URL.createObjectURL`): walker streams paths on the + * main thread, paths are batched into chunks of + * `WORKER_BATCH_SIZE`, each batch is dispatched round-robin to + * a lazy-initialized pool of workers. Workers return matches + * packed as `Uint32Array` + a shared string pool (transferable, + * zero-copy). Main thread decodes and yields. + * + * 2. 
**Async fallback** (when workers aren't available — e.g. some + * Node library embeddings): falls back to `mapFilesConcurrent` + * on the main thread. Same result, ~4× slower on large workloads. + * + * Worker disablement for tests or debugging: set + * `SENTRY_SCAN_DISABLE_WORKERS=1`. */ +// biome-ignore lint/suspicious/useAwait: async generator that dispatches to one of two sub-generators via `yield*`; no `await` needed in the dispatcher itself async function* grepFilesInternal( opts: GrepPipelineOptions +): AsyncGenerator { + if (shouldUseWorkers()) { + yield* grepViaWorkers(opts); + return; + } + yield* grepViaAsyncMain(opts); +} + +/** + * Returns true when the worker path should be used. Gated on the + * runtime capability check (`isWorkerSupported`) and the + * `SENTRY_SCAN_DISABLE_WORKERS` env var (for tests + debugging). + */ +function shouldUseWorkers(): boolean { + if (process.env.SENTRY_SCAN_DISABLE_WORKERS === "1") { + return false; + } + return isWorkerSupported(); +} + +/** + * Fallback pipeline: `mapFilesConcurrentStream` on the main thread. + * Used when the worker pool is unavailable or explicitly disabled. + * Identical semantics to the worker path; ~4× slower on 10k-file + * many-match workloads. + */ +async function* grepViaAsyncMain( + opts: GrepPipelineOptions ): AsyncGenerator { for await (const match of mapFilesConcurrentStream( opts.walkSource, @@ -431,6 +502,281 @@ async function* grepFilesInternal( } } +/** + * Worker-based pipeline. Streams paths from the walker, batches + * them, dispatches to workers, decodes results, yields matches in + * worker-completion order (not walker order). + * + * ### Streaming + early-exit + * + * Walker runs on main thread. As paths arrive, we accumulate into a + * `batch: string[]`. When the batch reaches `WORKER_BATCH_SIZE` we + * dispatch it and start a new batch. 
Each dispatch returns a + * promise; we race the set of in-flight dispatches so the first + * batch to finish yields its matches first (workers may complete + * out of order). + * + * On `maxResults`/`stopOnFirst`, we stop dispatching new batches. + * In-flight batches keep running (no cancellation API on workers) + * but their results are discarded once we've exited. + * + * `filesRead` counts all paths dispatched to workers — we don't + * distinguish "read successfully" from "open failed" at the worker + * boundary (the worker swallows the error and produces zero matches + * for that path). For the main-thread fallback the stat is more + * precise. + */ +// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: producer/consumer pattern with early exit, batch dispatch, and abort-signal propagation is inherently branchy +async function* grepViaWorkers( + opts: GrepPipelineOptions +): AsyncGenerator { + const pool = getWorkerPool(); + + // Queue of completed batches ready to emit. Filled by dispatched + // worker promises as they resolve; drained by this generator. + const pending: GrepMatch[][] = []; + // Consumer wait state: `pendingNotify` is a one-shot resolver set + // by the consumer when it's about to wait; `notifyPending` tracks + // a notification that arrived while no one was waiting, so the + // consumer picks it up on next loop iteration. + let pendingNotify: (() => void) | null = null; + let notifyPending = false; + const wakeConsumer = () => { + if (pendingNotify) { + const fn = pendingNotify; + pendingNotify = null; + fn(); + } else { + notifyPending = true; + } + }; + + let earlyExit = false; + let walkerDone = false; + let inflightCount = 0; + + // Tracks batches that failed at dispatch time (worker error, + // "all workers dead", etc.) so we can surface silent-failure + // scenarios to the caller. 
A single failed batch is not fatal + // (its results are dropped — per-file errors are already expected + // to be swallowed inside the worker), but if every batch fails + // we throw so callers aren't handed a false-negative empty result + // that could poison downstream caches. + let dispatchedBatches = 0; + let failedBatches = 0; + // Track dispatch promises so the consumer's `finally` can await + // them before checking the failure counters. Without this, an + // early exit (e.g., AbortError, maxResults = 0) could race the + // final failure assessment: the consumer's finally runs before + // in-flight dispatch `.catch()` handlers have bumped + // `failedBatches`, causing a legitimate all-failure scenario to + // silently pass through as "no matches found." + const dispatchPromises: Promise[] = []; + + const dispatchBatch = ( + paths: string[], + rels: string[], + mtimes: readonly number[] | null + ): void => { + opts.stats.filesRead += paths.length; + inflightCount += 1; + dispatchedBatches += 1; + const dispatchP = pool + .dispatch({ + paths, + patternSource: opts.perFile.regex.source, + flags: resolveWorkerFlags(opts.perFile), + maxLineLength: opts.perFile.maxLineLength, + maxMatchesPerFile: Number.isFinite(opts.perFile.maxMatchesPerFile) + ? opts.perFile.maxMatchesPerFile + : 0xff_ff_ff_ff, + literal: opts.perFile.literal, + }) + .then( + (result) => { + // `decodeWorkerMatches` attaches per-file mtime by pathIdx + // when `mtimes` is non-null. Mtime comes from the walker + // (main thread) — worker doesn't need to stat again. + pending.push(decodeWorkerMatches(result, paths, rels, mtimes)); + }, + () => { + // Worker error — drop this batch's results. Per-file errors + // are already swallowed inside the worker itself. Counted so + // the consumer can surface total-pipeline failure if every + // batch failed. 
+ failedBatches += 1; + } + ) + .finally(() => { + inflightCount -= 1; + wakeConsumer(); + }); + dispatchPromises.push(dispatchP); + }; + + // Producer: walk paths, batch them, dispatch each full batch. + // Errors thrown by the walker (e.g. `AbortError` when the caller + // signals abort) propagate to the consumer via `producerError`. + let producerError: unknown = null; + const recordMtimes = opts.perFile.recordMtimes; + // biome-ignore lint/complexity/noExcessiveCognitiveComplexity: walker drain + path-prefix handling + batch flush + error capture is inherently branchy + const producer = (async () => { + let batch: string[] = []; + let batchRel: string[] = []; + // Per-batch mtime array (parallel to `batch`). Only allocated + // when the caller opts in via `recordMtimes`; otherwise stays + // null and the decoder skips the `mtime` field entirely. + let batchMtimes: number[] | null = recordMtimes ? [] : null; + try { + for await (const entry of opts.walkSource) { + if (earlyExit) { + break; + } + batch.push(entry.absolutePath); + batchRel.push( + opts.perFile.pathPrefix + ? joinPosix(opts.perFile.pathPrefix, entry.relativePath) + : entry.relativePath + ); + if (batchMtimes !== null) { + batchMtimes.push(entry.mtime); + } + if (batch.length >= WORKER_BATCH_SIZE) { + dispatchBatch(batch, batchRel, batchMtimes); + batch = []; + batchRel = []; + batchMtimes = recordMtimes ? [] : null; + } + } + // Final partial batch. + if (batch.length > 0) { + dispatchBatch(batch, batchRel, batchMtimes); + } + } catch (error) { + producerError = error; + } finally { + walkerDone = true; + wakeConsumer(); + } + })(); + + // Helper to check + throw on aborted signal. Matches the behavior + // of `walkFiles` which throws `DOMException("AbortError")` on the + // next yield when the signal fires mid-iteration. 
+ const checkAborted = (): void => { + const signal = opts.concurrent.signal; + if (signal?.aborted) { + throw new DOMException("aborted", "AbortError"); + } + }; + + // Consumer: drain `pending` as batches complete, yielding matches + // and respecting `maxResults`/`stopOnFirst`. Wait for `notify` + // when there's nothing to emit and more work is in flight. + try { + while (!earlyExit) { + checkAborted(); + // Emit everything currently pending. + while (pending.length > 0) { + checkAborted(); + const matches = pending.shift(); + if (!matches) { + continue; + } + for (const m of matches) { + if (opts.stats.matchesEmitted >= opts.maxResults) { + opts.stats.truncated = true; + earlyExit = true; + break; + } + opts.stats.matchesEmitted += 1; + yield m; + checkAborted(); + if (opts.stopOnFirst) { + opts.stats.truncated = true; + earlyExit = true; + break; + } + } + if (earlyExit) { + break; + } + } + if (earlyExit) { + break; + } + // Nothing pending. If the walker is done AND no in-flight + // batches remain, we're finished. + if (walkerDone && inflightCount === 0) { + break; + } + // Otherwise, wait for something to happen. Check for a + // pre-arrived notification first to avoid missing wakeups + // that happened while we were iterating `pending`. + if (notifyPending) { + notifyPending = false; + continue; + } + await new Promise((resolve) => { + if (notifyPending) { + notifyPending = false; + resolve(); + } else { + pendingNotify = resolve; + } + }); + } + } finally { + // Ensure the producer settles even on early exit so errors + // surface in the next run (pool promises stay alive otherwise). + earlyExit = true; + wakeConsumer(); + await producer; + // Await every dispatched batch so `failedBatches` is final + // before we inspect it. Without this, an early exit could run + // the failure check before in-flight `.catch()` handlers + // bumped the counter — collapsing an all-failure scenario into + // a silent "no matches" return. 
`allSettled` because we don't + // care about individual rejections here (each was already + // counted via the dispatch's `.catch()` handler). + if (dispatchPromises.length > 0) { + await Promise.allSettled(dispatchPromises); + } + // If the walker threw (typically `AbortError`), re-throw so the + // caller sees the same behavior as the async-fallback path. + if (producerError !== null) { + // biome-ignore lint/correctness/noUnsafeFinally: intentional — re-raising walker error (e.g., AbortError) so callers see it + throw producerError; + } + // Worker-pipeline failure detector: if at least one batch was + // dispatched AND every dispatched batch failed, the pipeline + // returned zero matches due to worker failures — NOT due to the + // regex not matching. We must surface this so callers (notably + // the DSN cache layer) don't persist a false-negative empty + // result. A partial failure (some batches succeeded) is + // acceptable — we've at least reported the matches we did find. + if (dispatchedBatches > 0 && failedBatches === dispatchedBatches) { + // biome-ignore lint/correctness/noUnsafeFinally: intentional — surfacing pipeline-wide failure so false-negative empty result doesn't leak upstream + throw new Error( + `worker pipeline: all ${dispatchedBatches} dispatched batch(es) failed` + ); + } + } +} + +/** + * Resolve the regex flags to send to the worker. Matches the logic + * in `grepByWholeBuffer` — always `/g` (required for iteration), + * plus `/m` when the caller opted into line-boundary semantics (the + * default). + */ +function resolveWorkerFlags(opts: PerFileOptions): string { + const ensured = opts.multiline + ? ensureGlobalMultilineFlags(opts.regex) + : ensureGlobalFlag(opts.regex); + return ensured.flags; +} + /** * Resolved pipeline inputs shared by `grepFiles` and `collectGrep`. * Extracted so both entry points get the same default-resolution, @@ -449,6 +795,8 @@ type GrepPipelineSetup = { * are skipped entirely. 
*/ literal: string | null; + /** Mirrors `GrepOptions.recordMtimes`. Threaded into PerFileOptions. */ + recordMtimes: boolean; stats: GrepStats; walkSource: AsyncIterable; maxResults: number; @@ -512,6 +860,7 @@ function setupGrepPipeline(opts: GrepOptions): GrepPipelineSetup { regex, multiline, literal, + recordMtimes: opts.recordMtimes ?? false, stats, walkSource, maxResults: opts.maxResults ?? Number.POSITIVE_INFINITY, @@ -549,6 +898,7 @@ export async function* grepFiles(opts: GrepOptions): AsyncGenerator { maxMatchesPerFile: setup.maxMatchesPerFile, pathPrefix: setup.pathPrefix, literal: setup.literal, + recordMtimes: setup.recordMtimes, }, walkSource: setup.walkSource, stats: setup.stats, @@ -594,6 +944,7 @@ export async function collectGrep(opts: GrepOptions): Promise { maxMatchesPerFile: setup.maxMatchesPerFile, pathPrefix: setup.pathPrefix, literal: setup.literal, + recordMtimes: setup.recordMtimes, }, walkSource: setup.walkSource, stats: setup.stats, diff --git a/src/lib/scan/types.ts b/src/lib/scan/types.ts index 1991c3ecc..8b4959aab 100644 --- a/src/lib/scan/types.ts +++ b/src/lib/scan/types.ts @@ -214,6 +214,14 @@ export type GrepMatch = { * consumers that care should trim their own way. */ line: string; + /** + * Floored `stat.mtimeMs` of the source file. Populated only when + * `GrepOptions.recordMtimes` is true; otherwise omitted (not + * zero — the absence distinguishes "not asked" from "asked and + * happens to be 0"). Consumers use this for cache invalidation; + * see `src/lib/dsn/code-scanner.ts`. + */ + mtime?: number; }; /** @@ -260,6 +268,20 @@ export type GrepOptions = { maxFileSize?: number; descentHook?: WalkOptions["descentHook"]; followSymlinks?: boolean; + /** + * When true, each `GrepMatch` includes the source file's floored + * `mtimeMs`. Used by cache-invalidation layers (e.g., the DSN + * scanner's `sourceMtimes` map). Forwarded to the walker; + * incidentally costs an extra stat per file, so it's opt-in. 
+ */ + recordMtimes?: boolean; + /** + * Optional hook invoked once per directory visited during the + * walk with its floored `mtimeMs`. Used for directory-level cache + * invalidation (e.g., "has a new file been added to this dir?"). + * Forwarded to `walkFiles`. + */ + onDirectoryVisit?: WalkOptions["onDirectoryVisit"]; // --- Grep-specific --- /** diff --git a/src/lib/scan/worker-pool.ts b/src/lib/scan/worker-pool.ts new file mode 100644 index 000000000..302416d0b --- /dev/null +++ b/src/lib/scan/worker-pool.ts @@ -0,0 +1,466 @@ +/** + * Worker pool for parallel file-grep work. + * + * Lazy-initialized singleton: the first call to `getWorkerPool()` + * spawns N workers via a Blob URL (for compatibility with + * `bun build --compile`'s single-file binary — see + * `grep-worker.js` for the worker body + rationale). Subsequent + * calls reuse the pool. + * + * ### Lifetime + * + * Workers are kept ref'd — they hold the event loop open. The CLI + * relies on `process.exit()` at the end of command execution to + * tear them down; no explicit shutdown is required from callers. + * + * An earlier iteration `.unref()`'d each worker for "clean" exit, + * but that caused a deadlock: when the main thread's only pending + * work was a promise awaiting a worker result, unref'd workers + * didn't process messages on idle ticks and the pipeline hung with + * in-flight batches never settling. + * + * `terminatePool()` is provided for tests that need to reset the + * singleton between cases; it's not part of the CLI shutdown path. + * + * ### Feature-gate + * + * `isWorkerSupported()` returns true only when `Worker`, `Blob`, + * and `URL.createObjectURL` are available. The Bun runtime and + * Bun-compiled binaries always pass this check; the Node library + * bundle (`dist/index.cjs`) may or may not depending on the + * consumer's Node version. We avoid depending on `bun:*` or + * `Bun.*` APIs so the same code works in both runtimes. 
+ * + * When unsupported, callers fall back to the async + * `mapFilesConcurrent` path. + */ + +import { availableParallelism } from "node:os"; +// Raw-text import: Bun reads the file's contents at bundle time and +// exposes them as the module's default export (a string). The file +// is a real `.js` file (`grep-worker.js`) so it's lintable, +// syntax-checked, and easy to edit. At runtime we feed the string +// to a `Blob` + `URL.createObjectURL` to spawn workers. See +// `grep-worker.js` for full rationale. +// +// TypeScript's type system doesn't model Bun's `with { type: "text" }` +// attribute (the default export gets typed as the module's shape), +// so we cast through unknown. Guarded at startup by the runtime +// Blob constructor which would throw if given a non-string. +import grepWorkerSource from "./grep-worker.js" with { type: "text" }; + +const GREP_WORKER_SOURCE = grepWorkerSource as unknown as string; + +import type { GrepMatch } from "./types.js"; + +/** + * Batch dispatched to a worker. `paths` are absolute filesystem + * paths the worker will `readFileSync`. `pathsBase` is the + * relative-path prefix (usually the walker's `cwd + "/"`) that the + * caller uses to reconstruct `GrepMatch.path` from + * `pathsBase + absolutePath.slice(pathsBase.length)`. + */ +export type WorkerGrepRequest = { + paths: string[]; + patternSource: string; + flags: string; + maxLineLength: number; + maxMatchesPerFile: number; + literal: string | null; +}; + +/** + * Packed result from a single worker batch. Rehydrate via + * `decodeWorkerMatches`. + */ +export type WorkerGrepResult = { + /** Packed 4-u32-per-match (pathIdx, lineNum, lineOffset, lineLength). */ + ints: Uint32Array; + /** Concatenated line text, indexed by `ints[i*4 + 2]` and `+3`. */ + linePool: string; +}; + +/** + * Per-worker state. 
Dispatch queues requests FIFO per worker — we + * can't use `addEventListener` per request because multiple + * concurrent dispatches to the same worker would make their handlers + * all fire on the first `result` message (they each match the + * shape), resulting in the wrong `resolve()` getting called with + * the wrong batch's data. + * + * Instead: one `onmessage` per worker, `pending` queue of resolvers, + * shift the head on each `result` message. + */ +type PooledWorker = { + worker: Worker; + /** Promise that resolves once the worker signals `"ready"`. */ + ready: Promise<void>; + /** Number of batches currently dispatched to this worker. */ + inflight: number; + /** + * Queue of pending result resolvers. Populated by `dispatch`, + * drained in FIFO order by `worker.onmessage` as `result` messages + * arrive. Workers process messages sequentially so FIFO matching + * is sound. + */ + pending: Array<{ + resolve: (r: WorkerGrepResult) => void; + reject: (e: unknown) => void; + }>; + /** + * False once the worker has emitted an `error` event. Dispatchers + * skip dead workers — without this, the least-loaded selection + * would favor a dead worker (its `inflight` gets reset to 0 in + * the error handler), new dispatches would post to a worker that + * can't respond, and the calling `grepViaWorkers` would deadlock + * waiting for results that never arrive. + */ + alive: boolean; +}; + +type WorkerPool = { + workers: PooledWorker[]; + /** + * Dispatch `request` to the least-loaded worker, returning a + * promise that resolves when the worker posts its `"result"`. + */ + dispatch(request: WorkerGrepRequest): Promise<WorkerGrepResult>; + /** + * Terminate all workers in the pool. Used by tests and on process + * teardown. Safe to call multiple times. + */ + terminate(): void; +}; + +/** + * Module-level pool singleton. Lazily initialized on first + * `getWorkerPool()` call. Cleared by `terminatePool()` (used by + * tests). 
+ */ +let pool: WorkerPool | null = null; + +/** + * True when the runtime supports Web-Workers-style `new Worker(url)` + * with a Blob-URL source. Covers: + * - Bun (dev mode and single-file compiled binary) + * - Node 22+ when the library bundle is consumed (Node exposes + * `Worker` via the DOM shim in newer versions; older Node lacks + * it, so we fall back). + */ +export function isWorkerSupported(): boolean { + return ( + typeof Worker !== "undefined" && + typeof Blob !== "undefined" && + typeof URL !== "undefined" && + typeof URL.createObjectURL === "function" + ); +} + +/** + * Default pool size: clamped to [2, 8] based on `availableParallelism()`. + * Bench (synthetic/large, 10k files, `import.*from`): 4 workers hits + * the knee (183ms p50 vs 171ms for 8 workers; 316ms for 2). Most + * CLI hosts are 4–16 core, so clamping at 8 prevents over-spawn on + * high-core boxes where per-worker stat/read contention dominates. + */ +export function getPoolSize(): number { + return Math.max(2, Math.min(8, availableParallelism())); +} + +/** + * Call `worker.ref()` if the runtime exposes it. Both Bun's native + * `Worker` and Node's `worker_threads.Worker` have `.ref()` / `.unref()`; + * DOM Worker shims may not. Guarded. + */ +function refWorker(w: Worker): void { + if (typeof (w as unknown as { ref?: () => void }).ref === "function") { + (w as unknown as { ref: () => void }).ref(); + } +} + +/** Counterpart to {@link refWorker}. */ +function unrefWorker(w: Worker): void { + if (typeof (w as unknown as { unref?: () => void }).unref === "function") { + (w as unknown as { unref: () => void }).unref(); + } +} + +/** + * Build the worker's Blob-URL source once per pool. Cached so + * repeated pool recreation (in tests) doesn't leak URLs. 
+ */ +let cachedBlobUrl: string | null = null; +function getWorkerBlobUrl(): string { + if (cachedBlobUrl !== null) { + return cachedBlobUrl; + } + const blob = new Blob([GREP_WORKER_SOURCE], { + type: "application/javascript", + }); + cachedBlobUrl = URL.createObjectURL(blob); + return cachedBlobUrl; +} + +/** + * Get or lazily create the worker pool. Throws if + * `isWorkerSupported()` is false — callers should feature-gate + * on that first. + */ +export function getWorkerPool(): WorkerPool { + if (pool !== null) { + return pool; + } + if (!isWorkerSupported()) { + throw new Error( + "Worker pool requested but Workers are unavailable in this runtime" + ); + } + + const size = getPoolSize(); + const url = getWorkerBlobUrl(); + const workers: PooledWorker[] = []; + + for (let i = 0; i < size; i += 1) { + const w = new Worker(url); + // Workers are unref'd at spawn so an idle pool doesn't hold the + // event loop open. On each `dispatch()` we `ref()` the worker + // before posting work; when inflight drops to zero (i.e., the + // worker goes idle), we `unref()` it again. This gives us: + // + // - Clean CLI exit: once all pending dispatches settle, + // workers are idle + unref'd, the loop drains, and the + // process exits naturally — no `beforeExit` / explicit + // `terminatePool()` required on the CLI happy path. + // - No mid-pipeline deadlock: ref'd workers during active + // work means their `message` events fire on the main + // thread's next tick, not starved by idle-tick bypass. + // + // IMPORTANT: `Worker.ref()` / `.unref()` are idempotent booleans, + // NOT reference-counted. Calling `unref()` on a worker that has + // multiple dispatches in flight unrefs it entirely (subsequent + // `ref()` calls are no-ops until the worker becomes idle again). + // Hence the inflight-zero guard in every unref site — we only + // unref when the LAST dispatch for this worker completes. 
+ unrefWorker(w); + const pw: PooledWorker = { + worker: w, + ready: new Promise((resolve) => { + // Single readiness listener; removed when the ready signal + // arrives. Subsequent messages are handled by `onmessage` + // set below (which takes over after ready). + const readyHandler = (event: MessageEvent) => { + if (event.data?.type === "ready") { + w.removeEventListener("message", readyHandler); + resolve(); + } + }; + w.addEventListener("message", readyHandler); + }), + inflight: 0, + pending: [], + alive: true, + }; + // Single onmessage handler per worker. Matches `result` messages + // to the oldest pending dispatch via FIFO shift. Messages from + // the worker arrive in the same order as `postMessage` calls, + // and the worker processes requests sequentially (single-thread + // inside), so FIFO matching is sound. + w.addEventListener("message", (event) => { + const data = event.data as { type?: string } & WorkerGrepResult; + if (data.type !== "result") { + return; + } + const next = pw.pending.shift(); + if (!next) { + return; + } + pw.inflight -= 1; + // `ref()` / `unref()` are idempotent booleans, NOT reference- + // counted. Only unref when the worker's own inflight drops to + // zero — unrefing while other dispatches are still in flight + // would let the event loop exit and drop their results. + if (pw.inflight === 0) { + unrefWorker(pw.worker); + } + next.resolve({ ints: data.ints, linePool: data.linePool }); + }); + w.addEventListener("error", (err) => { + // Mark the worker dead before rejecting pending dispatches — + // `dispatch` consults `alive` under the least-loaded picker to + // avoid routing new work to a worker that can't respond. + pw.alive = false; + const errMsg = err.message ?? String(err); + // Drain the pending queue: reject each dispatch. Then unref + // the worker ONCE at the end (ref/unref are idempotent; one + // unref is sufficient to release the event-loop hold even if + // there were multiple pending dispatches). 
+ let slot = pw.pending.shift(); + while (slot !== undefined) { + slot.reject(new Error(`worker error: ${errMsg}`)); + slot = pw.pending.shift(); + } + pw.inflight = 0; + unrefWorker(pw.worker); + }); + workers.push(pw); + } + + pool = { + workers, + dispatch(request: WorkerGrepRequest): Promise<WorkerGrepResult> { + // Pick least-loaded LIVE worker. Dead workers (those that + // emitted an `error` event) are skipped entirely — dispatching + // to them would hang because they can't respond. Their + // `inflight` gets reset to 0 in the error handler, which + // would otherwise make them the "least loaded" and silently + // capture all subsequent dispatches. + let best: PooledWorker | null = null; + for (const pw of workers) { + if (!pw.alive) { + continue; + } + if (best === null || pw.inflight < best.inflight) { + best = pw; + } + } + if (best === null) { + return Promise.reject(new Error("worker pool: all workers dead")); + } + const chosen = best; + chosen.inflight += 1; + // Ref the worker while this dispatch is in flight. Matched by + // the `unref()` call in the `message` result handler (or the + // mass-unref in the `error` handler / `terminate()` path). + refWorker(chosen.worker); + + // Enqueue a pending slot for this request. The worker's + // `onmessage` handler will resolve it when the corresponding + // `result` message arrives (FIFO). + // + // We hold a direct reference to `ourSlot` so the readiness- + // failure path can remove and reject THIS dispatch's entry + // specifically — not whatever happens to be at the front or + // back of the pending queue when the failure fires. With + // concurrent dispatches to the same worker, `pop()` or + // `shift()` could reject a sibling dispatch's promise with + // this dispatch's error. + let ourSlot: PooledWorker["pending"][number]; + const result = new Promise<WorkerGrepResult>((resolve, reject) => { + ourSlot = { resolve, reject }; + chosen.pending.push(ourSlot); + }); + // Wait for readiness (first dispatch only), then post the + // request. 
Subsequent dispatches skip the await (the ready + // promise is already settled). + chosen.ready.then( + () => { + chosen.worker.postMessage(request); + }, + (err) => { + // Readiness failed — fail THIS dispatch's resolver and + // remove ITS slot from the pending queue. Identity-based + // removal preserves the FIFO invariant for any sibling + // dispatches still in flight. Only unref if no other + // dispatches remain in flight (same reasoning as the + // `message` handler: `unref()` is idempotent, and + // unrefing while others are in flight would let the + // loop exit prematurely). + const idx = chosen.pending.indexOf(ourSlot); + if (idx !== -1) { + chosen.pending.splice(idx, 1); + chosen.inflight -= 1; + if (chosen.inflight === 0) { + unrefWorker(chosen.worker); + } + ourSlot.reject(err); + } + } + ); + return result; + }, + terminate(): void { + for (const pw of workers) { + pw.alive = false; + // Drain pending: reject every dispatch. `ref`/`unref` are + // idempotent booleans — one unref at the end is sufficient + // to release the event-loop hold even if the worker had + // multiple pending dispatches. + let slot = pw.pending.shift(); + while (slot !== undefined) { + slot.reject(new Error("worker pool terminated")); + slot = pw.pending.shift(); + } + pw.inflight = 0; + unrefWorker(pw.worker); + try { + pw.worker.terminate(); + } catch { + // Ignore — terminate is experimental in Bun and may throw + // on already-terminated workers. + } + } + }, + }; + + return pool; +} + +/** + * Tear down the singleton pool. Primarily for tests — the + * singleton is otherwise kept alive for the process lifetime. + */ +export function terminatePool(): void { + if (pool !== null) { + pool.terminate(); + pool = null; + } + if (cachedBlobUrl !== null) { + // URL.revokeObjectURL is safe to call on Node + Bun. 
+ URL.revokeObjectURL(cachedBlobUrl); + cachedBlobUrl = null; + } +} + +/** + * Decode a worker's packed `{ints, linePool}` into an array of + * `GrepMatch`es, using the caller's `paths` and `relPaths` to + * reconstruct path fields. + * + * Optional `mtimes` is a parallel per-path array: when provided, + * each emitted `GrepMatch` gets an `mtime` field indexed by the + * match's `pathIdx`. Populated by callers that set + * `recordMtimes: true` — the mtime is known on the main thread + * from the walker, not from the worker. + */ +export function decodeWorkerMatches( + result: WorkerGrepResult, + paths: readonly string[], + relPaths: readonly string[], + mtimes: readonly number[] | null = null +): GrepMatch[] { + const { ints, linePool } = result; + const matches: GrepMatch[] = []; + // 4 u32s per match (pathIdx, lineNum, lineOffset, lineLength). + const count = Math.floor(ints.length / 4); + for (let i = 0; i < count; i += 1) { + const base = i * 4; + const pathIdx = ints[base] ?? 0; + const lineNum = ints[base + 1] ?? 0; + const lineOffset = ints[base + 2] ?? 0; + const lineLength = ints[base + 3] ?? 0; + const absolutePath = paths[pathIdx] ?? ""; + const relativePath = relPaths[pathIdx] ?? absolutePath; + const line = linePool.slice(lineOffset, lineOffset + lineLength); + const match: GrepMatch = { + path: relativePath, + absolutePath, + lineNum, + line, + }; + if (mtimes !== null) { + match.mtime = mtimes[pathIdx]; + } + matches.push(match); + } + return matches; +} diff --git a/test/lib/dsn/code-scanner.test.ts b/test/lib/dsn/code-scanner.test.ts index 6767bb672..f74e48dba 100644 --- a/test/lib/dsn/code-scanner.test.ts +++ b/test/lib/dsn/code-scanner.test.ts @@ -496,6 +496,28 @@ describe("Code Scanner", () => { expect(result.dsns).toEqual([]); }); + test("finds DSNs on long single-line files (>2000 chars, common in minified JS)", async () => { + // Regression: `grepFiles` defaults to `maxLineLength: 2000` which + // truncates `match.line` with a `…` suffix. 
The DSN scanner then + // re-runs `DSN_PATTERN` on that truncated line, so a DSN near the + // end of a long line would silently disappear (the pattern ends + // with `/\d+` — trailing digits get replaced by `…`). + // + // The scanner explicitly passes `maxLineLength: Infinity` to + // disable truncation, because memory is bounded by the walker's + // 256KB file cap. + const filler = "x".repeat(1990); + const dsn = "https://abc@o111222.ingest.us.sentry.io/7654321"; + const line = `${filler} const DSN = "${dsn}";`; + // Sanity: DSN starts past column 2000. + expect(line.indexOf(dsn)).toBeGreaterThan(2000); + writeFileSync(join(testDir, "minified.js"), line); + + const result = await scanCodeForDsns(testDir); + expect(result.dsns).toHaveLength(1); + expect(result.dsns[0]?.raw).toBe(dsn); + }); + test("finds DSNs in monorepo packages deeper than MAX_SCAN_DEPTH", async () => { // packages/spotlight/src/instrument.ts is depth 3 from root, // but with monorepo depth reset, packages/spotlight/ resets to 0