diff --git a/scripts/persistent_registry.py b/scripts/persistent_registry.py index d935731..274f761 100644 --- a/scripts/persistent_registry.py +++ b/scripts/persistent_registry.py @@ -745,6 +745,17 @@ def store_scan_result(self, result: Dict[str, Any]) -> None: Idempotent: re-scanning the same file set with the same result subset does not insert a duplicate row. + Closes the P1 idempotency bug from issue #82: the previous + implementation included ``scan_timestamp = time.time()`` in the + ``result_hash`` computation, so the hash changed on every call and + the idempotency check below never matched — every re-scan inserted + a fresh row, causing unbounded ``analysis_cache`` growth. The fix + splits the subset into a stable part (no timestamp) used for + hashing and a full part (with timestamp) used for storage. On a + cache hit, the existing row's ``timestamp`` and ``result_json`` + are updated in place so trend tracking still reflects the latest + scan without inserting duplicates. + Args: result: The full scan result dict returned by ``commands.scan.cmd_scan``. Expected to contain the @@ -768,7 +779,12 @@ def store_scan_result(self, result: Dict[str, Any]) -> None: + int(backend.get("nodes", 0) or 0) ) - subset = { + # Stable subset (no scan_timestamp) used for result_hash — issue #82. + # Excluding scan_timestamp from the hash makes re-scans of the same + # file set with the same counts produce an identical result_hash, + # so the idempotency check below finds the existing row instead of + # inserting a duplicate on every call (which caused cache bloat). + stable_subset = { "files_scanned": files_scanned, "frontend_counts": { "classes": int(frontend.get("classes", 0) or 0), @@ -779,10 +795,18 @@ def store_scan_result(self, result: Dict[str, Any]) -> None: "edges": int(backend.get("edges", 0) or 0), }, "frameworks": frameworks, - "scan_timestamp": time.time(), "total_symbols": total_symbols, } + # Capture once per call and reuse for both the JSON scan_timestamp + # field and the analysis_cache.timestamp column — keeps the two + # consistent without paying for a second time.time() call. + now = time.time() + # Full subset (with scan_timestamp) stored in result_json for trend + # tracking. scan_timestamp is preserved in storage but excluded + # from the hash above so re-scans stay idempotent. + subset = {**stable_subset, "scan_timestamp": now} + file_set_hash = self._compute_scan_file_set_hash(workspace) if not file_set_hash: logger.warning( @@ -791,15 +815,16 @@ def store_scan_result(self, result: Dict[str, Any]) -> None: ) return + stable_json = json.dumps(stable_subset, ensure_ascii=False, default=str, sort_keys=True) + result_hash = hashlib.sha256(stable_json.encode("utf-8")).hexdigest() result_json = json.dumps(subset, ensure_ascii=False, default=str, sort_keys=True) - result_hash = hashlib.sha256(result_json.encode("utf-8")).hexdigest() - now = time.time() conn = self._connect() - # Idempotent: if the same (command, file_set_hash, result_hash) - # triple already exists, do nothing. This mirrors the "INSERT OR - # IGNORE" intent — re-scanning the same file set with the same - # outcome must not bloat the cache. + # Idempotent upsert: if the same (command, file_set_hash, result_hash) + # triple already exists, update its timestamp + result_json in place + # so trend tracking reflects the latest scan, instead of inserting a + # duplicate row. Mirrors the "INSERT OR IGNORE" intent — re-scanning + # the same file set with the same outcome must not bloat the cache. existing = conn.execute( """SELECT 1 FROM analysis_cache WHERE command = ? AND file_set_hash = ? AND result_hash = ? @@ -808,11 +833,24 @@ def store_scan_result(self, result: Dict[str, Any]) -> None: ("scan", file_set_hash, result_hash), ).fetchone() if existing: - logger.debug( - "store_scan_result: cache hit (command=scan, " - "file_set_hash=%s, result_hash=%s) — skipping insert", - file_set_hash[:12], result_hash[:12], - ) + try: + conn.execute( + """UPDATE analysis_cache + SET result_json = ?, timestamp = ? + WHERE command = ? AND file_set_hash = ? AND result_hash = ? + """, + (result_json, now, "scan", file_set_hash, result_hash), + ) + conn.commit() + logger.debug( + "store_scan_result: cache hit (command=scan, " + "file_set_hash=%s, result_hash=%s) — updated timestamp", + file_set_hash[:12], result_hash[:12], + ) + except sqlite3.Error as e: + logger.warning( + "store_scan_result: failed to update analysis_cache on hit: %s", e + ) return try: diff --git a/tests/test_persistent_registry.py b/tests/test_persistent_registry.py index cf1412a..7d14ee3 100644 --- a/tests/test_persistent_registry.py +++ b/tests/test_persistent_registry.py @@ -476,13 +476,21 @@ def test_migrate_already_exists(self, workspace): class TestStoreScanResult: - """Test PersistentRegistry.store_scan_result — closes issue #31. + """Test PersistentRegistry.store_scan_result — closes issues #31 and #82. - The P0 bug: codelens.py calls pr.store_scan_result(result) after every - successful scan, but the method did not exist. The resulting + The P0 bug (#31): codelens.py calls pr.store_scan_result(result) after + every successful scan, but the method did not exist. The resulting AttributeError was swallowed by a bare except Exception, so the analysis_cache table was never populated and the sqlite_persisted flag was never set. These tests verify the fix. + + The P1 bug (#82): the previous implementation included + ``scan_timestamp = time.time()`` in the ``result_hash`` computation, + so the hash changed on every call and the idempotency check never + matched — every re-scan inserted a fresh row (cache bloat). The fix + excludes ``scan_timestamp`` from the hash while still storing it in + ``result_json`` for trend tracking, and updates the existing row's + timestamp + result_json in place on cache hit. """ def _make_scan_result(self, workspace, **overrides): @@ -545,7 +553,17 @@ def test_store_scan_result_persists_to_analysis_cache(self, populated_workspace) assert stored["total_symbols"] == 8 def test_store_scan_result_is_idempotent(self, populated_workspace): - """Calling store_scan_result twice with the same result inserts only one row.""" + """Calling store_scan_result twice with the same result inserts only one row. + + Issue #82: idempotency must hold with REAL time (no patching). + Previously this test only passed by patching ``time.time()`` to a + fixed value — meaningless in production where every scan has a + different timestamp. With ``scan_timestamp`` excluded from + ``result_hash``, two calls with real time still produce the same + hash (because the stable subset is identical) → the second call + hits the existing row and updates it in place instead of + inserting a duplicate. + """ from persistent_registry import PersistentRegistry import sqlite3 @@ -553,19 +571,11 @@ def test_store_scan_result_is_idempotent(self, populated_workspace): reg._connect() result = self._make_scan_result(populated_workspace) - # Note: scan_timestamp is computed inside store_scan_result via - # time.time(), so the second call will produce a DIFFERENT subset - # unless we freeze the timestamp. To test idempotency, we patch - # time.time inside the module to return a fixed value. - import persistent_registry as pr_module - - original_time = pr_module.time.time - pr_module.time.time = lambda: 1700000000.0 - try: - reg.store_scan_result(result) - reg.store_scan_result(result) - finally: - pr_module.time.time = original_time + # Real time — no patching. Two consecutive calls will have + # different scan_timestamp values, but the stable subset used for + # result_hash is identical, so the idempotency check must fire. + reg.store_scan_result(result) + reg.store_scan_result(result) db_path = reg.db_path reg.close() @@ -580,6 +590,59 @@ def test_store_scan_result_is_idempotent(self, populated_workspace): f"expected 1 row after duplicate store_scan_result, got {count}" ) + def test_store_scan_result_updates_timestamp_on_rescan(self, populated_workspace): + """Re-scanning the same file set updates the existing row's timestamp. + + Issue #82 acceptance criterion: idempotency means no duplicate row, + but the row's ``timestamp`` column must reflect the LATEST scan + (not the first) so trend tracking stays current. We query the + ``timestamp`` column after the first call, then again after the + second call, and assert the value changed. + """ + from persistent_registry import PersistentRegistry + import sqlite3 + + reg = PersistentRegistry(populated_workspace) + reg._connect() + + result = self._make_scan_result(populated_workspace) + reg.store_scan_result(result) + + db_path = reg.db_path + reg.close() + + conn = sqlite3.connect(db_path) + ts_after_first = conn.execute( + "SELECT timestamp FROM analysis_cache WHERE command = 'scan'" + ).fetchone()[0] + conn.close() + + # Sleep briefly so the second call's time.time() is strictly + # greater than the first's (deterministic on any platform). + time.sleep(0.05) + + reg2 = PersistentRegistry(populated_workspace) + reg2._connect() + reg2.store_scan_result(result) + reg2.close() + + conn = sqlite3.connect(db_path) + row_count = conn.execute( + "SELECT COUNT(*) FROM analysis_cache WHERE command = 'scan'" + ).fetchone()[0] + ts_after_second = conn.execute( + "SELECT timestamp FROM analysis_cache WHERE command = 'scan'" + ).fetchone()[0] + conn.close() + + assert row_count == 1, ( + f"expected 1 row after re-scan (idempotent), got {row_count}" + ) + assert ts_after_second > ts_after_first, ( + f"timestamp must be updated on re-scan: " + f"first={ts_after_first!r}, second={ts_after_second!r}" + ) + def test_store_scan_result_different_file_sets(self, workspace): """Two different file sets produce two rows with different file_set_hash.""" from persistent_registry import PersistentRegistry