Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 51 additions & 13 deletions scripts/persistent_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -728,7 +728,7 @@
joined = "\n".join(rel_paths)
return hashlib.sha256(joined.encode("utf-8")).hexdigest()

def store_scan_result(self, result: Dict[str, Any]) -> None:

Check failure on line 731 in scripts/persistent_registry.py

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Refactor this function to reduce its Cognitive Complexity from 17 to the 15 allowed.

See more on https://sonarcloud.io/project/issues?id=Wolfvin_CodeLens&issues=AZ8Qky_Nqjbo_uOAxY3G&open=AZ8Qky_Nqjbo_uOAxY3G&pullRequest=84
"""Persist a normalized subset of a scan result to ``analysis_cache``.

Closes the P0 silent bug from issue #31: ``codelens.py`` calls
Expand All @@ -745,6 +745,17 @@
Idempotent: re-scanning the same file set with the same result
subset does not insert a duplicate row.

Closes the P1 idempotency bug from issue #82: the previous
implementation included ``scan_timestamp = time.time()`` in the
``result_hash`` computation, so the hash changed on every call and
the idempotency check below never matched — every re-scan inserted
a fresh row, causing unbounded ``analysis_cache`` growth. The fix
splits the subset into a stable part (no timestamp) used for
hashing and a full part (with timestamp) used for storage. On a
cache hit, the existing row's ``timestamp`` and ``result_json``
are updated in place so trend tracking still reflects the latest
scan without inserting duplicates.

Args:
result: The full scan result dict returned by
``commands.scan.cmd_scan``. Expected to contain the
Expand All @@ -768,7 +779,12 @@
+ int(backend.get("nodes", 0) or 0)
)

subset = {
# Stable subset (no scan_timestamp) used for result_hash — issue #82.
# Excluding scan_timestamp from the hash makes re-scans of the same
# file set with the same counts produce an identical result_hash,
# so the idempotency check below finds the existing row instead of
# inserting a duplicate on every call (which caused cache bloat).
stable_subset = {
"files_scanned": files_scanned,
"frontend_counts": {
"classes": int(frontend.get("classes", 0) or 0),
Expand All @@ -779,10 +795,18 @@
"edges": int(backend.get("edges", 0) or 0),
},
"frameworks": frameworks,
"scan_timestamp": time.time(),
"total_symbols": total_symbols,
}

# Capture once per call and reuse for both the JSON scan_timestamp
# field and the analysis_cache.timestamp column — keeps the two
# consistent without paying for a second time.time() call.
now = time.time()
# Full subset (with scan_timestamp) stored in result_json for trend
# tracking. scan_timestamp is preserved in storage but excluded
# from the hash above so re-scans stay idempotent.
subset = {**stable_subset, "scan_timestamp": now}

file_set_hash = self._compute_scan_file_set_hash(workspace)
if not file_set_hash:
logger.warning(
Expand All @@ -791,15 +815,16 @@
)
return

stable_json = json.dumps(stable_subset, ensure_ascii=False, default=str, sort_keys=True)
result_hash = hashlib.sha256(stable_json.encode("utf-8")).hexdigest()
result_json = json.dumps(subset, ensure_ascii=False, default=str, sort_keys=True)
result_hash = hashlib.sha256(result_json.encode("utf-8")).hexdigest()
now = time.time()

conn = self._connect()
# Idempotent: if the same (command, file_set_hash, result_hash)
# triple already exists, do nothing. This mirrors the "INSERT OR
# IGNORE" intent — re-scanning the same file set with the same
# outcome must not bloat the cache.
# Idempotent upsert: if the same (command, file_set_hash, result_hash)
# triple already exists, update its timestamp + result_json in place
# so trend tracking reflects the latest scan, instead of inserting a
# duplicate row. Mirrors the "INSERT OR IGNORE" intent — re-scanning
# the same file set with the same outcome must not bloat the cache.
existing = conn.execute(
"""SELECT 1 FROM analysis_cache
WHERE command = ? AND file_set_hash = ? AND result_hash = ?
Expand All @@ -808,11 +833,24 @@
("scan", file_set_hash, result_hash),
).fetchone()
if existing:
logger.debug(
"store_scan_result: cache hit (command=scan, "
"file_set_hash=%s, result_hash=%s) — skipping insert",
file_set_hash[:12], result_hash[:12],
)
try:
conn.execute(
"""UPDATE analysis_cache
SET result_json = ?, timestamp = ?
WHERE command = ? AND file_set_hash = ? AND result_hash = ?
""",
(result_json, now, "scan", file_set_hash, result_hash),
)
conn.commit()
logger.debug(
"store_scan_result: cache hit (command=scan, "
"file_set_hash=%s, result_hash=%s) — updated timestamp",
file_set_hash[:12], result_hash[:12],
)
except sqlite3.Error as e:
logger.warning(
"store_scan_result: failed to update analysis_cache on hit: %s", e
)
return

try:
Expand Down
97 changes: 80 additions & 17 deletions tests/test_persistent_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -476,13 +476,21 @@ def test_migrate_already_exists(self, workspace):


class TestStoreScanResult:
"""Test PersistentRegistry.store_scan_result — closes issue #31.
"""Test PersistentRegistry.store_scan_result — closes issues #31 and #82.

The P0 bug: codelens.py calls pr.store_scan_result(result) after every
successful scan, but the method did not exist. The resulting
The P0 bug (#31): codelens.py calls pr.store_scan_result(result) after
every successful scan, but the method did not exist. The resulting
AttributeError was swallowed by a bare except Exception, so the
analysis_cache table was never populated and the sqlite_persisted
flag was never set. These tests verify the fix.

The P1 bug (#82): the previous implementation included
``scan_timestamp = time.time()`` in the ``result_hash`` computation,
so the hash changed on every call and the idempotency check never
matched — every re-scan inserted a fresh row (cache bloat). The fix
excludes ``scan_timestamp`` from the hash while still storing it in
``result_json`` for trend tracking, and updates the existing row's
timestamp + result_json in place on cache hit.
"""

def _make_scan_result(self, workspace, **overrides):
Expand Down Expand Up @@ -545,27 +553,29 @@ def test_store_scan_result_persists_to_analysis_cache(self, populated_workspace)
assert stored["total_symbols"] == 8

def test_store_scan_result_is_idempotent(self, populated_workspace):
"""Calling store_scan_result twice with the same result inserts only one row."""
"""Calling store_scan_result twice with the same result inserts only one row.

Issue #82: idempotency must hold with REAL time (no patching).
Previously this test only passed by patching ``time.time()`` to a
fixed value — meaningless in production where every scan has a
different timestamp. With ``scan_timestamp`` excluded from
``result_hash``, two calls with real time still produce the same
hash (because the stable subset is identical) → the second call
hits the existing row and updates it in place instead of
inserting a duplicate.
"""
from persistent_registry import PersistentRegistry
import sqlite3

reg = PersistentRegistry(populated_workspace)
reg._connect()

result = self._make_scan_result(populated_workspace)
# Note: scan_timestamp is computed inside store_scan_result via
# time.time(), so the second call will produce a DIFFERENT subset
# unless we freeze the timestamp. To test idempotency, we patch
# time.time inside the module to return a fixed value.
import persistent_registry as pr_module

original_time = pr_module.time.time
pr_module.time.time = lambda: 1700000000.0
try:
reg.store_scan_result(result)
reg.store_scan_result(result)
finally:
pr_module.time.time = original_time
# Real time — no patching. Two consecutive calls will have
# different scan_timestamp values, but the stable subset used for
# result_hash is identical, so the idempotency check must fire.
reg.store_scan_result(result)
reg.store_scan_result(result)

db_path = reg.db_path
reg.close()
Expand All @@ -580,6 +590,59 @@ def test_store_scan_result_is_idempotent(self, populated_workspace):
f"expected 1 row after duplicate store_scan_result, got {count}"
)

def test_store_scan_result_updates_timestamp_on_rescan(self, populated_workspace):
"""Re-scanning the same file set updates the existing row's timestamp.

Issue #82 acceptance criterion: idempotency means no duplicate row,
but the row's ``timestamp`` column must reflect the LATEST scan
(not the first) so trend tracking stays current. We query the
``timestamp`` column after the first call, then again after the
second call, and assert the value changed.
"""
from persistent_registry import PersistentRegistry
import sqlite3

reg = PersistentRegistry(populated_workspace)
reg._connect()

result = self._make_scan_result(populated_workspace)
reg.store_scan_result(result)

db_path = reg.db_path
reg.close()

conn = sqlite3.connect(db_path)
ts_after_first = conn.execute(
"SELECT timestamp FROM analysis_cache WHERE command = 'scan'"
).fetchone()[0]
conn.close()

# Sleep briefly so the second call's time.time() is strictly
# greater than the first's (deterministic on any platform).
time.sleep(0.05)

reg2 = PersistentRegistry(populated_workspace)
reg2._connect()
reg2.store_scan_result(result)
reg2.close()

conn = sqlite3.connect(db_path)
row_count = conn.execute(
"SELECT COUNT(*) FROM analysis_cache WHERE command = 'scan'"
).fetchone()[0]
ts_after_second = conn.execute(
"SELECT timestamp FROM analysis_cache WHERE command = 'scan'"
).fetchone()[0]
conn.close()

assert row_count == 1, (
f"expected 1 row after re-scan (idempotent), got {row_count}"
)
assert ts_after_second > ts_after_first, (
f"timestamp must be updated on re-scan: "
f"first={ts_after_first!r}, second={ts_after_second!r}"
)

def test_store_scan_result_different_file_sets(self, workspace):
"""Two different file sets produce two rows with different file_set_hash."""
from persistent_registry import PersistentRegistry
Expand Down
Loading