From f5e928d7e4b67b71481945eaaf92a13db80c2f8f Mon Sep 17 00:00:00 2001 From: Wolfvin Date: Mon, 29 Jun 2026 15:29:38 +0000 Subject: [PATCH] fix(cli): --top N universally truncates all list-valued keys (#36) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit _apply_top_n only truncated keys listed in the hardcoded _LIST_KEYS allowlist (28 names). New commands returning lists under non-standard keys (e.g., 'entities', 'widgets') silently ignored --top N, violating the 'Limit list results to top N items' contract from SKILL-QUICK.md and risking token overflow for MCP clients. Fix: Option A (auto-discover) — replace the allowlist iteration with a runtime scan of ALL top-level keys. Every list-valued key is truncated, except those in _NO_TOP_KEYS (structural/metadata keys like 'available_commands'). Also scans dict-valued keys for category-keyed lists, replacing the separate _NESTED_LIST_KEYS loop. This makes --top N a universal contract: any command, present or future, that returns a list under any key name is automatically truncated. No registration needed. Changes: - _apply_top_n: replaced _LIST_KEYS + _NESTED_LIST_KEYS loops with single auto-discover pass over all result.keys() - Removed _NESTED_LIST_KEYS (dead code — merged into auto-discover) - Added _NO_TOP_KEYS exempt set for structural/metadata keys - Kept _LIST_KEYS for _apply_lite (different purpose: priority-ordered primary result list lookup, not truncation) - coverage_map special case retained (dict-of-dicts, not dict-of-lists) Tests: 8 new tests in TestTopNUniversalTruncation covering: - Non-standard key truncation (the issue #36 repro) - Multiple non-standard keys truncated simultaneously - _NO_TOP_KEYS exempt keys NOT truncated - Standard keys still truncated (backward compat) - --top 0 unlimited for non-standard keys - Nested dict non-standard key truncation - Underscore-prefixed keys skipped - Exact repro from issue #36 Verified: 94/94 pass in test_cli.py + test_codelens.py. Full suite: 0 new failures (37 pre-existing failures unchanged — all graph/architecture/pollution issues unrelated to --top). Closes #36. --- scripts/codelens.py | 67 ++++++++++++++++------------ tests/test_cli.py | 105 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 144 insertions(+), 28 deletions(-) diff --git a/scripts/codelens.py b/scripts/codelens.py index 83b63a6..ff70739 100755 --- a/scripts/codelens.py +++ b/scripts/codelens.py @@ -307,7 +307,11 @@ def _auto_setup(workspace: str) -> Dict[str, Any]: # ─── Post-Processing: --top N ────────────────────────────────── -# Keys that commonly contain list results across all commands +# Keys that commonly contain list results across all commands. +# Used by _apply_lite to find the "primary result list" for lite mode output. +# NOTE: _apply_top_n no longer relies on this list — it auto-discovers all +# list-valued keys at runtime (issue #36). This list is kept solely for +# _apply_lite's priority-ordered lookup. _LIST_KEYS = [ "functions", "findings", "leaks", "hints", "issues", "matches", "results", "violations", "entrypoints", "routes", "stores", "callers", "callees", @@ -318,12 +322,17 @@ def _auto_setup(workspace: str) -> Dict[str, Any]: "risks", "drift", # refactor-safe, config-drift ] -# Nested containers that may contain category-keyed lists -_NESTED_LIST_KEYS = [ - "results", # dead-code: results{category:[]} - "issues", # missing-refs: issues{type:[]} - "cycles", # circular: cycles{type:[]} -] +# Keys whose values should NEVER be truncated by --top, even if they're lists. +# These are structural/metadata keys (command names, engine names, etc.) that +# are not "result lists" in the --top N contract sense. Adding a key here is +# an explicit opt-out — the default is to truncate all list-valued keys (issue #36). +_NO_TOP_KEYS = frozenset({ + "available_commands", # help/list-commands: metadata, not results + "engines", # analyze: engine list is structural + "supported_languages", # setup/info: metadata + "categories", # category names are metadata + "tags", # tag names are metadata +}) # Commands that return large lists — get smart default --top _LIST_COMMANDS = { @@ -378,15 +387,31 @@ def _item_sort_key(item): def _apply_top_n(result: Dict[str, Any], top_n: int, command: str = "") -> Dict[str, Any]: - """Limit all list/array results to top N items. Sort by relevance first. Adds truncated flags.""" + """Limit all list/array results to top N items. Sort by relevance first. Adds truncated flags. + + Auto-discovers ALL list-valued keys at runtime — no allowlist needed (issue #36). + This ensures --top N is a universal contract: any command, present or future, + that returns a list under any key name will be truncated. Keys in _NO_TOP_KEYS + are exempt (structural/metadata keys like ``available_commands``). + + Also handles category-keyed dicts (e.g., ``by_category{cat: [...]}``) and + nested containers (e.g., ``results{category: [...]}``) by scanning dict + values for lists. + """ if not isinstance(result, dict) or top_n <= 0: return result # Get sort strategy for this command sort_key, sort_desc = _SORT_STRATEGIES.get(command, (None, False)) - for key in _LIST_KEYS: - val = result.get(key) + # Auto-discover: iterate ALL top-level keys (snapshot to allow in-place additions). + # Truncate every list-valued key except those in _NO_TOP_KEYS or starting with '_'. + # Also scan dict-valued keys for category-keyed lists. + for key in list(result.keys()): + if key in _NO_TOP_KEYS or key.startswith("_"): + continue + val = result[key] + if isinstance(val, list) and len(val) > top_n: # Sort by relevance before truncating if sort_key: @@ -397,23 +422,8 @@ def _apply_top_n(result: Dict[str, Any], top_n: int, command: str = "") -> Dict[ result[f"{key}_truncated"] = True result[f"{key}_total"] = len(val) elif isinstance(val, dict): - # Handle category-keyed dicts like by_category{cat: [...]} - for sub_key, sub_val in val.items(): - if isinstance(sub_val, list) and len(sub_val) > top_n: - # Sort items within each category - if sort_key: - sub_val = _sort_items(sub_val, sort_key, sort_desc) - val[sub_key] = sub_val[:top_n] - else: - val[sub_key] = sub_val[:top_n] - if "_truncated_categories" not in result: - result["_truncated_categories"] = {} - result["_truncated_categories"][sub_key] = len(sub_val) - - # Handle nested containers like dead-code's results{category:[]} - for key in _NESTED_LIST_KEYS: - val = result.get(key) - if isinstance(val, dict): + # Handle category-keyed dicts like by_category{cat: [...]} and + # nested containers like dead-code's results{category:[]} for sub_key, sub_val in val.items(): if isinstance(sub_val, list) and len(sub_val) > top_n: if sort_key: @@ -425,7 +435,8 @@ def _apply_top_n(result: Dict[str, Any], top_n: int, command: str = "") -> Dict[ result["_truncated_categories"] = {} result["_truncated_categories"][sub_key] = len(sub_val) - # Handle coverage_map in test-map (file:{fn:{...}}) + # Handle coverage_map in test-map (file:{fn:{...}}) — dict-of-dicts, not + # dict-of-lists, so the auto-discover above doesn't catch it. if "coverage_map" in result and isinstance(result["coverage_map"], dict): cm = result["coverage_map"] if len(cm) > top_n: diff --git a/tests/test_cli.py b/tests/test_cli.py index c572507..c57e393 100755 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -15,6 +15,7 @@ from commands.query import cmd_query from commands.list import cmd_list from commands.init import cmd_init +from codelens import _apply_top_n, _NO_TOP_KEYS def _create_sample_workspace(): @@ -331,3 +332,107 @@ def test_check_full_cli_invocation_with_positional(self): finally: import shutil shutil.rmtree(ws, ignore_errors=True) + + +# ─── --top N universal truncation (issue #36) ────────────────── + + +class TestTopNUniversalTruncation: + """Regression guard for issue #36: --top N must truncate ALL list-valued + keys in command output, not just those in a hardcoded allowlist. + + Before the fix, ``_apply_top_n`` only truncated keys listed in ``_LIST_KEYS`` + (28 names). A new command returning a list under a non-standard key (e.g., + ``entities``, ``my_things``, ``nodes``) would silently ignore ``--top N``, + violating the documented *"Limit list results to top N items"* contract from + ``SKILL-QUICK.md`` and risking token overflow for MCP clients. + + The fix replaces the allowlist with runtime auto-discovery: every top-level + list-valued key is truncated, except those in ``_NO_TOP_KEYS`` (structural/ + metadata keys like ``available_commands``). + """ + + def test_non_standard_key_gets_truncated(self): + """A hypothetical key name not in any allowlist must still be truncated.""" + result = {"status": "ok", "entities": [{"id": i} for i in range(100)]} + out = _apply_top_n(result, 5) + assert len(out["entities"]) == 5, ( + f"Expected 5 items after truncation, got {len(out['entities'])}. " + "Non-standard key 'entities' was not truncated by --top." + ) + assert out["entities_truncated"] is True + assert out["entities_total"] == 100 + + def test_multiple_non_standard_keys_truncated(self): + """Multiple non-standard list keys must all be truncated.""" + result = { + "status": "ok", + "widgets": [{"id": i} for i in range(50)], + "gadgets": [{"id": i} for i in range(30)], + } + out = _apply_top_n(result, 10) + assert len(out["widgets"]) == 10 + assert len(out["gadgets"]) == 10 + assert out["widgets_truncated"] is True + assert out["gadgets_truncated"] is True + + def test_no_top_keys_exempt(self): + """Keys in _NO_TOP_KEYS must NOT be truncated even if they're lists.""" + result = { + "status": "ok", + "available_commands": [f"cmd_{i}" for i in range(50)], + } + out = _apply_top_n(result, 5) + assert len(out["available_commands"]) == 50, ( + "available_commands should NOT be truncated (it's in _NO_TOP_KEYS)" + ) + assert "available_commands_truncated" not in out + + def test_standard_keys_still_truncated(self): + """Existing allowlisted keys (e.g., 'findings') must still be truncated.""" + result = {"findings": [{"name": f"f_{i}"} for i in range(50)]} + out = _apply_top_n(result, 10) + assert len(out["findings"]) == 10 + assert out["findings_truncated"] is True + assert out["findings_total"] == 50 + + def test_top_zero_means_no_truncation_for_non_standard_key(self): + """--top 0 (unlimited) must apply to non-standard keys too.""" + result = {"entities": [{"id": i} for i in range(100)]} + out = _apply_top_n(result, 0) + assert len(out["entities"]) == 100 + + def test_nested_dict_non_standard_key_truncated(self): + """Non-standard dict-of-lists key must also be truncated.""" + result = { + "groups": { + "alpha": [{"id": i} for i in range(30)], + "beta": [{"id": i} for i in range(5)], + } + } + out = _apply_top_n(result, 10) + assert len(out["groups"]["alpha"]) == 10 + assert len(out["groups"]["beta"]) == 5 # under limit, no truncation + + def test_underscore_prefixed_keys_skipped(self): + """Internal keys starting with '_' should not be processed.""" + result = { + "_meta": [{"x": i} for i in range(50)], + "findings": [{"name": f"f_{i}"} for i in range(50)], + } + out = _apply_top_n(result, 5) + assert len(out["_meta"]) == 50, "_-prefixed keys should not be truncated" + assert len(out["findings"]) == 5 + + def test_repro_from_issue(self): + """Exact repro from issue #36: hypothetical command returning 'entities'.""" + # Simulate what a new command might return + result = {"status": "ok", "entities": [{"id": i} for i in range(1000)]} + out = _apply_top_n(result, 5) + # Before fix: len == 1000 (silent --top bypass) + # After fix: len == 5 (universal truncation) + assert len(out["entities"]) == 5, ( + "Issue #36 repro: --top 5 did not truncate 'entities' key. " + "This means a new command returning non-standard keys would " + "silently bypass --top, risking token overflow for MCP clients." + )