diff --git a/openkb/cli.py b/openkb/cli.py index 60d8551a..a322a22a 100644 --- a/openkb/cli.py +++ b/openkb/cli.py @@ -812,7 +812,21 @@ def remove(ctx, identifier, keep_raw, keep_empty_concepts, dry_run, yes): # Strip dangling wikilinks now so a retry (after a PageIndex # failure below) finds a clean wiki — no point in re-running this # on every attempt. - files_changed, ghosts = fix_broken_links(wiki_dir) + # + # Scope: only the pages this remove actually touched (modified + # concept pages ∪ index.md). Previously this swept the whole wiki + # via ``fix_broken_links(wiki_dir)``, which silently stripped + # pre-existing dangling links in unrelated pages — see issue #58 + # (Bug 2). Users who want a wiki-wide sweep can still run + # ``openkb lint --fix`` explicitly. + lint_scope: list[Path] = [ + wiki_dir / "concepts" / f"{slug}.md" + for slug in concept_result["modified"] + ] + index_md = wiki_dir / "index.md" + if index_md.exists(): + lint_scope.append(index_md) + files_changed, ghosts = fix_broken_links(wiki_dir, restrict_to=lint_scope) if files_changed: click.echo(f" lint --fix cleaned {ghosts} dangling wikilink(s) in {files_changed} file(s)") @@ -839,7 +853,10 @@ def remove(ctx, identifier, keep_raw, keep_empty_concepts, dry_run, yes): return # ----- Commit point ----- - registry.remove_by_doc_name(doc_name) + # Prune by hash, not by ``doc_name``: legacy registry entries + # (ingested before commit c504e26) carry only ``{name, type}`` and + # would silently no-op under ``remove_by_doc_name``. See issue #58. + registry.remove_by_hash(file_hash) if raw_path is not None: raw_path.unlink(missing_ok=True) diff --git a/openkb/lint.py b/openkb/lint.py index db5af226..ddcfe10b 100644 --- a/openkb/lint.py +++ b/openkb/lint.py @@ -173,7 +173,11 @@ def list_existing_wiki_targets(wiki_dir: Path) -> set[str]: return targets -def fix_broken_links(wiki: Path) -> tuple[int, int]: +def fix_broken_links( + wiki: Path, + *, + restrict_to: list[Path] | None = None, +) -> tuple[int, int]: """Rewrite or strip broken [[wikilinks]] across the wiki in place. For each Markdown page under ``wiki`` (excluding ``reports/`` and @@ -184,6 +188,16 @@ def fix_broken_links(wiki: Path) -> tuple[int, int]: Args: wiki: Path to the wiki root directory. + restrict_to: When provided, only rewrite these files (must live + under ``wiki``). Paths outside the wiki and non-existent + paths are silently skipped. An empty list is a no-op — the + valid-target whitelist is still computed from the entire + wiki, so links like ``[[concepts/sibling]]`` resolve + correctly even when ``sibling.md`` is not in the scope. + Used by ``openkb remove`` (issue #58 / Bug 2) to clean only + the pages it actually touched instead of sweeping the + whole wiki and stripping pre-existing dangling links the + user may want to keep. Returns: Tuple of ``(files_changed, ghosts_stripped)``. @@ -200,14 +214,27 @@ def fix_broken_links(wiki: Path) -> tuple[int, int]: # otherwise strip_ghost_wikilinks would rebuild it per file (O(F·M)). norm_index = build_norm_index(known_targets) + if restrict_to is None: + candidates: list[Path] = [ + md for md in wiki.rglob("*.md") + if md.name not in _EXCLUDED_FILES + and md.relative_to(wiki).parts[:1] not in (("reports",), ("sources",)) + ] + else: + wiki_resolved = wiki.resolve() + candidates = [] + for raw in restrict_to: + if not raw.is_file(): + continue + try: + raw.resolve().relative_to(wiki_resolved) + except ValueError: + continue # outside wiki — skip silently + candidates.append(raw) + files_changed = 0 ghosts_stripped = 0 - for md in wiki.rglob("*.md"): - if md.name in _EXCLUDED_FILES: - continue - rel_parts = md.relative_to(wiki).parts - if rel_parts and rel_parts[0] in ("reports", "sources"): - continue + for md in candidates: text = _read_md(md) cleaned, ghosts = strip_ghost_wikilinks( text, known_targets, norm_index=norm_index, diff --git a/openkb/state.py b/openkb/state.py index 6e6e0877..7b945037 100644 --- a/openkb/state.py +++ b/openkb/state.py @@ -50,6 +50,20 @@ def remove_by_doc_name(self, doc_name: str) -> bool: return True return False + def remove_by_hash(self, file_hash: str) -> bool: + """Remove the entry keyed by ``file_hash``. Returns True if removed. + + Preferred over :meth:`remove_by_doc_name` when the caller already + has the hash in hand — works regardless of whether the entry's + metadata carries a ``doc_name`` field (legacy entries written + before commit c504e26 do not). + """ + if file_hash not in self._data: + return False + del self._data[file_hash] + self._persist() + return True + # ------------------------------------------------------------------ # Internal # ------------------------------------------------------------------ diff --git a/tests/test_lint.py b/tests/test_lint.py index 6b06112c..fe6a3e6a 100644 --- a/tests/test_lint.py +++ b/tests/test_lint.py @@ -11,6 +11,7 @@ find_broken_links, find_missing_entries, find_orphans, + fix_broken_links, run_structural_lint, strip_ghost_wikilinks, ) @@ -392,3 +393,111 @@ def test_returns_normalized_to_canonical_map(self): def test_empty_set_returns_empty_dict(self): assert build_norm_index(set()) == {} + + +class TestFixBrokenLinksRestrictTo: + """Issue #58 / Bug 2: ``fix_broken_links`` must support scoping the + rewrite to a caller-supplied subset of files so ``openkb remove`` + can clean up only the pages it actually touched (modified concept + pages ∪ index.md) instead of sweeping the entire wiki and stripping + pre-existing dangling links the user may want to keep. + """ + + def test_default_behavior_scans_all_files(self, tmp_path): + """Calling ``fix_broken_links(wiki)`` without ``restrict_to`` + still processes every wiki file — the existing global behavior + is preserved for callers that want it (e.g. ``openkb lint --fix``). + """ + wiki = _make_wiki(tmp_path) + a = wiki / "concepts" / "a.md" + b = wiki / "concepts" / "b.md" + a.write_text("# A\n\nLink [[concepts/ghost]] here.\n", encoding="utf-8") + b.write_text("# B\n\nLink [[concepts/ghost]] too.\n", encoding="utf-8") + + files_changed, ghosts = fix_broken_links(wiki) + + assert files_changed == 2 + assert ghosts == 2 + assert "[[concepts/ghost]]" not in a.read_text() + assert "[[concepts/ghost]]" not in b.read_text() + + def test_restrict_to_only_touches_listed_files(self, tmp_path): + """When ``restrict_to`` is provided, only those files are + rewritten — even if pre-existing ghost links exist elsewhere + in the wiki, those files are left alone. + """ + wiki = _make_wiki(tmp_path) + touched = wiki / "concepts" / "touched.md" + untouched = wiki / "concepts" / "untouched.md" + touched.write_text( + "# touched\n\nGhost [[concepts/ghost]] here.\n", encoding="utf-8", + ) + untouched.write_text( + "# untouched\n\nGhost [[concepts/ghost]] here.\n", encoding="utf-8", + ) + + files_changed, ghosts = fix_broken_links(wiki, restrict_to=[touched]) + + assert files_changed == 1 + assert ghosts == 1 + assert "[[concepts/ghost]]" not in touched.read_text() + # Untouched file keeps its pre-existing ghost link verbatim. + assert "[[concepts/ghost]]" in untouched.read_text() + + def test_restrict_to_empty_list_is_noop(self, tmp_path): + """An empty ``restrict_to`` means "process nothing" (not "fall + back to wiki-wide"). The whole point of the parameter is letting + the CLI say "I touched zero files; don't sweep the wiki on my + behalf." + """ + wiki = _make_wiki(tmp_path) + a = wiki / "concepts" / "a.md" + a.write_text("# A\n\nGhost [[concepts/ghost]] here.\n", encoding="utf-8") + + files_changed, ghosts = fix_broken_links(wiki, restrict_to=[]) + + assert files_changed == 0 + assert ghosts == 0 + assert "[[concepts/ghost]]" in a.read_text() + + def test_restrict_to_skips_paths_not_under_wiki(self, tmp_path): + """Defensive: a path that doesn't live under ``wiki`` (e.g. a + leftover absolute path from the caller) is silently skipped + rather than rewriting an unrelated file. + """ + wiki = _make_wiki(tmp_path) + stray = tmp_path / "stray.md" + stray.write_text("# stray\n[[concepts/ghost]]\n", encoding="utf-8") + before = stray.read_text() + + files_changed, ghosts = fix_broken_links(wiki, restrict_to=[stray]) + + assert files_changed == 0 + assert ghosts == 0 + assert stray.read_text() == before + + def test_restrict_to_uses_global_known_targets(self, tmp_path): + """The valid-target set must still be computed from the whole + wiki — restricting only narrows which files get *rewritten*, + not what counts as a valid link target. Without this, + ``[[concepts/sibling]]`` in the file under review would be + misclassified as a ghost just because ``sibling.md`` is outside + ``restrict_to``. + """ + wiki = _make_wiki(tmp_path) + (wiki / "concepts" / "sibling.md").write_text("# sibling", encoding="utf-8") + target = wiki / "concepts" / "target.md" + target.write_text( + "Valid [[concepts/sibling]] and ghost [[concepts/ghost]]\n", + encoding="utf-8", + ) + + files_changed, ghosts = fix_broken_links(wiki, restrict_to=[target]) + + assert files_changed == 1 + assert ghosts == 1 + text = target.read_text() + # Real sibling link survives unchanged. + assert "[[concepts/sibling]]" in text + # Ghost link gets demoted. + assert "[[concepts/ghost]]" not in text diff --git a/tests/test_remove.py b/tests/test_remove.py index cc5b252e..8518d639 100644 --- a/tests/test_remove.py +++ b/tests/test_remove.py @@ -222,6 +222,34 @@ def test_hash_registry_remove_by_doc_name(tmp_path): assert list(saved.keys()) == ["h2"] +def test_hash_registry_remove_by_hash(tmp_path): + """Issue #58 / Bug 1: `remove_by_hash` is the hash-keyed sibling of + `remove_by_doc_name`. It exists so callers that already have the + file_hash in hand (e.g. the CLI after `_resolve_doc_identifier`) + don't need to round-trip through the `doc_name` slug — that round + trip is what silently no-ops for legacy registry entries lacking a + `doc_name` key (ingested before commit c504e26). + """ + path = tmp_path / "hashes.json" + path.write_text(json.dumps({ + "h_modern": {"name": "a.pdf", "doc_name": "a-h_modern", "type": "short"}, + "h_legacy": {"name": "b.pdf", "type": "short"}, # no doc_name + })) + + reg = HashRegistry(path) + + # Modern entry: removable by hash. + assert reg.remove_by_hash("h_modern") is True + assert reg.remove_by_hash("h_modern") is False # idempotent + # Legacy entry (no doc_name): removable by hash too — that's the point. + assert reg.remove_by_hash("h_legacy") is True + # Unknown hash: returns False, doesn't raise. + assert reg.remove_by_hash("never-existed") is False + + assert reg.all_entries() == {} + assert json.loads(path.read_text()) == {} + + # --------------------------------------------------------------------------- # _resolve_doc_identifier # --------------------------------------------------------------------------- @@ -459,26 +487,130 @@ def test_cli_remove_confirm_no_aborts(kb_dir): assert (kb_dir / "wiki" / "summaries" / "attention-h_a.md").exists() -def test_cli_remove_lint_cleans_dangling_links(kb_dir): - """`openkb remove` must auto-run lint --fix so wikilinks pointing at - the deleted summary/concept get stripped from sibling pages. +def _seed_legacy_kb(kb_dir: Path) -> None: + """Seed a KB whose registry entry pre-dates PR #51. + + Layout reflects the issue #58 / Bug 1 repro: ``hashes.json`` only + has ``{name, type}`` (no ``doc_name`` key), and the wiki paths use + the bare stem of the original filename — which is also what + ``cli.py``'s ``Path(name).stem`` fallback produces on the read path. + """ + (kb_dir / ".openkb" / "hashes.json").write_text(json.dumps({ + "h_legacy": {"name": "ollama.md", "type": "md"}, + "h_keep": {"name": "other.md", "type": "md"}, # untouched bystander + })) + (kb_dir / "raw" / "ollama.md").write_text("# Ollama\n", encoding="utf-8") + (kb_dir / "raw" / "other.md").write_text("# Other\n", encoding="utf-8") + + (kb_dir / "wiki" / "summaries" / "ollama.md").write_text( + "---\nsources: [raw/ollama.md]\nbrief: x\n---\n# Ollama\n", + encoding="utf-8", + ) + (kb_dir / "wiki" / "summaries" / "other.md").write_text( + "---\nsources: [raw/other.md]\nbrief: y\n---\n# Other\n", + encoding="utf-8", + ) + (kb_dir / "wiki" / "index.md").write_text( + "# Knowledge Base Index\n\n## Documents\n" + "- [[summaries/ollama]] (md) - Ollama notes\n" + "- [[summaries/other]] (md) - Other notes\n\n" + "## Concepts\n\n## Explorations\n", + encoding="utf-8", + ) + (kb_dir / "wiki" / "log.md").write_text("# Log\n", encoding="utf-8") + + +def test_cli_remove_prunes_legacy_registry_entry_without_doc_name(kb_dir): + """Issue #58 / Bug 1 regression: a registry entry written before + PR #51 has only ``{name, type}``. The earlier remove path called + ``remove_by_doc_name(meta.get('doc_name') or Path(name).stem)``, + which silently no-op'd because ``meta.get('doc_name')`` is None for + every legacy row — leaving an orphan hash entry that then re-bound + the next ``openkb add`` of the same file via SHA dedup. + + Fix: the CLI now prunes the registry by the already-resolved + ``file_hash`` (returned by ``_resolve_doc_identifier``), so the + metadata shape doesn't matter. + """ + _seed_legacy_kb(kb_dir) + hashes_path = kb_dir / ".openkb" / "hashes.json" + + result = _invoke(kb_dir, ["remove", "ollama.md", "--yes"]) + + assert result.exit_code == 0, result.output + + remaining = json.loads(hashes_path.read_text()) + assert "h_legacy" not in remaining, ( + "legacy registry entry survived remove — see issue #58 Bug 1" + ) + # Sibling legacy entry is untouched. + assert "h_keep" in remaining + + # Wiki side-effects of the remove still happened (sanity). + assert not (kb_dir / "wiki" / "summaries" / "ollama.md").exists() + assert not (kb_dir / "raw" / "ollama.md").exists() + + +def test_cli_remove_lint_cleans_dangling_links_in_modified_page(kb_dir): + """`openkb remove` must auto-run a scoped lint --fix so wikilinks + pointing at the just-deleted summary get stripped from concept + pages that this removal modified. + + Scope (issue #58 / Bug 2): only files in ``concept_result["modified"]`` + plus ``index.md`` — see ``test_cli_remove_preserves_ghosts_in_unrelated_pages`` + for the complementary contract. """ _seed_two_doc_kb(kb_dir) - # Plant a stray reference to the about-to-be-deleted summary inside - # the surviving concept's body (not in any structured section). - llm_path = kb_dir / "wiki" / "concepts" / "llm.md" - llm_path.write_text( - llm_path.read_text() + "\nSee also [[summaries/attention-h_a]] for background.\n", + # Plant a stray free-text reference in the body of the MULTI-source + # concept page — `concepts/attention.md` has both attention-h_a and + # llm-h_l as sources, so the remove flow modifies it (drops + # attention-h_a from the frontmatter). The lint pass should also + # strip the dangling free-text link. + attn_path = kb_dir / "wiki" / "concepts" / "attention.md" + attn_path.write_text( + attn_path.read_text() + "\nSee also [[summaries/attention-h_a]] for background.\n", encoding="utf-8", ) result = _invoke(kb_dir, ["remove", "attention.pdf", "--yes"]) assert result.exit_code == 0, result.output - cleaned = llm_path.read_text() + cleaned = attn_path.read_text() assert "[[summaries/attention-h_a]]" not in cleaned +def test_cli_remove_preserves_ghosts_in_unrelated_pages(kb_dir): + """Issue #58 / Bug 2 regression: `openkb remove` must NOT strip + pre-existing dangling wikilinks from concept pages that don't have + the removed doc in their frontmatter sources. + + Before the fix, ``fix_broken_links(wiki_dir)`` swept the whole wiki + on every remove, producing 39-file / 1254-line diffs and silently + deleting links the user had hand-written to not-yet-added concepts. + """ + _seed_two_doc_kb(kb_dir) + # llm.md's frontmatter sources include only ``summaries/llm-h_l`` — + # the removal of ``attention.pdf`` does NOT touch its sources list, + # so it is out of scope for the post-remove lint pass. + llm_path = kb_dir / "wiki" / "concepts" / "llm.md" + original = llm_path.read_text() + ( + "\nFollow-up: [[concepts/agent-loops]] (concept I'll add next week).\n" + "Also a hand-added back-ref [[summaries/attention-h_a]] users may want.\n" + ) + llm_path.write_text(original, encoding="utf-8") + + result = _invoke(kb_dir, ["remove", "attention.pdf", "--yes"]) + + assert result.exit_code == 0, result.output + surviving = llm_path.read_text() + # Pre-existing ghost to a not-yet-added concept survives. + assert "[[concepts/agent-loops]]" in surviving + # And a hand-added link to the just-deleted summary also survives + # because llm.md is OUT OF SCOPE for this removal's cleanup. Users + # who want a wiki-wide sweep can run `openkb lint --fix` explicitly. + assert "[[summaries/attention-h_a]]" in surviving + + # --------------------------------------------------------------------------- # Regression: code-review issue #1 # `openkb add` must persist `doc_name` so `remove_by_doc_name` can prune