Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 19 additions & 2 deletions openkb/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -812,7 +812,21 @@ def remove(ctx, identifier, keep_raw, keep_empty_concepts, dry_run, yes):
# Strip dangling wikilinks now so a retry (after a PageIndex
# failure below) finds a clean wiki — no point in re-running this
# on every attempt.
files_changed, ghosts = fix_broken_links(wiki_dir)
#
# Scope: only the pages this remove actually touched (modified
# concept pages ∪ index.md). Previously this swept the whole wiki
# via ``fix_broken_links(wiki_dir)``, which silently stripped
# pre-existing dangling links in unrelated pages — see issue #58
# (Bug 2). Users who want a wiki-wide sweep can still run
# ``openkb lint --fix`` explicitly.
lint_scope: list[Path] = [
wiki_dir / "concepts" / f"{slug}.md"
for slug in concept_result["modified"]
]
index_md = wiki_dir / "index.md"
if index_md.exists():
lint_scope.append(index_md)
files_changed, ghosts = fix_broken_links(wiki_dir, restrict_to=lint_scope)
if files_changed:
click.echo(f" lint --fix cleaned {ghosts} dangling wikilink(s) in {files_changed} file(s)")

Expand All @@ -839,7 +853,10 @@ def remove(ctx, identifier, keep_raw, keep_empty_concepts, dry_run, yes):
return

# ----- Commit point -----
registry.remove_by_doc_name(doc_name)
# Prune by hash, not by ``doc_name``: legacy registry entries
# (ingested before commit c504e26) carry only ``{name, type}`` and
# would silently no-op under ``remove_by_doc_name``. See issue #58.
registry.remove_by_hash(file_hash)

if raw_path is not None:
raw_path.unlink(missing_ok=True)
Expand Down
41 changes: 34 additions & 7 deletions openkb/lint.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,11 @@ def list_existing_wiki_targets(wiki_dir: Path) -> set[str]:
return targets


def fix_broken_links(wiki: Path) -> tuple[int, int]:
def fix_broken_links(
wiki: Path,
*,
restrict_to: list[Path] | None = None,
) -> tuple[int, int]:
"""Rewrite or strip broken [[wikilinks]] across the wiki in place.

For each Markdown page under ``wiki`` (excluding ``reports/`` and
Expand All @@ -184,6 +188,16 @@ def fix_broken_links(wiki: Path) -> tuple[int, int]:

Args:
wiki: Path to the wiki root directory.
restrict_to: When provided, only rewrite these files (must live
under ``wiki``). Paths outside the wiki and non-existent
paths are silently skipped. An empty list is a no-op — the
valid-target whitelist is still computed from the entire
wiki, so links like ``[[concepts/sibling]]`` resolve
correctly even when ``sibling.md`` is not in the scope.
Used by ``openkb remove`` (issue #58 / Bug 2) to clean only
the pages it actually touched instead of sweeping the
whole wiki and stripping pre-existing dangling links the
user may want to keep.

Returns:
Tuple of ``(files_changed, ghosts_stripped)``.
Expand All @@ -200,14 +214,27 @@ def fix_broken_links(wiki: Path) -> tuple[int, int]:
# otherwise strip_ghost_wikilinks would rebuild it per file (O(F·M)).
norm_index = build_norm_index(known_targets)

if restrict_to is None:
candidates: list[Path] = [
md for md in wiki.rglob("*.md")
if md.name not in _EXCLUDED_FILES
and md.relative_to(wiki).parts[:1] not in (("reports",), ("sources",))
]
else:
wiki_resolved = wiki.resolve()
candidates = []
for raw in restrict_to:
if not raw.is_file():
continue
try:
raw.resolve().relative_to(wiki_resolved)
except ValueError:
continue # outside wiki — skip silently
candidates.append(raw)

files_changed = 0
ghosts_stripped = 0
for md in wiki.rglob("*.md"):
if md.name in _EXCLUDED_FILES:
continue
rel_parts = md.relative_to(wiki).parts
if rel_parts and rel_parts[0] in ("reports", "sources"):
continue
for md in candidates:
text = _read_md(md)
cleaned, ghosts = strip_ghost_wikilinks(
text, known_targets, norm_index=norm_index,
Expand Down
14 changes: 14 additions & 0 deletions openkb/state.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,20 @@ def remove_by_doc_name(self, doc_name: str) -> bool:
return True
return False

def remove_by_hash(self, file_hash: str) -> bool:
"""Remove the entry keyed by ``file_hash``. Returns True if removed.

Preferred over :meth:`remove_by_doc_name` when the caller already
has the hash in hand — works regardless of whether the entry's
metadata carries a ``doc_name`` field (legacy entries written
before commit c504e26 do not).
"""
if file_hash not in self._data:
return False
del self._data[file_hash]
self._persist()
return True

# ------------------------------------------------------------------
# Internal
# ------------------------------------------------------------------
Expand Down
109 changes: 109 additions & 0 deletions tests/test_lint.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
find_broken_links,
find_missing_entries,
find_orphans,
fix_broken_links,
run_structural_lint,
strip_ghost_wikilinks,
)
Expand Down Expand Up @@ -392,3 +393,111 @@ def test_returns_normalized_to_canonical_map(self):

def test_empty_set_returns_empty_dict(self):
assert build_norm_index(set()) == {}


class TestFixBrokenLinksRestrictTo:
"""Issue #58 / Bug 2: ``fix_broken_links`` must support scoping the
rewrite to a caller-supplied subset of files so ``openkb remove``
can clean up only the pages it actually touched (modified concept
pages ∪ index.md) instead of sweeping the entire wiki and stripping
pre-existing dangling links the user may want to keep.
"""

def test_default_behavior_scans_all_files(self, tmp_path):
"""Calling ``fix_broken_links(wiki)`` without ``restrict_to``
still processes every wiki file — the existing global behavior
is preserved for callers that want it (e.g. ``openkb lint --fix``).
"""
wiki = _make_wiki(tmp_path)
a = wiki / "concepts" / "a.md"
b = wiki / "concepts" / "b.md"
a.write_text("# A\n\nLink [[concepts/ghost]] here.\n", encoding="utf-8")
b.write_text("# B\n\nLink [[concepts/ghost]] too.\n", encoding="utf-8")

files_changed, ghosts = fix_broken_links(wiki)

assert files_changed == 2
assert ghosts == 2
assert "[[concepts/ghost]]" not in a.read_text()
assert "[[concepts/ghost]]" not in b.read_text()

def test_restrict_to_only_touches_listed_files(self, tmp_path):
"""When ``restrict_to`` is provided, only those files are
rewritten — even if pre-existing ghost links exist elsewhere
in the wiki, those files are left alone.
"""
wiki = _make_wiki(tmp_path)
touched = wiki / "concepts" / "touched.md"
untouched = wiki / "concepts" / "untouched.md"
touched.write_text(
"# touched\n\nGhost [[concepts/ghost]] here.\n", encoding="utf-8",
)
untouched.write_text(
"# untouched\n\nGhost [[concepts/ghost]] here.\n", encoding="utf-8",
)

files_changed, ghosts = fix_broken_links(wiki, restrict_to=[touched])

assert files_changed == 1
assert ghosts == 1
assert "[[concepts/ghost]]" not in touched.read_text()
# Untouched file keeps its pre-existing ghost link verbatim.
assert "[[concepts/ghost]]" in untouched.read_text()

def test_restrict_to_empty_list_is_noop(self, tmp_path):
"""An empty ``restrict_to`` means "process nothing" (not "fall
back to wiki-wide"). The whole point of the parameter is letting
the CLI say "I touched zero files; don't sweep the wiki on my
behalf."
"""
wiki = _make_wiki(tmp_path)
a = wiki / "concepts" / "a.md"
a.write_text("# A\n\nGhost [[concepts/ghost]] here.\n", encoding="utf-8")

files_changed, ghosts = fix_broken_links(wiki, restrict_to=[])

assert files_changed == 0
assert ghosts == 0
assert "[[concepts/ghost]]" in a.read_text()

def test_restrict_to_skips_paths_not_under_wiki(self, tmp_path):
"""Defensive: a path that doesn't live under ``wiki`` (e.g. a
leftover absolute path from the caller) is silently skipped
rather than rewriting an unrelated file.
"""
wiki = _make_wiki(tmp_path)
stray = tmp_path / "stray.md"
stray.write_text("# stray\n[[concepts/ghost]]\n", encoding="utf-8")
before = stray.read_text()

files_changed, ghosts = fix_broken_links(wiki, restrict_to=[stray])

assert files_changed == 0
assert ghosts == 0
assert stray.read_text() == before

def test_restrict_to_uses_global_known_targets(self, tmp_path):
"""The valid-target set must still be computed from the whole
wiki — restricting only narrows which files get *rewritten*,
not what counts as a valid link target. Without this,
``[[concepts/sibling]]`` in the file under review would be
misclassified as a ghost just because ``sibling.md`` is outside
``restrict_to``.
"""
wiki = _make_wiki(tmp_path)
(wiki / "concepts" / "sibling.md").write_text("# sibling", encoding="utf-8")
target = wiki / "concepts" / "target.md"
target.write_text(
"Valid [[concepts/sibling]] and ghost [[concepts/ghost]]\n",
encoding="utf-8",
)

files_changed, ghosts = fix_broken_links(wiki, restrict_to=[target])

assert files_changed == 1
assert ghosts == 1
text = target.read_text()
# Real sibling link survives unchanged.
assert "[[concepts/sibling]]" in text
# Ghost link gets demoted.
assert "[[concepts/ghost]]" not in text
Loading