Files
public-tools/check_md_links.py
2026-01-29 18:14:26 +01:00

75 lines
2.0 KiB
Python

#!/usr/bin/env python3
from __future__ import annotations
import re
import sys
from pathlib import Path
from urllib.parse import unquote
# Inline Markdown link of the shape "[text](target)".
# - the text part may be empty but cannot contain "]"
# - the target part must be non-empty and cannot contain ")"
# Group 1 captures the raw target exactly as written between the parentheses.
# Image syntax "![alt](src)" contains the same shape, so it is matched as well.
LINK_RE = re.compile(r"\[[^\]]*\]\(([^)]+)\)") # [text](target)
# A URL scheme (a letter followed by letters, digits, "+", "." or "-")
# immediately followed by "://", anchored at the start of the string.
_URL_SCHEME_RE = re.compile(r"^[a-zA-Z][a-zA-Z0-9+.-]*://")


def is_external(target: str) -> bool:
    """Return True when *target* begins with a ``scheme://`` prefix.

    The check is applied to the string as given: leading whitespace or a
    scheme without ``//`` (for example ``mailto:``) yields False.
    """
    return _URL_SCHEME_RE.match(target) is not None
def normalize_target(target: str) -> str:
    """Reduce a raw Markdown link destination to a plain, decoded path.

    Steps, in order:

    1. trim surrounding whitespace;
    2. drop an optional trailing quoted link title (``path "Title"`` or
       ``path 'Title'``);
    3. remove a surrounding ``<...>`` pair;
    4. cut off the ``#fragment`` and then the ``?query`` part;
    5. percent-decode what is left.

    Args:
        target: The text found between the parentheses of ``[text](...)``.

    Returns:
        The path portion of the destination. The result is an empty string
        for pure-anchor links such as ``#section``.
    """
    target = target.strip()

    # Drop the title first: it may itself contain "#" or "?", which would
    # otherwise truncate the destination in the middle of the title, and it
    # hides the closing ">" of an angle-bracketed destination.
    target = re.sub(r"""\s+(?:"[^"]*"|'[^']*')$""", "", target)

    # strip surrounding <>
    if target.startswith("<") and target.endswith(">"):
        target = target[1:-1].strip()

    # drop anchor, then query
    target = target.split("#", 1)[0]
    target = target.split("?", 1)[0]

    # URL decode last, so an encoded "%23" / "%3F" stays part of the file name
    # instead of being treated as a fragment or query separator.
    return unquote(target)
def main() -> int:
    """Report Markdown links whose local target does not exist.

    The directory to scan is ``sys.argv[1]`` when given, otherwise the
    current directory. Every ``*.md`` file below it is read and each
    ``[text](target)`` match is checked. A link is skipped when it is
    external (``scheme://``), normalizes to an empty path (pure anchor),
    contains a ``:`` without starting with ``./`` or ``../`` (``mailto:``
    and similar), or resolves to a location outside the scanned root.

    Returns:
        0 when no missing target was found, 1 when at least one was found
        (the list is printed to stdout), 2 when the root is not a directory.
    """
    root = Path(sys.argv[1]) if len(sys.argv) > 1 else Path(".")
    root = root.resolve()

    # Without this check a mistyped path yields no files and a false "OK".
    if not root.is_dir():
        print(f"error: {root} is not a directory", file=sys.stderr)
        return 2

    missing = []
    # Sorted so the report is reproducible regardless of filesystem order.
    for md in sorted(root.rglob("*.md")):
        # A directory can also be named "*.md"; reading it would raise.
        if not md.is_file():
            continue
        text = md.read_text(encoding="utf-8", errors="ignore")
        for m in LINK_RE.finditer(text):
            raw = m.group(1)
            if is_external(raw):
                continue
            target = normalize_target(raw)
            if not target:
                continue
            # ignore mailto:, etc.
            if ":" in target and not target.startswith(("./", "../")):
                continue
            resolved = (md.parent / target).resolve()
            try:
                resolved_rel = resolved.relative_to(root)
            except ValueError:
                # points outside repo; treat as external-ish
                continue
            if not resolved.exists():
                missing.append((md, raw, str(resolved_rel)))

    if not missing:
        print("OK: no missing internal link targets found.")
        return 0

    print("Missing internal link targets:\n")
    for src, raw, resolved_rel in missing:
        print(f"- {src.relative_to(root)}: ({raw}) -> {resolved_rel}")
    return 1
# Script entry point: exit with main()'s return value as the process status.
if __name__ == "__main__":
    sys.exit(main())