6 категорий штрафов со взвешенной суммой: - broken_wikilinks (×10) — [[foo]] не ведёт никуда - broken_paths (×10) — [text](./foo.md) не существует - missing_frontmatter (×3) — .md без frontmatter-заголовка - orphan_files (×2) — нет входящих wikilinks и не в служебных папках - undated_todos (×1) — "- [ ]" без YYYY-MM-DD - duplicate_basenames (×5) — одинаковое имя в разных папках Baseline сегодня: score=493 (158 файлов). Выход: audit/DATE-health.md (человекочитаемый отчёт) + audit/health-latest.json (для agent-loop). False-positives починены: - wikilinks внутри backticks/fenced code игнорим - ../ и ./ пути резолвятся от файла-источника - CLAUDE.md/README.md не требуют frontmatter
351 lines
12 KiB
Python
Executable File
351 lines
12 KiB
Python
Executable File
#!/usr/bin/env python3
|
||
"""
|
||
kb-health — объективная метрика качества KB (karpathy-style val_bpb).
|
||
|
||
Считаем штрафы. Меньше = лучше.
|
||
|
||
Категории:
|
||
- broken_wikilinks [[foo]] не ведёт никуда
|
||
- broken_paths [text](./foo.md) — путь не существует
|
||
- missing_frontmatter .md без `---\\n...\\n---`
|
||
- orphan_files нет входящих wikilinks, не в системных папках
|
||
- undated_todos "- [ ] ..." без YYYY-MM-DD в строке
|
||
- duplicate_basenames два файла с одинаковым basename в разных папках
|
||
|
||
Веса (итоговый score = сумма):
|
||
broken_wikilinks * 10
|
||
broken_paths * 10
|
||
missing_frontmatter * 3
|
||
orphan_files * 2
|
||
undated_todos * 1
|
||
duplicate_basenames * 5
|
||
|
||
Вывод:
|
||
audit/YYYY-MM-DD-health.md — человекочитаемый отчёт
|
||
audit/health-latest.json — JSON для kb-agent-loop.sh (сравнение score между прогонами)
|
||
"""
|
||
|
||
import json
|
||
import re
|
||
import sys
|
||
from collections import defaultdict
|
||
from datetime import date
|
||
from pathlib import Path
|
||
|
||
# Vault root: this script lives in <vault>/scripts/, so go up two levels.
VAULT = Path(__file__).resolve().parent.parent
OUT_DIR = VAULT / "audit"

# Directories/files that are scanned
INCLUDE_DIRS = ["decisions", "notes", "projects", "snippets", "daily",
                "claude-memory", "templates", "scripts", "audit"]
INCLUDE_ROOT_FILES = ["CLAUDE.md", "README.md"]

# Directories excluded entirely
EXCLUDE_DIRS = {".git", ".obsidian", ".claude"}

# Files where orphan status is normal (system/service files)
ORPHAN_OK_PATTERNS = [
    re.compile(r"^daily/.*"),         # daily notes are rarely backlinked
    re.compile(r"^audit/.*"),         # audit reports
    re.compile(r"^templates/.*"),     # templates
    re.compile(r"^notes/claude/.*"),  # Claude session autosaves
    re.compile(r"^scripts/.*"),       # scripts
    re.compile(r"^CLAUDE\.md$"),
    re.compile(r"^README\.md$"),
]

# Penalty weight per category; total score = sum(count * weight).
WEIGHTS = {
    "broken_wikilinks": 10,
    "broken_paths": 10,
    "missing_frontmatter": 3,
    "orphan_files": 2,
    "undated_todos": 1,
    "duplicate_basenames": 5,
}
|
||
|
||
|
||
def collect_md_files():
    """Return list[Path]: every .md file in scope (root files + included dirs)."""
    found = [VAULT / name for name in INCLUDE_ROOT_FILES
             if (VAULT / name).is_file()]
    for dirname in INCLUDE_DIRS:
        base = VAULT / dirname
        if not base.exists():
            continue
        # skip anything under an excluded directory, even if nested
        found.extend(
            md for md in base.rglob("*.md")
            if not any(part in EXCLUDE_DIRS for part in md.parts)
        )
    return found
|
||
|
||
|
||
def rel(p: Path) -> str:
    """Return *p* as a string path relative to the vault root."""
    relative = p.relative_to(VAULT)
    return str(relative)
|
||
|
||
|
||
def has_frontmatter(text: str) -> bool:
    """Return True if *text* begins with a `---` ... `---` frontmatter block.

    Fixes two defects of the old regex ``---\\n(.*?\\n)*?---\\n``:
    - it required a newline AFTER the closing ``---``, so frontmatter that
      closes on the file's very last line (no trailing newline) was wrongly
      reported as missing;
    - the nested non-greedy quantifier could backtrack pathologically on
      large files that start with ``---`` but never close it.
    """
    if not text.startswith("---\n"):
        return False
    # Look for a closing "---" on its own line (optionally the last line).
    return re.search(r"\n---[ \t]*(\n|$)", text[3:]) is not None
|
||
|
||
|
||
def strip_code(text: str) -> str:
    """Drop fenced ```...``` blocks and inline `...` spans.

    Keeps later regexes from matching wikilinks/paths that are only
    examples inside code.
    """
    without_fences = re.sub(r"```[\s\S]*?```", "", text)
    without_inline = re.sub(r"`[^`\n]*`", "", without_fences)
    return without_inline
|
||
|
||
|
||
def extract_wikilinks(text: str):
    """Return list[str] of wikilink targets, alias (`|`) and heading (`#`)
    parts stripped. Code blocks are ignored via strip_code()."""
    cleaned = strip_code(text)
    targets = []
    for match in re.finditer(r"\[\[([^\]]+)\]\]", cleaned):
        raw = match.group(1)
        name = raw.split("|", 1)[0].split("#", 1)[0].strip()
        if name:
            targets.append(name)
    return targets
|
||
|
||
|
||
def extract_md_paths(text: str):
    """Return relative markdown-link paths like [text](./foo.md) or
    (../foo/bar.md); http(s) URLs are skipped, code blocks ignored."""
    cleaned = strip_code(text)
    return [
        match.group(1)
        for match in re.finditer(r"\]\(([^)]+?\.md)(?:#[^)]*)?\)", cleaned)
        if not match.group(1).startswith("http")
    ]
|
||
|
||
|
||
def count_undated_todos(text: str) -> int:
    """Count '- [ ] ...' checkbox lines with no YYYY-MM-DD date on the line."""
    checkbox = re.compile(r"^\s*-\s*\[\s*\]\s+")
    iso_date = re.compile(r"\d{4}-\d{2}-\d{2}")
    return sum(
        1
        for line in text.splitlines()
        if checkbox.match(line) and not iso_date.search(line)
    )
|
||
|
||
|
||
def resolve_wikilink(target: str, all_basenames: dict, from_file: Path):
    """Resolve a wikilink target to a Path, or None if nothing matches.

    Strategies, in order:
      1. '../foo/bar' — relative to the linking file
      2. 'folder/bar' — from the vault root
      3. 'bar'        — by basename in any folder (Obsidian flat namespace)

    Bug fix: the old code used ``target.replace(".md", "")``, which removed
    ".md" ANYWHERE in the string (mangling e.g. "notes.md-archive/bar");
    now only a trailing ".md" suffix is stripped.
    """
    target_clean = target[:-3] if target.endswith(".md") else target

    # 1. relative links starting with ./ or ../ (or a leading "/")
    if target_clean.startswith((".", "/")):
        try:
            resolved = (from_file.parent / target_clean).resolve()
            # try both with and without the .md suffix appended
            for candidate in (resolved.with_suffix(".md"), resolved):
                if candidate.is_file() and str(candidate).endswith(".md"):
                    return candidate
        except Exception:
            # malformed path (e.g. with_suffix on empty name) — fall through
            pass

    # 2. full path from the vault root
    guess = VAULT / f"{target_clean}.md"
    if guess.is_file():
        return guess

    # 3. bare basename — flat-namespace lookup (first match wins)
    basename = target_clean.rsplit("/", 1)[-1]
    if basename in all_basenames:
        return all_basenames[basename][0]
    return None
|
||
|
||
|
||
def resolve_md_path(path: str, from_file: Path):
    """Resolve *path* relative to from_file's directory.

    Returns the resolved Path when it is an existing file, else None.
    """
    try:
        candidate = (from_file.parent / path).resolve()
    except Exception:
        return None
    return candidate if candidate.is_file() else None
|
||
|
||
|
||
def is_orphan_ok(relpath: str) -> bool:
    """True when *relpath* lies in a zone where having no backlinks is fine."""
    for pattern in ORPHAN_OK_PATTERNS:
        if pattern.match(relpath):
            return True
    return False
|
||
|
||
|
||
def main():
    """Scan the vault, compute the weighted health score, write reports.

    Outputs:
      audit/YYYY-MM-DD-health.md — human-readable report
      audit/health-latest.json   — machine-readable, for kb-agent-loop.sh

    Exits with status 1 when no markdown files are found.

    Bug fix: all file I/O now passes encoding="utf-8" explicitly. The old
    code relied on the locale default, which on Windows (cp1252) silently
    corrupted Cyrillic notes on read (errors="ignore") and raised
    UnicodeEncodeError when writing the Cyrillic report.
    """
    today = date.today().isoformat()
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    files = collect_md_files()
    if not files:
        print("no md files found", file=sys.stderr)
        sys.exit(1)

    # basename index (for wikilink resolution)
    basenames = defaultdict(list)
    for p in files:
        basenames[p.stem].append(p)

    # metrics
    broken_wl = []                      # (file, target)
    broken_paths = []                   # (file, path)
    missing_fm = []                     # file
    undated_todos_per_file = {}         # file: count
    duplicate_basenames = []            # (basename, files)
    incoming_links = defaultdict(set)   # file -> set of files linking TO it

    # CLAUDE.md, README.md, templates — service files, frontmatter not required
    fm_exempt = {"CLAUDE.md", "README.md", ".cursorrules"}

    for f in files:
        text = f.read_text(encoding="utf-8", errors="ignore")
        rel_f = rel(f)

        # the extra f.name check exempts nested README.md files too
        if rel_f not in fm_exempt and f.name != "README.md" and not has_frontmatter(text):
            missing_fm.append(rel_f)

        for target in extract_wikilinks(text):
            resolved = resolve_wikilink(target, basenames, f)
            if resolved is None:
                broken_wl.append((rel_f, target))
            else:
                incoming_links[resolved].add(f)

        for path in extract_md_paths(text):
            if resolve_md_path(path, f) is None:
                broken_paths.append((rel_f, path))

        n_todos = count_undated_todos(text)
        if n_todos > 0:
            undated_todos_per_file[rel_f] = n_todos

    # orphan = no incoming links and not in an OK zone
    orphans = []
    for f in files:
        if f not in incoming_links:
            rel_f = rel(f)
            if not is_orphan_ok(rel_f):
                orphans.append(rel_f)

    # duplicate basenames
    for name, paths in basenames.items():
        if len(paths) > 1:
            duplicate_basenames.append((name, [rel(p) for p in paths]))

    counts = {
        "broken_wikilinks": len(broken_wl),
        "broken_paths": len(broken_paths),
        "missing_frontmatter": len(missing_fm),
        "orphan_files": len(orphans),
        "undated_todos": sum(undated_todos_per_file.values()),
        "duplicate_basenames": len(duplicate_basenames),
    }
    score = sum(counts[k] * WEIGHTS[k] for k in counts)

    # JSON for the agent loop (score comparison between runs)
    latest = {
        "date": today,
        "score": score,
        "counts": counts,
        "weights": WEIGHTS,
        "files_scanned": len(files),
    }
    (OUT_DIR / "health-latest.json").write_text(
        json.dumps(latest, indent=2, ensure_ascii=False), encoding="utf-8")

    # Markdown report
    lines = [
        "---",
        f"date: {today}",
        "type: audit",
        "source: kb-health.py",
        f"score: {score}",
        "tags: [audit, health, metric]",
        "---",
        "",
        f"# KB health — {today}",
        "",
        f"**Score (меньше = лучше): `{score}`**",
        f"Проверено файлов: {len(files)}",
        "",
        "## Разбивка",
        "",
        "| Категория | Кол-во | Вес | Штраф |",
        "|---|---:|---:|---:|",
    ]
    for k in WEIGHTS:
        c = counts[k]
        w = WEIGHTS[k]
        lines.append(f"| {k} | {c} | {w} | {c * w} |")
    lines += ["| **ИТОГО** | | | **" + str(score) + "** |", ""]

    if broken_wl:
        lines += ["## Битые wikilinks", ""]
        lines += ["| Откуда | `[[таргет]]` |", "|---|---|"]
        for fr, tg in broken_wl[:50]:
            lines.append(f"| `{fr}` | `[[{tg}]]` |")
        if len(broken_wl) > 50:
            lines.append(f"| ... | +{len(broken_wl)-50} ещё |")
        lines.append("")

    if broken_paths:
        lines += ["## Битые relative-пути", ""]
        lines += ["| Откуда | Путь |", "|---|---|"]
        for fr, pt in broken_paths[:50]:
            lines.append(f"| `{fr}` | `{pt}` |")
        if len(broken_paths) > 50:
            lines.append(f"| ... | +{len(broken_paths)-50} ещё |")
        lines.append("")

    if missing_fm:
        lines += [f"## Без frontmatter ({len(missing_fm)})", ""]
        for f in missing_fm[:30]:
            lines.append(f"- `{f}`")
        if len(missing_fm) > 30:
            lines.append(f"- ... +{len(missing_fm)-30} ещё")
        lines.append("")

    if orphans:
        lines += [f"## Orphan — без бэклинков ({len(orphans)})", "",
                  "_Эти файлы никто не упоминает через `[[..]]`. Кандидаты на удаление или добавление ссылок._", ""]
        for f in orphans[:30]:
            lines.append(f"- `{f}`")
        if len(orphans) > 30:
            lines.append(f"- ... +{len(orphans)-30} ещё")
        lines.append("")

    if undated_todos_per_file:
        lines += [f"## TODO без даты ({sum(undated_todos_per_file.values())} шт в {len(undated_todos_per_file)} файлах)", ""]
        for f, n in sorted(undated_todos_per_file.items(), key=lambda x: -x[1])[:20]:
            lines.append(f"- `{f}` — {n} шт")
        lines.append("")

    if duplicate_basenames:
        lines += [f"## Дубликаты имён ({len(duplicate_basenames)})", ""]
        for name, paths in duplicate_basenames[:20]:
            lines.append(f"- `{name}.md`:")
            for p in paths:
                lines.append(f"  - `{p}`")
        lines.append("")

    lines += [
        "---",
        "*Генерируется `scripts/kb-health.py`. JSON-версия в `audit/health-latest.json` для agent-loop.*",
    ]

    out = OUT_DIR / f"{today}-health.md"
    out.write_text("\n".join(lines), encoding="utf-8")
    print(f"health report: {out}")
    print(f"  score: {score}")
    for k, v in counts.items():
        print(f"  {k}: {v}")
|
||
|
||
|
||
# Script entry point: run the health scan when executed directly.
if __name__ == "__main__":
    main()
|