#!/usr/bin/env python3
"""
kb-health — objective KB quality metric (karpathy-style val_bpb).

Counts penalties; lower is better. Categories:

- broken_wikilinks      [[foo]] does not resolve to any file
- broken_paths          [text](./foo.md) — path does not exist
- missing_frontmatter   .md file without a leading ``---`` frontmatter block
- orphan_files          no incoming wikilinks, not in a system folder
- undated_todos         "- [ ] ..." line with no YYYY-MM-DD date on it
- duplicate_basenames   two files with the same basename in different folders

Weights (total score = weighted sum):

    broken_wikilinks    * 10
    broken_paths        * 10
    missing_frontmatter * 3
    orphan_files        * 2
    undated_todos       * 1
    duplicate_basenames * 5

Output:
    audit/YYYY-MM-DD-health.md — human-readable report
    audit/health-latest.json   — JSON for kb-agent-loop.sh
                                 (score comparison between runs)
"""
import json
import re
import sys
from collections import defaultdict
from datetime import date
from pathlib import Path

VAULT = Path(__file__).resolve().parent.parent
OUT_DIR = VAULT / "audit"

# Folders/files in scanning scope.
INCLUDE_DIRS = ["decisions", "notes", "projects", "snippets", "daily",
                "claude-memory", "templates", "scripts", "audit"]
INCLUDE_ROOT_FILES = ["CLAUDE.md", "README.md"]

# Folders excluded entirely.
EXCLUDE_DIRS = {".git", ".obsidian", ".claude"}

# Files where orphan status is normal (system/service files).
ORPHAN_OK_PATTERNS = [
    re.compile(r"^daily/.*"),         # daily notes are rarely backlinked
    re.compile(r"^audit/.*"),         # generated audit reports
    re.compile(r"^templates/.*"),     # templates
    re.compile(r"^notes/claude/.*"),  # Claude session autosaves
    re.compile(r"^scripts/.*"),       # scripts
    re.compile(r"^CLAUDE\.md$"),
    re.compile(r"^README\.md$"),
]

WEIGHTS = {
    "broken_wikilinks": 10,
    "broken_paths": 10,
    "missing_frontmatter": 3,
    "orphan_files": 2,
    "undated_todos": 1,
    "duplicate_basenames": 5,
}

# Compiled once: these run on every scanned file.
_FENCED_CODE_RE = re.compile(r"```[\s\S]*?```")
_INLINE_CODE_RE = re.compile(r"`[^`\n]*`")
_WIKILINK_RE = re.compile(r"\[\[([^\]]+)\]\]")
_MD_PATH_RE = re.compile(r"\]\(([^)]+?\.md)(?:#[^)]*)?\)")
_TODO_RE = re.compile(r"^\s*-\s*\[\s*\]\s+")
_DATE_RE = re.compile(r"\d{4}-\d{2}-\d{2}")


def collect_md_files():
    """Return list[Path] of all .md files in scope (root files + INCLUDE_DIRS)."""
    files = []
    for name in INCLUDE_ROOT_FILES:
        p = VAULT / name
        if p.is_file():
            files.append(p)
    for dname in INCLUDE_DIRS:
        root = VAULT / dname
        if not root.exists():
            continue
        for p in root.rglob("*.md"):
            # Skip anything nested under an excluded folder.
            if any(part in EXCLUDE_DIRS for part in p.parts):
                continue
            files.append(p)
    return files


def rel(p: Path) -> str:
    """Vault-relative path of *p* as a string."""
    return str(p.relative_to(VAULT))


def has_frontmatter(text: str) -> bool:
    """True if *text* starts with a ``---`` fenced frontmatter block.

    The closing ``---`` must be on its own line; a closing fence at EOF
    without a trailing newline also counts (fix: the old regex rejected it).
    """
    if not text.startswith("---\n"):
        return False
    # text[3:] starts at the opening fence's newline, so "\n---\n" matches
    # a line that is exactly "---" (and "\n---" at EOF closes the block too).
    rest = text[3:]
    return "\n---\n" in rest or rest.endswith("\n---")


def strip_code(text: str) -> str:
    """Remove inline `...` and fenced ```...``` spans so regexes skip code examples."""
    text = _FENCED_CODE_RE.sub("", text)
    return _INLINE_CODE_RE.sub("", text)


def extract_wikilinks(text: str):
    """Return list[str] of wikilink targets without alias/heading; code blocks ignored."""
    out = []
    for m in _WIKILINK_RE.finditer(strip_code(text)):
        # [[target|alias]] and [[target#heading]] both reduce to the target.
        target = m.group(1).split("|")[0].split("#")[0].strip()
        if target:
            out.append(target)
    return out


def extract_md_paths(text: str):
    """Return relative paths like [text](./foo.md) or (../foo/bar.md); URLs skipped."""
    out = []
    for m in _MD_PATH_RE.finditer(strip_code(text)):
        path = m.group(1)
        if path.startswith("http"):
            continue
        out.append(path)
    return out


def count_undated_todos(text: str) -> int:
    """Count '- [ ] ...' lines with no YYYY-MM-DD date on the same line."""
    count = 0
    for line in text.splitlines():
        if _TODO_RE.match(line) and not _DATE_RE.search(line):
            count += 1
    return count


def resolve_wikilink(target: str, all_basenames: dict, from_file: Path):
    """Find the file a wikilink target points to. Returns Path or None.

    Strategies, in order:
      1. '../foo/bar' — relative to the source file
      2. 'folder/bar' — from the vault root
      3. 'bar'        — by basename anywhere (Obsidian flat namespace)
    """
    # Strip only a trailing ".md" — str.replace would also mangle a ".md"
    # in the middle of a name (e.g. "archive.md.backup").
    target_clean = target[:-3] if target.endswith(".md") else target

    # Relative with ../ or ./
    if target_clean.startswith((".", "/")):
        try:
            resolved = (from_file.parent / target_clean).resolve()
            # Append .md if the link omitted it.
            for c in (resolved.with_suffix(".md"), resolved):
                if c.is_file() and str(c).endswith(".md"):
                    return c
        except Exception:
            pass

    # Full path from the vault root.
    guess = VAULT / f"{target_clean}.md"
    if guess.is_file():
        return guess

    # Bare basename — flat namespace.
    basename = target_clean.rsplit("/", 1)[-1]
    if basename in all_basenames:
        return all_basenames[basename][0]
    return None


def resolve_md_path(path: str, from_file: Path):
    """Resolve a relative markdown path from *from_file*; Path or None."""
    try:
        resolved = (from_file.parent / path).resolve()
        if resolved.is_file():
            return resolved
    except Exception:
        pass
    return None


def is_orphan_ok(relpath: str) -> bool:
    """True if orphan status is acceptable for this vault-relative path."""
    return any(pat.match(relpath) for pat in ORPHAN_OK_PATTERNS)


def main():
    today = date.today().isoformat()
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    files = collect_md_files()
    if not files:
        print("no md files found", file=sys.stderr)
        sys.exit(1)

    # Basename index (for wikilink resolution).
    basenames = defaultdict(list)
    for p in files:
        basenames[p.stem].append(p)

    # Metrics accumulators.
    broken_wl = []                      # (file, target)
    broken_paths = []                   # (file, path)
    missing_fm = []                     # file
    undated_todos_per_file = {}         # file -> count
    duplicate_basenames = []            # (basename, files)
    incoming_links = defaultdict(set)   # file -> set of files linking TO it

    # CLAUDE.md, README.md, etc. are service files — frontmatter not required.
    fm_exempt = {"CLAUDE.md", "README.md", ".cursorrules"}

    for f in files:
        # Vault content is UTF-8 markdown; don't depend on the locale encoding.
        text = f.read_text(encoding="utf-8", errors="ignore")
        rel_f = rel(f)

        if rel_f not in fm_exempt and f.name != "README.md" and not has_frontmatter(text):
            missing_fm.append(rel_f)

        for target in extract_wikilinks(text):
            resolved = resolve_wikilink(target, basenames, f)
            if resolved is None:
                broken_wl.append((rel_f, target))
            else:
                incoming_links[resolved].add(f)

        for path in extract_md_paths(text):
            if resolve_md_path(path, f) is None:
                broken_paths.append((rel_f, path))

        n_todos = count_undated_todos(text)
        if n_todos > 0:
            undated_todos_per_file[rel_f] = n_todos

    # Orphan = no incoming links and not in an OK zone.
    orphans = []
    for f in files:
        if f not in incoming_links:
            rel_f = rel(f)
            if not is_orphan_ok(rel_f):
                orphans.append(rel_f)

    # Duplicate basenames.
    for name, paths in basenames.items():
        if len(paths) > 1:
            duplicate_basenames.append((name, [rel(p) for p in paths]))

    counts = {
        "broken_wikilinks": len(broken_wl),
        "broken_paths": len(broken_paths),
        "missing_frontmatter": len(missing_fm),
        "orphan_files": len(orphans),
        "undated_todos": sum(undated_todos_per_file.values()),
        "duplicate_basenames": len(duplicate_basenames),
    }
    score = sum(counts[k] * WEIGHTS[k] for k in counts)

    # JSON (machine-readable, consumed by kb-agent-loop.sh).
    latest = {
        "date": today,
        "score": score,
        "counts": counts,
        "weights": WEIGHTS,
        "files_scanned": len(files),
    }
    (OUT_DIR / "health-latest.json").write_text(
        json.dumps(latest, indent=2, ensure_ascii=False), encoding="utf-8")

    # Markdown report.
    lines = [
        "---",
        f"date: {today}",
        "type: audit",
        "source: kb-health.py",
        f"score: {score}",
        "tags: [audit, health, metric]",
        "---",
        "",
        f"# KB health — {today}",
        "",
        f"**Score (меньше = лучше): `{score}`**",
        f"Проверено файлов: {len(files)}",
        "",
        "## Разбивка",
        "",
        "| Категория | Кол-во | Вес | Штраф |",
        "|---|---:|---:|---:|",
    ]
    for k in WEIGHTS:
        c = counts[k]
        w = WEIGHTS[k]
        lines.append(f"| {k} | {c} | {w} | {c * w} |")
    lines += ["| **ИТОГО** | | | **" + str(score) + "** |", ""]

    if broken_wl:
        lines += ["## Битые wikilinks", ""]
        lines += ["| Откуда | `[[таргет]]` |", "|---|---|"]
        for fr, tg in broken_wl[:50]:
            lines.append(f"| `{fr}` | `[[{tg}]]` |")
        if len(broken_wl) > 50:
            lines.append(f"| ... | +{len(broken_wl)-50} ещё |")
        lines.append("")

    if broken_paths:
        lines += ["## Битые relative-пути", ""]
        lines += ["| Откуда | Путь |", "|---|---|"]
        for fr, pt in broken_paths[:50]:
            lines.append(f"| `{fr}` | `{pt}` |")
        if len(broken_paths) > 50:
            lines.append(f"| ... | +{len(broken_paths)-50} ещё |")
        lines.append("")

    if missing_fm:
        lines += [f"## Без frontmatter ({len(missing_fm)})", ""]
        for mf in missing_fm[:30]:
            lines.append(f"- `{mf}`")
        if len(missing_fm) > 30:
            lines.append(f"- ... +{len(missing_fm)-30} ещё")
        lines.append("")

    if orphans:
        lines += [f"## Orphan — без бэклинков ({len(orphans)})", "",
                  "_Эти файлы никто не упоминает через `[[..]]`. Кандидаты на удаление или добавление ссылок._", ""]
        for of in orphans[:30]:
            lines.append(f"- `{of}`")
        if len(orphans) > 30:
            lines.append(f"- ... +{len(orphans)-30} ещё")
        lines.append("")

    if undated_todos_per_file:
        lines += [f"## TODO без даты ({sum(undated_todos_per_file.values())} шт в {len(undated_todos_per_file)} файлах)", ""]
        for tf, n in sorted(undated_todos_per_file.items(), key=lambda x: -x[1])[:20]:
            lines.append(f"- `{tf}` — {n} шт")
        lines.append("")

    if duplicate_basenames:
        lines += [f"## Дубликаты имён ({len(duplicate_basenames)})", ""]
        for name, paths in duplicate_basenames[:20]:
            lines.append(f"- `{name}.md`:")
            for p in paths:
                lines.append(f"  - `{p}`")
        lines.append("")

    lines += [
        "---",
        "*Генерируется `scripts/kb-health.py`. JSON-версия в `audit/health-latest.json` для agent-loop.*",
    ]

    out = OUT_DIR / f"{today}-health.md"
    out.write_text("\n".join(lines), encoding="utf-8")
    print(f"health report: {out}")
    print(f" score: {score}")
    for k, v in counts.items():
        print(f" {k}: {v}")


if __name__ == "__main__":
    main()