Files
knowledge-base/snippets/owui-kb-sync.py

145 lines
5.1 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""Incremental sync of the vault into Open WebUI Knowledge.
Syncs only the useful folders and excludes files that contain secrets. Idempotent (keyed by md5).
Resilient: empty files and errors on individual files do not abort the whole run.
Runs from cron on LXC 142. Logs: /opt/owui-kb-sync/sync.log
"""
import os, sys, json, hashlib, re, urllib.request, urllib.error, uuid
# Base URL of the Open WebUI instance this script talks to.
BASE = "http://localhost:3000"
# Sign-in credentials. OWUI_EMAIL / OWUI_PASSWORD from the environment take
# precedence; the literals remain only as a backward-compatible fallback.
# NOTE(review): a password committed to the repository should be rotated and
# this fallback removed once the cron environment supplies the variables.
EMAIL = os.environ.get("OWUI_EMAIL", "it5870@yandex.ru")
PASSWORD = os.environ.get("OWUI_PASSWORD", "1qaz!QAZ")
# Identifier of the knowledge base that files are attached to.
KB_ID = "7f60313d-add9-4f99-ad53-89e792295129"
# Directory that is walked for files to sync.
ROOT = "/opt/owui-kb-sync/kb"
# JSON file mapping relative path -> {"hash", "file_id"} from the last run.
MANIFEST = "/opt/owui-kb-sync/manifest.json"
# Top-level folders under ROOT that are eligible for syncing.
INCLUDE_DIRS = ("projects", "decisions", "claude-memory", "snippets")
# A path containing any of these substrings is skipped; the path is
# lower-cased (and prefixed with "/") before matching.
EXCLUDE_SUBSTR = ("credential", "secret", ".env", "/password")
# Matches a YAML front-matter block ("---" ... "---") at the very start of the text.
_FM = re.compile(r"^---\n.*?\n---\n", re.S)
def _req(path, data=None, token=None, method=None, raw=None, ctype="application/json", timeout=180):
    """Send one HTTP request to BASE + path and return the decoded JSON reply.

    Args:
        path: URL path appended to BASE.
        data: object serialized with json.dumps and sent as the body
            (ignored when *raw* is given).
        token: bearer token; when truthy an Authorization header is added.
        method: HTTP verb; defaults to POST when a body is sent, GET otherwise.
        raw: pre-encoded body bytes, sent as-is.
        ctype: Content-Type header value used whenever a body is sent.
        timeout: socket timeout in seconds passed to urlopen.

    Returns:
        The parsed JSON response, or None when the response body is empty.

    Raises:
        urllib.error.HTTPError / urllib.error.URLError: propagated from urlopen.
        json.JSONDecodeError: when a non-empty response is not valid JSON.
    """
    headers = {}
    if token:
        headers["Authorization"] = "Bearer " + token

    if raw is not None:
        body = raw
    elif data is not None:
        body = json.dumps(data).encode()
    else:
        body = None
    if body is not None:
        headers["Content-Type"] = ctype

    # Decide on presence of a body, not its truthiness: an empty payload
    # (b"") must still be sent as POST rather than silently becoming a GET.
    verb = method or ("GET" if body is None else "POST")
    request = urllib.request.Request(BASE + path, data=body, headers=headers, method=verb)
    with urllib.request.urlopen(request, timeout=timeout) as resp:
        payload = resp.read()

    # A successful reply without a body would otherwise crash json.loads.
    if not payload.strip():
        return None
    return json.loads(payload.decode())
def login():
    """Sign in with EMAIL / PASSWORD and return the "token" field of the reply."""
    credentials = {"email": EMAIL, "password": PASSWORD}
    response = _req("/api/v1/auths/signin", credentials)
    return response["token"]
def has_text(data):
    """Tell whether *data* holds meaningful text besides YAML front matter.

    The bytes are decoded as UTF-8 (undecodable bytes dropped), a leading
    front-matter block is removed, and surrounding whitespace is stripped.
    Returns True when at least 5 characters remain; returns False when
    *data* cannot be decoded at all.
    """
    try:
        decoded = data.decode("utf-8", "ignore")
    except Exception:
        return False
    remainder = _FM.sub("", decoded).strip()
    if len(remainder) < 5:
        return False
    return True
def upload_file(token, relpath, content_bytes):
    """Upload one file as multipart/form-data and return the parsed reply.

    Args:
        token: bearer token for the request.
        relpath: path sent as the multipart "filename".
        content_bytes: raw file content.

    Returns:
        Whatever _req returns for POST /api/v1/files/ (the caller reads "id").
    """
    boundary = "----owui" + uuid.uuid4().hex
    # The filename sits inside a quoted header parameter: a raw quote or line
    # break in the path would terminate the parameter or inject header lines.
    # Percent-escape those three characters as browsers do for form-data.
    quoted_name = (
        relpath.replace("\r", "%0D").replace("\n", "%0A").replace('"', "%22")
    )
    body = b"".join([
        f"--{boundary}\r\n".encode(),
        f'Content-Disposition: form-data; name="file"; filename="{quoted_name}"\r\n'.encode(),
        b"Content-Type: text/markdown\r\n\r\n",
        content_bytes,
        f"\r\n--{boundary}--\r\n".encode(),
    ])
    return _req("/api/v1/files/", raw=body, token=token,
                ctype=f"multipart/form-data; boundary={boundary}")
def kb_add(token, file_id):
    """Attach an uploaded file to knowledge base KB_ID and return the parsed reply."""
    endpoint = f"/api/v1/knowledge/{KB_ID}/file/add"
    payload = {"file_id": file_id}
    return _req(endpoint, payload, token=token)
def safe(fn, *a):
    """Call fn(*a) on a best-effort basis.

    Any Exception is reported on stdout as a warning instead of propagating.
    The callee's return value is discarded; this function always returns None.
    """
    try:
        fn(*a)
    except Exception as exc:
        print(" warn:", exc)
    return None
def kb_remove(token, file_id):
    """Detach a file from knowledge base KB_ID; failures are only warned about.

    Does nothing when *file_id* is falsy.
    """
    if file_id:
        def _detach():
            return _req(f"/api/v1/knowledge/{KB_ID}/file/remove",
                        {"file_id": file_id}, token=token)

        safe(_detach)
def file_delete(token, file_id):
    """Delete an uploaded file by id; failures are only warned about.

    Does nothing when *file_id* is falsy.
    """
    if file_id:
        def _delete():
            return _req(f"/api/v1/files/{file_id}", token=token, method="DELETE")

        safe(_delete)
def wanted(relpath):
    """Decide whether the file at *relpath* (relative to ROOT) should be synced.

    A file qualifies when it lives inside one of INCLUDE_DIRS, ends with
    ".md", and its lower-cased path contains none of EXCLUDE_SUBSTR.
    Assumes "/" as the path separator.
    """
    # Require a separator after the directory name: a bare startswith() would
    # also accept look-alike siblings such as "projects-old/" or "snippets_x/".
    in_included_dir = any(
        relpath.startswith(directory.rstrip("/") + "/") for directory in INCLUDE_DIRS
    )
    if not in_included_dir or not relpath.endswith(".md"):
        return False
    # The leading "/" lets patterns like "/password" match a first path component.
    lowered = "/" + relpath.lower()
    return not any(marker in lowered for marker in EXCLUDE_SUBSTR)
def _load_manifest():
    """Return the manifest saved by the previous run, or {} when none exists."""
    if not os.path.exists(MANIFEST):
        return {}
    with open(MANIFEST, encoding="utf-8") as fh:
        return json.load(fh)


def _save_manifest(manifest):
    """Write *manifest* to MANIFEST via a temp file so a crash cannot truncate it."""
    tmp_path = MANIFEST + ".tmp"
    with open(tmp_path, "w", encoding="utf-8") as fh:
        json.dump(manifest, fh, ensure_ascii=False)
    os.replace(tmp_path, MANIFEST)


def _scan_vault():
    """Walk ROOT and return (current, unreadable, skipped_empty).

    current maps relative path -> (md5 hex digest, content bytes) for every
    wanted file with text; unreadable is the set of wanted paths that could
    not be read; skipped_empty counts wanted files rejected by has_text.
    """
    current = {}
    unreadable = set()
    skipped_empty = 0
    for dirpath, _, files in os.walk(ROOT):
        for name in files:
            full_path = os.path.join(dirpath, name)
            rel = os.path.relpath(full_path, ROOT)
            if not wanted(rel):
                continue
            try:
                with open(full_path, "rb") as fh:
                    data = fh.read()
            except OSError as e:
                # One unreadable file must not abort the whole run.
                unreadable.add(rel)
                print(f" ERR {rel}: {e}")
                continue
            if not has_text(data):
                skipped_empty += 1
                continue
            current[rel] = (hashlib.md5(data).hexdigest(), data)
    return current, unreadable, skipped_empty


def main():
    """Run one incremental sync pass and print a summary line.

    Steps: sign in, load the manifest, scan ROOT, upload new or changed files
    (replacing the previous copy), remove files that disappeared locally, and
    save the manifest. Errors on individual files are counted and printed;
    they do not stop the run.
    """
    token = login()
    manifest = _load_manifest()
    current, unreadable, skipped_empty = _scan_vault()
    added = changed = removed = 0
    errors = len(unreadable)
    try:
        for rel, (h, data) in current.items():
            old = manifest.get(rel)
            if old and old["hash"] == h:
                continue
            fid = None
            try:
                if old:
                    kb_remove(token, old["file_id"])
                    file_delete(token, old["file_id"])
                fid = upload_file(token, rel, data)["id"]
                kb_add(token, fid)
            except urllib.error.HTTPError as e:
                errors += 1
                # Upload may have succeeded before kb_add failed: do not leave
                # the new file orphaned (file_delete ignores a falsy id).
                file_delete(token, fid)
                if e.code == 400:
                    # Duplicate/empty content: a twin is already in the
                    # collection, so do not retry on later runs.
                    manifest[rel] = {"hash": h, "file_id": None, "skip": True}
                elif old:
                    # The old copy was already removed; clear its hash so the
                    # next run retries even if the content reverts.
                    manifest[rel] = {**old, "hash": None}
                print(f" ERR {rel}: HTTP {e.code}")
            except Exception as e:
                errors += 1
                file_delete(token, fid)
                if old:
                    manifest[rel] = {**old, "hash": None}
                print(f" ERR {rel}: {e}")
            else:
                manifest[rel] = {"hash": h, "file_id": fid}
                if old:
                    changed += 1
                else:
                    added += 1
        for rel in list(manifest):
            # Keep entries for files that exist but could not be read this run.
            if rel in current or rel in unreadable:
                continue
            kb_remove(token, manifest[rel]["file_id"])
            file_delete(token, manifest[rel]["file_id"])
            del manifest[rel]
            removed += 1
    finally:
        # Persist progress even when the run is interrupted, so files already
        # uploaded stay tracked.
        _save_manifest(manifest)
    print(f"sync done: +{added} ~{changed} -{removed} | пропущено пустых: {skipped_empty} | "
          f"ошибок: {errors} | в коллекции: {len(manifest)}")
# Run the sync only when the file is executed as a script, not when imported.
if __name__ == "__main__":
    main()