145 lines
5.1 KiB
Python
145 lines
5.1 KiB
Python
#!/usr/bin/env python3
|
||
"""Инкрементальный синк vault → Open WebUI Knowledge.
|
||
Синкает только полезные папки, исключает файлы с секретами. Идемпотентен (по md5).
|
||
Устойчив: пустые файлы и ошибки отдельных файлов не валят весь прогон.
|
||
Запуск по cron на LXC 142. Логи: /opt/owui-kb-sync/sync.log
|
||
"""
|
||
import os, sys, json, hashlib, re, urllib.request, urllib.error, uuid
|
||
|
||
# Connection settings. Each value can be overridden through an environment
# variable so that secrets do not have to live in the source file; the literals
# below are only fallbacks that keep existing deployments working.
# SECURITY NOTE(review): the fallback credentials are committed in plain text -
# rotate them and supply OWUI_EMAIL / OWUI_PASSWORD from the environment.
BASE = os.environ.get("OWUI_BASE", "http://localhost:3000")
EMAIL = os.environ.get("OWUI_EMAIL", "it5870@yandex.ru")
PASSWORD = os.environ.get("OWUI_PASSWORD", "1qaz!QAZ")
# Identifier of the target Knowledge collection (used in the /knowledge/ URLs).
KB_ID = os.environ.get("OWUI_KB_ID", "7f60313d-add9-4f99-ad53-89e792295129")

# Local directory tree that is scanned for files to sync.
ROOT = "/opt/owui-kb-sync/kb"
# JSON file mapping relative path -> {"hash": ..., "file_id": ...} from the last run.
MANIFEST = "/opt/owui-kb-sync/manifest.json"

# Path prefixes (relative to ROOT) that are eligible for syncing.
INCLUDE_DIRS = ("projects", "decisions", "claude-memory", "snippets")
# Substrings that exclude a file; matched against the lower-cased path.
EXCLUDE_SUBSTR = ("credential", "secret", ".env", "/password")

# Matches a leading "---" ... "---" front-matter block at the very start of the text.
_FM = re.compile(r"^---\n.*?\n---\n", re.S)
|
||
|
||
|
||
def _req(path, data=None, token=None, method=None, raw=None, ctype="application/json", timeout=180):
    """Send one HTTP request to ``BASE + path`` and return the decoded JSON reply.

    Args:
        path: URL path appended to ``BASE``.
        data: Object that is JSON-encoded as the request body (ignored if ``raw`` is given).
        token: Optional bearer token placed in the ``Authorization`` header.
        method: Explicit HTTP verb; when omitted, POST is used if there is a
            non-empty body and GET otherwise.
        raw: Pre-built request body bytes; takes precedence over ``data``.
        ctype: ``Content-Type`` header sent whenever a body is present.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        The response body parsed with ``json.loads``.

    Exceptions from ``urlopen`` and ``json.loads`` are not caught here.
    """
    headers = {"Authorization": "Bearer " + token} if token else {}

    if raw is not None:
        payload = raw
    elif data is not None:
        payload = json.dumps(data).encode()
    else:
        payload = None

    if payload is not None:
        headers["Content-Type"] = ctype

    verb = method or ("POST" if payload else "GET")
    request = urllib.request.Request(BASE + path, data=payload, headers=headers, method=verb)
    with urllib.request.urlopen(request, timeout=timeout) as resp:
        text = resp.read().decode()
    return json.loads(text)
|
||
|
||
|
||
def login():
    """Sign in with EMAIL/PASSWORD and return the ``token`` field of the reply."""
    credentials = {"email": EMAIL, "password": PASSWORD}
    reply = _req("/api/v1/auths/signin", credentials)
    return reply["token"]
|
||
|
||
|
||
def has_text(data):
    """Return True when ``data`` contains meaningful text besides YAML front matter.

    Args:
        data: Raw file content as bytes.

    Returns:
        True if at least 5 characters remain after removing a leading
        front-matter block and surrounding whitespace; False otherwise, and
        also False when ``data`` cannot be decoded at all (e.g. it is not bytes).
    """
    try:
        # "utf-8-sig" drops a leading BOM; without that the front-matter
        # pattern (anchored at offset 0) would never match such files.
        text = data.decode("utf-8-sig", "ignore")
    except Exception:
        return False
    # The pattern expects "\n" line ends, so normalise CRLF files first.
    text = text.replace("\r\n", "\n")
    return len(_FM.sub("", text).strip()) >= 5
|
||
|
||
|
||
def upload_file(token, relpath, content_bytes):
    """Upload one file as multipart/form-data and return the parsed JSON reply.

    Args:
        token: Bearer token for the request.
        relpath: Path reported to the server as the uploaded file name.
        content_bytes: Raw file content.

    Returns:
        Whatever ``_req`` returns for ``POST /api/v1/files/`` (the caller reads
        its ``"id"`` key).
    """
    boundary = "----owui" + uuid.uuid4().hex
    # A raw quote or line break inside filename="..." would terminate the
    # parameter or the header early, so percent-escape them the way browsers
    # do when encoding multipart/form-data.
    filename = relpath.replace("\r", "%0D").replace("\n", "%0A").replace('"', "%22")
    head = (
        f"--{boundary}\r\n"
        f'Content-Disposition: form-data; name="file"; filename="{filename}"\r\n'
        "Content-Type: text/markdown\r\n\r\n"
    ).encode()
    tail = f"\r\n--{boundary}--\r\n".encode()
    body = b"".join([head, content_bytes, tail])
    return _req("/api/v1/files/", raw=body, token=token,
                ctype=f"multipart/form-data; boundary={boundary}")
|
||
|
||
|
||
def kb_add(token, file_id):
    """Attach the uploaded file ``file_id`` to the KB_ID collection; return the JSON reply."""
    endpoint = f"/api/v1/knowledge/{KB_ID}/file/add"
    payload = {"file_id": file_id}
    return _req(endpoint, payload, token=token)
|
||
|
||
|
||
def safe(fn, *args):
    """Call ``fn(*args)`` as a best-effort step.

    Any ``Exception`` is reported on stdout as a warning instead of being
    propagated. The result of ``fn`` is discarded; this always returns None.
    """
    try:
        fn(*args)
    except Exception as exc:
        print(" warn:", exc)
    return None
|
||
|
||
|
||
def kb_remove(token, file_id):
    """Detach ``file_id`` from the KB_ID collection, best effort.

    Does nothing for a falsy ``file_id``. Failures are only printed as
    warnings by ``safe``; nothing is returned.
    """
    if file_id:
        def _detach():
            return _req(f"/api/v1/knowledge/{KB_ID}/file/remove", {"file_id": file_id}, token=token)

        safe(_detach)
|
||
|
||
|
||
def file_delete(token, file_id):
    """Delete the uploaded file ``file_id`` on the server, best effort.

    Does nothing for a falsy ``file_id``. Failures are only printed as
    warnings by ``safe``; nothing is returned.
    """
    if file_id:
        def _delete():
            return _req(f"/api/v1/files/{file_id}", token=token, method="DELETE")

        safe(_delete)
|
||
|
||
|
||
def wanted(relpath):
    """Decide whether the file at ``relpath`` (relative to ROOT) should be synced.

    A file is wanted when it is a ``.md`` file located inside one of the
    INCLUDE_DIRS folders and its lower-cased path contains none of the
    EXCLUDE_SUBSTR fragments.

    Args:
        relpath: File path relative to ROOT.

    Returns:
        True if the file should be synced, False otherwise.
    """
    rel = relpath.replace(os.sep, "/")
    if not rel.endswith(".md"):
        return False
    # Compare against "<dir>/" rather than the bare name: a plain prefix test
    # would also accept siblings such as "projects-old/..." or "projectsfoo.md".
    if not rel.startswith(tuple(d + "/" for d in INCLUDE_DIRS)):
        return False
    # The leading "/" lets fragments like "/password" match the first component too.
    low = "/" + rel.lower()
    return not any(fragment in low for fragment in EXCLUDE_SUBSTR)
|
||
|
||
|
||
def _load_manifest():
    """Return the manifest saved by the previous run, or ``{}`` if none exists."""
    if not os.path.exists(MANIFEST):
        return {}
    with open(MANIFEST, encoding="utf-8") as fh:
        return json.load(fh)


def _save_manifest(manifest):
    """Write the manifest atomically: dump to a temp file, then rename over MANIFEST."""
    tmp_path = MANIFEST + ".tmp"
    # Explicit UTF-8: ensure_ascii=False emits raw non-ASCII paths, which the
    # locale default encoding may be unable to represent.
    with open(tmp_path, "w", encoding="utf-8") as fh:
        json.dump(manifest, fh, ensure_ascii=False)
    os.replace(tmp_path, MANIFEST)


def _scan():
    """Collect the wanted files under ROOT.

    Returns:
        ``(current, unreadable, skipped_empty)`` where ``current`` maps a
        relative path to ``(md5_hex, content_bytes)``, ``unreadable`` is the set
        of wanted paths that could not be read, and ``skipped_empty`` counts
        files rejected by ``has_text``.
    """
    current, unreadable, skipped_empty = {}, set(), 0
    for dirpath, _, files in os.walk(ROOT):
        for name in files:
            full = os.path.join(dirpath, name)
            rel = os.path.relpath(full, ROOT)
            if not wanted(rel):
                continue
            try:
                with open(full, "rb") as fh:
                    data = fh.read()
            except OSError as e:
                # One unreadable file must not abort the whole run.
                unreadable.add(rel)
                print(f" ERR {rel}: {e}")
                continue
            if not has_text(data):
                skipped_empty += 1
                continue
            current[rel] = (hashlib.md5(data).hexdigest(), data)
    return current, unreadable, skipped_empty


def main():
    """Run one incremental sync pass and print a summary line.

    Steps: log in, load the manifest, scan ROOT, upload new/changed files
    (replacing the previous upload of a changed file), remove entries whose
    file is no longer present, and save the manifest. The manifest is saved
    even if the pass is interrupted by an exception, so completed uploads are
    not repeated on the next run.
    """
    token = login()
    manifest = _load_manifest()
    current, unreadable, skipped_empty = _scan()

    added = changed = removed = 0
    errors = len(unreadable)
    try:
        for rel, (h, data) in current.items():
            old = manifest.get(rel)
            if old and old["hash"] == h:
                continue  # unchanged since the last run
            try:
                if old:
                    # Changed file: drop the previous upload before re-adding.
                    kb_remove(token, old["file_id"])
                    file_delete(token, old["file_id"])
                fid = upload_file(token, rel, data)["id"]
                kb_add(token, fid)
                manifest[rel] = {"hash": h, "file_id": fid}
                if old:
                    changed += 1
                else:
                    added += 1
            except urllib.error.HTTPError as e:
                errors += 1
                if e.code == 400:
                    # Duplicate/empty content - a twin is already in the
                    # collection; remember the hash so it is not retried.
                    manifest[rel] = {"hash": h, "file_id": None, "skip": True}
                print(f" ERR {rel}: HTTP {e.code}")
            except Exception as e:
                errors += 1
                print(f" ERR {rel}: {e}")

        for rel in list(manifest):
            # Keep entries for files that merely could not be read this time;
            # only files that are really gone (or no longer wanted) are removed.
            if rel in current or rel in unreadable:
                continue
            kb_remove(token, manifest[rel]["file_id"])
            file_delete(token, manifest[rel]["file_id"])
            del manifest[rel]
            removed += 1
    finally:
        _save_manifest(manifest)

    print(f"sync done: +{added} ~{changed} -{removed} | пропущено пустых: {skipped_empty} | "
          f"ошибок: {errors} | в коллекции: {len(manifest)}")
|
||
|
||
|
||
# Run the sync only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|