# knowledge-base/snippets/telegraph-md-to-page.py

#!/usr/bin/env python3
"""Обновить Telegra.ph страницу содержимым markdown-файла."""
import json
import re
import subprocess
import sys
import urllib.parse
import urllib.request

# Telegra.ph settings used by main() when it calls the editPage endpoint.
# NOTE(review): this looks like a live access token committed to source —
# consider loading it from the environment / a secrets store and rotating it.
ACCESS_TOKEN = "c38dcadb86e6edd7efc76496d9171d38beef6dc0f6a7ef2cd79bbae70e46"
# Path of the page to overwrite; appended to the editPage URL.
PATH = "Nastrojka-VPN-04-24-2"
# Sent as the `title` form field.
TITLE = "Настройка VPN"
# Sent as the `author_name` form field.
AUTHOR = "Олег"

# --- Inline markdown parser ---
# Order: 1) protect `code` and [text](url) with placeholders
#        2) parse bold/italic on remaining text
#        3) restore placeholders (at every nesting level)

# `code`: shortest run of non-backtick characters between two backticks.
CODE_RE = re.compile(r"`([^`]+?)`")
# [text](url): text may not contain "]", url may not contain ")".
LINK_RE = re.compile(r"\[([^\]]+)\]\(([^)]+)\)")
# **bold**: shortest run between two double-asterisk markers.
BOLD_RE = re.compile(r"\*\*(.+?)\*\*")
# *italic* / _italic_: the markers must not touch a word character or another
# marker on the outside, and the content must not start with whitespace, so
# snake_case identifiers and expressions like 2*3*4 are left alone.
ITALIC_STAR_RE = re.compile(r"(?<![*\w])\*([^\s*][^*]*?)\*(?![*\w])")
ITALIC_UND_RE = re.compile(r"(?<![_\w])_([^\s_][^_]*?)_(?![_\w])")

# Matches one placeholder key produced inside parse_inline; the capturing
# group makes re.split keep the keys in its output.
_PLACEHOLDER_RE = re.compile(r"(\x00PH\d+\x00)")
# Emphasis passes in the order they are applied: bold first, then italics.
_EMPHASIS_PASSES = (
    (BOLD_RE, "strong"),
    (ITALIC_STAR_RE, "em"),
    (ITALIC_UND_RE, "em"),
)


def parse_inline(text):
    """Convert inline markdown into a list of Telegraph content nodes.

    Supports `code`, [text](url), **bold**, *italic* and _italic_. Code spans
    are taken verbatim; link text and emphasis content may contain further
    inline formatting.

    Args:
        text: a single line / fragment of markdown. Falsy input yields [].

    Returns:
        A list whose elements are plain strings or node dicts of the form
        {"tag": ..., "children": [...]} (links additionally carry
        {"attrs": {"href": url}}).
    """
    if not text:
        return []

    # One table for the whole call tree. Previously each recursive call had
    # its own empty table, so a code span or link nested inside bold/italic
    # or link text was emitted as a raw "\x00PH<n>\x00" string.
    placeholders = {}

    def protect(node):
        """Store node and return the opaque key that stands in for it."""
        key = f"\x00PH{len(placeholders)}\x00"
        placeholders[key] = node
        return key

    def restore(chunk):
        """Split a string on placeholder keys and swap known keys for nodes."""
        return [placeholders.get(piece, piece)
                for piece in _PLACEHOLDER_RE.split(chunk) if piece]

    def wrap(nodes, pattern, tag):
        """Replace every pattern match inside string nodes with a tag node."""
        out = []
        for node in nodes:
            if not isinstance(node, str):
                out.append(node)
                continue
            last = 0
            for m in pattern.finditer(node):
                if m.start() > last:
                    out.append(node[last:m.start()])
                out.append({"tag": tag, "children": render(m.group(1))})
                last = m.end()
            if last < len(node):
                out.append(node[last:])
        return out

    def render(fragment):
        """Apply emphasis to already-protected text, then restore placeholders."""
        nodes = [fragment]
        for pattern, tag in _EMPHASIS_PASSES:
            nodes = wrap(nodes, pattern, tag)
        resolved = []
        for node in nodes:
            if isinstance(node, str):
                resolved.extend(restore(node))
            else:
                resolved.append(node)
        return resolved

    # 1) Protect code first so its content is untouched by later regexes.
    text = CODE_RE.sub(
        lambda m: protect({"tag": "code", "children": [m.group(1)]}), text)
    # 2) Protect links; the link text may itself hold formatting or an
    #    already-protected code span, hence render() rather than a fresh parse.
    text = LINK_RE.sub(
        lambda m: protect({"tag": "a", "attrs": {"href": m.group(2)},
                           "children": render(m.group(1))}), text)
    # 3) Bold / italic on what is left, 4) restore placeholders.
    return render(text)


# --- Block parser with nested list support ---
def indent_of(line):
    """Return the number of space characters at the start of line.

    Only the space character counts; a tab ends the run.
    """
    width = 0
    for ch in line:
        if ch != " ":
            break
        width += 1
    return width

def is_ol_item(line):
    """Return a match object if line opens an ordered-list item, else None.

    An ordered item is optional leading whitespace, one or more digits, a
    dot, and at least one whitespace character.
    """
    ordered_marker = re.compile(r"^\s*\d+\.\s+")
    return ordered_marker.match(line)

def is_ul_item(line):
    """Return a match object if line opens a bullet-list item, else None.

    A bullet item is optional leading whitespace, a "-" or "*", and at least
    one whitespace character.
    """
    bullet_marker = re.compile(r"^\s*[-*]\s+")
    return bullet_marker.match(line)

def strip_list_marker(line):
    """Return line without its leading list marker.

    Removes optional indentation, the "N." / "-" / "*" marker and the
    whitespace after it. A line that does not start with a marker is
    returned unchanged.
    """
    found = re.match(r"^\s*(?:\d+\.|[-*])\s+(.*)$", line)
    if found is None:
        return line
    return found.group(1)

def parse_list(lines, i, base_indent):
    """Parse the list whose first item is lines[i] and return (node, new_i).

    Args:
        lines: every line of the document.
        i: index of the first item of the list.
        base_indent: leading-space count of that item; only items at exactly
            this indent are added to this list, deeper items are parsed
            recursively as nested lists.

    Returns:
        (node, new_i) where node is {"tag": "ol" | "ul", "children": [...]}
        holding one {"tag": "li", ...} per item, and new_i is the index at
        which the caller should resume.
    """
    first = lines[i]
    # The first item decides the kind of the whole list.
    is_ol = bool(is_ol_item(first))
    tag = "ol" if is_ol else "ul"
    items = []

    while i < len(lines):
        line = lines[i]
        if not line.strip():
            # Blank line: the list continues only if the very next line is a
            # list item at this list's indent; otherwise stop, leaving i on
            # the blank line.
            if i + 1 < len(lines):
                nxt = lines[i + 1]
                if nxt.strip() and indent_of(nxt) == base_indent and (is_ol_item(nxt) or is_ul_item(nxt)):
                    i += 1
                    continue
            break
        ind = indent_of(line)
        if ind < base_indent:
            # Dedent: this line belongs to an enclosing list or block.
            break
        if ind > base_indent:
            # Deeper lines are consumed by the per-item loop below, so this
            # is not expected here — stop defensively.
            break
        if not (is_ol_item(line) or is_ul_item(line)):
            break
        # Same-kind check: an item of the other kind (ol vs ul) at the same
        # indent ends this list.
        if (is_ol and is_ul_item(line) and not is_ol_item(line)) or \
           (not is_ol and is_ol_item(line) and not is_ul_item(line)):
            break

        # This is a list item at base_indent
        text = strip_list_marker(line)
        i += 1
        continuation_text = []
        nested_children = []

        # Consume continuation lines and nested lists belonging to this item
        while i < len(lines):
            nl = lines[i]
            if not nl.strip():
                # Blank line inside an item: peek at the following line.
                if i + 1 >= len(lines):
                    # Trailing blank line at end of input — consume it.
                    i += 1
                    break
                nxt = lines[i + 1]
                if nxt.strip() and indent_of(nxt) > base_indent:
                    # Next line is indented deeper, so the item goes on.
                    i += 1
                    continue
                # End of item; the outer loop decides whether the list goes on.
                break
            ni = indent_of(nl)
            if ni <= base_indent:
                break
            # Nested list? Parse it recursively at its own indent.
            if is_ol_item(nl) or is_ul_item(nl):
                nested, i = parse_list(lines, i, ni)
                nested_children.append(nested)
                continue
            # Continuation line: plain text indented under the item.
            continuation_text.append(nl.strip())
            i += 1

        full_text = text
        if continuation_text:
            full_text = (text + " " + " ".join(continuation_text)).strip()
        # Inline text always comes first and nested lists after it, wherever
        # the continuation lines sat relative to the nested lists.
        children = parse_inline(full_text) + nested_children
        items.append({"tag": "li", "children": children})

    return {"tag": tag, "children": items}, i


def parse_blocks(md):
    """Convert a markdown document into a list of block-level Telegraph nodes.

    Recognised blocks, checked in this order: ATX headings ("#".."######",
    levels 1-2 become h3 and deeper levels h4), "---" horizontal rules,
    ``` fenced code (emitted as pre), ordered / bullet lists, and paragraphs.
    Blank lines only separate blocks.

    Args:
        md: the markdown source as one string.

    Returns:
        A list of node dicts, each {"tag": ...} with "children" where relevant.
    """
    lines = md.splitlines()
    total = len(lines)
    blocks = []
    pos = 0

    def is_fence(row):
        return row.strip().startswith("```")

    def is_rule(row):
        return re.match(r"^-{3,}\s*$", row)

    def starts_list(row):
        return is_ol_item(row) or is_ul_item(row)

    while pos < total:
        current = lines[pos]

        if not current.strip():
            pos += 1
            continue

        heading = re.match(r"^(#{1,6})\s+(.+)$", current)
        if heading:
            hashes, title = heading.groups()
            blocks.append({
                "tag": "h3" if len(hashes) <= 2 else "h4",
                "children": parse_inline(title.strip()),
            })
            pos += 1
            continue

        if is_rule(current):
            blocks.append({"tag": "hr"})
            pos += 1
            continue

        if is_fence(current):
            # Everything up to the closing fence (or end of input) is kept
            # verbatim; the fence lines themselves are dropped.
            pos += 1
            body_start = pos
            while pos < total and not is_fence(lines[pos]):
                pos += 1
            blocks.append({"tag": "pre",
                           "children": ["\n".join(lines[body_start:pos])]})
            pos += 1
            continue

        if starts_list(current):
            list_node, pos = parse_list(lines, pos, indent_of(current))
            blocks.append(list_node)
            continue

        # Paragraph: runs until a blank line or the start of any other block.
        para_start = pos
        pos += 1
        while pos < total:
            following = lines[pos]
            if (not following.strip()
                    or re.match(r"^#{1,6}\s", following)
                    or is_rule(following)
                    or starts_list(following)
                    or is_fence(following)):
                break
            pos += 1

        # Source line breaks are kept as <br>; trailing whitespace (including
        # the markdown hard-break marker) is removed from each line.
        para_children = []
        for idx, raw in enumerate(lines[para_start:pos]):
            if idx:
                para_children.append({"tag": "br"})
            para_children.extend(parse_inline(re.sub(r"\s+$", "", raw)))
        blocks.append({"tag": "p", "children": para_children})
    return blocks


def main():
    """Fetch the markdown over SSH, convert it and update the Telegra.ph page.

    With "--dry" in sys.argv the converted node list is printed as JSON and
    nothing is sent. Otherwise the page PATH is overwritten through the
    editPage endpoint, the decoded API response is printed, and the process
    exits with status 1 when the response has no truthy "ok" field.

    Raises:
        subprocess.CalledProcessError: the sshpass/ssh command exited non-zero.
        subprocess.TimeoutExpired: the sshpass/ssh command ran past its timeout.
        urllib.error.URLError: the HTTP request failed (a timeout may also
            surface as TimeoutError).
    """
    import os  # function-scope: the module does not import os at the top

    ssh_timeout = 60   # seconds allowed for the remote `cat`
    http_timeout = 30  # seconds allowed for the Telegra.ph request

    # Hand the password to sshpass through the SSHPASS environment variable
    # (`sshpass -e`) rather than `-p`, which exposes it in the process list.
    # NOTE(review): the literal is kept only as a fallback so the script keeps
    # working unchanged; export SSHPASS and remove it, or switch to SSH keys.
    # NOTE(review): StrictHostKeyChecking=no disables host-key verification.
    ssh_env = dict(os.environ)
    ssh_env.setdefault("SSHPASS", "1qaz!QAZ")
    md = subprocess.check_output(
        ["sshpass", "-e",
         "ssh", "-o", "StrictHostKeyChecking=no", "root@10.0.0.250",
         "pct exec 137 -- cat /tmp/vpn-instruction-improved.md"],
        text=True,
        env=ssh_env,
        timeout=ssh_timeout,
    )

    content = parse_blocks(md)

    # Dry-run preview
    if "--dry" in sys.argv:
        print(json.dumps(content, ensure_ascii=False, indent=2))
        return

    data = urllib.parse.urlencode({
        "access_token": ACCESS_TOKEN,
        "title": TITLE,
        "author_name": AUTHOR,
        "content": json.dumps(content, ensure_ascii=False),
        "return_content": "false",
    }).encode()
    req = urllib.request.Request(f"https://api.telegra.ph/editPage/{PATH}", data=data)
    # Without a timeout a stalled connection would block the script forever.
    with urllib.request.urlopen(req, timeout=http_timeout) as resp:
        result = json.loads(resp.read().decode())
    print(json.dumps(result, ensure_ascii=False, indent=2))
    if not result.get("ok"):
        sys.exit(1)

if __name__ == "__main__":
    main()