# ZLD_POC/backend/app/word_parser.py

"""Extract text / HTML from a Word (.docx) document via pandoc."""
from __future__ import annotations

import re
import shutil
import subprocess
from pathlib import Path


def extract_word_text(path: Path) -> str:
    """Return the text of the Word document at *path* for text matching.

    When a ``pandoc`` executable is found on ``PATH`` the document is
    converted to Markdown by ``_extract_via_pandoc``, which also strips
    pandoc attribute blocks such as ``{.mark}`` from the result.

    When no ``pandoc`` executable is found, the python-docx based
    ``_extract_via_docx`` plain-text extraction is used instead.
    """
    pandoc_exe = shutil.which("pandoc")
    if not pandoc_exe:
        # No pandoc on PATH: use the plain-text python-docx fallback.
        return _extract_via_docx(path)
    return _extract_via_pandoc(path, pandoc_exe)


def extract_word_html(path: Path) -> str | None:
    """Convert *path* (.docx) to an HTML fragment preserving merged table cells.

    Uses pandoc (``-t html5``) which correctly maps Word's ``<w:gridSpan>`` /
    ``<w:vMerge>`` to HTML ``colspan`` / ``rowspan`` attributes.

    Returns ``None`` when pandoc is unavailable or conversion fails.
    The returned string is a ``<body>`` fragment (no ``<html>`` / ``<head>``),
    with inline ``style`` attributes and ``<colgroup>`` stripped so that the
    frontend can apply its own CSS.
    """
    pandoc = shutil.which("pandoc")
    if not pandoc:
        return None
    try:
        result = subprocess.run(
            [pandoc, str(path), "-f", "docx", "-t", "html5", "--wrap=none"],
            capture_output=True,
            text=True,
            timeout=60,
        )
    except Exception:
        return None

    if result.returncode != 0:
        return None

    return _clean_word_html(result.stdout)


def _clean_word_html(html: str) -> str:
    """Extract <body> content and strip noise added by pandoc."""
    # 取 <body> 内容
    m = re.search(r"<body[^>]*>(.*?)</body>", html, re.DOTALL | re.IGNORECASE)
    body = m.group(1).strip() if m else html

    # 删除 <colgroup> 块（含列宽 inline style，由前端 CSS 接管）
    body = re.sub(r"<colgroup[^>]*>.*?</colgroup>", "", body, flags=re.DOTALL | re.IGNORECASE)
    # 删除所有 style="..." 属性
    body = re.sub(r'\s+style="[^"]*"', "", body)
    # 删除 pandoc 输出的空 <p></p>
    body = re.sub(r"<p>\s*</p>", "", body)

    return body.strip()


# --------------------------------------------------------------------------- #
# Pandoc path                                                                  #
# --------------------------------------------------------------------------- #

def _extract_via_pandoc(path: Path, pandoc: str) -> str:
    result = subprocess.run(
        [
            pandoc,
            str(path),
            "-f", "docx",
            "-t", "markdown",
            "--wrap=none",
        ],
        capture_output=True,
        text=True,
        timeout=60,
    )
    if result.returncode != 0:
        raise RuntimeError(
            f"pandoc failed (exit {result.returncode}):\n{result.stderr.strip()}"
        )
    pandoc_text = _clean_pandoc_markdown(result.stdout)

    # pandoc 会丢弃包含浮动形状（AlternateContent / WPS 图形）的段落的文字内容。
    # 用 python-docx 补充：找出 pandoc 没有输出的段落文本，追加到末尾。
    # 对文本匹配无副作用（最坏情况是轻微重复，不影响 SequenceMatcher 结果）。
    try:
        from docx import Document  # type: ignore
        doc = Document(str(path))
        missing: list[str] = []
        for para in doc.paragraphs:
            text = para.text.strip()
            if text and text not in pandoc_text:
                missing.append(text)
        if missing:
            pandoc_text = pandoc_text + "\n" + "\n".join(missing)
    except Exception:
        pass  # python-docx 不可用时静默降级，pandoc 结果仍然有效

    return pandoc_text


def _clean_pandoc_markdown(text: str) -> str:
    """Remove pandoc-specific inline attributes that noise up text matching."""
    # [text]{.mark} / [text]{#id .cls key=val} → text
    text = re.sub(r"\[([^\]]*)\]\{[^}]*\}", r"\1", text)
    # Leftover bare {…} attribute blocks on their own
    text = re.sub(r"\{[^}]*\}", "", text)
    return text


# --------------------------------------------------------------------------- #
# python-docx fallback                                                         #
# --------------------------------------------------------------------------- #

def _extract_via_docx(path: Path) -> str:
    """Extract plain text from the .docx at *path* using python-docx.

    Output is one line per non-blank body paragraph (``doc.paragraphs``),
    followed by one line per table row that has any non-empty cell, with the
    row's cell texts joined by ``｜``.  A merged cell contributes its text
    only the first time it is encountered.

    Raises:
        ImportError: python-docx is not installed.
    """
    from docx import Document  # type: ignore

    doc = Document(str(path))

    lines: list[str] = []
    for para in doc.paragraphs:
        para_text = para.text  # read the property once per paragraph
        if para_text.strip():
            lines.append(para_text)

    # Deduplicate merged cells by their underlying <w:tc> XML element rather
    # than by id() of the `_Cell` wrapper.  Wrapper objects are not guaranteed
    # to be the same across `row.cells` accesses, and an id() can be reused by
    # a new object once an earlier wrapper is garbage-collected, which could
    # silently drop distinct cells.  Storing the elements themselves in the
    # set keeps them alive, so identity comparison stays reliable.
    seen_tcs: set = set()
    for table in doc.tables:
        for row in table.rows:
            row_texts: list[str] = []
            for cell in row.cells:
                tc = cell._tc
                if tc in seen_tcs:
                    continue
                seen_tcs.add(tc)
                cell_text = cell.text.strip()
                if cell_text:
                    row_texts.append(cell_text)
            if row_texts:
                lines.append("｜".join(row_texts))

    return "\n".join(lines)