Initial commit: 包装审核 POC、Docker 与前后端
Made-with: Cursor
This commit is contained in:
147
backend/app/word_parser.py
Normal file
147
backend/app/word_parser.py
Normal file
@@ -0,0 +1,147 @@
|
||||
"""Extract text / HTML from a Word (.docx) document via pandoc."""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def extract_word_text(path: Path) -> str:
    """Return the textual content of the .docx file at *path*.

    Preferred route: pandoc -> Markdown, which preserves tables, lists,
    bold/italic, and paragraph structure far better than plain-text
    extraction; pandoc-specific span attributes (e.g. ``{.mark}``) are
    stripped afterwards since they are irrelevant for text matching.

    When pandoc is not on PATH, degrade to python-docx plain-text output.
    """
    pandoc_bin = shutil.which("pandoc")
    if pandoc_bin is None:
        return _extract_via_docx(path)
    return _extract_via_pandoc(path, pandoc_bin)
|
||||
|
||||
|
||||
def extract_word_html(path: Path) -> str | None:
    """Render the .docx file at *path* as an HTML ``<body>`` fragment.

    pandoc's ``-t html5`` writer translates Word's ``<w:gridSpan>`` /
    ``<w:vMerge>`` markup into proper ``colspan`` / ``rowspan`` attributes,
    so merged table cells survive the conversion.

    The fragment is post-processed to drop inline ``style`` attributes and
    ``<colgroup>`` blocks so the frontend can apply its own CSS.

    Returns ``None`` when pandoc is unavailable or the conversion fails.
    """
    pandoc_bin = shutil.which("pandoc")
    if pandoc_bin is None:
        return None

    cmd = [pandoc_bin, str(path), "-f", "docx", "-t", "html5", "--wrap=none"]
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
    except Exception:
        # Best-effort conversion: any failure (timeout, OS error) -> None.
        return None

    if proc.returncode != 0:
        return None
    return _clean_word_html(proc.stdout)
|
||||
|
||||
|
||||
def _clean_word_html(html: str) -> str:
|
||||
"""Extract <body> content and strip noise added by pandoc."""
|
||||
# 取 <body> 内容
|
||||
m = re.search(r"<body[^>]*>(.*?)</body>", html, re.DOTALL | re.IGNORECASE)
|
||||
body = m.group(1).strip() if m else html
|
||||
|
||||
# 删除 <colgroup> 块(含列宽 inline style,由前端 CSS 接管)
|
||||
body = re.sub(r"<colgroup[^>]*>.*?</colgroup>", "", body, flags=re.DOTALL | re.IGNORECASE)
|
||||
# 删除所有 style="..." 属性
|
||||
body = re.sub(r'\s+style="[^"]*"', "", body)
|
||||
# 删除 pandoc 输出的空 <p></p>
|
||||
body = re.sub(r"<p>\s*</p>", "", body)
|
||||
|
||||
return body.strip()
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Pandoc path #
|
||||
# --------------------------------------------------------------------------- #
|
||||
|
||||
def _extract_via_pandoc(path: Path, pandoc: str) -> str:
    """Convert *path* to Markdown with the given pandoc binary.

    Raises ``RuntimeError`` when pandoc exits non-zero.
    """
    cmd = [pandoc, str(path), "-f", "docx", "-t", "markdown", "--wrap=none"]
    proc = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
    if proc.returncode != 0:
        raise RuntimeError(
            f"pandoc failed (exit {proc.returncode}):\n{proc.stderr.strip()}"
        )
    markdown = _clean_pandoc_markdown(proc.stdout)

    # pandoc drops the text of paragraphs containing floating shapes
    # (AlternateContent / WPS drawings).  Re-read the document with
    # python-docx and append any paragraph text pandoc missed.  This has no
    # side effect on text matching (worst case is mild duplication, which
    # does not affect SequenceMatcher results).
    try:
        from docx import Document  # type: ignore

        document = Document(str(path))
        leftovers: list[str] = []
        for paragraph in document.paragraphs:
            stripped = paragraph.text.strip()
            if stripped and stripped not in markdown:
                leftovers.append(stripped)
        if leftovers:
            markdown = markdown + "\n" + "\n".join(leftovers)
    except Exception:
        # Silently degrade when python-docx is unavailable; the pandoc
        # result alone is still valid.
        pass

    return markdown
|
||||
|
||||
|
||||
def _clean_pandoc_markdown(text: str) -> str:
|
||||
"""Remove pandoc-specific inline attributes that noise up text matching."""
|
||||
# [text]{.mark} / [text]{#id .cls key=val} → text
|
||||
text = re.sub(r"\[([^\]]*)\]\{[^}]*\}", r"\1", text)
|
||||
# Leftover bare {…} attribute blocks on their own
|
||||
text = re.sub(r"\{[^}]*\}", "", text)
|
||||
return text
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# python-docx fallback #
|
||||
# --------------------------------------------------------------------------- #
|
||||
|
||||
def _extract_via_docx(path: Path) -> str:
    """Plain-text fallback used when pandoc is unavailable.

    Collects non-empty paragraph text, then appends each table row as a
    ``|``-joined line, emitting each merged cell only once.
    """
    from docx import Document  # type: ignore

    doc = Document(str(path))
    lines = [para.text for para in doc.paragraphs if para.text.strip()]

    # Merged cells appear multiple times in ``row.cells``.  Deduplicate on
    # the identity of the underlying <w:tc> XML element (``cell._tc``):
    # python-docx builds fresh _Cell wrapper objects on each ``row.cells``
    # access, so ``id(cell)`` is not stable across rows (vertically merged
    # cells would be duplicated) and a recycled id could even skip a real
    # cell.  The tc elements live as long as ``doc``, so their ids are safe.
    seen_tcs: set[int] = set()
    for table in doc.tables:
        for row in table.rows:
            cells: list[str] = []
            for cell in row.cells:
                tc_id = id(cell._tc)
                if tc_id in seen_tcs:
                    continue
                seen_tcs.add(tc_id)
                text = cell.text.strip()
                if text:
                    cells.append(text)
            if cells:
                lines.append("|".join(cells))

    return "\n".join(lines)
|
||||
Reference in New Issue
Block a user