Initial commit: 包装审核 POC、Docker 与前后端

Made-with: Cursor
This commit is contained in:
2026-04-15 17:18:49 +08:00
commit bbb4dd43b3
74 changed files with 297415 additions and 0 deletions

147
backend/app/word_parser.py Normal file
View File

@@ -0,0 +1,147 @@
"""Extract text / HTML from a Word (.docx) document via pandoc."""
from __future__ import annotations

import logging
import re
import shutil
import subprocess
from pathlib import Path
def extract_word_text(path: Path) -> str:
    """Return the text of the .docx file at *path*, preferring pandoc Markdown.

    When a ``pandoc`` executable is found on ``PATH`` the document is
    converted to Markdown, which keeps tables, lists, bold/italic and
    paragraph structure far better than python-docx plain-text extraction.
    Pandoc-specific span attributes (e.g. ``{.mark}``), which are irrelevant
    for text matching, are removed from that output.

    Without pandoc the text is extracted with python-docx instead.
    """
    pandoc_exe = shutil.which("pandoc")
    if not pandoc_exe:
        # No pandoc on PATH: use the python-docx plain-text extraction.
        return _extract_via_docx(path)
    return _extract_via_pandoc(path, pandoc_exe)
def extract_word_html(path: Path) -> str | None:
    """Convert *path* (.docx) to an HTML fragment preserving merged table cells.

    Uses pandoc (``-t html5``) which maps Word's ``<w:gridSpan>`` /
    ``<w:vMerge>`` to HTML ``colspan`` / ``rowspan`` attributes.

    Returns ``None`` when pandoc is unavailable, cannot be run, times out
    (60 s) or exits with a non-zero status.

    The returned string is a ``<body>`` fragment (no ``<html>`` / ``<head>``),
    with inline ``style`` attributes and ``<colgroup>`` stripped so that the
    frontend can apply its own CSS.
    """
    pandoc = shutil.which("pandoc")
    if not pandoc:
        return None
    try:
        result = subprocess.run(
            [pandoc, str(path), "-f", "docx", "-t", "html5", "--wrap=none"],
            capture_output=True,
            text=True,
            # pandoc always writes UTF-8. Without an explicit encoding the
            # output is decoded with the locale's preferred encoding, which
            # corrupts (or fails on) non-ASCII text on non-UTF-8 systems.
            encoding="utf-8",
            errors="replace",
            timeout=60,
        )
    except Exception:
        # Deliberately broad: this function is a best-effort preview whose
        # contract is "None on any conversion failure" (spawn error, timeout).
        return None
    if result.returncode != 0:
        return None
    return _clean_word_html(result.stdout)
def _clean_word_html(html: str) -> str:
    """Reduce pandoc HTML output to a bare, style-free body fragment.

    The content of the first ``<body>`` element is used when one is present;
    otherwise *html* is processed as-is. ``<colgroup>`` blocks, double-quoted
    ``style`` attributes and empty ``<p></p>`` elements are then removed and
    the result is stripped of surrounding whitespace.
    """
    multiline_nocase = re.DOTALL | re.IGNORECASE

    # Keep only what is inside <body>…</body>, if the wrapper exists.
    body_match = re.search(r"<body[^>]*>(.*?)</body>", html, multiline_nocase)
    fragment = html if body_match is None else body_match.group(1).strip()

    removals = (
        # <colgroup> blocks carry column widths as inline styles; the
        # frontend CSS takes over column sizing.
        (r"<colgroup[^>]*>.*?</colgroup>", multiline_nocase),
        # Every style="..." attribute.
        (r'\s+style="[^"]*"', 0),
        # Empty paragraphs emitted by pandoc.
        (r"<p>\s*</p>", 0),
    )
    for pattern, pattern_flags in removals:
        fragment = re.sub(pattern, "", fragment, flags=pattern_flags)
    return fragment.strip()
# --------------------------------------------------------------------------- #
# Pandoc path #
# --------------------------------------------------------------------------- #
def _extract_via_pandoc(path: Path, pandoc: str) -> str:
    """Convert *path* (.docx) to cleaned Markdown using the *pandoc* executable.

    The Markdown is passed through ``_clean_pandoc_markdown``. Afterwards, as
    a best-effort step, python-docx is used to find paragraph texts that do
    not occur in the pandoc output; those are appended at the end, one per
    line.

    Raises:
        RuntimeError: pandoc exited with a non-zero status (stderr included).
        subprocess.TimeoutExpired: pandoc did not finish within 60 seconds.
    """
    result = subprocess.run(
        [
            pandoc,
            str(path),
            "-f", "docx",
            "-t", "markdown",
            "--wrap=none",
        ],
        capture_output=True,
        text=True,
        # pandoc always writes UTF-8. Without an explicit encoding the output
        # is decoded with the locale's preferred encoding, which corrupts (or
        # fails on) non-ASCII text on non-UTF-8 systems.
        encoding="utf-8",
        errors="replace",
        timeout=60,
    )
    if result.returncode != 0:
        raise RuntimeError(
            f"pandoc failed (exit {result.returncode}):\n{result.stderr.strip()}"
        )
    pandoc_text = _clean_pandoc_markdown(result.stdout)

    # pandoc drops the text of paragraphs that contain floating shapes
    # (AlternateContent / WPS drawings). Supplement with python-docx: collect
    # paragraph texts pandoc did not output and append them at the end.
    # This is harmless for text matching (worst case is slight duplication,
    # which does not affect SequenceMatcher results).
    try:
        from docx import Document  # type: ignore

        doc = Document(str(path))
        missing = [
            text
            for text in (para.text.strip() for para in doc.paragraphs)
            if text and text not in pandoc_text
        ]
    except ImportError:
        # python-docx is not installed: the pandoc result is still valid.
        return pandoc_text
    except Exception:
        # The supplement is optional, so never fail the extraction over it,
        # but leave a trace instead of swallowing the error silently.
        logging.getLogger(__name__).debug(
            "python-docx supplement skipped for %s", path, exc_info=True
        )
        return pandoc_text

    if missing:
        pandoc_text = pandoc_text + "\n" + "\n".join(missing)
    return pandoc_text
def _clean_pandoc_markdown(text: str) -> str:
    """Remove pandoc-specific inline attributes that add noise to text matching.

    ``[text]{.mark}`` / ``[text]{#id .cls key=val}`` spans are replaced by
    their inner text, and any remaining bare ``{…}`` attribute block is
    deleted. An attribute block must open and close on the same line, so an
    unrelated ``{`` followed by a ``}`` in a later line or paragraph no longer
    causes everything in between to be deleted.
    """
    # [text]{.mark} / [text]{#id .cls key=val} → text
    text = re.sub(r"\[([^\]]*)\]\{[^}\n]*\}", r"\1", text)
    # Leftover bare {…} attribute blocks on their own (single line only)
    text = re.sub(r"\{[^}\n]*\}", "", text)
    return text
# --------------------------------------------------------------------------- #
# python-docx fallback #
# --------------------------------------------------------------------------- #
def _extract_via_docx(path: Path) -> str:
    """Extract plain text from the .docx file at *path* with python-docx.

    Non-blank body paragraphs come first, followed by one line per table row
    that has any non-empty cell. The stripped texts of a row's cells are
    concatenated without a separator (NOTE(review): confirm that the empty
    separator is intended). A merged cell contributes its text only once.

    Raises:
        ImportError: python-docx is not installed.
    """
    from docx import Document  # type: ignore

    doc = Document(str(path))
    lines = [para.text for para in doc.paragraphs if para.text.strip()]

    # De-duplicate merged cells by the underlying table-cell element rather
    # than by id() of the cell wrapper: wrappers handed out by ``row.cells``
    # can be freed once a row has been processed, after which CPython may
    # reuse their id() for unrelated cells and wrongly skip them. Storing the
    # element itself keeps it alive, so its identity stays unique.
    # ``_tc`` is a private python-docx attribute — NOTE(review): confirm it
    # exists in the pinned python-docx version; the fallback keys on (and
    # retains) the cell wrapper itself.
    seen_cells: set[object] = set()
    for table in doc.tables:
        for row in table.rows:
            cells: list[str] = []
            for cell in row.cells:
                key = getattr(cell, "_tc", cell)
                if key in seen_cells:
                    continue
                seen_cells.add(key)
                text = cell.text.strip()
                if text:
                    cells.append(text)
            if cells:
                lines.append("".join(cells))
    return "\n".join(lines)