Initial commit: 包装审核 POC、Docker 与前后端
Made-with: Cursor
This commit is contained in:
147
backend/app/word_parser.py
Normal file
147
backend/app/word_parser.py
Normal file
@@ -0,0 +1,147 @@
|
||||
"""Extract text / HTML from a Word (.docx) document via pandoc."""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def extract_word_text(path: Path) -> str:
    """Return the textual content of the .docx file at *path*.

    Preferred route: pandoc -> Markdown, which preserves tables, lists,
    bold/italic, and paragraph structure far better than plain-text
    extraction; pandoc-specific span attributes (e.g. ``{.mark}``) are
    stripped afterwards since they are irrelevant for text matching.

    When pandoc is not on PATH, degrade to python-docx plain-text output.
    """
    pandoc_bin = shutil.which("pandoc")
    if pandoc_bin is None:
        return _extract_via_docx(path)
    return _extract_via_pandoc(path, pandoc_bin)
|
||||
|
||||
|
||||
def extract_word_html(path: Path) -> str | None:
    """Render the .docx file at *path* as an HTML ``<body>`` fragment.

    pandoc's ``-t html5`` writer translates Word's ``<w:gridSpan>`` /
    ``<w:vMerge>`` markup into proper ``colspan`` / ``rowspan`` attributes,
    so merged table cells survive the conversion.

    The fragment is post-processed to drop inline ``style`` attributes and
    ``<colgroup>`` blocks so the frontend can apply its own CSS.

    Returns ``None`` when pandoc is unavailable or the conversion fails.
    """
    pandoc_bin = shutil.which("pandoc")
    if pandoc_bin is None:
        return None

    cmd = [pandoc_bin, str(path), "-f", "docx", "-t", "html5", "--wrap=none"]
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
    except Exception:
        # Best-effort conversion: any failure (timeout, OS error) -> None.
        return None

    if proc.returncode != 0:
        return None
    return _clean_word_html(proc.stdout)
|
||||
|
||||
|
||||
def _clean_word_html(html: str) -> str:
|
||||
"""Extract <body> content and strip noise added by pandoc."""
|
||||
# 取 <body> 内容
|
||||
m = re.search(r"<body[^>]*>(.*?)</body>", html, re.DOTALL | re.IGNORECASE)
|
||||
body = m.group(1).strip() if m else html
|
||||
|
||||
# 删除 <colgroup> 块(含列宽 inline style,由前端 CSS 接管)
|
||||
body = re.sub(r"<colgroup[^>]*>.*?</colgroup>", "", body, flags=re.DOTALL | re.IGNORECASE)
|
||||
# 删除所有 style="..." 属性
|
||||
body = re.sub(r'\s+style="[^"]*"', "", body)
|
||||
# 删除 pandoc 输出的空 <p></p>
|
||||
body = re.sub(r"<p>\s*</p>", "", body)
|
||||
|
||||
return body.strip()
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Pandoc path #
|
||||
# --------------------------------------------------------------------------- #
|
||||
|
||||
def _extract_via_pandoc(path: Path, pandoc: str) -> str:
    """Convert *path* to Markdown with the given pandoc binary.

    Raises ``RuntimeError`` when pandoc exits non-zero.
    """
    cmd = [pandoc, str(path), "-f", "docx", "-t", "markdown", "--wrap=none"]
    proc = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
    if proc.returncode != 0:
        raise RuntimeError(
            f"pandoc failed (exit {proc.returncode}):\n{proc.stderr.strip()}"
        )
    markdown = _clean_pandoc_markdown(proc.stdout)

    # pandoc drops the text of paragraphs containing floating shapes
    # (AlternateContent / WPS drawings).  Re-read the document with
    # python-docx and append any paragraph text pandoc missed.  This has no
    # side effect on text matching (worst case is mild duplication, which
    # does not affect SequenceMatcher results).
    try:
        from docx import Document  # type: ignore

        document = Document(str(path))
        leftovers: list[str] = []
        for paragraph in document.paragraphs:
            stripped = paragraph.text.strip()
            if stripped and stripped not in markdown:
                leftovers.append(stripped)
        if leftovers:
            markdown = markdown + "\n" + "\n".join(leftovers)
    except Exception:
        # Silently degrade when python-docx is unavailable; the pandoc
        # result alone is still valid.
        pass

    return markdown
|
||||
|
||||
|
||||
def _clean_pandoc_markdown(text: str) -> str:
|
||||
"""Remove pandoc-specific inline attributes that noise up text matching."""
|
||||
# [text]{.mark} / [text]{#id .cls key=val} → text
|
||||
text = re.sub(r"\[([^\]]*)\]\{[^}]*\}", r"\1", text)
|
||||
# Leftover bare {…} attribute blocks on their own
|
||||
text = re.sub(r"\{[^}]*\}", "", text)
|
||||
return text
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# python-docx fallback #
|
||||
# --------------------------------------------------------------------------- #
|
||||
|
||||
def _extract_via_docx(path: Path) -> str:
    """Plain-text fallback used when pandoc is unavailable.

    Collects non-empty paragraph text, then appends each table row as a
    ``|``-joined line, emitting each merged cell only once.
    """
    from docx import Document  # type: ignore

    doc = Document(str(path))
    lines = [para.text for para in doc.paragraphs if para.text.strip()]

    # Merged cells appear multiple times in ``row.cells``.  Deduplicate on
    # the identity of the underlying <w:tc> XML element (``cell._tc``):
    # python-docx builds fresh _Cell wrapper objects on each ``row.cells``
    # access, so ``id(cell)`` is not stable across rows (vertically merged
    # cells would be duplicated) and a recycled id could even skip a real
    # cell.  The tc elements live as long as ``doc``, so their ids are safe.
    seen_tcs: set[int] = set()
    for table in doc.tables:
        for row in table.rows:
            cells: list[str] = []
            for cell in row.cells:
                tc_id = id(cell._tc)
                if tc_id in seen_tcs:
                    continue
                seen_tcs.add(tc_id)
                text = cell.text.strip()
                if text:
                    cells.append(text)
            if cells:
                lines.append("|".join(cells))

    return "\n".join(lines)
|
||||
Reference in New Issue
Block a user