Files
ZLD_POC/backend/app/word_parser.py
2026-04-15 17:18:49 +08:00

148 lines
5.1 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Extract text / HTML from a Word (.docx) document via pandoc."""
from __future__ import annotations
import re
import shutil
import subprocess
from pathlib import Path
def extract_word_text(path: Path) -> str:
    """Convert *path* (.docx) to Markdown text, preferring pandoc.

    Pandoc preserves tables, lists, bold/italic, and paragraph structure far
    better than python-docx plain-text extraction; its pandoc-specific span
    attributes (e.g. ``{.mark}``) are stripped because they are irrelevant
    for text matching.  When pandoc is not on ``PATH``, fall back to plain
    python-docx extraction.
    """
    pandoc_bin = shutil.which("pandoc")
    if pandoc_bin is None:
        return _extract_via_docx(path)
    return _extract_via_pandoc(path, pandoc_bin)
def extract_word_html(path: Path) -> str | None:
    """Convert *path* (.docx) to an HTML fragment preserving merged table cells.

    Uses pandoc (``-t html5``), which correctly maps Word's ``<w:gridSpan>`` /
    ``<w:vMerge>`` to HTML ``colspan`` / ``rowspan`` attributes.

    Returns ``None`` when pandoc is unavailable or conversion fails.  On
    success the result is a ``<body>`` fragment (no ``<html>`` / ``<head>``)
    with inline ``style`` attributes and ``<colgroup>`` stripped so that the
    frontend can apply its own CSS.
    """
    pandoc = shutil.which("pandoc")
    if pandoc is None:
        return None
    cmd = [pandoc, str(path), "-f", "docx", "-t", "html5", "--wrap=none"]
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
    except Exception:
        # pandoc crashed, timed out, or could not be spawned — best effort only.
        return None
    if proc.returncode != 0:
        return None
    return _clean_word_html(proc.stdout)
def _clean_word_html(html: str) -> str:
"""Extract <body> content and strip noise added by pandoc."""
# 取 <body> 内容
m = re.search(r"<body[^>]*>(.*?)</body>", html, re.DOTALL | re.IGNORECASE)
body = m.group(1).strip() if m else html
# 删除 <colgroup> 块(含列宽 inline style由前端 CSS 接管)
body = re.sub(r"<colgroup[^>]*>.*?</colgroup>", "", body, flags=re.DOTALL | re.IGNORECASE)
# 删除所有 style="..." 属性
body = re.sub(r'\s+style="[^"]*"', "", body)
# 删除 pandoc 输出的空 <p></p>
body = re.sub(r"<p>\s*</p>", "", body)
return body.strip()
# --------------------------------------------------------------------------- #
# Pandoc path #
# --------------------------------------------------------------------------- #
def _extract_via_pandoc(path: Path, pandoc: str) -> str:
    """Run pandoc on *path* and return cleaned Markdown text.

    Raises ``RuntimeError`` when pandoc exits non-zero.
    """
    cmd = [pandoc, str(path), "-f", "docx", "-t", "markdown", "--wrap=none"]
    result = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
    if result.returncode != 0:
        raise RuntimeError(
            f"pandoc failed (exit {result.returncode}):\n{result.stderr.strip()}"
        )
    pandoc_text = _clean_pandoc_markdown(result.stdout)
    # Pandoc drops the text of paragraphs containing floating shapes
    # (AlternateContent / WPS drawings).  Recover them via python-docx:
    # append any paragraph text pandoc failed to emit.  Harmless for text
    # matching — the worst case is mild duplication, which does not affect
    # SequenceMatcher results.
    try:
        from docx import Document  # type: ignore

        recovered = [
            stripped
            for para in Document(str(path)).paragraphs
            if (stripped := para.text.strip()) and stripped not in pandoc_text
        ]
        if recovered:
            pandoc_text = pandoc_text + "\n" + "\n".join(recovered)
    except Exception:
        # python-docx unavailable — degrade silently; the pandoc result stands.
        pass
    return pandoc_text
def _clean_pandoc_markdown(text: str) -> str:
"""Remove pandoc-specific inline attributes that noise up text matching."""
# [text]{.mark} / [text]{#id .cls key=val} → text
text = re.sub(r"\[([^\]]*)\]\{[^}]*\}", r"\1", text)
# Leftover bare {…} attribute blocks on their own
text = re.sub(r"\{[^}]*\}", "", text)
return text
# --------------------------------------------------------------------------- #
# python-docx fallback #
# --------------------------------------------------------------------------- #
def _extract_via_docx(path: Path) -> str:
    """Plain-text fallback via python-docx (used when pandoc is absent).

    Returns all non-empty paragraph texts, followed by one line per table
    row.  Merged table cells are emitted once.
    """
    from docx import Document  # type: ignore

    doc = Document(str(path))
    lines = [para.text for para in doc.paragraphs if para.text.strip()]
    # python-docx constructs a *fresh* _Cell wrapper on every `row.cells`
    # access, so `id(cell)` never collides and cannot dedupe merged cells.
    # Merged cells do share one underlying <w:tc> XML element, so dedupe on
    # `id(cell._tc)` instead — a merged cell's text then appears only once.
    seen_cells: set[int] = set()
    for table in doc.tables:
        for row in table.rows:
            cells: list[str] = []
            for cell in row.cells:
                tc_id = id(cell._tc)
                if tc_id in seen_cells:
                    continue
                seen_cells.add(tc_id)
                text = cell.text.strip()
                if text:
                    cells.append(text)
            if cells:
                # Cells are concatenated without a separator — keep the
                # original output format consumed downstream.
                lines.append("".join(cells))
    return "\n".join(lines)