148 lines
5.1 KiB
Python
148 lines
5.1 KiB
Python
"""Extract text / HTML from a Word (.docx) document via pandoc."""
|
||
from __future__ import annotations
|
||
|
||
import re
|
||
import shutil
|
||
import subprocess
|
||
from pathlib import Path
|
||
|
||
|
||
def extract_word_text(path: Path) -> str:
    """Return the textual content of the Word document at *path*.

    When a ``pandoc`` executable is found on ``PATH`` the document is converted
    to Markdown, which keeps tables, lists, bold/italic and paragraph structure
    far better than python-docx plain-text extraction. Pandoc-specific span
    attributes (e.g. ``{.mark}``) are removed from that output because they are
    irrelevant for text matching.

    When pandoc is not installed, the python-docx plain-text extractor is used
    instead.
    """
    pandoc_exe = shutil.which("pandoc")
    if not pandoc_exe:
        # No pandoc on PATH: fall back to python-docx.
        return _extract_via_docx(path)
    return _extract_via_pandoc(path, pandoc_exe)
|
||
|
||
|
||
def extract_word_html(path: Path) -> str | None:
    """Convert *path* (.docx) to an HTML fragment preserving merged table cells.

    Uses pandoc (``-t html5``), which maps Word's ``<w:gridSpan>`` /
    ``<w:vMerge>`` to HTML ``colspan`` / ``rowspan`` attributes.

    Returns ``None`` when pandoc is unavailable, when launching it fails or
    times out (60 s), or when it exits with a non-zero status.

    The returned string is a ``<body>`` fragment (no ``<html>`` / ``<head>``),
    with inline ``style`` attributes and ``<colgroup>`` stripped so that the
    frontend can apply its own CSS.
    """
    pandoc = shutil.which("pandoc")
    if not pandoc:
        return None

    try:
        result = subprocess.run(
            [pandoc, str(path), "-f", "docx", "-t", "html5", "--wrap=none"],
            capture_output=True,
            text=True,
            # pandoc writes UTF-8 regardless of the system locale; decoding with
            # the locale default would corrupt (or fail on) non-ASCII text on
            # platforms whose preferred encoding is not UTF-8.
            encoding="utf-8",
            errors="replace",
            timeout=60,
        )
    except Exception:
        # Deliberate best-effort: any launch/timeout problem means "no HTML".
        return None

    if result.returncode != 0:
        return None

    return _clean_word_html(result.stdout)
|
||
|
||
|
||
def _clean_word_html(html: str) -> str:
    """Return the ``<body>`` contents of *html* with pandoc layout noise removed.

    If no ``<body>`` element is found, the whole input is cleaned instead.
    """
    # Keep only what is inside <body>...</body>.
    body_match = re.search(r"<body[^>]*>(.*?)</body>", html, re.DOTALL | re.IGNORECASE)
    if body_match is None:
        fragment = html
    else:
        fragment = body_match.group(1).strip()

    # Applied in order; each entry is (pattern, regex flags).
    removals = (
        # <colgroup> blocks (they carry inline column widths; frontend CSS takes over).
        (r"<colgroup[^>]*>.*?</colgroup>", re.DOTALL | re.IGNORECASE),
        # Every style="..." attribute.
        (r'\s+style="[^"]*"', 0),
        # Empty <p></p> paragraphs emitted by pandoc.
        (r"<p>\s*</p>", 0),
    )
    for pattern, flags in removals:
        fragment = re.sub(pattern, "", fragment, flags=flags)

    return fragment.strip()
|
||
|
||
|
||
# --------------------------------------------------------------------------- #
|
||
# Pandoc path #
|
||
# --------------------------------------------------------------------------- #
|
||
|
||
def _extract_via_pandoc(path: Path, pandoc: str) -> str:
    """Convert *path* (.docx) to cleaned Markdown using the *pandoc* executable.

    After conversion, paragraph texts that python-docx can see but that do not
    appear in pandoc's output are appended at the end (best-effort).

    Raises:
        RuntimeError: if pandoc exits with a non-zero status; the message
            includes the exit code and pandoc's stderr.
        subprocess.TimeoutExpired: if pandoc runs longer than 60 seconds.
    """
    result = subprocess.run(
        [
            pandoc,
            str(path),
            "-f", "docx",
            "-t", "markdown",
            "--wrap=none",
        ],
        capture_output=True,
        text=True,
        # pandoc writes UTF-8 regardless of the system locale; decoding with the
        # locale default would corrupt (or fail on) non-ASCII text on platforms
        # whose preferred encoding is not UTF-8.
        encoding="utf-8",
        errors="replace",
        timeout=60,
    )
    if result.returncode != 0:
        raise RuntimeError(
            f"pandoc failed (exit {result.returncode}):\n{result.stderr.strip()}"
        )
    pandoc_text = _clean_pandoc_markdown(result.stdout)

    # pandoc drops the text of paragraphs that contain floating shapes
    # (AlternateContent / WPS drawings). Supplement with python-docx: collect
    # the paragraph texts missing from pandoc's output and append them at the
    # end. This is harmless for text matching (worst case is slight
    # duplication, which does not affect SequenceMatcher results).
    try:
        from docx import Document  # type: ignore

        doc = Document(str(path))
        missing: list[str] = []
        for para in doc.paragraphs:
            text = para.text.strip()
            if text and text not in pandoc_text:
                missing.append(text)
        if missing:
            pandoc_text = pandoc_text + "\n" + "\n".join(missing)
    except Exception:
        # Deliberate best-effort: degrade silently when python-docx is
        # unavailable (or cannot read the file); the pandoc result is still valid.
        pass

    return pandoc_text
|
||
|
||
|
||
def _clean_pandoc_markdown(text: str) -> str:
    """Strip pandoc attribute syntax that only adds noise to text matching.

    Bracketed spans keep their inner text, and every remaining ``{...}`` block
    is removed outright.
    """
    # [text]{.mark} / [text]{#id .cls key=val}  ->  text
    unwrapped = re.sub(r"\[([^\]]*)\]\{[^}]*\}", lambda m: m.group(1), text)
    # Any {...} attribute block still left after unwrapping spans.
    return re.sub(r"\{[^}]*\}", "", unwrapped)
|
||
|
||
|
||
# --------------------------------------------------------------------------- #
|
||
# python-docx fallback #
|
||
# --------------------------------------------------------------------------- #
|
||
|
||
def _extract_via_docx(path: Path) -> str:
    """Extract plain text from *path* (.docx) using python-docx.

    The result lists every non-blank body paragraph first, followed by one
    line per table row in which the row's non-empty cell texts are joined
    with ``|``. Lines are separated by ``\\n``. A cell that was already
    emitted (a merged cell reported at several grid positions) is skipped.

    Raises:
        ImportError: if python-docx is not installed.
    """
    from docx import Document  # type: ignore

    doc = Document(str(path))
    lines = [para.text for para in doc.paragraphs if para.text.strip()]

    # De-duplicate on the cell's underlying XML element rather than on
    # ``id(cell)``: an id is only unique while its object is alive, so ids of
    # discarded wrapper objects can be reused by later, unrelated cells and
    # make them vanish from the output. Storing the key objects themselves
    # keeps them alive, so identity stays stable for the whole walk.
    # NOTE(review): assumes python-docx exposes the element as ``cell._tc`` and
    # may create fresh wrappers per ``row.cells`` access - confirm against the
    # installed version; objects without ``_tc`` are keyed on themselves.
    seen_cells: set[object] = set()
    for table in doc.tables:
        for row in table.rows:
            cells: list[str] = []
            for cell in row.cells:
                key = getattr(cell, "_tc", cell)
                if key in seen_cells:
                    continue
                seen_cells.add(key)
                text = cell.text.strip()
                if text:
                    cells.append(text)
            if cells:
                lines.append("|".join(cells))

    return "\n".join(lines)
|