"""Extract text / HTML from a Word (.docx) document via pandoc.""" from __future__ import annotations import re import shutil import subprocess from pathlib import Path def extract_word_text(path: Path) -> str: """Convert *path* (.docx) to Markdown with pandoc and return the result. Pandoc preserves tables, lists, bold/italic, and paragraph structure far better than python-docx plain-text extraction. The returned string is cleaned of pandoc-specific span attributes (e.g. ``{.mark}``) that are irrelevant for text matching. Falls back to python-docx plain-text extraction if pandoc is not installed. """ pandoc = shutil.which("pandoc") if pandoc: return _extract_via_pandoc(path, pandoc) return _extract_via_docx(path) def extract_word_html(path: Path) -> str | None: """Convert *path* (.docx) to an HTML fragment preserving merged table cells. Uses pandoc (``-t html5``) which correctly maps Word's ```` / ```` to HTML ``colspan`` / ``rowspan`` attributes. Returns ``None`` when pandoc is unavailable or conversion fails. The returned string is a ```` fragment (no ```` / ````), with inline ``style`` attributes and ```` stripped so that the frontend can apply its own CSS. """ pandoc = shutil.which("pandoc") if not pandoc: return None try: result = subprocess.run( [pandoc, str(path), "-f", "docx", "-t", "html5", "--wrap=none"], capture_output=True, text=True, timeout=60, ) except Exception: return None if result.returncode != 0: return None return _clean_word_html(result.stdout) def _clean_word_html(html: str) -> str: """Extract content and strip noise added by pandoc.""" # 取 内容 m = re.search(r"]*>(.*?)", html, re.DOTALL | re.IGNORECASE) body = m.group(1).strip() if m else html # 删除 块(含列宽 inline style,由前端 CSS 接管) body = re.sub(r"]*>.*?", "", body, flags=re.DOTALL | re.IGNORECASE) # 删除所有 style="..." 属性 body = re.sub(r'\s+style="[^"]*"', "", body) # 删除 pandoc 输出的空

body = re.sub(r"

\s*

", "", body) return body.strip() # --------------------------------------------------------------------------- # # Pandoc path # # --------------------------------------------------------------------------- # def _extract_via_pandoc(path: Path, pandoc: str) -> str: result = subprocess.run( [ pandoc, str(path), "-f", "docx", "-t", "markdown", "--wrap=none", ], capture_output=True, text=True, timeout=60, ) if result.returncode != 0: raise RuntimeError( f"pandoc failed (exit {result.returncode}):\n{result.stderr.strip()}" ) pandoc_text = _clean_pandoc_markdown(result.stdout) # pandoc 会丢弃包含浮动形状(AlternateContent / WPS 图形)的段落的文字内容。 # 用 python-docx 补充:找出 pandoc 没有输出的段落文本,追加到末尾。 # 对文本匹配无副作用(最坏情况是轻微重复,不影响 SequenceMatcher 结果)。 try: from docx import Document # type: ignore doc = Document(str(path)) missing: list[str] = [] for para in doc.paragraphs: text = para.text.strip() if text and text not in pandoc_text: missing.append(text) if missing: pandoc_text = pandoc_text + "\n" + "\n".join(missing) except Exception: pass # python-docx 不可用时静默降级,pandoc 结果仍然有效 return pandoc_text def _clean_pandoc_markdown(text: str) -> str: """Remove pandoc-specific inline attributes that noise up text matching.""" # [text]{.mark} / [text]{#id .cls key=val} → text text = re.sub(r"\[([^\]]*)\]\{[^}]*\}", r"\1", text) # Leftover bare {…} attribute blocks on their own text = re.sub(r"\{[^}]*\}", "", text) return text # --------------------------------------------------------------------------- # # python-docx fallback # # --------------------------------------------------------------------------- # def _extract_via_docx(path: Path) -> str: from docx import Document # type: ignore doc = Document(str(path)) lines = [para.text for para in doc.paragraphs if para.text.strip()] seen_cells: set[int] = set() for table in doc.tables: for row in table.rows: cells: list[str] = [] for cell in row.cells: if id(cell) in seen_cells: continue seen_cells.add(id(cell)) text = cell.text.strip() if text: cells.append(text) if cells: lines.append("|".join(cells)) return "\n".join(lines)