\s*
", "", body) return body.strip() # --------------------------------------------------------------------------- # # Pandoc path # # --------------------------------------------------------------------------- # def _extract_via_pandoc(path: Path, pandoc: str) -> str: result = subprocess.run( [ pandoc, str(path), "-f", "docx", "-t", "markdown", "--wrap=none", ], capture_output=True, text=True, timeout=60, ) if result.returncode != 0: raise RuntimeError( f"pandoc failed (exit {result.returncode}):\n{result.stderr.strip()}" ) pandoc_text = _clean_pandoc_markdown(result.stdout) # pandoc 会丢弃包含浮动形状(AlternateContent / WPS 图形)的段落的文字内容。 # 用 python-docx 补充:找出 pandoc 没有输出的段落文本,追加到末尾。 # 对文本匹配无副作用(最坏情况是轻微重复,不影响 SequenceMatcher 结果)。 try: from docx import Document # type: ignore doc = Document(str(path)) missing: list[str] = [] for para in doc.paragraphs: text = para.text.strip() if text and text not in pandoc_text: missing.append(text) if missing: pandoc_text = pandoc_text + "\n" + "\n".join(missing) except Exception: pass # python-docx 不可用时静默降级,pandoc 结果仍然有效 return pandoc_text def _clean_pandoc_markdown(text: str) -> str: """Remove pandoc-specific inline attributes that noise up text matching.""" # [text]{.mark} / [text]{#id .cls key=val} → text text = re.sub(r"\[([^\]]*)\]\{[^}]*\}", r"\1", text) # Leftover bare {…} attribute blocks on their own text = re.sub(r"\{[^}]*\}", "", text) return text # --------------------------------------------------------------------------- # # python-docx fallback # # --------------------------------------------------------------------------- # def _extract_via_docx(path: Path) -> str: from docx import Document # type: ignore doc = Document(str(path)) lines = [para.text for para in doc.paragraphs if para.text.strip()] seen_cells: set[int] = set() for table in doc.tables: for row in table.rows: cells: list[str] = [] for cell in row.cells: if id(cell) in seen_cells: continue seen_cells.add(id(cell)) text = cell.text.strip() if text: cells.append(text) if cells: lines.append("|".join(cells)) return "\n".join(lines)