"""Validate extracted text blocks against a Word document's content.""" from __future__ import annotations import re import unicodedata from dataclasses import dataclass from difflib import SequenceMatcher # Minimum SequenceMatcher ratio to count as a match (strict: content must be nearly identical) MATCH_THRESHOLD = 0.95 # For multi-row tables: individual row match threshold TABLE_ROW_SINGLE_THRESHOLD = 0.95 # For multi-row tables: fraction of valid rows that must match TABLE_ROW_MATCH_THRESHOLD = 0.5 # Characters below this length are treated as too short to validate MIN_TEXT_LENGTH = 2 @dataclass class ValidationResult: status: str # "matched" | "unmatched" | "empty_or_garbled" reason: str normalized_text: str matched_excerpt: str | None # 圆圈序号 ①②③...⑳(NFKC 之前处理,避免转为数字后难以区分) _CIRCLED_NUM_RE = re.compile(r"^[①-⑳]") # 数字列表前缀:"1. " "2." "3. " 等(NFKC 之后处理) _LIST_NUM_RE = re.compile(r"^\d{1,2}[.\s]+") # 句末/列表标点(中英文等价符,忽略差异;保留小数点和冒号) _PUNCT_RE = re.compile(r"[,。;!?、…,;!?]") def _normalize(text: str) -> str: """Collapse whitespace and normalise unicode for comparison. 额外处理: - 去掉首部圆圈序号(①②③)和数字列表前缀(1. 2.) - 忽略中英文标点差异(,。;vs ,.) - 统一 dash 并去掉 dash 两侧空格(50 – 60 → 50-60) """ text = text.lstrip() # 先去圆圈序号(在 NFKC 前,避免 ③→3 后与普通数字混淆) text = _CIRCLED_NUM_RE.sub("", text).lstrip() # Unicode 归一化(全角→半角、① → 1、:→ :、(→ ( 等) text = unicodedata.normalize("NFKC", text) # Strip markdown bold/italic markers text = re.sub(r"\*+", "", text) # 破折号变体归一化:en-dash / em-dash / minus sign → hyphen text = re.sub(r"[–—−]", "-", text) # 去掉 dash 两侧空格:"50 - 60" → "50-60" text = re.sub(r"\s*-\s*", "-", text) # 去掉数字列表前缀(NFKC 后,如 "3. " "4. ") text = _LIST_NUM_RE.sub("", text.lstrip()) # 忽略句末/列表标点差异 text = _PUNCT_RE.sub("", text) text = re.sub(r"\s+", " ", text).strip() return text def _is_garbled(text: str) -> bool: """Return True when text is empty, too short, or mostly non-printable.""" if not text or len(text) < MIN_TEXT_LENGTH: return True printable = sum(1 for c in text if not unicodedata.category(c).startswith("C")) return printable / len(text) < 0.5 def _word_lines(word_text: str) -> list[str]: """Split Word Markdown into non-empty normalised lines for matching. Grid-table separator rows (e.g. ``+-----+-----+``) are filtered out because they carry no semantic content and would skew similarity scores. """ _SEP_RE = re.compile(r"^[+\-=| ]+$") lines = [] for raw in word_text.splitlines(): norm = _normalize(raw) if not norm: continue # Skip pandoc grid-table separator rows if _SEP_RE.match(norm.replace(" ", "")): continue lines.append(norm) return lines def _match_against_line(needle: str, line: str) -> tuple[float, str]: """Return (ratio, excerpt) for needle vs a single Word line. When the needle (MinerU row) is significantly shorter than the Word line (because the Word table has more product columns), a plain SequenceMatcher ratio under-counts matching content. We also compute *needle coverage* — the fraction of the needle's characters that appear in the line — and take the higher of the two scores. """ # Exact substring if needle in line: idx = line.index(needle) return 1.0, line[idx: idx + len(needle) + 20].strip() matcher = SequenceMatcher(None, needle, line, autojunk=False) ratio = matcher.ratio() # Coverage ratio: useful when MinerU row is a partial view of a wider table if len(needle) > 0 and len(needle) < len(line): match_chars = sum(t for _, _, t in matcher.get_matching_blocks()) coverage = match_chars / len(needle) # Apply a small discount to avoid false positives on very short needles ratio = max(ratio, coverage * 0.95) # 表格行(含 | 分隔符)可能很长,给更多上下文以便前端完整渲染 max_len = 400 if line.lstrip().startswith("|") else 120 return ratio, line[:max_len].strip() def _match_single_line(norm: str, word_lines: list[str]) -> tuple[float, str]: """在 word_lines 中找与 norm 最相似的行,返回 (best_ratio, best_excerpt)。""" best_ratio = 0.0 best_excerpt = "" for line in word_lines: ratio, excerpt = _match_against_line(norm, line) if ratio > best_ratio: best_ratio = ratio best_excerpt = excerpt if best_ratio == 1.0: break return best_ratio, best_excerpt def _validate_table_against_word(raw_rows: list[str], word_text: str) -> ValidationResult: """多行表格逐行匹配,聚合命中率。 策略 ---- - 对每一行分别调用单行匹配,达到阈值则计为命中。 - 命中率 ≥ TABLE_ROW_MATCH_THRESHOLD(50%)即视为整体匹配。 - matched_excerpt 收集命中行的 Word 摘录,前端可渲染为表格。 """ word_lines = _word_lines(word_text) if not word_lines: norm_full = _normalize(" ".join(raw_rows)) return ValidationResult( status="unmatched", reason="Word 文档为空", normalized_text=norm_full, matched_excerpt=None, ) matched = 0 skipped = 0 excerpts: list[str] = [] seen_excerpts: set[str] = set() for row in raw_rows: norm_row = _normalize(row) if _is_garbled(norm_row): skipped += 1 continue ratio, exc = _match_single_line(norm_row, word_lines) if ratio >= TABLE_ROW_SINGLE_THRESHOLD: matched += 1 if exc and exc not in seen_excerpts: excerpts.append(exc) seen_excerpts.add(exc) valid_count = len(raw_rows) - skipped norm_full = _normalize(" ".join(raw_rows)) if valid_count == 0: return ValidationResult( status="empty_or_garbled", reason="表格文本为空或全部为乱码", normalized_text=norm_full, matched_excerpt=None, ) match_rate = matched / valid_count excerpt_text = "\n".join(excerpts) if excerpts else None if match_rate >= TABLE_ROW_MATCH_THRESHOLD: return ValidationResult( status="matched", reason=f"表格 {matched}/{valid_count} 行与 Word 匹配(命中率 {match_rate:.0%})", normalized_text=norm_full, matched_excerpt=excerpt_text, ) return ValidationResult( status="unmatched", reason=f"表格仅 {matched}/{valid_count} 行与 Word 匹配(命中率 {match_rate:.0%},阈值 {TABLE_ROW_MATCH_THRESHOLD:.0%})", normalized_text=norm_full, matched_excerpt=excerpt_text, ) def validate_field_against_word(text: str, word_text: str) -> ValidationResult: """Check whether *text* matches any line of *word_text*. - 单行文本:找 Word 中最相似的一行,相似度 ≥ 0.82 视为匹配。 - 多行文本(表格):逐行匹配,命中率 ≥ 50% 视为整体匹配。 Parameters ---------- text: The OCR-extracted text block to validate. word_text: Full Markdown text extracted from the reference Word document. Returns ------- ValidationResult Contains status, a human-readable reason, the normalised text, and the best-matching line from the Word document (if any). """ # 多行文本(表格):逐行匹配 raw_rows = [r.strip() for r in text.splitlines() if r.strip()] if len(raw_rows) > 1: return _validate_table_against_word(raw_rows, word_text) # 单行匹配 norm = _normalize(text) if _is_garbled(norm): return ValidationResult( status="empty_or_garbled", reason="文本为空或包含乱码", normalized_text=norm, matched_excerpt=None, ) word_lines = _word_lines(word_text) if not word_lines: return ValidationResult( status="unmatched", reason="Word 文档为空", normalized_text=norm, matched_excerpt=None, ) best_ratio, best_excerpt = _match_single_line(norm, word_lines) if best_ratio == 1.0: return ValidationResult( status="matched", reason="与 Word 某行内容完全匹配", normalized_text=norm, matched_excerpt=best_excerpt, ) if best_ratio >= MATCH_THRESHOLD: return ValidationResult( status="matched", reason=f"与 Word 某行相似度 {best_ratio:.0%},判定为匹配", normalized_text=norm, matched_excerpt=best_excerpt, ) return ValidationResult( status="unmatched", reason=f"在 Word 中未找到匹配行(最高相似度 {best_ratio:.0%})", normalized_text=norm, matched_excerpt=best_excerpt or None, )