267 lines
9.3 KiB
Python
267 lines
9.3 KiB
Python
"""Validate extracted text blocks against a Word document's content."""
|
||
from __future__ import annotations
|
||
|
||
import re
|
||
import unicodedata
|
||
from dataclasses import dataclass
|
||
from difflib import SequenceMatcher
|
||
|
||
|
||
# Minimum similarity score for a single-line text to count as matching a Word
# line (strict: the content must be nearly identical).
MATCH_THRESHOLD = 0.95
# Multi-row tables: minimum similarity score for one row to count as a hit.
TABLE_ROW_SINGLE_THRESHOLD = 0.95
# Multi-row tables: fraction of the valid (non-garbled) rows that must be hits
# for the table as a whole to count as matched.
TABLE_ROW_MATCH_THRESHOLD = 0.5
# Text shorter than this many characters is treated as too short to validate.
MIN_TEXT_LENGTH = 2
|
||
|
||
|
||
@dataclass
class ValidationResult:
    """Outcome of validating one extracted text block against the Word text."""

    # One of "matched", "unmatched" or "empty_or_garbled".
    status: str
    # Human-readable explanation of how the status was reached.
    reason: str
    # The validated text after normalisation.
    normalized_text: str
    # Excerpt(s) taken from the best-matching Word line(s); None when absent.
    matched_excerpt: str | None
|
||
|
||
|
||
# Leading circled ordinal ①–⑳. Stripped before NFKC, which would turn it into
# plain digits that can no longer be told apart from ordinary numbers.
_CIRCLED_NUM_RE = re.compile(r"^[①-⑳]")
# Leading numeric list marker such as "1. ", "2." or "3. " (applied after NFKC).
# The lookahead rejects a dot that is directly followed by a digit, so a leading
# decimal ("1.5 mg", "12.5%") is no longer mistaken for a list marker.
# NOTE(review): a bare number followed only by whitespace ("50 mL") is still
# treated as a marker, exactly as before - confirm that this is intended.
_LIST_NUM_RE = re.compile(r"^\d{1,2}(?!\.\d)[.\s]+")
# Sentence/list punctuation that is ignored when comparing; the ASCII period
# (decimal point) and the colon are deliberately kept.
_PUNCT_RE = re.compile(r"[,。;!?、…,;!?]")
# U+2026 HORIZONTAL ELLIPSIS. Removed before NFKC because NFKC expands it to
# "...", which the punctuation filter above would keep.
_ELLIPSIS = "\u2026"
# Runs of "*" (Markdown bold/italic markers).
_MARKDOWN_EMPHASIS_RE = re.compile(r"\*+")
# En dash, em dash and minus sign, all mapped to an ASCII hyphen.
_DASH_VARIANT_RE = re.compile(r"[–—−]")
# A hyphen together with any whitespace around it.
_DASH_PADDING_RE = re.compile(r"\s*-\s*")
_WHITESPACE_RE = re.compile(r"\s+")


def _normalize(text: str) -> str:
    """Canonicalise *text* so two renderings of the same content compare equal.

    Steps, in order:

    1. Drop leading whitespace and one leading circled ordinal (①–⑳).
    2. Remove the horizontal ellipsis (U+2026).
    3. Apply NFKC normalisation (full-width forms become half-width, etc.).
    4. Remove ``*`` runs (Markdown bold/italic markers).
    5. Map en dash, em dash and minus sign to ``-`` and drop the whitespace
       around every ``-`` ("50 – 60" becomes "50-60").
    6. Drop a leading numeric list marker ("1. ", "2."); a leading decimal
       such as "1.5" is left untouched.
    7. Remove the punctuation listed in ``_PUNCT_RE``.
    8. Collapse whitespace runs to one space and strip both ends.

    Returns the normalised string, which may be empty.
    """
    # The circled ordinal must go before NFKC rewrites it as a digit.
    text = _CIRCLED_NUM_RE.sub("", text.lstrip()).lstrip()
    # Likewise the ellipsis: after NFKC it would be three periods.
    text = text.replace(_ELLIPSIS, "")
    text = unicodedata.normalize("NFKC", text)
    text = _MARKDOWN_EMPHASIS_RE.sub("", text)
    text = _DASH_VARIANT_RE.sub("-", text)
    text = _DASH_PADDING_RE.sub("-", text)
    # Removing the markers above may have exposed new leading whitespace.
    text = _LIST_NUM_RE.sub("", text.lstrip())
    text = _PUNCT_RE.sub("", text)
    return _WHITESPACE_RE.sub(" ", text).strip()
|
||
|
||
|
||
def _is_garbled(text: str) -> bool:
    """Report whether *text* is unusable for validation.

    Text is considered garbled when it is empty, shorter than
    ``MIN_TEXT_LENGTH``, or when fewer than half of its characters lie
    outside the Unicode general categories whose name starts with "C".
    """
    if not text:
        return True
    total = len(text)
    if total < MIN_TEXT_LENGTH:
        return True

    visible = 0
    for ch in text:
        # Count every character whose category is not one of the "C" classes.
        if unicodedata.category(ch)[0] != "C":
            visible += 1
    return visible / total < 0.5
|
||
|
||
|
||
def _word_lines(word_text: str) -> list[str]:
|
||
"""Split Word Markdown into non-empty normalised lines for matching.
|
||
|
||
Grid-table separator rows (e.g. ``+-----+-----+``) are filtered out
|
||
because they carry no semantic content and would skew similarity scores.
|
||
"""
|
||
_SEP_RE = re.compile(r"^[+\-=| ]+$")
|
||
lines = []
|
||
for raw in word_text.splitlines():
|
||
norm = _normalize(raw)
|
||
if not norm:
|
||
continue
|
||
# Skip pandoc grid-table separator rows
|
||
if _SEP_RE.match(norm.replace(" ", "")):
|
||
continue
|
||
lines.append(norm)
|
||
return lines
|
||
|
||
|
||
def _match_against_line(needle: str, line: str) -> tuple[float, str]:
    """Score *needle* against one Word line and return ``(score, excerpt)``.

    * An empty needle never matches: ``(0.0, "")``.
    * If *needle* occurs verbatim in *line* the score is ``1.0`` and the
      excerpt is the hit plus up to 20 following characters.
    * Otherwise the score is the ``SequenceMatcher`` ratio. When the needle
      is shorter than the line (e.g. an extracted row that is a partial view
      of a wider Word table row) the plain ratio under-counts matching
      content, so the *needle coverage* - the share of needle characters
      that fall inside matching blocks - is computed as well, discounted by
      0.95, and the higher of the two scores is used. The excerpt is the
      start of the line: up to 400 characters for table rows (lines starting
      with ``|``), up to 120 otherwise.
    """
    # An empty string is a substring of every line; without this guard it
    # would be reported as a perfect match.
    if not needle:
        return 0.0, ""

    # Exact substring hit (single scan instead of ``in`` followed by ``index``).
    start = line.find(needle)
    if start != -1:
        return 1.0, line[start:start + len(needle) + 20].strip()

    matcher = SequenceMatcher(None, needle, line, autojunk=False)
    ratio = matcher.ratio()

    if len(needle) < len(line):
        covered = sum(blk.size for blk in matcher.get_matching_blocks())
        coverage = covered / len(needle)
        # The 0.95 discount is meant to curb false positives on short needles.
        # NOTE(review): a fully covered needle scores exactly 0.95 no matter
        # how scattered the matching blocks are, and 0.95 is also the value of
        # the module's match thresholds - confirm this is strict enough.
        ratio = max(ratio, coverage * 0.95)

    # Table rows (starting with "|") can be long; keep more context so the
    # consumer can render the complete row.
    max_len = 400 if line.lstrip().startswith("|") else 120
    return ratio, line[:max_len].strip()
|
||
|
||
|
||
def _match_single_line(norm: str, word_lines: list[str]) -> tuple[float, str]:
    """Find the entry of *word_lines* most similar to *norm*.

    Returns ``(best_score, best_excerpt)``; ``(0.0, "")`` when no line scores
    above zero. Scanning stops at the first perfect (1.0) score, and on ties
    the earliest line wins.
    """
    top_score, top_excerpt = 0.0, ""
    for candidate in word_lines:
        score, snippet = _match_against_line(norm, candidate)
        if score <= top_score:
            continue
        top_score, top_excerpt = score, snippet
        if top_score == 1.0:
            # Nothing can beat a perfect score.
            return top_score, top_excerpt
    return top_score, top_excerpt
|
||
|
||
|
||
def _validate_table_against_word(raw_rows: list[str], word_text: str) -> ValidationResult:
    """Validate a multi-row table row by row and aggregate the hit rate.

    Strategy
    --------
    - Each row is normalised; garbled rows are excluded from the count.
    - Every remaining row is matched against the Word lines and counts as a
      hit when its score reaches ``TABLE_ROW_SINGLE_THRESHOLD``.
    - The table is "matched" when hits / valid rows reaches
      ``TABLE_ROW_MATCH_THRESHOLD``.
    - ``matched_excerpt`` joins the distinct Word excerpts of the hit rows
      with newlines (``None`` when there are none).

    ``normalized_text`` is always the normalisation of all rows joined by a
    space.
    """
    joined_norm = _normalize(" ".join(raw_rows))

    def _result(status: str, reason: str, excerpt: str | None) -> ValidationResult:
        # All outcomes share the same normalised text.
        return ValidationResult(
            status=status,
            reason=reason,
            normalized_text=joined_norm,
            matched_excerpt=excerpt,
        )

    candidates = _word_lines(word_text)
    if not candidates:
        return _result("unmatched", "Word 文档为空", None)

    # Rows that are empty, too short or garbled do not take part in the rate.
    usable_rows = [
        row_norm
        for row_norm in map(_normalize, raw_rows)
        if not _is_garbled(row_norm)
    ]
    total = len(usable_rows)
    if total == 0:
        return _result("empty_or_garbled", "表格文本为空或全部为乱码", None)

    hits = 0
    # A dict keeps first-seen order while discarding duplicate excerpts.
    snippets: dict[str, None] = {}
    for row_norm in usable_rows:
        score, snippet = _match_single_line(row_norm, candidates)
        if score >= TABLE_ROW_SINGLE_THRESHOLD:
            hits += 1
            if snippet:
                snippets.setdefault(snippet, None)

    hit_rate = hits / total
    excerpt = "\n".join(snippets) if snippets else None

    if hit_rate >= TABLE_ROW_MATCH_THRESHOLD:
        return _result(
            "matched",
            f"表格 {hits}/{total} 行与 Word 匹配(命中率 {hit_rate:.0%})",
            excerpt,
        )
    return _result(
        "unmatched",
        f"表格仅 {hits}/{total} 行与 Word 匹配(命中率 {hit_rate:.0%},阈值 {TABLE_ROW_MATCH_THRESHOLD:.0%})",
        excerpt,
    )
|
||
|
||
|
||
def validate_field_against_word(text: str, word_text: str) -> ValidationResult:
    """Check whether *text* matches the content of *word_text*.

    - Single-line text: the most similar Word line is looked up; a verbatim
      hit or a score of at least ``MATCH_THRESHOLD`` counts as a match.
    - Multi-line text (a table): rows are matched one by one and the table
      matches when the hit rate reaches ``TABLE_ROW_MATCH_THRESHOLD``.

    Parameters
    ----------
    text:
        The OCR-extracted text block to validate.
    word_text:
        Full Markdown text extracted from the reference Word document.

    Returns
    -------
    ValidationResult
        Contains status, a human-readable reason, the normalised text,
        and the best-matching line from the Word document (if any).
    """
    # More than one non-blank line means the block is handled as a table.
    rows = [stripped for stripped in map(str.strip, text.splitlines()) if stripped]
    if len(rows) >= 2:
        return _validate_table_against_word(rows, word_text)

    cleaned = _normalize(text)

    def _outcome(status: str, reason: str, excerpt: str | None) -> ValidationResult:
        return ValidationResult(
            status=status,
            reason=reason,
            normalized_text=cleaned,
            matched_excerpt=excerpt,
        )

    if _is_garbled(cleaned):
        return _outcome("empty_or_garbled", "文本为空或包含乱码", None)

    candidates = _word_lines(word_text)
    if not candidates:
        return _outcome("unmatched", "Word 文档为空", None)

    score, snippet = _match_single_line(cleaned, candidates)

    if score == 1.0:
        return _outcome("matched", "与 Word 某行内容完全匹配", snippet)
    if score >= MATCH_THRESHOLD:
        return _outcome(
            "matched",
            f"与 Word 某行相似度 {score:.0%},判定为匹配",
            snippet,
        )
    # Report the closest line, if any, even though it did not reach the threshold.
    return _outcome(
        "unmatched",
        f"在 Word 中未找到匹配行(最高相似度 {score:.0%})",
        snippet or None,
    )
|