Files
ZLD_POC/backend/app/text_validation.py
2026-04-15 17:18:49 +08:00

267 lines
9.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Validate extracted text blocks against a Word document's content."""
from __future__ import annotations
import re
import unicodedata
from dataclasses import dataclass
from difflib import SequenceMatcher
# Minimum SequenceMatcher ratio to count as a match (strict: content must be nearly identical)
MATCH_THRESHOLD = 0.95
# For multi-row tables: individual row match threshold
TABLE_ROW_SINGLE_THRESHOLD = 0.95
# For multi-row tables: fraction of valid rows that must match
TABLE_ROW_MATCH_THRESHOLD = 0.5
# Texts shorter than this length are treated as too short to validate
MIN_TEXT_LENGTH = 2
@dataclass
class ValidationResult:
    """Outcome of validating one extracted text block against the Word document."""

    # One of: "matched" | "unmatched" | "empty_or_garbled"
    status: str
    # Human-readable explanation of the verdict (Chinese text in this module)
    reason: str
    # The input text after _normalize(), as used for comparison
    normalized_text: str
    # Best-matching Word line(s) for display, or None when nothing matched
    matched_excerpt: str | None
# 圆圈序号 ①②③...⑳NFKC 之前处理,避免转为数字后难以区分)
_CIRCLED_NUM_RE = re.compile(r"^[①-⑳]")
# 数字列表前缀:"1. " "2." "3. " 等NFKC 之后处理)
_LIST_NUM_RE = re.compile(r"^\d{1,2}[.\s]+")
# 句末/列表标点(中英文等价符,忽略差异;保留小数点和冒号)
_PUNCT_RE = re.compile(r"[,。;!?、…,;!?]")
def _normalize(text: str) -> str:
"""Collapse whitespace and normalise unicode for comparison.
额外处理:
- 去掉首部圆圈序号①②③和数字列表前缀1. 2.
- 忽略中英文标点差异vs ,.)
- 统一 dash 并去掉 dash 两侧空格50 60 → 50-60
"""
text = text.lstrip()
# 先去圆圈序号(在 NFKC 前,避免 ③→3 后与普通数字混淆)
text = _CIRCLED_NUM_RE.sub("", text).lstrip()
# Unicode 归一化(全角→半角、① → 1、→ :、(→ ( 等)
text = unicodedata.normalize("NFKC", text)
# Strip markdown bold/italic markers
text = re.sub(r"\*+", "", text)
# 破折号变体归一化en-dash / em-dash / minus sign → hyphen
text = re.sub(r"[–—−]", "-", text)
# 去掉 dash 两侧空格:"50 - 60" → "50-60"
text = re.sub(r"\s*-\s*", "-", text)
# 去掉数字列表前缀NFKC 后,如 "3. " "4. "
text = _LIST_NUM_RE.sub("", text.lstrip())
# 忽略句末/列表标点差异
text = _PUNCT_RE.sub("", text)
text = re.sub(r"\s+", " ", text).strip()
return text
def _is_garbled(text: str) -> bool:
    """Return True when text is empty, too short, or mostly non-printable."""
    if not text or len(text) < MIN_TEXT_LENGTH:
        return True
    # Count characters outside the Unicode "C*" (control/format/…) categories;
    # less than half printable means the text is treated as garbled.
    visible = sum(not unicodedata.category(ch).startswith("C") for ch in text)
    return visible / len(text) < 0.5
def _word_lines(word_text: str) -> list[str]:
    """Split Word Markdown into non-empty normalised lines for matching.

    Grid-table separator rows (e.g. ``+-----+-----+``) carry no semantic
    content and would skew similarity scores, so they are dropped.
    """
    sep_re = re.compile(r"^[+\-=| ]+$")
    result: list[str] = []
    for raw_line in word_text.splitlines():
        normalised = _normalize(raw_line)
        # Keep only non-empty lines that are not pandoc grid-table rules.
        if normalised and not sep_re.match(normalised.replace(" ", "")):
            result.append(normalised)
    return result
def _match_against_line(needle: str, line: str) -> tuple[float, str]:
"""Return (ratio, excerpt) for needle vs a single Word line.
When the needle (MinerU row) is significantly shorter than the Word line
(because the Word table has more product columns), a plain
SequenceMatcher ratio under-counts matching content. We also compute
*needle coverage* — the fraction of the needle's characters that appear
in the line — and take the higher of the two scores.
"""
# Exact substring
if needle in line:
idx = line.index(needle)
return 1.0, line[idx: idx + len(needle) + 20].strip()
matcher = SequenceMatcher(None, needle, line, autojunk=False)
ratio = matcher.ratio()
# Coverage ratio: useful when MinerU row is a partial view of a wider table
if len(needle) > 0 and len(needle) < len(line):
match_chars = sum(t for _, _, t in matcher.get_matching_blocks())
coverage = match_chars / len(needle)
# Apply a small discount to avoid false positives on very short needles
ratio = max(ratio, coverage * 0.95)
# 表格行(含 | 分隔符)可能很长,给更多上下文以便前端完整渲染
max_len = 400 if line.lstrip().startswith("|") else 120
return ratio, line[:max_len].strip()
def _match_single_line(norm: str, word_lines: list[str]) -> tuple[float, str]:
    """Scan word_lines for the line most similar to norm.

    Returns (best_ratio, best_excerpt); stops early on a perfect hit.
    """
    best = (0.0, "")
    for candidate in word_lines:
        score, excerpt = _match_against_line(norm, candidate)
        if score > best[0]:
            best = (score, excerpt)
            if score == 1.0:
                break
    return best
def _validate_table_against_word(raw_rows: list[str], word_text: str) -> ValidationResult:
    """Match a multi-row table row by row and aggregate the hit rate.

    Strategy
    --------
    - Each row is matched individually; a row counts as a hit when it
      reaches TABLE_ROW_SINGLE_THRESHOLD.
    - The table matches as a whole when the hit rate reaches
      TABLE_ROW_MATCH_THRESHOLD (50%).
    - matched_excerpt collects the Word excerpts of the hit rows so the
      front end can render them as a table.
    """
    word_lines = _word_lines(word_text)
    norm_full = _normalize(" ".join(raw_rows))
    if not word_lines:
        return ValidationResult(
            status="unmatched",
            reason="Word 文档为空",
            normalized_text=norm_full,
            matched_excerpt=None,
        )

    matched = 0
    skipped = 0
    excerpts: list[str] = []
    seen_excerpts: set[str] = set()
    for row in raw_rows:
        norm_row = _normalize(row)
        # Garbled/empty rows are excluded from the denominator entirely.
        if _is_garbled(norm_row):
            skipped += 1
            continue
        ratio, excerpt = _match_single_line(norm_row, word_lines)
        if ratio < TABLE_ROW_SINGLE_THRESHOLD:
            continue
        matched += 1
        if excerpt and excerpt not in seen_excerpts:
            seen_excerpts.add(excerpt)
            excerpts.append(excerpt)

    valid_count = len(raw_rows) - skipped
    if valid_count == 0:
        return ValidationResult(
            status="empty_or_garbled",
            reason="表格文本为空或全部为乱码",
            normalized_text=norm_full,
            matched_excerpt=None,
        )

    match_rate = matched / valid_count
    excerpt_text = "\n".join(excerpts) if excerpts else None
    if match_rate >= TABLE_ROW_MATCH_THRESHOLD:
        return ValidationResult(
            status="matched",
            reason=f"表格 {matched}/{valid_count} 行与 Word 匹配(命中率 {match_rate:.0%}",
            normalized_text=norm_full,
            matched_excerpt=excerpt_text,
        )
    return ValidationResult(
        status="unmatched",
        reason=f"表格仅 {matched}/{valid_count} 行与 Word 匹配(命中率 {match_rate:.0%},阈值 {TABLE_ROW_MATCH_THRESHOLD:.0%}",
        normalized_text=norm_full,
        matched_excerpt=excerpt_text,
    )
def validate_field_against_word(text: str, word_text: str) -> ValidationResult:
    """Check whether *text* matches any line of *word_text*.

    - Single-line text: find the most similar Word line; a similarity of at
      least MATCH_THRESHOLD (0.95) counts as a match.
    - Multi-line text (tables): match row by row; an overall hit rate of at
      least TABLE_ROW_MATCH_THRESHOLD (50%) counts as a match.

    Parameters
    ----------
    text:
        The OCR-extracted text block to validate.
    word_text:
        Full Markdown text extracted from the reference Word document.

    Returns
    -------
    ValidationResult
        Contains status, a human-readable reason, the normalised text,
        and the best-matching line from the Word document (if any).
    """
    # Multi-line text (table): delegate to per-row matching.
    raw_rows = [r.strip() for r in text.splitlines() if r.strip()]
    if len(raw_rows) > 1:
        return _validate_table_against_word(raw_rows, word_text)
    # Single-line matching.
    norm = _normalize(text)
    if _is_garbled(norm):
        return ValidationResult(
            status="empty_or_garbled",
            reason="文本为空或包含乱码",
            normalized_text=norm,
            matched_excerpt=None,
        )
    word_lines = _word_lines(word_text)
    if not word_lines:
        return ValidationResult(
            status="unmatched",
            reason="Word 文档为空",
            normalized_text=norm,
            matched_excerpt=None,
        )
    best_ratio, best_excerpt = _match_single_line(norm, word_lines)
    # Exact hit (substring or identical line) short-circuits the threshold check.
    if best_ratio == 1.0:
        return ValidationResult(
            status="matched",
            reason="与 Word 某行内容完全匹配",
            normalized_text=norm,
            matched_excerpt=best_excerpt,
        )
    if best_ratio >= MATCH_THRESHOLD:
        return ValidationResult(
            status="matched",
            reason=f"与 Word 某行相似度 {best_ratio:.0%},判定为匹配",
            normalized_text=norm,
            matched_excerpt=best_excerpt,
        )
    # Below threshold: still surface the closest excerpt (if any) for review.
    return ValidationResult(
        status="unmatched",
        reason=f"在 Word 中未找到匹配行(最高相似度 {best_ratio:.0%}",
        normalized_text=norm,
        matched_excerpt=best_excerpt or None,
    )