Files
ZLD_POC/backend/app/text_validation.py
2026-04-15 17:18:49 +08:00

267 lines
9.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Validate extracted text blocks against a Word document's content."""
from __future__ import annotations
import re
import unicodedata
from dataclasses import dataclass
from difflib import SequenceMatcher
# Minimum SequenceMatcher ratio to count as a match (strict: content must be nearly identical)
MATCH_THRESHOLD = 0.95
# For multi-row tables: individual row match threshold
TABLE_ROW_SINGLE_THRESHOLD = 0.95
# For multi-row tables: fraction of valid rows that must match
TABLE_ROW_MATCH_THRESHOLD = 0.5
# Texts shorter than this length are treated as too short to validate
MIN_TEXT_LENGTH = 2
@dataclass
class ValidationResult:
    """Outcome of validating one extracted text block against the Word document."""

    # One of: "matched" | "unmatched" | "empty_or_garbled"
    status: str
    # Human-readable explanation of the verdict (Chinese text in this module)
    reason: str
    # The input text after _normalize(), as used for comparison
    normalized_text: str
    # Best-matching Word line(s) for display, or None when nothing matched
    matched_excerpt: str | None
# 圆圈序号 ①②③...⑳NFKC 之前处理,避免转为数字后难以区分)
_CIRCLED_NUM_RE = re.compile(r"^[①-⑳]")
# 数字列表前缀:"1. " "2." "3. " 等NFKC 之后处理)
_LIST_NUM_RE = re.compile(r"^\d{1,2}[.\s]+")
# 句末/列表标点(中英文等价符,忽略差异;保留小数点和冒号)
_PUNCT_RE = re.compile(r"[,。;!?、…,;!?]")
def _normalize(text: str) -> str:
"""Collapse whitespace and normalise unicode for comparison.
额外处理:
- 去掉首部圆圈序号①②③和数字列表前缀1. 2.
- 忽略中英文标点差异vs ,.)
- 统一 dash 并去掉 dash 两侧空格50 60 → 50-60
"""
text = text.lstrip()
# 先去圆圈序号(在 NFKC 前,避免 ③→3 后与普通数字混淆)
text = _CIRCLED_NUM_RE.sub("", text).lstrip()
# Unicode 归一化(全角→半角、① → 1、→ :、(→ ( 等)
text = unicodedata.normalize("NFKC", text)
# Strip markdown bold/italic markers
text = re.sub(r"\*+", "", text)
# 破折号变体归一化en-dash / em-dash / minus sign → hyphen
text = re.sub(r"[–—−]", "-", text)
# 去掉 dash 两侧空格:"50 - 60" → "50-60"
text = re.sub(r"\s*-\s*", "-", text)
# 去掉数字列表前缀NFKC 后,如 "3. " "4. "
text = _LIST_NUM_RE.sub("", text.lstrip())
# 忽略句末/列表标点差异
text = _PUNCT_RE.sub("", text)
text = re.sub(r"\s+", " ", text).strip()
return text
def _is_garbled(text: str) -> bool:
    """Return True when text is empty, too short, or mostly non-printable."""
    if not text or len(text) < MIN_TEXT_LENGTH:
        return True
    # Count characters outside the Unicode "C*" (control/format/…) categories;
    # less than half printable means the text is treated as garbled.
    visible = sum(not unicodedata.category(ch).startswith("C") for ch in text)
    return visible / len(text) < 0.5
def _word_lines(word_text: str) -> list[str]:
    """Split Word Markdown into non-empty normalised lines for matching.

    Grid-table separator rows (e.g. ``+-----+-----+``) carry no semantic
    content and would skew similarity scores, so they are dropped.
    """
    sep_re = re.compile(r"^[+\-=| ]+$")
    result: list[str] = []
    for raw_line in word_text.splitlines():
        normalised = _normalize(raw_line)
        # Keep only non-empty lines that are not pandoc grid-table rules.
        if normalised and not sep_re.match(normalised.replace(" ", "")):
            result.append(normalised)
    return result
def _match_against_line(needle: str, line: str) -> tuple[float, str]:
"""Return (ratio, excerpt) for needle vs a single Word line.
When the needle (MinerU row) is significantly shorter than the Word line
(because the Word table has more product columns), a plain
SequenceMatcher ratio under-counts matching content. We also compute
*needle coverage* — the fraction of the needle's characters that appear
in the line — and take the higher of the two scores.
"""
# Exact substring
if needle in line:
idx = line.index(needle)
return 1.0, line[idx: idx + len(needle) + 20].strip()
matcher = SequenceMatcher(None, needle, line, autojunk=False)
ratio = matcher.ratio()
# Coverage ratio: useful when MinerU row is a partial view of a wider table
if len(needle) > 0 and len(needle) < len(line):
match_chars = sum(t for _, _, t in matcher.get_matching_blocks())
coverage = match_chars / len(needle)
# Apply a small discount to avoid false positives on very short needles
ratio = max(ratio, coverage * 0.95)
# 表格行(含 | 分隔符)可能很长,给更多上下文以便前端完整渲染
max_len = 400 if line.lstrip().startswith("|") else 120
return ratio, line[:max_len].strip()
def _match_single_line(norm: str, word_lines: list[str]) -> tuple[float, str]:
    """Scan word_lines for the line most similar to norm.

    Returns (best_ratio, best_excerpt); stops early on a perfect hit.
    """
    best = (0.0, "")
    for candidate in word_lines:
        score, excerpt = _match_against_line(norm, candidate)
        if score > best[0]:
            best = (score, excerpt)
            if score == 1.0:
                break
    return best
def _validate_table_against_word(raw_rows: list[str], word_text: str) -> ValidationResult:
    """Match a multi-row table row by row and aggregate the hit rate.

    Strategy
    --------
    - Each row is matched individually; a row counts as a hit when it
      reaches TABLE_ROW_SINGLE_THRESHOLD.
    - The table matches as a whole when the hit rate reaches
      TABLE_ROW_MATCH_THRESHOLD (50%).
    - matched_excerpt collects the Word excerpts of the hit rows so the
      front end can render them as a table.
    """
    word_lines = _word_lines(word_text)
    norm_full = _normalize(" ".join(raw_rows))
    if not word_lines:
        return ValidationResult(
            status="unmatched",
            reason="Word 文档为空",
            normalized_text=norm_full,
            matched_excerpt=None,
        )

    matched = 0
    skipped = 0
    excerpts: list[str] = []
    seen_excerpts: set[str] = set()
    for row in raw_rows:
        norm_row = _normalize(row)
        # Garbled/empty rows are excluded from the denominator entirely.
        if _is_garbled(norm_row):
            skipped += 1
            continue
        ratio, excerpt = _match_single_line(norm_row, word_lines)
        if ratio < TABLE_ROW_SINGLE_THRESHOLD:
            continue
        matched += 1
        if excerpt and excerpt not in seen_excerpts:
            seen_excerpts.add(excerpt)
            excerpts.append(excerpt)

    valid_count = len(raw_rows) - skipped
    if valid_count == 0:
        return ValidationResult(
            status="empty_or_garbled",
            reason="表格文本为空或全部为乱码",
            normalized_text=norm_full,
            matched_excerpt=None,
        )

    match_rate = matched / valid_count
    excerpt_text = "\n".join(excerpts) if excerpts else None
    if match_rate >= TABLE_ROW_MATCH_THRESHOLD:
        return ValidationResult(
            status="matched",
            reason=f"表格 {matched}/{valid_count} 行与 Word 匹配(命中率 {match_rate:.0%}",
            normalized_text=norm_full,
            matched_excerpt=excerpt_text,
        )
    return ValidationResult(
        status="unmatched",
        reason=f"表格仅 {matched}/{valid_count} 行与 Word 匹配(命中率 {match_rate:.0%},阈值 {TABLE_ROW_MATCH_THRESHOLD:.0%}",
        normalized_text=norm_full,
        matched_excerpt=excerpt_text,
    )
def validate_field_against_word(text: str, word_text: str) -> ValidationResult:
    """Check whether *text* matches any line of *word_text*.

    - Single-line text: find the most similar Word line; a similarity of at
      least MATCH_THRESHOLD (0.95) counts as a match.
    - Multi-line text (tables): match row by row; an overall hit rate of at
      least TABLE_ROW_MATCH_THRESHOLD (50%) counts as a match.

    Parameters
    ----------
    text:
        The OCR-extracted text block to validate.
    word_text:
        Full Markdown text extracted from the reference Word document.

    Returns
    -------
    ValidationResult
        Contains status, a human-readable reason, the normalised text,
        and the best-matching line from the Word document (if any).
    """
    # Multi-line text (table): delegate to per-row matching.
    raw_rows = [r.strip() for r in text.splitlines() if r.strip()]
    if len(raw_rows) > 1:
        return _validate_table_against_word(raw_rows, word_text)
    # Single-line matching.
    norm = _normalize(text)
    if _is_garbled(norm):
        return ValidationResult(
            status="empty_or_garbled",
            reason="文本为空或包含乱码",
            normalized_text=norm,
            matched_excerpt=None,
        )
    word_lines = _word_lines(word_text)
    if not word_lines:
        return ValidationResult(
            status="unmatched",
            reason="Word 文档为空",
            normalized_text=norm,
            matched_excerpt=None,
        )
    best_ratio, best_excerpt = _match_single_line(norm, word_lines)
    # Exact hit (substring or identical line) short-circuits the threshold check.
    if best_ratio == 1.0:
        return ValidationResult(
            status="matched",
            reason="与 Word 某行内容完全匹配",
            normalized_text=norm,
            matched_excerpt=best_excerpt,
        )
    if best_ratio >= MATCH_THRESHOLD:
        return ValidationResult(
            status="matched",
            reason=f"与 Word 某行相似度 {best_ratio:.0%},判定为匹配",
            normalized_text=norm,
            matched_excerpt=best_excerpt,
        )
    # Below threshold: still surface the closest excerpt (if any) for review.
    return ValidationResult(
        status="unmatched",
        reason=f"在 Word 中未找到匹配行(最高相似度 {best_ratio:.0%}",
        normalized_text=norm,
        matched_excerpt=best_excerpt or None,
    )