Initial commit: 包装审核 POC、Docker 与前后端

Made-with: Cursor
2026-04-15 17:18:49 +08:00
commit bbb4dd43b3
74 changed files with 297415 additions and 0 deletions
--- a/backend/app/text_validation.py
+++ b/backend/app/text_validation.py
@@ -0,0 +1,266 @@
+"""Validate extracted text blocks against a Word document's content."""
+from __future__ import annotations
+
+import re
+import unicodedata
+from dataclasses import dataclass
+from difflib import SequenceMatcher
+
+
+# Minimum score for a single-line text to count as a match (strict: content
+# must be nearly identical).  The score compared against this value is the one
+# returned by _match_single_line, which may be a SequenceMatcher ratio or a
+# discounted needle-coverage score.
+MATCH_THRESHOLD = 0.95
+# For multi-row tables: minimum score for an individual row to count as matched
+TABLE_ROW_SINGLE_THRESHOLD = 0.95
+# For multi-row tables: fraction of valid (non-garbled) rows that must match
+TABLE_ROW_MATCH_THRESHOLD = 0.5
+# Text shorter than this many characters is treated as too short to validate
+MIN_TEXT_LENGTH = 2
+
+
+@dataclass
+class ValidationResult:
+    """Outcome of validating one extracted text block against the Word text."""
+
+    # One of "matched" | "unmatched" | "empty_or_garbled"
+    status: str
+    # Human-readable explanation of how the status was reached
+    reason: str
+    # The input text after normalisation
+    normalized_text: str
+    # Excerpt taken from the Word text for the best/matching line(s); None when
+    # there is nothing to show
+    matched_excerpt: str | None
+
+
+# Leading circled ordinal ①..⑳.  Stripped BEFORE NFKC, because NFKC turns it
+# into plain digits that can no longer be told apart from real numbers.
+_CIRCLED_NUM_RE = re.compile(r"^[①-⑳]")
+# Leading list-number prefix such as "1. ", "2." or "3 " (applied AFTER NFKC).
+# A dot directly followed by a digit is a decimal point, not a list marker, so
+# "1.5 kg" is left intact instead of being truncated to "5 kg".
+# NOTE(review): a bare "NN " prefix is still stripped, so "50 g" loses its
+# number -- confirm whether that is intended.
+_LIST_NUM_RE = re.compile(r"^\d{1,2}(?:\.(?!\d)|\s)[.\s]*")
+# Sentence/list punctuation whose CJK and ASCII forms are treated as equivalent
+# and dropped; the decimal point and the colon are deliberately kept.
+_PUNCT_RE = re.compile(r"[，。；！？、…,;!?]")
+
+
+def _normalize(text: str) -> str:
+    """Return *text* in a canonical form suitable for fuzzy comparison.
+
+    Steps, in order:
+
+    1. Strip leading whitespace and one leading circled ordinal (①..⑳).
+    2. Apply Unicode NFKC (full-width → half-width, ``：`` → ``:`` etc.).
+    3. Remove Markdown emphasis asterisks.
+    4. Map en dash, em dash and minus sign to ``-`` and drop whitespace
+       around every hyphen (``50 – 60`` → ``50-60``).
+    5. Strip a leading list-number prefix (``1. ``, ``2.``); a leading
+       decimal such as ``1.5`` is preserved.
+    6. Drop sentence/list punctuation (see ``_PUNCT_RE``).
+    7. Collapse whitespace runs to a single space and trim both ends.
+    """
+    text = text.lstrip()
+    # Circled ordinals must go before NFKC (③ would otherwise become "3").
+    text = _CIRCLED_NUM_RE.sub("", text).lstrip()
+    text = unicodedata.normalize("NFKC", text)
+    # Strip markdown bold/italic markers
+    text = re.sub(r"\*+", "", text)
+    # Dash variants: en dash / em dash / minus sign → hyphen
+    text = re.sub(r"[–—−]", "-", text)
+    # Remove whitespace around hyphens: "50 - 60" → "50-60"
+    text = re.sub(r"\s*-\s*", "-", text)
+    # List-number prefix is removed after NFKC so full-width digits are covered.
+    text = _LIST_NUM_RE.sub("", text.lstrip())
+    # Ignore sentence/list punctuation differences
+    text = _PUNCT_RE.sub("", text)
+    return re.sub(r"\s+", " ", text).strip()
+
+
+def _is_garbled(text: str) -> bool:
+    """Tell whether *text* is unusable for validation.
+
+    Text is rejected when it is empty, shorter than ``MIN_TEXT_LENGTH``, or
+    when fewer than half of its characters fall outside the Unicode "C"
+    (other/control) general categories.
+    """
+    if not text:
+        return True
+    total = len(text)
+    if total < MIN_TEXT_LENGTH:
+        return True
+    visible = 0
+    for ch in text:
+        if not unicodedata.category(ch).startswith("C"):
+            visible += 1
+    return visible / total < 0.5
+
+
+# Rows made up only of table-border characters, e.g. ``+-----+-----+``,
+# ``+:====+====:+`` or ``|:---|---:|``.  The colon covers the alignment
+# markers that may appear in grid-table and pipe-table separator rows.
+_TABLE_SEP_RE = re.compile(r"[+\-=|: ]+")
+
+
+def _word_lines(word_text: str) -> list[str]:
+    """Split Word Markdown into non-empty normalised lines for matching.
+
+    Each raw line is passed through ``_normalize``.  Lines that normalise to
+    an empty string are dropped, and so are table separator rows (including
+    ones carrying ``:`` alignment markers): they hold no semantic content and
+    would skew similarity scores.
+
+    Parameters
+    ----------
+    word_text:
+        Markdown text of the reference Word document.
+
+    Returns
+    -------
+    list[str]
+        Normalised content lines, in document order.
+    """
+    lines: list[str] = []
+    for raw in word_text.splitlines():
+        norm = _normalize(raw)
+        if not norm:
+            continue
+        # The pattern already allows spaces, so the whole line is tested as-is.
+        if _TABLE_SEP_RE.fullmatch(norm):
+            continue
+        lines.append(norm)
+    return lines
+
+
+def _match_against_line(needle: str, line: str) -> tuple[float, str]:
+    """Score *needle* against one Word line and return ``(score, excerpt)``.
+
+    - If *needle* occurs verbatim in *line*, the score is 1.0 and the excerpt
+      is the occurrence plus up to 20 following characters.
+    - Otherwise the score starts as the ``SequenceMatcher`` ratio.  When the
+      needle is shorter than the line, the *needle coverage* (total size of
+      the matching blocks divided by the needle length) is also computed,
+      scaled by 0.95, and the larger of the two values is used.
+
+    For the fuzzy path the excerpt is the beginning of *line*: up to 400
+    characters when the line starts with ``|``, otherwise up to 120.
+    """
+    # Fast path: verbatim containment.
+    pos = line.find(needle)
+    if pos != -1:
+        window_end = pos + len(needle) + 20
+        return 1.0, line[pos:window_end].strip()
+
+    sm = SequenceMatcher(None, needle, line, autojunk=False)
+    score = sm.ratio()
+
+    # Coverage only applies when the needle is a shorter, non-empty string.
+    needle_len = len(needle)
+    if 0 < needle_len < len(line):
+        shared = sum(block.size for block in sm.get_matching_blocks())
+        covered = shared / needle_len
+        score = max(score, covered * 0.95)
+
+    # Lines starting with "|" get a longer excerpt window.
+    limit = 400 if line.lstrip().startswith("|") else 120
+    return score, line[:limit].strip()
+
+
+def _match_single_line(norm: str, word_lines: list[str]) -> tuple[float, str]:
+    """Find the line in *word_lines* that scores highest against *norm*.
+
+    Returns ``(best_score, best_excerpt)``; ``(0.0, "")`` when no line scores
+    above zero.  On ties the earliest line wins, and the scan stops as soon
+    as a perfect score of 1.0 is reached.
+    """
+    top_score, top_excerpt = 0.0, ""
+    for candidate in word_lines:
+        score, excerpt = _match_against_line(norm, candidate)
+        if score > top_score:
+            top_score, top_excerpt = score, excerpt
+            # Nothing can beat a perfect score, so stop scanning.
+            if top_score == 1.0:
+                break
+    return top_score, top_excerpt
+
+
+def _validate_table_against_word(raw_rows: list[str], word_text: str) -> ValidationResult:
+    """Validate a multi-row table row by row and aggregate the hit rate.
+
+    Strategy
+    --------
+    - If the Word text yields no content lines, the result is "unmatched".
+    - Each row is normalised; garbled rows are left out of the statistics.
+    - A row counts as a hit when its best single-line score reaches
+      ``TABLE_ROW_SINGLE_THRESHOLD``.
+    - If no row is usable, the result is "empty_or_garbled".
+    - The table is "matched" when hits / usable rows reaches
+      ``TABLE_ROW_MATCH_THRESHOLD``, otherwise "unmatched".
+    - ``matched_excerpt`` joins the distinct Word excerpts of the hit rows
+      with newlines (``None`` when there are none).
+    """
+    reference_lines = _word_lines(word_text)
+    if not reference_lines:
+        return ValidationResult(
+            status="unmatched",
+            reason="Word 文档为空",
+            normalized_text=_normalize(" ".join(raw_rows)),
+            matched_excerpt=None,
+        )
+
+    matched = 0
+    valid_count = 0
+    # A dict keeps insertion order, giving an ordered de-duplicated collection.
+    unique_excerpts: dict[str, None] = {}
+
+    for raw in raw_rows:
+        candidate = _normalize(raw)
+        if _is_garbled(candidate):
+            continue
+        valid_count += 1
+        score, excerpt = _match_single_line(candidate, reference_lines)
+        if score >= TABLE_ROW_SINGLE_THRESHOLD:
+            matched += 1
+            if excerpt:
+                unique_excerpts.setdefault(excerpt, None)
+
+    combined = _normalize(" ".join(raw_rows))
+
+    if valid_count == 0:
+        return ValidationResult(
+            status="empty_or_garbled",
+            reason="表格文本为空或全部为乱码",
+            normalized_text=combined,
+            matched_excerpt=None,
+        )
+
+    match_rate = matched / valid_count
+    excerpt_text = "\n".join(unique_excerpts) if unique_excerpts else None
+
+    if match_rate >= TABLE_ROW_MATCH_THRESHOLD:
+        status = "matched"
+        reason = f"表格 {matched}/{valid_count} 行与 Word 匹配（命中率 {match_rate:.0%}）"
+    else:
+        status = "unmatched"
+        reason = f"表格仅 {matched}/{valid_count} 行与 Word 匹配（命中率 {match_rate:.0%}，阈值 {TABLE_ROW_MATCH_THRESHOLD:.0%}）"
+
+    return ValidationResult(
+        status=status,
+        reason=reason,
+        normalized_text=combined,
+        matched_excerpt=excerpt_text,
+    )
+
+
+def validate_field_against_word(text: str, word_text: str) -> ValidationResult:
+    """Check whether *text* matches the content of *word_text*.
+
+    - Multi-line text (more than one non-blank line) is treated as a table
+      and delegated to ``_validate_table_against_word``.
+    - Single-line text is normalised and compared with every Word line; it
+      is "matched" when the best score is 1.0 or reaches ``MATCH_THRESHOLD``.
+
+    Parameters
+    ----------
+    text:
+        The OCR-extracted text block to validate.
+    word_text:
+        Full Markdown text extracted from the reference Word document.
+
+    Returns
+    -------
+    ValidationResult
+        Status, a human-readable reason, the normalised text, and the
+        best-matching excerpt from the Word document (if any).
+    """
+    # Table path: more than one non-blank row.
+    rows = [stripped for stripped in (ln.strip() for ln in text.splitlines()) if stripped]
+    if len(rows) > 1:
+        return _validate_table_against_word(rows, word_text)
+
+    norm = _normalize(text)
+
+    def _result(status: str, reason: str, excerpt: str | None) -> ValidationResult:
+        # Every single-line outcome shares the same normalised text.
+        return ValidationResult(
+            status=status,
+            reason=reason,
+            normalized_text=norm,
+            matched_excerpt=excerpt,
+        )
+
+    if _is_garbled(norm):
+        return _result("empty_or_garbled", "文本为空或包含乱码", None)
+
+    word_lines = _word_lines(word_text)
+    if not word_lines:
+        return _result("unmatched", "Word 文档为空", None)
+
+    best_ratio, best_excerpt = _match_single_line(norm, word_lines)
+
+    if best_ratio == 1.0:
+        return _result("matched", "与 Word 某行内容完全匹配", best_excerpt)
+    if best_ratio >= MATCH_THRESHOLD:
+        return _result(
+            "matched",
+            f"与 Word 某行相似度 {best_ratio:.0%}，判定为匹配",
+            best_excerpt,
+        )
+    return _result(
+        "unmatched",
+        f"在 Word 中未找到匹配行（最高相似度 {best_ratio:.0%}）",
+        best_excerpt or None,
+    )