Initial commit: 包装审核 POC、Docker 与前后端

Made-with: Cursor
This commit is contained in:
2026-04-15 17:18:49 +08:00
commit bbb4dd43b3
74 changed files with 297415 additions and 0 deletions

View File

@@ -0,0 +1,266 @@
"""Validate extracted text blocks against a Word document's content."""
from __future__ import annotations
import re
import unicodedata
from dataclasses import dataclass
from difflib import SequenceMatcher
# Minimum similarity score for a single line to count as a match
# (strict: the content must be nearly identical).
MATCH_THRESHOLD = 0.95
# Multi-row tables: minimum similarity score for one row to count as matched.
TABLE_ROW_SINGLE_THRESHOLD = 0.95
# Multi-row tables: fraction of valid (non-garbled) rows that must match for
# the table as a whole to count as matched.
TABLE_ROW_MATCH_THRESHOLD = 0.5
# Text shorter than this many characters is treated as too short to validate.
MIN_TEXT_LENGTH = 2
@dataclass
class ValidationResult:
    """Verdict produced when one extracted text block is checked against Word.

    The fields are positional in this order: ``status``, ``reason``,
    ``normalized_text``, ``matched_excerpt``.
    """

    # One of "matched", "unmatched" or "empty_or_garbled".
    status: str
    # Human-readable explanation of why this status was chosen.
    reason: str
    # The validated text after normalisation.
    normalized_text: str
    # Excerpt of the Word text the block was compared with, or None when
    # there is nothing to show.
    matched_excerpt: str | None
# 圆圈序号 ①②③...⑳NFKC 之前处理,避免转为数字后难以区分)
_CIRCLED_NUM_RE = re.compile(r"^[①-⑳]")
# 数字列表前缀:"1. " "2." "3. " 等NFKC 之后处理)
_LIST_NUM_RE = re.compile(r"^\d{1,2}[.\s]+")
# 句末/列表标点(中英文等价符,忽略差异;保留小数点和冒号)
_PUNCT_RE = re.compile(r"[,。;!?、…,;!?]")
def _normalize(text: str) -> str:
"""Collapse whitespace and normalise unicode for comparison.
额外处理:
- 去掉首部圆圈序号①②③和数字列表前缀1. 2.
- 忽略中英文标点差异vs ,.)
- 统一 dash 并去掉 dash 两侧空格50 60 → 50-60
"""
text = text.lstrip()
# 先去圆圈序号(在 NFKC 前,避免 ③→3 后与普通数字混淆)
text = _CIRCLED_NUM_RE.sub("", text).lstrip()
# Unicode 归一化(全角→半角、① → 1、→ :、(→ ( 等)
text = unicodedata.normalize("NFKC", text)
# Strip markdown bold/italic markers
text = re.sub(r"\*+", "", text)
# 破折号变体归一化en-dash / em-dash / minus sign → hyphen
text = re.sub(r"[–—−]", "-", text)
# 去掉 dash 两侧空格:"50 - 60" → "50-60"
text = re.sub(r"\s*-\s*", "-", text)
# 去掉数字列表前缀NFKC 后,如 "3. " "4. "
text = _LIST_NUM_RE.sub("", text.lstrip())
# 忽略句末/列表标点差异
text = _PUNCT_RE.sub("", text)
text = re.sub(r"\s+", " ", text).strip()
return text
def _is_garbled(text: str) -> bool:
    """Tell whether *text* is unusable for validation.

    Text counts as garbled when it is empty, shorter than
    ``MIN_TEXT_LENGTH`` characters, or when fewer than half of its
    characters fall outside the Unicode "C" (control / format / unassigned
    / private-use / surrogate) general categories.
    """
    length = len(text)
    if length == 0 or length < MIN_TEXT_LENGTH:
        return True
    # Count the characters whose general category starts with "C".
    control_like = sum(unicodedata.category(ch)[0] == "C" for ch in text)
    return (length - control_like) / length < 0.5
# Rows made up solely of table-border characters: pandoc grid-table rules
# ("+-----+=====+"), pipe-table delimiter rows ("|---|---|") and their
# alignment-colon variants ("|:---|---:|", "+:====+====:+").
_WORD_SEPARATOR_ROW_RE = re.compile(r"^[+\-=|: ]+$")


def _word_lines(word_text: str) -> list[str]:
    """Split the Word Markdown into non-empty, normalised lines.

    Every line is passed through ``_normalize``. Lines that normalise to
    an empty string are dropped, and so are table separator rows (grid
    rules, pipe-table delimiter rows, including alignment colons): they
    carry no semantic content and would only skew similarity scores.

    Parameters
    ----------
    word_text:
        Full Markdown text extracted from the Word document.

    Returns
    -------
    list[str]
        Normalised content lines in document order.
    """
    lines: list[str] = []
    for raw in word_text.splitlines():
        norm = _normalize(raw)
        if not norm or _WORD_SEPARATOR_ROW_RE.match(norm):
            continue
        lines.append(norm)
    return lines
def _match_against_line(needle: str, line: str) -> tuple[float, str]:
"""Return (ratio, excerpt) for needle vs a single Word line.
When the needle (MinerU row) is significantly shorter than the Word line
(because the Word table has more product columns), a plain
SequenceMatcher ratio under-counts matching content. We also compute
*needle coverage* — the fraction of the needle's characters that appear
in the line — and take the higher of the two scores.
"""
# Exact substring
if needle in line:
idx = line.index(needle)
return 1.0, line[idx: idx + len(needle) + 20].strip()
matcher = SequenceMatcher(None, needle, line, autojunk=False)
ratio = matcher.ratio()
# Coverage ratio: useful when MinerU row is a partial view of a wider table
if len(needle) > 0 and len(needle) < len(line):
match_chars = sum(t for _, _, t in matcher.get_matching_blocks())
coverage = match_chars / len(needle)
# Apply a small discount to avoid false positives on very short needles
ratio = max(ratio, coverage * 0.95)
# 表格行(含 | 分隔符)可能很长,给更多上下文以便前端完整渲染
max_len = 400 if line.lstrip().startswith("|") else 120
return ratio, line[:max_len].strip()
def _match_single_line(norm: str, word_lines: list[str]) -> tuple[float, str]:
    """Find the Word line most similar to *norm*.

    Each entry of *word_lines* is scored with ``_match_against_line``; the
    first line with the strictly highest score wins. Scanning stops early
    once a perfect score of 1.0 has been found.

    Returns
    -------
    tuple[float, str]
        ``(best_ratio, best_excerpt)``; ``(0.0, "")`` when no line scores
        above zero.
    """
    top_score, top_excerpt = 0.0, ""
    for candidate in word_lines:
        score, snippet = _match_against_line(norm, candidate)
        if score <= top_score:
            continue
        top_score, top_excerpt = score, snippet
        if top_score == 1.0:
            # Nothing can beat an exact match.
            break
    return top_score, top_excerpt
def _validate_table_against_word(raw_rows: list[str], word_text: str) -> ValidationResult:
    """Validate a multi-row table row by row and aggregate the hit rate.

    Strategy
    --------
    - Each row is normalised and matched on its own; a row counts as a hit
      when its best score reaches ``TABLE_ROW_SINGLE_THRESHOLD``.
    - Rows that are empty or garbled after normalisation are skipped and do
      not count towards the denominator.
    - The table as a whole is "matched" when hits / valid rows reaches
      ``TABLE_ROW_MATCH_THRESHOLD``.
    - ``matched_excerpt`` collects the distinct Word excerpts of the rows
      that hit, joined by newlines, so the consumer can render them as a
      table.

    Parameters
    ----------
    raw_rows:
        The table's rows as raw text lines.
    word_text:
        Full Markdown text extracted from the Word document.

    Returns
    -------
    ValidationResult
        "unmatched" when the Word text has no usable lines or the hit rate
        is too low, "empty_or_garbled" when no row is usable, otherwise
        "matched".
    """
    norm_full = _normalize(" ".join(raw_rows))

    word_lines = _word_lines(word_text)
    if not word_lines:
        return ValidationResult(
            status="unmatched",
            reason="Word 文档为空",
            normalized_text=norm_full,
            matched_excerpt=None,
        )

    matched = 0
    skipped = 0
    excerpts: list[str] = []
    seen_excerpts: set[str] = set()
    for row in raw_rows:
        norm_row = _normalize(row)
        if _is_garbled(norm_row):
            skipped += 1
            continue
        ratio, exc = _match_single_line(norm_row, word_lines)
        if ratio >= TABLE_ROW_SINGLE_THRESHOLD:
            matched += 1
            # Several rows may hit the same Word line; report it only once.
            if exc and exc not in seen_excerpts:
                excerpts.append(exc)
                seen_excerpts.add(exc)

    valid_count = len(raw_rows) - skipped
    if valid_count == 0:
        return ValidationResult(
            status="empty_or_garbled",
            reason="表格文本为空或全部为乱码",
            normalized_text=norm_full,
            matched_excerpt=None,
        )

    match_rate = matched / valid_count
    excerpt_text = "\n".join(excerpts) if excerpts else None
    if match_rate >= TABLE_ROW_MATCH_THRESHOLD:
        return ValidationResult(
            status="matched",
            # The closing parenthesis was missing from this message.
            reason=f"表格 {matched}/{valid_count} 行与 Word 匹配(命中率 {match_rate:.0%})",
            normalized_text=norm_full,
            matched_excerpt=excerpt_text,
        )
    return ValidationResult(
        status="unmatched",
        # The closing parenthesis was missing from this message.
        reason=f"表格仅 {matched}/{valid_count} 行与 Word 匹配(命中率 {match_rate:.0%},阈值 {TABLE_ROW_MATCH_THRESHOLD:.0%})",
        normalized_text=norm_full,
        matched_excerpt=excerpt_text,
    )
def validate_field_against_word(text: str, word_text: str) -> ValidationResult:
    """Check whether *text* matches the content of *word_text*.

    - Single-line text: the most similar Word line is located; a score of
      at least ``MATCH_THRESHOLD`` counts as a match.
    - Multi-line text (a table): every non-blank line is matched on its
      own and the table matches when the hit rate reaches
      ``TABLE_ROW_MATCH_THRESHOLD``.

    Parameters
    ----------
    text:
        The OCR-extracted text block to validate.
    word_text:
        Full Markdown text extracted from the reference Word document.

    Returns
    -------
    ValidationResult
        Contains status, a human-readable reason, the normalised text,
        and the best-matching line from the Word document (if any).
    """
    # Multi-line text is treated as a table and validated row by row.
    raw_rows = [r.strip() for r in text.splitlines() if r.strip()]
    if len(raw_rows) > 1:
        return _validate_table_against_word(raw_rows, word_text)

    # Single-line path.
    norm = _normalize(text)
    if _is_garbled(norm):
        return ValidationResult(
            status="empty_or_garbled",
            reason="文本为空或包含乱码",
            normalized_text=norm,
            matched_excerpt=None,
        )

    word_lines = _word_lines(word_text)
    if not word_lines:
        return ValidationResult(
            status="unmatched",
            reason="Word 文档为空",
            normalized_text=norm,
            matched_excerpt=None,
        )

    best_ratio, best_excerpt = _match_single_line(norm, word_lines)
    if best_ratio == 1.0:
        return ValidationResult(
            status="matched",
            reason="与 Word 某行内容完全匹配",
            normalized_text=norm,
            matched_excerpt=best_excerpt,
        )
    if best_ratio >= MATCH_THRESHOLD:
        return ValidationResult(
            status="matched",
            reason=f"与 Word 某行相似度 {best_ratio:.0%},判定为匹配",
            normalized_text=norm,
            matched_excerpt=best_excerpt,
        )
    return ValidationResult(
        status="unmatched",
        # The closing parenthesis was missing from this message.
        reason=f"在 Word 中未找到匹配行(最高相似度 {best_ratio:.0%})",
        normalized_text=norm,
        # Still surface the closest line, if any, to help manual review.
        matched_excerpt=best_excerpt or None,
    )