"""Unit tests for the helpers imported from backend.app.text_validation."""

from backend.app.text_validation import classify_text_block, normalize_text, validate_field_against_word


def test_normalize_text_collapses_whitespace_and_full_width_punctuation() -> None:
    """Check that normalize_text drops whitespace and folds full-width punctuation.

    The expected value contains no whitespace and an ASCII colon, so every
    input below must normalize to exactly that string.
    """
    expected = "食品生产许可证编号:SC11133042404806"

    # Leading/trailing spaces plus an embedded newline; the colon is ASCII.
    raw = " 食品生产许可证编号:\nSC11133042404806 "
    assert normalize_text(raw) == expected

    # Same text with a FULLWIDTH COLON (U+FF1A), written as an escape so the
    # case survives editors or pipelines that fold full-width characters.
    # NOTE(review): relies on normalize_text mapping U+FF1A to ":" as the
    # test name states — confirm against the implementation.
    raw_full_width = " 食品生产许可证编号\uff1a\nSC11133042404806 "
    assert normalize_text(raw_full_width) == expected


def test_classify_text_block_marks_garbled_text() -> None:
    """Check the label classify_text_block assigns to three sample blocks."""
    # NOTE(review): the first sample looks like a placeholder rendering of
    # undecodable bytes rather than real replacement characters; it is kept
    # byte-for-byte here — confirm it matches the intended fixture.
    cases = (
        ("<EFBFBD><EFBFBD><EFBFBD><EFBFBD>-<2D><>-<2D><>", "empty_or_garbled"),
        (" ", "empty_or_garbled"),
        ("食品名称:天问礼品粽", "candidate"),
    )
    for text, expected_label in cases:
        assert classify_text_block(text) == expected_label


def test_validate_field_against_word_returns_excerpt_for_match() -> None:
    """A field present in the Word text is reported as matched, with an excerpt."""
    field_text = "食品生产许可证编号:SC11133042404806"
    document_text = "电话:0573-86981666 食品生产许可证编号:SC11133042404806 产品标准代号:GB/T 46259"

    outcome = validate_field_against_word(field_text, document_text)

    assert outcome.status == "matched"
    assert outcome.reason == "normalized text found in Word content"
    # matched_excerpt may be None, so fall back to "" before the substring check.
    excerpt = outcome.matched_excerpt or ""
    assert "SC11133042404806" in excerpt


def test_validate_field_against_word_rejects_missing_text() -> None:
    """A field absent from the Word text is reported as unmatched, with no excerpt."""
    field_text = "食品生产许可证编号:SC11133042404806"
    document_text = "产品标准代号:GB/T 46259"

    outcome = validate_field_against_word(field_text, document_text)

    assert outcome.status == "unmatched"
    assert outcome.matched_excerpt is None