Files
ZLD_POC/tests/backend/test_text_validation.py
2026-04-15 17:18:49 +08:00

33 lines
1.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from backend.app.text_validation import classify_text_block, normalize_text, validate_field_against_word
def test_normalize_text_collapses_whitespace_and_full_width_punctuation() -> None:
    """Check normalize_text on a padded sample that is split by a newline.

    The input carries a leading space, a trailing space and a line break
    between the label and the licence number; the expected output is the
    same text on one line with none of that whitespace.
    """
    # NOTE(review): the test name mentions full-width punctuation — confirm
    # that the colon in the input literal is the full-width form in the
    # checked-in file.
    messy_input = " 食品生产许可证编号:\nSC11133042404806 "
    expected_output = "食品生产许可证编号:SC11133042404806"

    cleaned = normalize_text(messy_input)

    assert cleaned == expected_output
def test_classify_text_block_marks_garbled_text() -> None:
    """Check the label classify_text_block assigns to three sample blocks.

    The first two samples are expected to be labelled "empty_or_garbled";
    the last one, a readable label/value pair, is expected to be labelled
    "candidate".
    """
    # Each entry is (sample text, expected label); the order matches the
    # order in which the cases are evaluated.
    # NOTE(review): the first sample looks like an escaped rendering of
    # U+FFFD replacement characters — confirm against the checked-in file.
    expectations = (
        ("<EFBFBD><EFBFBD><EFBFBD><EFBFBD>-<2D><>-<2D><>", "empty_or_garbled"),
        (" ", "empty_or_garbled"),
        ("食品名称:天问礼品粽", "candidate"),
    )

    for sample, expected_label in expectations:
        assert classify_text_block(sample) == expected_label
def test_validate_field_against_word_returns_excerpt_for_match() -> None:
    """Check the result when the field value occurs inside the Word text.

    The result is expected to report status "matched", the fixed reason
    string, and an excerpt that contains the licence number.
    """
    field_value = "食品生产许可证编号SC11133042404806"
    document_body = "电话0573-86981666 食品生产许可证编号SC11133042404806 产品标准代号GB/T 46259"

    outcome = validate_field_against_word(field_value, document_body)

    assert outcome.status == "matched"
    assert outcome.reason == "normalized text found in Word content"
    # A missing excerpt is treated as an empty string so the membership
    # check fails with a plain assertion error rather than a TypeError.
    excerpt = outcome.matched_excerpt or ""
    assert "SC11133042404806" in excerpt
def test_validate_field_against_word_rejects_missing_text() -> None:
    """Check the result when the field value is absent from the Word text.

    The result is expected to report status "unmatched" and to carry no
    excerpt (``matched_excerpt`` is ``None``).
    """
    field_value = "食品生产许可证编号SC11133042404806"
    # The document body holds only the product-standard entry, so the
    # licence number being looked up does not appear in it.
    document_body = "产品标准代号GB/T 46259"

    outcome = validate_field_against_word(field_value, document_body)

    assert outcome.status == "unmatched"
    assert outcome.matched_excerpt is None