from backend.app.text_validation import (
    classify_text_block,
    normalize_text,
    validate_field_against_word,
)


def test_normalize_text_collapses_whitespace_and_full_width_punctuation() -> None:
    """Check that normalize_text drops the padding and the embedded newline.

    NOTE(review): the test name mentions full-width punctuation, but the
    literal below only shows an ASCII colon -- confirm the fixture still
    exercises that case.
    """
    messy = " 食品生产许可证编号:\nSC11133042404806 "
    expected = "食品生产许可证编号:SC11133042404806"

    assert normalize_text(messy) == expected


def test_classify_text_block_marks_garbled_text() -> None:
    """Check the label classify_text_block assigns to each sample block."""
    # (sample text, expected classification), checked in this order.
    cases = (
        ("����-��-��", "empty_or_garbled"),
        (" ", "empty_or_garbled"),
        ("食品名称:天问礼品粽", "candidate"),
    )

    for sample, expected_label in cases:
        assert classify_text_block(sample) == expected_label


def test_validate_field_against_word_returns_excerpt_for_match() -> None:
    """Check that a field present in the Word text is reported as matched."""
    field_text = "食品生产许可证编号:SC11133042404806"
    document_text = "电话:0573-86981666 食品生产许可证编号:SC11133042404806 产品标准代号:GB/T 46259"

    outcome = validate_field_against_word(field_text, document_text)

    assert outcome.status == "matched"
    assert outcome.reason == "normalized text found in Word content"
    # A missing excerpt is treated as an empty string so the membership
    # check fails with an assertion error rather than a TypeError.
    excerpt = outcome.matched_excerpt or ""
    assert "SC11133042404806" in excerpt


def test_validate_field_against_word_rejects_missing_text() -> None:
    """Check that a field absent from the Word text is reported as unmatched."""
    field_text = "食品生产许可证编号:SC11133042404806"
    document_text = "产品标准代号:GB/T 46259"

    outcome = validate_field_against_word(field_text, document_text)

    assert outcome.status == "unmatched"
    assert outcome.matched_excerpt is None