Initial commit: 包装审核 POC、Docker 与前后端

Made-with: Cursor
This commit is contained in:
2026-04-15 17:18:49 +08:00
commit bbb4dd43b3
74 changed files with 297415 additions and 0 deletions

View File

@@ -0,0 +1,85 @@
from types import SimpleNamespace
from backend.app.ai_parser import (
_estimate_text_width,
_estimate_text_width_from_text_matrix,
_page_horizontal_offset,
_text_rect_from_matrix,
)
def test_text_rect_from_matrix_uses_rendered_height_and_baseline() -> None:
    """Check the rectangle computed for a 21 pt food-name line.

    The expected values pin the font size to the matrix's fourth entry and
    the bottom edge to 942.06 minus the matrix's last entry (rounded to two
    decimals), with the top edge exactly one font size above it.
    """
    label = "食品名称: 天问礼品粽 (粽子/草木灰咸鸭蛋)"
    matrix = [19.3618, 0.0, 0.0, 21.0, 435.9155, 629.3184]

    rect = _text_rect_from_matrix(label, matrix, 942.06, None)
    size, left, top, right, bottom = rect

    assert size == 21.0
    assert left == 435.92
    assert top == 291.74
    assert bottom == 312.74
    # Only the direction of the right edge is pinned, not its exact value.
    assert right > left
def test_text_rect_from_matrix_handles_small_text_without_collapsing_height() -> None:
    """Check that an 8 pt line still gets a rectangle a full 8 pt tall.

    The expected top and bottom edges differ by exactly the font size.
    """
    label = "儿童青少年应避免过量摄入盐油糖。"
    matrix = [4.3157, 0.0, 0.0, 8.0, 680.7383741, 516.1778]

    rect = _text_rect_from_matrix(label, matrix, 942.06, None)
    size, left, top, right, bottom = rect

    assert size == 8.0
    assert left == 680.74
    assert top == 417.88
    assert bottom == 425.88
    # Only the direction of the right edge is pinned, not its exact value.
    assert right > left
def test_text_rect_from_matrix_applies_page_horizontal_offset() -> None:
    """Check that the optional fifth argument shifts the rectangle to the left.

    The expected left edge is the matrix's x entry (190.6111) minus the
    24.21 offset, rounded to two decimals.
    """
    label = "材质:"
    matrix = [7.0652, 0.0, 0.0, 12.36, 190.6111, 873.561]
    horizontal_offset = 24.21

    rect = _text_rect_from_matrix(label, matrix, 942.06, None, horizontal_offset)
    size, left, top, right, bottom = rect

    assert size == 12.36
    assert left == 166.4
    assert top == 56.14
    assert bottom == 68.5
    # Only the direction of the right edge is pinned, not its exact value.
    assert right > left
def test_page_horizontal_offset_uses_artbox_left_inset() -> None:
    """The offset reported for a page stub equals its artbox ``left`` value."""
    art_box = SimpleNamespace(left=24.2137, width=1314.7563)
    crop_box = SimpleNamespace(width=1363.4)
    # A bare namespace stands in for the page object; only these attributes are set.
    fake_page = SimpleNamespace(artbox=art_box, cropbox=crop_box)

    offset = _page_horizontal_offset(fake_page)

    assert offset == 24.2137
def test_text_matrix_width_is_tighter_than_fallback_for_food_name() -> None:
    """The matrix-based width must be strictly closer to the reference than the fallback."""
    sample = "食品名称: 天问礼品粽 (粽子/草木灰咸鸭蛋)"
    expected_width = 374.51

    from_font_size = round(_estimate_text_width(sample, 21.0), 2)
    # The matrix estimator may return a falsy value; treat that as zero width.
    raw_matrix_width = _estimate_text_width_from_text_matrix(sample, 19.3618)
    from_matrix = round(raw_matrix_width or 0.0, 2)

    assert from_matrix > 0
    matrix_error = abs(from_matrix - expected_width)
    fallback_error = abs(from_font_size - expected_width)
    assert matrix_error < fallback_error
def test_text_matrix_width_is_tighter_than_fallback_for_small_heading() -> None:
    """The matrix-based width must be at least as close to the reference as the fallback."""
    sample = "营养成分表"
    expected_width = 21.75

    from_font_size = round(_estimate_text_width(sample, 8.0), 2)
    # The matrix estimator may return a falsy value; treat that as zero width.
    raw_matrix_width = _estimate_text_width_from_text_matrix(sample, 4.3157)
    from_matrix = round(raw_matrix_width or 0.0, 2)

    assert from_matrix > 0
    matrix_error = abs(from_matrix - expected_width)
    fallback_error = abs(from_font_size - expected_width)
    # Ties are accepted here (<=), unlike the strict food-name comparison.
    assert matrix_error <= fallback_error

View File

@@ -0,0 +1,47 @@
from pathlib import Path
import cv2
import numpy as np
from backend.app.ai_render_crop import detect_main_content_box, process_ai_render_crop
# Locations of the sample asset and of the directory handed to the render/crop code.
# NOTE(review): WORKDIR is an absolute, machine-specific path — confirm it
# exists before running these tests on another machine.
WORKDIR = Path("/Users/icemilk/Workspace/zld_POC")
AI_FILE = WORKDIR.joinpath("【2026-04-09】端午 - 背标 - 天问.ai")
OUTPUT_DIR = WORKDIR.joinpath(".tmp_test_render_crop")
def test_detect_main_content_box_finds_centered_content() -> None:
    """The detected box must extend beyond the drawn frame on all four sides."""
    frame_top_left = (120, 90)
    frame_bottom_right = (520, 310)

    # White 600x400 canvas with a dark frame and a caption inside it.
    canvas = np.full((400, 600, 3), 255, dtype=np.uint8)
    cv2.rectangle(canvas, frame_top_left, frame_bottom_right, (10, 10, 10), 3)
    cv2.putText(
        canvas,
        "MAIN CONTENT",
        (150, 210),
        cv2.FONT_HERSHEY_SIMPLEX,
        1.2,
        (20, 20, 20),
        3,
        cv2.LINE_AA,
    )

    left, top, right, bottom = detect_main_content_box(canvas)

    assert left < 120
    assert top < 90
    assert right > 520
    assert bottom > 310
def test_process_ai_render_crop_outputs_full_and_cropped_images() -> None:
    """Render the sample file and verify both PNGs exist and the crop is smaller.

    Checks the returned URLs, the crop-box ordering, the files written under
    ``OUTPUT_DIR`` and finally that the cropped image is strictly smaller
    than the full render in both dimensions.
    """
    outcome = process_ai_render_crop(AI_FILE, OUTPUT_DIR)

    full_url = outcome["fullImage"]["url"]
    assert full_url.endswith(".png")
    cropped_url = outcome["croppedImage"]["url"]
    assert cropped_url.endswith(".png")

    box = outcome["cropBox"]
    assert box["x0"] >= 0
    assert box["y0"] >= 0
    assert box["x1"] > box["x0"]
    assert box["y1"] > box["y0"]

    # Only the final path component of each URL names the file on disk.
    full_file = OUTPUT_DIR / Path(full_url).name
    cropped_file = OUTPUT_DIR / Path(cropped_url).name
    assert full_file.exists()
    assert cropped_file.exists()

    full_pixels = cv2.imread(str(full_file))
    cropped_pixels = cv2.imread(str(cropped_file))
    assert full_pixels is not None
    assert cropped_pixels is not None

    # shape is (rows, columns, ...): compare width first, then height.
    assert cropped_pixels.shape[1] < full_pixels.shape[1]
    assert cropped_pixels.shape[0] < full_pixels.shape[0]

View File

@@ -0,0 +1,31 @@
from pathlib import Path
from fastapi.testclient import TestClient
from backend.app.main import app
# Location of the sample .ai file uploaded by the endpoint test.
# NOTE(review): WORKDIR is an absolute, machine-specific path — confirm it
# exists before running these tests on another machine.
WORKDIR = Path("/Users/icemilk/Workspace/zld_POC")
AI_FILE = WORKDIR.joinpath("【2026-04-09】端午 - 背标 - 天问.ai")
# Module-level test client wrapping the application imported from backend.app.main.
client = TestClient(app)
def test_ai_render_crop_endpoint_returns_two_images_and_crop_box() -> None:
    """POST the sample file to ``/api/ai-render-crop`` and check the JSON shape."""
    with AI_FILE.open("rb") as ai_fp:
        upload = {"ai_file": (AI_FILE.name, ai_fp, "application/postscript")}
        response = client.post("/api/ai-render-crop", files=upload)

    assert response.status_code == 200
    payload = response.json()

    image_keys = ("fullImage", "croppedImage")
    for key in image_keys:
        assert payload[key]["type"] == "image"
    for key in image_keys:
        assert payload[key]["url"].endswith(".png")

    # The crop box must have positive width and height.
    crop_box = payload["cropBox"]
    assert crop_box["x1"] > crop_box["x0"]
    assert crop_box["y1"] > crop_box["y0"]

169
tests/backend/test_api.py Normal file
View File

@@ -0,0 +1,169 @@
from pathlib import Path
import pytest
from fastapi.testclient import TestClient
from backend.app import pipeline
from backend.app import main
from backend.app.main import app
# Locations of the sample .ai and .docx files uploaded by the API tests.
# NOTE(review): WORKDIR is an absolute, machine-specific path — confirm it
# exists before running these tests on another machine.
WORKDIR = Path("/Users/icemilk/Workspace/zld_POC")
AI_FILE = WORKDIR.joinpath("【2026-04-09】端午 - 背标 - 天问.ai")
DOCX_FILE = WORKDIR.joinpath("天问礼品粽【260331】.docx")
# Module-level test client wrapping the application imported from backend.app.main.
client = TestClient(app)
def fake_mineru_payload() -> dict:
    """Return a minimal one-page payload with a single text block.

    A fresh structure is built on every call, so callers may mutate the
    result without affecting later calls.
    """
    span = {"content": "食品名称:天问礼品粽"}
    block = {
        "bbox": [704, 134, 2106, 229],
        "lines": [{"spans": [span]}],
    }
    page = {
        "page_idx": 0,
        "page_size": [2772, 1961],
        "para_blocks": [block],
    }
    return {"pdf_info": [page]}
def test_process_endpoint_returns_preview_and_fields(monkeypatch: pytest.MonkeyPatch) -> None:
    """Upload both sample files to ``/api/process`` with the MinerU call stubbed out."""

    def stub_parse(_preview_path, _output_dir):
        return fake_mineru_payload()

    monkeypatch.setattr(pipeline, "_parse_preview_with_mineru", stub_parse)

    docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    with AI_FILE.open("rb") as ai_fp, DOCX_FILE.open("rb") as docx_fp:
        uploads = {
            "ai_file": (AI_FILE.name, ai_fp, "application/postscript"),
            "word_file": (DOCX_FILE.name, docx_fp, docx_mime),
        }
        response = client.post("/api/process", files=uploads)

    assert response.status_code == 200
    payload = response.json()
    assert payload["preview"]["type"] == "pdf"
    assert payload["fields"]
    # Page width and first field text come straight from the stubbed payload.
    assert payload["preview"]["pageWidthPt"] == 2772
    assert payload["fields"][0]["text"] == "食品名称:天问礼品粽"
def test_process_endpoint_uses_default_sample_files_when_uploads_are_missing(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A bare POST to ``/api/process`` (no uploads) must still return fields."""

    def stub_parse(_preview_path, _output_dir):
        return fake_mineru_payload()

    monkeypatch.setattr(pipeline, "_parse_preview_with_mineru", stub_parse)

    response = client.post("/api/process")

    assert response.status_code == 200
    payload = response.json()
    assert payload["preview"]["type"] == "pdf"
    fields = payload["fields"]
    assert fields
    # At least one returned field must carry non-empty text.
    assert any(entry["text"] for entry in fields)
def test_process_endpoint_surfaces_missing_mineru_key(monkeypatch: pytest.MonkeyPatch) -> None:
    """A ``RuntimeError`` from the MinerU step must surface as HTTP 500 with its message."""

    def fake_parse_with_mineru(_preview_path, _output_dir):
        raise RuntimeError("MINERU_API_KEY is required")

    monkeypatch.setattr(pipeline, "_parse_preview_with_mineru", fake_parse_with_mineru)

    response = client.post("/api/process")

    assert response.status_code == 500
    error_body = response.json()
    assert error_body["detail"] == "MINERU_API_KEY is required"
def test_mineru_extract_endpoint_returns_job_preview_and_blocks(monkeypatch: pytest.MonkeyPatch) -> None:
    """POST the sample file to ``/api/mineru-extract`` with the extraction stubbed out."""

    def fake_extract(_ai_path, _output_dir, job_id=None):
        # Echo the job id into every URL so the endpoint's id is visible in the reply.
        preview = {
            "type": "pdf",
            "url": f"/api/files/{job_id}/preview.pdf",
            "pageWidthPt": 2772,
            "pageHeightPt": 1961,
        }
        artifacts = {
            "json": {"path": "/tmp/structured.json", "url": f"/api/files/{job_id}/mineru/structured.json"},
            "markdown": {"path": "/tmp/full.md", "url": f"/api/files/{job_id}/mineru/full.md"},
        }
        block = {
            "id": "block-1",
            "text": "食品名称:天问礼品粽",
            "page": 1,
            "x0_pt": 1,
            "top_pt": 2,
            "x1_pt": 3,
            "bottom_pt": 4,
        }
        return {
            "jobId": job_id,
            "preview": preview,
            "artifacts": artifacts,
            "blocks": [block],
        }

    monkeypatch.setattr(pipeline, "extract_mineru_result", fake_extract)

    with AI_FILE.open("rb") as ai_fp:
        upload = {"ai_file": (AI_FILE.name, ai_fp, "application/postscript")}
        response = client.post("/api/mineru-extract", files=upload)

    assert response.status_code == 200
    payload = response.json()
    assert payload["jobId"]
    assert payload["preview"]["type"] == "pdf"
    artifacts = payload["artifacts"]
    assert artifacts["json"]["url"].endswith("/mineru/structured.json")
    assert artifacts["markdown"]["url"].endswith("/mineru/full.md")
    assert payload["blocks"][0]["id"] == "block-1"
def test_compare_word_endpoint_returns_compared_fields(
    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
) -> None:
    """POST a Word file for an existing job and check the compared fields come back."""
    # Point the app's output root at a temp dir that already holds the job folder.
    monkeypatch.setattr(main, "OUTPUTS_DIR", tmp_path)
    job_dir = tmp_path / "test-job"
    job_dir.mkdir(parents=True, exist_ok=True)

    def fake_compare(_word_path, _output_dir, job_id=None):
        preview = {
            "type": "pdf",
            "url": f"/api/files/{job_id}/preview.pdf",
            "pageWidthPt": 2772,
            "pageHeightPt": 1961,
        }
        matched_field = {
            "id": "field-1",
            "text": "食品名称:天问礼品粽",
            "page": 1,
            "x0_pt": 1,
            "top_pt": 2,
            "x1_pt": 3,
            "bottom_pt": 4,
            "normalized_text": "食品名称:天问礼品粽",
            "validation_status": "matched",
            "validation_reason": "normalized text found in Word content",
            "matched_excerpt": "食品名称:天问礼品粽",
        }
        return {"jobId": job_id, "preview": preview, "fields": [matched_field]}

    monkeypatch.setattr(pipeline, "compare_word_with_mineru", fake_compare)

    docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    with DOCX_FILE.open("rb") as docx_fp:
        response = client.post(
            "/api/compare-word",
            data={"job_id": "test-job"},
            files={"word_file": (DOCX_FILE.name, docx_fp, docx_mime)},
        )

    assert response.status_code == 200
    payload = response.json()
    assert payload["jobId"] == "test-job"
    assert payload["fields"][0]["validation_status"] == "matched"

View File

@@ -0,0 +1,30 @@
from pathlib import Path
from backend.app.barcode_cv import decode_barcode_image
# Directory holding the barcode sample images read by the tests below.
# NOTE(review): absolute, machine-specific path — confirm it exists before
# running these tests on another machine.
WORKDIR = Path("/Users/icemilk/Workspace/zld_POC")
def test_decode_barcode_image_reads_ean13_from_original_sample() -> None:
    """Decoding ``1.jpg`` must yield the expected EAN-13 text with a valid checksum."""
    sample = WORKDIR.joinpath("1.jpg")

    decoded = decode_barcode_image(sample)

    expected_text, expected_format = "6954930015983", "EAN_13"
    assert decoded["text"] == expected_text
    assert decoded["format"] == expected_format
    assert decoded["valid_checksum"] is True
def test_decode_barcode_image_reads_ean13_from_ma1_sample() -> None:
    """Decoding ``ma1.png`` must yield the expected EAN-13 text with a valid checksum."""
    sample = WORKDIR.joinpath("ma1.png")

    decoded = decode_barcode_image(sample)

    expected_text, expected_format = "6954930015983", "EAN_13"
    assert decoded["text"] == expected_text
    assert decoded["format"] == expected_format
    assert decoded["valid_checksum"] is True
def test_decode_barcode_image_reads_ean13_from_ma2_sample() -> None:
    """Decoding ``ma2.png`` must yield the expected EAN-13 text with a valid checksum."""
    sample = WORKDIR.joinpath("ma2.png")

    decoded = decode_barcode_image(sample)

    expected_text, expected_format = "6954930016737", "EAN_13"
    assert decoded["text"] == expected_text
    assert decoded["format"] == expected_format
    assert decoded["valid_checksum"] is True

View File

@@ -0,0 +1,33 @@
import cv2
import numpy as np
from backend.app.layout_cv import Box, detect_text_lines, merge_text_and_rectangles
def test_merge_text_and_rectangles_keeps_outer_table_box_and_drops_nested_cells() -> None:
    """Merging keeps the enclosing rectangle plus the three lines, dropping the two cells."""
    # Three 16-unit-tall text rows stacked 20 units apart.
    rows = ((20, "配料"), (40, "糯米"), (60, "红豆"))
    text_lines = [Box(20, top, 120, top + 16, "line", word) for top, word in rows]

    outer_box = Box(10, 10, 150, 90, "rectangle")
    left_cell = Box(12, 12, 78, 44, "rectangle")
    right_cell = Box(82, 12, 148, 44, "rectangle")
    rectangles = [outer_box, left_cell, right_cell]

    merged = merge_text_and_rectangles(text_lines, rectangles)

    kinds = [item.kind for item in merged]
    assert kinds == ["rectangle", "line", "line", "line"]
    assert merged[0].as_tuple() == (10, 10, 150, 90)
def test_detect_text_lines_finds_two_text_rows_without_ocr() -> None:
    """Two captions drawn on a white canvas must be detected as two separate rows."""
    canvas = np.full((220, 420, 3), 255, dtype=np.uint8)
    captions = ((70, "LINE ONE"), (140, "LINE TWO"))
    for baseline_y, caption in captions:
        cv2.putText(
            canvas,
            caption,
            (20, baseline_y),
            cv2.FONT_HERSHEY_SIMPLEX,
            1.0,
            (0, 0, 0),
            2,
            cv2.LINE_AA,
        )

    detected = detect_text_lines(canvas)

    assert len(detected) == 2
    upper, lower = detected[0], detected[1]
    # The first row must end above the point where the second row starts.
    assert upper.y1 < lower.y0

View File

@@ -0,0 +1,192 @@
from __future__ import annotations
import io
import json
import zipfile
from pathlib import Path
import pytest
import requests
from backend.app import mineru_client
from backend.app.mineru_client import MineruClient, MineruClientError
class FakeResponse:
    """Canned response object usable as a context manager.

    Exposes a ``status`` attribute and a ``read()`` method returning the
    body bytes given at construction time.
    """

    def __init__(self, status: int, body: bytes):
        self.status = status
        self._body = body

    def __enter__(self) -> "FakeResponse":
        # Entering yields the response itself, so ``with ... as resp`` works.
        return self

    def __exit__(self, *_exc_info: object) -> None:
        # Returns None, so exceptions raised inside the ``with`` body propagate.
        return None

    def read(self) -> bytes:
        """Return the stored body unchanged; repeated calls give the same bytes."""
        return self._body
class FakeRequestsResponse:
    """Minimal response stub carrying only ``status_code`` and ``text``."""

    def __init__(self, status_code: int, text: str = ""):
        # Plain attribute storage; ``text`` defaults to the empty string.
        self.status_code, self.text = status_code, text
def _zip_bytes(members: dict) -> bytes:
    """Return an in-memory ZIP archive built from ``members``.

    Args:
        members: Maps archive member names to JSON-serialisable documents.
            Members are written in the mapping's iteration order.

    Returns:
        The raw bytes of the finished archive.
    """
    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, "w") as archive:
        for member_name, document in members.items():
            archive.writestr(member_name, json.dumps(document))
    # Read the buffer only after the archive is closed, so the ZIP is complete.
    return buffer.getvalue()


def _zip_with_json() -> bytes:
    """Return a ZIP holding a single ``demo_middle.json`` with a 1x1 page."""
    return _zip_bytes(
        {
            "demo_middle.json": {
                "pdf_info": [{"page_idx": 0, "page_size": [1, 1], "para_blocks": []}]
            },
        }
    )


def _zip_with_layout_and_model() -> bytes:
    """Return a ZIP holding ``demo_model.json`` followed by ``layout.json``.

    The model file is written first and carries no ``pdf_info``; the layout
    file carries a 2x2 page.
    """
    return _zip_bytes(
        {
            "demo_model.json": [[{"type": "header"}]],
            "layout.json": {
                "pdf_info": [{"page_idx": 0, "page_size": [2, 2], "para_blocks": []}]
            },
        }
    )
def test_submit_pdf_downloads_and_loads_structured_json(
    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
) -> None:
    """Happy path: ``parse_pdf`` calls the API in order and loads the JSON from the ZIP.

    ``urlopen`` and ``requests.put`` are replaced with fakes; the test then
    checks the parsed payload, the exact sequence of URLs opened and that
    the downloaded archive was saved as ``mineru_result.zip``.
    """
    requested: list[str] = []

    def json_reply(document: dict) -> FakeResponse:
        return FakeResponse(200, json.dumps(document).encode())

    def fake_urlopen(request_obj, timeout=0):
        # Accept either a request object exposing ``full_url`` or a plain URL.
        target = str(getattr(request_obj, "full_url", request_obj))
        requested.append(target)
        if target.endswith("/api/v4/file-urls/batch"):
            return json_reply(
                {"code": 0, "data": {"batch_id": "batch-1", "file_urls": ["https://upload.example/file"]}}
            )
        if target == "https://upload.example/file":
            raise AssertionError("upload URL should be handled by requests.put")
        if target.endswith("/api/v4/extract/task"):
            return json_reply({"code": 0, "data": {"task_id": "task-1"}})
        if target.endswith("/api/v4/extract/task/task-1"):
            return json_reply(
                {"code": 0, "data": {"state": "done", "full_zip_url": "https://download.example/result.zip"}}
            )
        if target == "https://download.example/result.zip":
            return FakeResponse(200, _zip_with_json())
        raise AssertionError(f"unexpected URL {target}")

    def fake_put(url, data, timeout=0):
        return FakeRequestsResponse(200)

    monkeypatch.setattr(mineru_client.request, "urlopen", fake_urlopen)
    monkeypatch.setattr(mineru_client.requests, "put", fake_put)

    pdf_path = tmp_path / "preview.pdf"
    pdf_path.write_bytes(b"%PDF-1.7")

    client_under_test = MineruClient(api_key="secret", poll_interval_seconds=0, max_polls=1)
    payload = client_under_test.parse_pdf(pdf_path, tmp_path)

    assert payload["pdf_info"][0]["page_size"] == [1, 1]
    expected_sequence = [
        "https://mineru.net/api/v4/file-urls/batch",
        "https://mineru.net/api/v4/extract/task",
        "https://mineru.net/api/v4/extract/task/task-1",
        "https://download.example/result.zip",
    ]
    assert requested == expected_sequence
    assert (tmp_path / "mineru_result.zip").exists()
def test_submit_pdf_raises_on_failed_task(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """A task reported as ``failed`` must raise ``MineruClientError`` carrying ``err_msg``."""

    def json_reply(document: dict) -> FakeResponse:
        return FakeResponse(200, json.dumps(document).encode())

    def fake_urlopen(request_obj, timeout=0):
        # Accept either a request object exposing ``full_url`` or a plain URL.
        target = str(getattr(request_obj, "full_url", request_obj))
        if target.endswith("/api/v4/file-urls/batch"):
            return json_reply(
                {"code": 0, "data": {"batch_id": "batch-1", "file_urls": ["https://upload.example/file"]}}
            )
        if target == "https://upload.example/file":
            raise AssertionError("upload URL should be handled by requests.put")
        if target.endswith("/api/v4/extract/task"):
            return json_reply({"code": 0, "data": {"task_id": "task-1"}})
        if target.endswith("/api/v4/extract/task/task-1"):
            return json_reply({"code": 0, "data": {"state": "failed", "err_msg": "bad pdf"}})
        raise AssertionError(f"unexpected URL {target}")

    def fake_put(url, data, timeout=0):
        return FakeRequestsResponse(200)

    monkeypatch.setattr(mineru_client.request, "urlopen", fake_urlopen)
    monkeypatch.setattr(mineru_client.requests, "put", fake_put)

    pdf_path = tmp_path / "preview.pdf"
    pdf_path.write_bytes(b"%PDF-1.7")

    client_under_test = MineruClient(api_key="secret", poll_interval_seconds=0, max_polls=1)
    with pytest.raises(MineruClientError, match="bad pdf"):
        client_under_test.parse_pdf(pdf_path, tmp_path)
def test_submit_pdf_raises_on_upload_http_error(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """A 403 from the upload ``PUT`` must raise ``MineruClientError`` mentioning ``HTTP 403``."""

    def fake_urlopen(request_obj, timeout=0):
        # Only the upload-URL request is expected before the failing PUT.
        target = str(getattr(request_obj, "full_url", request_obj))
        if target.endswith("/api/v4/file-urls/batch"):
            batch = {"code": 0, "data": {"batch_id": "batch-1", "file_urls": ["https://upload.example/file"]}}
            return FakeResponse(200, json.dumps(batch).encode())
        raise AssertionError(f"unexpected URL {target}")

    def rejecting_put(url, data, timeout=0):
        return FakeRequestsResponse(403, "SignatureDoesNotMatch")

    monkeypatch.setattr(mineru_client.request, "urlopen", fake_urlopen)
    monkeypatch.setattr(mineru_client.requests, "put", rejecting_put)

    pdf_path = tmp_path / "preview.pdf"
    pdf_path.write_bytes(b"%PDF-1.7")

    client_under_test = MineruClient(api_key="secret", poll_interval_seconds=0, max_polls=1)
    with pytest.raises(MineruClientError, match="HTTP 403"):
        client_under_test.parse_pdf(pdf_path, tmp_path)
def test_submit_pdf_prefers_layout_json_over_model_json(
    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
) -> None:
    """When the ZIP holds both files, the payload must come from ``layout.json``.

    The archive lists ``demo_model.json`` first; the 2x2 page size asserted
    at the end exists only in ``layout.json``.
    """

    def json_reply(document: dict) -> FakeResponse:
        return FakeResponse(200, json.dumps(document).encode())

    def fake_urlopen(request_obj, timeout=0):
        # Accept either a request object exposing ``full_url`` or a plain URL.
        target = str(getattr(request_obj, "full_url", request_obj))
        if target.endswith("/api/v4/file-urls/batch"):
            return json_reply(
                {"code": 0, "data": {"batch_id": "batch-1", "file_urls": ["https://upload.example/file"]}}
            )
        if target.endswith("/api/v4/extract/task"):
            return json_reply({"code": 0, "data": {"task_id": "task-1"}})
        if target.endswith("/api/v4/extract/task/task-1"):
            return json_reply(
                {"code": 0, "data": {"state": "done", "full_zip_url": "https://download.example/result.zip"}}
            )
        if target == "https://download.example/result.zip":
            return FakeResponse(200, _zip_with_layout_and_model())
        raise AssertionError(f"unexpected URL {target}")

    def fake_put(url, data, timeout=0):
        return FakeRequestsResponse(200)

    monkeypatch.setattr(mineru_client.request, "urlopen", fake_urlopen)
    monkeypatch.setattr(mineru_client.requests, "put", fake_put)

    pdf_path = tmp_path / "preview.pdf"
    pdf_path.write_bytes(b"%PDF-1.7")

    client_under_test = MineruClient(api_key="secret", poll_interval_seconds=0, max_polls=1)
    payload = client_under_test.parse_pdf(pdf_path, tmp_path)

    assert payload["pdf_info"][0]["page_size"] == [2, 2]

View File

@@ -0,0 +1,99 @@
from __future__ import annotations
from backend.app.mineru_parser import parse_mineru_fields
def test_parse_mineru_fields_extracts_text_and_bbox() -> None:
    """A single title block becomes one field with float coordinates and empty font data."""
    span = {
        "type": "text",
        "content": "食品名称:天问礼品粽",
        "bbox": [704, 134, 2106, 229],
    }
    title_block = {
        "bbox": [704, 134, 2106, 229],
        "type": "title",
        "lines": [{"spans": [span]}],
    }
    page = {
        "page_idx": 0,
        "page_size": [2772, 1961],
        "para_blocks": [title_block],
    }
    payload = {"pdf_info": [page]}

    parsed = parse_mineru_fields(payload)

    assert parsed.page_width == 2772
    assert parsed.page_height == 1961
    # The zero-based page_idx is expected to surface as page 1.
    expected_field = {
        "page": 1,
        "text": "食品名称:天问礼品粽",
        "font_name": "",
        "font_size_pt": None,
        "font_height_mm": None,
        "x0_pt": 704.0,
        "top_pt": 134.0,
        "x1_pt": 2106.0,
        "bottom_pt": 229.0,
    }
    assert parsed.fields == [expected_field]
def test_parse_mineru_fields_turns_table_html_into_text() -> None:
    """A table span's HTML is flattened to its cell texts joined by single spaces."""
    table_span = {
        "type": "table",
        "html": "<table><tr><td>品种</td><td>规格</td></tr><tr><td>黑猪肉粽</td><td>130克×1</td></tr></table>",
    }
    table_block = {
        "bbox": [10, 20, 300, 200],
        "type": "table",
        "lines": [{"spans": [table_span]}],
    }
    page = {
        "page_idx": 0,
        "page_size": [1000, 800],
        "para_blocks": [table_block],
    }
    payload = {"pdf_info": [page]}

    parsed = parse_mineru_fields(payload)

    first_field = parsed.fields[0]
    assert first_field["text"] == "品种 规格 黑猪肉粽 130克×1"
def test_parse_mineru_fields_skips_empty_decorative_blocks() -> None:
    """An image block without content and a whitespace-only text block yield no fields."""
    image_block = {"bbox": [1, 2, 3, 4], "type": "image", "lines": [{"spans": [{"type": "image"}]}]}
    blank_text_block = {"bbox": [5, 6, 7, 8], "type": "text", "lines": [{"spans": [{"content": " "}]}]}
    page = {
        "page_idx": 0,
        "page_size": [1000, 800],
        "para_blocks": [image_block, blank_text_block],
    }
    payload = {"pdf_info": [page]}

    parsed = parse_mineru_fields(payload)

    assert parsed.fields == []

View File

@@ -0,0 +1,74 @@
from pathlib import Path
import pytest
from backend.app import pipeline
from backend.app.pipeline import process_files
# Locations of the sample inputs and of the directory the pipeline test writes into.
# NOTE(review): WORKDIR is an absolute, machine-specific path — confirm it
# exists before running these tests on another machine.
WORKDIR = Path("/Users/icemilk/Workspace/zld_POC")
AI_FILE = WORKDIR.joinpath("【2026-04-09】端午 - 背标 - 天问.ai")
DOCX_FILE = WORKDIR.joinpath("天问礼品粽【260331】.docx")
OUTPUT_DIR = WORKDIR.joinpath(".tmp_test_output")
def test_process_files_builds_preview_and_mineru_field_results(monkeypatch: pytest.MonkeyPatch) -> None:
    """Run ``process_files`` on the sample inputs with the MinerU call stubbed out.

    The stub returns two blocks: one whose text the test expects to be
    matched against the Word file, and one ("Word中不存在的内容") that it
    expects to be unmatched. The test checks the preview metadata, the
    first field's text, status and left coordinate, that at least one field
    is unmatched, and that ``preview.pdf`` was written to ``OUTPUT_DIR``.
    """

    def fake_parse_with_mineru(_preview_path: Path, _output_dir: Path):
        return {
            "pdf_info": [
                {
                    "page_idx": 0,
                    "page_size": [2772, 1961],
                    "para_blocks": [
                        {
                            "bbox": [704, 134, 2106, 229],
                            "lines": [{"spans": [{"content": "食品名称:天问礼品粽"}]}],
                        },
                        {
                            "bbox": [10, 20, 40, 60],
                            "lines": [{"spans": [{"content": "Word中不存在的内容"}]}],
                        },
                    ],
                }
            ]
        }

    monkeypatch.setattr(pipeline, "_parse_preview_with_mineru", fake_parse_with_mineru)

    # OUTPUT_DIR persists between runs. Remove any preview left by an earlier
    # run so the existence check at the end proves that THIS run wrote it.
    preview_file = OUTPUT_DIR / "preview.pdf"
    preview_file.unlink(missing_ok=True)

    result = process_files(AI_FILE, DOCX_FILE, OUTPUT_DIR, job_id="test-job")

    assert result["preview"]["type"] == "pdf"
    assert result["preview"]["url"] == "/api/files/test-job/preview.pdf"
    assert result["preview"]["pageWidthPt"] == 2772
    assert result["preview"]["pageHeightPt"] == 1961
    assert result["fields"][0]["text"] == "食品名称:天问礼品粽"
    assert result["fields"][0]["validation_status"] == "matched"
    assert result["fields"][0]["x0_pt"] == 704.0
    assert any(field["validation_status"] == "unmatched" for field in result["fields"])
    assert preview_file.exists()
def test_parse_preview_with_mineru_reads_key_from_env_file(
    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
) -> None:
    """With no ``MINERU_API_KEY`` in the environment, the key comes from the ``.env`` file."""
    seen: dict[str, str] = {}

    class FakeMineruClient:
        """Records the API key it was built with and echoes its ``parse_pdf`` arguments."""

        def __init__(self, api_key: str) -> None:
            seen["api_key"] = api_key

        def parse_pdf(self, preview_path: Path, output_dir: Path) -> dict:
            return {"preview_path": str(preview_path), "output_dir": str(output_dir)}

    env_file = tmp_path / ".env"
    env_file.write_text("MINERU_API_KEY=from-env-file\n", encoding="utf-8")

    # Clear the environment variable and make the temp .env the only candidate.
    monkeypatch.delenv("MINERU_API_KEY", raising=False)
    monkeypatch.setattr(pipeline, "ENV_FILE_CANDIDATES", (env_file,))
    monkeypatch.setattr(pipeline, "MineruClient", FakeMineruClient)

    preview_path = tmp_path / "preview.pdf"
    preview_path.write_bytes(b"%PDF-1.7")

    outcome = pipeline._parse_preview_with_mineru(preview_path, tmp_path)

    assert seen["api_key"] == "from-env-file"
    assert outcome["preview_path"] == str(preview_path)

View File

@@ -0,0 +1,32 @@
from backend.app.text_validation import classify_text_block, normalize_text, validate_field_against_word
def test_normalize_text_collapses_whitespace_and_full_width_punctuation() -> None:
    """Surrounding spaces and the embedded newline must be removed by ``normalize_text``."""
    raw = " 食品生产许可证编号:\nSC11133042404806 "
    expected = "食品生产许可证编号:SC11133042404806"

    normalized = normalize_text(raw)

    assert normalized == expected
def test_classify_text_block_marks_garbled_text() -> None:
    """Garbled and blank inputs are ``empty_or_garbled``; readable text is ``candidate``."""
    cases = (
        ("<EFBFBD><EFBFBD><EFBFBD><EFBFBD>-<2D><>-<2D><>", "empty_or_garbled"),
        (" ", "empty_or_garbled"),
        ("食品名称:天问礼品粽", "candidate"),
    )
    for sample, expected_label in cases:
        assert classify_text_block(sample) == expected_label
def test_validate_field_against_word_returns_excerpt_for_match() -> None:
    """A field present in the Word text is ``matched`` and the excerpt contains its code."""
    field_text = "食品生产许可证编号SC11133042404806"
    word_text = "电话0573-86981666 食品生产许可证编号SC11133042404806 产品标准代号GB/T 46259"

    outcome = validate_field_against_word(field_text, word_text)

    assert outcome.status == "matched"
    assert outcome.reason == "normalized text found in Word content"
    # Guard against a None excerpt so the membership check cannot raise.
    excerpt = outcome.matched_excerpt or ""
    assert "SC11133042404806" in excerpt
def test_validate_field_against_word_rejects_missing_text() -> None:
    """A field absent from the Word text is ``unmatched`` and has no excerpt."""
    field_text = "食品生产许可证编号SC11133042404806"
    word_text = "产品标准代号GB/T 46259"

    outcome = validate_field_against_word(field_text, word_text)

    assert outcome.status == "unmatched"
    assert outcome.matched_excerpt is None