Initial commit: 包装审核 POC、Docker 与前后端

Made-with: Cursor
2026-04-15 17:18:49 +08:00
commit bbb4dd43b3
74 changed files with 297415 additions and 0 deletions
--- a/tests/backend/test_ai_parser.py
+++ b/tests/backend/test_ai_parser.py
@@ -0,0 +1,85 @@
+from types import SimpleNamespace
+
+from backend.app.ai_parser import (
+    _estimate_text_width,
+    _estimate_text_width_from_text_matrix,
+    _page_horizontal_offset,
+    _text_rect_from_matrix,
+)
+
+
+def test_text_rect_from_matrix_uses_rendered_height_and_baseline() -> None:
+    size_pt, left_pt, upper_pt, right_pt, lower_pt = _text_rect_from_matrix(
+        "食品名称： 天问礼品粽 （粽子/草木灰咸鸭蛋）",
+        [19.3618, 0.0, 0.0, 21.0, 435.9155, 629.3184],
+        942.06,  # presumably the page height in points — confirm against ai_parser
+        None,
+    )
+
+    assert size_pt == 21.0  # same value as the matrix's fourth entry
+    assert left_pt == 435.92  # 435.9155 rounded to two decimals
+    assert upper_pt == 291.74  # 21.0 above the lower edge
+    assert lower_pt == 312.74  # 942.06 - 629.3184, rounded
+    assert right_pt > left_pt
+
+
+def test_text_rect_from_matrix_handles_small_text_without_collapsing_height() -> None:
+    size_pt, left_pt, upper_pt, right_pt, lower_pt = _text_rect_from_matrix(
+        "儿童青少年应避免过量摄入盐油糖。",
+        [4.3157, 0.0, 0.0, 8.0, 680.7383741, 516.1778],
+        942.06,
+        None,
+    )
+
+    assert size_pt == 8.0
+    assert left_pt == 680.74
+    assert upper_pt == 417.88  # exactly 8.0 above the lower edge, i.e. the height did not collapse
+    assert lower_pt == 425.88  # 942.06 - 516.1778, rounded
+    assert right_pt > left_pt
+
+
+def test_text_rect_from_matrix_applies_page_horizontal_offset() -> None:
+    size_pt, left_pt, upper_pt, right_pt, lower_pt = _text_rect_from_matrix(
+        "材质：",
+        [7.0652, 0.0, 0.0, 12.36, 190.6111, 873.561],
+        942.06,
+        None,
+        24.21,  # the horizontal offset under test
+    )
+
+    assert size_pt == 12.36
+    assert left_pt == 166.4  # 190.6111 - 24.21, rounded
+    assert upper_pt == 56.14
+    assert lower_pt == 68.5
+    assert right_pt > left_pt
+
+
+def test_page_horizontal_offset_uses_artbox_left_inset() -> None:
+    # Stand-in page exposing only the box attributes set below.
+    art = SimpleNamespace(left=24.2137, width=1314.7563)
+    crop = SimpleNamespace(width=1363.4)
+    fake_page = SimpleNamespace(artbox=art, cropbox=crop)
+
+    assert _page_horizontal_offset(fake_page) == 24.2137
+
+
+def test_text_matrix_width_is_tighter_than_fallback_for_food_name() -> None:
+    sample = "食品名称： 天问礼品粽 （粽子/草木灰咸鸭蛋）"
+    target = 374.51  # reference width both estimates are compared against
+
+    coarse = round(_estimate_text_width(sample, 21.0), 2)
+    tight = round(_estimate_text_width_from_text_matrix(sample, 19.3618) or 0.0, 2)
+
+    assert tight > 0
+    assert abs(coarse - target) > abs(tight - target)  # matrix estimate must be strictly closer
+
+
+def test_text_matrix_width_is_tighter_than_fallback_for_small_heading() -> None:
+    sample = "营养成分表"
+    target = 21.75  # reference width both estimates are compared against
+
+    coarse = round(_estimate_text_width(sample, 8.0), 2)
+    tight = round(_estimate_text_width_from_text_matrix(sample, 4.3157) or 0.0, 2)
+
+    assert tight > 0
+    assert abs(coarse - target) >= abs(tight - target)  # a tie is accepted here
--- a/tests/backend/test_ai_render_crop.py
+++ b/tests/backend/test_ai_render_crop.py
@@ -0,0 +1,47 @@
+from pathlib import Path
+
+import cv2
+import numpy as np
+
+from backend.app.ai_render_crop import detect_main_content_box, process_ai_render_crop
+
+
+WORKDIR = Path("/Users/icemilk/Workspace/zld_POC")  # NOTE(review): absolute path under one user's home directory; not portable
+AI_FILE = WORKDIR / "【2026-04-09】端午 - 背标 - 天问.ai"  # sample .ai file expected inside WORKDIR
+OUTPUT_DIR = WORKDIR / ".tmp_test_render_crop"  # output directory located inside WORKDIR
+
+
+def test_detect_main_content_box_finds_centered_content() -> None:
+    canvas = np.full((400, 600, 3), 255, dtype=np.uint8)  # all-white canvas
+    cv2.rectangle(canvas, (120, 90), (520, 310), (10, 10, 10), 3)
+    cv2.putText(canvas, "MAIN CONTENT", (150, 210), cv2.FONT_HERSHEY_SIMPLEX, 1.2, (20, 20, 20), 3, cv2.LINE_AA)
+
+    left, top, right, bottom = detect_main_content_box(canvas)
+
+    assert left < 120  # the box must extend past every side of the drawn rectangle
+    assert top < 90
+    assert right > 520
+    assert bottom > 310
+
+
+def test_process_ai_render_crop_outputs_full_and_cropped_images(tmp_path: Path) -> None:
+    result = process_ai_render_crop(AI_FILE, tmp_path)  # fresh per-test dir: files left by earlier runs cannot satisfy the checks below
+
+    assert result["fullImage"]["url"].endswith(".png")
+    assert result["croppedImage"]["url"].endswith(".png")
+    assert result["cropBox"]["x0"] >= 0
+    assert result["cropBox"]["y0"] >= 0
+    assert result["cropBox"]["x1"] > result["cropBox"]["x0"]  # crop box must have positive width
+    assert result["cropBox"]["y1"] > result["cropBox"]["y0"]  # ...and positive height
+
+    full_path = tmp_path / Path(result["fullImage"]["url"]).name  # the URL's last segment is used as the file name on disk
+    cropped_path = tmp_path / Path(result["croppedImage"]["url"]).name
+    assert full_path.exists()
+    assert cropped_path.exists()
+
+    full_image = cv2.imread(str(full_path))
+    cropped_image = cv2.imread(str(cropped_path))
+    assert full_image is not None  # cv2.imread returns None when the file cannot be read
+    assert cropped_image is not None
+    assert cropped_image.shape[1] < full_image.shape[1]  # cropped image is strictly narrower
+    assert cropped_image.shape[0] < full_image.shape[0]  # ...and strictly shorter
--- a/tests/backend/test_ai_render_crop_api.py
+++ b/tests/backend/test_ai_render_crop_api.py
@@ -0,0 +1,31 @@
+from pathlib import Path
+
+from fastapi.testclient import TestClient
+
+from backend.app.main import app
+
+
+WORKDIR = Path("/Users/icemilk/Workspace/zld_POC")  # NOTE(review): absolute path under one user's home directory; not portable
+AI_FILE = WORKDIR / "【2026-04-09】端午 - 背标 - 天问.ai"  # sample .ai file expected inside WORKDIR
+
+client = TestClient(app)  # module-level client used by the test in this file
+
+
+def test_ai_render_crop_endpoint_returns_two_images_and_crop_box() -> None:
+    with AI_FILE.open("rb") as upload:
+        resp = client.post(
+            "/api/ai-render-crop",
+            files={
+                "ai_file": (AI_FILE.name, upload, "application/postscript"),
+            },
+        )
+
+    assert resp.status_code == 200
+
+    body = resp.json()
+    assert body["fullImage"]["type"] == "image"
+    assert body["croppedImage"]["type"] == "image"
+    assert body["fullImage"]["url"].endswith(".png")
+    assert body["croppedImage"]["url"].endswith(".png")
+    assert body["cropBox"]["x1"] > body["cropBox"]["x0"]  # positive width
+    assert body["cropBox"]["y1"] > body["cropBox"]["y0"]  # positive height
--- a/tests/backend/test_api.py
+++ b/tests/backend/test_api.py
@@ -0,0 +1,169 @@
+from pathlib import Path
+
+import pytest
+from fastapi.testclient import TestClient
+
+from backend.app import pipeline
+from backend.app import main
+from backend.app.main import app
+
+
+WORKDIR = Path("/Users/icemilk/Workspace/zld_POC")  # NOTE(review): absolute path under one user's home directory; not portable
+AI_FILE = WORKDIR / "【2026-04-09】端午 - 背标 - 天问.ai"  # sample .ai file expected inside WORKDIR
+DOCX_FILE = WORKDIR / "天问礼品粽【260331】.docx"  # sample Word file expected inside WORKDIR
+
+
+client = TestClient(app)  # module-level client shared by the tests in this file
+
+
+def fake_mineru_payload() -> dict:
+    # Canned result used in place of a real MinerU call:
+    # one page (2772 x 1961) holding a single text block.
+    span = {"content": "食品名称:天问礼品粽"}
+    block = {
+        "bbox": [704, 134, 2106, 229],
+        "lines": [{"spans": [span]}],
+    }
+    page = {
+        "page_idx": 0,
+        "page_size": [2772, 1961],
+        "para_blocks": [block],
+    }
+
+    return {"pdf_info": [page]}
+
+
+def test_process_endpoint_returns_preview_and_fields(monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.setattr(pipeline, "_parse_preview_with_mineru", lambda _preview_path, _output_dir: fake_mineru_payload())
+
+    with AI_FILE.open("rb") as ai_stream, DOCX_FILE.open("rb") as word_stream:
+        resp = client.post(
+            "/api/process",
+            files={
+                "ai_file": (AI_FILE.name, ai_stream, "application/postscript"),
+                "word_file": (
+                    DOCX_FILE.name,
+                    word_stream,
+                    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+                ),
+            },
+        )
+
+    assert resp.status_code == 200
+
+    body = resp.json()
+    assert body["preview"]["type"] == "pdf"
+    assert body["fields"]
+    assert body["preview"]["pageWidthPt"] == 2772  # first page_size entry of the canned payload
+    assert body["fields"][0]["text"] == "食品名称:天问礼品粽"
+
+
+def test_process_endpoint_uses_default_sample_files_when_uploads_are_missing(monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.setattr(pipeline, "_parse_preview_with_mineru", lambda _preview_path, _output_dir: fake_mineru_payload())
+
+    resp = client.post("/api/process")  # no files attached
+
+    assert resp.status_code == 200
+
+    body = resp.json()
+    assert body["preview"]["type"] == "pdf"
+    assert body["fields"]
+    assert any(item["text"] for item in body["fields"])
+
+
+def test_process_endpoint_surfaces_missing_mineru_key(monkeypatch: pytest.MonkeyPatch) -> None:
+    def raise_missing_key(_preview_path, _output_dir):
+        raise RuntimeError("MINERU_API_KEY is required")
+
+    monkeypatch.setattr(pipeline, "_parse_preview_with_mineru", raise_missing_key)
+
+    resp = client.post("/api/process")
+
+    assert resp.status_code == 500
+    assert resp.json()["detail"] == "MINERU_API_KEY is required"  # error text is passed through verbatim
+
+
+def test_mineru_extract_endpoint_returns_job_preview_and_blocks(monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.setattr(  # swap pipeline.extract_mineru_result for a canned result
+        pipeline,
+        "extract_mineru_result",
+        lambda _ai_path, _output_dir, job_id=None: {  # echoes the received job_id into the result and its URLs
+            "jobId": job_id,
+            "preview": {
+                "type": "pdf",
+                "url": f"/api/files/{job_id}/preview.pdf",
+                "pageWidthPt": 2772,
+                "pageHeightPt": 1961,
+            },
+            "artifacts": {
+                "json": {"path": "/tmp/structured.json", "url": f"/api/files/{job_id}/mineru/structured.json"},
+                "markdown": {"path": "/tmp/full.md", "url": f"/api/files/{job_id}/mineru/full.md"},
+            },
+            "blocks": [{"id": "block-1", "text": "食品名称:天问礼品粽", "page": 1, "x0_pt": 1, "top_pt": 2, "x1_pt": 3, "bottom_pt": 4}],
+        },
+    )
+
+    with AI_FILE.open("rb") as ai_fp:  # uploads the sample file from WORKDIR
+        response = client.post(
+            "/api/mineru-extract",
+            files={"ai_file": (AI_FILE.name, ai_fp, "application/postscript")},
+        )
+
+    assert response.status_code == 200
+    payload = response.json()
+    assert payload["jobId"]  # any truthy job id is accepted
+    assert payload["preview"]["type"] == "pdf"
+    assert payload["artifacts"]["json"]["url"].endswith("/mineru/structured.json")
+    assert payload["artifacts"]["markdown"]["url"].endswith("/mineru/full.md")
+    assert payload["blocks"][0]["id"] == "block-1"
+
+
+def test_compare_word_endpoint_returns_compared_fields(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+    monkeypatch.setattr(main, "OUTPUTS_DIR", tmp_path)  # redirect main.OUTPUTS_DIR to the per-test temp dir
+    (tmp_path / "test-job").mkdir(parents=True, exist_ok=True)  # presumably the endpoint expects this job directory to exist — confirm in main
+    monkeypatch.setattr(  # swap pipeline.compare_word_with_mineru for a canned result
+        pipeline,
+        "compare_word_with_mineru",
+        lambda _word_path, _output_dir, job_id=None: {  # echoes the received job_id into the result
+            "jobId": job_id,
+            "preview": {
+                "type": "pdf",
+                "url": f"/api/files/{job_id}/preview.pdf",
+                "pageWidthPt": 2772,
+                "pageHeightPt": 1961,
+            },
+            "fields": [
+                {
+                    "id": "field-1",
+                    "text": "食品名称:天问礼品粽",
+                    "page": 1,
+                    "x0_pt": 1,
+                    "top_pt": 2,
+                    "x1_pt": 3,
+                    "bottom_pt": 4,
+                    "normalized_text": "食品名称:天问礼品粽",
+                    "validation_status": "matched",
+                    "validation_reason": "normalized text found in Word content",
+                    "matched_excerpt": "食品名称:天问礼品粽",
+                }
+            ],
+        },
+    )
+
+    with DOCX_FILE.open("rb") as docx_fp:  # uploads the sample Word file from WORKDIR
+        response = client.post(
+            "/api/compare-word",
+            data={"job_id": "test-job"},  # same name as the directory created above
+            files={
+                "word_file": (
+                    DOCX_FILE.name,
+                    docx_fp,
+                    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+                )
+            },
+        )
+
+    assert response.status_code == 200
+    payload = response.json()
+    assert payload["jobId"] == "test-job"  # the posted job_id reaches the stub and comes back in the response
+    assert payload["fields"][0]["validation_status"] == "matched"
--- a/tests/backend/test_barcode_cv.py
+++ b/tests/backend/test_barcode_cv.py
@@ -0,0 +1,30 @@
+from pathlib import Path
+
+from backend.app.barcode_cv import decode_barcode_image
+
+
+WORKDIR = Path("/Users/icemilk/Workspace/zld_POC")  # NOTE(review): absolute path under one user's home directory; the sample images are read from here
+
+
+def test_decode_barcode_image_reads_ean13_from_original_sample() -> None:
+    decoded = decode_barcode_image(WORKDIR / "1.jpg")
+    # All three reported properties must match the known sample.
+    assert decoded["text"] == "6954930015983"
+    assert decoded["format"] == "EAN_13"
+    assert decoded["valid_checksum"] is True
+
+
+def test_decode_barcode_image_reads_ean13_from_ma1_sample() -> None:
+    decoded = decode_barcode_image(WORKDIR / "ma1.png")
+    # Expected to carry the same digits as the "1.jpg" sample.
+    assert decoded["text"] == "6954930015983"
+    assert decoded["format"] == "EAN_13"
+    assert decoded["valid_checksum"] is True
+
+
+def test_decode_barcode_image_reads_ean13_from_ma2_sample() -> None:
+    decoded = decode_barcode_image(WORKDIR / "ma2.png")
+    # This sample carries a different code from the other two.
+    assert decoded["text"] == "6954930016737"
+    assert decoded["format"] == "EAN_13"
+    assert decoded["valid_checksum"] is True
--- a/tests/backend/test_layout_cv.py
+++ b/tests/backend/test_layout_cv.py
@@ -0,0 +1,33 @@
+import cv2
+import numpy as np
+
+from backend.app.layout_cv import Box, detect_text_lines, merge_text_and_rectangles
+
+
+def test_merge_text_and_rectangles_keeps_outer_table_box_and_drops_nested_cells() -> None:
+    rows = [
+        Box(20, 20, 120, 36, "line", "配料"),
+        Box(20, 40, 120, 56, "line", "糯米"),
+        Box(20, 60, 120, 76, "line", "红豆"),
+    ]
+    frames = [
+        Box(10, 10, 150, 90, "rectangle"),  # outer box; the only rectangle expected to survive
+        Box(12, 12, 78, 44, "rectangle"),  # lies inside the outer box
+        Box(82, 12, 148, 44, "rectangle"),  # lies inside the outer box
+    ]
+
+    combined = merge_text_and_rectangles(rows, frames)
+
+    assert [item.kind for item in combined] == ["rectangle", "line", "line", "line"]
+    assert combined[0].as_tuple() == (10, 10, 150, 90)
+
+
+def test_detect_text_lines_finds_two_text_rows_without_ocr() -> None:
+    canvas = np.full((220, 420, 3), 255, dtype=np.uint8)  # all-white canvas
+    cv2.putText(canvas, "LINE ONE", (20, 70), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0, 0, 0), 2, cv2.LINE_AA)
+    cv2.putText(canvas, "LINE TWO", (20, 140), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0, 0, 0), 2, cv2.LINE_AA)
+
+    rows = detect_text_lines(canvas)
+
+    assert len(rows) == 2
+    assert rows[0].y1 < rows[1].y0  # first row ends above where the second begins
--- a/tests/backend/test_mineru_client.py
+++ b/tests/backend/test_mineru_client.py
@@ -0,0 +1,192 @@
+from __future__ import annotations
+
+import io
+import json
+import zipfile
+from pathlib import Path
+
+import pytest
+import requests
+
+from backend.app import mineru_client
+from backend.app.mineru_client import MineruClient, MineruClientError
+
+
+class FakeResponse:  # context-manager stand-in for what the patched request.urlopen returns
+    def __init__(self, status: int, body: bytes):
+        self._payload = body
+        self.status = status
+
+    def read(self) -> bytes:  # hands back the bytes given at construction, unchanged
+        return self._payload
+
+    def __enter__(self) -> "FakeResponse":
+        return self
+
+    def __exit__(self, *_exc_info: object) -> None:  # never suppresses exceptions
+        return None
+
+
+class FakeRequestsResponse:  # minimal stand-in for what the patched requests.put returns
+    def __init__(self, status_code: int, text: str = ""):
+        self.text = text
+        self.status_code = status_code
+
+
+def _zip_with_json() -> bytes:
+    # Zip bytes containing a single member, "demo_middle.json".
+    middle = {"pdf_info": [{"page_idx": 0, "page_size": [1, 1], "para_blocks": []}]}
+    sink = io.BytesIO()
+    with zipfile.ZipFile(sink, "w") as bundle:
+        bundle.writestr("demo_middle.json", json.dumps(middle))
+
+    return sink.getvalue()
+
+
+def _zip_with_layout_and_model() -> bytes:
+    # Zip bytes with "demo_model.json" written first, then "layout.json".
+    model_doc = json.dumps([[{"type": "header"}]])
+    layout_doc = json.dumps({"pdf_info": [{"page_idx": 0, "page_size": [2, 2], "para_blocks": []}]})
+    sink = io.BytesIO()
+    with zipfile.ZipFile(sink, "w") as bundle:
+        bundle.writestr("demo_model.json", model_doc)
+        bundle.writestr("layout.json", layout_doc)
+    return sink.getvalue()
+
+
+def test_submit_pdf_downloads_and_loads_structured_json(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+    calls: list[str] = []  # every URL passed to the fake urlopen, in call order
+
+    def fake_urlopen(request_obj, timeout=0):  # maps each known URL to a canned response; anything else fails the test
+        url = request_obj.full_url if hasattr(request_obj, "full_url") else request_obj  # accepts an object with full_url or a bare URL
+        calls.append(str(url))
+        if str(url).endswith("/api/v4/file-urls/batch"):
+            return FakeResponse(
+                200,
+                json.dumps(
+                    {"code": 0, "data": {"batch_id": "batch-1", "file_urls": ["https://upload.example/file"]}}
+                ).encode(),
+            )
+        if str(url) == "https://upload.example/file":
+            raise AssertionError("upload URL should be handled by requests.put")  # the upload must not go through urlopen
+        if str(url).endswith("/api/v4/extract/task"):
+            return FakeResponse(
+                200,
+                json.dumps({"code": 0, "data": {"task_id": "task-1"}}).encode(),
+            )
+        if str(url).endswith("/api/v4/extract/task/task-1"):
+            return FakeResponse(
+                200,
+                json.dumps({"code": 0, "data": {"state": "done", "full_zip_url": "https://download.example/result.zip"}}).encode(),
+            )
+        if str(url) == "https://download.example/result.zip":
+            return FakeResponse(200, _zip_with_json())
+        raise AssertionError(f"unexpected URL {url}")
+
+    monkeypatch.setattr(mineru_client.request, "urlopen", fake_urlopen)
+    monkeypatch.setattr(mineru_client.requests, "put", lambda url, data, timeout=0: FakeRequestsResponse(200))  # upload reports HTTP 200
+    pdf_path = tmp_path / "preview.pdf"
+    pdf_path.write_bytes(b"%PDF-1.7")  # placeholder content; only the header bytes are written
+
+    payload = MineruClient(api_key="secret", poll_interval_seconds=0, max_polls=1).parse_pdf(pdf_path, tmp_path)
+
+    assert payload["pdf_info"][0]["page_size"] == [1, 1]  # value stored in demo_middle.json inside the fake zip
+    assert calls == [  # exact sequence of urlopen calls expected from parse_pdf
+        "https://mineru.net/api/v4/file-urls/batch",
+        "https://mineru.net/api/v4/extract/task",
+        "https://mineru.net/api/v4/extract/task/task-1",
+        "https://download.example/result.zip",
+    ]
+    assert (tmp_path / "mineru_result.zip").exists()  # the downloaded archive is expected in the output directory
+
+
+def test_submit_pdf_raises_on_failed_task(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+    def fake_urlopen(request_obj, timeout=0):  # canned responses up to a task status of "failed"
+        url = request_obj.full_url if hasattr(request_obj, "full_url") else request_obj  # accepts an object with full_url or a bare URL
+        if str(url).endswith("/api/v4/file-urls/batch"):
+            return FakeResponse(
+                200,
+                json.dumps(
+                    {"code": 0, "data": {"batch_id": "batch-1", "file_urls": ["https://upload.example/file"]}}
+                ).encode(),
+            )
+        if str(url) == "https://upload.example/file":
+            raise AssertionError("upload URL should be handled by requests.put")  # the upload must not go through urlopen
+        if str(url).endswith("/api/v4/extract/task"):
+            return FakeResponse(
+                200,
+                json.dumps({"code": 0, "data": {"task_id": "task-1"}}).encode(),
+            )
+        if str(url).endswith("/api/v4/extract/task/task-1"):
+            return FakeResponse(
+                200,
+                json.dumps({"code": 0, "data": {"state": "failed", "err_msg": "bad pdf"}}).encode(),  # err_msg must appear in the raised error
+            )
+        raise AssertionError(f"unexpected URL {url}")  # no download URL is registered, so a download attempt fails the test
+
+    monkeypatch.setattr(mineru_client.request, "urlopen", fake_urlopen)
+    monkeypatch.setattr(mineru_client.requests, "put", lambda url, data, timeout=0: FakeRequestsResponse(200))  # upload reports HTTP 200
+    pdf_path = tmp_path / "preview.pdf"
+    pdf_path.write_bytes(b"%PDF-1.7")  # placeholder content; only the header bytes are written
+
+    with pytest.raises(MineruClientError, match="bad pdf"):
+        MineruClient(api_key="secret", poll_interval_seconds=0, max_polls=1).parse_pdf(pdf_path, tmp_path)
+
+
+def test_submit_pdf_raises_on_upload_http_error(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+    def fake_urlopen(request_obj, timeout=0):  # only the upload-URL request is answered
+        url = request_obj.full_url if hasattr(request_obj, "full_url") else request_obj  # accepts an object with full_url or a bare URL
+        if str(url).endswith("/api/v4/file-urls/batch"):
+            return FakeResponse(
+                200,
+                json.dumps(
+                    {"code": 0, "data": {"batch_id": "batch-1", "file_urls": ["https://upload.example/file"]}}
+                ).encode(),
+            )
+        raise AssertionError(f"unexpected URL {url}")  # any further urlopen call after the failed upload fails the test
+
+    monkeypatch.setattr(mineru_client.request, "urlopen", fake_urlopen)
+    monkeypatch.setattr(
+        mineru_client.requests,
+        "put",
+        lambda url, data, timeout=0: FakeRequestsResponse(403, "SignatureDoesNotMatch"),  # upload is rejected with HTTP 403
+    )
+    pdf_path = tmp_path / "preview.pdf"
+    pdf_path.write_bytes(b"%PDF-1.7")  # placeholder content; only the header bytes are written
+
+    with pytest.raises(MineruClientError, match="HTTP 403"):  # the error message must mention the status code
+        MineruClient(api_key="secret", poll_interval_seconds=0, max_polls=1).parse_pdf(pdf_path, tmp_path)
+
+
+def test_submit_pdf_prefers_layout_json_over_model_json(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+    def fake_urlopen(request_obj, timeout=0):  # happy-path responses; the zip holds both a model and a layout JSON
+        url = request_obj.full_url if hasattr(request_obj, "full_url") else request_obj  # accepts an object with full_url or a bare URL
+        if str(url).endswith("/api/v4/file-urls/batch"):
+            return FakeResponse(
+                200,
+                json.dumps(
+                    {"code": 0, "data": {"batch_id": "batch-1", "file_urls": ["https://upload.example/file"]}}
+                ).encode(),
+            )
+        if str(url).endswith("/api/v4/extract/task"):
+            return FakeResponse(
+                200,
+                json.dumps({"code": 0, "data": {"task_id": "task-1"}}).encode(),
+            )
+        if str(url).endswith("/api/v4/extract/task/task-1"):
+            return FakeResponse(
+                200,
+                json.dumps({"code": 0, "data": {"state": "done", "full_zip_url": "https://download.example/result.zip"}}).encode(),
+            )
+        if str(url) == "https://download.example/result.zip":
+            return FakeResponse(200, _zip_with_layout_and_model())
+        raise AssertionError(f"unexpected URL {url}")
+
+    monkeypatch.setattr(mineru_client.request, "urlopen", fake_urlopen)
+    monkeypatch.setattr(mineru_client.requests, "put", lambda url, data, timeout=0: FakeRequestsResponse(200))  # upload reports HTTP 200
+    pdf_path = tmp_path / "preview.pdf"
+    pdf_path.write_bytes(b"%PDF-1.7")  # placeholder content; only the header bytes are written
+
+    payload = MineruClient(api_key="secret", poll_interval_seconds=0, max_polls=1).parse_pdf(pdf_path, tmp_path)
+
+    assert payload["pdf_info"][0]["page_size"] == [2, 2]  # [2, 2] exists only in layout.json, so that file was the one loaded
--- a/tests/backend/test_mineru_parser.py
+++ b/tests/backend/test_mineru_parser.py
@@ -0,0 +1,99 @@
+from __future__ import annotations
+
+from backend.app.mineru_parser import parse_mineru_fields
+
+
+def test_parse_mineru_fields_extracts_text_and_bbox() -> None:
+    payload = {  # one page with a single title block holding one text span
+        "pdf_info": [
+            {
+                "page_idx": 0,  # the expected field below reports page 1
+                "page_size": [2772, 1961],
+                "para_blocks": [
+                    {
+                        "bbox": [704, 134, 2106, 229],
+                        "type": "title",
+                        "lines": [
+                            {
+                                "spans": [
+                                    {
+                                        "type": "text",
+                                        "content": "食品名称:天问礼品粽",
+                                        "bbox": [704, 134, 2106, 229],
+                                    }
+                                ]
+                            }
+                        ],
+                    }
+                ],
+            }
+        ]
+    }
+
+    parsed = parse_mineru_fields(payload)
+
+    assert parsed.page_width == 2772  # first page_size entry
+    assert parsed.page_height == 1961  # second page_size entry
+    assert parsed.fields == [  # exact equality: no extra keys are allowed
+        {
+            "page": 1,
+            "text": "食品名称:天问礼品粽",
+            "font_name": "",  # the payload carries no font data, so these three are expected empty
+            "font_size_pt": None,
+            "font_height_mm": None,
+            "x0_pt": 704.0,  # bbox values written as floats (Python == also accepts equal ints)
+            "top_pt": 134.0,
+            "x1_pt": 2106.0,
+            "bottom_pt": 229.0,
+        }
+    ]
+
+
+def test_parse_mineru_fields_turns_table_html_into_text() -> None:
+    payload = {  # one page with a single table block whose span carries HTML rather than "content"
+        "pdf_info": [
+            {
+                "page_idx": 0,
+                "page_size": [1000, 800],
+                "para_blocks": [
+                    {
+                        "bbox": [10, 20, 300, 200],
+                        "type": "table",
+                        "lines": [
+                            {
+                                "spans": [
+                                    {
+                                        "type": "table",
+                                        "html": "<table><tr><td>品种</td><td>规格</td></tr><tr><td>黑猪肉粽</td><td>130克×1</td></tr></table>",
+                                    }
+                                ]
+                            }
+                        ],
+                    }
+                ],
+            }
+        ]
+    }
+
+    parsed = parse_mineru_fields(payload)
+
+    assert parsed.fields[0]["text"] == "品种 规格 黑猪肉粽 130克×1"  # cell texts in document order, separated by single spaces
+
+
+def test_parse_mineru_fields_skips_empty_decorative_blocks() -> None:
+    payload = {  # two blocks, neither of which carries usable text
+        "pdf_info": [
+            {
+                "page_idx": 0,
+                "page_size": [1000, 800],
+                "para_blocks": [
+                    {"bbox": [1, 2, 3, 4], "type": "image", "lines": [{"spans": [{"type": "image"}]}]},  # span has no content at all
+                    {"bbox": [5, 6, 7, 8], "type": "text", "lines": [{"spans": [{"content": "  "}]}]},  # whitespace-only content
+                ],
+            }
+        ]
+    }
+
+    parsed = parse_mineru_fields(payload)
+
+    assert parsed.fields == []  # both blocks must be dropped
--- a/tests/backend/test_pipeline.py
+++ b/tests/backend/test_pipeline.py
@@ -0,0 +1,74 @@
+from pathlib import Path
+
+import pytest
+
+from backend.app import pipeline
+from backend.app.pipeline import process_files
+
+
+WORKDIR = Path("/Users/icemilk/Workspace/zld_POC")  # NOTE(review): absolute path under one user's home directory; not portable
+AI_FILE = WORKDIR / "【2026-04-09】端午 - 背标 - 天问.ai"  # sample .ai file expected inside WORKDIR
+DOCX_FILE = WORKDIR / "天问礼品粽【260331】.docx"  # sample Word file expected inside WORKDIR
+OUTPUT_DIR = WORKDIR / ".tmp_test_output"  # output directory located inside WORKDIR
+
+
+def test_process_files_builds_preview_and_mineru_field_results(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+    def fake_parse_with_mineru(_preview_path: Path, _output_dir: Path):  # canned MinerU result with two text blocks
+        return {
+            "pdf_info": [
+                {
+                    "page_idx": 0,
+                    "page_size": [2772, 1961],
+                    "para_blocks": [
+                        {
+                            "bbox": [704, 134, 2106, 229],
+                            "lines": [{"spans": [{"content": "食品名称:天问礼品粽"}]}],  # expected to be "matched" below
+                        },
+                        {
+                            "bbox": [10, 20, 40, 60],
+                            "lines": [{"spans": [{"content": "Word中不存在的内容"}]}],  # supplies the expected "unmatched" field
+                        },
+                    ],
+                }
+            ]
+        }
+
+    monkeypatch.setattr(pipeline, "_parse_preview_with_mineru", fake_parse_with_mineru)
+
+    result = process_files(AI_FILE, DOCX_FILE, tmp_path, job_id="test-job")  # fresh per-test dir: a preview.pdf left by an earlier run cannot satisfy the final check
+
+    assert result["preview"]["type"] == "pdf"
+    assert result["preview"]["url"] == "/api/files/test-job/preview.pdf"  # URL embeds the job_id passed above
+    assert result["preview"]["pageWidthPt"] == 2772
+    assert result["preview"]["pageHeightPt"] == 1961
+    assert result["fields"][0]["text"] == "食品名称:天问礼品粽"
+    assert result["fields"][0]["validation_status"] == "matched"
+    assert result["fields"][0]["x0_pt"] == 704.0
+    assert any(field["validation_status"] == "unmatched" for field in result["fields"])
+    assert (tmp_path / "preview.pdf").exists()  # the preview is written into the output directory given to process_files
+
+
+def test_parse_preview_with_mineru_reads_key_from_env_file(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+    seen: dict[str, str] = {}  # filled by the fake client's constructor
+
+    class FakeMineruClient:
+        def __init__(self, api_key: str) -> None:
+            seen["api_key"] = api_key
+
+        def parse_pdf(self, preview_path: Path, output_dir: Path) -> dict:
+            return {"preview_path": str(preview_path), "output_dir": str(output_dir)}
+
+    dotenv = tmp_path / ".env"
+    dotenv.write_text("MINERU_API_KEY=from-env-file\n", encoding="utf-8")
+
+    monkeypatch.delenv("MINERU_API_KEY", raising=False)  # the key must not come from the process environment
+    monkeypatch.setattr(pipeline, "ENV_FILE_CANDIDATES", (dotenv,))
+    monkeypatch.setattr(pipeline, "MineruClient", FakeMineruClient)
+
+    pdf_stub = tmp_path / "preview.pdf"
+    pdf_stub.write_bytes(b"%PDF-1.7")
+
+    outcome = pipeline._parse_preview_with_mineru(pdf_stub, tmp_path)
+
+    assert seen["api_key"] == "from-env-file"
+    assert outcome["preview_path"] == str(pdf_stub)
--- a/tests/backend/test_text_validation.py
+++ b/tests/backend/test_text_validation.py
@@ -0,0 +1,32 @@
+from backend.app.text_validation import classify_text_block, normalize_text, validate_field_against_word
+
+
+def test_normalize_text_collapses_whitespace_and_full_width_punctuation() -> None:
+    messy = "  食品生产许可证编号：\nSC11133042404806  "
+    # Expected result has no whitespace left and an ASCII colon instead of the full-width one.
+    assert normalize_text(messy) == "食品生产许可证编号:SC11133042404806"
+
+
+def test_classify_text_block_marks_garbled_text() -> None:
+    rejected = ("<EFBFBD><EFBFBD><EFBFBD><EFBFBD>-<2D><>-<2D><>", "   ")  # hex-escape residue and whitespace-only input
+    assert all(classify_text_block(raw) == "empty_or_garbled" for raw in rejected)
+    assert classify_text_block("食品名称：天问礼品粽") == "candidate"
+
+
+def test_validate_field_against_word_returns_excerpt_for_match() -> None:
+    document = "电话：0573-86981666 食品生产许可证编号：SC11133042404806 产品标准代号：GB/T 46259"
+
+    outcome = validate_field_against_word("食品生产许可证编号：SC11133042404806", document)
+
+    assert outcome.status == "matched"
+    assert outcome.reason == "normalized text found in Word content"
+    assert "SC11133042404806" in (outcome.matched_excerpt or "")  # "or" guards against a None excerpt
+
+
+def test_validate_field_against_word_rejects_missing_text() -> None:
+    document = "产品标准代号：GB/T 46259"  # does not contain the licence number searched for below
+
+    outcome = validate_field_against_word("食品生产许可证编号：SC11133042404806", document)
+
+    assert outcome.status == "unmatched"
+    assert outcome.matched_excerpt is None