Initial commit: 包装审核 POC、Docker 与前后端
Made-with: Cursor
This commit is contained in:
85
tests/backend/test_ai_parser.py
Normal file
85
tests/backend/test_ai_parser.py
Normal file
@@ -0,0 +1,85 @@
|
||||
from types import SimpleNamespace
|
||||
|
||||
from backend.app.ai_parser import (
|
||||
_estimate_text_width,
|
||||
_estimate_text_width_from_text_matrix,
|
||||
_page_horizontal_offset,
|
||||
_text_rect_from_matrix,
|
||||
)
|
||||
|
||||
|
||||
def test_text_rect_from_matrix_uses_rendered_height_and_baseline() -> None:
    """Pin the rectangle returned for the 21 pt food-name line.

    The matrix carries 21.0 in its fourth slot and (435.9155, 629.3184) in
    its last two slots; the expected values are those numbers after the
    helper's conversion.
    NOTE(review): 942.06 looks like the page height used to flip the y axis
    (942.06 - 629.32 = 312.74) -- confirm against ``_text_rect_from_matrix``.
    """
    text_matrix = [19.3618, 0.0, 0.0, 21.0, 435.9155, 629.3184]
    rect = _text_rect_from_matrix(
        "食品名称: 天问礼品粽 (粽子/草木灰咸鸭蛋)",
        text_matrix,
        942.06,
        None,
    )
    size, left, top, right, bottom = rect

    assert size == 21.0
    assert left == 435.92
    assert top == 291.74
    assert bottom == 312.74
    # Only the ordering of the horizontal edges is pinned, not the width.
    assert right > left
|
||||
|
||||
|
||||
def test_text_rect_from_matrix_handles_small_text_without_collapsing_height() -> None:
    """Pin the rectangle returned for an 8 pt line of small print.

    The expected top/bottom pair (417.88 / 425.88) is exactly 8.0 apart,
    matching the fourth matrix entry rather than the smaller first entry
    (4.3157).
    """
    text_matrix = [4.3157, 0.0, 0.0, 8.0, 680.7383741, 516.1778]
    rect = _text_rect_from_matrix(
        "儿童青少年应避免过量摄入盐油糖。",
        text_matrix,
        942.06,
        None,
    )
    size, left, top, right, bottom = rect

    assert size == 8.0
    assert left == 680.74
    assert top == 417.88
    assert bottom == 425.88
    # Only the ordering of the horizontal edges is pinned, not the width.
    assert right > left
|
||||
|
||||
|
||||
def test_text_rect_from_matrix_applies_page_horizontal_offset() -> None:
    """Pin the rectangle when a fifth positional argument (24.21) is passed.

    The expected ``x0`` of 166.4 equals the matrix x value 190.6111 minus
    24.21, rounded to two decimals.
    """
    text_matrix = [7.0652, 0.0, 0.0, 12.36, 190.6111, 873.561]
    horizontal_offset = 24.21
    rect = _text_rect_from_matrix(
        "材质:",
        text_matrix,
        942.06,
        None,
        horizontal_offset,
    )
    size, left, top, right, bottom = rect

    assert size == 12.36
    assert left == 166.4
    assert top == 56.14
    assert bottom == 68.5
    # Only the ordering of the horizontal edges is pinned, not the width.
    assert right > left
|
||||
|
||||
|
||||
def test_page_horizontal_offset_uses_artbox_left_inset() -> None:
    """Expect the offset of a stub page to equal its artbox ``left`` value."""
    art_box = SimpleNamespace(left=24.2137, width=1314.7563)
    crop_box = SimpleNamespace(width=1363.4)
    # A lightweight stand-in exposing only the attributes set here.
    stub_page = SimpleNamespace(artbox=art_box, cropbox=crop_box)

    offset = _page_horizontal_offset(stub_page)

    assert offset == 24.2137
|
||||
|
||||
|
||||
def test_text_matrix_width_is_tighter_than_fallback_for_food_name() -> None:
    """Matrix-based width must land strictly closer to 374.51 than the fallback.

    Both estimates are rounded to two decimals before comparison; a falsy
    matrix estimate is treated as 0.0 and fails the first assertion.
    """
    sample = "食品名称: 天问礼品粽 (粽子/草木灰咸鸭蛋)"
    expected = 374.51

    from_fallback = round(_estimate_text_width(sample, 21.0), 2)
    from_matrix = round(_estimate_text_width_from_text_matrix(sample, 19.3618) or 0.0, 2)

    matrix_error = abs(from_matrix - expected)
    fallback_error = abs(from_fallback - expected)

    assert from_matrix > 0
    assert matrix_error < fallback_error
|
||||
|
||||
|
||||
def test_text_matrix_width_is_tighter_than_fallback_for_small_heading() -> None:
    """Matrix-based width must be at least as close to 21.75 as the fallback.

    Unlike the food-name case, a tie between the two estimates is accepted
    (``<=``). Both estimates are rounded to two decimals first.
    """
    sample = "营养成分表"
    expected = 21.75

    from_fallback = round(_estimate_text_width(sample, 8.0), 2)
    from_matrix = round(_estimate_text_width_from_text_matrix(sample, 4.3157) or 0.0, 2)

    matrix_error = abs(from_matrix - expected)
    fallback_error = abs(from_fallback - expected)

    assert from_matrix > 0
    assert matrix_error <= fallback_error
|
||||
47
tests/backend/test_ai_render_crop.py
Normal file
47
tests/backend/test_ai_render_crop.py
Normal file
@@ -0,0 +1,47 @@
|
||||
from pathlib import Path
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
from backend.app.ai_render_crop import detect_main_content_box, process_ai_render_crop
|
||||
|
||||
|
||||
WORKDIR = Path("/Users/icemilk/Workspace/zld_POC")
|
||||
AI_FILE = WORKDIR / "【2026-04-09】端午 - 背标 - 天问.ai"
|
||||
OUTPUT_DIR = WORKDIR / ".tmp_test_render_crop"
|
||||
|
||||
|
||||
def test_detect_main_content_box_finds_centered_content() -> None:
    """The detected box must strictly enclose a frame drawn on a white canvas.

    A 600x400 white image gets a dark rectangle from (120, 90) to (520, 310)
    with a caption inside; every edge of the returned box has to lie outside
    that rectangle.
    """
    frame_top_left = (120, 90)
    frame_bottom_right = (520, 310)

    canvas = np.full((400, 600, 3), 255, dtype=np.uint8)
    cv2.rectangle(canvas, frame_top_left, frame_bottom_right, (10, 10, 10), 3)
    cv2.putText(
        canvas,
        "MAIN CONTENT",
        (150, 210),
        cv2.FONT_HERSHEY_SIMPLEX,
        1.2,
        (20, 20, 20),
        3,
        cv2.LINE_AA,
    )

    left, top, right, bottom = detect_main_content_box(canvas)

    assert left < frame_top_left[0]
    assert top < frame_top_left[1]
    assert right > frame_bottom_right[0]
    assert bottom > frame_bottom_right[1]
|
||||
|
||||
|
||||
def test_process_ai_render_crop_outputs_full_and_cropped_images(tmp_path: Path) -> None:
    """Render the sample ``.ai`` file and check both PNGs and the crop box.

    Output goes to pytest's per-test ``tmp_path`` instead of a fixed
    directory. A directory that persists between runs could still hold PNGs
    from an earlier run, letting the ``exists()`` / ``imread`` checks pass
    even when the current call wrote nothing.
    """
    result = process_ai_render_crop(AI_FILE, tmp_path)

    assert result["fullImage"]["url"].endswith(".png")
    assert result["croppedImage"]["url"].endswith(".png")

    crop_box = result["cropBox"]
    assert crop_box["x0"] >= 0
    assert crop_box["y0"] >= 0
    assert crop_box["x1"] > crop_box["x0"]
    assert crop_box["y1"] > crop_box["y0"]

    # Only the file name from each URL is used to locate the file on disk.
    full_path = tmp_path / Path(result["fullImage"]["url"]).name
    cropped_path = tmp_path / Path(result["croppedImage"]["url"]).name
    assert full_path.exists()
    assert cropped_path.exists()

    full_image = cv2.imread(str(full_path))
    cropped_image = cv2.imread(str(cropped_path))
    # cv2.imread returns None instead of raising when a file cannot be decoded.
    assert full_image is not None
    assert cropped_image is not None

    # The crop must be strictly smaller than the full render in both axes.
    assert cropped_image.shape[1] < full_image.shape[1]
    assert cropped_image.shape[0] < full_image.shape[0]
|
||||
31
tests/backend/test_ai_render_crop_api.py
Normal file
31
tests/backend/test_ai_render_crop_api.py
Normal file
@@ -0,0 +1,31 @@
|
||||
from pathlib import Path
|
||||
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
from backend.app.main import app
|
||||
|
||||
|
||||
WORKDIR = Path("/Users/icemilk/Workspace/zld_POC")
|
||||
AI_FILE = WORKDIR / "【2026-04-09】端午 - 背标 - 天问.ai"
|
||||
|
||||
client = TestClient(app)
|
||||
|
||||
|
||||
def test_ai_render_crop_endpoint_returns_two_images_and_crop_box() -> None:
    """POST the sample ``.ai`` file and check the shape of the JSON reply.

    Both image entries must be typed ``"image"`` with a ``.png`` URL, and the
    crop box must have positive width and height.
    """
    with AI_FILE.open("rb") as ai_fp:
        upload = {"ai_file": (AI_FILE.name, ai_fp, "application/postscript")}
        response = client.post("/api/ai-render-crop", files=upload)

    assert response.status_code == 200

    payload = response.json()
    image_keys = ("fullImage", "croppedImage")
    for key in image_keys:
        assert payload[key]["type"] == "image"
    for key in image_keys:
        assert payload[key]["url"].endswith(".png")

    crop_box = payload["cropBox"]
    assert crop_box["x1"] > crop_box["x0"]
    assert crop_box["y1"] > crop_box["y0"]
|
||||
169
tests/backend/test_api.py
Normal file
169
tests/backend/test_api.py
Normal file
@@ -0,0 +1,169 @@
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
from backend.app import pipeline
|
||||
from backend.app import main
|
||||
from backend.app.main import app
|
||||
|
||||
|
||||
WORKDIR = Path("/Users/icemilk/Workspace/zld_POC")
|
||||
AI_FILE = WORKDIR / "【2026-04-09】端午 - 背标 - 天问.ai"
|
||||
DOCX_FILE = WORKDIR / "天问礼品粽【260331】.docx"
|
||||
|
||||
|
||||
client = TestClient(app)
|
||||
|
||||
|
||||
def fake_mineru_payload() -> dict:
    """Return a minimal MinerU-style payload with one page and one text block.

    The tests in this module install it as the return value of the patched
    ``pipeline._parse_preview_with_mineru``. A fresh dict is built on every
    call, so callers may mutate the result freely.
    """
    return {
        "pdf_info": [
            {
                "page_idx": 0,
                "page_size": [2772, 1961],
                "para_blocks": [
                    {
                        "bbox": [704, 134, 2106, 229],
                        "lines": [{"spans": [{"content": "食品名称:天问礼品粽"}]}],
                    }
                ],
            }
        ]
    }
|
||||
|
||||
|
||||
def test_process_endpoint_returns_preview_and_fields(monkeypatch: pytest.MonkeyPatch) -> None:
    """Upload both sample files to ``/api/process`` with MinerU stubbed out.

    The reply must describe a PDF preview 2772 wide and expose the stubbed
    text as the first field.
    """

    def stub_parse(_preview_path, _output_dir):
        return fake_mineru_payload()

    monkeypatch.setattr(pipeline, "_parse_preview_with_mineru", stub_parse)

    docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    with AI_FILE.open("rb") as ai_fp, DOCX_FILE.open("rb") as docx_fp:
        uploads = {
            "ai_file": (AI_FILE.name, ai_fp, "application/postscript"),
            "word_file": (DOCX_FILE.name, docx_fp, docx_mime),
        }
        response = client.post("/api/process", files=uploads)

    assert response.status_code == 200

    payload = response.json()
    preview, fields = payload["preview"], payload["fields"]
    assert preview["type"] == "pdf"
    assert fields
    assert preview["pageWidthPt"] == 2772
    assert fields[0]["text"] == "食品名称:天问礼品粽"
|
||||
|
||||
|
||||
def test_process_endpoint_uses_default_sample_files_when_uploads_are_missing(monkeypatch: pytest.MonkeyPatch) -> None:
    """POST ``/api/process`` with no files and still expect a usable reply.

    MinerU parsing is stubbed; the reply must contain a PDF preview and at
    least one field with non-empty text.
    """

    def stub_parse(_preview_path, _output_dir):
        return fake_mineru_payload()

    monkeypatch.setattr(pipeline, "_parse_preview_with_mineru", stub_parse)

    response = client.post("/api/process")

    assert response.status_code == 200

    payload = response.json()
    fields = payload["fields"]
    assert payload["preview"]["type"] == "pdf"
    assert fields
    assert any(entry["text"] for entry in fields)
|
||||
|
||||
|
||||
def test_process_endpoint_surfaces_missing_mineru_key(monkeypatch: pytest.MonkeyPatch) -> None:
    """A ``RuntimeError`` from the MinerU step must become an HTTP 500.

    The error message raised by the stub has to appear unchanged in the
    ``detail`` field of the JSON reply.
    """
    message = "MINERU_API_KEY is required"

    def failing_parse(_preview_path, _output_dir):
        raise RuntimeError(message)

    monkeypatch.setattr(pipeline, "_parse_preview_with_mineru", failing_parse)

    response = client.post("/api/process")

    assert response.status_code == 500
    assert response.json()["detail"] == message
|
||||
|
||||
|
||||
def test_mineru_extract_endpoint_returns_job_preview_and_blocks(monkeypatch: pytest.MonkeyPatch) -> None:
    """POST the sample ``.ai`` file to ``/api/mineru-extract`` with a stub.

    ``pipeline.extract_mineru_result`` is replaced by a function that echoes
    the ``job_id`` it receives, so a truthy ``jobId`` in the reply shows the
    endpoint supplied one.
    """

    def stub_extract(_ai_path, _output_dir, job_id=None):
        files_root = f"/api/files/{job_id}"
        return {
            "jobId": job_id,
            "preview": {
                "type": "pdf",
                "url": f"{files_root}/preview.pdf",
                "pageWidthPt": 2772,
                "pageHeightPt": 1961,
            },
            "artifacts": {
                "json": {"path": "/tmp/structured.json", "url": f"{files_root}/mineru/structured.json"},
                "markdown": {"path": "/tmp/full.md", "url": f"{files_root}/mineru/full.md"},
            },
            "blocks": [
                {
                    "id": "block-1",
                    "text": "食品名称:天问礼品粽",
                    "page": 1,
                    "x0_pt": 1,
                    "top_pt": 2,
                    "x1_pt": 3,
                    "bottom_pt": 4,
                }
            ],
        }

    monkeypatch.setattr(pipeline, "extract_mineru_result", stub_extract)

    with AI_FILE.open("rb") as ai_fp:
        upload = {"ai_file": (AI_FILE.name, ai_fp, "application/postscript")}
        response = client.post("/api/mineru-extract", files=upload)

    assert response.status_code == 200
    payload = response.json()
    artifacts = payload["artifacts"]
    assert payload["jobId"]
    assert payload["preview"]["type"] == "pdf"
    assert artifacts["json"]["url"].endswith("/mineru/structured.json")
    assert artifacts["markdown"]["url"].endswith("/mineru/full.md")
    assert payload["blocks"][0]["id"] == "block-1"
|
||||
|
||||
|
||||
def test_compare_word_endpoint_returns_compared_fields(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """POST a Word file for an existing job to ``/api/compare-word``.

    ``main.OUTPUTS_DIR`` is redirected to ``tmp_path`` and a ``test-job``
    directory is created there before the request.
    ``pipeline.compare_word_with_mineru`` is stubbed to return one matched
    field and to echo the job id it receives.
    """
    job_name = "test-job"
    field_text = "食品名称:天问礼品粽"

    def stub_compare(_word_path, _output_dir, job_id=None):
        matched_field = {
            "id": "field-1",
            "text": field_text,
            "page": 1,
            "x0_pt": 1,
            "top_pt": 2,
            "x1_pt": 3,
            "bottom_pt": 4,
            "normalized_text": field_text,
            "validation_status": "matched",
            "validation_reason": "normalized text found in Word content",
            "matched_excerpt": field_text,
        }
        return {
            "jobId": job_id,
            "preview": {
                "type": "pdf",
                "url": f"/api/files/{job_id}/preview.pdf",
                "pageWidthPt": 2772,
                "pageHeightPt": 1961,
            },
            "fields": [matched_field],
        }

    monkeypatch.setattr(main, "OUTPUTS_DIR", tmp_path)
    (tmp_path / job_name).mkdir(parents=True, exist_ok=True)
    monkeypatch.setattr(pipeline, "compare_word_with_mineru", stub_compare)

    docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    with DOCX_FILE.open("rb") as docx_fp:
        response = client.post(
            "/api/compare-word",
            data={"job_id": job_name},
            files={"word_file": (DOCX_FILE.name, docx_fp, docx_mime)},
        )

    assert response.status_code == 200
    payload = response.json()
    assert payload["jobId"] == job_name
    assert payload["fields"][0]["validation_status"] == "matched"
|
||||
30
tests/backend/test_barcode_cv.py
Normal file
30
tests/backend/test_barcode_cv.py
Normal file
@@ -0,0 +1,30 @@
|
||||
from pathlib import Path
|
||||
|
||||
from backend.app.barcode_cv import decode_barcode_image
|
||||
|
||||
|
||||
WORKDIR = Path("/Users/icemilk/Workspace/zld_POC")
|
||||
|
||||
|
||||
def test_decode_barcode_image_reads_ean13_from_original_sample() -> None:
    """Decode ``1.jpg`` and expect EAN-13 ``6954930015983`` with a valid checksum."""
    sample_path = WORKDIR / "1.jpg"

    decoded = decode_barcode_image(sample_path)

    assert decoded["text"] == "6954930015983"
    assert decoded["format"] == "EAN_13"
    # Identity check: the flag must be the bool True, not merely truthy.
    assert decoded["valid_checksum"] is True
|
||||
|
||||
|
||||
def test_decode_barcode_image_reads_ean13_from_ma1_sample() -> None:
    """Decode ``ma1.png`` and expect EAN-13 ``6954930015983`` with a valid checksum."""
    sample_path = WORKDIR / "ma1.png"

    decoded = decode_barcode_image(sample_path)

    assert decoded["text"] == "6954930015983"
    assert decoded["format"] == "EAN_13"
    # Identity check: the flag must be the bool True, not merely truthy.
    assert decoded["valid_checksum"] is True
|
||||
|
||||
|
||||
def test_decode_barcode_image_reads_ean13_from_ma2_sample() -> None:
    """Decode ``ma2.png`` and expect EAN-13 ``6954930016737`` with a valid checksum."""
    sample_path = WORKDIR / "ma2.png"

    decoded = decode_barcode_image(sample_path)

    assert decoded["text"] == "6954930016737"
    assert decoded["format"] == "EAN_13"
    # Identity check: the flag must be the bool True, not merely truthy.
    assert decoded["valid_checksum"] is True
|
||||
33
tests/backend/test_layout_cv.py
Normal file
33
tests/backend/test_layout_cv.py
Normal file
@@ -0,0 +1,33 @@
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
from backend.app.layout_cv import Box, detect_text_lines, merge_text_and_rectangles
|
||||
|
||||
|
||||
def test_merge_text_and_rectangles_keeps_outer_table_box_and_drops_nested_cells() -> None:
    """Merging must keep the outer rectangle and discard the two inner ones.

    Three stacked text lines sit inside one large rectangle that also
    contains two smaller rectangles. The merged result is expected to be the
    outer rectangle followed by the three lines.
    """
    labelled_rows = ((20, "配料"), (40, "糯米"), (60, "红豆"))
    # Each line is 100 wide and 16 tall, starting at x=20.
    text_lines = [Box(20, top, 120, top + 16, "line", label) for top, label in labelled_rows]

    outer_box = Box(10, 10, 150, 90, "rectangle")
    left_cell = Box(12, 12, 78, 44, "rectangle")
    right_cell = Box(82, 12, 148, 44, "rectangle")
    rectangles = [outer_box, left_cell, right_cell]

    merged = merge_text_and_rectangles(text_lines, rectangles)

    kinds = [entry.kind for entry in merged]
    assert kinds == ["rectangle", "line", "line", "line"]
    assert merged[0].as_tuple() == (10, 10, 150, 90)
|
||||
|
||||
|
||||
def test_detect_text_lines_finds_two_text_rows_without_ocr() -> None:
    """Two captions drawn on a white canvas must come back as two ordered rows.

    The first detected row has to end above the start of the second one.
    """
    canvas = np.full((220, 420, 3), 255, dtype=np.uint8)
    captions = (("LINE ONE", (20, 70)), ("LINE TWO", (20, 140)))
    for caption, origin in captions:
        cv2.putText(canvas, caption, origin, cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0, 0, 0), 2, cv2.LINE_AA)

    rows = detect_text_lines(canvas)

    assert len(rows) == 2
    upper, lower = rows[0], rows[1]
    assert upper.y1 < lower.y0
|
||||
192
tests/backend/test_mineru_client.py
Normal file
192
tests/backend/test_mineru_client.py
Normal file
@@ -0,0 +1,192 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import json
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
|
||||
from backend.app import mineru_client
|
||||
from backend.app.mineru_client import MineruClient, MineruClientError
|
||||
|
||||
|
||||
class FakeResponse:
    """In-memory stand-in for the object returned by the patched ``urlopen``.

    It exposes a ``status`` attribute and ``read()``, and works as a context
    manager so it can be used in a ``with`` statement.
    """

    def __init__(self, status: int, body: bytes):
        self.status = status
        self._body = body

    def read(self) -> bytes:
        """Return the full canned body; repeated calls return the same bytes."""
        return self._body

    def __enter__(self) -> "FakeResponse":
        return self

    def __exit__(self, *_args: object) -> None:
        # Returning None lets any exception raised inside the block propagate.
        return None
|
||||
|
||||
|
||||
class FakeRequestsResponse:
    """Minimal stand-in for the value returned by the patched ``requests.put``.

    Only ``status_code`` and ``text`` are provided.
    """

    def __init__(self, status_code: int, text: str = ""):
        self.status_code = status_code
        self.text = text
|
||||
|
||||
|
||||
def _zip_with_json() -> bytes:
    """Build an in-memory zip holding a single ``demo_middle.json`` member.

    The member is a JSON document with one page of size ``[1, 1]`` and no
    paragraph blocks. Returns the raw bytes of the archive.
    """
    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, "w") as archive:
        archive.writestr(
            "demo_middle.json",
            json.dumps({"pdf_info": [{"page_idx": 0, "page_size": [1, 1], "para_blocks": []}]}),
        )
    # Read the buffer only after the archive is closed, so the zip's central
    # directory has been written.
    return buffer.getvalue()
|
||||
|
||||
|
||||
def _zip_with_layout_and_model() -> bytes:
    """Build an in-memory zip with ``demo_model.json`` and ``layout.json``.

    ``demo_model.json`` is written first and holds a nested list, while
    ``layout.json`` holds a ``pdf_info`` document with page size ``[2, 2]``.
    Returns the raw bytes of the archive.
    """
    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, "w") as archive:
        archive.writestr("demo_model.json", json.dumps([[{"type": "header"}]]))
        archive.writestr(
            "layout.json",
            json.dumps({"pdf_info": [{"page_idx": 0, "page_size": [2, 2], "para_blocks": []}]}),
        )
    # Read the buffer only after the archive is closed, so the zip's central
    # directory has been written.
    return buffer.getvalue()
|
||||
|
||||
|
||||
def test_submit_pdf_downloads_and_loads_structured_json(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """Drive ``MineruClient.parse_pdf`` through a fully faked happy path.

    ``urlopen`` is replaced by a fake that records every URL it is asked for,
    and ``requests.put`` by one that always answers 200. The test checks the
    parsed payload, the exact sequence of ``urlopen`` URLs, and that
    ``mineru_result.zip`` was written to the output directory.
    """
    upload_url = "https://upload.example/file"
    zip_url = "https://download.example/result.zip"
    seen_urls: list[str] = []

    def json_reply(body: dict) -> FakeResponse:
        return FakeResponse(200, json.dumps(body).encode())

    def fake_urlopen(request_obj, timeout=0):
        # Callers may pass either a Request-like object or a plain URL.
        target = str(getattr(request_obj, "full_url", request_obj))
        seen_urls.append(target)
        if target.endswith("/api/v4/file-urls/batch"):
            return json_reply({"code": 0, "data": {"batch_id": "batch-1", "file_urls": [upload_url]}})
        if target == upload_url:
            raise AssertionError("upload URL should be handled by requests.put")
        if target.endswith("/api/v4/extract/task"):
            return json_reply({"code": 0, "data": {"task_id": "task-1"}})
        if target.endswith("/api/v4/extract/task/task-1"):
            return json_reply({"code": 0, "data": {"state": "done", "full_zip_url": zip_url}})
        if target == zip_url:
            return FakeResponse(200, _zip_with_json())
        raise AssertionError(f"unexpected URL {target}")

    def fake_put(url, data, timeout=0):
        return FakeRequestsResponse(200)

    monkeypatch.setattr(mineru_client.request, "urlopen", fake_urlopen)
    monkeypatch.setattr(mineru_client.requests, "put", fake_put)
    pdf_path = tmp_path / "preview.pdf"
    pdf_path.write_bytes(b"%PDF-1.7")

    mineru = MineruClient(api_key="secret", poll_interval_seconds=0, max_polls=1)
    payload = mineru.parse_pdf(pdf_path, tmp_path)

    assert payload["pdf_info"][0]["page_size"] == [1, 1]
    assert seen_urls == [
        "https://mineru.net/api/v4/file-urls/batch",
        "https://mineru.net/api/v4/extract/task",
        "https://mineru.net/api/v4/extract/task/task-1",
        "https://download.example/result.zip",
    ]
    assert (tmp_path / "mineru_result.zip").exists()
|
||||
|
||||
|
||||
def test_submit_pdf_raises_on_failed_task(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """A task reported as ``failed`` must raise ``MineruClientError``.

    The faked task-status reply carries ``err_msg`` "bad pdf", which has to
    show up in the raised error's message.
    """
    upload_url = "https://upload.example/file"

    def json_reply(body: dict) -> FakeResponse:
        return FakeResponse(200, json.dumps(body).encode())

    def fake_urlopen(request_obj, timeout=0):
        # Callers may pass either a Request-like object or a plain URL.
        target = str(getattr(request_obj, "full_url", request_obj))
        if target.endswith("/api/v4/file-urls/batch"):
            return json_reply({"code": 0, "data": {"batch_id": "batch-1", "file_urls": [upload_url]}})
        if target == upload_url:
            raise AssertionError("upload URL should be handled by requests.put")
        if target.endswith("/api/v4/extract/task"):
            return json_reply({"code": 0, "data": {"task_id": "task-1"}})
        if target.endswith("/api/v4/extract/task/task-1"):
            return json_reply({"code": 0, "data": {"state": "failed", "err_msg": "bad pdf"}})
        raise AssertionError(f"unexpected URL {target}")

    def fake_put(url, data, timeout=0):
        return FakeRequestsResponse(200)

    monkeypatch.setattr(mineru_client.request, "urlopen", fake_urlopen)
    monkeypatch.setattr(mineru_client.requests, "put", fake_put)
    pdf_path = tmp_path / "preview.pdf"
    pdf_path.write_bytes(b"%PDF-1.7")

    with pytest.raises(MineruClientError, match="bad pdf"):
        MineruClient(api_key="secret", poll_interval_seconds=0, max_polls=1).parse_pdf(pdf_path, tmp_path)
|
||||
|
||||
|
||||
def test_submit_pdf_raises_on_upload_http_error(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """A 403 from the upload ``PUT`` must raise ``MineruClientError``.

    The fake ``urlopen`` answers only the upload-URL request and rejects any
    other URL, so the client must stop before calling it again. The error
    message has to mention "HTTP 403".
    """

    def fake_urlopen(request_obj, timeout=0):
        # Callers may pass either a Request-like object or a plain URL.
        target = str(getattr(request_obj, "full_url", request_obj))
        if not target.endswith("/api/v4/file-urls/batch"):
            raise AssertionError(f"unexpected URL {target}")
        batch_reply = {"code": 0, "data": {"batch_id": "batch-1", "file_urls": ["https://upload.example/file"]}}
        return FakeResponse(200, json.dumps(batch_reply).encode())

    def rejecting_put(url, data, timeout=0):
        return FakeRequestsResponse(403, "SignatureDoesNotMatch")

    monkeypatch.setattr(mineru_client.request, "urlopen", fake_urlopen)
    monkeypatch.setattr(mineru_client.requests, "put", rejecting_put)
    pdf_path = tmp_path / "preview.pdf"
    pdf_path.write_bytes(b"%PDF-1.7")

    with pytest.raises(MineruClientError, match="HTTP 403"):
        MineruClient(api_key="secret", poll_interval_seconds=0, max_polls=1).parse_pdf(pdf_path, tmp_path)
|
||||
|
||||
|
||||
def test_submit_pdf_prefers_layout_json_over_model_json(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """With both JSON files in the zip, the payload must come from ``layout.json``.

    The downloaded archive holds ``demo_model.json`` (a bare list) and
    ``layout.json`` (page size ``[2, 2]``); the returned payload is expected
    to carry the ``[2, 2]`` page size.
    """
    zip_url = "https://download.example/result.zip"

    def json_reply(body: dict) -> FakeResponse:
        return FakeResponse(200, json.dumps(body).encode())

    def fake_urlopen(request_obj, timeout=0):
        # Callers may pass either a Request-like object or a plain URL.
        target = str(getattr(request_obj, "full_url", request_obj))
        if target.endswith("/api/v4/file-urls/batch"):
            return json_reply(
                {"code": 0, "data": {"batch_id": "batch-1", "file_urls": ["https://upload.example/file"]}}
            )
        if target.endswith("/api/v4/extract/task"):
            return json_reply({"code": 0, "data": {"task_id": "task-1"}})
        if target.endswith("/api/v4/extract/task/task-1"):
            return json_reply({"code": 0, "data": {"state": "done", "full_zip_url": zip_url}})
        if target == zip_url:
            return FakeResponse(200, _zip_with_layout_and_model())
        raise AssertionError(f"unexpected URL {target}")

    def fake_put(url, data, timeout=0):
        return FakeRequestsResponse(200)

    monkeypatch.setattr(mineru_client.request, "urlopen", fake_urlopen)
    monkeypatch.setattr(mineru_client.requests, "put", fake_put)
    pdf_path = tmp_path / "preview.pdf"
    pdf_path.write_bytes(b"%PDF-1.7")

    mineru = MineruClient(api_key="secret", poll_interval_seconds=0, max_polls=1)
    payload = mineru.parse_pdf(pdf_path, tmp_path)

    assert payload["pdf_info"][0]["page_size"] == [2, 2]
|
||||
99
tests/backend/test_mineru_parser.py
Normal file
99
tests/backend/test_mineru_parser.py
Normal file
@@ -0,0 +1,99 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from backend.app.mineru_parser import parse_mineru_fields
|
||||
|
||||
|
||||
def test_parse_mineru_fields_extracts_text_and_bbox() -> None:
    """A single title block must yield one field with float coordinates.

    The expected field uses a 1-based page number (``page_idx`` 0 -> 1),
    the block's bbox as floats, an empty font name and ``None`` for both
    font-size values.
    """
    bbox = [704, 134, 2106, 229]
    title_span = {"type": "text", "content": "食品名称:天问礼品粽", "bbox": list(bbox)}
    title_block = {"bbox": list(bbox), "type": "title", "lines": [{"spans": [title_span]}]}
    payload = {
        "pdf_info": [
            {"page_idx": 0, "page_size": [2772, 1961], "para_blocks": [title_block]},
        ]
    }

    parsed = parse_mineru_fields(payload)

    expected_field = {
        "page": 1,
        "text": "食品名称:天问礼品粽",
        "font_name": "",
        "font_size_pt": None,
        "font_height_mm": None,
        "x0_pt": 704.0,
        "top_pt": 134.0,
        "x1_pt": 2106.0,
        "bottom_pt": 229.0,
    }
    assert parsed.page_width == 2772
    assert parsed.page_height == 1961
    assert parsed.fields == [expected_field]
|
||||
|
||||
|
||||
def test_parse_mineru_fields_turns_table_html_into_text() -> None:
    """A table span carrying HTML must be flattened to space-separated text.

    The 2x2 table is expected to become its four cell texts in row order,
    joined by single spaces.
    """
    table_html = "<table><tr><td>品种</td><td>规格</td></tr><tr><td>黑猪肉粽</td><td>130克×1</td></tr></table>"
    table_block = {
        "bbox": [10, 20, 300, 200],
        "type": "table",
        "lines": [{"spans": [{"type": "table", "html": table_html}]}],
    }
    payload = {
        "pdf_info": [
            {"page_idx": 0, "page_size": [1000, 800], "para_blocks": [table_block]},
        ]
    }

    parsed = parse_mineru_fields(payload)

    first_field = parsed.fields[0]
    assert first_field["text"] == "品种 规格 黑猪肉粽 130克×1"
|
||||
|
||||
|
||||
def test_parse_mineru_fields_skips_empty_decorative_blocks() -> None:
    """Blocks without usable text must produce no fields at all.

    The page holds an image block whose span has no content and a text block
    whose only content is a single space.
    """
    image_block = {"bbox": [1, 2, 3, 4], "type": "image", "lines": [{"spans": [{"type": "image"}]}]}
    blank_text_block = {"bbox": [5, 6, 7, 8], "type": "text", "lines": [{"spans": [{"content": " "}]}]}
    payload = {
        "pdf_info": [
            {
                "page_idx": 0,
                "page_size": [1000, 800],
                "para_blocks": [image_block, blank_text_block],
            }
        ]
    }

    parsed = parse_mineru_fields(payload)

    assert parsed.fields == []
|
||||
74
tests/backend/test_pipeline.py
Normal file
74
tests/backend/test_pipeline.py
Normal file
@@ -0,0 +1,74 @@
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from backend.app import pipeline
|
||||
from backend.app.pipeline import process_files
|
||||
|
||||
|
||||
WORKDIR = Path("/Users/icemilk/Workspace/zld_POC")
|
||||
AI_FILE = WORKDIR / "【2026-04-09】端午 - 背标 - 天问.ai"
|
||||
DOCX_FILE = WORKDIR / "天问礼品粽【260331】.docx"
|
||||
OUTPUT_DIR = WORKDIR / ".tmp_test_output"
|
||||
|
||||
|
||||
def test_process_files_builds_preview_and_mineru_field_results(
    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
) -> None:
    """Run ``process_files`` on the sample inputs with MinerU parsing stubbed.

    The stub returns two text blocks: one expected to validate as "matched"
    and one whose text says it does not exist in the Word document, so at
    least one field must come back "unmatched".

    Output goes to pytest's per-test ``tmp_path`` instead of a fixed
    directory. A directory that persists between runs could still hold a
    ``preview.pdf`` from an earlier run, which would satisfy the final
    ``exists()`` check even if the current call wrote nothing.
    """

    def fake_parse_with_mineru(_preview_path: Path, _output_dir: Path):
        return {
            "pdf_info": [
                {
                    "page_idx": 0,
                    "page_size": [2772, 1961],
                    "para_blocks": [
                        {
                            "bbox": [704, 134, 2106, 229],
                            "lines": [{"spans": [{"content": "食品名称:天问礼品粽"}]}],
                        },
                        {
                            "bbox": [10, 20, 40, 60],
                            "lines": [{"spans": [{"content": "Word中不存在的内容"}]}],
                        },
                    ],
                }
            ]
        }

    monkeypatch.setattr(pipeline, "_parse_preview_with_mineru", fake_parse_with_mineru)

    result = process_files(AI_FILE, DOCX_FILE, tmp_path, job_id="test-job")

    preview = result["preview"]
    assert preview["type"] == "pdf"
    assert preview["url"] == "/api/files/test-job/preview.pdf"
    assert preview["pageWidthPt"] == 2772
    assert preview["pageHeightPt"] == 1961

    first_field = result["fields"][0]
    assert first_field["text"] == "食品名称:天问礼品粽"
    assert first_field["validation_status"] == "matched"
    assert first_field["x0_pt"] == 704.0
    assert any(field["validation_status"] == "unmatched" for field in result["fields"])

    assert (tmp_path / "preview.pdf").exists()
|
||||
|
||||
|
||||
def test_parse_preview_with_mineru_reads_key_from_env_file(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """The API key must be picked up from a ``.env`` file when the env var is unset.

    ``MINERU_API_KEY`` is removed from the environment,
    ``pipeline.ENV_FILE_CANDIDATES`` is pointed at a temporary ``.env`` file,
    and ``pipeline.MineruClient`` is swapped for a recorder that stores the
    key it was constructed with.
    """
    recorded: dict[str, str] = {}

    class FakeMineruClient:
        def __init__(self, api_key: str) -> None:
            recorded["api_key"] = api_key

        def parse_pdf(self, preview_path: Path, output_dir: Path) -> dict:
            return {"preview_path": str(preview_path), "output_dir": str(output_dir)}

    dotenv_path = tmp_path / ".env"
    dotenv_path.write_text("MINERU_API_KEY=from-env-file\n", encoding="utf-8")

    monkeypatch.delenv("MINERU_API_KEY", raising=False)
    monkeypatch.setattr(pipeline, "ENV_FILE_CANDIDATES", (dotenv_path,))
    monkeypatch.setattr(pipeline, "MineruClient", FakeMineruClient)

    pdf_path = tmp_path / "preview.pdf"
    pdf_path.write_bytes(b"%PDF-1.7")

    outcome = pipeline._parse_preview_with_mineru(pdf_path, tmp_path)

    assert recorded["api_key"] == "from-env-file"
    assert outcome["preview_path"] == str(pdf_path)
|
||||
32
tests/backend/test_text_validation.py
Normal file
32
tests/backend/test_text_validation.py
Normal file
@@ -0,0 +1,32 @@
|
||||
from backend.app.text_validation import classify_text_block, normalize_text, validate_field_against_word
|
||||
|
||||
|
||||
def test_normalize_text_collapses_whitespace_and_full_width_punctuation() -> None:
    """Surrounding spaces and the embedded newline must be removed."""
    untidy = " 食品生产许可证编号:\nSC11133042404806 "
    expected = "食品生产许可证编号:SC11133042404806"

    cleaned = normalize_text(untidy)

    assert cleaned == expected
|
||||
|
||||
|
||||
def test_classify_text_block_marks_garbled_text() -> None:
    """Garbled and blank inputs are rejected; a readable label is a candidate."""
    expectations = (
        ("<EFBFBD><EFBFBD><EFBFBD><EFBFBD>-<2D><>-<2D><>", "empty_or_garbled"),
        (" ", "empty_or_garbled"),
        ("食品名称:天问礼品粽", "candidate"),
    )
    for sample, expected_label in expectations:
        assert classify_text_block(sample) == expected_label
|
||||
|
||||
|
||||
def test_validate_field_against_word_returns_excerpt_for_match() -> None:
    """A field present in the Word text must match and return an excerpt.

    The excerpt may be ``None``-safe checked: it is coerced to an empty
    string before the licence number is searched for in it.
    """
    field_text = "食品生产许可证编号:SC11133042404806"
    word_text = "电话:0573-86981666 食品生产许可证编号:SC11133042404806 产品标准代号:GB/T 46259"

    outcome = validate_field_against_word(field_text, word_text)

    assert outcome.status == "matched"
    assert outcome.reason == "normalized text found in Word content"
    excerpt = outcome.matched_excerpt or ""
    assert "SC11133042404806" in excerpt
|
||||
|
||||
|
||||
def test_validate_field_against_word_rejects_missing_text() -> None:
    """A field absent from the Word text must be unmatched with no excerpt."""
    field_text = "食品生产许可证编号:SC11133042404806"
    word_text = "产品标准代号:GB/T 46259"

    outcome = validate_field_against_word(field_text, word_text)

    assert outcome.status == "unmatched"
    assert outcome.matched_excerpt is None
|
||||
Reference in New Issue
Block a user