# ZLD_POC/tests/backend/test_api.py

import os
from pathlib import Path

import pytest
from fastapi.testclient import TestClient

from backend.app import main
from backend.app import pipeline
from backend.app.main import app


# Directory holding the sample fixtures uploaded by the tests in this module.
# It defaults to the original developer checkout; set the ZLD_POC_WORKDIR
# environment variable to point the tests at a copy of the fixtures elsewhere
# (another machine, CI). An unset or empty variable falls back to the default.
WORKDIR = Path(os.environ.get("ZLD_POC_WORKDIR") or "/Users/icemilk/Workspace/zld_POC")
# Sample ``.ai`` artwork file sent as the ``ai_file`` upload.
AI_FILE = WORKDIR / "【2026-04-09】端午 - 背标 - 天问.ai"
# Sample ``.docx`` document sent as the ``word_file`` upload.
DOCX_FILE = WORKDIR / "天问礼品粽【260331】.docx"


# Module-level test client wrapping the FastAPI ``app``; every test in this
# file issues its requests through this single instance.
client = TestClient(app)


def fake_mineru_payload() -> dict:
    """Build a minimal stand-in for the MinerU parse result used by the tests.

    Returns:
        A freshly constructed dict with a single ``pdf_info`` page (index 0,
        page size ``[2772, 1961]``) containing one paragraph block that holds
        one line with one text span.
    """
    text_span = {"content": "食品名称:天问礼品粽"}
    paragraph_block = {
        "bbox": [704, 134, 2106, 229],
        "lines": [{"spans": [text_span]}],
    }
    first_page = {
        "page_idx": 0,
        "page_size": [2772, 1961],
        "para_blocks": [paragraph_block],
    }
    return {"pdf_info": [first_page]}


def test_process_endpoint_returns_preview_and_fields(monkeypatch: pytest.MonkeyPatch) -> None:
    """POST both sample files to ``/api/process`` and check the response shape.

    ``pipeline._parse_preview_with_mineru`` is replaced with a stub returning
    ``fake_mineru_payload()``. The test then expects a 200 response with a PDF
    preview whose ``pageWidthPt`` is 2772 and a first field carrying the
    stubbed text.
    """

    def stub_parse(_preview_path, _output_dir):
        return fake_mineru_payload()

    monkeypatch.setattr(pipeline, "_parse_preview_with_mineru", stub_parse)

    docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    with AI_FILE.open("rb") as ai_stream, DOCX_FILE.open("rb") as docx_stream:
        uploads = {
            "ai_file": (AI_FILE.name, ai_stream, "application/postscript"),
            "word_file": (DOCX_FILE.name, docx_stream, docx_mime),
        }
        # The request must be sent while both file handles are still open.
        response = client.post("/api/process", files=uploads)

    assert response.status_code == 200

    body = response.json()
    preview = body["preview"]
    assert preview["type"] == "pdf"
    fields = body["fields"]
    assert fields
    assert preview["pageWidthPt"] == 2772
    assert fields[0]["text"] == "食品名称:天问礼品粽"


def test_process_endpoint_uses_default_sample_files_when_uploads_are_missing(monkeypatch: pytest.MonkeyPatch) -> None:
    """POST ``/api/process`` without any uploads and expect a usable response.

    The MinerU parse step is stubbed with ``fake_mineru_payload()``. The
    request carries no files, and the test expects a 200 response with a PDF
    preview and at least one field whose text is non-empty.
    """

    def stub_parse(_preview_path, _output_dir):
        return fake_mineru_payload()

    monkeypatch.setattr(pipeline, "_parse_preview_with_mineru", stub_parse)

    response = client.post("/api/process")

    assert response.status_code == 200

    body = response.json()
    assert body["preview"]["type"] == "pdf"
    fields = body["fields"]
    assert fields
    assert any(entry["text"] for entry in fields)


def test_process_endpoint_surfaces_missing_mineru_key(monkeypatch: pytest.MonkeyPatch) -> None:
    """Check that a ``RuntimeError`` from the MinerU parse step reaches the client.

    The parse step is stubbed to raise; the test expects ``/api/process`` to
    answer with status 500 and the error message in the ``detail`` field.
    """
    error_message = "MINERU_API_KEY is required"

    def raise_missing_key(_preview_path, _output_dir):
        raise RuntimeError(error_message)

    monkeypatch.setattr(pipeline, "_parse_preview_with_mineru", raise_missing_key)

    response = client.post("/api/process")

    assert response.status_code == 500
    detail = response.json()["detail"]
    assert detail == error_message


def test_mineru_extract_endpoint_returns_job_preview_and_blocks(monkeypatch: pytest.MonkeyPatch) -> None:
    """POST the sample ``.ai`` file to ``/api/mineru-extract`` and check the payload.

    ``pipeline.extract_mineru_result`` is replaced with a stub that echoes the
    ``job_id`` it receives into a canned result. The test expects a 200
    response with a truthy ``jobId``, a PDF preview, artifact URLs ending in
    the stubbed paths, and the stubbed block.
    """

    def stub_extract(_ai_path, _output_dir, job_id=None):
        preview = {
            "type": "pdf",
            "url": f"/api/files/{job_id}/preview.pdf",
            "pageWidthPt": 2772,
            "pageHeightPt": 1961,
        }
        artifacts = {
            "json": {"path": "/tmp/structured.json", "url": f"/api/files/{job_id}/mineru/structured.json"},
            "markdown": {"path": "/tmp/full.md", "url": f"/api/files/{job_id}/mineru/full.md"},
        }
        block = {
            "id": "block-1",
            "text": "食品名称:天问礼品粽",
            "page": 1,
            "x0_pt": 1,
            "top_pt": 2,
            "x1_pt": 3,
            "bottom_pt": 4,
        }
        return {
            "jobId": job_id,
            "preview": preview,
            "artifacts": artifacts,
            "blocks": [block],
        }

    monkeypatch.setattr(pipeline, "extract_mineru_result", stub_extract)

    with AI_FILE.open("rb") as ai_stream:
        uploads = {"ai_file": (AI_FILE.name, ai_stream, "application/postscript")}
        response = client.post("/api/mineru-extract", files=uploads)

    assert response.status_code == 200
    body = response.json()
    assert body["jobId"]
    assert body["preview"]["type"] == "pdf"
    artifacts = body["artifacts"]
    assert artifacts["json"]["url"].endswith("/mineru/structured.json")
    assert artifacts["markdown"]["url"].endswith("/mineru/full.md")
    assert body["blocks"][0]["id"] == "block-1"


def test_compare_word_endpoint_returns_compared_fields(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """POST the sample Word file to ``/api/compare-word`` for an existing job.

    ``main.OUTPUTS_DIR`` is redirected to ``tmp_path`` and a ``test-job``
    directory is created there before the request. ``pipeline.compare_word_with_mineru``
    is replaced with a stub returning one matched field. The test expects a 200
    response that echoes the job id and reports the field as ``matched``.
    """
    known_job = "test-job"

    monkeypatch.setattr(main, "OUTPUTS_DIR", tmp_path)
    job_dir = tmp_path / known_job
    job_dir.mkdir(parents=True, exist_ok=True)

    def stub_compare(_word_path, _output_dir, job_id=None):
        preview = {
            "type": "pdf",
            "url": f"/api/files/{job_id}/preview.pdf",
            "pageWidthPt": 2772,
            "pageHeightPt": 1961,
        }
        matched_field = {
            "id": "field-1",
            "text": "食品名称:天问礼品粽",
            "page": 1,
            "x0_pt": 1,
            "top_pt": 2,
            "x1_pt": 3,
            "bottom_pt": 4,
            "normalized_text": "食品名称:天问礼品粽",
            "validation_status": "matched",
            "validation_reason": "normalized text found in Word content",
            "matched_excerpt": "食品名称:天问礼品粽",
        }
        return {
            "jobId": job_id,
            "preview": preview,
            "fields": [matched_field],
        }

    monkeypatch.setattr(pipeline, "compare_word_with_mineru", stub_compare)

    docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    with DOCX_FILE.open("rb") as docx_stream:
        response = client.post(
            "/api/compare-word",
            data={"job_id": known_job},
            files={"word_file": (DOCX_FILE.name, docx_stream, docx_mime)},
        )

    assert response.status_code == 200
    body = response.json()
    assert body["jobId"] == known_job
    first_field = body["fields"][0]
    assert first_field["validation_status"] == "matched"