Files
ZLD_POC/tests/backend/test_mineru_client.py
2026-04-15 17:18:49 +08:00

193 lines
7.6 KiB
Python

from __future__ import annotations
import io
import json
import zipfile
from pathlib import Path
import pytest
import requests
from backend.app import mineru_client
from backend.app.mineru_client import MineruClient, MineruClientError
class FakeResponse:
    """Minimal stand-in for the object the tests return from the patched ``urlopen``.

    It carries a ``status`` attribute, serves a canned body through ``read()``
    and can be used as a context manager.
    """

    def __init__(self, status: int, body: bytes):
        self._body = body
        self.status = status

    def __enter__(self) -> "FakeResponse":
        return self

    def __exit__(self, *_args: object) -> None:
        # A falsy return value lets exceptions raised inside the ``with`` body propagate.
        return None

    def read(self) -> bytes:
        """Return the full canned body; every call yields the same bytes."""
        return self._body
class FakeRequestsResponse:
    """Canned result object returned by the patched ``requests.put`` in these tests.

    Only the two attributes set here are provided: ``status_code`` and ``text``.
    """

    def __init__(self, status_code: int, text: str = ""):
        self.text = text
        self.status_code = status_code
def _zip_with_json() -> bytes:
buffer = io.BytesIO()
with zipfile.ZipFile(buffer, "w") as archive:
archive.writestr(
"demo_middle.json",
json.dumps({"pdf_info": [{"page_idx": 0, "page_size": [1, 1], "para_blocks": []}]}),
)
return buffer.getvalue()
def _zip_with_layout_and_model() -> bytes:
buffer = io.BytesIO()
with zipfile.ZipFile(buffer, "w") as archive:
archive.writestr("demo_model.json", json.dumps([[{"type": "header"}]]))
archive.writestr(
"layout.json",
json.dumps({"pdf_info": [{"page_idx": 0, "page_size": [2, 2], "para_blocks": []}]}),
)
return buffer.getvalue()
def test_submit_pdf_downloads_and_loads_structured_json(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """Happy path: ``parse_pdf`` uploads, polls, downloads the zip and returns its JSON.

    ``urlopen`` and ``requests.put`` on ``mineru_client`` are replaced with fakes.
    The test checks the returned payload, the exact sequence of ``urlopen`` URLs,
    that the upload went through ``requests.put``, and that ``mineru_result.zip``
    was written into ``tmp_path``.
    """
    calls: list[str] = []
    uploads: list[str] = []

    def fake_urlopen(request_obj, timeout=0):
        # Accept either a request object exposing ``full_url`` or a plain URL.
        url = str(request_obj.full_url if hasattr(request_obj, "full_url") else request_obj)
        calls.append(url)
        if url.endswith("/api/v4/file-urls/batch"):
            return FakeResponse(
                200,
                json.dumps(
                    {"code": 0, "data": {"batch_id": "batch-1", "file_urls": ["https://upload.example/file"]}}
                ).encode(),
            )
        if url == "https://upload.example/file":
            raise AssertionError("upload URL should be handled by requests.put")
        if url.endswith("/api/v4/extract/task"):
            return FakeResponse(
                200,
                json.dumps({"code": 0, "data": {"task_id": "task-1"}}).encode(),
            )
        if url.endswith("/api/v4/extract/task/task-1"):
            return FakeResponse(
                200,
                json.dumps(
                    {"code": 0, "data": {"state": "done", "full_zip_url": "https://download.example/result.zip"}}
                ).encode(),
            )
        if url == "https://download.example/result.zip":
            return FakeResponse(200, _zip_with_json())
        raise AssertionError(f"unexpected URL {url}")

    def fake_put(url, data, timeout=0):
        # Record the target so the test can prove the upload really used requests.put.
        uploads.append(str(url))
        return FakeRequestsResponse(200)

    monkeypatch.setattr(mineru_client.request, "urlopen", fake_urlopen)
    monkeypatch.setattr(mineru_client.requests, "put", fake_put)

    pdf_path = tmp_path / "preview.pdf"
    pdf_path.write_bytes(b"%PDF-1.7")

    payload = MineruClient(api_key="secret", poll_interval_seconds=0, max_polls=1).parse_pdf(pdf_path, tmp_path)

    assert payload["pdf_info"][0]["page_size"] == [1, 1]
    assert calls == [
        "https://mineru.net/api/v4/file-urls/batch",
        "https://mineru.net/api/v4/extract/task",
        "https://mineru.net/api/v4/extract/task/task-1",
        "https://download.example/result.zip",
    ]
    # The upload URL must never reach urlopen; it has to arrive via requests.put.
    assert uploads == ["https://upload.example/file"]
    assert (tmp_path / "mineru_result.zip").exists()
def test_submit_pdf_raises_on_failed_task(
    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
) -> None:
    """A task reported with state ``failed`` makes ``parse_pdf`` raise ``MineruClientError``.

    The raised error must match the ``err_msg`` ("bad pdf") from the fake status reply.
    """
    # URL suffix -> JSON reply served by the fake ``urlopen``.
    canned = {
        "/api/v4/file-urls/batch": {
            "code": 0,
            "data": {"batch_id": "batch-1", "file_urls": ["https://upload.example/file"]},
        },
        "/api/v4/extract/task": {"code": 0, "data": {"task_id": "task-1"}},
        "/api/v4/extract/task/task-1": {"code": 0, "data": {"state": "failed", "err_msg": "bad pdf"}},
    }

    def fake_urlopen(request_obj, timeout=0):
        target = str(getattr(request_obj, "full_url", request_obj))
        if target == "https://upload.example/file":
            raise AssertionError("upload URL should be handled by requests.put")
        for suffix, reply in canned.items():
            if target.endswith(suffix):
                return FakeResponse(200, json.dumps(reply).encode())
        raise AssertionError(f"unexpected URL {target}")

    def fake_put(url, data, timeout=0):
        return FakeRequestsResponse(200)

    monkeypatch.setattr(mineru_client.request, "urlopen", fake_urlopen)
    monkeypatch.setattr(mineru_client.requests, "put", fake_put)

    pdf_path = tmp_path / "preview.pdf"
    pdf_path.write_bytes(b"%PDF-1.7")

    client = MineruClient(api_key="secret", poll_interval_seconds=0, max_polls=1)
    with pytest.raises(MineruClientError, match="bad pdf"):
        client.parse_pdf(pdf_path, tmp_path)
def test_submit_pdf_raises_on_upload_http_error(
    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
) -> None:
    """A 403 from the patched ``requests.put`` makes ``parse_pdf`` raise ``MineruClientError``.

    The raised error must match "HTTP 403". The fake ``urlopen`` serves only the
    batch endpoint; any other URL is an assertion failure.
    """
    batch_reply = {"code": 0, "data": {"batch_id": "batch-1", "file_urls": ["https://upload.example/file"]}}

    def fake_urlopen(request_obj, timeout=0):
        target = str(getattr(request_obj, "full_url", request_obj))
        if not target.endswith("/api/v4/file-urls/batch"):
            raise AssertionError(f"unexpected URL {target}")
        return FakeResponse(200, json.dumps(batch_reply).encode())

    def fake_put(url, data, timeout=0):
        return FakeRequestsResponse(403, "SignatureDoesNotMatch")

    monkeypatch.setattr(mineru_client.request, "urlopen", fake_urlopen)
    monkeypatch.setattr(mineru_client.requests, "put", fake_put)

    pdf_path = tmp_path / "preview.pdf"
    pdf_path.write_bytes(b"%PDF-1.7")

    client = MineruClient(api_key="secret", poll_interval_seconds=0, max_polls=1)
    with pytest.raises(MineruClientError, match="HTTP 403"):
        client.parse_pdf(pdf_path, tmp_path)
def test_submit_pdf_prefers_layout_json_over_model_json(
    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
) -> None:
    """With both ``demo_model.json`` and ``layout.json`` in the zip, the layout file is loaded.

    The returned payload must carry the ``[2, 2]`` page size that only
    ``layout.json`` contains.
    """
    # URL suffix -> JSON reply served by the fake ``urlopen``.
    canned = {
        "/api/v4/file-urls/batch": {
            "code": 0,
            "data": {"batch_id": "batch-1", "file_urls": ["https://upload.example/file"]},
        },
        "/api/v4/extract/task": {"code": 0, "data": {"task_id": "task-1"}},
        "/api/v4/extract/task/task-1": {
            "code": 0,
            "data": {"state": "done", "full_zip_url": "https://download.example/result.zip"},
        },
    }

    def fake_urlopen(request_obj, timeout=0):
        target = str(getattr(request_obj, "full_url", request_obj))
        if target == "https://download.example/result.zip":
            return FakeResponse(200, _zip_with_layout_and_model())
        for suffix, reply in canned.items():
            if target.endswith(suffix):
                return FakeResponse(200, json.dumps(reply).encode())
        raise AssertionError(f"unexpected URL {target}")

    def fake_put(url, data, timeout=0):
        return FakeRequestsResponse(200)

    monkeypatch.setattr(mineru_client.request, "urlopen", fake_urlopen)
    monkeypatch.setattr(mineru_client.requests, "put", fake_put)

    pdf_path = tmp_path / "preview.pdf"
    pdf_path.write_bytes(b"%PDF-1.7")

    client = MineruClient(api_key="secret", poll_interval_seconds=0, max_polls=1)
    payload = client.parse_pdf(pdf_path, tmp_path)

    assert payload["pdf_info"][0]["page_size"] == [2, 2]