from __future__ import annotations import io import json import zipfile from pathlib import Path import pytest import requests from backend.app import mineru_client from backend.app.mineru_client import MineruClient, MineruClientError class FakeResponse: def __init__(self, status: int, body: bytes): self.status = status self._body = body def read(self) -> bytes: return self._body def __enter__(self) -> "FakeResponse": return self def __exit__(self, *_args: object) -> None: return None class FakeRequestsResponse: def __init__(self, status_code: int, text: str = ""): self.status_code = status_code self.text = text def _zip_with_json() -> bytes: buffer = io.BytesIO() with zipfile.ZipFile(buffer, "w") as archive: archive.writestr( "demo_middle.json", json.dumps({"pdf_info": [{"page_idx": 0, "page_size": [1, 1], "para_blocks": []}]}), ) return buffer.getvalue() def _zip_with_layout_and_model() -> bytes: buffer = io.BytesIO() with zipfile.ZipFile(buffer, "w") as archive: archive.writestr("demo_model.json", json.dumps([[{"type": "header"}]])) archive.writestr( "layout.json", json.dumps({"pdf_info": [{"page_idx": 0, "page_size": [2, 2], "para_blocks": []}]}), ) return buffer.getvalue() def test_submit_pdf_downloads_and_loads_structured_json(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: calls: list[str] = [] def fake_urlopen(request_obj, timeout=0): url = request_obj.full_url if hasattr(request_obj, "full_url") else request_obj calls.append(str(url)) if str(url).endswith("/api/v4/file-urls/batch"): return FakeResponse( 200, json.dumps( {"code": 0, "data": {"batch_id": "batch-1", "file_urls": ["https://upload.example/file"]}} ).encode(), ) if str(url) == "https://upload.example/file": raise AssertionError("upload URL should be handled by requests.put") if str(url).endswith("/api/v4/extract/task"): return FakeResponse( 200, json.dumps({"code": 0, "data": {"task_id": "task-1"}}).encode(), ) if str(url).endswith("/api/v4/extract/task/task-1"): return FakeResponse( 200, json.dumps({"code": 0, "data": {"state": "done", "full_zip_url": "https://download.example/result.zip"}}).encode(), ) if str(url) == "https://download.example/result.zip": return FakeResponse(200, _zip_with_json()) raise AssertionError(f"unexpected URL {url}") monkeypatch.setattr(mineru_client.request, "urlopen", fake_urlopen) monkeypatch.setattr(mineru_client.requests, "put", lambda url, data, timeout=0: FakeRequestsResponse(200)) pdf_path = tmp_path / "preview.pdf" pdf_path.write_bytes(b"%PDF-1.7") payload = MineruClient(api_key="secret", poll_interval_seconds=0, max_polls=1).parse_pdf(pdf_path, tmp_path) assert payload["pdf_info"][0]["page_size"] == [1, 1] assert calls == [ "https://mineru.net/api/v4/file-urls/batch", "https://mineru.net/api/v4/extract/task", "https://mineru.net/api/v4/extract/task/task-1", "https://download.example/result.zip", ] assert (tmp_path / "mineru_result.zip").exists() def test_submit_pdf_raises_on_failed_task(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: def fake_urlopen(request_obj, timeout=0): url = request_obj.full_url if hasattr(request_obj, "full_url") else request_obj if str(url).endswith("/api/v4/file-urls/batch"): return FakeResponse( 200, json.dumps( {"code": 0, "data": {"batch_id": "batch-1", "file_urls": ["https://upload.example/file"]}} ).encode(), ) if str(url) == "https://upload.example/file": raise AssertionError("upload URL should be handled by requests.put") if str(url).endswith("/api/v4/extract/task"): return FakeResponse( 200, json.dumps({"code": 0, "data": {"task_id": "task-1"}}).encode(), ) if str(url).endswith("/api/v4/extract/task/task-1"): return FakeResponse( 200, json.dumps({"code": 0, "data": {"state": "failed", "err_msg": "bad pdf"}}).encode(), ) raise AssertionError(f"unexpected URL {url}") monkeypatch.setattr(mineru_client.request, "urlopen", fake_urlopen) monkeypatch.setattr(mineru_client.requests, "put", lambda url, data, timeout=0: FakeRequestsResponse(200)) pdf_path = tmp_path / "preview.pdf" pdf_path.write_bytes(b"%PDF-1.7") with pytest.raises(MineruClientError, match="bad pdf"): MineruClient(api_key="secret", poll_interval_seconds=0, max_polls=1).parse_pdf(pdf_path, tmp_path) def test_submit_pdf_raises_on_upload_http_error(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: def fake_urlopen(request_obj, timeout=0): url = request_obj.full_url if hasattr(request_obj, "full_url") else request_obj if str(url).endswith("/api/v4/file-urls/batch"): return FakeResponse( 200, json.dumps( {"code": 0, "data": {"batch_id": "batch-1", "file_urls": ["https://upload.example/file"]}} ).encode(), ) raise AssertionError(f"unexpected URL {url}") monkeypatch.setattr(mineru_client.request, "urlopen", fake_urlopen) monkeypatch.setattr( mineru_client.requests, "put", lambda url, data, timeout=0: FakeRequestsResponse(403, "SignatureDoesNotMatch"), ) pdf_path = tmp_path / "preview.pdf" pdf_path.write_bytes(b"%PDF-1.7") with pytest.raises(MineruClientError, match="HTTP 403"): MineruClient(api_key="secret", poll_interval_seconds=0, max_polls=1).parse_pdf(pdf_path, tmp_path) def test_submit_pdf_prefers_layout_json_over_model_json(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: def fake_urlopen(request_obj, timeout=0): url = request_obj.full_url if hasattr(request_obj, "full_url") else request_obj if str(url).endswith("/api/v4/file-urls/batch"): return FakeResponse( 200, json.dumps( {"code": 0, "data": {"batch_id": "batch-1", "file_urls": ["https://upload.example/file"]}} ).encode(), ) if str(url).endswith("/api/v4/extract/task"): return FakeResponse( 200, json.dumps({"code": 0, "data": {"task_id": "task-1"}}).encode(), ) if str(url).endswith("/api/v4/extract/task/task-1"): return FakeResponse( 200, json.dumps({"code": 0, "data": {"state": "done", "full_zip_url": "https://download.example/result.zip"}}).encode(), ) if str(url) == "https://download.example/result.zip": return FakeResponse(200, _zip_with_layout_and_model()) raise AssertionError(f"unexpected URL {url}") monkeypatch.setattr(mineru_client.request, "urlopen", fake_urlopen) monkeypatch.setattr(mineru_client.requests, "put", lambda url, data, timeout=0: FakeRequestsResponse(200)) pdf_path = tmp_path / "preview.pdf" pdf_path.write_bytes(b"%PDF-1.7") payload = MineruClient(api_key="secret", poll_interval_seconds=0, max_polls=1).parse_pdf(pdf_path, tmp_path) assert payload["pdf_info"][0]["page_size"] == [2, 2]