193 lines
7.6 KiB
Python
193 lines
7.6 KiB
Python
from __future__ import annotations
|
|
|
|
import io
|
|
import json
|
|
import zipfile
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
import requests
|
|
|
|
from backend.app import mineru_client
|
|
from backend.app.mineru_client import MineruClient, MineruClientError
|
|
|
|
|
|
class FakeResponse:
    """Canned response object returned by the patched ``urlopen`` in these tests.

    It exposes a ``status`` attribute, a ``read()`` method that returns the
    stored body, and the context-manager protocol so it can be used in a
    ``with`` statement.
    """

    def __init__(self, status: int, body: bytes):
        self.status = status
        self._body = body

    def read(self) -> bytes:
        """Return the stored response body."""
        return self._body

    def __enter__(self) -> "FakeResponse":
        return self

    def __exit__(self, *_args: object) -> None:
        # A falsy return value lets any exception raised inside the ``with``
        # body propagate to the caller.
        return None
|
|
|
|
|
|
class FakeRequestsResponse:
    """Canned return value for the patched ``requests.put`` in these tests.

    Carries only a ``status_code`` and a ``text`` body (empty by default).
    """

    def __init__(self, status_code: int, text: str = ""):
        self.status_code = status_code
        self.text = text
|
|
|
|
|
|
def _zip_with_json() -> bytes:
    """Return the bytes of an in-memory zip holding one ``demo_middle.json`` entry.

    The entry is a JSON document with a single-page ``pdf_info`` list whose
    ``page_size`` is ``[1, 1]``.
    """
    payload = {"pdf_info": [{"page_idx": 0, "page_size": [1, 1], "para_blocks": []}]}
    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, "w") as archive:
        archive.writestr("demo_middle.json", json.dumps(payload))
    # Read the buffer only after the archive is closed, so the zip's central
    # directory has been written.
    return buffer.getvalue()
|
|
|
|
|
|
def _zip_with_layout_and_model() -> bytes:
    """Return the bytes of an in-memory zip holding two JSON entries.

    ``demo_model.json`` is written first and contains a nested list; then
    ``layout.json`` is written with a single-page ``pdf_info`` list whose
    ``page_size`` is ``[2, 2]``.
    """
    model_payload = [[{"type": "header"}]]
    layout_payload = {"pdf_info": [{"page_idx": 0, "page_size": [2, 2], "para_blocks": []}]}
    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, "w") as archive:
        archive.writestr("demo_model.json", json.dumps(model_payload))
        archive.writestr("layout.json", json.dumps(layout_payload))
    # Read the buffer only after the archive is closed, so the zip's central
    # directory has been written.
    return buffer.getvalue()
|
|
|
|
|
|
def test_submit_pdf_downloads_and_loads_structured_json(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """Happy path: ``parse_pdf`` returns the JSON payload found in the result zip.

    ``urlopen`` and ``requests.put`` are patched with fakes. The test checks
    the returned payload, the exact sequence of URLs requested through
    ``urlopen``, and that ``mineru_result.zip`` exists in the output directory.
    """
    calls: list[str] = []

    def fake_urlopen(request_obj, timeout=0):
        # Accept either an object exposing ``full_url`` or a bare URL value.
        url = str(getattr(request_obj, "full_url", request_obj))
        calls.append(url)
        if url.endswith("/api/v4/file-urls/batch"):
            return FakeResponse(
                200,
                json.dumps(
                    {"code": 0, "data": {"batch_id": "batch-1", "file_urls": ["https://upload.example/file"]}}
                ).encode(),
            )
        if url == "https://upload.example/file":
            raise AssertionError("upload URL should be handled by requests.put")
        if url.endswith("/api/v4/extract/task"):
            return FakeResponse(
                200,
                json.dumps({"code": 0, "data": {"task_id": "task-1"}}).encode(),
            )
        if url.endswith("/api/v4/extract/task/task-1"):
            return FakeResponse(
                200,
                json.dumps({"code": 0, "data": {"state": "done", "full_zip_url": "https://download.example/result.zip"}}).encode(),
            )
        if url == "https://download.example/result.zip":
            return FakeResponse(200, _zip_with_json())
        raise AssertionError(f"unexpected URL {url}")

    monkeypatch.setattr(mineru_client.request, "urlopen", fake_urlopen)
    monkeypatch.setattr(mineru_client.requests, "put", lambda url, data, timeout=0: FakeRequestsResponse(200))
    pdf_path = tmp_path / "preview.pdf"
    pdf_path.write_bytes(b"%PDF-1.7")

    payload = MineruClient(api_key="secret", poll_interval_seconds=0, max_polls=1).parse_pdf(pdf_path, tmp_path)

    assert payload["pdf_info"][0]["page_size"] == [1, 1]
    assert calls == [
        "https://mineru.net/api/v4/file-urls/batch",
        "https://mineru.net/api/v4/extract/task",
        "https://mineru.net/api/v4/extract/task/task-1",
        "https://download.example/result.zip",
    ]
    assert (tmp_path / "mineru_result.zip").exists()
|
|
|
|
|
|
def test_submit_pdf_raises_on_failed_task(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """A task whose status response has state ``failed`` raises ``MineruClientError``.

    The faked status endpoint returns ``err_msg`` ``"bad pdf"``; the test
    expects that text to match the raised error.
    """

    def fake_urlopen(request_obj, timeout=0):
        # Accept either an object exposing ``full_url`` or a bare URL value.
        url = str(getattr(request_obj, "full_url", request_obj))
        if url.endswith("/api/v4/file-urls/batch"):
            return FakeResponse(
                200,
                json.dumps(
                    {"code": 0, "data": {"batch_id": "batch-1", "file_urls": ["https://upload.example/file"]}}
                ).encode(),
            )
        if url == "https://upload.example/file":
            raise AssertionError("upload URL should be handled by requests.put")
        if url.endswith("/api/v4/extract/task"):
            return FakeResponse(
                200,
                json.dumps({"code": 0, "data": {"task_id": "task-1"}}).encode(),
            )
        if url.endswith("/api/v4/extract/task/task-1"):
            return FakeResponse(
                200,
                json.dumps({"code": 0, "data": {"state": "failed", "err_msg": "bad pdf"}}).encode(),
            )
        raise AssertionError(f"unexpected URL {url}")

    monkeypatch.setattr(mineru_client.request, "urlopen", fake_urlopen)
    monkeypatch.setattr(mineru_client.requests, "put", lambda url, data, timeout=0: FakeRequestsResponse(200))
    pdf_path = tmp_path / "preview.pdf"
    pdf_path.write_bytes(b"%PDF-1.7")

    with pytest.raises(MineruClientError, match="bad pdf"):
        MineruClient(api_key="secret", poll_interval_seconds=0, max_polls=1).parse_pdf(pdf_path, tmp_path)
|
|
|
|
|
|
def test_submit_pdf_raises_on_upload_http_error(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """A non-success response from the upload ``PUT`` raises ``MineruClientError``.

    The patched ``requests.put`` returns status 403 with the text
    ``"SignatureDoesNotMatch"``; the test expects the raised error to match
    ``"HTTP 403"``. The fake ``urlopen`` answers only the batch endpoint, so
    any other ``urlopen`` request fails the test.
    """

    def fake_urlopen(request_obj, timeout=0):
        # Accept either an object exposing ``full_url`` or a bare URL value.
        url = str(getattr(request_obj, "full_url", request_obj))
        if url.endswith("/api/v4/file-urls/batch"):
            return FakeResponse(
                200,
                json.dumps(
                    {"code": 0, "data": {"batch_id": "batch-1", "file_urls": ["https://upload.example/file"]}}
                ).encode(),
            )
        raise AssertionError(f"unexpected URL {url}")

    monkeypatch.setattr(mineru_client.request, "urlopen", fake_urlopen)
    monkeypatch.setattr(
        mineru_client.requests,
        "put",
        lambda url, data, timeout=0: FakeRequestsResponse(403, "SignatureDoesNotMatch"),
    )
    pdf_path = tmp_path / "preview.pdf"
    pdf_path.write_bytes(b"%PDF-1.7")

    with pytest.raises(MineruClientError, match="HTTP 403"):
        MineruClient(api_key="secret", poll_interval_seconds=0, max_polls=1).parse_pdf(pdf_path, tmp_path)
|
|
|
|
|
|
def test_submit_pdf_prefers_layout_json_over_model_json(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """When the result zip holds both JSON files, the ``layout.json`` payload is returned.

    The faked download contains ``demo_model.json`` (a nested list) and
    ``layout.json`` (a ``pdf_info`` document with ``page_size`` ``[2, 2]``);
    the test checks that the returned payload carries that page size.
    """

    def fake_urlopen(request_obj, timeout=0):
        # Accept either an object exposing ``full_url`` or a bare URL value.
        url = str(getattr(request_obj, "full_url", request_obj))
        if url.endswith("/api/v4/file-urls/batch"):
            return FakeResponse(
                200,
                json.dumps(
                    {"code": 0, "data": {"batch_id": "batch-1", "file_urls": ["https://upload.example/file"]}}
                ).encode(),
            )
        if url == "https://upload.example/file":
            # Same guard as the sibling tests: a misrouted upload would
            # otherwise surface only as the generic "unexpected URL" failure.
            raise AssertionError("upload URL should be handled by requests.put")
        if url.endswith("/api/v4/extract/task"):
            return FakeResponse(
                200,
                json.dumps({"code": 0, "data": {"task_id": "task-1"}}).encode(),
            )
        if url.endswith("/api/v4/extract/task/task-1"):
            return FakeResponse(
                200,
                json.dumps({"code": 0, "data": {"state": "done", "full_zip_url": "https://download.example/result.zip"}}).encode(),
            )
        if url == "https://download.example/result.zip":
            return FakeResponse(200, _zip_with_layout_and_model())
        raise AssertionError(f"unexpected URL {url}")

    monkeypatch.setattr(mineru_client.request, "urlopen", fake_urlopen)
    monkeypatch.setattr(mineru_client.requests, "put", lambda url, data, timeout=0: FakeRequestsResponse(200))
    pdf_path = tmp_path / "preview.pdf"
    pdf_path.write_bytes(b"%PDF-1.7")

    payload = MineruClient(api_key="secret", poll_interval_seconds=0, max_polls=1).parse_pdf(pdf_path, tmp_path)

    assert payload["pdf_info"][0]["page_size"] == [2, 2]
|