Initial commit: 包装审核 POC、Docker 与前后端
Made-with: Cursor
This commit is contained in:
192
tests/backend/test_mineru_client.py
Normal file
192
tests/backend/test_mineru_client.py
Normal file
@@ -0,0 +1,192 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import json
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
|
||||
from backend.app import mineru_client
|
||||
from backend.app.mineru_client import MineruClient, MineruClientError
|
||||
|
||||
|
||||
class FakeResponse:
    """Minimal stand-in for the object returned by the patched ``urlopen``.

    It offers only what the tests in this module hand back to the client:
    a ``status`` attribute, a ``read()`` method that yields a canned body,
    and context-manager support so it can be used in a ``with`` statement.
    """

    def __init__(self, status: int, body: bytes):
        self.status = status
        self._body = body

    def read(self) -> bytes:
        """Return the canned response body."""
        return self._body

    def __enter__(self) -> "FakeResponse":
        return self

    def __exit__(self, *_args: object) -> None:
        # Returning None (falsy) lets any exception raised inside the
        # ``with`` body propagate to the caller.
        return None
|
||||
|
||||
|
||||
class FakeRequestsResponse:
    """Minimal stand-in for the value returned by the patched ``requests.put``.

    Carries only a ``status_code`` and a ``text`` body; ``text`` defaults to
    the empty string.
    """

    def __init__(self, status_code: int, text: str = ""):
        self.status_code = status_code
        self.text = text
|
||||
|
||||
|
||||
def _zip_with_json() -> bytes:
    """Build an in-memory ZIP archive holding a single ``demo_middle.json``.

    The entry is a JSON document with one ``pdf_info`` page whose
    ``page_size`` is ``[1, 1]``.

    Returns:
        The raw bytes of the finished ZIP archive.
    """
    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, "w") as archive:
        archive.writestr(
            "demo_middle.json",
            json.dumps({"pdf_info": [{"page_idx": 0, "page_size": [1, 1], "para_blocks": []}]}),
        )
    # Read the buffer only after the ``with`` block has closed the archive;
    # closing is what writes the ZIP's trailing directory records.
    return buffer.getvalue()
|
||||
|
||||
|
||||
def _zip_with_layout_and_model() -> bytes:
    """Build an in-memory ZIP archive with both a model and a layout JSON.

    The archive contains, in this order:

    * ``demo_model.json`` -- a nested list ``[[{"type": "header"}]]``.
    * ``layout.json`` -- a document with one ``pdf_info`` page whose
      ``page_size`` is ``[2, 2]``.

    Returns:
        The raw bytes of the finished ZIP archive.
    """
    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, "w") as archive:
        archive.writestr("demo_model.json", json.dumps([[{"type": "header"}]]))
        archive.writestr(
            "layout.json",
            json.dumps({"pdf_info": [{"page_idx": 0, "page_size": [2, 2], "para_blocks": []}]}),
        )
    # Read the buffer only after the ``with`` block has closed the archive;
    # closing is what writes the ZIP's trailing directory records.
    return buffer.getvalue()
|
||||
|
||||
|
||||
def test_submit_pdf_downloads_and_loads_structured_json(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """Happy path: ``parse_pdf`` returns the JSON payload found in the result ZIP.

    ``urlopen`` and ``requests.put`` on the client module are replaced with
    fakes, so no network traffic happens. The test then checks three things:

    * the returned payload is the one stored in the fake ZIP,
    * ``urlopen`` was called for exactly the expected URLs, in order,
    * ``mineru_result.zip`` exists in the output directory afterwards.
    """
    # URLs seen by the fake ``urlopen``, in call order.
    calls: list[str] = []

    def fake_urlopen(request_obj, timeout=0):
        # Accept either a request object exposing ``full_url`` or a bare URL.
        url = request_obj.full_url if hasattr(request_obj, "full_url") else request_obj
        calls.append(str(url))
        if str(url).endswith("/api/v4/file-urls/batch"):
            return FakeResponse(
                200,
                json.dumps(
                    {"code": 0, "data": {"batch_id": "batch-1", "file_urls": ["https://upload.example/file"]}}
                ).encode(),
            )
        if str(url) == "https://upload.example/file":
            # The upload must go through the patched ``requests.put`` instead.
            raise AssertionError("upload URL should be handled by requests.put")
        if str(url).endswith("/api/v4/extract/task"):
            return FakeResponse(
                200,
                json.dumps({"code": 0, "data": {"task_id": "task-1"}}).encode(),
            )
        if str(url).endswith("/api/v4/extract/task/task-1"):
            return FakeResponse(
                200,
                json.dumps(
                    {"code": 0, "data": {"state": "done", "full_zip_url": "https://download.example/result.zip"}}
                ).encode(),
            )
        if str(url) == "https://download.example/result.zip":
            return FakeResponse(200, _zip_with_json())
        raise AssertionError(f"unexpected URL {url}")

    monkeypatch.setattr(mineru_client.request, "urlopen", fake_urlopen)
    monkeypatch.setattr(mineru_client.requests, "put", lambda url, data, timeout=0: FakeRequestsResponse(200))
    pdf_path = tmp_path / "preview.pdf"
    pdf_path.write_bytes(b"%PDF-1.7")

    payload = MineruClient(api_key="secret", poll_interval_seconds=0, max_polls=1).parse_pdf(pdf_path, tmp_path)

    assert payload["pdf_info"][0]["page_size"] == [1, 1]
    assert calls == [
        "https://mineru.net/api/v4/file-urls/batch",
        "https://mineru.net/api/v4/extract/task",
        "https://mineru.net/api/v4/extract/task/task-1",
        "https://download.example/result.zip",
    ]
    assert (tmp_path / "mineru_result.zip").exists()
|
||||
|
||||
|
||||
def test_submit_pdf_raises_on_failed_task(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """A task reported as ``failed`` must surface as ``MineruClientError``.

    The fake task-status endpoint answers with ``state: "failed"`` and
    ``err_msg: "bad pdf"``; the raised error is expected to match that message.
    """

    def fake_urlopen(request_obj, timeout=0):
        # Accept either a request object exposing ``full_url`` or a bare URL.
        url = request_obj.full_url if hasattr(request_obj, "full_url") else request_obj
        if str(url).endswith("/api/v4/file-urls/batch"):
            return FakeResponse(
                200,
                json.dumps(
                    {"code": 0, "data": {"batch_id": "batch-1", "file_urls": ["https://upload.example/file"]}}
                ).encode(),
            )
        if str(url) == "https://upload.example/file":
            # The upload must go through the patched ``requests.put`` instead.
            raise AssertionError("upload URL should be handled by requests.put")
        if str(url).endswith("/api/v4/extract/task"):
            return FakeResponse(
                200,
                json.dumps({"code": 0, "data": {"task_id": "task-1"}}).encode(),
            )
        if str(url).endswith("/api/v4/extract/task/task-1"):
            return FakeResponse(
                200,
                json.dumps({"code": 0, "data": {"state": "failed", "err_msg": "bad pdf"}}).encode(),
            )
        raise AssertionError(f"unexpected URL {url}")

    monkeypatch.setattr(mineru_client.request, "urlopen", fake_urlopen)
    monkeypatch.setattr(mineru_client.requests, "put", lambda url, data, timeout=0: FakeRequestsResponse(200))
    pdf_path = tmp_path / "preview.pdf"
    pdf_path.write_bytes(b"%PDF-1.7")

    with pytest.raises(MineruClientError, match="bad pdf"):
        MineruClient(api_key="secret", poll_interval_seconds=0, max_polls=1).parse_pdf(pdf_path, tmp_path)
|
||||
|
||||
|
||||
def test_submit_pdf_raises_on_upload_http_error(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """A non-success upload response must surface as ``MineruClientError``.

    The patched ``requests.put`` returns status 403 with the body
    ``SignatureDoesNotMatch``; the raised error is expected to match
    ``HTTP 403``. The fake ``urlopen`` answers only the batch-URL request and
    rejects every other URL, so any further ``urlopen`` call after the failed
    upload raises ``AssertionError`` from the fake.
    """

    def fake_urlopen(request_obj, timeout=0):
        # Accept either a request object exposing ``full_url`` or a bare URL.
        url = request_obj.full_url if hasattr(request_obj, "full_url") else request_obj
        if str(url).endswith("/api/v4/file-urls/batch"):
            return FakeResponse(
                200,
                json.dumps(
                    {"code": 0, "data": {"batch_id": "batch-1", "file_urls": ["https://upload.example/file"]}}
                ).encode(),
            )
        raise AssertionError(f"unexpected URL {url}")

    monkeypatch.setattr(mineru_client.request, "urlopen", fake_urlopen)
    monkeypatch.setattr(
        mineru_client.requests,
        "put",
        lambda url, data, timeout=0: FakeRequestsResponse(403, "SignatureDoesNotMatch"),
    )
    pdf_path = tmp_path / "preview.pdf"
    pdf_path.write_bytes(b"%PDF-1.7")

    with pytest.raises(MineruClientError, match="HTTP 403"):
        MineruClient(api_key="secret", poll_interval_seconds=0, max_polls=1).parse_pdf(pdf_path, tmp_path)
|
||||
|
||||
|
||||
def test_submit_pdf_prefers_layout_json_over_model_json(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
    """When the ZIP holds both JSON files, the ``layout.json`` payload is returned.

    The fake download serves an archive containing ``demo_model.json`` and
    ``layout.json``. Only ``layout.json`` carries a ``pdf_info`` page with
    ``page_size == [2, 2]``, which is what the final assertion checks.
    """

    def fake_urlopen(request_obj, timeout=0):
        # Accept either a request object exposing ``full_url`` or a bare URL.
        url = request_obj.full_url if hasattr(request_obj, "full_url") else request_obj
        if str(url).endswith("/api/v4/file-urls/batch"):
            return FakeResponse(
                200,
                json.dumps(
                    {"code": 0, "data": {"batch_id": "batch-1", "file_urls": ["https://upload.example/file"]}}
                ).encode(),
            )
        if str(url).endswith("/api/v4/extract/task"):
            return FakeResponse(
                200,
                json.dumps({"code": 0, "data": {"task_id": "task-1"}}).encode(),
            )
        if str(url).endswith("/api/v4/extract/task/task-1"):
            return FakeResponse(
                200,
                json.dumps(
                    {"code": 0, "data": {"state": "done", "full_zip_url": "https://download.example/result.zip"}}
                ).encode(),
            )
        if str(url) == "https://download.example/result.zip":
            return FakeResponse(200, _zip_with_layout_and_model())
        raise AssertionError(f"unexpected URL {url}")

    monkeypatch.setattr(mineru_client.request, "urlopen", fake_urlopen)
    monkeypatch.setattr(mineru_client.requests, "put", lambda url, data, timeout=0: FakeRequestsResponse(200))
    pdf_path = tmp_path / "preview.pdf"
    pdf_path.write_bytes(b"%PDF-1.7")

    payload = MineruClient(api_key="secret", poll_interval_seconds=0, max_polls=1).parse_pdf(pdf_path, tmp_path)

    assert payload["pdf_info"][0]["page_size"] == [2, 2]
|
||||
Reference in New Issue
Block a user