Files
ZLD_POC/backend/app/mineru_client.py
2026-04-15 17:18:49 +08:00

249 lines
9.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from __future__ import annotations
import json
import logging
import time
import zipfile
from pathlib import Path
import requests
# Module-level logger, named after the importing module.
logger = logging.getLogger(__name__)

# Root URL that every MinerU v4 endpoint path in this module is appended to.
MINERU_BASE = "https://mineru.net/api/v4"

# Batch states that end processing (successfully or not).
TERMINAL_STATES = {"done", "failed"}

# Human-readable (Chinese) labels for the non-terminal batch states; these
# are only used to make log lines easier to read.
IN_PROGRESS_STATE_LABELS = {
    "waiting-file": "等待文件上传",
    "pending": "排队中",
    "running": "解析中",
    "converting": "格式转换中",
}
class MineruClientError(RuntimeError):
    """Raised when a MinerU API request fails or returns an unusable result."""
class MineruClient:
    """MinerU precise-parsing API client (requires an API token).

    End-to-end flow for a local image file:

    1. ``POST /file-urls/batch`` -> obtain a ``batch_id`` and a signed OSS
       upload URL.
    2. ``PUT`` the image to OSS -> the service detects the upload and
       submits the parsing task automatically.
    3. ``GET /extract-results/batch/{batch_id}`` -> poll until
       ``state == "done"``.
    4. Download ``full_zip_url``, unzip it and load the structured JSON.

    File limits: <= 200 MB, <= 600 pages.
    Supported formats: PDF, images (png/jpg/jpeg/jp2/webp/gif/bmp), Doc,
    Docx, Ppt, PPTx.
    """

    def __init__(
        self,
        api_key: str,
        model_version: str = "vlm",
        language: str = "ch",
        enable_table: bool = True,
        is_ocr: bool = True,
        enable_formula: bool = True,
        poll_interval: float = 3.0,
        timeout: float = 300.0,
    ) -> None:
        """Store the credentials and parsing options.

        Parameters
        ----------
        api_key:
            Token sent as ``Authorization: Bearer <api_key>``.
        model_version, language, enable_table, is_ocr, enable_formula:
            Options forwarded unchanged in the upload-URL request payload.
        poll_interval:
            Seconds to sleep between two batch status queries.
        timeout:
            Overall polling budget in seconds before giving up.
        """
        self.api_key = api_key
        self.model_version = model_version
        self.language = language
        self.enable_table = enable_table
        self.is_ocr = is_ocr
        self.enable_formula = enable_formula
        self.poll_interval = poll_interval
        self.timeout = timeout

    def parse_image(self, image_path: Path, output_dir: Path) -> dict:
        """Parse a local image file and return the structured JSON data.

        Parameters
        ----------
        image_path:
            Path of the local image (png/jpg/jpeg/jp2/webp/gif/bmp).
        output_dir:
            Directory for intermediate artifacts: the downloaded zip and
            the ``result`` sub-directory it is extracted into. Created if
            missing.

        Returns
        -------
        dict
            The first JSON document in the result archive that is a dict
            with a list-valued ``pdf_info`` key.

        Raises
        ------
        FileNotFoundError
            If ``image_path`` does not exist.
        MineruClientError
            If any API step fails, polling times out, or no structured
            JSON is found in the result archive.
        """
        image_path = Path(image_path)
        if not image_path.exists():
            raise FileNotFoundError(f"图片文件不存在: {image_path}")

        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        logger.info("MinerU 精准解析开始: %s", image_path.name)
        batch_id, upload_url = self._request_upload_url(image_path.name)
        logger.info("MinerU 批次已创建: batch_id=%s", batch_id)

        self._upload_file(upload_url, image_path)
        logger.info("MinerU 文件上传完成: %s(系统自动提交解析)", image_path.name)

        zip_url = self._poll_batch_until_done(batch_id)
        logger.info("MinerU 解析完成: batch_id=%s", batch_id)

        zip_path = self._download_zip(zip_url, output_dir)
        extract_dir = output_dir / "result"
        self._extract_zip(zip_path, extract_dir)

        result = self._load_structured_json(extract_dir)
        logger.info("MinerU 结构化 JSON 加载完毕")
        return result

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    def _auth_headers(self) -> dict[str, str]:
        """Return the bearer-token and JSON content-type request headers."""
        return {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }

    def _request_upload_url(self, file_name: str) -> tuple[str, str]:
        """Request a batch upload link and return ``(batch_id, upload_url)``.

        Raises
        ------
        MineruClientError
            On a network/HTTP error, a non-JSON body, a non-zero ``code``
            in the response, or a missing ``batch_id`` / ``file_urls``.
        """
        payload = {
            "files": [{"name": file_name, "is_ocr": self.is_ocr}],
            "model_version": self.model_version,
            "language": self.language,
            "enable_table": self.enable_table,
            "enable_formula": self.enable_formula,
        }
        try:
            resp = requests.post(
                f"{MINERU_BASE}/file-urls/batch",
                headers=self._auth_headers(),
                json=payload,
                timeout=30,
            )
            resp.raise_for_status()
            # Decode inside the try so a non-JSON body is reported as a
            # client error instead of escaping as a raw decode exception.
            body = resp.json()
        except (requests.RequestException, ValueError) as exc:
            raise MineruClientError(f"MinerU 申请上传 URL 失败: {exc}") from exc

        if body.get("code") != 0:
            raise MineruClientError(f"MinerU 申请上传 URL 失败: {body.get('msg')}")

        # `or {}` / `or []` also cover an explicit JSON null, which
        # dict.get's default argument would not replace.
        data = body.get("data") or {}
        batch_id = data.get("batch_id")
        file_urls = data.get("file_urls") or []
        if not batch_id or not file_urls:
            raise MineruClientError("MinerU 返回的 batch_id 或 file_urls 为空")
        return batch_id, file_urls[0]

    def _upload_file(self, upload_url: str, image_path: Path) -> None:
        """PUT the image to the signed OSS URL; no Content-Type is sent.

        Raises
        ------
        MineruClientError
            On a network error or any HTTP status other than 200/201.
        """
        try:
            with image_path.open("rb") as f:
                resp = requests.put(upload_url, data=f, timeout=120)
        except requests.RequestException as exc:
            raise MineruClientError(f"MinerU 文件上传网络错误: {exc}") from exc

        if resp.status_code not in (200, 201):
            raise MineruClientError(
                f"MinerU 文件上传失败: HTTP {resp.status_code} {resp.text[:200]}"
            )

    def _poll_batch_until_done(self, batch_id: str) -> str:
        """Poll the batch result until it finishes; return ``full_zip_url``.

        Only the first entry of ``extract_result`` is inspected, since a
        single file is uploaded per batch.

        Raises
        ------
        MineruClientError
            On a network/HTTP error, a non-JSON body, a non-zero ``code``,
            a ``failed`` state, a ``done`` state without ``full_zip_url``,
            or when ``self.timeout`` seconds elapse first.
        """
        url = f"{MINERU_BASE}/extract-results/batch/{batch_id}"
        deadline = time.monotonic() + self.timeout

        while time.monotonic() < deadline:
            try:
                resp = requests.get(url, headers=self._auth_headers(), timeout=30)
                resp.raise_for_status()
                body = resp.json()
            except (requests.RequestException, ValueError) as exc:
                raise MineruClientError(f"MinerU 查询批次状态失败: {exc}") from exc

            if body.get("code") != 0:
                raise MineruClientError(f"MinerU 查询批次失败: {body.get('msg')}")

            # Tolerate "data": null and "extract_result": null.
            data = body.get("data") or {}
            results: list[dict] = data.get("extract_result") or []

            if results:
                item = results[0]
                state = item.get("state", "")
                label = IN_PROGRESS_STATE_LABELS.get(state, state)
                logger.info(
                    "MinerU 批次状态: batch_id=%s state=%s (%s)", batch_id, state, label
                )

                if state == "done":
                    zip_url = item.get("full_zip_url")
                    if not zip_url:
                        raise MineruClientError("MinerU 完成但未返回 full_zip_url")
                    return zip_url

                if state == "failed":
                    err_msg = item.get("err_msg") or "未知错误"
                    raise MineruClientError(f"MinerU 解析失败: {err_msg}")

            # No result yet, or still in progress: wait before asking again.
            time.sleep(self.poll_interval)

        raise MineruClientError(
            f"MinerU 轮询超时 ({self.timeout:.0f}s): batch_id={batch_id}"
        )

    def _download_zip(self, zip_url: str, output_dir: Path) -> Path:
        """Download the result archive to ``output_dir/mineru_result.zip``.

        Raises
        ------
        MineruClientError
            On any ``requests`` error while connecting or streaming.
        """
        target = output_dir / "mineru_result.zip"
        try:
            # A streamed response holds its connection until closed; the
            # context manager releases it even if writing fails midway.
            with requests.get(zip_url, timeout=120, stream=True) as resp:
                resp.raise_for_status()
                with target.open("wb") as f:
                    for chunk in resp.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
        except requests.RequestException as exc:
            raise MineruClientError(f"MinerU zip 下载失败: {exc}") from exc

        logger.info("MinerU zip 下载完毕: %s", target)
        return target

    def _extract_zip(self, zip_path: Path, extract_dir: Path) -> None:
        """Extract every member of ``zip_path`` into ``extract_dir``."""
        extract_dir.mkdir(parents=True, exist_ok=True)
        with zipfile.ZipFile(zip_path) as archive:
            archive.extractall(extract_dir)
        logger.info("MinerU zip 解压完毕: %s", extract_dir)

    def _load_structured_json(self, extract_dir: Path) -> dict:
        """Find and load the structured JSON that contains ``pdf_info``.

        Layout of the MinerU result zip:

        - ``layout.json``          intermediate result (i.e. middle.json)
        - ``*_content_list.json``  content list
        - ``*_model.json``         model inference result
        - ``full.md``              Markdown output

        Files are tried in that priority order, then any other ``*.json``.
        Unreadable, non-UTF-8 or malformed files are skipped.

        Raises
        ------
        MineruClientError
            If no file holds a dict with a list-valued ``pdf_info`` key.
        """
        patterns = (
            "layout.json",
            "*layout*.json",
            "*_content_list*.json",
            "*.json",
        )
        # A dict keeps first-seen order while dropping paths matched by
        # more than one pattern.
        ordered: dict[Path, None] = {}
        for pattern in patterns:
            for path in sorted(extract_dir.rglob(pattern)):
                ordered.setdefault(path, None)

        for candidate in ordered:
            try:
                parsed = json.loads(candidate.read_text(encoding="utf-8"))
            except (ValueError, OSError):
                # ValueError covers both json.JSONDecodeError and the
                # UnicodeDecodeError raised for non-UTF-8 files.
                continue
            if isinstance(parsed, dict) and isinstance(parsed.get("pdf_info"), list):
                logger.info("MinerU 结构化 JSON 选用: %s", candidate.name)
                return parsed

        raise MineruClientError(
            "MinerU 结果 zip 中未找到包含 pdf_info 的结构化 JSON"
        )