249 lines
9.1 KiB
Python
249 lines
9.1 KiB
Python
from __future__ import annotations
|
||
|
||
import json
|
||
import logging
|
||
import time
|
||
import zipfile
|
||
from pathlib import Path
|
||
|
||
import requests
|
||
|
||
# Module-level logger; every progress/log line in this file goes through it.
logger: logging.Logger = logging.getLogger(__name__)

# Root of the MinerU v4 REST API; endpoint paths are appended to it.
MINERU_BASE: str = "https://mineru.net/api/v4"

# Batch states after which no further progress is expected.
TERMINAL_STATES: set[str] = {"done", "failed"}

# Display labels (Chinese) for the non-terminal batch states, keyed by the
# raw state string: waiting for upload / queued / parsing / converting format.
IN_PROGRESS_STATE_LABELS: dict[str, str] = {
    "waiting-file": "等待文件上传",
    "pending": "排队中",
    "running": "解析中",
    "converting": "格式转换中",
}
|
||
|
||
|
||
class MineruClientError(RuntimeError):
    """Raised for any failure while talking to MinerU or reading its results."""
|
||
|
||
|
||
class MineruClient:
    """Client for the MinerU precise-extraction API (an API token is required).

    End-to-end flow for a local image file:
      1. POST /file-urls/batch -> obtain a batch_id and a signed OSS upload URL
      2. PUT the image to OSS -> the service notices the upload and submits
         the parse task on its own (no explicit "submit" call is made here)
      3. GET /extract-results/batch/{batch_id}, polled until state == "done"
      4. Download full_zip_url, unpack it and load the structured JSON

    File limits: <= 200 MB, <= 600 pages.
    Supported formats: PDF, images (png/jpg/jpeg/jp2/webp/gif/bmp), Doc,
    Docx, Ppt, PPTx.
    """

    def __init__(
        self,
        api_key: str,
        model_version: str = "vlm",
        language: str = "ch",
        enable_table: bool = True,
        is_ocr: bool = True,
        enable_formula: bool = True,
        poll_interval: float = 3.0,
        timeout: float = 300.0,
    ) -> None:
        """Store the credentials and request options.

        Parameters
        ----------
        api_key:
            Token sent as ``Authorization: Bearer <api_key>``.
        model_version, language, enable_table, enable_formula:
            Forwarded verbatim in the body of the upload-URL request.
        is_ocr:
            Forwarded as the per-file ``is_ocr`` flag of that request.
        poll_interval:
            Seconds to sleep between two status polls.
        timeout:
            Total seconds the polling loop may run before giving up.
        """
        self.api_key = api_key
        self.model_version = model_version
        self.language = language
        self.enable_table = enable_table
        self.is_ocr = is_ocr
        self.enable_formula = enable_formula
        self.poll_interval = poll_interval
        self.timeout = timeout

    def parse_image(self, image_path: Path, output_dir: Path) -> dict:
        """Parse a local image file and return the structured JSON data.

        Parameters
        ----------
        image_path:
            Path of the local image (png/jpg/jpeg/jp2/webp/gif/bmp).
        output_dir:
            Directory for intermediate artifacts (the zip and the unpacked
            tree); created if missing.

        Returns
        -------
        dict
            The structured JSON that carries ``pdf_info`` (layout.json or a
            content-list JSON).

        Raises
        ------
        FileNotFoundError
            If ``image_path`` does not exist.
        MineruClientError
            If any API step fails or no usable JSON is found.
        """
        image_path = Path(image_path)
        if not image_path.exists():
            raise FileNotFoundError(f"图片文件不存在: {image_path}")

        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        logger.info("MinerU 精准解析开始: %s", image_path.name)
        batch_id, upload_url = self._request_upload_url(image_path.name)
        logger.info("MinerU 批次已创建: batch_id=%s", batch_id)

        self._upload_file(upload_url, image_path)
        logger.info("MinerU 文件上传完成: %s(系统自动提交解析)", image_path.name)

        zip_url = self._poll_batch_until_done(batch_id)
        logger.info("MinerU 解析完成: batch_id=%s", batch_id)

        zip_path = self._download_zip(zip_url, output_dir)
        extract_dir = output_dir / "result"
        self._extract_zip(zip_path, extract_dir)

        result = self._load_structured_json(extract_dir)
        logger.info("MinerU 结构化 JSON 加载完毕")
        return result

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _auth_headers(self) -> dict[str, str]:
        """Return the bearer-token and JSON content-type headers for API calls."""
        return {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }

    @staticmethod
    def _unwrap_data(resp, failure_label: str) -> dict:
        """Decode an API response envelope and return its ``data`` object.

        ``resp`` is a ``requests`` response (anything with a ``json()``
        method works). ``failure_label`` is the message prefix used for
        every error raised here.

        Raises ``MineruClientError`` when the body is not valid JSON, is not
        a JSON object, or carries a non-zero ``code``. A missing or null
        ``data`` field yields an empty dict, so callers can use ``.get``
        without a null check.
        """
        try:
            body = resp.json()
        except ValueError as exc:
            # requests' JSON decode error derives from ValueError; without
            # this a non-JSON reply (e.g. an HTML gateway page) would escape
            # as a raw decode exception.
            raise MineruClientError(f"{failure_label}: {exc}") from exc

        if not isinstance(body, dict):
            raise MineruClientError(f"{failure_label}: 响应不是 JSON 对象")
        if body.get("code") != 0:
            raise MineruClientError(f"{failure_label}: {body.get('msg')}")

        data = body.get("data")
        # `"data": null` would otherwise surface as AttributeError downstream.
        return data if isinstance(data, dict) else {}

    def _request_upload_url(self, file_name: str) -> tuple[str, str]:
        """Request a batch upload link; return ``(batch_id, oss_upload_url)``.

        Raises ``MineruClientError`` on transport/HTTP errors, on an API-level
        error code, or when the reply lacks a batch id or upload URL.
        """
        payload = {
            "files": [{"name": file_name, "is_ocr": self.is_ocr}],
            "model_version": self.model_version,
            "language": self.language,
            "enable_table": self.enable_table,
            "enable_formula": self.enable_formula,
        }
        try:
            resp = requests.post(
                f"{MINERU_BASE}/file-urls/batch",
                headers=self._auth_headers(),
                json=payload,
                timeout=30,
            )
            resp.raise_for_status()
        except requests.RequestException as exc:
            raise MineruClientError(f"MinerU 申请上传 URL 失败: {exc}") from exc

        data = self._unwrap_data(resp, "MinerU 申请上传 URL 失败")
        batch_id = data.get("batch_id")
        file_urls = data.get("file_urls") or []
        if not batch_id or not file_urls:
            raise MineruClientError("MinerU 返回的 batch_id 或 file_urls 为空")

        # Exactly one file was submitted, so the first URL is the one to use.
        return batch_id, file_urls[0]

    def _upload_file(self, upload_url: str, image_path: Path) -> None:
        """PUT the image to OSS; no Content-Type header is set for the upload.

        Raises ``MineruClientError`` on a network error or when the status
        code is anything other than 200/201.
        """
        try:
            with image_path.open("rb") as f:
                resp = requests.put(upload_url, data=f, timeout=120)
        except requests.RequestException as exc:
            raise MineruClientError(f"MinerU 文件上传网络错误: {exc}") from exc

        if resp.status_code not in (200, 201):
            raise MineruClientError(
                f"MinerU 文件上传失败: HTTP {resp.status_code} {resp.text[:200]}"
            )

    def _poll_batch_until_done(self, batch_id: str) -> str:
        """Poll the batch result until it finishes; return ``full_zip_url``.

        Raises ``MineruClientError`` when a poll request fails, the batch
        reports ``failed``, ``done`` arrives without a zip URL, or
        ``self.timeout`` seconds elapse first.
        """
        url = f"{MINERU_BASE}/extract-results/batch/{batch_id}"
        deadline = time.monotonic() + self.timeout

        while time.monotonic() < deadline:
            try:
                resp = requests.get(url, headers=self._auth_headers(), timeout=30)
                resp.raise_for_status()
            except requests.RequestException as exc:
                raise MineruClientError(f"MinerU 查询批次状态失败: {exc}") from exc

            data = self._unwrap_data(resp, "MinerU 查询批次失败")
            results: list[dict] = data.get("extract_result") or []
            if not results:
                # No per-file entry yet; wait and ask again.
                time.sleep(self.poll_interval)
                continue

            # Only one file is uploaded per batch, so inspect the first entry.
            item = results[0]
            state = item.get("state", "")
            label = IN_PROGRESS_STATE_LABELS.get(state, state)
            logger.info("MinerU 批次状态: batch_id=%s state=%s (%s)", batch_id, state, label)

            if state == "done":
                zip_url = item.get("full_zip_url")
                if not zip_url:
                    raise MineruClientError("MinerU 完成但未返回 full_zip_url")
                return zip_url

            if state == "failed":
                err_msg = item.get("err_msg") or "未知错误"
                raise MineruClientError(f"MinerU 解析失败: {err_msg}")

            time.sleep(self.poll_interval)

        raise MineruClientError(
            f"MinerU 轮询超时 ({self.timeout:.0f}s): batch_id={batch_id}"
        )

    def _download_zip(self, zip_url: str, output_dir: Path) -> Path:
        """Stream the result archive to ``output_dir/mineru_result.zip``.

        Returns the path of the written file. Raises ``MineruClientError``
        on any ``requests`` failure.
        """
        target = output_dir / "mineru_result.zip"
        try:
            # The context manager closes the streamed response, releasing
            # the connection even if reading or writing fails midway.
            with requests.get(zip_url, timeout=120, stream=True) as resp:
                resp.raise_for_status()
                with target.open("wb") as f:
                    for chunk in resp.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
        except requests.RequestException as exc:
            raise MineruClientError(f"MinerU zip 下载失败: {exc}") from exc
        logger.info("MinerU zip 下载完毕: %s", target)
        return target

    def _extract_zip(self, zip_path: Path, extract_dir: Path) -> None:
        """Unpack ``zip_path`` into ``extract_dir`` (created if missing)."""
        extract_dir.mkdir(parents=True, exist_ok=True)
        with zipfile.ZipFile(zip_path) as archive:
            archive.extractall(extract_dir)
        logger.info("MinerU zip 解压完毕: %s", extract_dir)

    def _load_structured_json(self, extract_dir: Path) -> dict:
        """Find and load the structured JSON that contains ``pdf_info``.

        Layout of a MinerU result zip:
          layout.json         -> intermediate result (corresponds to middle.json)
          *_content_list.json -> content list
          *_model.json        -> model inference output
          full.md             -> Markdown rendering

        Files are tried from the most to the least specific name pattern,
        each group in sorted path order. The first JSON object whose
        ``pdf_info`` is a list is returned. Unreadable, non-UTF-8 or
        malformed files are skipped.

        Raises ``MineruClientError`` if no file qualifies.
        """
        patterns = ("layout.json", "*layout*.json", "*_content_list*.json", "*.json")
        seen: set[Path] = set()
        for pattern in patterns:
            for candidate in sorted(extract_dir.rglob(pattern)):
                if candidate in seen:
                    # Already tried under a more specific pattern.
                    continue
                seen.add(candidate)
                try:
                    parsed = json.loads(candidate.read_text(encoding="utf-8"))
                except (ValueError, OSError):
                    # ValueError covers both JSONDecodeError and
                    # UnicodeDecodeError, so one badly encoded file cannot
                    # abort the whole scan.
                    continue
                if isinstance(parsed, dict) and isinstance(parsed.get("pdf_info"), list):
                    logger.info("MinerU 结构化 JSON 选用: %s", candidate.name)
                    return parsed

        raise MineruClientError(
            "MinerU 结果 zip 中未找到包含 pdf_info 的结构化 JSON"
        )
|