Initial commit: 包装审核 POC、Docker 与前后端

Made-with: Cursor
This commit is contained in:
2026-04-15 17:18:49 +08:00
commit bbb4dd43b3
74 changed files with 297415 additions and 0 deletions

View File

@@ -0,0 +1,248 @@
from __future__ import annotations
import json
import logging
import time
import zipfile
from pathlib import Path
import requests
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Root URL that every MinerU endpoint path in this module is appended to.
MINERU_BASE = "https://mineru.net/api/v4"

# Task states after which polling has nothing more to wait for.
TERMINAL_STATES = {"failed", "done"}

# Human-readable (Chinese) labels for the non-terminal task states; used only
# to enrich the polling log line.
IN_PROGRESS_STATE_LABELS = {
    "waiting-file": "等待文件上传",  # waiting for the file upload
    "pending": "排队中",  # queued
    "running": "解析中",  # parsing
    "converting": "格式转换中",  # converting format
}
class MineruClientError(RuntimeError):
    """Error raised by the MinerU client when a request, upload, polling
    step, download or result lookup fails."""
class MineruClient:
    """Client for the MinerU precise-parsing API (an API token is required).

    Call flow for a local image file:

    1. POST /file-urls/batch -> obtain a batch_id and a signed OSS upload URL.
    2. PUT the image to OSS -> the service detects the upload and submits the
       parsing task automatically.
    3. GET /extract-results/batch/{batch_id} -> poll until state == "done".
    4. Download full_zip_url, unpack it and load the structured JSON.

    File limits: <= 200 MB, <= 600 pages.
    Supported formats: PDF, images (png/jpg/jpeg/jp2/webp/gif/bmp), Doc, Docx,
    Ppt, PPTx.
    """

    def __init__(
        self,
        api_key: str,
        model_version: str = "vlm",
        language: str = "ch",
        enable_table: bool = True,
        is_ocr: bool = True,
        enable_formula: bool = True,
        poll_interval: float = 3.0,
        timeout: float = 300.0,
    ) -> None:
        """Store the credentials and parsing options; no network I/O happens here.

        Parameters
        ----------
        api_key:
            Token sent as ``Authorization: Bearer <api_key>``.
        model_version, language, enable_table, is_ocr, enable_formula:
            Options forwarded verbatim in the upload-URL request payload.
        poll_interval:
            Seconds to sleep between two status polls.
        timeout:
            Overall polling budget in seconds.
        """
        self.api_key = api_key
        self.model_version = model_version
        self.language = language
        self.enable_table = enable_table
        self.is_ocr = is_ocr
        self.enable_formula = enable_formula
        self.poll_interval = poll_interval
        self.timeout = timeout

    def parse_image(self, image_path: Path, output_dir: Path) -> dict:
        """Parse a local image file and return the structured JSON data.

        Parameters
        ----------
        image_path:
            Path of the local image (png/jpg/jpeg/jp2/webp/gif/bmp).
        output_dir:
            Directory for intermediate artefacts (the zip and the unpacked
            ``result`` directory); created if missing.

        Returns
        -------
        dict
            The structured JSON document that contains a ``pdf_info`` list.

        Raises
        ------
        FileNotFoundError
            If ``image_path`` does not exist (checked before anything is
            created or sent).
        MineruClientError
            If any API step fails or no structured JSON is found.
        """
        image_path = Path(image_path)
        if not image_path.exists():
            raise FileNotFoundError(f"图片文件不存在: {image_path}")

        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        logger.info("MinerU 精准解析开始: %s", image_path.name)

        batch_id, upload_url = self._request_upload_url(image_path.name)
        logger.info("MinerU 批次已创建: batch_id=%s", batch_id)

        self._upload_file(upload_url, image_path)
        logger.info("MinerU 文件上传完成: %s(系统自动提交解析)", image_path.name)

        zip_url = self._poll_batch_until_done(batch_id)
        logger.info("MinerU 解析完成: batch_id=%s", batch_id)

        zip_path = self._download_zip(zip_url, output_dir)
        # NOTE(review): the "result" directory is reused as-is, so files left
        # by an earlier run in the same output_dir are still visible to
        # _load_structured_json — confirm callers use a fresh output_dir.
        extract_dir = output_dir / "result"
        self._extract_zip(zip_path, extract_dir)

        result = self._load_structured_json(extract_dir)
        logger.info("MinerU 结构化 JSON 加载完毕")
        return result

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    def _auth_headers(self) -> dict[str, str]:
        """Return the bearer-token and JSON content-type headers for API calls."""
        return {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }

    @staticmethod
    def _unwrap_data(resp: "requests.Response", failure_prefix: str) -> dict:
        """Decode an API response envelope and return its ``data`` object.

        Raises MineruClientError (message ``"<failure_prefix>: <detail>"``)
        when the body is not JSON, is not an object, or carries a non-zero
        ``code``. A missing or null ``data`` field yields an empty dict so
        callers can use ``.get`` safely.
        """
        try:
            body = resp.json()
        except ValueError as exc:
            # Every JSON decode error raised by requests derives from ValueError.
            raise MineruClientError(f"{failure_prefix}: {exc}") from exc
        if not isinstance(body, dict) or body.get("code") != 0:
            msg = body.get("msg") if isinstance(body, dict) else body
            raise MineruClientError(f"{failure_prefix}: {msg}")
        data = body.get("data")
        return data if isinstance(data, dict) else {}

    def _request_upload_url(self, file_name: str) -> tuple[str, str]:
        """Request a batch upload link; return ``(batch_id, oss_upload_url)``.

        Raises MineruClientError on a network/HTTP error, an API error code,
        or when the response lacks a batch id or upload URL.
        """
        payload = {
            "files": [{"name": file_name, "is_ocr": self.is_ocr}],
            "model_version": self.model_version,
            "language": self.language,
            "enable_table": self.enable_table,
            "enable_formula": self.enable_formula,
        }
        try:
            resp = requests.post(
                f"{MINERU_BASE}/file-urls/batch",
                headers=self._auth_headers(),
                json=payload,
                timeout=30,
            )
            resp.raise_for_status()
        except requests.RequestException as exc:
            raise MineruClientError(f"MinerU 申请上传 URL 失败: {exc}") from exc

        data = self._unwrap_data(resp, "MinerU 申请上传 URL 失败")
        batch_id = data.get("batch_id")
        file_urls = data.get("file_urls") or []
        if not batch_id or not file_urls:
            raise MineruClientError("MinerU 返回的 batch_id 或 file_urls 为空")
        # Only one file is submitted per batch, so the first URL is ours.
        return batch_id, file_urls[0]

    def _upload_file(self, upload_url: str, image_path: Path) -> None:
        """PUT the image to the signed OSS URL (no Content-Type header is set).

        Raises MineruClientError on a network error or on any HTTP status
        other than 200/201.
        """
        try:
            with image_path.open("rb") as f:
                resp = requests.put(upload_url, data=f, timeout=120)
        except requests.RequestException as exc:
            raise MineruClientError(f"MinerU 文件上传网络错误: {exc}") from exc
        if resp.status_code not in (200, 201):
            raise MineruClientError(
                f"MinerU 文件上传失败: HTTP {resp.status_code} {resp.text[:200]}"
            )

    def _poll_batch_until_done(self, batch_id: str) -> str:
        """Poll the batch result until it finishes; return ``full_zip_url``.

        Raises MineruClientError when a poll request fails, the API reports
        an error or a ``failed`` state, the finished task has no zip URL, or
        ``self.timeout`` seconds elapse first.
        """
        url = f"{MINERU_BASE}/extract-results/batch/{batch_id}"
        deadline = time.monotonic() + self.timeout
        while time.monotonic() < deadline:
            try:
                resp = requests.get(url, headers=self._auth_headers(), timeout=30)
                resp.raise_for_status()
            except requests.RequestException as exc:
                raise MineruClientError(f"MinerU 查询批次状态失败: {exc}") from exc

            data = self._unwrap_data(resp, "MinerU 查询批次失败")
            results: list[dict] = data.get("extract_result") or []
            if not results:
                # Nothing reported for the batch yet; wait and ask again.
                time.sleep(self.poll_interval)
                continue

            item = results[0]
            state = item.get("state", "")
            label = IN_PROGRESS_STATE_LABELS.get(state, state)
            logger.info(
                "MinerU 批次状态: batch_id=%s state=%s (%s)", batch_id, state, label
            )
            if state == "done":
                zip_url = item.get("full_zip_url")
                if not zip_url:
                    raise MineruClientError("MinerU 完成但未返回 full_zip_url")
                return zip_url
            if state == "failed":
                err_msg = item.get("err_msg") or "未知错误"
                raise MineruClientError(f"MinerU 解析失败: {err_msg}")
            time.sleep(self.poll_interval)

        raise MineruClientError(
            f"MinerU 轮询超时 ({self.timeout:.0f}s): batch_id={batch_id}"
        )

    def _download_zip(self, zip_url: str, output_dir: Path) -> Path:
        """Download the result archive to ``output_dir/mineru_result.zip``.

        Returns the path of the written file. Raises MineruClientError on a
        network/HTTP error.
        """
        target = output_dir / "mineru_result.zip"
        try:
            # A streamed response holds its connection until closed; the
            # context manager releases it even when the transfer fails.
            with requests.get(zip_url, timeout=120, stream=True) as resp:
                resp.raise_for_status()
                with target.open("wb") as f:
                    for chunk in resp.iter_content(chunk_size=8192):
                        f.write(chunk)
        except requests.RequestException as exc:
            raise MineruClientError(f"MinerU zip 下载失败: {exc}") from exc
        logger.info("MinerU zip 下载完毕: %s", target)
        return target

    def _extract_zip(self, zip_path: Path, extract_dir: Path) -> None:
        """Unpack ``zip_path`` into ``extract_dir`` (created if missing).

        Errors from ``zipfile`` (e.g. ``zipfile.BadZipFile``) are not wrapped
        and propagate to the caller.
        """
        extract_dir.mkdir(parents=True, exist_ok=True)
        with zipfile.ZipFile(zip_path) as archive:
            archive.extractall(extract_dir)
        logger.info("MinerU zip 解压完毕: %s", extract_dir)

    def _load_structured_json(self, extract_dir: Path) -> dict:
        """Find and load the JSON document that contains ``pdf_info``.

        Layout of a MinerU result zip, as described by the original author:
            layout.json          intermediate result (a.k.a. middle.json)
            *_content_list.json  content list
            *_model.json         model inference result
            full.md              Markdown rendering

        Files are tried from most to least specific name pattern; the first
        one that parses to a dict whose ``pdf_info`` is a list is returned.
        Unreadable, non-UTF-8 or malformed files are skipped.

        Raises MineruClientError if no file qualifies.
        """
        candidates = [
            *sorted(extract_dir.rglob("layout.json")),
            *sorted(extract_dir.rglob("*layout*.json")),
            *sorted(extract_dir.rglob("*_content_list*.json")),
            *sorted(extract_dir.rglob("*.json")),
        ]
        # dict.fromkeys drops duplicates while keeping first-seen priority.
        for candidate in dict.fromkeys(candidates):
            try:
                parsed = json.loads(candidate.read_text(encoding="utf-8"))
            except (ValueError, OSError):
                # ValueError covers both JSONDecodeError and UnicodeDecodeError.
                continue
            if isinstance(parsed, dict) and isinstance(parsed.get("pdf_info"), list):
                logger.info("MinerU 结构化 JSON 选用: %s", candidate.name)
                return parsed
        raise MineruClientError(
            "MinerU 结果 zip 中未找到包含 pdf_info 的结构化 JSON"
        )