支持mineru 2.7.1

This commit is contained in:
xunbu
2026-01-07 19:56:27 +08:00
parent 50255069b7
commit 3b13157670
5 changed files with 96 additions and 58 deletions

View File

@@ -337,7 +337,7 @@ class TranslateServiceRequest(BaseModel):
], ],
) )
file_content: str = Field( file_content: str = Field(
..., description="Base64编码的文件内容。", examples=["JVBERi0xLjQK..."] ..., description="Base64编码的文件内容。", examples=["JVBERi0xLjcKJeLjz9MKMSAwIG9iago8PC9..."]
) )
payload: TranslatePayload = Field( payload: TranslatePayload = Field(
..., description="包含工作流类型和相应参数的载荷。" ..., description="包含工作流类型和相应参数的载荷。"
@@ -666,7 +666,7 @@ async def _perform_translation(
"force_json", "force_json",
"rpm", "rpm",
"tpm", "tpm",
"provider", # Added provider "provider",
}, },
exclude_none=True, exclude_none=True,
) )
@@ -688,7 +688,9 @@ async def _perform_translation(
converter_config = ConverterMineruDeployConfig( converter_config = ConverterMineruDeployConfig(
base_url=payload.mineru_deploy_base_url, base_url=payload.mineru_deploy_base_url,
backend=payload.mineru_deploy_backend, backend=payload.mineru_deploy_backend,
parse_method=payload.mineru_deploy_parse_method,
formula_enable=payload.mineru_deploy_formula_enable, formula_enable=payload.mineru_deploy_formula_enable,
table_enable=payload.mineru_deploy_table_enable,
start_page_id=payload.mineru_deploy_start_page_id, start_page_id=payload.mineru_deploy_start_page_id,
end_page_id=payload.mineru_deploy_end_page_id, end_page_id=payload.mineru_deploy_end_page_id,
lang_list=payload.mineru_deploy_lang_list, lang_list=payload.mineru_deploy_lang_list,
@@ -2214,8 +2216,10 @@ async def service_flat_translate(
formula_ocr: bool = Form(True, description="[PDF] 是否启用公式识别"), formula_ocr: bool = Form(True, description="[PDF] 是否启用公式识别"),
code_ocr: bool = Form(True, description="[Docling] 是否启用代码块识别"), code_ocr: bool = Form(True, description="[Docling] 是否启用代码块识别"),
mineru_deploy_base_url: str = Form("http://127.0.0.1:8000", description="[MinerU Local] 服务地址"), mineru_deploy_base_url: str = Form("http://127.0.0.1:8000", description="[MinerU Local] 服务地址"),
mineru_deploy_backend: str = Form("VLM", description="[MinerU Local] 后端类型"), mineru_deploy_backend: str = Form("hybrid-auto-engine", description="[MinerU Local] 后端类型: hybrid-auto-engine, pipeline 等"),
mineru_deploy_parse_method: str = Form("auto", description="[MinerU Local] 解析方法: auto, txt, ocr"),
mineru_deploy_formula_enable: bool = Form(True, description="[MinerU Local] 是否启用公式"), mineru_deploy_formula_enable: bool = Form(True, description="[MinerU Local] 是否启用公式"),
mineru_deploy_table_enable: bool = Form(True, description="[MinerU Local] 是否启用表格"),
mineru_deploy_start_page_id: int = Form(0, description="[MinerU Local] 起始页码"), mineru_deploy_start_page_id: int = Form(0, description="[MinerU Local] 起始页码"),
mineru_deploy_end_page_id: int = Form(99999, description="[MinerU Local] 结束页码"), mineru_deploy_end_page_id: int = Form(99999, description="[MinerU Local] 结束页码"),
mineru_deploy_lang_list: Optional[List[str]] = Form(None, description="[MinerU Local] 语言列表"), mineru_deploy_lang_list: Optional[List[str]] = Form(None, description="[MinerU Local] 语言列表"),
@@ -2317,7 +2321,9 @@ async def service_flat_translate(
# --- MinerU 本地部署参数 --- # --- MinerU 本地部署参数 ---
"mineru_deploy_base_url": mineru_deploy_base_url, "mineru_deploy_base_url": mineru_deploy_base_url,
"mineru_deploy_backend": mineru_deploy_backend, "mineru_deploy_backend": mineru_deploy_backend,
"mineru_deploy_parse_method": mineru_deploy_parse_method,
"mineru_deploy_formula_enable": mineru_deploy_formula_enable, "mineru_deploy_formula_enable": mineru_deploy_formula_enable,
"mineru_deploy_table_enable": mineru_deploy_table_enable,
"mineru_deploy_start_page_id": mineru_deploy_start_page_id, "mineru_deploy_start_page_id": mineru_deploy_start_page_id,
"mineru_deploy_end_page_id": mineru_deploy_end_page_id, "mineru_deploy_end_page_id": mineru_deploy_end_page_id,
"mineru_deploy_lang_list": mineru_deploy_lang_list, "mineru_deploy_lang_list": mineru_deploy_lang_list,

View File

@@ -2,7 +2,7 @@
# SPDX-License-Identifier: MPL-2.0 # SPDX-License-Identifier: MPL-2.0
import asyncio import asyncio
from dataclasses import dataclass from dataclasses import dataclass
from typing import Literal, Hashable from typing import Literal, Hashable, List
import httpx import httpx
@@ -16,24 +16,42 @@ from docutranslate.utils.markdown_utils import embed_inline_image_from_zip
@dataclass(kw_only=True) @dataclass(kw_only=True)
class ConverterMineruDeployConfig(X2MarkdownConverterConfig): class ConverterMineruDeployConfig(X2MarkdownConverterConfig):
base_url: str = "http://127.0.0.1:8000" base_url: str = "http://127.0.0.1:8000"
output_dir: str = "./output" # 覆盖默认值 ./output output_dir: str = "./output"
lang_list: list[Literal["ch", "ch_server", "ch_lite", "en", "korean", "japan", "chinese_cht", "ta", "te", "ka", "th", "el", "latin", "arabic", "east_slavic", "cyrillic", "devanagari"]] | None = None # 支持的语言列表 (来自 MinerU API)
backend: Literal["pipeline", "vlm-transformers", "vlm-mlx-engine", "vlm-vllm-async-engine", "vlm-lmdeploy-engine","vlm-http-client"] = "pipeline" lang_list: List[str] | None = None # 默认值在 API 侧处理,这里 None 即可
# parse_method: str = "auto"
# 后端引擎选项 (更新适配最新的 MinerU API)
backend: Literal[
"pipeline",
"vlm-auto-engine",
"vlm-http-client",
"hybrid-auto-engine",
"hybrid-http-client"
] = "hybrid-auto-engine"
parse_method: Literal["auto", "txt", "ocr"] = "auto"
formula_enable: bool = True formula_enable: bool = True
# table_enable: bool = True table_enable: bool = True
server_url: str | None = None #(Adapted only for vlm-http-client backend)openai compatible server url, e.g., http://127.0.0.1:30000
# return_md: bool = True # 用于 vlm-http-client 或 hybrid-http-client 后端
# return_middle_json: bool = True server_url: str | None = None
# return_model_output: bool = False
# return_content_list: bool = False # 返回选项
# return_images: bool = True return_md: bool = True
# response_format_zip: bool = True return_middle_json: bool = False
return_model_output: bool = False
return_content_list: bool = False
return_images: bool = True
response_format_zip: bool = True
# 页面范围
start_page_id: int = 0 start_page_id: int = 0
end_page_id: int = 99999 end_page_id: int = 99999
def gethash(self) ->Hashable: def gethash(self) -> Hashable:
return (self.backend,self.formula_enable,self.start_page_id,self.end_page_id) return (self.backend, self.formula_enable, self.table_enable,
self.parse_method, self.start_page_id, self.end_page_id)
# 配置HTTP客户端 # 配置HTTP客户端
timeout = httpx.Timeout( timeout = httpx.Timeout(
@@ -57,27 +75,37 @@ class ConverterMineruDeploy(X2MarkdownConverter):
self._api_url = f"{self.base_url}/file_parse" self._api_url = f"{self.base_url}/file_parse"
def _build_form_data(self)->dict: def _build_form_data(self) -> dict:
# httpx 在处理 data 参数时,如果值为 list会自动展开为多个同名 key (例如 lang_list=ch&lang_list=en)
# 这符合 FastAPI/Starlette 对 List 字段的解析要求
data = { data = {
"output_dir": self.config.output_dir, "output_dir": self.config.output_dir,
"backend": self.config.backend, "backend": self.config.backend,
"parse_method": "auto", "parse_method": self.config.parse_method,
"formula_enable": self.config.formula_enable, # bool 类型在 multipart/form-data 中通常需要转为字符串 'true'/'false',但 httpx 会处理 python bool
"table_enable": True, "formula_enable": str(self.config.formula_enable).lower(),
"server_url": None, "table_enable": str(self.config.table_enable).lower(),
"return_md": True, "return_md": str(self.config.return_md).lower(),
"return_middle_json": True, "return_middle_json": str(self.config.return_middle_json).lower(),
"return_model_output": False, "return_model_output": str(self.config.return_model_output).lower(),
"return_content_list": False, "return_content_list": str(self.config.return_content_list).lower(),
"return_images": True, "return_images": str(self.config.return_images).lower(),
"response_format_zip": True, "response_format_zip": str(self.config.response_format_zip).lower(),
"start_page_id": self.config.start_page_id, "start_page_id": self.config.start_page_id,
"end_page_id": self.config.end_page_id "end_page_id": self.config.end_page_id
} }
if self.config.lang_list:
data["lang_list"] = self.config.lang_list
else:
data["lang_list"] = ["ch"] # 默认值
if self.config.server_url:
data["server_url"] = self.config.server_url
return data return data
def convert(self, d: Document) -> MarkdownDocument:
def convert(self,d:Document)->MarkdownDocument:
self.logger.info("开始解析文件") self.logger.info("开始解析文件")
files = [("files", (d.name, d.content, "application/octet-stream"))] files = [("files", (d.name, d.content, "application/octet-stream"))]
response = client.post( response = client.post(
@@ -88,15 +116,15 @@ class ConverterMineruDeploy(X2MarkdownConverter):
) )
response.raise_for_status() # 检查是否有错误 response.raise_for_status() # 检查是否有错误
md=embed_inline_image_from_zip(response.content,None) # Mineru API 返回 zip 时包含图片和 md
md = embed_inline_image_from_zip(response.content, None)
self.logger.info("已转化为markdown") self.logger.info("已转化为markdown")
return MarkdownDocument.from_bytes(md.encode(),suffix=".md",stem=d.stem) return MarkdownDocument.from_bytes(md.encode(), suffix=".md", stem=d.stem)
async def convert_async(self, d: Document) -> MarkdownDocument: async def convert_async(self, d: Document) -> MarkdownDocument:
self.logger.info("开始解析文件") self.logger.info("开始解析文件")
files = [("files", (d.name, d.content, "application/octet-stream"))] files = [("files", (d.name, d.content, "application/octet-stream"))]
response =await client_async.post( response = await client_async.post(
self._api_url, self._api_url,
files=files, files=files,
data=self._build_form_data(), data=self._build_form_data(),
@@ -104,15 +132,9 @@ class ConverterMineruDeploy(X2MarkdownConverter):
) )
response.raise_for_status() response.raise_for_status()
md = await asyncio.to_thread(embed_inline_image_from_zip,response.content, None) md = await asyncio.to_thread(embed_inline_image_from_zip, response.content, None)
self.logger.info("已转化为markdown") self.logger.info("已转化为markdown")
return MarkdownDocument.from_bytes(md.encode(), suffix=".md", stem=d.stem) return MarkdownDocument.from_bytes(md.encode(), suffix=".md", stem=d.stem)
def support_format(self) -> list[str]: def support_format(self) -> list[str]:
return [".pdf", ".doc", ".docx", ".ppt", ".pptx", ".png", ".jpg", ".jpeg"] return [".pdf", ".doc", ".docx", ".ppt", ".pptx", ".png", ".jpg", ".jpeg"]
if __name__ == '__main__':
d = Document.from_path(r"C:\Users\jxgm\Desktop\testfiles\table.pdf")
config=ConverterMineruDeployConfig()
converter = ConverterMineruDeploy(config=config)
converter.convert(d)

View File

@@ -252,17 +252,27 @@ class MarkdownWorkflowParams(BaseWorkflowParams):
"http://127.0.0.1:8000", "http://127.0.0.1:8000",
description="[仅当 convert_engine='mineru_deploy'] 本地部署的 MinerU 服务地址。", description="[仅当 convert_engine='mineru_deploy'] 本地部署的 MinerU 服务地址。",
) )
# --- UPDATED BACKEND LIST ---
mineru_deploy_backend: Literal[ mineru_deploy_backend: Literal[
"pipeline", "pipeline",
"vlm-transformers", "vlm-auto-engine",
"vlm-mlx-engine",
"vlm-vllm-async-engine",
"vlm-lmdeploy-engine",
"vlm-http-client", "vlm-http-client",
"hybrid-auto-engine",
"hybrid-http-client"
] = Field( ] = Field(
"pipeline", "hybrid-auto-engine",
description="[仅当 convert_engine='mineru_deploy'] 本地部署的 MinerU 服务使用的后端。", description="[仅当 convert_engine='mineru_deploy'] 本地部署的 MinerU 服务使用的后端。",
) )
# --- NEW PARAMETERS START ---
mineru_deploy_parse_method: Literal["auto", "txt", "ocr"] = Field(
"auto",
description="[仅当 convert_engine='mineru_deploy'] 解析方法: auto, txt, ocr"
)
mineru_deploy_table_enable: bool = Field(
True,
description="[仅当 convert_engine='mineru_deploy'] 本地部署的服务是否启用表格解析。",
)
# --- NEW PARAMETERS END ---
mineru_deploy_formula_enable: bool = Field( mineru_deploy_formula_enable: bool = Field(
True, True,
description="[仅当 convert_engine='mineru_deploy'] 本地部署的服务是否启用公式解析。", description="[仅当 convert_engine='mineru_deploy'] 本地部署的服务是否启用公式解析。",
@@ -275,13 +285,13 @@ class MarkdownWorkflowParams(BaseWorkflowParams):
) )
mineru_deploy_lang_list: Optional[List[str]] = Field( mineru_deploy_lang_list: Optional[List[str]] = Field(
None, None,
description="[仅当 convert_engine='mineru_deploy' 且 backend='pipeline'] 语言列表", description="[仅当 convert_engine='mineru_deploy'] 语言列表, 默认 ['ch']。",
examples=[None], examples=[["ch", "en"]],
) )
# 修改: 默认值改为 "" # 修改: 默认值改为 ""
mineru_deploy_server_url: Optional[str] = Field( mineru_deploy_server_url: Optional[str] = Field(
default="", default="",
description="[仅当 convert_engine='mineru_deploy' 且 backend='vlm-http-client'] Server URL.", description="[仅当 convert_engine='mineru_deploy' 且 backendhttp-client相关时] Server URL.",
) )
@model_validator(mode="after") @model_validator(mode="after")
@@ -312,10 +322,6 @@ class TextWorkflowParams(BaseWorkflowParams):
"\n", "\n",
description="当 insert_mode 为 'append''prepend' 时,用于分隔原文和译文的分隔符。", description="当 insert_mode 为 'append''prepend' 时,用于分隔原文和译文的分隔符。",
) )
segment_mode: Literal["line", "paragraph", "none"] = Field(
"line",
description="分段模式。'line':按行分段(每行独立翻译),'paragraph':按段落分段(连续非空行合并为段落),'none':不分段(全文视为一个段落)。",
)
class JsonWorkflowParams(BaseWorkflowParams): class JsonWorkflowParams(BaseWorkflowParams):

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long