支持mineru 2.7.1
This commit is contained in:
@@ -337,7 +337,7 @@ class TranslateServiceRequest(BaseModel):
|
||||
],
|
||||
)
|
||||
file_content: str = Field(
|
||||
..., description="Base64编码的文件内容。", examples=["JVBERi0xLjQK..."]
|
||||
..., description="Base64编码的文件内容。", examples=["JVBERi0xLjcKJeLjz9MKMSAwIG9iago8PC9..."]
|
||||
)
|
||||
payload: TranslatePayload = Field(
|
||||
..., description="包含工作流类型和相应参数的载荷。"
|
||||
@@ -666,7 +666,7 @@ async def _perform_translation(
|
||||
"force_json",
|
||||
"rpm",
|
||||
"tpm",
|
||||
"provider", # Added provider
|
||||
"provider",
|
||||
},
|
||||
exclude_none=True,
|
||||
)
|
||||
@@ -688,7 +688,9 @@ async def _perform_translation(
|
||||
converter_config = ConverterMineruDeployConfig(
|
||||
base_url=payload.mineru_deploy_base_url,
|
||||
backend=payload.mineru_deploy_backend,
|
||||
parse_method=payload.mineru_deploy_parse_method,
|
||||
formula_enable=payload.mineru_deploy_formula_enable,
|
||||
table_enable=payload.mineru_deploy_table_enable,
|
||||
start_page_id=payload.mineru_deploy_start_page_id,
|
||||
end_page_id=payload.mineru_deploy_end_page_id,
|
||||
lang_list=payload.mineru_deploy_lang_list,
|
||||
@@ -2214,8 +2216,10 @@ async def service_flat_translate(
|
||||
formula_ocr: bool = Form(True, description="[PDF] 是否启用公式识别"),
|
||||
code_ocr: bool = Form(True, description="[Docling] 是否启用代码块识别"),
|
||||
mineru_deploy_base_url: str = Form("http://127.0.0.1:8000", description="[MinerU Local] 服务地址"),
|
||||
mineru_deploy_backend: str = Form("VLM", description="[MinerU Local] 后端类型"),
|
||||
mineru_deploy_backend: str = Form("hybrid-auto-engine", description="[MinerU Local] 后端类型: hybrid-auto-engine, pipeline 等"),
|
||||
mineru_deploy_parse_method: str = Form("auto", description="[MinerU Local] 解析方法: auto, txt, ocr"),
|
||||
mineru_deploy_formula_enable: bool = Form(True, description="[MinerU Local] 是否启用公式"),
|
||||
mineru_deploy_table_enable: bool = Form(True, description="[MinerU Local] 是否启用表格"),
|
||||
mineru_deploy_start_page_id: int = Form(0, description="[MinerU Local] 起始页码"),
|
||||
mineru_deploy_end_page_id: int = Form(99999, description="[MinerU Local] 结束页码"),
|
||||
mineru_deploy_lang_list: Optional[List[str]] = Form(None, description="[MinerU Local] 语言列表"),
|
||||
@@ -2317,7 +2321,9 @@ async def service_flat_translate(
|
||||
# --- MinerU 本地部署参数 ---
|
||||
"mineru_deploy_base_url": mineru_deploy_base_url,
|
||||
"mineru_deploy_backend": mineru_deploy_backend,
|
||||
"mineru_deploy_parse_method": mineru_deploy_parse_method,
|
||||
"mineru_deploy_formula_enable": mineru_deploy_formula_enable,
|
||||
"mineru_deploy_table_enable": mineru_deploy_table_enable,
|
||||
"mineru_deploy_start_page_id": mineru_deploy_start_page_id,
|
||||
"mineru_deploy_end_page_id": mineru_deploy_end_page_id,
|
||||
"mineru_deploy_lang_list": mineru_deploy_lang_list,
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
# SPDX-License-Identifier: MPL-2.0
|
||||
import asyncio
|
||||
from dataclasses import dataclass
|
||||
from typing import Literal, Hashable
|
||||
from typing import Literal, Hashable, List
|
||||
|
||||
import httpx
|
||||
|
||||
@@ -16,24 +16,42 @@ from docutranslate.utils.markdown_utils import embed_inline_image_from_zip
|
||||
@dataclass(kw_only=True)
|
||||
class ConverterMineruDeployConfig(X2MarkdownConverterConfig):
|
||||
base_url: str = "http://127.0.0.1:8000"
|
||||
output_dir: str = "./output" # 覆盖默认值 ./output
|
||||
lang_list: list[Literal["ch", "ch_server", "ch_lite", "en", "korean", "japan", "chinese_cht", "ta", "te", "ka", "th", "el", "latin", "arabic", "east_slavic", "cyrillic", "devanagari"]] | None = None
|
||||
backend: Literal["pipeline", "vlm-transformers", "vlm-mlx-engine", "vlm-vllm-async-engine", "vlm-lmdeploy-engine","vlm-http-client"] = "pipeline"
|
||||
# parse_method: str = "auto"
|
||||
output_dir: str = "./output"
|
||||
# 支持的语言列表 (来自 MinerU API)
|
||||
lang_list: List[str] | None = None # 默认值在 API 侧处理,这里 None 即可
|
||||
|
||||
# 后端引擎选项 (更新适配最新的 MinerU API)
|
||||
backend: Literal[
|
||||
"pipeline",
|
||||
"vlm-auto-engine",
|
||||
"vlm-http-client",
|
||||
"hybrid-auto-engine",
|
||||
"hybrid-http-client"
|
||||
] = "hybrid-auto-engine"
|
||||
|
||||
parse_method: Literal["auto", "txt", "ocr"] = "auto"
|
||||
formula_enable: bool = True
|
||||
# table_enable: bool = True
|
||||
server_url: str | None = None #(Adapted only for vlm-http-client backend)openai compatible server url, e.g., http://127.0.0.1:30000
|
||||
# return_md: bool = True
|
||||
# return_middle_json: bool = True
|
||||
# return_model_output: bool = False
|
||||
# return_content_list: bool = False
|
||||
# return_images: bool = True
|
||||
# response_format_zip: bool = True
|
||||
table_enable: bool = True
|
||||
|
||||
# 用于 vlm-http-client 或 hybrid-http-client 后端
|
||||
server_url: str | None = None
|
||||
|
||||
# 返回选项
|
||||
return_md: bool = True
|
||||
return_middle_json: bool = False
|
||||
return_model_output: bool = False
|
||||
return_content_list: bool = False
|
||||
return_images: bool = True
|
||||
response_format_zip: bool = True
|
||||
|
||||
# 页面范围
|
||||
start_page_id: int = 0
|
||||
end_page_id: int = 99999
|
||||
|
||||
def gethash(self) ->Hashable:
|
||||
return (self.backend,self.formula_enable,self.start_page_id,self.end_page_id)
|
||||
def gethash(self) -> Hashable:
|
||||
return (self.backend, self.formula_enable, self.table_enable,
|
||||
self.parse_method, self.start_page_id, self.end_page_id)
|
||||
|
||||
|
||||
# 配置HTTP客户端
|
||||
timeout = httpx.Timeout(
|
||||
@@ -57,27 +75,37 @@ class ConverterMineruDeploy(X2MarkdownConverter):
|
||||
|
||||
self._api_url = f"{self.base_url}/file_parse"
|
||||
|
||||
def _build_form_data(self)->dict:
|
||||
def _build_form_data(self) -> dict:
|
||||
# httpx 在处理 data 参数时,如果值为 list,会自动展开为多个同名 key (例如 lang_list=ch&lang_list=en)
|
||||
# 这符合 FastAPI/Starlette 对 List 字段的解析要求
|
||||
data = {
|
||||
"output_dir": self.config.output_dir,
|
||||
"backend": self.config.backend,
|
||||
"parse_method": "auto",
|
||||
"formula_enable": self.config.formula_enable,
|
||||
"table_enable": True,
|
||||
"server_url": None,
|
||||
"return_md": True,
|
||||
"return_middle_json": True,
|
||||
"return_model_output": False,
|
||||
"return_content_list": False,
|
||||
"return_images": True,
|
||||
"response_format_zip": True,
|
||||
"parse_method": self.config.parse_method,
|
||||
# bool 类型在 multipart/form-data 中通常需要转为字符串 'true'/'false',但 httpx 会处理 python bool
|
||||
"formula_enable": str(self.config.formula_enable).lower(),
|
||||
"table_enable": str(self.config.table_enable).lower(),
|
||||
"return_md": str(self.config.return_md).lower(),
|
||||
"return_middle_json": str(self.config.return_middle_json).lower(),
|
||||
"return_model_output": str(self.config.return_model_output).lower(),
|
||||
"return_content_list": str(self.config.return_content_list).lower(),
|
||||
"return_images": str(self.config.return_images).lower(),
|
||||
"response_format_zip": str(self.config.response_format_zip).lower(),
|
||||
"start_page_id": self.config.start_page_id,
|
||||
"end_page_id": self.config.end_page_id
|
||||
}
|
||||
|
||||
if self.config.lang_list:
|
||||
data["lang_list"] = self.config.lang_list
|
||||
else:
|
||||
data["lang_list"] = ["ch"] # 默认值
|
||||
|
||||
if self.config.server_url:
|
||||
data["server_url"] = self.config.server_url
|
||||
|
||||
return data
|
||||
|
||||
|
||||
def convert(self,d:Document)->MarkdownDocument:
|
||||
def convert(self, d: Document) -> MarkdownDocument:
|
||||
self.logger.info("开始解析文件")
|
||||
files = [("files", (d.name, d.content, "application/octet-stream"))]
|
||||
response = client.post(
|
||||
@@ -88,15 +116,15 @@ class ConverterMineruDeploy(X2MarkdownConverter):
|
||||
)
|
||||
|
||||
response.raise_for_status() # 检查是否有错误
|
||||
md=embed_inline_image_from_zip(response.content,None)
|
||||
# Mineru API 返回 zip 时包含图片和 md
|
||||
md = embed_inline_image_from_zip(response.content, None)
|
||||
self.logger.info("已转化为markdown")
|
||||
return MarkdownDocument.from_bytes(md.encode(),suffix=".md",stem=d.stem)
|
||||
|
||||
return MarkdownDocument.from_bytes(md.encode(), suffix=".md", stem=d.stem)
|
||||
|
||||
async def convert_async(self, d: Document) -> MarkdownDocument:
|
||||
self.logger.info("开始解析文件")
|
||||
files = [("files", (d.name, d.content, "application/octet-stream"))]
|
||||
response =await client_async.post(
|
||||
response = await client_async.post(
|
||||
self._api_url,
|
||||
files=files,
|
||||
data=self._build_form_data(),
|
||||
@@ -104,15 +132,9 @@ class ConverterMineruDeploy(X2MarkdownConverter):
|
||||
)
|
||||
|
||||
response.raise_for_status()
|
||||
md = await asyncio.to_thread(embed_inline_image_from_zip,response.content, None)
|
||||
md = await asyncio.to_thread(embed_inline_image_from_zip, response.content, None)
|
||||
self.logger.info("已转化为markdown")
|
||||
return MarkdownDocument.from_bytes(md.encode(), suffix=".md", stem=d.stem)
|
||||
|
||||
def support_format(self) -> list[str]:
|
||||
return [".pdf", ".doc", ".docx", ".ppt", ".pptx", ".png", ".jpg", ".jpeg"]
|
||||
|
||||
if __name__ == '__main__':
|
||||
d = Document.from_path(r"C:\Users\jxgm\Desktop\testfiles\table.pdf")
|
||||
config=ConverterMineruDeployConfig()
|
||||
converter = ConverterMineruDeploy(config=config)
|
||||
converter.convert(d)
|
||||
|
||||
@@ -252,17 +252,27 @@ class MarkdownWorkflowParams(BaseWorkflowParams):
|
||||
"http://127.0.0.1:8000",
|
||||
description="[仅当 convert_engine='mineru_deploy'] 本地部署的 MinerU 服务地址。",
|
||||
)
|
||||
# --- UPDATED BACKEND LIST ---
|
||||
mineru_deploy_backend: Literal[
|
||||
"pipeline",
|
||||
"vlm-transformers",
|
||||
"vlm-mlx-engine",
|
||||
"vlm-vllm-async-engine",
|
||||
"vlm-lmdeploy-engine",
|
||||
"vlm-auto-engine",
|
||||
"vlm-http-client",
|
||||
"hybrid-auto-engine",
|
||||
"hybrid-http-client"
|
||||
] = Field(
|
||||
"pipeline",
|
||||
"hybrid-auto-engine",
|
||||
description="[仅当 convert_engine='mineru_deploy'] 本地部署的 MinerU 服务使用的后端。",
|
||||
)
|
||||
# --- NEW PARAMETERS START ---
|
||||
mineru_deploy_parse_method: Literal["auto", "txt", "ocr"] = Field(
|
||||
"auto",
|
||||
description="[仅当 convert_engine='mineru_deploy'] 解析方法: auto, txt, ocr"
|
||||
)
|
||||
mineru_deploy_table_enable: bool = Field(
|
||||
True,
|
||||
description="[仅当 convert_engine='mineru_deploy'] 本地部署的服务是否启用表格解析。",
|
||||
)
|
||||
# --- NEW PARAMETERS END ---
|
||||
mineru_deploy_formula_enable: bool = Field(
|
||||
True,
|
||||
description="[仅当 convert_engine='mineru_deploy'] 本地部署的服务是否启用公式解析。",
|
||||
@@ -275,13 +285,13 @@ class MarkdownWorkflowParams(BaseWorkflowParams):
|
||||
)
|
||||
mineru_deploy_lang_list: Optional[List[str]] = Field(
|
||||
None,
|
||||
description="[仅当 convert_engine='mineru_deploy' 且 backend='pipeline'] 语言列表。",
|
||||
examples=[None],
|
||||
description="[仅当 convert_engine='mineru_deploy'] 语言列表, 默认 ['ch']。",
|
||||
examples=[["ch", "en"]],
|
||||
)
|
||||
# 修改: 默认值改为 ""
|
||||
mineru_deploy_server_url: Optional[str] = Field(
|
||||
default="",
|
||||
description="[仅当 convert_engine='mineru_deploy' 且 backend='vlm-http-client'] Server URL.",
|
||||
description="[仅当 convert_engine='mineru_deploy' 且 backend为http-client相关时] Server URL.",
|
||||
)
|
||||
|
||||
@model_validator(mode="after")
|
||||
@@ -312,10 +322,6 @@ class TextWorkflowParams(BaseWorkflowParams):
|
||||
"\n",
|
||||
description="当 insert_mode 为 'append' 或 'prepend' 时,用于分隔原文和译文的分隔符。",
|
||||
)
|
||||
segment_mode: Literal["line", "paragraph", "none"] = Field(
|
||||
"line",
|
||||
description="分段模式。'line':按行分段(每行独立翻译),'paragraph':按段落分段(连续非空行合并为段落),'none':不分段(全文视为一个段落)。",
|
||||
)
|
||||
|
||||
|
||||
class JsonWorkflowParams(BaseWorkflowParams):
|
||||
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user