支持mineru 2.7.1
This commit is contained in:
@@ -337,7 +337,7 @@ class TranslateServiceRequest(BaseModel):
|
|||||||
],
|
],
|
||||||
)
|
)
|
||||||
file_content: str = Field(
|
file_content: str = Field(
|
||||||
..., description="Base64编码的文件内容。", examples=["JVBERi0xLjQK..."]
|
..., description="Base64编码的文件内容。", examples=["JVBERi0xLjcKJeLjz9MKMSAwIG9iago8PC9..."]
|
||||||
)
|
)
|
||||||
payload: TranslatePayload = Field(
|
payload: TranslatePayload = Field(
|
||||||
..., description="包含工作流类型和相应参数的载荷。"
|
..., description="包含工作流类型和相应参数的载荷。"
|
||||||
@@ -666,7 +666,7 @@ async def _perform_translation(
|
|||||||
"force_json",
|
"force_json",
|
||||||
"rpm",
|
"rpm",
|
||||||
"tpm",
|
"tpm",
|
||||||
"provider", # Added provider
|
"provider",
|
||||||
},
|
},
|
||||||
exclude_none=True,
|
exclude_none=True,
|
||||||
)
|
)
|
||||||
@@ -688,7 +688,9 @@ async def _perform_translation(
|
|||||||
converter_config = ConverterMineruDeployConfig(
|
converter_config = ConverterMineruDeployConfig(
|
||||||
base_url=payload.mineru_deploy_base_url,
|
base_url=payload.mineru_deploy_base_url,
|
||||||
backend=payload.mineru_deploy_backend,
|
backend=payload.mineru_deploy_backend,
|
||||||
|
parse_method=payload.mineru_deploy_parse_method,
|
||||||
formula_enable=payload.mineru_deploy_formula_enable,
|
formula_enable=payload.mineru_deploy_formula_enable,
|
||||||
|
table_enable=payload.mineru_deploy_table_enable,
|
||||||
start_page_id=payload.mineru_deploy_start_page_id,
|
start_page_id=payload.mineru_deploy_start_page_id,
|
||||||
end_page_id=payload.mineru_deploy_end_page_id,
|
end_page_id=payload.mineru_deploy_end_page_id,
|
||||||
lang_list=payload.mineru_deploy_lang_list,
|
lang_list=payload.mineru_deploy_lang_list,
|
||||||
@@ -2214,8 +2216,10 @@ async def service_flat_translate(
|
|||||||
formula_ocr: bool = Form(True, description="[PDF] 是否启用公式识别"),
|
formula_ocr: bool = Form(True, description="[PDF] 是否启用公式识别"),
|
||||||
code_ocr: bool = Form(True, description="[Docling] 是否启用代码块识别"),
|
code_ocr: bool = Form(True, description="[Docling] 是否启用代码块识别"),
|
||||||
mineru_deploy_base_url: str = Form("http://127.0.0.1:8000", description="[MinerU Local] 服务地址"),
|
mineru_deploy_base_url: str = Form("http://127.0.0.1:8000", description="[MinerU Local] 服务地址"),
|
||||||
mineru_deploy_backend: str = Form("VLM", description="[MinerU Local] 后端类型"),
|
mineru_deploy_backend: str = Form("hybrid-auto-engine", description="[MinerU Local] 后端类型: hybrid-auto-engine, pipeline 等"),
|
||||||
|
mineru_deploy_parse_method: str = Form("auto", description="[MinerU Local] 解析方法: auto, txt, ocr"),
|
||||||
mineru_deploy_formula_enable: bool = Form(True, description="[MinerU Local] 是否启用公式"),
|
mineru_deploy_formula_enable: bool = Form(True, description="[MinerU Local] 是否启用公式"),
|
||||||
|
mineru_deploy_table_enable: bool = Form(True, description="[MinerU Local] 是否启用表格"),
|
||||||
mineru_deploy_start_page_id: int = Form(0, description="[MinerU Local] 起始页码"),
|
mineru_deploy_start_page_id: int = Form(0, description="[MinerU Local] 起始页码"),
|
||||||
mineru_deploy_end_page_id: int = Form(99999, description="[MinerU Local] 结束页码"),
|
mineru_deploy_end_page_id: int = Form(99999, description="[MinerU Local] 结束页码"),
|
||||||
mineru_deploy_lang_list: Optional[List[str]] = Form(None, description="[MinerU Local] 语言列表"),
|
mineru_deploy_lang_list: Optional[List[str]] = Form(None, description="[MinerU Local] 语言列表"),
|
||||||
@@ -2317,7 +2321,9 @@ async def service_flat_translate(
|
|||||||
# --- MinerU 本地部署参数 ---
|
# --- MinerU 本地部署参数 ---
|
||||||
"mineru_deploy_base_url": mineru_deploy_base_url,
|
"mineru_deploy_base_url": mineru_deploy_base_url,
|
||||||
"mineru_deploy_backend": mineru_deploy_backend,
|
"mineru_deploy_backend": mineru_deploy_backend,
|
||||||
|
"mineru_deploy_parse_method": mineru_deploy_parse_method,
|
||||||
"mineru_deploy_formula_enable": mineru_deploy_formula_enable,
|
"mineru_deploy_formula_enable": mineru_deploy_formula_enable,
|
||||||
|
"mineru_deploy_table_enable": mineru_deploy_table_enable,
|
||||||
"mineru_deploy_start_page_id": mineru_deploy_start_page_id,
|
"mineru_deploy_start_page_id": mineru_deploy_start_page_id,
|
||||||
"mineru_deploy_end_page_id": mineru_deploy_end_page_id,
|
"mineru_deploy_end_page_id": mineru_deploy_end_page_id,
|
||||||
"mineru_deploy_lang_list": mineru_deploy_lang_list,
|
"mineru_deploy_lang_list": mineru_deploy_lang_list,
|
||||||
@@ -2531,4 +2537,4 @@ def run_app(host=None, port: int | None = None, enable_CORS=False,
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
run_app()
|
run_app()
|
||||||
@@ -2,7 +2,7 @@
|
|||||||
# SPDX-License-Identifier: MPL-2.0
|
# SPDX-License-Identifier: MPL-2.0
|
||||||
import asyncio
|
import asyncio
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import Literal, Hashable
|
from typing import Literal, Hashable, List
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
|
|
||||||
@@ -16,24 +16,42 @@ from docutranslate.utils.markdown_utils import embed_inline_image_from_zip
|
|||||||
@dataclass(kw_only=True)
|
@dataclass(kw_only=True)
|
||||||
class ConverterMineruDeployConfig(X2MarkdownConverterConfig):
|
class ConverterMineruDeployConfig(X2MarkdownConverterConfig):
|
||||||
base_url: str = "http://127.0.0.1:8000"
|
base_url: str = "http://127.0.0.1:8000"
|
||||||
output_dir: str = "./output" # 覆盖默认值 ./output
|
output_dir: str = "./output"
|
||||||
lang_list: list[Literal["ch", "ch_server", "ch_lite", "en", "korean", "japan", "chinese_cht", "ta", "te", "ka", "th", "el", "latin", "arabic", "east_slavic", "cyrillic", "devanagari"]] | None = None
|
# 支持的语言列表 (来自 MinerU API)
|
||||||
backend: Literal["pipeline", "vlm-transformers", "vlm-mlx-engine", "vlm-vllm-async-engine", "vlm-lmdeploy-engine","vlm-http-client"] = "pipeline"
|
lang_list: List[str] | None = None # None or an empty list falls back to ["ch"] when the request form data is built
|
||||||
# parse_method: str = "auto"
|
|
||||||
|
# 后端引擎选项 (更新适配最新的 MinerU API)
|
||||||
|
backend: Literal[
|
||||||
|
"pipeline",
|
||||||
|
"vlm-auto-engine",
|
||||||
|
"vlm-http-client",
|
||||||
|
"hybrid-auto-engine",
|
||||||
|
"hybrid-http-client"
|
||||||
|
] = "hybrid-auto-engine"
|
||||||
|
|
||||||
|
parse_method: Literal["auto", "txt", "ocr"] = "auto"
|
||||||
formula_enable: bool = True
|
formula_enable: bool = True
|
||||||
# table_enable: bool = True
|
table_enable: bool = True
|
||||||
server_url: str | None = None #(Adapted only for vlm-http-client backend)openai compatible server url, e.g., http://127.0.0.1:30000
|
|
||||||
# return_md: bool = True
|
# 用于 vlm-http-client 或 hybrid-http-client 后端
|
||||||
# return_middle_json: bool = True
|
server_url: str | None = None
|
||||||
# return_model_output: bool = False
|
|
||||||
# return_content_list: bool = False
|
# 返回选项
|
||||||
# return_images: bool = True
|
return_md: bool = True
|
||||||
# response_format_zip: bool = True
|
return_middle_json: bool = False
|
||||||
|
return_model_output: bool = False
|
||||||
|
return_content_list: bool = False
|
||||||
|
return_images: bool = True
|
||||||
|
response_format_zip: bool = True
|
||||||
|
|
||||||
|
# 页面范围
|
||||||
start_page_id: int = 0
|
start_page_id: int = 0
|
||||||
end_page_id: int = 99999
|
end_page_id: int = 99999
|
||||||
|
|
||||||
def gethash(self) ->Hashable:
|
def gethash(self) -> Hashable:
|
||||||
return (self.backend,self.formula_enable,self.start_page_id,self.end_page_id)
|
return (self.backend, self.formula_enable, self.table_enable,
|
||||||
|
self.parse_method, self.start_page_id, self.end_page_id)
|
||||||
|
|
||||||
|
|
||||||
# 配置HTTP客户端
|
# 配置HTTP客户端
|
||||||
timeout = httpx.Timeout(
|
timeout = httpx.Timeout(
|
||||||
@@ -57,27 +75,37 @@ class ConverterMineruDeploy(X2MarkdownConverter):
|
|||||||
|
|
||||||
self._api_url = f"{self.base_url}/file_parse"
|
self._api_url = f"{self.base_url}/file_parse"
|
||||||
|
|
||||||
def _build_form_data(self)->dict:
|
def _build_form_data(self) -> dict:
|
||||||
|
# httpx 在处理 data 参数时,如果值为 list,会自动展开为多个同名 key (例如 lang_list=ch&lang_list=en)
|
||||||
|
# 这符合 FastAPI/Starlette 对 List 字段的解析要求
|
||||||
data = {
|
data = {
|
||||||
"output_dir": self.config.output_dir,
|
"output_dir": self.config.output_dir,
|
||||||
"backend": self.config.backend,
|
"backend": self.config.backend,
|
||||||
"parse_method": "auto",
|
"parse_method": self.config.parse_method,
|
||||||
"formula_enable": self.config.formula_enable,
|
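# Booleans travel as the lowercase strings 'true'/'false' in multipart/form-data; httpx would also serialize Python bools itself, the explicit conversion below just makes the wire format unambiguous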
|
||||||
"table_enable": True,
|
"formula_enable": str(self.config.formula_enable).lower(),
|
||||||
"server_url": None,
|
"table_enable": str(self.config.table_enable).lower(),
|
||||||
"return_md": True,
|
"return_md": str(self.config.return_md).lower(),
|
||||||
"return_middle_json": True,
|
"return_middle_json": str(self.config.return_middle_json).lower(),
|
||||||
"return_model_output": False,
|
"return_model_output": str(self.config.return_model_output).lower(),
|
||||||
"return_content_list": False,
|
"return_content_list": str(self.config.return_content_list).lower(),
|
||||||
"return_images": True,
|
"return_images": str(self.config.return_images).lower(),
|
||||||
"response_format_zip": True,
|
"response_format_zip": str(self.config.response_format_zip).lower(),
|
||||||
"start_page_id": self.config.start_page_id,
|
"start_page_id": self.config.start_page_id,
|
||||||
"end_page_id": self.config.end_page_id
|
"end_page_id": self.config.end_page_id
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if self.config.lang_list:
|
||||||
|
data["lang_list"] = self.config.lang_list
|
||||||
|
else:
|
||||||
|
data["lang_list"] = ["ch"] # 默认值
|
||||||
|
|
||||||
|
if self.config.server_url:
|
||||||
|
data["server_url"] = self.config.server_url
|
||||||
|
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
def convert(self, d: Document) -> MarkdownDocument:
|
||||||
def convert(self,d:Document)->MarkdownDocument:
|
|
||||||
self.logger.info("开始解析文件")
|
self.logger.info("开始解析文件")
|
||||||
files = [("files", (d.name, d.content, "application/octet-stream"))]
|
files = [("files", (d.name, d.content, "application/octet-stream"))]
|
||||||
response = client.post(
|
response = client.post(
|
||||||
@@ -88,15 +116,15 @@ class ConverterMineruDeploy(X2MarkdownConverter):
|
|||||||
)
|
)
|
||||||
|
|
||||||
response.raise_for_status() # 检查是否有错误
|
response.raise_for_status() # 检查是否有错误
|
||||||
md=embed_inline_image_from_zip(response.content,None)
|
# Mineru API 返回 zip 时包含图片和 md
|
||||||
|
md = embed_inline_image_from_zip(response.content, None)
|
||||||
self.logger.info("已转化为markdown")
|
self.logger.info("已转化为markdown")
|
||||||
return MarkdownDocument.from_bytes(md.encode(),suffix=".md",stem=d.stem)
|
return MarkdownDocument.from_bytes(md.encode(), suffix=".md", stem=d.stem)
|
||||||
|
|
||||||
|
|
||||||
async def convert_async(self, d: Document) -> MarkdownDocument:
|
async def convert_async(self, d: Document) -> MarkdownDocument:
|
||||||
self.logger.info("开始解析文件")
|
self.logger.info("开始解析文件")
|
||||||
files = [("files", (d.name, d.content, "application/octet-stream"))]
|
files = [("files", (d.name, d.content, "application/octet-stream"))]
|
||||||
response =await client_async.post(
|
response = await client_async.post(
|
||||||
self._api_url,
|
self._api_url,
|
||||||
files=files,
|
files=files,
|
||||||
data=self._build_form_data(),
|
data=self._build_form_data(),
|
||||||
@@ -104,15 +132,9 @@ class ConverterMineruDeploy(X2MarkdownConverter):
|
|||||||
)
|
)
|
||||||
|
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
md = await asyncio.to_thread(embed_inline_image_from_zip,response.content, None)
|
md = await asyncio.to_thread(embed_inline_image_from_zip, response.content, None)
|
||||||
self.logger.info("已转化为markdown")
|
self.logger.info("已转化为markdown")
|
||||||
return MarkdownDocument.from_bytes(md.encode(), suffix=".md", stem=d.stem)
|
return MarkdownDocument.from_bytes(md.encode(), suffix=".md", stem=d.stem)
|
||||||
|
|
||||||
def support_format(self) -> list[str]:
|
def support_format(self) -> list[str]:
|
||||||
return [".pdf", ".doc", ".docx", ".ppt", ".pptx", ".png", ".jpg", ".jpeg"]
|
return [".pdf", ".doc", ".docx", ".ppt", ".pptx", ".png", ".jpg", ".jpeg"]
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
d = Document.from_path(r"C:\Users\jxgm\Desktop\testfiles\table.pdf")
|
|
||||||
config=ConverterMineruDeployConfig()
|
|
||||||
converter = ConverterMineruDeploy(config=config)
|
|
||||||
converter.convert(d)
|
|
||||||
@@ -252,17 +252,27 @@ class MarkdownWorkflowParams(BaseWorkflowParams):
|
|||||||
"http://127.0.0.1:8000",
|
"http://127.0.0.1:8000",
|
||||||
description="[仅当 convert_engine='mineru_deploy'] 本地部署的 MinerU 服务地址。",
|
description="[仅当 convert_engine='mineru_deploy'] 本地部署的 MinerU 服务地址。",
|
||||||
)
|
)
|
||||||
|
# --- UPDATED BACKEND LIST ---
|
||||||
mineru_deploy_backend: Literal[
|
mineru_deploy_backend: Literal[
|
||||||
"pipeline",
|
"pipeline",
|
||||||
"vlm-transformers",
|
"vlm-auto-engine",
|
||||||
"vlm-mlx-engine",
|
|
||||||
"vlm-vllm-async-engine",
|
|
||||||
"vlm-lmdeploy-engine",
|
|
||||||
"vlm-http-client",
|
"vlm-http-client",
|
||||||
|
"hybrid-auto-engine",
|
||||||
|
"hybrid-http-client"
|
||||||
] = Field(
|
] = Field(
|
||||||
"pipeline",
|
"hybrid-auto-engine",
|
||||||
description="[仅当 convert_engine='mineru_deploy'] 本地部署的 MinerU 服务使用的后端。",
|
description="[仅当 convert_engine='mineru_deploy'] 本地部署的 MinerU 服务使用的后端。",
|
||||||
)
|
)
|
||||||
|
# --- NEW PARAMETERS START ---
|
||||||
|
mineru_deploy_parse_method: Literal["auto", "txt", "ocr"] = Field(
|
||||||
|
"auto",
|
||||||
|
description="[仅当 convert_engine='mineru_deploy'] 解析方法: auto, txt, ocr"
|
||||||
|
)
|
||||||
|
mineru_deploy_table_enable: bool = Field(
|
||||||
|
True,
|
||||||
|
description="[仅当 convert_engine='mineru_deploy'] 本地部署的服务是否启用表格解析。",
|
||||||
|
)
|
||||||
|
# --- NEW PARAMETERS END ---
|
||||||
mineru_deploy_formula_enable: bool = Field(
|
mineru_deploy_formula_enable: bool = Field(
|
||||||
True,
|
True,
|
||||||
description="[仅当 convert_engine='mineru_deploy'] 本地部署的服务是否启用公式解析。",
|
description="[仅当 convert_engine='mineru_deploy'] 本地部署的服务是否启用公式解析。",
|
||||||
@@ -275,13 +285,13 @@ class MarkdownWorkflowParams(BaseWorkflowParams):
|
|||||||
)
|
)
|
||||||
mineru_deploy_lang_list: Optional[List[str]] = Field(
|
mineru_deploy_lang_list: Optional[List[str]] = Field(
|
||||||
None,
|
None,
|
||||||
description="[仅当 convert_engine='mineru_deploy' 且 backend='pipeline'] 语言列表。",
|
description="[仅当 convert_engine='mineru_deploy'] 语言列表, 默认 ['ch']。",
|
||||||
examples=[None],
|
examples=[["ch", "en"]],
|
||||||
)
|
)
|
||||||
# 修改: 默认值改为 ""
|
# 修改: 默认值改为 ""
|
||||||
mineru_deploy_server_url: Optional[str] = Field(
|
mineru_deploy_server_url: Optional[str] = Field(
|
||||||
default="",
|
default="",
|
||||||
description="[仅当 convert_engine='mineru_deploy' 且 backend='vlm-http-client'] Server URL.",
|
description="[仅当 convert_engine='mineru_deploy' 且 backend为http-client相关时] Server URL.",
|
||||||
)
|
)
|
||||||
|
|
||||||
@model_validator(mode="after")
|
@model_validator(mode="after")
|
||||||
@@ -312,10 +322,6 @@ class TextWorkflowParams(BaseWorkflowParams):
|
|||||||
"\n",
|
"\n",
|
||||||
description="当 insert_mode 为 'append' 或 'prepend' 时,用于分隔原文和译文的分隔符。",
|
description="当 insert_mode 为 'append' 或 'prepend' 时,用于分隔原文和译文的分隔符。",
|
||||||
)
|
)
|
||||||
segment_mode: Literal["line", "paragraph", "none"] = Field(
|
|
||||||
"line",
|
|
||||||
description="分段模式。'line':按行分段(每行独立翻译),'paragraph':按段落分段(连续非空行合并为段落),'none':不分段(全文视为一个段落)。",
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class JsonWorkflowParams(BaseWorkflowParams):
|
class JsonWorkflowParams(BaseWorkflowParams):
|
||||||
|
|||||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user