diff --git a/docutranslate/converter/x2md/converter_mineru_deploy.py b/docutranslate/converter/x2md/converter_mineru_deploy.py
new file mode 100644
index 0000000..8db5bd8
--- /dev/null
+++ b/docutranslate/converter/x2md/converter_mineru_deploy.py
@@ -0,0 +1,171 @@
+# SPDX-FileCopyrightText: 2025 QinHan
+# SPDX-License-Identifier: MPL-2.0
+import asyncio
+from dataclasses import dataclass
+from typing import Literal, Hashable
+
+import httpx
+
+from docutranslate.converter.x2md.base import X2MarkdownConverter, X2MarkdownConverterConfig
+from docutranslate.ir.attachment_manager import AttachMent
+from docutranslate.ir.document import Document
+from docutranslate.ir.markdown_document import MarkdownDocument
+from docutranslate.utils.markdown_utils import embed_inline_image_from_zip
+
+
+@dataclass(kw_only=True)
+class ConverterMineruDeployConfig(X2MarkdownConverterConfig):
+    """Options for a self-hosted MinerU server reached through ``/file_parse``.
+
+    ``base_url`` selects the server; the remaining fields are forwarded as
+    form fields of the parse request, except that fields left as ``None``
+    are not sent.
+    """
+    base_url: str = "http://127.0.0.1:8000"
+    output_dir: str = "./output"
+    lang_list: list[str] | None = None  # optional
+    backend: Literal["pipeline", "vlm"] = "pipeline"
+    parse_method: str = "auto"
+    formula_enable: bool = True
+    table_enable: bool = True
+    server_url: str | None = None  # optional
+    return_md: bool = True
+    return_middle_json: bool = True
+    return_model_output: bool = False
+    return_content_list: bool = False
+    return_images: bool = True
+    response_format_zip: bool = True
+    start_page_id: int = 0
+    end_page_id: int = 99999
+
+    def gethash(self) -> Hashable:
+        """Return a hashable key identifying the options that shape the output.
+
+        Besides the backend and the formula/table switches, the parse
+        method, language list and page range also select what is parsed,
+        so two configs differing in any of them must not share a key.
+        NOTE(review): presumably consumed as a cache key by the workflow;
+        confirm against the caller.
+        """
+        lang_key = None if self.lang_list is None else tuple(self.lang_list)
+        return (
+            self.backend,
+            self.formula_enable,
+            self.table_enable,
+            self.parse_method,
+            lang_key,
+            self.start_page_id,
+            self.end_page_id,
+        )
+
+
+# HTTP clients shared by all converter instances.
+timeout = httpx.Timeout(
+    connect=5.0,
+    read=1800.0,  # a local deployment may need a long time to process a file
+    write=300.0,
+    pool=1.0
+)
+
+limits = httpx.Limits(max_connections=500, max_keepalive_connections=20)
+# NOTE(review): verify=False turns off TLS certificate verification; confirm
+# this is intended when base_url points at an https endpoint.
+client = httpx.Client(limits=limits, trust_env=False, timeout=timeout, proxy=None, verify=False)
+client_async = httpx.AsyncClient(limits=limits, trust_env=False, timeout=timeout, proxy=None, verify=False)
+
+
+class ConverterMineruDeploy(X2MarkdownConverter):
+    """Convert documents to Markdown via a MinerU server's ``/file_parse`` endpoint."""
+
+    def __init__(self, config: ConverterMineruDeployConfig):
+        super().__init__(config=config)
+        self.base_url = config.base_url.rstrip('/')
+        self.config = config
+        self.attachments: list[AttachMent] = []
+
+        self._api_url = f"{self.base_url}/file_parse"
+
+    def _build_form_data(self) -> dict:
+        """Build the form fields sent alongside the uploaded file.
+
+        Options left as ``None`` (``lang_list``, ``server_url``) are dropped
+        so that no empty field is sent for them.
+        """
+        data = {
+            "output_dir": self.config.output_dir,
+            "lang_list": self.config.lang_list,
+            "backend": self.config.backend,
+            "parse_method": self.config.parse_method,
+            "formula_enable": self.config.formula_enable,
+            "table_enable": self.config.table_enable,
+            "server_url": self.config.server_url,
+            "return_md": self.config.return_md,
+            "return_middle_json": self.config.return_middle_json,
+            "return_model_output": self.config.return_model_output,
+            "return_content_list": self.config.return_content_list,
+            "return_images": self.config.return_images,
+            "response_format_zip": self.config.response_format_zip,
+            "start_page_id": self.config.start_page_id,
+            "end_page_id": self.config.end_page_id,
+        }
+        return {name: value for name, value in data.items() if value is not None}
+
+    def convert(self, d: Document) -> MarkdownDocument:
+        """Upload ``d`` and return the parsed result as a Markdown document.
+
+        Raises:
+            httpx.HTTPStatusError: the server answered with a 4xx/5xx status.
+        """
+        self.logger.info("开始解析文件")
+        files = [("files", (d.name, d.content, "application/octet-stream"))]
+        # No per-request timeout: a bare number would replace the client's
+        # Timeout for every phase, including the 5 s connect limit.
+        response = client.post(
+            self._api_url,
+            files=files,
+            data=self._build_form_data(),
+        )
+
+        response.raise_for_status()
+        md = embed_inline_image_from_zip(response.content, None)
+        self.logger.info("已转化为markdown")
+        return MarkdownDocument.from_bytes(md.encode(), suffix=".md", stem=d.stem)
+
+    async def convert_async(self, d: Document) -> MarkdownDocument:
+        """Asynchronous counterpart of ``convert``.
+
+        The response post-processing runs in a worker thread via
+        ``asyncio.to_thread``.
+
+        Raises:
+            httpx.HTTPStatusError: the server answered with a 4xx/5xx status.
+        """
+        self.logger.info("开始解析文件")
+        files = [("files", (d.name, d.content, "application/octet-stream"))]
+        # The client's Timeout applies here as well; no per-request override.
+        response = await client_async.post(
+            self._api_url,
+            files=files,
+            data=self._build_form_data(),
+        )
+
+        response.raise_for_status()
+        md = await asyncio.to_thread(embed_inline_image_from_zip, response.content, None)
+        self.logger.info("已转化为markdown")
+        return MarkdownDocument.from_bytes(md.encode(), suffix=".md", stem=d.stem)
+
+    def support_format(self) -> list[str]:
+        """Return the supported file suffixes."""
+        return [".pdf", ".doc", ".docx", ".ppt", ".pptx", ".png", ".jpg", ".jpeg"]
+
+
+if __name__ == '__main__':
+    # Manual smoke test: pass the document to convert as the only argument.
+    import sys
+
+    if len(sys.argv) != 2:
+        raise SystemExit(f"usage: {sys.argv[0]} DOCUMENT_PATH")
+    d = Document.from_path(sys.argv[1])
+    config = ConverterMineruDeployConfig()
+    converter = ConverterMineruDeploy(config=config)
+    converter.convert(d)
diff --git a/docutranslate/exporter/md/types.py b/docutranslate/exporter/md/types.py index 78a0f1c..0ec5f6f 100644 --- a/docutranslate/exporter/md/types.py +++ b/docutranslate/exporter/md/types.py @@ -2,4 +2,4 @@ # SPDX-License-Identifier: MPL-2.0 from typing import Literal -ConvertEngineType = Literal["mineru", "docling", "identity"] \ No newline at end of file +ConvertEngineType = Literal["mineru", "docling", "identity","mineru_deploy"] \ No newline at end of file diff --git a/docutranslate/utils/markdown_utils.py b/docutranslate/utils/markdown_utils.py index f065e77..d0973a7 100644 --- a/docutranslate/utils/markdown_utils.py +++ b/docutranslate/utils/markdown_utils.py @@ -64,6 +64,7 @@ def uris2placeholder(markdown: str, mask_dict: MaskDict): # 整个图片都替换为占位符 mask_dict.set(id, match.group()) + print(f"生成占位符") return f"" uri_pattern = r'(!\[.*?\])\((.*?)\)' @@ -77,6 +78,7 @@ def placeholder2uris(markdown: str, mask_dict: MaskDict): uri = mask_dict.get(id) if uri is None: return match.group() + print(f"占位符已还原为图片") return uri ph_pattern = r"" diff --git a/docutranslate/workflow/md_based_workflow.py b/docutranslate/workflow/md_based_workflow.py index df8382d..b5cc590 100644 --- a/docutranslate/workflow/md_based_workflow.py +++ b/docutranslate/workflow/md_based_workflow.py @@ -6,6 +6,7 @@ from pathlib import Path from typing import Self, Tuple, Type from docutranslate.cacher import md_based_convert_cacher +from docutranslate.converter.x2md.converter_mineru_deploy import ConverterMineruDeploy, ConverterMineruDeployConfig from docutranslate.exporter.base import ExporterConfig 
from docutranslate.global_values.conditional_import import DOCLING_EXIST from docutranslate.glossary.glossary import Glossary @@ -42,8 +43,10 @@ class MarkdownBasedWorkflow(Workflow[MarkdownBasedWorkflowConfig, Document, Mark ConvertEngineType, Tuple[Type[X2MarkdownConverter | ConverterIdentity], Type[ X2MarkdownConverterConfig]] | None] = { "mineru": (ConverterMineru, ConverterMineruConfig), - "identity": (ConverterIdentity, None) + "identity": (ConverterIdentity, None), + "mineru_deploy": (ConverterMineruDeploy, ConverterMineruDeployConfig) } + if DOCLING_EXIST: _converter_factory["docling"] = (ConverterDocling, ConverterDoclingConfig) diff --git a/uv.lock b/uv.lock index 8cf2db4..bf24594 100644 --- a/uv.lock +++ b/uv.lock @@ -2,8 +2,12 @@ version = 1 revision = 1 requires-python = ">=3.11" resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version < '3.12'", + "python_full_version >= '3.13' and sys_platform == 'win32'", + "python_full_version >= '3.13' and sys_platform != 'win32'", + "python_full_version == '3.12.*' and sys_platform == 'win32'", + "python_full_version == '3.12.*' and sys_platform != 'win32'", + "python_full_version < '3.12' and sys_platform == 'win32'", + "python_full_version < '3.12' and sys_platform != 'win32'", ] [[package]] @@ -941,22 +945,18 @@ wheels = [ [[package]] name = "multiprocess" -version = "0.70.18" +version = "0.70.16" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "dill" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/72/fd/2ae3826f5be24c6ed87266bc4e59c46ea5b059a103f3d7e7eb76a52aeecb/multiprocess-0.70.18.tar.gz", hash = "sha256:f9597128e6b3e67b23956da07cf3d2e5cba79e2f4e0fba8d7903636663ec6d0d", size = 1798503 } +sdist = { url = "https://files.pythonhosted.org/packages/b5/ae/04f39c5d0d0def03247c2893d6f2b83c136bf3320a2154d7b8858f2ba72d/multiprocess-0.70.16.tar.gz", hash = "sha256:161af703d4652a0e1410be6abccecde4a7ddffd19341be0a7011b94aeb171ac1", size = 1772603 } 
wheels = [ - { url = "https://files.pythonhosted.org/packages/55/4d/9af0d1279c84618bcd35bf5fd7e371657358c7b0a523e54a9cffb87461f8/multiprocess-0.70.18-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:8b8940ae30139e04b076da6c5b83e9398585ebdf0f2ad3250673fef5b2ff06d6", size = 144695 }, - { url = "https://files.pythonhosted.org/packages/17/bf/87323e79dd0562474fad3373c21c66bc6c3c9963b68eb2a209deb4c8575e/multiprocess-0.70.18-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:0929ba95831adb938edbd5fb801ac45e705ecad9d100b3e653946b7716cb6bd3", size = 144742 }, - { url = "https://files.pythonhosted.org/packages/dd/74/cb8c831e58dc6d5cf450b17c7db87f14294a1df52eb391da948b5e0a0b94/multiprocess-0.70.18-pp311-pypy311_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4d77f8e4bfe6c6e2e661925bbf9aed4d5ade9a1c6502d5dfc10129b9d1141797", size = 144745 }, - { url = "https://files.pythonhosted.org/packages/ba/d8/0cba6cf51a1a31f20471fbc823a716170c73012ddc4fb85d706630ed6e8f/multiprocess-0.70.18-py310-none-any.whl", hash = "sha256:60c194974c31784019c1f459d984e8f33ee48f10fcf42c309ba97b30d9bd53ea", size = 134948 }, - { url = "https://files.pythonhosted.org/packages/4b/88/9039f2fed1012ef584751d4ceff9ab4a51e5ae264898f0b7cbf44340a859/multiprocess-0.70.18-py311-none-any.whl", hash = "sha256:5aa6eef98e691281b3ad923be2832bf1c55dd2c859acd73e5ec53a66aae06a1d", size = 144462 }, - { url = "https://files.pythonhosted.org/packages/bf/b6/5f922792be93b82ec6b5f270bbb1ef031fd0622847070bbcf9da816502cc/multiprocess-0.70.18-py312-none-any.whl", hash = "sha256:9b78f8e5024b573730bfb654783a13800c2c0f2dfc0c25e70b40d184d64adaa2", size = 150287 }, - { url = "https://files.pythonhosted.org/packages/ee/25/7d7e78e750bc1aecfaf0efbf826c69a791d2eeaf29cf20cba93ff4cced78/multiprocess-0.70.18-py313-none-any.whl", hash = "sha256:871743755f43ef57d7910a38433cfe41319e72be1bbd90b79c7a5ac523eb9334", size = 151917 }, - { url = 
"https://files.pythonhosted.org/packages/3b/c3/ca84c19bd14cdfc21c388fdcebf08b86a7a470ebc9f5c3c084fc2dbc50f7/multiprocess-0.70.18-py38-none-any.whl", hash = "sha256:dbf705e52a154fe5e90fb17b38f02556169557c2dd8bb084f2e06c2784d8279b", size = 132636 }, - { url = "https://files.pythonhosted.org/packages/6c/28/dd72947e59a6a8c856448a5e74da6201cb5502ddff644fbc790e4bd40b9a/multiprocess-0.70.18-py39-none-any.whl", hash = "sha256:e78ca805a72b1b810c690b6b4cc32579eba34f403094bbbae962b7b5bf9dfcb8", size = 133478 }, + { url = "https://files.pythonhosted.org/packages/bc/f7/7ec7fddc92e50714ea3745631f79bd9c96424cb2702632521028e57d3a36/multiprocess-0.70.16-py310-none-any.whl", hash = "sha256:c4a9944c67bd49f823687463660a2d6daae94c289adff97e0f9d696ba6371d02", size = 134824 }, + { url = "https://files.pythonhosted.org/packages/50/15/b56e50e8debaf439f44befec5b2af11db85f6e0f344c3113ae0be0593a91/multiprocess-0.70.16-py311-none-any.whl", hash = "sha256:af4cabb0dac72abfb1e794fa7855c325fd2b55a10a44628a3c1ad3311c04127a", size = 143519 }, + { url = "https://files.pythonhosted.org/packages/0a/7d/a988f258104dcd2ccf1ed40fdc97e26c4ac351eeaf81d76e266c52d84e2f/multiprocess-0.70.16-py312-none-any.whl", hash = "sha256:fc0544c531920dde3b00c29863377f87e1632601092ea2daca74e4beb40faa2e", size = 146741 }, + { url = "https://files.pythonhosted.org/packages/ea/89/38df130f2c799090c978b366cfdf5b96d08de5b29a4a293df7f7429fa50b/multiprocess-0.70.16-py38-none-any.whl", hash = "sha256:a71d82033454891091a226dfc319d0cfa8019a4e888ef9ca910372a446de4435", size = 132628 }, + { url = "https://files.pythonhosted.org/packages/da/d9/f7f9379981e39b8c2511c9e0326d212accacb82f12fbfdc1aa2ce2a7b2b6/multiprocess-0.70.16-py39-none-any.whl", hash = "sha256:a0bafd3ae1b732eac64be2e72038231c1ba97724b60b09400d68f229fcc2fbf3", size = 133351 }, ] [[package]] @@ -1079,7 +1079,7 @@ name = "nvidia-cudnn-cu12" version = "9.10.2.21" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-cublas-cu12" }, + { name = 
"nvidia-cublas-cu12", marker = "sys_platform != 'win32'" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/ba/51/e123d997aa098c61d029f76663dedbfb9bc8dcf8c60cbd6adbe42f76d049/nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:949452be657fa16687d0930933f032835951ef0892b37d2d53824d1a84dc97a8", size = 706758467 }, @@ -1090,7 +1090,7 @@ name = "nvidia-cufft-cu12" version = "11.3.3.83" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-nvjitlink-cu12" }, + { name = "nvidia-nvjitlink-cu12", marker = "sys_platform != 'win32'" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/1f/13/ee4e00f30e676b66ae65b4f08cb5bcbb8392c03f54f2d5413ea99a5d1c80/nvidia_cufft_cu12-11.3.3.83-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4d2dd21ec0b88cf61b62e6b43564355e5222e4a3fb394cac0db101f2dd0d4f74", size = 193118695 }, @@ -1117,9 +1117,9 @@ name = "nvidia-cusolver-cu12" version = "11.7.3.90" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-cublas-cu12" }, - { name = "nvidia-cusparse-cu12" }, - { name = "nvidia-nvjitlink-cu12" }, + { name = "nvidia-cublas-cu12", marker = "sys_platform != 'win32'" }, + { name = "nvidia-cusparse-cu12", marker = "sys_platform != 'win32'" }, + { name = "nvidia-nvjitlink-cu12", marker = "sys_platform != 'win32'" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/85/48/9a13d2975803e8cf2777d5ed57b87a0b6ca2cc795f9a4f59796a910bfb80/nvidia_cusolver_cu12-11.7.3.90-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:4376c11ad263152bd50ea295c05370360776f8c3427b30991df774f9fb26c450", size = 267506905 }, @@ -1130,7 +1130,7 @@ name = "nvidia-cusparse-cu12" version = "12.5.8.93" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-nvjitlink-cu12" }, + { name = "nvidia-nvjitlink-cu12", marker = "sys_platform != 'win32'" }, ] wheels = [ { url = 
"https://files.pythonhosted.org/packages/c2/f5/e1854cb2f2bcd4280c44736c93550cc300ff4b8c95ebe370d0aa7d2b473d/nvidia_cusparse_cu12-12.5.8.93-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1ec05d76bbbd8b61b06a80e1eaf8cf4959c3d4ce8e711b65ebd0443bb0ebb13b", size = 288216466 }, @@ -2503,7 +2503,7 @@ wheels = [ [[package]] name = "transformers" -version = "4.56.2" +version = "4.57.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "filelock" }, @@ -2517,9 +2517,9 @@ dependencies = [ { name = "tokenizers" }, { name = "tqdm" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/e5/82/0bcfddd134cdf53440becb5e738257cc3cf34cf229d63b57bfd288e6579f/transformers-4.56.2.tar.gz", hash = "sha256:5e7c623e2d7494105c726dd10f6f90c2c99a55ebe86eef7233765abd0cb1c529", size = 9844296 } +sdist = { url = "https://files.pythonhosted.org/packages/d6/68/a39307bcc4116a30b2106f2e689130a48de8bd8a1e635b5e1030e46fcd9e/transformers-4.57.1.tar.gz", hash = "sha256:f06c837959196c75039809636cd964b959f6604b75b8eeec6fdfc0440b89cc55", size = 10142511 } wheels = [ - { url = "https://files.pythonhosted.org/packages/70/26/2591b48412bde75e33bfd292034103ffe41743cacd03120e3242516cd143/transformers-4.56.2-py3-none-any.whl", hash = "sha256:79c03d0e85b26cb573c109ff9eafa96f3c8d4febfd8a0774e8bba32702dd6dde", size = 11608055 }, + { url = "https://files.pythonhosted.org/packages/71/d3/c16c3b3cf7655a67db1144da94b021c200ac1303f82428f2beef6c2e72bb/transformers-4.57.1-py3-none-any.whl", hash = "sha256:b10d05da8fa67dc41644dbbf9bc45a44cb86ae33da6f9295f5fbf5b7890bd267", size = 11990925 }, ] [[package]] @@ -2527,7 +2527,7 @@ name = "triton" version = "3.4.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "setuptools" }, + { name = "setuptools", marker = "sys_platform != 'win32'" }, ] wheels = [ { url = 
"https://files.pythonhosted.org/packages/7d/39/43325b3b651d50187e591eefa22e236b2981afcebaefd4f2fc0ea99df191/triton-3.4.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7b70f5e6a41e52e48cfc087436c8a28c17ff98db369447bcaff3b887a3ab4467", size = 155531138 },