diff --git a/docutranslate/converter/x2md/converter_docling.py b/docutranslate/converter/x2md/converter_docling.py index 6b8d19a..bd30272 100644 --- a/docutranslate/converter/x2md/converter_docling.py +++ b/docutranslate/converter/x2md/converter_docling.py @@ -17,6 +17,7 @@ from docling_core.types.doc import ImageRefMode from huggingface_hub.errors import LocalEntryNotFoundError from docutranslate.converter.x2md.base import X2MarkdownConverter, X2MarkdownConverterConfig +from docutranslate.ir.attachment_manager import AttachMent from docutranslate.ir.document import Document from docutranslate.ir.markdown_document import MarkdownDocument @@ -27,10 +28,10 @@ IMAGE_RESOLUTION_SCALE = 4 class ConverterDoclingConfig(X2MarkdownConverterConfig): code_ocr: bool = True formula_ocr: bool = True - artifact: Path |str| None = None + artifact: Path | str | None = None def gethash(self): - return self.code_ocr,self.formula_ocr + return self.code_ocr, self.formula_ocr class ConverterDocling(X2MarkdownConverter): @@ -44,6 +45,7 @@ class ConverterDocling(X2MarkdownConverter): self.artifact = artifact else: self.artifact = config.artifact + self.attachments: list[AttachMent] = [] def convert(self, document) -> MarkdownDocument: assert isinstance(document.name, str) @@ -52,6 +54,7 @@ class ConverterDocling(X2MarkdownConverter): document_stream = DocumentStream(name=document.name, stream=BytesIO(document.content)) content = self.file2markdown_embed_images(document_stream) self.logger.info(f"已转换为markdown,耗时{time.time() - time1}秒") + self.attachments.append(AttachMent("docling",MarkdownDocument.from_bytes(content=content.encode("utf-8"), suffix=".md", stem="docling"))) md_document = MarkdownDocument.from_bytes(content=content.encode("utf-8"), suffix=".md", stem=document.stem) return md_document diff --git a/docutranslate/converter/x2md/converter_mineru.py b/docutranslate/converter/x2md/converter_mineru.py index c314919..78c2728 100644 --- a/docutranslate/converter/x2md/converter_mineru.py +++ b/docutranslate/converter/x2md/converter_mineru.py @@ -10,6 +10,7 @@ from typing import Hashable, Literal import httpx from docutranslate.converter.x2md.base import X2MarkdownConverter, X2MarkdownConverterConfig +from docutranslate.ir.attachment_manager import AttachMent from docutranslate.ir.document import Document from docutranslate.ir.markdown_document import MarkdownDocument from docutranslate.utils.markdown_utils import embed_inline_image_from_zip @@ -51,6 +52,7 @@ class ConverterMineru(X2MarkdownConverter): self.mineru_token = config.mineru_token.strip() self.formula = config.formula_ocr self.model_version = config.model_version + self.attachments: list[AttachMent] = [] def _get_header(self): return { @@ -136,7 +138,9 @@ class ConverterMineru(X2MarkdownConverter): time1 = time.time() batch_id = self.upload(document) file_url = self.get_file_url(batch_id) - content = get_md_from_zip_url_with_inline_images(zip_url=file_url) + content, mineru_parsed = get_md_from_zip_url_with_inline_images(zip_url=file_url) + if mineru_parsed: + self.attachments.append(AttachMent("mineru",Document.from_bytes(content=mineru_parsed, suffix=".zip", stem="mineru"))) self.logger.info(f"已转换为markdown,耗时{time.time() - time1}秒") md_document = MarkdownDocument.from_bytes(content=content.encode("utf-8"), suffix=".md", stem=document.stem) return md_document @@ -146,7 +150,9 @@ class ConverterMineru(X2MarkdownConverter): time1 = time.time() batch_id = await self.upload_async(document) file_url = await self.get_file_url_async(batch_id) - content = await get_md_from_zip_url_with_inline_images_async(zip_url=file_url) + content, mineru_parsed = await get_md_from_zip_url_with_inline_images_async(zip_url=file_url) + if mineru_parsed: + self.attachments.append(AttachMent("mineru",Document.from_bytes(content=mineru_parsed, suffix=".zip", stem="mineru"))) self.logger.info(f"已转换为markdown,耗时{time.time() - time1}秒") md_document = MarkdownDocument.from_bytes(content=content.encode("utf-8"), suffix=".md", stem=document.stem) return md_document @@ -159,7 +165,7 @@ def get_md_from_zip_url_with_inline_images( zip_url: str, filename_in_zip: str = "full.md", encoding: str = "utf-8" -) -> str: +) -> tuple[str, bytes]: """ 从给定的ZIP文件URL中下载并提取指定文件的内容, 并将Markdown文件中的相对路径图片转换为内联Base64图片。 @@ -169,16 +175,14 @@ def get_md_from_zip_url_with_inline_images( filename_in_zip (str): ZIP压缩包内目标Markdown文件的名称(包括路径)。 默认为 "full.md"。 encoding (str): 目标文件的预期编码。默认为 "utf-8"。 - - Returns: - str | None: 如果成功,返回处理后的Markdown文本内容;否则返回 None。 """ try: print(f"正在从 {zip_url} 下载ZIP文件 (使用 httpx.get)...") response = client.get(zip_url) # 增加超时 response.raise_for_status() print("ZIP文件下载完成。") - return embed_inline_image_from_zip(response.content, filename_in_zip=filename_in_zip, encoding=encoding) + return embed_inline_image_from_zip(response.content, filename_in_zip=filename_in_zip, + encoding=encoding), response.content except httpx.HTTPStatusError as e: @@ -200,7 +204,7 @@ async def get_md_from_zip_url_with_inline_images_async( zip_url: str, filename_in_zip: str = "full.md", encoding: str = "utf-8" -) -> str: +) -> tuple[str, bytes]: """ 从给定的ZIP文件URL中下载并提取指定文件的内容, 并将Markdown文件中的相对路径图片转换为内联Base64图片。 @@ -220,7 +224,7 @@ async def get_md_from_zip_url_with_inline_images_async( response.raise_for_status() print("ZIP文件下载完成。") return await asyncio.to_thread(embed_inline_image_from_zip, response.content, filename_in_zip=filename_in_zip, - encoding=encoding) + encoding=encoding), response.content except httpx.HTTPStatusError as e: diff --git a/docutranslate/ir/attachment.py b/docutranslate/ir/attachment.py deleted file mode 100644 index 79ab151..0000000 --- a/docutranslate/ir/attachment.py +++ /dev/null @@ -1,15 +0,0 @@ -# SPDX-FileCopyrightText: 2025 QinHan -# SPDX-License-Identifier: MPL-2.0 -from typing import Literal - -from docutranslate.ir.document import Document - -AttachMentIdentifier = Literal["glossary"] - - -class AttachMent: - def __init__(self): - self.attachment_dict: dict[AttachMentIdentifier, Document] = {} - - def add_attachment(self, identifier: AttachMentIdentifier, document: Document): - self.attachment_dict[identifier] = document diff --git a/docutranslate/ir/attachment_manager.py b/docutranslate/ir/attachment_manager.py new file mode 100644 index 0000000..650f4b1 --- /dev/null +++ b/docutranslate/ir/attachment_manager.py @@ -0,0 +1,24 @@ +# SPDX-FileCopyrightText: 2025 QinHan +# SPDX-License-Identifier: MPL-2.0 +from typing import Literal + +from docutranslate.ir.document import Document + +AttachMentIdentifier = Literal["glossary", "mineru", "docling", "md_cached"] + + +class AttachMent: + def __init__(self, identifier: AttachMentIdentifier, document: Document): + self.identifier = identifier + self.document = document + + +class AttachMentManager: + def __init__(self): + self.attachment_dict: dict[AttachMentIdentifier, Document] = {} + + def add_document(self, identifier: AttachMentIdentifier, document: Document): + self.attachment_dict[identifier] = document + + def add_attachment(self, attachment: AttachMent): + self.attachment_dict[attachment.identifier] = attachment.document diff --git a/docutranslate/workflow/base.py b/docutranslate/workflow/base.py index e012663..ba42dde 100644 --- a/docutranslate/workflow/base.py +++ b/docutranslate/workflow/base.py @@ -7,7 +7,7 @@ from pathlib import Path from typing import Self, Generic, TypeVar from docutranslate.exporter.base import Exporter -from docutranslate.ir.attachment import AttachMent +from docutranslate.ir.attachment_manager import AttachMentManager from docutranslate.ir.document import Document @@ -27,7 +27,7 @@ class Workflow(ABC, Generic[T_Config, T_original, T_Translated]): self.logger = self.config.logger self.document_original: T_original | None = None self.document_translated: T_Translated | None = None - self.attachment = AttachMent() + self.attachment = AttachMentManager() def read_path(self, path: Path | str) -> Self: document = Document.from_path(path) diff --git a/docutranslate/workflow/docx_workflow.py b/docutranslate/workflow/docx_workflow.py index cb1bf17..1083c19 100644 --- a/docutranslate/workflow/docx_workflow.py +++ b/docutranslate/workflow/docx_workflow.py @@ -39,7 +39,7 @@ class DocxWorkflow(Workflow[DocxWorkflowConfig, Document, Document], HTMLExporta document, translator = self._pre_translate(self.document_original) translator.translate(document) if translator.glossary_dict_gen: - self.attachment.add_attachment("glossary", Glossary.glossary_dict2csv(translator.glossary_dict_gen)) + self.attachment.add_document("glossary", Glossary.glossary_dict2csv(translator.glossary_dict_gen)) self.document_translated = document return self @@ -47,7 +47,7 @@ class DocxWorkflow(Workflow[DocxWorkflowConfig, Document, Document], HTMLExporta document, translator = self._pre_translate(self.document_original) await translator.translate_async(document) if translator.glossary_dict_gen: - self.attachment.add_attachment("glossary", Glossary.glossary_dict2csv(translator.glossary_dict_gen)) + self.attachment.add_document("glossary", Glossary.glossary_dict2csv(translator.glossary_dict_gen)) self.document_translated = document return self diff --git a/docutranslate/workflow/epub_workflow.py b/docutranslate/workflow/epub_workflow.py index d0b699a..bc0940f 100644 --- a/docutranslate/workflow/epub_workflow.py +++ b/docutranslate/workflow/epub_workflow.py @@ -40,7 +40,7 @@ class EpubWorkflow(Workflow[EpubWorkflowConfig, Document, Document], HTMLExporta document, translator = self._pre_translate(self.document_original) translator.translate(document) if translator.glossary_dict_gen: - self.attachment.add_attachment("glossary", Glossary.glossary_dict2csv(translator.glossary_dict_gen)) + self.attachment.add_document("glossary", Glossary.glossary_dict2csv(translator.glossary_dict_gen)) self.document_translated = document return self @@ -48,7 +48,7 @@ class EpubWorkflow(Workflow[EpubWorkflowConfig, Document, Document], HTMLExporta document, translator = self._pre_translate(self.document_original) await translator.translate_async(document) if translator.glossary_dict_gen: - self.attachment.add_attachment("glossary", Glossary.glossary_dict2csv(translator.glossary_dict_gen)) + self.attachment.add_document("glossary", Glossary.glossary_dict2csv(translator.glossary_dict_gen)) self.document_translated = document return self diff --git a/docutranslate/workflow/html_workflow.py b/docutranslate/workflow/html_workflow.py index 52b8070..44f22f1 100644 --- a/docutranslate/workflow/html_workflow.py +++ b/docutranslate/workflow/html_workflow.py @@ -38,7 +38,7 @@ class HtmlWorkflow(Workflow[HtmlWorkflowConfig, Document, Document], HTMLExporta document, translator = self._pre_translate(self.document_original) translator.translate(document) if translator.glossary_dict_gen: - self.attachment.add_attachment("glossary", Glossary.glossary_dict2csv(translator.glossary_dict_gen)) + self.attachment.add_document("glossary", Glossary.glossary_dict2csv(translator.glossary_dict_gen)) self.document_translated = document return self @@ -46,7 +46,7 @@ class HtmlWorkflow(Workflow[HtmlWorkflowConfig, Document, Document], HTMLExporta document, translator = self._pre_translate(self.document_original) await translator.translate_async(document) if translator.glossary_dict_gen: - self.attachment.add_attachment("glossary", Glossary.glossary_dict2csv(translator.glossary_dict_gen)) + self.attachment.add_document("glossary", Glossary.glossary_dict2csv(translator.glossary_dict_gen)) self.document_translated = document return self diff --git a/docutranslate/workflow/json_workflow.py b/docutranslate/workflow/json_workflow.py index e89f596..d374284 100644 --- a/docutranslate/workflow/json_workflow.py +++ b/docutranslate/workflow/json_workflow.py @@ -39,7 +39,7 @@ class JsonWorkflow(Workflow[JsonWorkflowConfig, Document, Document], HTMLExporta document, translator = self._pre_translate(self.document_original) translator.translate(document) if translator.glossary_dict_gen: - self.attachment.add_attachment("glossary", Glossary.glossary_dict2csv(translator.glossary_dict_gen)) + self.attachment.add_document("glossary", Glossary.glossary_dict2csv(translator.glossary_dict_gen)) self.document_translated = document return self @@ -47,7 +47,7 @@ class JsonWorkflow(Workflow[JsonWorkflowConfig, Document, Document], HTMLExporta document, translator = self._pre_translate(self.document_original) await translator.translate_async(document) if translator.glossary_dict_gen: - self.attachment.add_attachment("glossary", Glossary.glossary_dict2csv(translator.glossary_dict_gen)) + self.attachment.add_document("glossary", Glossary.glossary_dict2csv(translator.glossary_dict_gen)) self.document_translated = document return self diff --git a/docutranslate/workflow/md_based_workflow.py b/docutranslate/workflow/md_based_workflow.py index 31cfff8..3055652 100644 --- a/docutranslate/workflow/md_based_workflow.py +++ b/docutranslate/workflow/md_based_workflow.py @@ -62,6 +62,7 @@ class MarkdownBasedWorkflow(Workflow[MarkdownBasedWorkflowConfig, Document, Mark document_cached = md_based_convert_cacher.get_cached_result(self.document_original, convert_engin, convert_config) if document_cached: + self.attachment.add_document("md_cached",document_cached) return document_cached # 未缓存则解析文件 @@ -74,8 +75,12 @@ class MarkdownBasedWorkflow(Workflow[MarkdownBasedWorkflowConfig, Document, Mark else: raise ValueError(f"不存在{convert_engin}解析引擎") document_md = converter.convert(self.document_original) + if hasattr(converter,"attachments"): + for attachment in converter.attachments: + self.attachment.add_attachment(attachment) # 获取缓存解析后文件 md_based_convert_cacher.cache_result(document_md, self.document_original, convert_engin, convert_config) + return document_md def _pre_translate(self, document: Document): @@ -90,7 +95,7 @@ class MarkdownBasedWorkflow(Workflow[MarkdownBasedWorkflowConfig, Document, Mark document_md = self._get_document_md(convert_engine, convert_config) translator.translate(document_md) if translator.glossary_dict_gen: - self.attachment.add_attachment("glossary", Glossary.glossary_dict2csv(translator.glossary_dict_gen)) + self.attachment.add_document("glossary", Glossary.glossary_dict2csv(translator.glossary_dict_gen)) self.document_translated = document_md return self @@ -99,7 +104,7 @@ class MarkdownBasedWorkflow(Workflow[MarkdownBasedWorkflowConfig, Document, Mark document_md = await asyncio.to_thread(self._get_document_md, convert_engine, convert_config) await translator.translate_async(document_md) if translator.glossary_dict_gen: - self.attachment.add_attachment("glossary", Glossary.glossary_dict2csv(translator.glossary_dict_gen)) + self.attachment.add_document("glossary", Glossary.glossary_dict2csv(translator.glossary_dict_gen)) self.document_translated = document_md return self diff --git a/docutranslate/workflow/srt_workflow.py b/docutranslate/workflow/srt_workflow.py index c055907..81776ef 100644 --- a/docutranslate/workflow/srt_workflow.py +++ b/docutranslate/workflow/srt_workflow.py @@ -40,7 +40,7 @@ class SrtWorkflow(Workflow[SrtWorkflowConfig, Document, Document], HTMLExportabl document, translator=self._pre_translate(self.document_original) translator.translate(document) if translator.glossary_dict_gen: - self.attachment.add_attachment("glossary", Glossary.glossary_dict2csv(translator.glossary_dict_gen)) + self.attachment.add_document("glossary", Glossary.glossary_dict2csv(translator.glossary_dict_gen)) self.document_translated = document return self @@ -48,7 +48,7 @@ class SrtWorkflow(Workflow[SrtWorkflowConfig, Document, Document], HTMLExportabl document, translator = self._pre_translate(self.document_original) await translator.translate_async(document) if translator.glossary_dict_gen: - self.attachment.add_attachment("glossary", Glossary.glossary_dict2csv(translator.glossary_dict_gen)) + self.attachment.add_document("glossary", Glossary.glossary_dict2csv(translator.glossary_dict_gen)) self.document_translated = document return self diff --git a/docutranslate/workflow/txt_workflow.py b/docutranslate/workflow/txt_workflow.py index 27bd1ea..fe225e1 100644 --- a/docutranslate/workflow/txt_workflow.py +++ b/docutranslate/workflow/txt_workflow.py @@ -40,7 +40,7 @@ class TXTWorkflow(Workflow[TXTWorkflowConfig, Document, Document], HTMLExportabl document, translator=self._pre_translate(self.document_original) translator.translate(document) if translator.glossary_dict_gen: - self.attachment.add_attachment("glossary", Glossary.glossary_dict2csv(translator.glossary_dict_gen)) + self.attachment.add_document("glossary", Glossary.glossary_dict2csv(translator.glossary_dict_gen)) self.document_translated = document return self @@ -48,7 +48,7 @@ class TXTWorkflow(Workflow[TXTWorkflowConfig, Document, Document], HTMLExportabl document, translator = self._pre_translate(self.document_original) await translator.translate_async(document) if translator.glossary_dict_gen: - self.attachment.add_attachment("glossary", Glossary.glossary_dict2csv(translator.glossary_dict_gen)) + self.attachment.add_document("glossary", Glossary.glossary_dict2csv(translator.glossary_dict_gen)) self.document_translated = document return self diff --git a/docutranslate/workflow/xlsx_workflow.py b/docutranslate/workflow/xlsx_workflow.py index fb26471..3baa9d2 100644 --- a/docutranslate/workflow/xlsx_workflow.py +++ b/docutranslate/workflow/xlsx_workflow.py @@ -63,7 +63,7 @@ class XlsxWorkflow(Workflow[XlsxWorkflowConfig, Document, Document], HTMLExporta document, translator = self._pre_translate(document_xlsx) translator.translate(document) if translator.glossary_dict_gen: - self.attachment.add_attachment("glossary", Glossary.glossary_dict2csv(translator.glossary_dict_gen)) + self.attachment.add_document("glossary", Glossary.glossary_dict2csv(translator.glossary_dict_gen)) self.document_translated = document return self @@ -72,7 +72,7 @@ class XlsxWorkflow(Workflow[XlsxWorkflowConfig, Document, Document], HTMLExporta document, translator = self._pre_translate(document_xlsx) await translator.translate_async(document) if translator.glossary_dict_gen: - self.attachment.add_attachment("glossary", Glossary.glossary_dict2csv(translator.glossary_dict_gen)) + self.attachment.add_document("glossary", Glossary.glossary_dict2csv(translator.glossary_dict_gen)) self.document_translated = document return self