From 10d528afbb299acec8affbca191d864263cdc233 Mon Sep 17 00:00:00 2001 From: xunbu Date: Thu, 31 Jul 2025 09:41:46 +0800 Subject: [PATCH] =?UTF-8?q?=E5=AE=8C=E5=96=84mdbasedworkflow?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docutranslate/exporter/md/types.py | 2 +- docutranslate/workflow/interfaces.py | 9 +- docutranslate/workflow/md_based_workflow.py | 92 +++++++++++---------- docutranslate/workflow/txt_workflow.py | 35 ++++---- 4 files changed, 75 insertions(+), 63 deletions(-) diff --git a/docutranslate/exporter/md/types.py b/docutranslate/exporter/md/types.py index eab404b..cb29be8 100644 --- a/docutranslate/exporter/md/types.py +++ b/docutranslate/exporter/md/types.py @@ -1,3 +1,3 @@ from typing import Literal -ConvertEnginType = Literal["mineru", "docling","identity"] \ No newline at end of file +ConvertEngineType = Literal["mineru", "docling", "identity"] \ No newline at end of file diff --git a/docutranslate/workflow/interfaces.py b/docutranslate/workflow/interfaces.py index df35624..03aa248 100644 --- a/docutranslate/workflow/interfaces.py +++ b/docutranslate/workflow/interfaces.py @@ -5,6 +5,7 @@ from docutranslate.exporter.base import ExporterConfig T_ExporterConfig = TypeVar("T_ExporterConfig", bound=ExporterConfig) + @runtime_checkable class HTMLExportable(Protocol[T_ExporterConfig]): def export_to_html(self, config: T_ExporterConfig | None = None) -> str: @@ -13,6 +14,7 @@ class HTMLExportable(Protocol[T_ExporterConfig]): def save_as_html(self, name: str, output_dir: Path | str, config: T_ExporterConfig | None = None) -> Self: ... + @runtime_checkable class MDExportable(Protocol[T_ExporterConfig]): @@ -22,6 +24,7 @@ class MDExportable(Protocol[T_ExporterConfig]): def save_as_markdown(self, name: str, output_dir: Path | str, config: T_ExporterConfig | None = None) -> Self: ... + @runtime_checkable class MDZIPExportable(Protocol[T_ExporterConfig]): @@ -31,14 +34,16 @@ class MDZIPExportable(Protocol[T_ExporterConfig]): def save_as_markdown_zip(self, name: str, output_dir: Path | str, config: T_ExporterConfig | None = None) -> Self: ... + @runtime_checkable class MDFormatsExportable(MDZIPExportable[T_ExporterConfig], MDExportable[T_ExporterConfig]): ... + @runtime_checkable class TXTExportable(Protocol[T_ExporterConfig]): - def export_to_txt(self) -> str: + def export_to_txt(self, config: T_ExporterConfig | None = None) -> str: ... - def save_as_txt(self, name: str, output_dir: Path | str) -> Self: + def save_as_txt(self, name: str, output_dir: Path | str, config: T_ExporterConfig | None = None) -> Self: ... diff --git a/docutranslate/workflow/md_based_workflow.py b/docutranslate/workflow/md_based_workflow.py index a71b03d..d3aa019 100644 --- a/docutranslate/workflow/md_based_workflow.py +++ b/docutranslate/workflow/md_based_workflow.py @@ -17,7 +17,7 @@ from docutranslate.converter.x2md.base import X2MarkdownConverterConfig, X2Markd from docutranslate.exporter.md.md2html_exporter import MD2HTMLExporterConfig, MD2HTMLExporter from docutranslate.exporter.md.md2md_exporter import MD2MDExporter from docutranslate.exporter.md.md2mdzip_exporter import MD2MDZipExporter -from docutranslate.exporter.md.types import ConvertEnginType +from docutranslate.exporter.md.types import ConvertEngineType from docutranslate.workflow.base import Workflow, WorkflowConfig from docutranslate.workflow.interfaces import MDFormatsExportable, HTMLExportable from docutranslate.translator.ai_translator.md_translator import MDTranslatorConfig, MDTranslator @@ -25,78 +25,80 @@ from docutranslate.translator.ai_translator.md_translator import MDTranslatorCon @dataclass(kw_only=True) class MarkdownBasedWorkflowConfig(WorkflowConfig): - convert_engine: ConvertEnginType + convert_engine: ConvertEngineType converter_config: X2MarkdownConverterConfig | None translator_config: MDTranslatorConfig html_exporter_config: MD2HTMLExporterConfig -class MarkdownBasedWorkflow(Workflow[MarkdownBasedWorkflowConfig, Document, MarkdownDocument], HTMLExportable, - MDFormatsExportable): +class MarkdownBasedWorkflow(Workflow[MarkdownBasedWorkflowConfig, Document, MarkdownDocument], + HTMLExportable[MD2HTMLExporterConfig], + MDFormatsExportable[ExporterConfig]): + _converter_factory: dict[ + ConvertEngineType, Tuple[Type[X2MarkdownConverter], Type[X2MarkdownConverterConfig]] | None] = { + "mineru": (ConverterMineru, ConverterMineruConfig), + "identity": (ConverterIdentity, None) + } + if DOCLING_EXIST: + _converter_factory["docling"] = (ConverterDocling, ConverterDoclingConfig) + def __init__(self, config: MarkdownBasedWorkflowConfig): super().__init__(config=config) - self._converter_factory: dict[ - ConvertEnginType, Tuple[Type[X2MarkdownConverter], Type[X2MarkdownConverterConfig]] | None] = { - "mineru": (ConverterMineru, ConverterMineruConfig), - "identity": (ConverterIdentity, None) - } - if DOCLING_EXIST: - self._converter_factory["docling"] = (ConverterDocling, ConverterDoclingConfig) self.convert_engine = config.convert_engine if config.logger: - for sub_config in [self.config.converter_config, self.config.translator_config, self.config.html_exporter_config]: - if sub_config and sub_config.logger is not None: + for sub_config in [self.config.converter_config, self.config.translator_config, + self.config.html_exporter_config]: + if sub_config: sub_config.logger = config.logger - def _get_document_md(self, convert_engin: ConvertEnginType, convert_config: X2MarkdownConverterConfig): + def _get_document_md(self, convert_engin: ConvertEngineType, convert_config: X2MarkdownConverterConfig): if self.document_original is None: raise RuntimeError("file has not been read yet. Call read_path or read_bytes first.") # 获取缓存的解析后文件 document_cached = md_based_convert_cacher.get_cached_result(self.document_original, convert_engin, convert_config) - # 获取解析文件 if document_cached: - document_md = document_cached + return document_cached + + # 未缓存则解析文件 + if convert_engin in self._converter_factory: + converter_class, config_class = self._converter_factory[convert_engin] + if config_class and not isinstance(convert_config, config_class): + raise TypeError( + f"未传入正确的convert_config,应为{config_class.__name__}类型,现为{type(convert_config).__name__}类型") + converter = converter_class(convert_config) else: - if self.document_original.suffix == ".md": - converter = ConverterIdentity() - elif convert_engin in self._converter_factory: - converter_class, config_class = self._converter_factory[convert_engin] - if not isinstance(convert_config, config_class): - raise TypeError( - f"未传入正确的convert_config,应传入{config_class.__name__}类型,现为{type(convert_config).__name__}类型") - converter = converter_class(convert_config) - else: - raise ValueError(f"不存在{convert_engin}解析引擎") - document_md = converter.convert(self.document_original) - # 获取缓存解析后文件 - md_based_convert_cacher.cache_result(document_md, self.document_original, convert_engin, convert_config) + raise ValueError(f"不存在{convert_engin}解析引擎") + document_md = converter.convert(self.document_original) + # 获取缓存解析后文件 + md_based_convert_cacher.cache_result(document_md, self.document_original, convert_engin, convert_config) return document_md - def translate(self) -> Self: - convert_engin, convert_config = self.convert_engine, self.config.converter_config + def _pre_translate(self, document: Document): + convert_engin: ConvertEngineType = "identity" if document.suffix == ".md" else self.convert_engine + convert_config = self.config.converter_config translator_config = self.config.translator_config - document_md = self._get_document_md(convert_engin, convert_config) - # 翻译解析后文件 translator = MDTranslator(translator_config) + return convert_engin, convert_config, translator_config, translator + + def translate(self) -> Self: + convert_engin, convert_config, translator_config, translator = self._pre_translate(self.document_original) + document_md = self._get_document_md(convert_engin, convert_config) translator.translate(document_md) self.document_translated = document_md return self async def translate_async(self) -> Self: - convert_engin, convert_config = self.convert_engine, self.config.converter_config - translator_config = self.config.translator_config + convert_engin, convert_config, translator_config, translator = self._pre_translate(self.document_original) document_md = await asyncio.to_thread(self._get_document_md, convert_engin, convert_config) - # 翻译解析后文件 - translator = MDTranslator(translator_config) await translator.translate_async(document_md) self.document_translated = document_md return self - def export_to_html(self, export_config: MD2HTMLExporterConfig | None = None) -> str: - export_config = export_config or self.config.html_exporter_config - docu = self._export(MD2HTMLExporter(export_config)) + def export_to_html(self, config: MD2HTMLExporterConfig | None = None) -> str: + config = config or self.config.html_exporter_config + docu = self._export(MD2HTMLExporter(config)) return docu.content.decode() def export_to_markdown(self, config: ExporterConfig | None = None) -> str: @@ -108,19 +110,19 @@ class MarkdownBasedWorkflow(Workflow[MarkdownBasedWorkflowConfig, Document, Mark return docu.content def save_as_html(self, name: str = None, output_dir: Path | str = "./output", - export_config: MD2HTMLExporterConfig | None = None) -> Self: - export_config = export_config or self.config.html_exporter_config - self._save(exporter=MD2HTMLExporter(config=export_config), name=name, output_dir=output_dir) + config: MD2HTMLExporterConfig | None = None) -> Self: + config = config or self.config.html_exporter_config + self._save(exporter=MD2HTMLExporter(config=config), name=name, output_dir=output_dir) return self def save_as_markdown(self, name: str = None, output_dir: Path | str = "./output", - export_config: ExporterConfig | None = None) -> Self: + _: ExporterConfig | None = None) -> Self: self._save(exporter=MD2MDExporter(), name=name, output_dir=output_dir) return self def save_as_markdown_zip(self, name: str = None, output_dir: Path | str = "./output", - export_config: ExporterConfig | None = None) -> Self: + _: ExporterConfig | None = None) -> Self: self._save(exporter=MD2MDZipExporter(), name=name, output_dir=output_dir) return self diff --git a/docutranslate/workflow/txt_workflow.py b/docutranslate/workflow/txt_workflow.py index 7b348df..874e774 100644 --- a/docutranslate/workflow/txt_workflow.py +++ b/docutranslate/workflow/txt_workflow.py @@ -2,28 +2,32 @@ from dataclasses import dataclass from pathlib import Path from typing import Self +from docutranslate.exporter.base import ExporterConfig from docutranslate.exporter.txt.txt2html_exporter import TXT2HTMLExporterConfig, TXT2HTMLExporter from docutranslate.exporter.txt.txt2txt_exporter import TXT2TXTExporter from docutranslate.ir.document import Document +from docutranslate.translator.ai_translator.txt_translator import TXTTranslatorConfig, TXTTranslator from docutranslate.workflow.base import Workflow, WorkflowConfig from docutranslate.workflow.interfaces import HTMLExportable, TXTExportable -from docutranslate.translator.ai_translator.txt_translator import TXTTranslatorConfig, TXTTranslator + @dataclass(kw_only=True) class TXTWorkflowConfig(WorkflowConfig): translator_config: TXTTranslatorConfig html_exporter_config: TXT2HTMLExporterConfig -class TXTWorkflow(Workflow[TXTWorkflowConfig,Document,Document], HTMLExportable, TXTExportable): - def __init__(self,config:TXTWorkflowConfig): + +class TXTWorkflow(Workflow[TXTWorkflowConfig, Document, Document], HTMLExportable[TXT2HTMLExporterConfig], + TXTExportable[ExporterConfig]): + def __init__(self, config: TXTWorkflowConfig): super().__init__(config=config) if config.logger: for sub_config in [self.config.translator_config]: - if sub_config and sub_config.logger is not None: - sub_config.logger=config.logger + if sub_config: + sub_config.logger = config.logger def translate(self) -> Self: - translate_config=self.config.translator_config + translate_config = self.config.translator_config document = self.document_original.copy() # 翻译解析后文件 translator = TXTTranslator(translate_config) @@ -32,7 +36,7 @@ class TXTWorkflow(Workflow[TXTWorkflowConfig,Document,Document], HTMLExportable, return self async def translate_async(self) -> Self: - translate_config=self.config.translator_config + translate_config = self.config.translator_config document = self.document_original.copy() # 翻译解析后文件 translator = TXTTranslator(translate_config) @@ -40,21 +44,22 @@ class TXTWorkflow(Workflow[TXTWorkflowConfig,Document,Document], HTMLExportable, self.document_translated = document return self - def export_to_html(self, export_config: TXT2HTMLExporterConfig=None) -> str: - export_config=export_config or self.config.html_exporter_config - docu = self._export(TXT2HTMLExporter(export_config)) + def export_to_html(self, config: TXT2HTMLExporterConfig = None) -> str: + config = config or self.config.html_exporter_config + docu = self._export(TXT2HTMLExporter(config)) return docu.content.decode() - def export_to_txt(self) -> str: + def export_to_txt(self, _: ExporterConfig | None = None) -> str: docu = self._export(TXT2TXTExporter()) return docu.content.decode() def save_as_html(self, name: str = None, output_dir: Path | str = "./output", - export_config: TXT2HTMLExporterConfig | None = None) -> Self: - export_config=export_config or self.config.html_exporter_config - self._save(exporter=TXT2HTMLExporter(export_config), name=name, output_dir=output_dir) + config: TXT2HTMLExporterConfig | None = None) -> Self: + config = config or self.config.html_exporter_config + self._save(exporter=TXT2HTMLExporter(config), name=name, output_dir=output_dir) return self - def save_as_txt(self, name: str = None, output_dir: Path | str = "./output", ) -> Self: + def save_as_txt(self, name: str = None, output_dir: Path | str = "./output", + _: ExporterConfig | None = None) -> Self: self._save(exporter=TXT2TXTExporter(), name=name, output_dir=output_dir) return self