From f4b3432f457dc6256d87e722e75b32201e1e8241 Mon Sep 17 00:00:00 2001 From: xunbu Date: Wed, 30 Jul 2025 23:54:55 +0800 Subject: [PATCH] =?UTF-8?q?mabasedworkflow=E6=9E=B6=E6=9E=84=E5=AE=8C?= =?UTF-8?q?=E5=96=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cacher/md_based_convert_cacher.py | 4 +- docutranslate/exporter/md/types.py | 2 +- docutranslate/workflow/base.py | 10 +- docutranslate/workflow/interfaces.py | 2 +- docutranslate/workflow/md_based_workflow.py | 95 +++++++------------ 5 files changed, 43 insertions(+), 70 deletions(-) diff --git a/docutranslate/cacher/md_based_convert_cacher.py b/docutranslate/cacher/md_based_convert_cacher.py index c92498d..213bf2a 100644 --- a/docutranslate/cacher/md_based_convert_cacher.py +++ b/docutranslate/cacher/md_based_convert_cacher.py @@ -19,11 +19,11 @@ class MDBasedCovertCacher: def get_cached_result(self, document: Document, convert_engin: str, convert_config: ConverterConfig) -> MarkdownDocument | None: - return self.cache_dict.get(self._get_hashcode(document, convert_engin, convert_config.gethash())) + return self.cache_dict.get(self._get_hashcode(document, convert_engin, convert_config)) def cache_result(self, convert_result: MarkdownDocument, document: Document, convert_engin: str, convert_config: ConverterConfig) -> MarkdownDocument: - hash_code = self._get_hashcode(document, convert_engin, convert_config.gethash()) + hash_code = self._get_hashcode(document, convert_engin, convert_config) if len(self.cache_dict) > int(CACHE_NUM): self.cache_dict.popitem(last=False) self.cache_dict[hash_code] = convert_result diff --git a/docutranslate/exporter/md/types.py b/docutranslate/exporter/md/types.py index 56ab69b..eab404b 100644 --- a/docutranslate/exporter/md/types.py +++ b/docutranslate/exporter/md/types.py @@ -1,3 +1,3 @@ from typing import Literal -ConvertEnginType = Literal["mineru", "docling"] \ No newline at end of file +ConvertEnginType = Literal["mineru", "docling","identity"] \ No newline at end of file diff --git a/docutranslate/workflow/base.py b/docutranslate/workflow/base.py index 85c9d2a..c924555 100644 --- a/docutranslate/workflow/base.py +++ b/docutranslate/workflow/base.py @@ -6,21 +6,21 @@ from typing import Self, Generic, TypeVar from docutranslate.exporter.base import Exporter from docutranslate.ir.document import Document -from docutranslate.logger import global_logger @dataclass(kw_only=True) class WorkflowConfig: logger: Logger | None = None - +T_Config = TypeVar("T_Config", bound=WorkflowConfig) T_original = TypeVar('T_original', bound=Document) T_Translated = TypeVar('T_Translated', bound=Document) -class Workflow(ABC, Generic[T_original, T_Translated]): - def __init__(self, logger: Logger = global_logger): - self.logger = logger +class Workflow(ABC, Generic[T_Config,T_original, T_Translated]): + def __init__(self, config:T_Config): + self.config=config + self.logger=self.config.logger self.document_original: T_original | None = None self.document_translated: T_Translated | None = None diff --git a/docutranslate/workflow/interfaces.py b/docutranslate/workflow/interfaces.py index 884f645..38dd920 100644 --- a/docutranslate/workflow/interfaces.py +++ b/docutranslate/workflow/interfaces.py @@ -1,7 +1,7 @@ from pathlib import Path from typing import Protocol, Self, TypeVar, runtime_checkable -from docutranslate.exporter.export_config import ExporterConfig +from docutranslate.exporter.base import ExporterConfig T = TypeVar("T", bound=ExporterConfig) diff --git a/docutranslate/workflow/md_based_workflow.py b/docutranslate/workflow/md_based_workflow.py index ad5e27c..03b21ac 100644 --- a/docutranslate/workflow/md_based_workflow.py +++ b/docutranslate/workflow/md_based_workflow.py @@ -2,16 +2,19 @@ import asyncio from dataclasses import dataclass from logging import Logger from pathlib import Path -from typing import Self, Tuple, Any +from typing import Self, Tuple, Type from docutranslate.cacher import md_based_convert_cacher +from docutranslate.exporter.base import ExporterConfig from docutranslate.global_values.conditional_import import DOCLING_EXIST +from docutranslate.ir.document import Document +from docutranslate.ir.markdown_document import MarkdownDocument if DOCLING_EXIST: from docutranslate.converter.x2md.converter_docling import ConverterDoclingConfig, ConverterDocling from docutranslate.converter.x2md.converter_identity import ConverterIdentity from docutranslate.converter.x2md.converter_mineru import ConverterMineruConfig, ConverterMineru -from docutranslate.converter.x2md.base import X2MarkdownConverterConfig +from docutranslate.converter.x2md.base import X2MarkdownConverterConfig, X2MarkdownConverter from docutranslate.exporter.md.md2html_exporter import MD2HTMLExporterConfig, MD2HTMLExporter from docutranslate.exporter.md.md2md_exporter import MD2MDExporter from docutranslate.exporter.md.md2mdzip_exporter import MD2MDZipExporter @@ -23,64 +26,35 @@ from docutranslate.translator.ai_translator.md_translator import MDTranslatorCon @dataclass(kw_only=True) class MarkdownBasedWorkflowConfig(WorkflowConfig): - # X2MarkdownConverterConfig - convert_engine: ConvertEnginType | None - formula: bool = True - # ConverterDoclingConfig - code: bool = True - artifact: Path | None = None - # ConverterMineruConfig - mineru_token: str - # MDTranslatorConfig - base_url: str - api_key: str - model_id: str - to_lang: str - custom_prompt: str | None = None - temperature: float = 0.7 - timeout: int = 2000 - chunk_size: int = 3000 - concurrent: int = 30 - # MD2HTMLExporterConfig - cdn: bool = True - # general logger: Logger | None = None + convert_engine: ConvertEnginType + converter_config: X2MarkdownConverterConfig | None + translator_config: MDTranslatorConfig + html_exporter_config: MD2HTMLExporterConfig -class MarkdownBasedWorkflow(Workflow, HTMLExportable, MDFormatsExportable): +class MarkdownBasedWorkflow(Workflow[MarkdownBasedWorkflowConfig, Document, MarkdownDocument], HTMLExportable, + MDFormatsExportable): def __init__(self, config: MarkdownBasedWorkflowConfig): super().__init__(config=config) - self._converter_factory: dict[ConvertEnginType, Tuple[Any, Any]] = { + self._converter_factory: dict[ + ConvertEnginType, Tuple[Type[X2MarkdownConverter], Type[X2MarkdownConverterConfig]] | None] = { "mineru": (ConverterMineru, ConverterMineruConfig), + "identity": (ConverterIdentity, None) } if DOCLING_EXIST: self._converter_factory["docling"] = (ConverterDocling, ConverterDoclingConfig) - self.x2markdown_converter_config:X2MarkdownConverterConfig|None - if config.convert_engine is None: - self.converter_config=None - elif config.convert_engine== "mineru": - self.converter_config = ConverterMineruConfig(formula=config.formula, - mineru_token=config.mineru_token) - elif DOCLING_EXIST and config.convert_engine== "docling": - self.converter_config = ConverterDoclingConfig(code=config.code, - formula=config.formula, - artifact=config.artifact) - self.translator_config = MDTranslatorConfig(base_url=config.base_url, - api_key=config.api_key, - model_id=config.model_id, - to_lang=config.to_lang, - custom_prompt=config.custom_prompt, - temperature=config.temperature, - timeout=config.timeout, - chunk_size=config.chunk_size, - concurrent=config.concurrent, - ) - self.md2html_exporter_config = MD2HTMLExporterConfig(cdn=config.cdn) - self.convert_engine=config.convert_engine + self.convert_engine = config.convert_engine + self.logger = config.logger + if self.logger: + for config in [self.config.converter_config, self.config.translator_config, self.config.html_exporter_config]: + if config is not None: + config.logger = self.logger - def _get_document_md(self,convert_engin:ConvertEnginType|None,convert_config:X2MarkdownConverterConfig): + def _get_document_md(self, convert_engin: ConvertEnginType, convert_config: X2MarkdownConverterConfig): if self.document_original is None: raise RuntimeError("file has not been read yet. Call read_path or read_bytes first.") + # 获取缓存的解析后文件 document_cached = md_based_convert_cacher.get_cached_result(self.document_original, convert_engin, convert_config) @@ -88,7 +62,7 @@ class MarkdownBasedWorkflow(Workflow, HTMLExportable, MDFormatsExportable): if document_cached: document_md = document_cached else: - if convert_engin is None or self.document_original.suffix == ".md": + if self.document_original.suffix == ".md": converter = ConverterIdentity() elif convert_engin in self._converter_factory: converter_class, config_class = self._converter_factory[convert_engin] @@ -103,11 +77,10 @@ class MarkdownBasedWorkflow(Workflow, HTMLExportable, MDFormatsExportable): md_based_convert_cacher.cache_result(document_md, self.document_original, convert_engin, convert_config) return document_md - def translate(self) -> Self: - convert_engin,convert_config=self.convert_engine,self.converter_config - translator_config=self.translator_config - document_md = self._get_document_md(convert_engin,convert_config) + convert_engin, convert_config = self.convert_engine, self.config.converter_config + translator_config = self.config.translator_config + document_md = self._get_document_md(convert_engin, convert_config) # 翻译解析后文件 translator = MDTranslator(translator_config) translator.translate(document_md) @@ -115,8 +88,8 @@ class MarkdownBasedWorkflow(Workflow, HTMLExportable, MDFormatsExportable): return self async def translate_async(self) -> Self: - convert_engin,convert_config=self.convert_engine,self.converter_config - translator_config=self.translator_config + convert_engin, convert_config = self.convert_engine, self.config.converter_config + translator_config = self.config.translator_config document_md = await asyncio.to_thread(self._get_document_md, convert_engin, convert_config) # 翻译解析后文件 translator = MDTranslator(translator_config) @@ -125,32 +98,32 @@ class MarkdownBasedWorkflow(Workflow, HTMLExportable, MDFormatsExportable): return self def export_to_html(self, export_config: MD2HTMLExporterConfig | None = None) -> str: - export_config=export_config or self.md2html_exporter_config + export_config = export_config or self.config.html_exporter_config docu = self._export(MD2HTMLExporter(export_config)) return docu.content.decode() - def export_to_markdown(self, export_config: X2MarkdownConverterConfig | None = None) -> str: + def export_to_markdown(self, config: ExporterConfig | None = None) -> str: docu = self._export(MD2MDExporter()) return docu.content.decode() - def export_to_markdown_zip(self, export_config: X2MarkdownConverterConfig | None = None) -> bytes: + def export_to_markdown_zip(self, config: ExporterConfig | None = None) -> bytes: docu = self._export(MD2MDZipExporter()) return docu.content def save_as_html(self, name: str = None, output_dir: Path | str = "./output", export_config: MD2HTMLExporterConfig | None = None) -> Self: - export_config = export_config or self.md2html_exporter_config + export_config = export_config or self.config.html_exporter_config self._save(exporter=MD2HTMLExporter(config=export_config), name=name, output_dir=output_dir) return self def save_as_markdown(self, name: str = None, output_dir: Path | str = "./output", - export_config=None) -> Self: + export_config: ExporterConfig | None = None) -> Self: self._save(exporter=MD2MDExporter(), name=name, output_dir=output_dir) return self def save_as_markdown_zip(self, name: str = None, output_dir: Path | str = "./output", - export_config=None) -> Self: + export_config: ExporterConfig | None = None) -> Self: self._save(exporter=MD2MDZipExporter(), name=name, output_dir=output_dir) return self