From f4b3432f457dc6256d87e722e75b32201e1e8241 Mon Sep 17 00:00:00 2001
From: xunbu <xunbu3@qq.com>
Date: Wed, 30 Jul 2025 23:54:55 +0800
Subject: [PATCH] =?UTF-8?q?mdbasedworkflow=E6=9E=B6=E6=9E=84=E5=AE=8C?=
 =?UTF-8?q?=E5=96=84?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../cacher/md_based_convert_cacher.py         |  4 +-
 docutranslate/exporter/md/types.py            |  2 +-
 docutranslate/workflow/base.py                | 10 +-
 docutranslate/workflow/interfaces.py          |  2 +-
 docutranslate/workflow/md_based_workflow.py   | 95 +++++++------------
 5 files changed, 43 insertions(+), 70 deletions(-)

diff --git a/docutranslate/cacher/md_based_convert_cacher.py b/docutranslate/cacher/md_based_convert_cacher.py
index c92498d..213bf2a 100644
--- a/docutranslate/cacher/md_based_convert_cacher.py
+++ b/docutranslate/cacher/md_based_convert_cacher.py
@@ -19,11 +19,11 @@ class MDBasedCovertCacher:
 
     def get_cached_result(self, document: Document, convert_engin: str,
                           convert_config: ConverterConfig) -> MarkdownDocument | None:
-        return self.cache_dict.get(self._get_hashcode(document, convert_engin, convert_config.gethash()))
+        return self.cache_dict.get(self._get_hashcode(document, convert_engin, convert_config))
 
     def cache_result(self, convert_result: MarkdownDocument, document: Document, convert_engin: str,
                      convert_config: ConverterConfig) -> MarkdownDocument:
-        hash_code = self._get_hashcode(document, convert_engin, convert_config.gethash())
+        hash_code = self._get_hashcode(document, convert_engin, convert_config)
         if len(self.cache_dict) > int(CACHE_NUM):
             self.cache_dict.popitem(last=False)
         self.cache_dict[hash_code] = convert_result
diff --git a/docutranslate/exporter/md/types.py b/docutranslate/exporter/md/types.py
index 56ab69b..eab404b 100644
--- a/docutranslate/exporter/md/types.py
+++ b/docutranslate/exporter/md/types.py
@@ -1,3 +1,3 @@
 from typing import Literal
 
-ConvertEnginType = Literal["mineru", "docling"]
\ No newline at end of file
+ConvertEnginType = Literal["mineru", "docling", "identity"]
\ No newline at end of file
diff --git a/docutranslate/workflow/base.py b/docutranslate/workflow/base.py
index 85c9d2a..c924555 100644
--- a/docutranslate/workflow/base.py
+++ b/docutranslate/workflow/base.py
@@ -6,21 +6,21 @@ from typing import Self, Generic, TypeVar
 
 from docutranslate.exporter.base import Exporter
 from docutranslate.ir.document import Document
-from docutranslate.logger import global_logger
 
 
 @dataclass(kw_only=True)
 class WorkflowConfig:
     logger: Logger | None = None
 
-
+T_Config = TypeVar("T_Config", bound=WorkflowConfig)
 T_original = TypeVar('T_original', bound=Document)
 T_Translated = TypeVar('T_Translated', bound=Document)
 
 
-class Workflow(ABC, Generic[T_original, T_Translated]):
-    def __init__(self, logger: Logger = global_logger):
-        self.logger = logger
+class Workflow(ABC, Generic[T_Config, T_original, T_Translated]):
+    def __init__(self, config: T_Config):
+        self.config = config
+        self.logger = config.logger  # may be None: WorkflowConfig.logger defaults to None
         self.document_original: T_original | None = None
         self.document_translated: T_Translated | None = None
 
diff --git a/docutranslate/workflow/interfaces.py b/docutranslate/workflow/interfaces.py
index 884f645..38dd920 100644
--- a/docutranslate/workflow/interfaces.py
+++ b/docutranslate/workflow/interfaces.py
@@ -1,7 +1,7 @@
 from pathlib import Path
 from typing import Protocol, Self, TypeVar, runtime_checkable
 
-from docutranslate.exporter.export_config import ExporterConfig
+from docutranslate.exporter.base import ExporterConfig
 
 T = TypeVar("T", bound=ExporterConfig)
 
diff --git a/docutranslate/workflow/md_based_workflow.py b/docutranslate/workflow/md_based_workflow.py
index ad5e27c..03b21ac 100644
--- a/docutranslate/workflow/md_based_workflow.py
+++ b/docutranslate/workflow/md_based_workflow.py
@@ -2,16 +2,19 @@ import asyncio
 from dataclasses import dataclass
 from logging import Logger
 from pathlib import Path
-from typing import Self, Tuple, Any
+from typing import Self, Tuple, Type
 
 from docutranslate.cacher import md_based_convert_cacher
+from docutranslate.exporter.base import ExporterConfig
 from docutranslate.global_values.conditional_import import DOCLING_EXIST
+from docutranslate.ir.document import Document
+from docutranslate.ir.markdown_document import MarkdownDocument
 
 if DOCLING_EXIST:
     from docutranslate.converter.x2md.converter_docling import ConverterDoclingConfig, ConverterDocling
 from docutranslate.converter.x2md.converter_identity import ConverterIdentity
 from docutranslate.converter.x2md.converter_mineru import ConverterMineruConfig, ConverterMineru
-from docutranslate.converter.x2md.base import X2MarkdownConverterConfig
+from docutranslate.converter.x2md.base import X2MarkdownConverterConfig, X2MarkdownConverter
 from docutranslate.exporter.md.md2html_exporter import MD2HTMLExporterConfig, MD2HTMLExporter
 from docutranslate.exporter.md.md2md_exporter import MD2MDExporter
 from docutranslate.exporter.md.md2mdzip_exporter import MD2MDZipExporter
@@ -23,64 +26,35 @@ from docutranslate.translator.ai_translator.md_translator import MDTranslatorCon
 
 @dataclass(kw_only=True)
 class MarkdownBasedWorkflowConfig(WorkflowConfig):
-    # X2MarkdownConverterConfig
-    convert_engine: ConvertEnginType | None
-    formula: bool = True
-    # ConverterDoclingConfig
-    code: bool = True
-    artifact: Path | None = None
-    # ConverterMineruConfig
-    mineru_token: str
-    # MDTranslatorConfig
-    base_url: str
-    api_key: str
-    model_id: str
-    to_lang: str
-    custom_prompt: str | None = None
-    temperature: float = 0.7
-    timeout: int = 2000
-    chunk_size: int = 3000
-    concurrent: int = 30
-    # MD2HTMLExporterConfig
-    cdn: bool = True
-    # general
     logger: Logger | None = None
+    convert_engine: ConvertEnginType  # key used to look up the converter in the workflow's factory
+    converter_config: X2MarkdownConverterConfig | None  # optional; presumably None when the engine needs no config — confirm
+    translator_config: MDTranslatorConfig  # handed to MDTranslator by translate()/translate_async()
+    html_exporter_config: MD2HTMLExporterConfig  # fallback when export_to_html/save_as_html get no export_config
 
 
-class MarkdownBasedWorkflow(Workflow, HTMLExportable, MDFormatsExportable):
+class MarkdownBasedWorkflow(Workflow[MarkdownBasedWorkflowConfig, Document, MarkdownDocument], HTMLExportable,
+                            MDFormatsExportable):
     def __init__(self, config: MarkdownBasedWorkflowConfig):
         super().__init__(config=config)
-        self._converter_factory: dict[ConvertEnginType, Tuple[Any, Any]] = {
+        self._converter_factory: dict[  # engine name -> (converter class, config class or None)
+            ConvertEnginType, Tuple[Type[X2MarkdownConverter], Type[X2MarkdownConverterConfig] | None]] = {
             "mineru": (ConverterMineru, ConverterMineruConfig),
+            "identity": (ConverterIdentity, None)
         }
         if DOCLING_EXIST:
             self._converter_factory["docling"] = (ConverterDocling, ConverterDoclingConfig)
-        self.x2markdown_converter_config:X2MarkdownConverterConfig|None
-        if config.convert_engine is None:
-            self.converter_config=None
-        elif config.convert_engine== "mineru":
-            self.converter_config = ConverterMineruConfig(formula=config.formula,
-                                                          mineru_token=config.mineru_token)
-        elif DOCLING_EXIST and config.convert_engine== "docling":
-            self.converter_config = ConverterDoclingConfig(code=config.code,
-                                                           formula=config.formula,
-                                                           artifact=config.artifact)
-        self.translator_config = MDTranslatorConfig(base_url=config.base_url,
-                                                    api_key=config.api_key,
-                                                    model_id=config.model_id,
-                                                    to_lang=config.to_lang,
-                                                    custom_prompt=config.custom_prompt,
-                                                    temperature=config.temperature,
-                                                    timeout=config.timeout,
-                                                    chunk_size=config.chunk_size,
-                                                    concurrent=config.concurrent,
-                                                    )
-        self.md2html_exporter_config = MD2HTMLExporterConfig(cdn=config.cdn)
-        self.convert_engine=config.convert_engine
+        self.convert_engine = config.convert_engine
+        self.logger = config.logger
+        if self.logger is not None:  # share the workflow logger with every stage config
+            for stage_config in (config.converter_config, config.translator_config, config.html_exporter_config):
+                if stage_config is not None:  # converter_config is optional
+                    stage_config.logger = self.logger
 
-    def _get_document_md(self,convert_engin:ConvertEnginType|None,convert_config:X2MarkdownConverterConfig):
+    def _get_document_md(self, convert_engin: ConvertEnginType, convert_config: X2MarkdownConverterConfig | None):
         if self.document_original is None:
             raise RuntimeError("file has not been read yet. Call read_path or read_bytes first.")
+
         # 获取缓存的解析后文件
         document_cached = md_based_convert_cacher.get_cached_result(self.document_original, convert_engin,
                                                                     convert_config)
@@ -88,7 +62,7 @@ class MarkdownBasedWorkflow(Workflow, HTMLExportable, MDFormatsExportable):
         if document_cached:
             document_md = document_cached
         else:
-            if convert_engin is None or self.document_original.suffix == ".md":
+            if self.document_original.suffix == ".md":
                 converter = ConverterIdentity()
             elif convert_engin in self._converter_factory:
                 converter_class, config_class = self._converter_factory[convert_engin]
@@ -103,11 +77,10 @@ class MarkdownBasedWorkflow(Workflow, HTMLExportable, MDFormatsExportable):
             md_based_convert_cacher.cache_result(document_md, self.document_original, convert_engin, convert_config)
         return document_md
 
-
     def translate(self) -> Self:
-        convert_engin,convert_config=self.convert_engine,self.converter_config
-        translator_config=self.translator_config
-        document_md = self._get_document_md(convert_engin,convert_config)
+        convert_engin, convert_config = self.convert_engine, self.config.converter_config
+        translator_config = self.config.translator_config
+        document_md = self._get_document_md(convert_engin, convert_config)
         # 翻译解析后文件
         translator = MDTranslator(translator_config)
         translator.translate(document_md)
@@ -115,8 +88,8 @@ class MarkdownBasedWorkflow(Workflow, HTMLExportable, MDFormatsExportable):
         return self
 
     async def translate_async(self) -> Self:
-        convert_engin,convert_config=self.convert_engine,self.converter_config
-        translator_config=self.translator_config
+        convert_engin, convert_config = self.convert_engine, self.config.converter_config
+        translator_config = self.config.translator_config
         document_md = await asyncio.to_thread(self._get_document_md, convert_engin, convert_config)
         # 翻译解析后文件
         translator = MDTranslator(translator_config)
@@ -125,32 +98,32 @@ class MarkdownBasedWorkflow(Workflow, HTMLExportable, MDFormatsExportable):
         return self
 
     def export_to_html(self, export_config: MD2HTMLExporterConfig | None = None) -> str:
-        export_config=export_config or self.md2html_exporter_config
+        export_config = export_config or self.config.html_exporter_config
         docu = self._export(MD2HTMLExporter(export_config))
         return docu.content.decode()
 
-    def export_to_markdown(self, export_config: X2MarkdownConverterConfig | None = None) -> str:
+    def export_to_markdown(self, config: ExporterConfig | None = None) -> str:
         docu = self._export(MD2MDExporter())
         return docu.content.decode()
 
-    def export_to_markdown_zip(self, export_config: X2MarkdownConverterConfig | None = None) -> bytes:
+    def export_to_markdown_zip(self, config: ExporterConfig | None = None) -> bytes:
         docu = self._export(MD2MDZipExporter())
         return docu.content
 
     def save_as_html(self, name: str = None, output_dir: Path | str = "./output",
                      export_config: MD2HTMLExporterConfig | None = None) -> Self:
-        export_config = export_config or self.md2html_exporter_config
+        export_config = export_config or self.config.html_exporter_config
         self._save(exporter=MD2HTMLExporter(config=export_config), name=name, output_dir=output_dir)
         return self
 
     def save_as_markdown(self, name: str = None, output_dir: Path | str = "./output",
-                         export_config=None) -> Self:
+                         export_config: ExporterConfig | None = None) -> Self:
 
         self._save(exporter=MD2MDExporter(), name=name, output_dir=output_dir)
         return self
 
     def save_as_markdown_zip(self, name: str = None, output_dir: Path | str = "./output",
-                             export_config=None) -> Self:
+                             export_config: ExporterConfig | None = None) -> Self:
 
         self._save(exporter=MD2MDZipExporter(), name=name, output_dir=output_dir)
         return self