From d25f634e73fc46520abfa30c291deacade3d4125 Mon Sep 17 00:00:00 2001 From: xunbu Date: Wed, 30 Jul 2025 20:48:11 +0800 Subject: [PATCH] =?UTF-8?q?=E9=87=8D=E6=9E=84workflow?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 19 ++- docutranslate/app.py | 32 ++-- .../cacher/md_based_convert_cacher.py | 14 +- docutranslate/converter/base.py | 29 ++++ docutranslate/converter/interfaces.py | 11 -- .../converter/x2md/{interfaces.py => base.py} | 15 +- .../converter/x2md/converter_docling.py | 23 ++- .../converter/x2md/converter_identity.py | 4 +- .../converter/x2md/converter_mineru.py | 12 +- docutranslate/exporter/base.py | 20 +++ docutranslate/exporter/export_config.py | 8 - docutranslate/exporter/interfaces.py | 15 -- .../exporter/{md2x => md}/__init__.py | 0 docutranslate/exporter/md/base.py | 18 +++ .../exporter/{md2x => md}/md2html_exporter.py | 12 +- docutranslate/exporter/md/md2md_exporter.py | 8 + .../exporter/md/md2mdzip_exporter.py | 11 ++ docutranslate/exporter/md/types.py | 3 + docutranslate/exporter/md2x/interfaces.py | 9 -- docutranslate/exporter/md2x/md2md_exporter.py | 18 --- .../exporter/md2x/md2mdzip_exporter.py | 21 --- docutranslate/exporter/md2x/types.py | 14 -- .../exporter/{txt2x => txt}/__init__.py | 0 .../{txt2x/interfaces.py => txt/base.py} | 4 +- .../{txt2x => txt}/txt2html_exporter.py | 13 +- .../{txt2x => txt}/txt2txt_exporter.py | 7 +- docutranslate/translater/base.py | 16 -- docutranslate/translater/interfaces.py | 20 --- .../{translater => translator}/__init__.py | 0 .../translator/ai_translator/__init__.py | 0 .../translator/ai_translator/base.py | 35 +++++ .../ai_translator}/md_translator.py | 11 +- .../ai_translator}/txt_translator.py | 11 +- docutranslate/translator/base.py | 27 ++++ .../workflow/{base_workflow.py => base.py} | 14 +- docutranslate/workflow/interfaces.py | 4 +- docutranslate/workflow/md_based_workflow.py | 141 +++++++++++------- docutranslate/workflow/txt_workflow.py | 18 +-- 38 files changed, 351 insertions(+), 286 deletions(-) create mode 100644 docutranslate/converter/base.py delete mode 100644 docutranslate/converter/interfaces.py rename docutranslate/converter/x2md/{interfaces.py => base.py} (55%) create mode 100644 docutranslate/exporter/base.py delete mode 100644 docutranslate/exporter/export_config.py delete mode 100644 docutranslate/exporter/interfaces.py rename docutranslate/exporter/{md2x => md}/__init__.py (100%) create mode 100644 docutranslate/exporter/md/base.py rename docutranslate/exporter/{md2x => md}/md2html_exporter.py (93%) create mode 100644 docutranslate/exporter/md/md2md_exporter.py create mode 100644 docutranslate/exporter/md/md2mdzip_exporter.py create mode 100644 docutranslate/exporter/md/types.py delete mode 100644 docutranslate/exporter/md2x/interfaces.py delete mode 100644 docutranslate/exporter/md2x/md2md_exporter.py delete mode 100644 docutranslate/exporter/md2x/md2mdzip_exporter.py delete mode 100644 docutranslate/exporter/md2x/types.py rename docutranslate/exporter/{txt2x => txt}/__init__.py (100%) rename docutranslate/exporter/{txt2x/interfaces.py => txt/base.py} (66%) rename docutranslate/exporter/{txt2x => txt}/txt2html_exporter.py (76%) rename docutranslate/exporter/{txt2x => txt}/txt2txt_exporter.py (58%) delete mode 100644 docutranslate/translater/base.py delete mode 100644 docutranslate/translater/interfaces.py rename docutranslate/{translater => translator}/__init__.py (100%) create mode 100644 docutranslate/translator/ai_translator/__init__.py create mode 100644 docutranslate/translator/ai_translator/base.py rename docutranslate/{translater => translator/ai_translator}/md_translator.py (90%) rename docutranslate/{translater => translator/ai_translator}/txt_translator.py (86%) create mode 100644 docutranslate/translator/base.py rename docutranslate/workflow/{base_workflow.py => base.py} (83%) diff --git a/README.md b/README.md index b309b71..7f00086 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ [![github下载数](https://img.shields.io/github/downloads/xunbu/docutranslate/total?logo=github)](https://github.com/xunbu/docutranslate/releases) [![PyPI version](https://img.shields.io/pypi/v/docutranslate)](https://pypi.org/project/docutranslate/) [![python版本](https://img.shields.io/badge/Python-3.11+-3776AB?logo=python&logoColor=white)](https://www.python.org/) -[![开源协议](https://img.shields.io/github/license/xunbu/docutranslate)](./LICENSE) +[![开源协议](https://img.shields.io/github/license/xunbu/docutranslate)](./LICENSE) 文件翻译工具,借助[docling](https://github.com/docling-project/docling)、[minerU](https://mineru.net/)与大语言模型实现多种格式文件的翻译 @@ -43,15 +43,14 @@ 2. `pip install -e .` 3. `uv pip install -e .`#使用uv -# 支持的文件格式 +# 翻译工作流 -| 输入格式 | 输出格式 | -|----------------|--------------| -| PDF | Markdown(推荐) | -| Markdown | HTML | -| HTML、XHTML | PDF(仅交互界面支持) | -| CSV | | -| DOC、DOCX(部分支持) | | +| 工作流 | 代码 | 输入格式 | 输出格式 | +|-------------------------|------------------|----------------------------------------|----------------------| +| `MarkdownBasedWorkflow` | `markdown_based` | `.pdf ` `.md` `.png` `.jpeg` `.docx`等 | `.md` `.html` `.pdf` | +| `TXTWorkflow` | `txt` | `.txt ` | `.txt` `.html` `.pdf` | + +> 所有.pdf的输出只能通过交互式界面获取 > 如果想不使用交互界面获取pdf,可以先下载HTML文件,用浏览器打开并打印 @@ -143,7 +142,7 @@ docutranslate -i -p 8011 ## 翻译文件 ```python -from docutranslate.translater import FileTranslater +from docutranslate.translator import FileTranslater translater = FileTranslater(base_url="", # 大模型的baseurl key="", # 大模型的api-key diff --git a/docutranslate/app.py b/docutranslate/app.py index a3bd0cd..b5450b1 100644 --- a/docutranslate/app.py +++ b/docutranslate/app.py @@ -22,7 +22,7 @@ from pydantic import BaseModel, Field from docutranslate.global_values.conditional_import import DOCLING_EXIST # --- 核心代码重构后的新 Imports --- -from docutranslate.workflow.base_workflow import BaseWorkflow +from docutranslate.workflow.base import Workflow from docutranslate.workflow.interfaces import HTMLExportable, MDFormatsExportable, TXTExportable from docutranslate.workflow.md_based_workflow import MarkdownBasedWorkflow from docutranslate.workflow.txt_workflow import TXTWorkflow @@ -30,16 +30,16 @@ from docutranslate.workflow.txt_workflow import TXTWorkflow if DOCLING_EXIST or TYPE_CHECKING: from docutranslate.converter.x2md.converter_docling import ConverterDoclingConfig from docutranslate.converter.x2md.converter_mineru import ConverterMineruConfig -from docutranslate.exporter.md2x.md2html_exporter import MD2HTMLExportConfig -from docutranslate.exporter.txt2x.txt2html_exporter import TXT2HTMLExportConfig -from docutranslate.translater.base import AiTranslateConfig -from docutranslate.translater.md_translator import MDTranslateConfig -from docutranslate.translater.txt_translator import TXTTranslateConfig +from docutranslate.exporter.md.md2html_exporter import MD2HTMLExporterConfig +from docutranslate.exporter.txt.txt2html_exporter import TXT2HTMLExporterConfig +from docutranslate.translator.base import AiTranslateConfig +from docutranslate.translator.ai_translator.md_translator import MDTranslatorConfig +from docutranslate.translator.ai_translator.txt_translator import TXTTranslatorConfig # ------------------------------------ from docutranslate import __version__ from docutranslate.logger import global_logger -from docutranslate.translater import default_params +from docutranslate.translator import default_params from docutranslate.utils.resource_utils import resource_path # --- 全局配置 (MODIFIED) --- @@ -50,7 +50,7 @@ MAX_LOG_HISTORY = 200 httpx_client: httpx.AsyncClient # --- [NEW] Workflow字典 --- -WORKFLOW_DICT: Dict[str, type[BaseWorkflow]] = { +WORKFLOW_DICT: Dict[str, type[Workflow]] = { "markdown_based": MarkdownBasedWorkflow, "txt": TXTWorkflow, } @@ -70,7 +70,7 @@ def _create_default_task_state() -> Dict[str, Any]: # --- [KEPT FOR TEMP ENDPOINT] Workflow 工厂函数 (旧逻辑,仅为临时接口保留) --- -def _get_workflow_for_file(filename: str, logger: logging.Logger) -> BaseWorkflow: +def _get_workflow_for_file(filename: str, logger: logging.Logger) -> Workflow: """根据文件名后缀选择并返回合适的 Workflow 实例。这是扩展点。""" suffix = Path(filename).suffix.lower() if suffix == '.txt': @@ -299,7 +299,7 @@ async def _perform_translation( # 4. 根据 payload 的具体类型执行不同的翻译流程 (类型安全!) if isinstance(payload, MarkdownWorkflowParams) and isinstance(workflow, MarkdownBasedWorkflow): task_logger.info("执行 MarkdownBased 翻译流程。") - translate_config = MDTranslateConfig(**ai_config.__dict__) + translate_config = MDTranslatorConfig(**ai_config.__dict__) convert_config = None if payload.convert_engin == 'mineru': @@ -323,7 +323,7 @@ async def _perform_translation( elif isinstance(payload, TextWorkflowParams) and isinstance(workflow, TXTWorkflow): task_logger.info("执行 TXT 翻译流程。") - translate_config = TXTTranslateConfig(**ai_config.__dict__) + translate_config = TXTTranslatorConfig(**ai_config.__dict__) await workflow.translate_async(translate_config=translate_config) else: @@ -750,7 +750,7 @@ async def _get_content_from_workflow(task_id: str, file_type: FileType) -> tuple if not task_state.get("download_ready") or not task_state.get("workflow_instance"): raise HTTPException(status_code=404, detail="内容尚未准备好。") - workflow: BaseWorkflow = task_state["workflow_instance"] + workflow: Workflow = task_state["workflow_instance"] filename_stem = task_state['original_filename_stem'] try: @@ -759,8 +759,8 @@ async def _get_content_from_workflow(task_id: str, file_type: FileType) -> tuple filename: str if file_type == 'html' and isinstance(workflow, HTMLExportable): - config = MD2HTMLExportConfig(cdn=True) if isinstance(workflow, - MarkdownBasedWorkflow) else TXT2HTMLExportConfig( + config = MD2HTMLExporterConfig(cdn=True) if isinstance(workflow, + MarkdownBasedWorkflow) else TXT2HTMLExporterConfig( cdn=True) try: await httpx_client.head("https://s4.zstatic.net/ajax/libs/KaTeX/0.16.9/contrib/auto-render.min.js", @@ -1073,14 +1073,14 @@ async def temp_translate( workflow.read_bytes(decoded_content, Path(file_name).stem, Path(file_name).suffix) if isinstance(workflow, MarkdownBasedWorkflow): - translate_config = MDTranslateConfig(**ai_config.__dict__) + translate_config = MDTranslatorConfig(**ai_config.__dict__) convert_config = ConverterMineruConfig(mineru_token=mineru_token) if mineru_token else None convert_engin = 'mineru' if mineru_token else None await workflow.translate_async(convert_engin, convert_config, translate_config) return {"success": True, "content": workflow.export_to_markdown()} elif isinstance(workflow, TXTWorkflow): - translate_config = TXTTranslateConfig(**ai_config.__dict__) + translate_config = TXTTranslatorConfig(**ai_config.__dict__) await workflow.translate_async(translate_config) return {"success": True, "content": workflow.export_to_txt()} diff --git a/docutranslate/cacher/md_based_convert_cacher.py b/docutranslate/cacher/md_based_convert_cacher.py index 04bd7cd..c92498d 100644 --- a/docutranslate/cacher/md_based_convert_cacher.py +++ b/docutranslate/cacher/md_based_convert_cacher.py @@ -1,7 +1,7 @@ import os from collections import OrderedDict -from docutranslate.exporter.md2x.types import x2md_convert_config_type +from docutranslate.converter.base import ConverterConfig from docutranslate.ir.document import Document from docutranslate.ir.markdown_document import MarkdownDocument @@ -13,17 +13,17 @@ class MDBasedCovertCacher: self.cache_dict = OrderedDict() @staticmethod - def _get_hashcode(document: Document, convert_engin: str, convert_config: x2md_convert_config_type) -> str: - obj = (document.suffix, document.content, convert_engin, convert_config) + def _get_hashcode(document: Document, convert_engin: str, convert_config: ConverterConfig) -> str: + obj = (document.suffix, document.content, convert_engin, convert_config.gethash()) return str(hash(obj)) def get_cached_result(self, document: Document, convert_engin: str, - convert_config: x2md_convert_config_type) -> MarkdownDocument | None: - return self.cache_dict.get(self._get_hashcode(document, convert_engin, convert_config)) + convert_config: ConverterConfig) -> MarkdownDocument | None: + return self.cache_dict.get(self._get_hashcode(document, convert_engin, convert_config.gethash())) def cache_result(self, convert_result: MarkdownDocument, document: Document, convert_engin: str, - convert_config: x2md_convert_config_type) -> MarkdownDocument: - hash_code = self._get_hashcode(document, convert_engin, convert_config) + convert_config: ConverterConfig) -> MarkdownDocument: + hash_code = self._get_hashcode(document, convert_engin, convert_config.gethash()) if len(self.cache_dict) > int(CACHE_NUM): self.cache_dict.popitem(last=False) self.cache_dict[hash_code] = convert_result diff --git a/docutranslate/converter/base.py b/docutranslate/converter/base.py new file mode 100644 index 0000000..1c32396 --- /dev/null +++ b/docutranslate/converter/base.py @@ -0,0 +1,29 @@ +from abc import ABC, abstractmethod +from dataclasses import dataclass +from logging import Logger +from typing import Hashable + +from docutranslate.ir.document import Document +from docutranslate.logger import global_logger + + +@dataclass(kw_only=True) +class ConverterConfig(ABC): + logger: Logger | None = None + @abstractmethod + def gethash(self)->Hashable: + ... + + + +class Converter(ABC): + def __init__(self, config: ConverterConfig | None = None): + self.config = config + self.logger = config.logger or global_logger + + @abstractmethod + def convert(self, document: Document) -> Document: + ... + + async def convert_async(self, document: Document) -> Document: + ... diff --git a/docutranslate/converter/interfaces.py b/docutranslate/converter/interfaces.py deleted file mode 100644 index 54235a2..0000000 --- a/docutranslate/converter/interfaces.py +++ /dev/null @@ -1,11 +0,0 @@ -from typing import Protocol - -from docutranslate.ir.document import Document - - -class Converter(Protocol): - def convert(self, document: Document) -> Document: - ... - - async def convert_async(self, document: Document) -> Document: - ... diff --git a/docutranslate/converter/x2md/interfaces.py b/docutranslate/converter/x2md/base.py similarity index 55% rename from docutranslate/converter/x2md/interfaces.py rename to docutranslate/converter/x2md/base.py index 7f754bb..658e7fc 100644 --- a/docutranslate/converter/x2md/interfaces.py +++ b/docutranslate/converter/x2md/base.py @@ -1,19 +1,28 @@ -from typing import Protocol -from docutranslate.converter.interfaces import Converter +from abc import abstractmethod +from dataclasses import dataclass + +from docutranslate.converter.base import Converter, ConverterConfig from docutranslate.ir.document import Document from docutranslate.ir.markdown_document import MarkdownDocument +@dataclass(kw_only=True) +class X2MarkdownConverterConfig(ConverterConfig): + ... -class X2MarkdownConverter(Converter,Protocol): +class X2MarkdownConverter(Converter): """ 负责将其它格式的文件转换为markdown """ + + @abstractmethod def convert(self, document: Document) -> MarkdownDocument: ... + @abstractmethod async def convert_async(self, document: Document) -> MarkdownDocument: ... + @abstractmethod def support_format(self)->list[str]: ... \ No newline at end of file diff --git a/docutranslate/converter/x2md/converter_docling.py b/docutranslate/converter/x2md/converter_docling.py index 57f1776..4a64cf7 100644 --- a/docutranslate/converter/x2md/converter_docling.py +++ b/docutranslate/converter/x2md/converter_docling.py @@ -3,7 +3,6 @@ import os import time from dataclasses import dataclass from io import BytesIO -from logging import Logger from pathlib import Path from docling.datamodel.base_models import InputFormat @@ -14,34 +13,34 @@ from docling.document_converter import DocumentConverter, PdfFormatOption from docling_core.types.doc import ImageRefMode from huggingface_hub.errors import LocalEntryNotFoundError -from docutranslate.converter.x2md.interfaces import X2MarkdownConverter +from docutranslate.converter.x2md.base import X2MarkdownConverter, X2MarkdownConverterConfig from docutranslate.ir.document import Document from docutranslate.ir.markdown_document import MarkdownDocument -from docutranslate.logger import global_logger IMAGE_RESOLUTION_SCALE = 4 -@dataclass(frozen=True) -class ConverterDoclingConfig: +@dataclass(kw_only=True) +class ConverterDoclingConfig(X2MarkdownConverterConfig): code: bool = True formula: bool = True artifact: Path | None = None + def gethash(self): + return self.code,self.formula + class ConverterDocling(X2MarkdownConverter): - def __init__(self, config: ConverterDoclingConfig, logger: Logger = global_logger): - self.logger = logger - self.config = config + def __init__(self, config: ConverterDoclingConfig): + super().__init__(config=config) self.code = config.code self.formula = config.formula - artifact=Path("./docling_artifact") + artifact = Path("./docling_artifact") if artifact.is_dir(): self.logger.info("使用./docling_artifact的本地模型") - self.artifact=artifact + self.artifact = artifact else: - self.artifact=config.artifact - + self.artifact = config.artifact def convert(self, document) -> MarkdownDocument: assert isinstance(document.name, str) diff --git a/docutranslate/converter/x2md/converter_identity.py b/docutranslate/converter/x2md/converter_identity.py index e139f2a..96462bf 100644 --- a/docutranslate/converter/x2md/converter_identity.py +++ b/docutranslate/converter/x2md/converter_identity.py @@ -1,10 +1,10 @@ -from docutranslate.converter.x2md.interfaces import X2MarkdownConverter +from docutranslate.converter.x2md.base import X2MarkdownConverter from docutranslate.ir.document import Document from docutranslate.ir.markdown_document import MarkdownDocument class ConverterIdentity(X2MarkdownConverter): - + #TODO:支持markdown_zip格式输入 def convert(self, document: Document) -> MarkdownDocument: return MarkdownDocument.from_bytes(content=document.content, suffix=".md", stem=document.stem) diff --git a/docutranslate/converter/x2md/converter_mineru.py b/docutranslate/converter/x2md/converter_mineru.py index 18ec95d..820d89b 100644 --- a/docutranslate/converter/x2md/converter_mineru.py +++ b/docutranslate/converter/x2md/converter_mineru.py @@ -3,10 +3,11 @@ import time import zipfile from dataclasses import dataclass from logging import Logger +from typing import Hashable import httpx -from docutranslate.converter.x2md.interfaces import X2MarkdownConverter +from docutranslate.converter.x2md.base import X2MarkdownConverter, X2MarkdownConverterConfig from docutranslate.ir.document import Document from docutranslate.ir.markdown_document import MarkdownDocument from docutranslate.logger import global_logger @@ -15,11 +16,14 @@ from docutranslate.utils.markdown_utils import embed_inline_image_from_zip URL = 'https://mineru.net/api/v4/file-urls/batch' -@dataclass(frozen=True) -class ConverterMineruConfig: +@dataclass(kw_only=True) +class ConverterMineruConfig(X2MarkdownConverterConfig): mineru_token: str formula: bool = True + def gethash(self) ->Hashable: + return self.formula + timeout = httpx.Timeout( connect=5.0, # 连接超时 (建立连接的最长时间) @@ -34,7 +38,7 @@ client_async = httpx.AsyncClient(trust_env=False, timeout=timeout, proxy=None, v class ConverterMineru(X2MarkdownConverter): def __init__(self, config: ConverterMineruConfig, logger: Logger = global_logger): - self.config = config + super().__init__(config=config) self.mineru_token = config.mineru_token.strip() self.formula = config.formula self.logger = logger diff --git a/docutranslate/exporter/base.py b/docutranslate/exporter/base.py new file mode 100644 index 0000000..9d47bae --- /dev/null +++ b/docutranslate/exporter/base.py @@ -0,0 +1,20 @@ +from abc import ABC,abstractmethod +from typing import Generic,TypeVar, Any + +from dataclasses import dataclass + +from docutranslate.ir.document import Document + +D_in = TypeVar('D_in', bound=Document) + +@dataclass(kw_only=True) +class ExporterConfig: + ... + +class Exporter(ABC,Generic[D_in]): + def __init__(self,config:ExporterConfig|None=None): + self.config=config + + @abstractmethod + def export(self, document: D_in) -> Any: + ... diff --git a/docutranslate/exporter/export_config.py b/docutranslate/exporter/export_config.py deleted file mode 100644 index 315f4f7..0000000 --- a/docutranslate/exporter/export_config.py +++ /dev/null @@ -1,8 +0,0 @@ -from dataclasses import dataclass - -@dataclass -class ExportConfig: - pass - - - diff --git a/docutranslate/exporter/interfaces.py b/docutranslate/exporter/interfaces.py deleted file mode 100644 index 3854493..0000000 --- a/docutranslate/exporter/interfaces.py +++ /dev/null @@ -1,15 +0,0 @@ -from typing import Protocol, TypeVar, Any, Self - -from docutranslate.exporter.export_config import ExportConfig -from docutranslate.ir.document import Document - -D_in = TypeVar('D_in', bound=Document) - - -class Exporter(Protocol[D_in]): - @classmethod - def from_config(cls, export_config: ExportConfig | None = None) -> Self: - ... - - def export(self, document: D_in) -> Any: - ... diff --git a/docutranslate/exporter/md2x/__init__.py b/docutranslate/exporter/md/__init__.py similarity index 100% rename from docutranslate/exporter/md2x/__init__.py rename to docutranslate/exporter/md/__init__.py diff --git a/docutranslate/exporter/md/base.py b/docutranslate/exporter/md/base.py new file mode 100644 index 0000000..f47a41f --- /dev/null +++ b/docutranslate/exporter/md/base.py @@ -0,0 +1,18 @@ +from dataclasses import dataclass + +from docutranslate.exporter.base import Exporter, ExporterConfig +from docutranslate.ir.document import Document +from docutranslate.ir.markdown_document import MarkdownDocument + + +@dataclass(kw_only=True) +class MDExporterConfig(ExporterConfig): + ... + + +class MDExporter(Exporter): + def __init__(self, config: MDExporterConfig|None=None): + super().__init__(config=config) + + def export(self, document: MarkdownDocument) -> Document: + ... diff --git a/docutranslate/exporter/md2x/md2html_exporter.py b/docutranslate/exporter/md/md2html_exporter.py similarity index 93% rename from docutranslate/exporter/md2x/md2html_exporter.py rename to docutranslate/exporter/md/md2html_exporter.py index 9755e30..f45437f 100644 --- a/docutranslate/exporter/md2x/md2html_exporter.py +++ b/docutranslate/exporter/md/md2html_exporter.py @@ -3,20 +3,20 @@ from dataclasses import dataclass import jinja2 import markdown2 -from docutranslate.exporter.export_config import ExportConfig -from docutranslate.exporter.md2x.interfaces import MDExporter +from docutranslate.exporter.md.base import MDExporter, MDExporterConfig from docutranslate.ir.document import Document from docutranslate.ir.markdown_document import MarkdownDocument from docutranslate.utils.resource_utils import resource_path @dataclass -class MD2HTMLExportConfig(ExportConfig): +class MD2HTMLExporterConfig(MDExporterConfig): cdn: bool = True class MD2HTMLExporter(MDExporter): - def __init__(self, export_config: MD2HTMLExportConfig = None): - export_config = export_config or MD2HTMLExportConfig() - self.cdn=export_config.cdn + def __init__(self, config: MD2HTMLExporterConfig = None): + config = config or MD2HTMLExporterConfig() + super().__init__(config=config) + self.cdn=config.cdn def export(self, document: MarkdownDocument) -> Document: cdn = self.cdn diff --git a/docutranslate/exporter/md/md2md_exporter.py b/docutranslate/exporter/md/md2md_exporter.py new file mode 100644 index 0000000..74be502 --- /dev/null +++ b/docutranslate/exporter/md/md2md_exporter.py @@ -0,0 +1,8 @@ +from docutranslate.exporter.md.base import MDExporter +from docutranslate.ir.markdown_document import MarkdownDocument, Document + + +class MD2MDExporter(MDExporter): + + def export(self, document: MarkdownDocument) -> Document: + return Document.from_bytes(suffix=".md", content=document.content, stem=document.stem) diff --git a/docutranslate/exporter/md/md2mdzip_exporter.py b/docutranslate/exporter/md/md2mdzip_exporter.py new file mode 100644 index 0000000..517e2ab --- /dev/null +++ b/docutranslate/exporter/md/md2mdzip_exporter.py @@ -0,0 +1,11 @@ +from docutranslate.exporter.md.base import MDExporter +from docutranslate.ir.markdown_document import MarkdownDocument, Document +from docutranslate.utils.markdown_utils import unembed_base64_images_to_zip + + +class MD2MDZipExporter(MDExporter): + + def export(self, document: MarkdownDocument) -> Document: + return Document.from_bytes(suffix=".zip", content=unembed_base64_images_to_zip(document.content.decode(), + markdown_name=document.name), + stem=document.stem) diff --git a/docutranslate/exporter/md/types.py b/docutranslate/exporter/md/types.py new file mode 100644 index 0000000..56ab69b --- /dev/null +++ b/docutranslate/exporter/md/types.py @@ -0,0 +1,3 @@ +from typing import Literal + +ConvertEnginType = Literal["mineru", "docling"] \ No newline at end of file diff --git a/docutranslate/exporter/md2x/interfaces.py b/docutranslate/exporter/md2x/interfaces.py deleted file mode 100644 index 863c7ed..0000000 --- a/docutranslate/exporter/md2x/interfaces.py +++ /dev/null @@ -1,9 +0,0 @@ -from docutranslate.exporter.interfaces import Exporter -from docutranslate.ir.document import Document -from docutranslate.ir.markdown_document import MarkdownDocument - - -class MDExporter(Exporter): - - def export(self,document:MarkdownDocument)->Document: - ... diff --git a/docutranslate/exporter/md2x/md2md_exporter.py b/docutranslate/exporter/md2x/md2md_exporter.py deleted file mode 100644 index e03730c..0000000 --- a/docutranslate/exporter/md2x/md2md_exporter.py +++ /dev/null @@ -1,18 +0,0 @@ -from dataclasses import dataclass - -from docutranslate.exporter.export_config import ExportConfig -from docutranslate.exporter.md2x.interfaces import MDExporter -from docutranslate.ir.markdown_document import MarkdownDocument,Document - - -@dataclass -class MD2MDExportConfig(ExportConfig): - pass - - -class MD2MDExporter(MDExporter): - def __init__(self, export_config: MD2MDExportConfig | None=None): - pass - - def export(self,document:MarkdownDocument)->Document: - return Document.from_bytes(suffix=".md",content=document.content,stem=document.stem) diff --git a/docutranslate/exporter/md2x/md2mdzip_exporter.py b/docutranslate/exporter/md2x/md2mdzip_exporter.py deleted file mode 100644 index effb66a..0000000 --- a/docutranslate/exporter/md2x/md2mdzip_exporter.py +++ /dev/null @@ -1,21 +0,0 @@ -from dataclasses import dataclass - -from docutranslate.exporter.export_config import ExportConfig -from docutranslate.exporter.md2x.interfaces import MDExporter -from docutranslate.ir.markdown_document import MarkdownDocument,Document -from docutranslate.utils.markdown_utils import unembed_base64_images_to_zip - - -@dataclass -class MD2MDZIPExportConfig(ExportConfig): - pass - - -class MD2MDZipExporter(MDExporter): - def __init__(self, export_config: MD2MDZIPExportConfig | None=None): - pass - - def export(self,document:MarkdownDocument)->Document: - return Document.from_bytes(suffix=".zip",content=unembed_base64_images_to_zip(document.content.decode(), markdown_name=document.name),stem=document.stem) - - diff --git a/docutranslate/exporter/md2x/types.py b/docutranslate/exporter/md2x/types.py deleted file mode 100644 index a483241..0000000 --- a/docutranslate/exporter/md2x/types.py +++ /dev/null @@ -1,14 +0,0 @@ -from typing import Literal, TYPE_CHECKING - -from docutranslate.converter.x2md.converter_mineru import ConverterMineruConfig -from docutranslate.global_values.conditional_import import DOCLING_EXIST - -if DOCLING_EXIST or TYPE_CHECKING: - from docutranslate.converter.x2md.converter_docling import ConverterDoclingConfig - -convert_engin_type = Literal["mineru", "docling"] - -if DOCLING_EXIST or TYPE_CHECKING: - x2md_convert_config_type = ConverterDoclingConfig | ConverterMineruConfig -else: - x2md_convert_config_type = ConverterMineruConfig diff --git a/docutranslate/exporter/txt2x/__init__.py b/docutranslate/exporter/txt/__init__.py similarity index 100% rename from docutranslate/exporter/txt2x/__init__.py rename to docutranslate/exporter/txt/__init__.py diff --git a/docutranslate/exporter/txt2x/interfaces.py b/docutranslate/exporter/txt/base.py similarity index 66% rename from docutranslate/exporter/txt2x/interfaces.py rename to docutranslate/exporter/txt/base.py index fb29634..2b2a542 100644 --- a/docutranslate/exporter/txt2x/interfaces.py +++ b/docutranslate/exporter/txt/base.py @@ -1,8 +1,8 @@ -from docutranslate.exporter.interfaces import Exporter +from docutranslate.exporter.base import Exporter from docutranslate.ir.document import Document #TODO:看情况是否需要为TXT单独写一个document类型 -class TXTExporter(Exporter): +class TXTExporter(Exporter[Document]): def export(self,document:Document)->Document: ... \ No newline at end of file diff --git a/docutranslate/exporter/txt2x/txt2html_exporter.py b/docutranslate/exporter/txt/txt2html_exporter.py similarity index 76% rename from docutranslate/exporter/txt2x/txt2html_exporter.py rename to docutranslate/exporter/txt/txt2html_exporter.py index f9b7ccb..e251eca 100644 --- a/docutranslate/exporter/txt2x/txt2html_exporter.py +++ b/docutranslate/exporter/txt/txt2html_exporter.py @@ -2,21 +2,22 @@ from dataclasses import dataclass import jinja2 -from docutranslate.exporter.export_config import ExportConfig -from docutranslate.exporter.txt2x.interfaces import TXTExporter +from docutranslate.exporter.base import ExporterConfig +from docutranslate.exporter.txt.base import TXTExporter from docutranslate.ir.document import Document from docutranslate.utils.resource_utils import resource_path @dataclass -class TXT2HTMLExportConfig(ExportConfig): +class TXT2HTMLExporterConfig(ExporterConfig): cdn: bool = True class TXT2HTMLExporter(TXTExporter): - def __init__(self, export_config: TXT2HTMLExportConfig = None): - export_config = export_config or TXT2HTMLExportConfig() - self.cdn = export_config.cdn + def __init__(self, config: TXT2HTMLExporterConfig = None): + config = config or TXT2HTMLExporterConfig() + super().__init__(config=config) + self.cdn = config.cdn def export(self, document: Document) -> Document: cdn = self.cdn diff --git a/docutranslate/exporter/txt2x/txt2txt_exporter.py b/docutranslate/exporter/txt/txt2txt_exporter.py similarity index 58% rename from docutranslate/exporter/txt2x/txt2txt_exporter.py rename to docutranslate/exporter/txt/txt2txt_exporter.py index 9d46e29..9d940cf 100644 --- a/docutranslate/exporter/txt2x/txt2txt_exporter.py +++ b/docutranslate/exporter/txt/txt2txt_exporter.py @@ -1,10 +1,7 @@ -from docutranslate.exporter.txt2x.interfaces import TXTExporter +from docutranslate.exporter.txt.base import TXTExporter from docutranslate.ir.document import Document - - - class TXT2TXTExporter(TXTExporter): def export(self, document: Document) -> Document: - return document.copy() + return document.copy() diff --git a/docutranslate/translater/base.py b/docutranslate/translater/base.py deleted file mode 100644 index 97ff2e6..0000000 --- a/docutranslate/translater/base.py +++ /dev/null @@ -1,16 +0,0 @@ -from dataclasses import dataclass -from logging import Logger - - -@dataclass -class AiTranslateConfig: - base_url: str - api_key: str - model_id: str - to_lang: str - custom_prompt: str | None = None - temperature: float = 0.7 - timeout: int = 2000 - chunk_size: int = 3000 - concurrent: int = 30 - logger: Logger | None = None \ No newline at end of file diff --git a/docutranslate/translater/interfaces.py b/docutranslate/translater/interfaces.py deleted file mode 100644 index 656841c..0000000 --- a/docutranslate/translater/interfaces.py +++ /dev/null @@ -1,20 +0,0 @@ -from typing import Protocol, TypeVar - -from docutranslate.agents import Agent -from docutranslate.ir.document import Document - -T=TypeVar('T',bound=Document) -V=TypeVar('V',bound=Agent) - -class Translator(Protocol[T,V]): - """ - 翻译中间文本(原地替换),Translator不做格式转换 - """ - def translate(self, document:T) -> Document: - ... - - async def translate_async(self, document: T) -> Document: - ... - - def log(self,info:str): - ... \ No newline at end of file diff --git a/docutranslate/translater/__init__.py b/docutranslate/translator/__init__.py similarity index 100% rename from docutranslate/translater/__init__.py rename to docutranslate/translator/__init__.py diff --git a/docutranslate/translator/ai_translator/__init__.py b/docutranslate/translator/ai_translator/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/docutranslate/translator/ai_translator/base.py b/docutranslate/translator/ai_translator/base.py new file mode 100644 index 0000000..9cfea05 --- /dev/null +++ b/docutranslate/translator/ai_translator/base.py @@ -0,0 +1,35 @@ +from abc import abstractmethod +from dataclasses import dataclass +from logging import Logger +from typing import TypeVar + +from docutranslate.ir.document import Document +from docutranslate.translator.base import Translator, TranslatorConfig + + +@dataclass(kw_only=True) +class AiTranslatorConfig(TranslatorConfig): + base_url: str + api_key: str + model_id: str + to_lang: str + custom_prompt: str | None = None + temperature: float = 0.7 + timeout: int = 2000 + chunk_size: int = 3000 + concurrent: int = 30 + +T=TypeVar('T',bound=Document) + +class AiTranslator(Translator[T]): + """ + 翻译中间文本(原地替换),Translator不做格式转换 + """ + def __init__(self,config:AiTranslatorConfig,logger:Logger|None=None): + super().__init__(config=config,logger=logger) + @abstractmethod + def translate(self, document:T) -> Document: + ... + @abstractmethod + async def translate_async(self, document: T) -> Document: + ... \ No newline at end of file diff --git a/docutranslate/translater/md_translator.py b/docutranslate/translator/ai_translator/md_translator.py similarity index 90% rename from docutranslate/translater/md_translator.py rename to docutranslate/translator/ai_translator/md_translator.py index ea79691..80b07fe 100644 --- a/docutranslate/translater/md_translator.py +++ b/docutranslate/translator/ai_translator/md_translator.py @@ -5,22 +5,21 @@ from typing import Self from docutranslate.agents import MDTranslateAgent from docutranslate.context.md_mask_context import MDMaskUrisContext from docutranslate.ir.markdown_document import MarkdownDocument -from docutranslate.logger import global_logger -from docutranslate.translater.base import AiTranslateConfig -from docutranslate.translater.interfaces import Translator +from docutranslate.translator.ai_translator.base import AiTranslatorConfig +from docutranslate.translator.base import Translator from docutranslate.utils.markdown_splitter import split_markdown_text, join_markdown_texts from docutranslate.utils.markdown_utils import clean_markdown_math_block @dataclass -class MDTranslateConfig(AiTranslateConfig): +class MDTranslatorConfig(AiTranslatorConfig): ... class MDTranslator(Translator): - def __init__(self, config: MDTranslateConfig): - self.logger = config.logger or global_logger + def __init__(self, config: MDTranslatorConfig): + super().__init__(config=config) self.chunk_size = config.chunk_size self.translate_agent = MDTranslateAgent(custom_prompt=config.custom_prompt, to_lang=config.to_lang, diff --git a/docutranslate/translater/txt_translator.py b/docutranslate/translator/ai_translator/txt_translator.py similarity index 86% rename from docutranslate/translater/txt_translator.py rename to docutranslate/translator/ai_translator/txt_translator.py index 05200e4..128bf5f 100644 --- a/docutranslate/translater/txt_translator.py +++ b/docutranslate/translator/ai_translator/txt_translator.py @@ -3,20 +3,19 @@ from typing import Self from docutranslate.agents.txt_agent import TXTTranslateAgent from docutranslate.ir.document import Document -from docutranslate.logger import global_logger -from docutranslate.translater.base import AiTranslateConfig -from docutranslate.translater.interfaces import Translator +from docutranslate.translator.ai_translator.base import AiTranslatorConfig +from docutranslate.translator.base import Translator from docutranslate.utils.markdown_splitter import split_markdown_text @dataclass -class TXTTranslateConfig(AiTranslateConfig): +class TXTTranslatorConfig(AiTranslatorConfig): ... class TXTTranslator(Translator): - def __init__(self, config: TXTTranslateConfig): - self.logger = config.logger or global_logger + def __init__(self, config: TXTTranslatorConfig): + super().__init__(config=config) self.chunk_size = config.chunk_size self.translate_agent = TXTTranslateAgent(custom_prompt=config.custom_prompt, to_lang=config.to_lang, diff --git a/docutranslate/translator/base.py b/docutranslate/translator/base.py new file mode 100644 index 0000000..8c53f8f --- /dev/null +++ b/docutranslate/translator/base.py @@ -0,0 +1,27 @@ +from dataclasses import dataclass +from logging import Logger +from typing import TypeVar,Generic +from abc import ABC,abstractmethod +from docutranslate.ir.document import Document +from docutranslate.logger import global_logger + + +@dataclass(kw_only=True) +class TranslatorConfig: + logger:Logger|None=None + +T=TypeVar('T',bound=Document) + +class Translator(ABC,Generic[T]): + """ + 翻译中间文本(原地替换),Translator不做格式转换 + """ + def __init__(self,config:TranslatorConfig|None=None): + self.config=config + self.logger=config.logger or global_logger + @abstractmethod + def translate(self, document:T) -> Document: + ... + @abstractmethod + async def translate_async(self, document: T) -> Document: + ... \ No newline at end of file diff --git a/docutranslate/workflow/base_workflow.py b/docutranslate/workflow/base.py similarity index 83% rename from docutranslate/workflow/base_workflow.py rename to docutranslate/workflow/base.py index 3ce028f..85c9d2a 100644 --- a/docutranslate/workflow/base_workflow.py +++ b/docutranslate/workflow/base.py @@ -1,19 +1,27 @@ from abc import ABC, abstractmethod +from dataclasses import dataclass from logging import Logger from pathlib import Path from typing import Self, Generic, TypeVar -from docutranslate.exporter.interfaces import Exporter +from docutranslate.exporter.base import Exporter from docutranslate.ir.document import Document from docutranslate.logger import global_logger + +@dataclass(kw_only=True) +class WorkflowConfig: + logger: Logger | None = None + + +T_original = TypeVar('T_original', bound=Document) T_Translated = TypeVar('T_Translated', bound=Document) -class BaseWorkflow(ABC, Generic[T_Translated]): +class Workflow(ABC, Generic[T_original, T_Translated]): def __init__(self, logger: Logger = global_logger): self.logger = logger - self.document_original: Document | None = None + self.document_original: T_original | None = None self.document_translated: T_Translated | None = None def read_path(self, path: Path | str) -> Self: diff --git a/docutranslate/workflow/interfaces.py b/docutranslate/workflow/interfaces.py index 996d03d..884f645 100644 --- a/docutranslate/workflow/interfaces.py +++ b/docutranslate/workflow/interfaces.py @@ -1,9 +1,9 @@ from pathlib import Path from typing import Protocol, Self, TypeVar, runtime_checkable -from docutranslate.exporter.export_config import ExportConfig +from docutranslate.exporter.export_config import ExporterConfig -T = TypeVar("T", bound=ExportConfig) +T = TypeVar("T", bound=ExporterConfig) @runtime_checkable class HTMLExportable(Protocol[T]): diff --git a/docutranslate/workflow/md_based_workflow.py b/docutranslate/workflow/md_based_workflow.py index 6f482f2..ad5e27c 100644 --- a/docutranslate/workflow/md_based_workflow.py +++ b/docutranslate/workflow/md_based_workflow.py @@ -1,40 +1,84 @@ import asyncio +from dataclasses import dataclass +from logging import Logger from pathlib import Path -from typing import Self, Literal, overload, TYPE_CHECKING +from typing import Self, Tuple, Any from docutranslate.cacher import md_based_convert_cacher from docutranslate.global_values.conditional_import import DOCLING_EXIST -if DOCLING_EXIST or TYPE_CHECKING: +if DOCLING_EXIST: from docutranslate.converter.x2md.converter_docling import ConverterDoclingConfig, ConverterDocling from docutranslate.converter.x2md.converter_identity import ConverterIdentity from docutranslate.converter.x2md.converter_mineru import ConverterMineruConfig, ConverterMineru -from docutranslate.converter.x2md.interfaces import X2MarkdownConverter -from docutranslate.exporter.md2x.md2html_exporter import MD2HTMLExportConfig, MD2HTMLExporter -from docutranslate.exporter.md2x.md2md_exporter import MD2MDExportConfig, MD2MDExporter -from docutranslate.exporter.md2x.md2mdzip_exporter import MD2MDZIPExportConfig, MD2MDZipExporter -from docutranslate.exporter.md2x.types import x2md_convert_config_type, convert_engin_type -from docutranslate.workflow.base_workflow import BaseWorkflow +from docutranslate.converter.x2md.base import X2MarkdownConverterConfig +from docutranslate.exporter.md.md2html_exporter import MD2HTMLExporterConfig, MD2HTMLExporter +from docutranslate.exporter.md.md2md_exporter import MD2MDExporter +from docutranslate.exporter.md.md2mdzip_exporter import MD2MDZipExporter +from docutranslate.exporter.md.types import ConvertEnginType +from docutranslate.workflow.base import Workflow, WorkflowConfig from docutranslate.workflow.interfaces import MDFormatsExportable, HTMLExportable -from docutranslate.translater.md_translator import MDTranslateConfig, MDTranslator +from docutranslate.translator.ai_translator.md_translator import MDTranslatorConfig, MDTranslator -class MarkdownBasedWorkflow(BaseWorkflow, HTMLExportable, MDFormatsExportable): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) +@dataclass(kw_only=True) +class MarkdownBasedWorkflowConfig(WorkflowConfig): + # X2MarkdownConverterConfig + convert_engine: ConvertEnginType | None + formula: bool = True + # ConverterDoclingConfig + code: bool = True + artifact: Path | None = None + # ConverterMineruConfig + mineru_token: str + # MDTranslatorConfig + base_url: str + api_key: str + model_id: str + to_lang: str + custom_prompt: str | None = None + temperature: float = 0.7 + timeout: int = 2000 + chunk_size: int = 3000 + concurrent: int = 30 + # MD2HTMLExporterConfig + cdn: bool = True + # general + logger: Logger | None = None - if DOCLING_EXIST or TYPE_CHECKING: - self._converter_factory: dict[str:tuple[X2MarkdownConverter, x2md_convert_config_type]] = { - "mineru": (ConverterMineru, ConverterMineruConfig), - "docling": (ConverterDocling, ConverterDoclingConfig) - } - else: - self._converter_factory: dict[str:tuple[X2MarkdownConverter, x2md_convert_config_type]] = { - "mineru": (ConverterMineru, ConverterMineruConfig), - } - def _get_document_md(self, convert_engin: convert_engin_type | None, - convert_config: x2md_convert_config_type | None): +class MarkdownBasedWorkflow(Workflow, HTMLExportable, MDFormatsExportable): + def __init__(self, config: MarkdownBasedWorkflowConfig): + super().__init__(config=config) + self._converter_factory: dict[ConvertEnginType, Tuple[Any, Any]] = { + "mineru": (ConverterMineru, ConverterMineruConfig), + } + if DOCLING_EXIST: + self._converter_factory["docling"] = (ConverterDocling, ConverterDoclingConfig) + self.x2markdown_converter_config:X2MarkdownConverterConfig|None + if config.convert_engine is None: + self.converter_config=None + elif config.convert_engine== "mineru": + self.converter_config = ConverterMineruConfig(formula=config.formula, + mineru_token=config.mineru_token) + elif DOCLING_EXIST and config.convert_engine== "docling": + self.converter_config = ConverterDoclingConfig(code=config.code, + formula=config.formula, + artifact=config.artifact) + self.translator_config = MDTranslatorConfig(base_url=config.base_url, + api_key=config.api_key, + model_id=config.model_id, + to_lang=config.to_lang, + custom_prompt=config.custom_prompt, + temperature=config.temperature, + timeout=config.timeout, + chunk_size=config.chunk_size, + concurrent=config.concurrent, + ) + self.md2html_exporter_config = MD2HTMLExporterConfig(cdn=config.cdn) + self.convert_engine=config.convert_engine + + def _get_document_md(self,convert_engin:ConvertEnginType|None,convert_config:X2MarkdownConverterConfig): if self.document_original is None: raise RuntimeError("file has not been read yet. Call read_path or read_bytes first.") # 获取缓存的解析后文件 @@ -51,7 +95,7 @@ class MarkdownBasedWorkflow(BaseWorkflow, HTMLExportable, MDFormatsExportable): if not isinstance(convert_config, config_class): raise TypeError( f"未传入正确的convert_config,应传入{config_class.__name__}类型,现为{type(convert_config).__name__}类型") - converter = converter_class(convert_config, logger=self.logger) + converter = converter_class(convert_config) else: raise ValueError(f"不存在{convert_engin}解析引擎") document_md = converter.convert(self.document_original) @@ -59,67 +103,54 @@ class MarkdownBasedWorkflow(BaseWorkflow, HTMLExportable, MDFormatsExportable): md_based_convert_cacher.cache_result(document_md, self.document_original, convert_engin, convert_config) return document_md - @overload - def translate(self, convert_engin: None, - convert_config: x2md_convert_config_type | None, translate_config: MDTranslateConfig) -> Self: - ... - @overload - def translate(self, convert_engin: Literal["docling"], - convert_config: "ConverterDoclingConfig", translate_config: MDTranslateConfig) -> Self: - ... - - @overload - def translate(self, convert_engin: Literal["mineru"], - convert_config: ConverterMineruConfig, translate_config: MDTranslateConfig) -> Self: - ... - - def translate(self, convert_engin: convert_engin_type | None, - convert_config: x2md_convert_config_type | None, - translate_config: MDTranslateConfig) -> Self: - document_md = self._get_document_md(convert_engin, convert_config) + def translate(self) -> Self: + convert_engin,convert_config=self.convert_engine,self.converter_config + translator_config=self.translator_config + document_md = self._get_document_md(convert_engin,convert_config) # 翻译解析后文件 - translator = MDTranslator(translate_config) + translator = MDTranslator(translator_config) translator.translate(document_md) self.document_translated = document_md return self - async def translate_async(self, convert_engin: Literal["mineru", "docling"] | None, - convert_config: x2md_convert_config_type | None, - translate_config: MDTranslateConfig) -> Self: - + async def translate_async(self) -> Self: + convert_engin,convert_config=self.convert_engine,self.converter_config + translator_config=self.translator_config document_md = await asyncio.to_thread(self._get_document_md, convert_engin, convert_config) # 翻译解析后文件 - translator = MDTranslator(translate_config) + translator = MDTranslator(translator_config) await translator.translate_async(document_md) self.document_translated = document_md return self - def export_to_html(self, export_config: MD2HTMLExportConfig | None = None) -> str: + def export_to_html(self, export_config: MD2HTMLExporterConfig | None = None) -> str: + export_config=export_config or self.md2html_exporter_config docu = self._export(MD2HTMLExporter(export_config)) return docu.content.decode() - def export_to_markdown(self, export_config: MD2MDExportConfig | None = None) -> str: + def export_to_markdown(self, export_config: X2MarkdownConverterConfig | None = None) -> str: docu = self._export(MD2MDExporter()) return docu.content.decode() - def export_to_markdown_zip(self, export_config: MD2MDZIPExportConfig | None = None) -> bytes: + def export_to_markdown_zip(self, export_config: X2MarkdownConverterConfig | None = None) -> bytes: docu = self._export(MD2MDZipExporter()) return docu.content def save_as_html(self, name: str = None, output_dir: Path | str = "./output", - export_config: MD2HTMLExportConfig | None = None) -> Self: - self._save(exporter=MD2HTMLExporter(), name=name, output_dir=output_dir) + export_config: MD2HTMLExporterConfig | None = None) -> Self: + export_config = export_config or self.md2html_exporter_config + self._save(exporter=MD2HTMLExporter(config=export_config), name=name, output_dir=output_dir) return self def save_as_markdown(self, name: str = None, output_dir: Path | str = "./output", - export_config: MD2MDExportConfig | None = None) -> Self: + export_config=None) -> Self: self._save(exporter=MD2MDExporter(), name=name, output_dir=output_dir) return self def save_as_markdown_zip(self, name: str = None, output_dir: Path | str = "./output", - export_config: MD2MDZIPExportConfig | None = None) -> Self: + export_config=None) -> Self: self._save(exporter=MD2MDZipExporter(), name=name, output_dir=output_dir) return self diff --git a/docutranslate/workflow/txt_workflow.py b/docutranslate/workflow/txt_workflow.py index 0a1014d..4def08e 100644 --- a/docutranslate/workflow/txt_workflow.py +++ b/docutranslate/workflow/txt_workflow.py @@ -1,17 +1,17 @@ from pathlib import Path from typing import Self -from docutranslate.exporter.txt2x.txt2html_exporter import TXT2HTMLExportConfig, TXT2HTMLExporter -from docutranslate.exporter.txt2x.txt2txt_exporter import TXT2TXTExporter -from docutranslate.workflow.base_workflow import BaseWorkflow +from docutranslate.exporter.txt.txt2html_exporter import TXT2HTMLExporterConfig, TXT2HTMLExporter +from docutranslate.exporter.txt.txt2txt_exporter import TXT2TXTExporter +from docutranslate.workflow.base import Workflow from docutranslate.workflow.interfaces import HTMLExportable, TXTExportable -from docutranslate.translater.txt_translator import TXTTranslateConfig, TXTTranslator +from docutranslate.translator.ai_translator.txt_translator import TXTTranslatorConfig, TXTTranslator -class TXTWorkflow(BaseWorkflow, HTMLExportable, TXTExportable): +class TXTWorkflow(Workflow, HTMLExportable, TXTExportable): - def translate(self, translate_config: TXTTranslateConfig) -> Self: + def translate(self, translate_config: TXTTranslatorConfig) -> Self: document = self.document_original.copy() # 翻译解析后文件 translator = TXTTranslator(translate_config) @@ -19,7 +19,7 @@ class TXTWorkflow(BaseWorkflow, HTMLExportable, TXTExportable): self.document_translated = document return self - async def translate_async(self, translate_config: TXTTranslateConfig) -> Self: + async def translate_async(self, translate_config: TXTTranslatorConfig) -> Self: document = self.document_original.copy() # 翻译解析后文件 translator = TXTTranslator(translate_config) @@ -27,7 +27,7 @@ class TXTWorkflow(BaseWorkflow, HTMLExportable, TXTExportable): self.document_translated = document return self - def export_to_html(self, export_config: TXT2HTMLExportConfig=None) -> str: + def export_to_html(self, export_config: TXT2HTMLExporterConfig=None) -> str: docu = self._export(TXT2HTMLExporter(export_config)) return docu.content.decode() @@ -36,7 +36,7 @@ class TXTWorkflow(BaseWorkflow, HTMLExportable, TXTExportable): return docu.content.decode() def save_as_html(self, name: str = None, output_dir: Path | str = "./output", - export_config: TXT2HTMLExportConfig | None = None) -> Self: + export_config: TXT2HTMLExporterConfig | None = None) -> Self: self._save(exporter=TXT2HTMLExporter(export_config), name=name, output_dir=output_dir) return self