重构workflow

This commit is contained in:
xunbu
2025-07-30 20:48:11 +08:00
parent 8987e4ef60
commit d25f634e73
38 changed files with 351 additions and 286 deletions

View File

@@ -1,19 +1,27 @@
from abc import ABC, abstractmethod
from dataclasses import dataclass
from logging import Logger
from pathlib import Path
from typing import Self, Generic, TypeVar
from docutranslate.exporter.interfaces import Exporter
from docutranslate.exporter.base import Exporter
from docutranslate.ir.document import Document
from docutranslate.logger import global_logger
@dataclass(kw_only=True)
class WorkflowConfig:
logger: Logger | None = None
T_original = TypeVar('T_original', bound=Document)
T_Translated = TypeVar('T_Translated', bound=Document)
class BaseWorkflow(ABC, Generic[T_Translated]):
class Workflow(ABC, Generic[T_original, T_Translated]):
def __init__(self, logger: Logger = global_logger):
self.logger = logger
self.document_original: Document | None = None
self.document_original: T_original | None = None
self.document_translated: T_Translated | None = None
def read_path(self, path: Path | str) -> Self:

View File

@@ -1,9 +1,9 @@
from pathlib import Path
from typing import Protocol, Self, TypeVar, runtime_checkable
from docutranslate.exporter.export_config import ExportConfig
from docutranslate.exporter.export_config import ExporterConfig
T = TypeVar("T", bound=ExportConfig)
T = TypeVar("T", bound=ExporterConfig)
@runtime_checkable
class HTMLExportable(Protocol[T]):

View File

@@ -1,40 +1,84 @@
import asyncio
from dataclasses import dataclass
from logging import Logger
from pathlib import Path
from typing import Self, Literal, overload, TYPE_CHECKING
from typing import Self, Tuple, Any
from docutranslate.cacher import md_based_convert_cacher
from docutranslate.global_values.conditional_import import DOCLING_EXIST
if DOCLING_EXIST or TYPE_CHECKING:
if DOCLING_EXIST:
from docutranslate.converter.x2md.converter_docling import ConverterDoclingConfig, ConverterDocling
from docutranslate.converter.x2md.converter_identity import ConverterIdentity
from docutranslate.converter.x2md.converter_mineru import ConverterMineruConfig, ConverterMineru
from docutranslate.converter.x2md.interfaces import X2MarkdownConverter
from docutranslate.exporter.md2x.md2html_exporter import MD2HTMLExportConfig, MD2HTMLExporter
from docutranslate.exporter.md2x.md2md_exporter import MD2MDExportConfig, MD2MDExporter
from docutranslate.exporter.md2x.md2mdzip_exporter import MD2MDZIPExportConfig, MD2MDZipExporter
from docutranslate.exporter.md2x.types import x2md_convert_config_type, convert_engin_type
from docutranslate.workflow.base_workflow import BaseWorkflow
from docutranslate.converter.x2md.base import X2MarkdownConverterConfig
from docutranslate.exporter.md.md2html_exporter import MD2HTMLExporterConfig, MD2HTMLExporter
from docutranslate.exporter.md.md2md_exporter import MD2MDExporter
from docutranslate.exporter.md.md2mdzip_exporter import MD2MDZipExporter
from docutranslate.exporter.md.types import ConvertEnginType
from docutranslate.workflow.base import Workflow, WorkflowConfig
from docutranslate.workflow.interfaces import MDFormatsExportable, HTMLExportable
from docutranslate.translater.md_translator import MDTranslateConfig, MDTranslator
from docutranslate.translator.ai_translator.md_translator import MDTranslatorConfig, MDTranslator
class MarkdownBasedWorkflow(BaseWorkflow, HTMLExportable, MDFormatsExportable):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
@dataclass(kw_only=True)
class MarkdownBasedWorkflowConfig(WorkflowConfig):
    """Flat, keyword-only settings for a Markdown-based workflow.

    The fields are grouped by the per-stage configuration object they are
    meant to populate (converter, translator, HTML exporter). Fields without
    a default are required at construction time.
    """

    # --- X2MarkdownConverterConfig ---
    # Conversion engine name; ``None`` means no converter config is built.
    convert_engine: ConvertEnginType | None
    formula: bool = True

    # --- ConverterDoclingConfig ---
    code: bool = True
    artifact: Path | None = None

    # --- ConverterMineruConfig ---
    # NOTE(review): required even when another engine is selected — confirm
    # this is intended.
    mineru_token: str

    # --- MDTranslatorConfig ---
    base_url: str
    api_key: str
    model_id: str
    to_lang: str
    custom_prompt: str | None = None
    temperature: float = 0.7
    timeout: int = 2000
    chunk_size: int = 3000
    concurrent: int = 30

    # --- MD2HTMLExporterConfig ---
    cdn: bool = True

    # --- general ---
    # NOTE(review): presumably mirrors the ``logger`` field of the base
    # ``WorkflowConfig``; verify the redeclaration is still needed.
    logger: Logger | None = None
if DOCLING_EXIST or TYPE_CHECKING:
self._converter_factory: dict[str:tuple[X2MarkdownConverter, x2md_convert_config_type]] = {
"mineru": (ConverterMineru, ConverterMineruConfig),
"docling": (ConverterDocling, ConverterDoclingConfig)
}
else:
self._converter_factory: dict[str:tuple[X2MarkdownConverter, x2md_convert_config_type]] = {
"mineru": (ConverterMineru, ConverterMineruConfig),
}
def _get_document_md(self, convert_engin: convert_engin_type | None,
convert_config: x2md_convert_config_type | None):
class MarkdownBasedWorkflow(Workflow, HTMLExportable, MDFormatsExportable):
def __init__(self, config: MarkdownBasedWorkflowConfig):
    """Derive the per-stage configs from one flat workflow config.

    Args:
        config: Flat settings from which the converter, translator and HTML
            exporter configs are built.

    After construction ``self.converter_config`` is always defined. It is
    ``None`` when ``config.convert_engine`` is ``None``, names an engine that
    is not handled here, or is ``"docling"`` while docling is unavailable.
    """
    # NOTE(review): assumes the base ``Workflow.__init__`` accepts a
    # ``config`` keyword — confirm against the base class.
    super().__init__(config=config)

    # Engine name -> (converter class, config class). Docling is registered
    # only when the optional dependency was importable.
    self._converter_factory: dict[ConvertEnginType, Tuple[Any, Any]] = {
        "mineru": (ConverterMineru, ConverterMineruConfig),
    }
    if DOCLING_EXIST:
        self._converter_factory["docling"] = (ConverterDocling, ConverterDoclingConfig)

    self.convert_engine = config.convert_engine

    # Default to None so the attribute exists on every path; previously an
    # unmatched engine left it unset and translate() hit AttributeError.
    self.converter_config: X2MarkdownConverterConfig | None = None
    if config.convert_engine == "mineru":
        self.converter_config = ConverterMineruConfig(
            formula=config.formula,
            mineru_token=config.mineru_token,
        )
    elif DOCLING_EXIST and config.convert_engine == "docling":
        self.converter_config = ConverterDoclingConfig(
            code=config.code,
            formula=config.formula,
            artifact=config.artifact,
        )

    self.translator_config = MDTranslatorConfig(
        base_url=config.base_url,
        api_key=config.api_key,
        model_id=config.model_id,
        to_lang=config.to_lang,
        custom_prompt=config.custom_prompt,
        temperature=config.temperature,
        timeout=config.timeout,
        chunk_size=config.chunk_size,
        concurrent=config.concurrent,
    )
    self.md2html_exporter_config = MD2HTMLExporterConfig(cdn=config.cdn)
def _get_document_md(self,convert_engin:ConvertEnginType|None,convert_config:X2MarkdownConverterConfig):
if self.document_original is None:
raise RuntimeError("file has not been read yet. Call read_path or read_bytes first.")
# 获取缓存的解析后文件
@@ -51,7 +95,7 @@ class MarkdownBasedWorkflow(BaseWorkflow, HTMLExportable, MDFormatsExportable):
if not isinstance(convert_config, config_class):
raise TypeError(
f"未传入正确的convert_config应传入{config_class.__name__}类型,现为{type(convert_config).__name__}类型")
converter = converter_class(convert_config, logger=self.logger)
converter = converter_class(convert_config)
else:
raise ValueError(f"不存在{convert_engin}解析引擎")
document_md = converter.convert(self.document_original)
@@ -59,67 +103,54 @@ class MarkdownBasedWorkflow(BaseWorkflow, HTMLExportable, MDFormatsExportable):
md_based_convert_cacher.cache_result(document_md, self.document_original, convert_engin, convert_config)
return document_md
def translate(self) -> Self:
    """Convert the loaded document to Markdown and translate it.

    The engine, converter config and translator config are taken from the
    attributes set in ``__init__`` rather than from call arguments.

    Returns:
        This workflow, with ``document_translated`` set to the translated
        Markdown document.
    """
    engine = self.convert_engine
    converter_config = self.converter_config
    translator_config = self.translator_config

    document_md = self._get_document_md(engine, converter_config)

    # Translate the parsed document.
    MDTranslator(translator_config).translate(document_md)
    self.document_translated = document_md
    return self
async def translate_async(self) -> Self:
    """Asynchronous counterpart of ``translate``.

    The Markdown conversion is dispatched through ``asyncio.to_thread`` and
    the translation itself is awaited.

    Returns:
        This workflow, with ``document_translated`` set to the translated
        Markdown document.
    """
    engine = self.convert_engine
    converter_config = self.converter_config
    translator_config = self.translator_config

    document_md = await asyncio.to_thread(self._get_document_md, engine, converter_config)

    # Translate the parsed document.
    await MDTranslator(translator_config).translate_async(document_md)
    self.document_translated = document_md
    return self
def export_to_html(self, export_config: MD2HTMLExporterConfig | None = None) -> str:
    """Export the document as an HTML string.

    Args:
        export_config: Exporter settings; when falsy, the config built in
            ``__init__`` (``self.md2html_exporter_config``) is used.

    Returns:
        The exported content decoded to ``str``.
    """
    effective_config = export_config or self.md2html_exporter_config
    exported = self._export(MD2HTMLExporter(effective_config))
    return exported.content.decode()
def export_to_markdown(self, export_config: object | None = None) -> str:
    """Export the document as a Markdown string.

    Args:
        export_config: Accepted for signature compatibility; this method
            does not read it.

    Returns:
        The exported content decoded to ``str``.
    """
    exported = self._export(MD2MDExporter())
    return exported.content.decode()
def export_to_markdown_zip(self, export_config: object | None = None) -> bytes:
    """Export the document through ``MD2MDZipExporter``.

    Args:
        export_config: Accepted for signature compatibility; this method
            does not read it.

    Returns:
        The exported content, returned undecoded.
    """
    exported = self._export(MD2MDZipExporter())
    return exported.content
def save_as_html(self, name: str | None = None, output_dir: Path | str = "./output",
                 export_config: MD2HTMLExporterConfig | None = None) -> Self:
    """Save the document as HTML via ``self._save``.

    Args:
        name: Forwarded to ``_save`` unchanged.
        output_dir: Forwarded to ``_save`` unchanged.
        export_config: Exporter settings; when falsy, the config built in
            ``__init__`` (``self.md2html_exporter_config``) is used.

    Returns:
        This workflow, to allow call chaining.
    """
    effective_config = export_config or self.md2html_exporter_config
    html_exporter = MD2HTMLExporter(config=effective_config)
    self._save(exporter=html_exporter, name=name, output_dir=output_dir)
    return self
def save_as_markdown(self, name: str | None = None, output_dir: Path | str = "./output",
                     export_config: object | None = None) -> Self:
    """Save the document as Markdown via ``self._save``.

    Args:
        name: Forwarded to ``_save`` unchanged.
        output_dir: Forwarded to ``_save`` unchanged.
        export_config: Accepted for signature compatibility; this method
            does not read it.

    Returns:
        This workflow, to allow call chaining.
    """
    markdown_exporter = MD2MDExporter()
    self._save(exporter=markdown_exporter, name=name, output_dir=output_dir)
    return self
def save_as_markdown_zip(self, name: str | None = None, output_dir: Path | str = "./output",
                         export_config: object | None = None) -> Self:
    """Save the document through ``MD2MDZipExporter`` via ``self._save``.

    Args:
        name: Forwarded to ``_save`` unchanged.
        output_dir: Forwarded to ``_save`` unchanged.
        export_config: Accepted for signature compatibility; this method
            does not read it.

    Returns:
        This workflow, to allow call chaining.
    """
    zip_exporter = MD2MDZipExporter()
    self._save(exporter=zip_exporter, name=name, output_dir=output_dir)
    return self

View File

@@ -1,17 +1,17 @@
from pathlib import Path
from typing import Self
from docutranslate.exporter.txt2x.txt2html_exporter import TXT2HTMLExportConfig, TXT2HTMLExporter
from docutranslate.exporter.txt2x.txt2txt_exporter import TXT2TXTExporter
from docutranslate.workflow.base_workflow import BaseWorkflow
from docutranslate.exporter.txt.txt2html_exporter import TXT2HTMLExporterConfig, TXT2HTMLExporter
from docutranslate.exporter.txt.txt2txt_exporter import TXT2TXTExporter
from docutranslate.workflow.base import Workflow
from docutranslate.workflow.interfaces import HTMLExportable, TXTExportable
from docutranslate.translater.txt_translator import TXTTranslateConfig, TXTTranslator
from docutranslate.translator.ai_translator.txt_translator import TXTTranslatorConfig, TXTTranslator
class TXTWorkflow(BaseWorkflow, HTMLExportable, TXTExportable):
class TXTWorkflow(Workflow, HTMLExportable, TXTExportable):
def translate(self, translate_config: TXTTranslateConfig) -> Self:
def translate(self, translate_config: TXTTranslatorConfig) -> Self:
document = self.document_original.copy()
# 翻译解析后文件
translator = TXTTranslator(translate_config)
@@ -19,7 +19,7 @@ class TXTWorkflow(BaseWorkflow, HTMLExportable, TXTExportable):
self.document_translated = document
return self
async def translate_async(self, translate_config: TXTTranslateConfig) -> Self:
async def translate_async(self, translate_config: TXTTranslatorConfig) -> Self:
document = self.document_original.copy()
# 翻译解析后文件
translator = TXTTranslator(translate_config)
@@ -27,7 +27,7 @@ class TXTWorkflow(BaseWorkflow, HTMLExportable, TXTExportable):
self.document_translated = document
return self
def export_to_html(self, export_config: TXT2HTMLExportConfig=None) -> str:
def export_to_html(self, export_config: TXT2HTMLExporterConfig=None) -> str:
docu = self._export(TXT2HTMLExporter(export_config))
return docu.content.decode()
@@ -36,7 +36,7 @@ class TXTWorkflow(BaseWorkflow, HTMLExportable, TXTExportable):
return docu.content.decode()
def save_as_html(self, name: str = None, output_dir: Path | str = "./output",
export_config: TXT2HTMLExportConfig | None = None) -> Self:
export_config: TXT2HTMLExporterConfig | None = None) -> Self:
self._save(exporter=TXT2HTMLExporter(export_config), name=name, output_dir=output_dir)
return self