重构workflow
This commit is contained in:
@@ -1,19 +1,27 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass
|
||||
from logging import Logger
|
||||
from pathlib import Path
|
||||
from typing import Self, Generic, TypeVar
|
||||
|
||||
from docutranslate.exporter.interfaces import Exporter
|
||||
from docutranslate.exporter.base import Exporter
|
||||
from docutranslate.ir.document import Document
|
||||
from docutranslate.logger import global_logger
|
||||
|
||||
|
||||
@dataclass(kw_only=True)
|
||||
class WorkflowConfig:
|
||||
logger: Logger | None = None
|
||||
|
||||
|
||||
T_original = TypeVar('T_original', bound=Document)
|
||||
T_Translated = TypeVar('T_Translated', bound=Document)
|
||||
|
||||
|
||||
class BaseWorkflow(ABC, Generic[T_Translated]):
|
||||
class Workflow(ABC, Generic[T_original, T_Translated]):
|
||||
def __init__(self, logger: Logger = global_logger):
|
||||
self.logger = logger
|
||||
self.document_original: Document | None = None
|
||||
self.document_original: T_original | None = None
|
||||
self.document_translated: T_Translated | None = None
|
||||
|
||||
def read_path(self, path: Path | str) -> Self:
|
||||
@@ -1,9 +1,9 @@
|
||||
from pathlib import Path
|
||||
from typing import Protocol, Self, TypeVar, runtime_checkable
|
||||
|
||||
from docutranslate.exporter.export_config import ExportConfig
|
||||
from docutranslate.exporter.export_config import ExporterConfig
|
||||
|
||||
T = TypeVar("T", bound=ExportConfig)
|
||||
T = TypeVar("T", bound=ExporterConfig)
|
||||
|
||||
@runtime_checkable
|
||||
class HTMLExportable(Protocol[T]):
|
||||
|
||||
@@ -1,40 +1,84 @@
|
||||
import asyncio
|
||||
from dataclasses import dataclass
|
||||
from logging import Logger
|
||||
from pathlib import Path
|
||||
from typing import Self, Literal, overload, TYPE_CHECKING
|
||||
from typing import Self, Tuple, Any
|
||||
|
||||
from docutranslate.cacher import md_based_convert_cacher
|
||||
from docutranslate.global_values.conditional_import import DOCLING_EXIST
|
||||
|
||||
if DOCLING_EXIST or TYPE_CHECKING:
|
||||
if DOCLING_EXIST:
|
||||
from docutranslate.converter.x2md.converter_docling import ConverterDoclingConfig, ConverterDocling
|
||||
from docutranslate.converter.x2md.converter_identity import ConverterIdentity
|
||||
from docutranslate.converter.x2md.converter_mineru import ConverterMineruConfig, ConverterMineru
|
||||
from docutranslate.converter.x2md.interfaces import X2MarkdownConverter
|
||||
from docutranslate.exporter.md2x.md2html_exporter import MD2HTMLExportConfig, MD2HTMLExporter
|
||||
from docutranslate.exporter.md2x.md2md_exporter import MD2MDExportConfig, MD2MDExporter
|
||||
from docutranslate.exporter.md2x.md2mdzip_exporter import MD2MDZIPExportConfig, MD2MDZipExporter
|
||||
from docutranslate.exporter.md2x.types import x2md_convert_config_type, convert_engin_type
|
||||
from docutranslate.workflow.base_workflow import BaseWorkflow
|
||||
from docutranslate.converter.x2md.base import X2MarkdownConverterConfig
|
||||
from docutranslate.exporter.md.md2html_exporter import MD2HTMLExporterConfig, MD2HTMLExporter
|
||||
from docutranslate.exporter.md.md2md_exporter import MD2MDExporter
|
||||
from docutranslate.exporter.md.md2mdzip_exporter import MD2MDZipExporter
|
||||
from docutranslate.exporter.md.types import ConvertEnginType
|
||||
from docutranslate.workflow.base import Workflow, WorkflowConfig
|
||||
from docutranslate.workflow.interfaces import MDFormatsExportable, HTMLExportable
|
||||
from docutranslate.translater.md_translator import MDTranslateConfig, MDTranslator
|
||||
from docutranslate.translator.ai_translator.md_translator import MDTranslatorConfig, MDTranslator
|
||||
|
||||
|
||||
class MarkdownBasedWorkflow(BaseWorkflow, HTMLExportable, MDFormatsExportable):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
@dataclass(kw_only=True)
class MarkdownBasedWorkflowConfig(WorkflowConfig):
    """Flat, keyword-only settings for the markdown-based workflow.

    Fields are grouped by the component configuration they belong to.
    """

    # --- X2MarkdownConverterConfig ---
    # Required; may be None.
    convert_engine: ConvertEnginType | None
    formula: bool = True

    # --- ConverterDoclingConfig ---
    code: bool = True
    artifact: Path | None = None

    # --- ConverterMineruConfig ---
    # Required (no default).
    mineru_token: str

    # --- MDTranslatorConfig ---
    # The first four translator fields are required (no defaults).
    base_url: str
    api_key: str
    model_id: str
    to_lang: str
    custom_prompt: str | None = None
    temperature: float = 0.7
    timeout: int = 2000
    chunk_size: int = 3000
    concurrent: int = 30

    # --- MD2HTMLExporterConfig ---
    cdn: bool = True

    # --- general ---
    # Re-declares the field with the same name, type and default as WorkflowConfig.
    logger: Logger | None = None
|
||||
|
||||
if DOCLING_EXIST or TYPE_CHECKING:
|
||||
self._converter_factory: dict[str:tuple[X2MarkdownConverter, x2md_convert_config_type]] = {
|
||||
"mineru": (ConverterMineru, ConverterMineruConfig),
|
||||
"docling": (ConverterDocling, ConverterDoclingConfig)
|
||||
}
|
||||
else:
|
||||
self._converter_factory: dict[str:tuple[X2MarkdownConverter, x2md_convert_config_type]] = {
|
||||
"mineru": (ConverterMineru, ConverterMineruConfig),
|
||||
}
|
||||
|
||||
def _get_document_md(self, convert_engin: convert_engin_type | None,
|
||||
convert_config: x2md_convert_config_type | None):
|
||||
class MarkdownBasedWorkflow(Workflow, HTMLExportable, MDFormatsExportable):
|
||||
def __init__(self, config: MarkdownBasedWorkflowConfig):
    """Split one flat workflow config into per-component configs.

    Builds the converter config selected by ``config.convert_engine``,
    the ``MDTranslatorConfig`` and the ``MD2HTMLExporterConfig``, and
    stores them on the instance for ``translate`` / ``export_*`` to use.

    Args:
        config: Flat settings for conversion, translation and HTML export.
    """
    # NOTE(review): the base ``Workflow.__init__`` shown in this change
    # takes ``logger`` rather than ``config`` — confirm it accepts this keyword.
    super().__init__(config=config)

    # Engine name -> (converter class, converter config class).
    self._converter_factory: dict[ConvertEnginType, Tuple[Any, Any]] = {
        "mineru": (ConverterMineru, ConverterMineruConfig),
    }
    if DOCLING_EXIST:
        # Docling is optional; register it only when it is importable.
        self._converter_factory["docling"] = (ConverterDocling, ConverterDoclingConfig)

    # Always define the attribute. The original annotated a differently
    # named attribute and left ``converter_config`` unset when no branch
    # matched (e.g. "docling" without docling installed), so ``translate``
    # failed with AttributeError. Defaulting to None leaves any engine
    # validation to the conversion step.
    self.converter_config: X2MarkdownConverterConfig | None = None
    if config.convert_engine == "mineru":
        self.converter_config = ConverterMineruConfig(
            formula=config.formula,
            mineru_token=config.mineru_token,
        )
    elif DOCLING_EXIST and config.convert_engine == "docling":
        self.converter_config = ConverterDoclingConfig(
            code=config.code,
            formula=config.formula,
            artifact=config.artifact,
        )

    self.translator_config = MDTranslatorConfig(
        base_url=config.base_url,
        api_key=config.api_key,
        model_id=config.model_id,
        to_lang=config.to_lang,
        custom_prompt=config.custom_prompt,
        temperature=config.temperature,
        timeout=config.timeout,
        chunk_size=config.chunk_size,
        concurrent=config.concurrent,
    )
    self.md2html_exporter_config = MD2HTMLExporterConfig(cdn=config.cdn)
    self.convert_engine = config.convert_engine
|
||||
|
||||
def _get_document_md(self,convert_engin:ConvertEnginType|None,convert_config:X2MarkdownConverterConfig):
|
||||
if self.document_original is None:
|
||||
raise RuntimeError("file has not been read yet. Call read_path or read_bytes first.")
|
||||
# 获取缓存的解析后文件
|
||||
@@ -51,7 +95,7 @@ class MarkdownBasedWorkflow(BaseWorkflow, HTMLExportable, MDFormatsExportable):
|
||||
if not isinstance(convert_config, config_class):
|
||||
raise TypeError(
|
||||
f"未传入正确的convert_config,应传入{config_class.__name__}类型,现为{type(convert_config).__name__}类型")
|
||||
converter = converter_class(convert_config, logger=self.logger)
|
||||
converter = converter_class(convert_config)
|
||||
else:
|
||||
raise ValueError(f"不存在{convert_engin}解析引擎")
|
||||
document_md = converter.convert(self.document_original)
|
||||
@@ -59,67 +103,54 @@ class MarkdownBasedWorkflow(BaseWorkflow, HTMLExportable, MDFormatsExportable):
|
||||
md_based_convert_cacher.cache_result(document_md, self.document_original, convert_engin, convert_config)
|
||||
return document_md
|
||||
|
||||
@overload
|
||||
def translate(self, convert_engin: None,
|
||||
convert_config: x2md_convert_config_type | None, translate_config: MDTranslateConfig) -> Self:
|
||||
...
|
||||
|
||||
@overload
|
||||
def translate(self, convert_engin: Literal["docling"],
|
||||
convert_config: "ConverterDoclingConfig", translate_config: MDTranslateConfig) -> Self:
|
||||
...
|
||||
|
||||
@overload
|
||||
def translate(self, convert_engin: Literal["mineru"],
|
||||
convert_config: ConverterMineruConfig, translate_config: MDTranslateConfig) -> Self:
|
||||
...
|
||||
|
||||
def translate(self, convert_engin: convert_engin_type | None,
|
||||
convert_config: x2md_convert_config_type | None,
|
||||
translate_config: MDTranslateConfig) -> Self:
|
||||
document_md = self._get_document_md(convert_engin, convert_config)
|
||||
def translate(self) -> Self:
|
||||
convert_engin,convert_config=self.convert_engine,self.converter_config
|
||||
translator_config=self.translator_config
|
||||
document_md = self._get_document_md(convert_engin,convert_config)
|
||||
# 翻译解析后文件
|
||||
translator = MDTranslator(translate_config)
|
||||
translator = MDTranslator(translator_config)
|
||||
translator.translate(document_md)
|
||||
self.document_translated = document_md
|
||||
return self
|
||||
|
||||
async def translate_async(self, convert_engin: Literal["mineru", "docling"] | None,
|
||||
convert_config: x2md_convert_config_type | None,
|
||||
translate_config: MDTranslateConfig) -> Self:
|
||||
|
||||
async def translate_async(self) -> Self:
|
||||
convert_engin,convert_config=self.convert_engine,self.converter_config
|
||||
translator_config=self.translator_config
|
||||
document_md = await asyncio.to_thread(self._get_document_md, convert_engin, convert_config)
|
||||
# 翻译解析后文件
|
||||
translator = MDTranslator(translate_config)
|
||||
translator = MDTranslator(translator_config)
|
||||
await translator.translate_async(document_md)
|
||||
self.document_translated = document_md
|
||||
return self
|
||||
|
||||
def export_to_html(self, export_config: MD2HTMLExportConfig | None = None) -> str:
|
||||
def export_to_html(self, export_config: MD2HTMLExporterConfig | None = None) -> str:
    """Export through ``MD2HTMLExporter`` and return the decoded content.

    Args:
        export_config: Exporter settings; when falsy, the instance's
            ``md2html_exporter_config`` is used instead.

    Returns:
        The exported document's content, decoded to ``str``.
    """
    effective_config = export_config if export_config else self.md2html_exporter_config
    exporter = MD2HTMLExporter(effective_config)
    exported = self._export(exporter)
    return exported.content.decode()
|
||||
|
||||
def export_to_markdown(self, export_config: MD2MDExportConfig | None = None) -> str:
|
||||
def export_to_markdown(self, export_config: X2MarkdownConverterConfig | None = None) -> str:
    """Export through ``MD2MDExporter`` and return the decoded content.

    Args:
        export_config: Accepted but not used by this method.
            NOTE(review): annotated with a converter config type, which
            looks unintended for an exporter option — confirm.

    Returns:
        The exported document's content, decoded to ``str``.
    """
    exporter = MD2MDExporter()
    exported = self._export(exporter)
    return exported.content.decode()
|
||||
|
||||
def export_to_markdown_zip(self, export_config: MD2MDZIPExportConfig | None = None) -> bytes:
|
||||
def export_to_markdown_zip(self, export_config: X2MarkdownConverterConfig | None = None) -> bytes:
    """Export through ``MD2MDZipExporter`` and return the raw content.

    Args:
        export_config: Accepted but not used by this method.
            NOTE(review): annotated with a converter config type, which
            looks unintended for an exporter option — confirm.

    Returns:
        The exported document's content as ``bytes``.
    """
    exporter = MD2MDZipExporter()
    exported = self._export(exporter)
    return exported.content
|
||||
|
||||
def save_as_html(self, name: str = None, output_dir: Path | str = "./output",
|
||||
export_config: MD2HTMLExportConfig | None = None) -> Self:
|
||||
self._save(exporter=MD2HTMLExporter(), name=name, output_dir=output_dir)
|
||||
export_config: MD2HTMLExporterConfig | None = None) -> Self:
|
||||
export_config = export_config or self.md2html_exporter_config
|
||||
self._save(exporter=MD2HTMLExporter(config=export_config), name=name, output_dir=output_dir)
|
||||
return self
|
||||
|
||||
def save_as_markdown(self, name: str = None, output_dir: Path | str = "./output",
|
||||
export_config: MD2MDExportConfig | None = None) -> Self:
|
||||
export_config=None) -> Self:
|
||||
|
||||
self._save(exporter=MD2MDExporter(), name=name, output_dir=output_dir)
|
||||
return self
|
||||
|
||||
def save_as_markdown_zip(self, name: str = None, output_dir: Path | str = "./output",
|
||||
export_config: MD2MDZIPExportConfig | None = None) -> Self:
|
||||
export_config=None) -> Self:
|
||||
|
||||
self._save(exporter=MD2MDZipExporter(), name=name, output_dir=output_dir)
|
||||
return self
|
||||
|
||||
@@ -1,17 +1,17 @@
|
||||
from pathlib import Path
|
||||
from typing import Self
|
||||
|
||||
from docutranslate.exporter.txt2x.txt2html_exporter import TXT2HTMLExportConfig, TXT2HTMLExporter
|
||||
from docutranslate.exporter.txt2x.txt2txt_exporter import TXT2TXTExporter
|
||||
from docutranslate.workflow.base_workflow import BaseWorkflow
|
||||
from docutranslate.exporter.txt.txt2html_exporter import TXT2HTMLExporterConfig, TXT2HTMLExporter
|
||||
from docutranslate.exporter.txt.txt2txt_exporter import TXT2TXTExporter
|
||||
from docutranslate.workflow.base import Workflow
|
||||
from docutranslate.workflow.interfaces import HTMLExportable, TXTExportable
|
||||
from docutranslate.translater.txt_translator import TXTTranslateConfig, TXTTranslator
|
||||
from docutranslate.translator.ai_translator.txt_translator import TXTTranslatorConfig, TXTTranslator
|
||||
|
||||
|
||||
|
||||
class TXTWorkflow(BaseWorkflow, HTMLExportable, TXTExportable):
|
||||
class TXTWorkflow(Workflow, HTMLExportable, TXTExportable):
|
||||
|
||||
def translate(self, translate_config: TXTTranslateConfig) -> Self:
|
||||
def translate(self, translate_config: TXTTranslatorConfig) -> Self:
|
||||
document = self.document_original.copy()
|
||||
# 翻译解析后文件
|
||||
translator = TXTTranslator(translate_config)
|
||||
@@ -19,7 +19,7 @@ class TXTWorkflow(BaseWorkflow, HTMLExportable, TXTExportable):
|
||||
self.document_translated = document
|
||||
return self
|
||||
|
||||
async def translate_async(self, translate_config: TXTTranslateConfig) -> Self:
|
||||
async def translate_async(self, translate_config: TXTTranslatorConfig) -> Self:
|
||||
document = self.document_original.copy()
|
||||
# 翻译解析后文件
|
||||
translator = TXTTranslator(translate_config)
|
||||
@@ -27,7 +27,7 @@ class TXTWorkflow(BaseWorkflow, HTMLExportable, TXTExportable):
|
||||
self.document_translated = document
|
||||
return self
|
||||
|
||||
def export_to_html(self, export_config: TXT2HTMLExportConfig=None) -> str:
|
||||
def export_to_html(self, export_config: TXT2HTMLExporterConfig=None) -> str:
|
||||
docu = self._export(TXT2HTMLExporter(export_config))
|
||||
return docu.content.decode()
|
||||
|
||||
@@ -36,7 +36,7 @@ class TXTWorkflow(BaseWorkflow, HTMLExportable, TXTExportable):
|
||||
return docu.content.decode()
|
||||
|
||||
def save_as_html(self, name: str = None, output_dir: Path | str = "./output",
|
||||
export_config: TXT2HTMLExportConfig | None = None) -> Self:
|
||||
export_config: TXT2HTMLExporterConfig | None = None) -> Self:
|
||||
self._save(exporter=TXT2HTMLExporter(export_config), name=name, output_dir=output_dir)
|
||||
return self
|
||||
|
||||
|
||||
Reference in New Issue
Block a user