manager改名workflow

This commit is contained in:
xunbu
2025-07-29 23:35:13 +08:00
parent b7e9c16c9f
commit 236640c177
7 changed files with 73 additions and 73 deletions

View File

View File

@@ -0,0 +1,50 @@
from abc import ABC, abstractmethod
from logging import Logger
from pathlib import Path
from typing import Self, Generic, TypeVar
from docutranslate.exporter.interfaces import Exporter
from docutranslate.ir.document import Document
from docutranslate.logger import global_logger
# Type variable for the translated-document type a concrete workflow produces;
# bound to Document, so only Document or one of its subclasses is accepted.
T_Translated = TypeVar('T_Translated', bound=Document)
class BaseWorkflow(ABC, Generic[T_Translated]):
    """Abstract base class for document translation workflows.

    A workflow loads an original document, translates it (subclasses provide
    ``translate`` / ``translate_async``) and offers helpers to export or save
    the translated document through an ``Exporter``. Methods that change the
    workflow's state return ``self`` so calls can be chained.
    """

    def __init__(self, logger: Logger = global_logger):
        """Create a workflow with no document loaded.

        Args:
            logger: Logger used for status messages; defaults to the
                package-wide ``global_logger``.
        """
        self.logger = logger
        # Filled in by read_path() / read_bytes().
        self.document_original: Document | None = None
        # Must be non-None before _export() / _save() can be used.
        self.document_translated: T_Translated | None = None

    def read_path(self, path: Path | str) -> Self:
        """Load the original document from ``path`` and return ``self``."""
        self.document_original = Document.from_path(path)
        return self

    def read_bytes(self, content: bytes, stem: str, suffix: str) -> Self:
        """Load the original document from in-memory bytes and return ``self``.

        ``content``, ``stem`` and ``suffix`` are forwarded unchanged to
        ``Document.from_bytes``.
        """
        self.document_original = Document.from_bytes(content=content, stem=stem, suffix=suffix)
        return self

    @abstractmethod
    def translate(self, *args, **kwargs) -> Self:
        """Translate the loaded document; implemented by subclasses."""
        ...

    @abstractmethod
    async def translate_async(self, *args, **kwargs) -> Self:
        """Asynchronous counterpart of ``translate``; implemented by subclasses."""
        ...

    def _export(self, exporter: Exporter) -> Document:
        """Run ``exporter`` on the translated document and return its result.

        Raises:
            RuntimeError: If no translated document is available yet.
        """
        if self.document_translated is None:
            raise RuntimeError("Document has not been translated yet. Call translate() first.")
        return exporter.export(self.document_translated)

    def _save(self, exporter: Exporter, name: str | None = None,
              output_dir: Path | str = "./output") -> Self:
        """Export the translated document and write it to disk.

        Args:
            exporter: Exporter that produces the document to write.
            name: File name to write; when ``None`` or empty, the exported
                document's own ``name`` is used.
            output_dir: Directory to write into.

        Returns:
            ``self``, to allow chaining.

        Raises:
            RuntimeError: If no translated document is available yet.
        """
        docu = self._export(exporter)
        name = name or docu.name
        output_path = Path(output_dir) / Path(name)
        # ``name`` may contain sub-directories, so create the file's own parent
        # rather than just ``output_dir``.
        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_bytes(docu.content)
        self.logger.info(f"文件已保存到{output_path.resolve()}")
        return self

View File

@@ -0,0 +1,44 @@
from pathlib import Path
from typing import Protocol, Self, TypeVar, runtime_checkable
from docutranslate.exporter.export_config import ExportConfig
# Type variable for the export-configuration type used by the protocols below;
# bound to ExportConfig, so only ExportConfig or one of its subclasses is accepted.
T = TypeVar("T", bound=ExportConfig)
@runtime_checkable
class HTMLExportable(Protocol[T]):
    """Structural interface for objects that can produce HTML output.

    ``T`` is the export-configuration type the implementation accepts.
    """

    def export_to_html(self, export_config: T | None = None) -> str:
        """Return the HTML output as a string."""

    def save_as_html(
        self,
        name: str,
        output_dir: Path | str,
        export_config: T | None = None,
    ) -> Self:
        """Save the HTML output using ``name`` and ``output_dir``; return the object itself."""
@runtime_checkable
class MDExportable(Protocol[T]):
    """Structural interface for objects that can produce Markdown output.

    ``T`` is the export-configuration type the implementation accepts.
    """

    def export_to_markdown(self, export_config: T | None = None) -> str:
        """Return the Markdown output as a string."""

    def save_as_markdown(
        self,
        name: str,
        output_dir: Path | str,
        export_config: T | None = None,
    ) -> Self:
        """Save the Markdown output using ``name`` and ``output_dir``; return the object itself."""
@runtime_checkable
class MDZIPExportable(Protocol[T]):
    """Structural interface for objects that can produce a Markdown zip archive.

    ``T`` is the export-configuration type the implementation accepts.
    """

    def export_to_markdown_zip(self, export_config: T | None = None) -> bytes:
        """Return the Markdown zip output as raw bytes."""

    def save_as_markdown_zip(
        self,
        name: str,
        output_dir: Path | str,
        export_config: T | None = None,
    ) -> Self:
        """Save the Markdown zip output using ``name`` and ``output_dir``; return the object itself."""
@runtime_checkable
class MDFormatsExportable(MDZIPExportable[T], MDExportable[T], Protocol):
    """Combined protocol: requires both the Markdown-zip and the Markdown export members."""
@runtime_checkable
class TXTExportable(Protocol[T]):
    """Structural interface for objects that can produce plain-text output.

    The protocol is parameterised by ``T`` like its siblings, although none of
    its members take an export configuration.
    """

    def export_to_txt(self) -> str:
        """Return the plain-text output as a string."""

    def save_as_txt(
        self,
        name: str,
        output_dir: Path | str,
    ) -> Self:
        """Save the plain-text output using ``name`` and ``output_dir``; return the object itself."""

View File

@@ -0,0 +1,125 @@
import asyncio
from pathlib import Path
from typing import Self, Literal, overload, TYPE_CHECKING
from docutranslate.cacher import md_based_convert_cacher
from docutranslate.global_values.conditional_import import DOCLING_EXIST
if DOCLING_EXIST or TYPE_CHECKING:
from docutranslate.converter.x2md.converter_docling import ConverterDoclingConfig, ConverterDocling
from docutranslate.converter.x2md.converter_identity import ConverterIdentity
from docutranslate.converter.x2md.converter_mineru import ConverterMineruConfig, ConverterMineru
from docutranslate.converter.x2md.interfaces import X2MarkdownConverter
from docutranslate.exporter.md2x.md2html_exporter import MD2HTMLExportConfig, MD2HTMLExporter
from docutranslate.exporter.md2x.md2md_exporter import MD2MDExportConfig, MD2MDExporter
from docutranslate.exporter.md2x.md2mdzip_exporter import MD2MDZIPExportConfig, MD2MDZipExporter
from docutranslate.exporter.md2x.types import x2md_convert_config_type, convert_engin_type
from docutranslate.workflow.base_workflow import BaseWorkflow
from docutranslate.workflow.interfaces import MDFormatsExportable, HTMLExportable
from docutranslate.translater.md_translator import MDTranslateConfig, MDTranslator
class MarkdownBasedWorkflow(BaseWorkflow, HTMLExportable, MDFormatsExportable):
    """Workflow that converts a document to Markdown, translates it and exports it.

    The original document is first turned into a Markdown document by a
    converter selected through ``convert_engin`` (conversion results go
    through ``md_based_convert_cacher``), then translated with
    ``MDTranslator``. The translated document can be exported or saved as
    HTML, Markdown or a Markdown zip archive.
    """

    def __init__(self, *args, **kwargs):
        """Initialise the base workflow and register the available converters."""
        super().__init__(*args, **kwargs)
        # Engine name -> (converter class, config class that converter requires).
        self._converter_factory: dict[
            str, tuple[type[X2MarkdownConverter], type[x2md_convert_config_type]]
        ] = {
            "mineru": (ConverterMineru, ConverterMineruConfig),
        }
        # The docling classes are imported under this same condition, so they
        # are only referenced when that import ran. TYPE_CHECKING is False at
        # runtime, so the check reduces to DOCLING_EXIST.
        if DOCLING_EXIST or TYPE_CHECKING:
            self._converter_factory["docling"] = (ConverterDocling, ConverterDoclingConfig)

    def _get_document_md(self, convert_engin: convert_engin_type | None,
                         convert_config: x2md_convert_config_type | None):
        """Return the Markdown form of the original document, using the cache when possible.

        Args:
            convert_engin: Name of the conversion engine, or ``None`` to pass
                the document through ``ConverterIdentity``.
            convert_config: Configuration for the chosen engine; must be an
                instance of the engine's registered config class.

        Raises:
            RuntimeError: If no document has been read yet.
            TypeError: If ``convert_config`` does not match the engine.
            ValueError: If ``convert_engin`` is not a registered engine.
        """
        if self.document_original is None:
            raise RuntimeError("file has not been read yet. Call read_path or read_bytes first.")

        # Look for a previously converted document first.
        document_cached = md_based_convert_cacher.get_cached_result(self.document_original, convert_engin,
                                                                    convert_config)
        if document_cached:
            return document_cached

        # No cache hit: choose a converter. Files that are already Markdown
        # (or requests without an engine) are passed through unchanged.
        if convert_engin is None or self.document_original.suffix == ".md":
            converter = ConverterIdentity()
        elif convert_engin in self._converter_factory:
            converter_class, config_class = self._converter_factory[convert_engin]
            if not isinstance(convert_config, config_class):
                raise TypeError(
                    f"未传入正确的convert_config应传入{config_class.__name__}类型,现为{type(convert_config).__name__}类型")
            converter = converter_class(convert_config, logger=self.logger)
        else:
            raise ValueError(f"不存在{convert_engin}解析引擎")

        document_md = converter.convert(self.document_original)
        # Store the conversion result for later calls.
        md_based_convert_cacher.cache_result(document_md, self.document_original, convert_engin, convert_config)
        return document_md

    @overload
    def translate(self, convert_engin: None,
                  convert_config: x2md_convert_config_type | None, translate_config: MDTranslateConfig) -> Self:
        ...

    @overload
    def translate(self, convert_engin: Literal["docling"],
                  convert_config: "ConverterDoclingConfig", translate_config: MDTranslateConfig) -> Self:
        ...

    @overload
    def translate(self, convert_engin: Literal["mineru"],
                  convert_config: ConverterMineruConfig, translate_config: MDTranslateConfig) -> Self:
        ...

    def translate(self, convert_engin: convert_engin_type | None,
                  convert_config: x2md_convert_config_type | None,
                  translate_config: MDTranslateConfig) -> Self:
        """Convert the original document to Markdown and translate it.

        Args:
            convert_engin: Conversion engine name, or ``None`` for no conversion.
            convert_config: Configuration matching ``convert_engin``.
            translate_config: Configuration for ``MDTranslator``.

        Returns:
            ``self``, with ``document_translated`` set.
        """
        document_md = self._get_document_md(convert_engin, convert_config)
        # NOTE(review): the translator's return value is not used, so it appears
        # to modify ``document_md`` in place. If the cacher hands back the same
        # object it stores, the cached conversion would be modified as well --
        # confirm the cacher returns/stores copies.
        translator = MDTranslator(translate_config)
        translator.translate(document_md)
        self.document_translated = document_md
        return self

    async def translate_async(self, convert_engin: Literal["mineru", "docling"] | None,
                              convert_config: x2md_convert_config_type | None,
                              translate_config: MDTranslateConfig) -> Self:
        """Asynchronous variant of ``translate``.

        The conversion step is synchronous, so it is run in a worker thread
        via ``asyncio.to_thread``; the translation itself is awaited.
        """
        document_md = await asyncio.to_thread(self._get_document_md, convert_engin, convert_config)
        translator = MDTranslator(translate_config)
        await translator.translate_async(document_md)
        self.document_translated = document_md
        return self

    def export_to_html(self, export_config: MD2HTMLExportConfig | None = None) -> str:
        """Return the translated document rendered as an HTML string."""
        docu = self._export(MD2HTMLExporter(export_config))
        return docu.content.decode()

    def export_to_markdown(self, export_config: MD2MDExportConfig | None = None) -> str:
        """Return the translated document as a Markdown string."""
        # NOTE(review): ``export_config`` is accepted but not forwarded to the
        # exporter -- confirm whether MD2MDExporter takes a config.
        docu = self._export(MD2MDExporter())
        return docu.content.decode()

    def export_to_markdown_zip(self, export_config: MD2MDZIPExportConfig | None = None) -> bytes:
        """Return the translated document as Markdown-zip bytes."""
        # NOTE(review): ``export_config`` is accepted but not forwarded to the
        # exporter -- confirm whether MD2MDZipExporter takes a config.
        docu = self._export(MD2MDZipExporter())
        return docu.content

    def save_as_html(self, name: str | None = None, output_dir: Path | str = "./output",
                     export_config: MD2HTMLExportConfig | None = None) -> Self:
        """Save the translated document as HTML and return ``self``.

        ``export_config`` is forwarded to the exporter, matching
        ``export_to_html``.
        """
        self._save(exporter=MD2HTMLExporter(export_config), name=name, output_dir=output_dir)
        return self

    def save_as_markdown(self, name: str | None = None, output_dir: Path | str = "./output",
                         export_config: MD2MDExportConfig | None = None) -> Self:
        """Save the translated document as Markdown and return ``self``."""
        # NOTE(review): ``export_config`` is currently unused here as well.
        self._save(exporter=MD2MDExporter(), name=name, output_dir=output_dir)
        return self

    def save_as_markdown_zip(self, name: str | None = None, output_dir: Path | str = "./output",
                             export_config: MD2MDZIPExportConfig | None = None) -> Self:
        """Save the translated document as a Markdown zip archive and return ``self``."""
        # NOTE(review): ``export_config`` is currently unused here as well.
        self._save(exporter=MD2MDZipExporter(), name=name, output_dir=output_dir)
        return self

View File

@@ -0,0 +1,45 @@
from pathlib import Path
from typing import Self
from docutranslate.exporter.txt2x.txt2html_exporter import TXT2HTMLExportConfig, TXT2HTMLExporter
from docutranslate.exporter.txt2x.txt2txt_exporter import TXT2TXTExporter
from docutranslate.workflow.base_workflow import BaseWorkflow
from docutranslate.workflow.interfaces import HTMLExportable, TXTExportable
from docutranslate.translater.txt_translator import TXTTranslateConfig, TXTTranslator
class TXTWorkflow(BaseWorkflow, HTMLExportable, TXTExportable):
    """Workflow that translates a plain-text document and exports it as HTML or TXT."""

    def _copy_original(self):
        """Return a copy of the loaded document so the original stays untouched.

        Raises:
            RuntimeError: If no document has been read yet.
        """
        if self.document_original is None:
            # Fail with an explicit message instead of an AttributeError on None.
            raise RuntimeError("file has not been read yet. Call read_path or read_bytes first.")
        return self.document_original.copy()

    def translate(self, translate_config: TXTTranslateConfig) -> Self:
        """Translate a copy of the original document.

        Args:
            translate_config: Configuration for ``TXTTranslator``.

        Returns:
            ``self``, with ``document_translated`` set.

        Raises:
            RuntimeError: If no document has been read yet.
        """
        document = self._copy_original()
        translator = TXTTranslator(translate_config)
        translator.translate(document)
        self.document_translated = document
        return self

    async def translate_async(self, translate_config: TXTTranslateConfig) -> Self:
        """Asynchronous variant of ``translate``.

        Raises:
            RuntimeError: If no document has been read yet.
        """
        document = self._copy_original()
        translator = TXTTranslator(translate_config)
        await translator.translate_async(document)
        self.document_translated = document
        return self

    def export_to_html(self, export_config: TXT2HTMLExportConfig | None = None) -> str:
        """Return the translated document rendered as an HTML string."""
        docu = self._export(TXT2HTMLExporter(export_config))
        return docu.content.decode()

    def export_to_txt(self) -> str:
        """Return the translated document as a plain-text string."""
        docu = self._export(TXT2TXTExporter())
        return docu.content.decode()

    def save_as_html(self, name: str | None = None, output_dir: Path | str = "./output",
                     export_config: TXT2HTMLExportConfig | None = None) -> Self:
        """Save the translated document as HTML and return ``self``."""
        self._save(exporter=TXT2HTMLExporter(export_config), name=name, output_dir=output_dir)
        return self

    def save_as_txt(self, name: str | None = None, output_dir: Path | str = "./output") -> Self:
        """Save the translated document as plain text and return ``self``."""
        self._save(exporter=TXT2TXTExporter(), name=name, output_dir=output_dir)
        return self