完善mdbasedworkflow

This commit is contained in:
xunbu
2025-07-31 09:41:46 +08:00
parent b484ba60bc
commit 10d528afbb
4 changed files with 75 additions and 63 deletions

View File

@@ -1,3 +1,3 @@
from typing import Literal from typing import Literal
ConvertEnginType = Literal["mineru", "docling","identity"] ConvertEngineType = Literal["mineru", "docling", "identity"]

View File

@@ -5,6 +5,7 @@ from docutranslate.exporter.base import ExporterConfig
T_ExporterConfig = TypeVar("T_ExporterConfig", bound=ExporterConfig) T_ExporterConfig = TypeVar("T_ExporterConfig", bound=ExporterConfig)
@runtime_checkable @runtime_checkable
class HTMLExportable(Protocol[T_ExporterConfig]): class HTMLExportable(Protocol[T_ExporterConfig]):
def export_to_html(self, config: T_ExporterConfig | None = None) -> str: def export_to_html(self, config: T_ExporterConfig | None = None) -> str:
@@ -13,6 +14,7 @@ class HTMLExportable(Protocol[T_ExporterConfig]):
def save_as_html(self, name: str, output_dir: Path | str, config: T_ExporterConfig | None = None) -> Self: def save_as_html(self, name: str, output_dir: Path | str, config: T_ExporterConfig | None = None) -> Self:
... ...
@runtime_checkable @runtime_checkable
class MDExportable(Protocol[T_ExporterConfig]): class MDExportable(Protocol[T_ExporterConfig]):
@@ -22,6 +24,7 @@ class MDExportable(Protocol[T_ExporterConfig]):
def save_as_markdown(self, name: str, output_dir: Path | str, config: T_ExporterConfig | None = None) -> Self: def save_as_markdown(self, name: str, output_dir: Path | str, config: T_ExporterConfig | None = None) -> Self:
... ...
@runtime_checkable @runtime_checkable
class MDZIPExportable(Protocol[T_ExporterConfig]): class MDZIPExportable(Protocol[T_ExporterConfig]):
@@ -31,14 +34,16 @@ class MDZIPExportable(Protocol[T_ExporterConfig]):
def save_as_markdown_zip(self, name: str, output_dir: Path | str, config: T_ExporterConfig | None = None) -> Self: def save_as_markdown_zip(self, name: str, output_dir: Path | str, config: T_ExporterConfig | None = None) -> Self:
... ...
@runtime_checkable @runtime_checkable
class MDFormatsExportable(MDZIPExportable[T_ExporterConfig], MDExportable[T_ExporterConfig]): class MDFormatsExportable(MDZIPExportable[T_ExporterConfig], MDExportable[T_ExporterConfig]):
... ...
@runtime_checkable @runtime_checkable
class TXTExportable(Protocol[T_ExporterConfig]): class TXTExportable(Protocol[T_ExporterConfig]):
def export_to_txt(self) -> str: def export_to_txt(self, config: T_ExporterConfig | None = None) -> str:
... ...
def save_as_txt(self, name: str, output_dir: Path | str) -> Self: def save_as_txt(self, name: str, output_dir: Path | str, config: T_ExporterConfig | None = None) -> Self:
... ...

View File

@@ -17,7 +17,7 @@ from docutranslate.converter.x2md.base import X2MarkdownConverterConfig, X2Markd
from docutranslate.exporter.md.md2html_exporter import MD2HTMLExporterConfig, MD2HTMLExporter from docutranslate.exporter.md.md2html_exporter import MD2HTMLExporterConfig, MD2HTMLExporter
from docutranslate.exporter.md.md2md_exporter import MD2MDExporter from docutranslate.exporter.md.md2md_exporter import MD2MDExporter
from docutranslate.exporter.md.md2mdzip_exporter import MD2MDZipExporter from docutranslate.exporter.md.md2mdzip_exporter import MD2MDZipExporter
from docutranslate.exporter.md.types import ConvertEnginType from docutranslate.exporter.md.types import ConvertEngineType
from docutranslate.workflow.base import Workflow, WorkflowConfig from docutranslate.workflow.base import Workflow, WorkflowConfig
from docutranslate.workflow.interfaces import MDFormatsExportable, HTMLExportable from docutranslate.workflow.interfaces import MDFormatsExportable, HTMLExportable
from docutranslate.translator.ai_translator.md_translator import MDTranslatorConfig, MDTranslator from docutranslate.translator.ai_translator.md_translator import MDTranslatorConfig, MDTranslator
@@ -25,78 +25,80 @@ from docutranslate.translator.ai_translator.md_translator import MDTranslatorCon
@dataclass(kw_only=True) @dataclass(kw_only=True)
class MarkdownBasedWorkflowConfig(WorkflowConfig): class MarkdownBasedWorkflowConfig(WorkflowConfig):
convert_engine: ConvertEnginType convert_engine: ConvertEngineType
converter_config: X2MarkdownConverterConfig | None converter_config: X2MarkdownConverterConfig | None
translator_config: MDTranslatorConfig translator_config: MDTranslatorConfig
html_exporter_config: MD2HTMLExporterConfig html_exporter_config: MD2HTMLExporterConfig
class MarkdownBasedWorkflow(Workflow[MarkdownBasedWorkflowConfig, Document, MarkdownDocument], HTMLExportable, class MarkdownBasedWorkflow(Workflow[MarkdownBasedWorkflowConfig, Document, MarkdownDocument],
MDFormatsExportable): HTMLExportable[MD2HTMLExporterConfig],
MDFormatsExportable[ExporterConfig]):
_converter_factory: dict[
ConvertEngineType, Tuple[Type[X2MarkdownConverter], Type[X2MarkdownConverterConfig]] | None] = {
"mineru": (ConverterMineru, ConverterMineruConfig),
"identity": (ConverterIdentity, None)
}
if DOCLING_EXIST:
_converter_factory["docling"] = (ConverterDocling, ConverterDoclingConfig)
def __init__(self, config: MarkdownBasedWorkflowConfig): def __init__(self, config: MarkdownBasedWorkflowConfig):
super().__init__(config=config) super().__init__(config=config)
self._converter_factory: dict[
ConvertEnginType, Tuple[Type[X2MarkdownConverter], Type[X2MarkdownConverterConfig]] | None] = {
"mineru": (ConverterMineru, ConverterMineruConfig),
"identity": (ConverterIdentity, None)
}
if DOCLING_EXIST:
self._converter_factory["docling"] = (ConverterDocling, ConverterDoclingConfig)
self.convert_engine = config.convert_engine self.convert_engine = config.convert_engine
if config.logger: if config.logger:
for sub_config in [self.config.converter_config, self.config.translator_config, self.config.html_exporter_config]: for sub_config in [self.config.converter_config, self.config.translator_config,
if sub_config and sub_config.logger is not None: self.config.html_exporter_config]:
if sub_config:
sub_config.logger = config.logger sub_config.logger = config.logger
def _get_document_md(self, convert_engin: ConvertEnginType, convert_config: X2MarkdownConverterConfig): def _get_document_md(self, convert_engin: ConvertEngineType, convert_config: X2MarkdownConverterConfig):
if self.document_original is None: if self.document_original is None:
raise RuntimeError("file has not been read yet. Call read_path or read_bytes first.") raise RuntimeError("file has not been read yet. Call read_path or read_bytes first.")
# 获取缓存的解析后文件 # 获取缓存的解析后文件
document_cached = md_based_convert_cacher.get_cached_result(self.document_original, convert_engin, document_cached = md_based_convert_cacher.get_cached_result(self.document_original, convert_engin,
convert_config) convert_config)
# 获取解析文件
if document_cached: if document_cached:
document_md = document_cached return document_cached
# 未缓存则解析文件
if convert_engin in self._converter_factory:
converter_class, config_class = self._converter_factory[convert_engin]
if config_class and not isinstance(convert_config, config_class):
raise TypeError(
f"未传入正确的convert_config应为{config_class.__name__}类型,现为{type(convert_config).__name__}类型")
converter = converter_class(convert_config)
else: else:
if self.document_original.suffix == ".md": raise ValueError(f"不存在{convert_engin}解析引擎")
converter = ConverterIdentity() document_md = converter.convert(self.document_original)
elif convert_engin in self._converter_factory: # 获取缓存解析后文件
converter_class, config_class = self._converter_factory[convert_engin] md_based_convert_cacher.cache_result(document_md, self.document_original, convert_engin, convert_config)
if not isinstance(convert_config, config_class):
raise TypeError(
f"未传入正确的convert_config应传入{config_class.__name__}类型,现为{type(convert_config).__name__}类型")
converter = converter_class(convert_config)
else:
raise ValueError(f"不存在{convert_engin}解析引擎")
document_md = converter.convert(self.document_original)
# 获取缓存解析后文件
md_based_convert_cacher.cache_result(document_md, self.document_original, convert_engin, convert_config)
return document_md return document_md
def translate(self) -> Self: def _pre_translate(self, document: Document):
convert_engin, convert_config = self.convert_engine, self.config.converter_config convert_engin: ConvertEngineType = "identity" if document.suffix == ".md" else self.convert_engine
convert_config = self.config.converter_config
translator_config = self.config.translator_config translator_config = self.config.translator_config
document_md = self._get_document_md(convert_engin, convert_config)
# 翻译解析后文件
translator = MDTranslator(translator_config) translator = MDTranslator(translator_config)
return convert_engin, convert_config, translator_config, translator
def translate(self) -> Self:
convert_engin, convert_config, translator_config, translator = self._pre_translate(self.document_original)
document_md = self._get_document_md(convert_engin, convert_config)
translator.translate(document_md) translator.translate(document_md)
self.document_translated = document_md self.document_translated = document_md
return self return self
async def translate_async(self) -> Self: async def translate_async(self) -> Self:
convert_engin, convert_config = self.convert_engine, self.config.converter_config convert_engin, convert_config, translator_config, translator = self._pre_translate(self.document_original)
translator_config = self.config.translator_config
document_md = await asyncio.to_thread(self._get_document_md, convert_engin, convert_config) document_md = await asyncio.to_thread(self._get_document_md, convert_engin, convert_config)
# 翻译解析后文件
translator = MDTranslator(translator_config)
await translator.translate_async(document_md) await translator.translate_async(document_md)
self.document_translated = document_md self.document_translated = document_md
return self return self
def export_to_html(self, export_config: MD2HTMLExporterConfig | None = None) -> str: def export_to_html(self, config: MD2HTMLExporterConfig | None = None) -> str:
export_config = export_config or self.config.html_exporter_config config = config or self.config.html_exporter_config
docu = self._export(MD2HTMLExporter(export_config)) docu = self._export(MD2HTMLExporter(config))
return docu.content.decode() return docu.content.decode()
def export_to_markdown(self, config: ExporterConfig | None = None) -> str: def export_to_markdown(self, config: ExporterConfig | None = None) -> str:
@@ -108,19 +110,19 @@ class MarkdownBasedWorkflow(Workflow[MarkdownBasedWorkflowConfig, Document, Mark
return docu.content return docu.content
def save_as_html(self, name: str = None, output_dir: Path | str = "./output", def save_as_html(self, name: str = None, output_dir: Path | str = "./output",
export_config: MD2HTMLExporterConfig | None = None) -> Self: config: MD2HTMLExporterConfig | None = None) -> Self:
export_config = export_config or self.config.html_exporter_config config = config or self.config.html_exporter_config
self._save(exporter=MD2HTMLExporter(config=export_config), name=name, output_dir=output_dir) self._save(exporter=MD2HTMLExporter(config=config), name=name, output_dir=output_dir)
return self return self
def save_as_markdown(self, name: str = None, output_dir: Path | str = "./output", def save_as_markdown(self, name: str = None, output_dir: Path | str = "./output",
export_config: ExporterConfig | None = None) -> Self: _: ExporterConfig | None = None) -> Self:
self._save(exporter=MD2MDExporter(), name=name, output_dir=output_dir) self._save(exporter=MD2MDExporter(), name=name, output_dir=output_dir)
return self return self
def save_as_markdown_zip(self, name: str = None, output_dir: Path | str = "./output", def save_as_markdown_zip(self, name: str = None, output_dir: Path | str = "./output",
export_config: ExporterConfig | None = None) -> Self: _: ExporterConfig | None = None) -> Self:
self._save(exporter=MD2MDZipExporter(), name=name, output_dir=output_dir) self._save(exporter=MD2MDZipExporter(), name=name, output_dir=output_dir)
return self return self

View File

@@ -2,28 +2,32 @@ from dataclasses import dataclass
from pathlib import Path from pathlib import Path
from typing import Self from typing import Self
from docutranslate.exporter.base import ExporterConfig
from docutranslate.exporter.txt.txt2html_exporter import TXT2HTMLExporterConfig, TXT2HTMLExporter from docutranslate.exporter.txt.txt2html_exporter import TXT2HTMLExporterConfig, TXT2HTMLExporter
from docutranslate.exporter.txt.txt2txt_exporter import TXT2TXTExporter from docutranslate.exporter.txt.txt2txt_exporter import TXT2TXTExporter
from docutranslate.ir.document import Document from docutranslate.ir.document import Document
from docutranslate.translator.ai_translator.txt_translator import TXTTranslatorConfig, TXTTranslator
from docutranslate.workflow.base import Workflow, WorkflowConfig from docutranslate.workflow.base import Workflow, WorkflowConfig
from docutranslate.workflow.interfaces import HTMLExportable, TXTExportable from docutranslate.workflow.interfaces import HTMLExportable, TXTExportable
from docutranslate.translator.ai_translator.txt_translator import TXTTranslatorConfig, TXTTranslator
@dataclass(kw_only=True) @dataclass(kw_only=True)
class TXTWorkflowConfig(WorkflowConfig): class TXTWorkflowConfig(WorkflowConfig):
translator_config: TXTTranslatorConfig translator_config: TXTTranslatorConfig
html_exporter_config: TXT2HTMLExporterConfig html_exporter_config: TXT2HTMLExporterConfig
class TXTWorkflow(Workflow[TXTWorkflowConfig,Document,Document], HTMLExportable, TXTExportable):
def __init__(self,config:TXTWorkflowConfig): class TXTWorkflow(Workflow[TXTWorkflowConfig, Document, Document], HTMLExportable[TXT2HTMLExporterConfig],
TXTExportable[ExporterConfig]):
def __init__(self, config: TXTWorkflowConfig):
super().__init__(config=config) super().__init__(config=config)
if config.logger: if config.logger:
for sub_config in [self.config.translator_config]: for sub_config in [self.config.translator_config]:
if sub_config and sub_config.logger is not None: if sub_config:
sub_config.logger=config.logger sub_config.logger = config.logger
def translate(self) -> Self: def translate(self) -> Self:
translate_config=self.config.translator_config translate_config = self.config.translator_config
document = self.document_original.copy() document = self.document_original.copy()
# 翻译解析后文件 # 翻译解析后文件
translator = TXTTranslator(translate_config) translator = TXTTranslator(translate_config)
@@ -32,7 +36,7 @@ class TXTWorkflow(Workflow[TXTWorkflowConfig,Document,Document], HTMLExportable,
return self return self
async def translate_async(self) -> Self: async def translate_async(self) -> Self:
translate_config=self.config.translator_config translate_config = self.config.translator_config
document = self.document_original.copy() document = self.document_original.copy()
# 翻译解析后文件 # 翻译解析后文件
translator = TXTTranslator(translate_config) translator = TXTTranslator(translate_config)
@@ -40,21 +44,22 @@ class TXTWorkflow(Workflow[TXTWorkflowConfig,Document,Document], HTMLExportable,
self.document_translated = document self.document_translated = document
return self return self
def export_to_html(self, export_config: TXT2HTMLExporterConfig=None) -> str: def export_to_html(self, config: TXT2HTMLExporterConfig = None) -> str:
export_config=export_config or self.config.html_exporter_config config = config or self.config.html_exporter_config
docu = self._export(TXT2HTMLExporter(export_config)) docu = self._export(TXT2HTMLExporter(config))
return docu.content.decode() return docu.content.decode()
def export_to_txt(self) -> str: def export_to_txt(self, _: ExporterConfig | None = None) -> str:
docu = self._export(TXT2TXTExporter()) docu = self._export(TXT2TXTExporter())
return docu.content.decode() return docu.content.decode()
def save_as_html(self, name: str = None, output_dir: Path | str = "./output", def save_as_html(self, name: str = None, output_dir: Path | str = "./output",
export_config: TXT2HTMLExporterConfig | None = None) -> Self: config: TXT2HTMLExporterConfig | None = None) -> Self:
export_config=export_config or self.config.html_exporter_config config = config or self.config.html_exporter_config
self._save(exporter=TXT2HTMLExporter(export_config), name=name, output_dir=output_dir) self._save(exporter=TXT2HTMLExporter(config), name=name, output_dir=output_dir)
return self return self
def save_as_txt(self, name: str = None, output_dir: Path | str = "./output", ) -> Self: def save_as_txt(self, name: str = None, output_dir: Path | str = "./output",
_: ExporterConfig | None = None) -> Self:
self._save(exporter=TXT2TXTExporter(), name=name, output_dir=output_dir) self._save(exporter=TXT2TXTExporter(), name=name, output_dir=output_dir)
return self return self