完善mdbasedworkflow
This commit is contained in:
@@ -1,3 +1,3 @@
|
|||||||
from typing import Literal
|
from typing import Literal
|
||||||
|
|
||||||
ConvertEnginType = Literal["mineru", "docling","identity"]
|
ConvertEngineType = Literal["mineru", "docling", "identity"]
|
||||||
@@ -5,6 +5,7 @@ from docutranslate.exporter.base import ExporterConfig
|
|||||||
|
|
||||||
T_ExporterConfig = TypeVar("T_ExporterConfig", bound=ExporterConfig)
|
T_ExporterConfig = TypeVar("T_ExporterConfig", bound=ExporterConfig)
|
||||||
|
|
||||||
|
|
||||||
@runtime_checkable
|
@runtime_checkable
|
||||||
class HTMLExportable(Protocol[T_ExporterConfig]):
|
class HTMLExportable(Protocol[T_ExporterConfig]):
|
||||||
def export_to_html(self, config: T_ExporterConfig | None = None) -> str:
|
def export_to_html(self, config: T_ExporterConfig | None = None) -> str:
|
||||||
@@ -13,6 +14,7 @@ class HTMLExportable(Protocol[T_ExporterConfig]):
|
|||||||
def save_as_html(self, name: str, output_dir: Path | str, config: T_ExporterConfig | None = None) -> Self:
|
def save_as_html(self, name: str, output_dir: Path | str, config: T_ExporterConfig | None = None) -> Self:
|
||||||
...
|
...
|
||||||
|
|
||||||
|
|
||||||
@runtime_checkable
|
@runtime_checkable
|
||||||
class MDExportable(Protocol[T_ExporterConfig]):
|
class MDExportable(Protocol[T_ExporterConfig]):
|
||||||
|
|
||||||
@@ -22,6 +24,7 @@ class MDExportable(Protocol[T_ExporterConfig]):
|
|||||||
def save_as_markdown(self, name: str, output_dir: Path | str, config: T_ExporterConfig | None = None) -> Self:
|
def save_as_markdown(self, name: str, output_dir: Path | str, config: T_ExporterConfig | None = None) -> Self:
|
||||||
...
|
...
|
||||||
|
|
||||||
|
|
||||||
@runtime_checkable
|
@runtime_checkable
|
||||||
class MDZIPExportable(Protocol[T_ExporterConfig]):
|
class MDZIPExportable(Protocol[T_ExporterConfig]):
|
||||||
|
|
||||||
@@ -31,14 +34,16 @@ class MDZIPExportable(Protocol[T_ExporterConfig]):
|
|||||||
def save_as_markdown_zip(self, name: str, output_dir: Path | str, config: T_ExporterConfig | None = None) -> Self:
|
def save_as_markdown_zip(self, name: str, output_dir: Path | str, config: T_ExporterConfig | None = None) -> Self:
|
||||||
...
|
...
|
||||||
|
|
||||||
|
|
||||||
@runtime_checkable
|
@runtime_checkable
|
||||||
class MDFormatsExportable(MDZIPExportable[T_ExporterConfig], MDExportable[T_ExporterConfig]):
|
class MDFormatsExportable(MDZIPExportable[T_ExporterConfig], MDExportable[T_ExporterConfig]):
|
||||||
...
|
...
|
||||||
|
|
||||||
|
|
||||||
@runtime_checkable
|
@runtime_checkable
|
||||||
class TXTExportable(Protocol[T_ExporterConfig]):
|
class TXTExportable(Protocol[T_ExporterConfig]):
|
||||||
def export_to_txt(self) -> str:
|
def export_to_txt(self, config: T_ExporterConfig | None = None) -> str:
|
||||||
...
|
...
|
||||||
|
|
||||||
def save_as_txt(self, name: str, output_dir: Path | str) -> Self:
|
def save_as_txt(self, name: str, output_dir: Path | str, config: T_ExporterConfig | None = None) -> Self:
|
||||||
...
|
...
|
||||||
|
|||||||
@@ -17,7 +17,7 @@ from docutranslate.converter.x2md.base import X2MarkdownConverterConfig, X2Markd
|
|||||||
from docutranslate.exporter.md.md2html_exporter import MD2HTMLExporterConfig, MD2HTMLExporter
|
from docutranslate.exporter.md.md2html_exporter import MD2HTMLExporterConfig, MD2HTMLExporter
|
||||||
from docutranslate.exporter.md.md2md_exporter import MD2MDExporter
|
from docutranslate.exporter.md.md2md_exporter import MD2MDExporter
|
||||||
from docutranslate.exporter.md.md2mdzip_exporter import MD2MDZipExporter
|
from docutranslate.exporter.md.md2mdzip_exporter import MD2MDZipExporter
|
||||||
from docutranslate.exporter.md.types import ConvertEnginType
|
from docutranslate.exporter.md.types import ConvertEngineType
|
||||||
from docutranslate.workflow.base import Workflow, WorkflowConfig
|
from docutranslate.workflow.base import Workflow, WorkflowConfig
|
||||||
from docutranslate.workflow.interfaces import MDFormatsExportable, HTMLExportable
|
from docutranslate.workflow.interfaces import MDFormatsExportable, HTMLExportable
|
||||||
from docutranslate.translator.ai_translator.md_translator import MDTranslatorConfig, MDTranslator
|
from docutranslate.translator.ai_translator.md_translator import MDTranslatorConfig, MDTranslator
|
||||||
@@ -25,47 +25,48 @@ from docutranslate.translator.ai_translator.md_translator import MDTranslatorCon
|
|||||||
|
|
||||||
@dataclass(kw_only=True)
|
@dataclass(kw_only=True)
|
||||||
class MarkdownBasedWorkflowConfig(WorkflowConfig):
|
class MarkdownBasedWorkflowConfig(WorkflowConfig):
|
||||||
convert_engine: ConvertEnginType
|
convert_engine: ConvertEngineType
|
||||||
converter_config: X2MarkdownConverterConfig | None
|
converter_config: X2MarkdownConverterConfig | None
|
||||||
translator_config: MDTranslatorConfig
|
translator_config: MDTranslatorConfig
|
||||||
html_exporter_config: MD2HTMLExporterConfig
|
html_exporter_config: MD2HTMLExporterConfig
|
||||||
|
|
||||||
|
|
||||||
class MarkdownBasedWorkflow(Workflow[MarkdownBasedWorkflowConfig, Document, MarkdownDocument], HTMLExportable,
|
class MarkdownBasedWorkflow(Workflow[MarkdownBasedWorkflowConfig, Document, MarkdownDocument],
|
||||||
MDFormatsExportable):
|
HTMLExportable[MD2HTMLExporterConfig],
|
||||||
def __init__(self, config: MarkdownBasedWorkflowConfig):
|
MDFormatsExportable[ExporterConfig]):
|
||||||
super().__init__(config=config)
|
_converter_factory: dict[
|
||||||
self._converter_factory: dict[
|
ConvertEngineType, Tuple[Type[X2MarkdownConverter], Type[X2MarkdownConverterConfig]] | None] = {
|
||||||
ConvertEnginType, Tuple[Type[X2MarkdownConverter], Type[X2MarkdownConverterConfig]] | None] = {
|
|
||||||
"mineru": (ConverterMineru, ConverterMineruConfig),
|
"mineru": (ConverterMineru, ConverterMineruConfig),
|
||||||
"identity": (ConverterIdentity, None)
|
"identity": (ConverterIdentity, None)
|
||||||
}
|
}
|
||||||
if DOCLING_EXIST:
|
if DOCLING_EXIST:
|
||||||
self._converter_factory["docling"] = (ConverterDocling, ConverterDoclingConfig)
|
_converter_factory["docling"] = (ConverterDocling, ConverterDoclingConfig)
|
||||||
|
|
||||||
|
def __init__(self, config: MarkdownBasedWorkflowConfig):
|
||||||
|
super().__init__(config=config)
|
||||||
self.convert_engine = config.convert_engine
|
self.convert_engine = config.convert_engine
|
||||||
if config.logger:
|
if config.logger:
|
||||||
for sub_config in [self.config.converter_config, self.config.translator_config, self.config.html_exporter_config]:
|
for sub_config in [self.config.converter_config, self.config.translator_config,
|
||||||
if sub_config and sub_config.logger is not None:
|
self.config.html_exporter_config]:
|
||||||
|
if sub_config:
|
||||||
sub_config.logger = config.logger
|
sub_config.logger = config.logger
|
||||||
|
|
||||||
def _get_document_md(self, convert_engin: ConvertEnginType, convert_config: X2MarkdownConverterConfig):
|
def _get_document_md(self, convert_engin: ConvertEngineType, convert_config: X2MarkdownConverterConfig):
|
||||||
if self.document_original is None:
|
if self.document_original is None:
|
||||||
raise RuntimeError("file has not been read yet. Call read_path or read_bytes first.")
|
raise RuntimeError("file has not been read yet. Call read_path or read_bytes first.")
|
||||||
|
|
||||||
# 获取缓存的解析后文件
|
# 获取缓存的解析后文件
|
||||||
document_cached = md_based_convert_cacher.get_cached_result(self.document_original, convert_engin,
|
document_cached = md_based_convert_cacher.get_cached_result(self.document_original, convert_engin,
|
||||||
convert_config)
|
convert_config)
|
||||||
# 获取解析文件
|
|
||||||
if document_cached:
|
if document_cached:
|
||||||
document_md = document_cached
|
return document_cached
|
||||||
else:
|
|
||||||
if self.document_original.suffix == ".md":
|
# 未缓存则解析文件
|
||||||
converter = ConverterIdentity()
|
if convert_engin in self._converter_factory:
|
||||||
elif convert_engin in self._converter_factory:
|
|
||||||
converter_class, config_class = self._converter_factory[convert_engin]
|
converter_class, config_class = self._converter_factory[convert_engin]
|
||||||
if not isinstance(convert_config, config_class):
|
if config_class and not isinstance(convert_config, config_class):
|
||||||
raise TypeError(
|
raise TypeError(
|
||||||
f"未传入正确的convert_config,应传入{config_class.__name__}类型,现为{type(convert_config).__name__}类型")
|
f"未传入正确的convert_config,应为{config_class.__name__}类型,现为{type(convert_config).__name__}类型")
|
||||||
converter = converter_class(convert_config)
|
converter = converter_class(convert_config)
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"不存在{convert_engin}解析引擎")
|
raise ValueError(f"不存在{convert_engin}解析引擎")
|
||||||
@@ -74,29 +75,30 @@ class MarkdownBasedWorkflow(Workflow[MarkdownBasedWorkflowConfig, Document, Mark
|
|||||||
md_based_convert_cacher.cache_result(document_md, self.document_original, convert_engin, convert_config)
|
md_based_convert_cacher.cache_result(document_md, self.document_original, convert_engin, convert_config)
|
||||||
return document_md
|
return document_md
|
||||||
|
|
||||||
def translate(self) -> Self:
|
def _pre_translate(self, document: Document):
|
||||||
convert_engin, convert_config = self.convert_engine, self.config.converter_config
|
convert_engin: ConvertEngineType = "identity" if document.suffix == ".md" else self.convert_engine
|
||||||
|
convert_config = self.config.converter_config
|
||||||
translator_config = self.config.translator_config
|
translator_config = self.config.translator_config
|
||||||
document_md = self._get_document_md(convert_engin, convert_config)
|
|
||||||
# 翻译解析后文件
|
|
||||||
translator = MDTranslator(translator_config)
|
translator = MDTranslator(translator_config)
|
||||||
|
return convert_engin, convert_config, translator_config, translator
|
||||||
|
|
||||||
|
def translate(self) -> Self:
|
||||||
|
convert_engin, convert_config, translator_config, translator = self._pre_translate(self.document_original)
|
||||||
|
document_md = self._get_document_md(convert_engin, convert_config)
|
||||||
translator.translate(document_md)
|
translator.translate(document_md)
|
||||||
self.document_translated = document_md
|
self.document_translated = document_md
|
||||||
return self
|
return self
|
||||||
|
|
||||||
async def translate_async(self) -> Self:
|
async def translate_async(self) -> Self:
|
||||||
convert_engin, convert_config = self.convert_engine, self.config.converter_config
|
convert_engin, convert_config, translator_config, translator = self._pre_translate(self.document_original)
|
||||||
translator_config = self.config.translator_config
|
|
||||||
document_md = await asyncio.to_thread(self._get_document_md, convert_engin, convert_config)
|
document_md = await asyncio.to_thread(self._get_document_md, convert_engin, convert_config)
|
||||||
# 翻译解析后文件
|
|
||||||
translator = MDTranslator(translator_config)
|
|
||||||
await translator.translate_async(document_md)
|
await translator.translate_async(document_md)
|
||||||
self.document_translated = document_md
|
self.document_translated = document_md
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def export_to_html(self, export_config: MD2HTMLExporterConfig | None = None) -> str:
|
def export_to_html(self, config: MD2HTMLExporterConfig | None = None) -> str:
|
||||||
export_config = export_config or self.config.html_exporter_config
|
config = config or self.config.html_exporter_config
|
||||||
docu = self._export(MD2HTMLExporter(export_config))
|
docu = self._export(MD2HTMLExporter(config))
|
||||||
return docu.content.decode()
|
return docu.content.decode()
|
||||||
|
|
||||||
def export_to_markdown(self, config: ExporterConfig | None = None) -> str:
|
def export_to_markdown(self, config: ExporterConfig | None = None) -> str:
|
||||||
@@ -108,19 +110,19 @@ class MarkdownBasedWorkflow(Workflow[MarkdownBasedWorkflowConfig, Document, Mark
|
|||||||
return docu.content
|
return docu.content
|
||||||
|
|
||||||
def save_as_html(self, name: str = None, output_dir: Path | str = "./output",
|
def save_as_html(self, name: str = None, output_dir: Path | str = "./output",
|
||||||
export_config: MD2HTMLExporterConfig | None = None) -> Self:
|
config: MD2HTMLExporterConfig | None = None) -> Self:
|
||||||
export_config = export_config or self.config.html_exporter_config
|
config = config or self.config.html_exporter_config
|
||||||
self._save(exporter=MD2HTMLExporter(config=export_config), name=name, output_dir=output_dir)
|
self._save(exporter=MD2HTMLExporter(config=config), name=name, output_dir=output_dir)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def save_as_markdown(self, name: str = None, output_dir: Path | str = "./output",
|
def save_as_markdown(self, name: str = None, output_dir: Path | str = "./output",
|
||||||
export_config: ExporterConfig | None = None) -> Self:
|
_: ExporterConfig | None = None) -> Self:
|
||||||
|
|
||||||
self._save(exporter=MD2MDExporter(), name=name, output_dir=output_dir)
|
self._save(exporter=MD2MDExporter(), name=name, output_dir=output_dir)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def save_as_markdown_zip(self, name: str = None, output_dir: Path | str = "./output",
|
def save_as_markdown_zip(self, name: str = None, output_dir: Path | str = "./output",
|
||||||
export_config: ExporterConfig | None = None) -> Self:
|
_: ExporterConfig | None = None) -> Self:
|
||||||
|
|
||||||
self._save(exporter=MD2MDZipExporter(), name=name, output_dir=output_dir)
|
self._save(exporter=MD2MDZipExporter(), name=name, output_dir=output_dir)
|
||||||
return self
|
return self
|
||||||
|
|||||||
@@ -2,28 +2,32 @@ from dataclasses import dataclass
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Self
|
from typing import Self
|
||||||
|
|
||||||
|
from docutranslate.exporter.base import ExporterConfig
|
||||||
from docutranslate.exporter.txt.txt2html_exporter import TXT2HTMLExporterConfig, TXT2HTMLExporter
|
from docutranslate.exporter.txt.txt2html_exporter import TXT2HTMLExporterConfig, TXT2HTMLExporter
|
||||||
from docutranslate.exporter.txt.txt2txt_exporter import TXT2TXTExporter
|
from docutranslate.exporter.txt.txt2txt_exporter import TXT2TXTExporter
|
||||||
from docutranslate.ir.document import Document
|
from docutranslate.ir.document import Document
|
||||||
|
from docutranslate.translator.ai_translator.txt_translator import TXTTranslatorConfig, TXTTranslator
|
||||||
from docutranslate.workflow.base import Workflow, WorkflowConfig
|
from docutranslate.workflow.base import Workflow, WorkflowConfig
|
||||||
from docutranslate.workflow.interfaces import HTMLExportable, TXTExportable
|
from docutranslate.workflow.interfaces import HTMLExportable, TXTExportable
|
||||||
from docutranslate.translator.ai_translator.txt_translator import TXTTranslatorConfig, TXTTranslator
|
|
||||||
|
|
||||||
@dataclass(kw_only=True)
|
@dataclass(kw_only=True)
|
||||||
class TXTWorkflowConfig(WorkflowConfig):
|
class TXTWorkflowConfig(WorkflowConfig):
|
||||||
translator_config: TXTTranslatorConfig
|
translator_config: TXTTranslatorConfig
|
||||||
html_exporter_config: TXT2HTMLExporterConfig
|
html_exporter_config: TXT2HTMLExporterConfig
|
||||||
|
|
||||||
class TXTWorkflow(Workflow[TXTWorkflowConfig,Document,Document], HTMLExportable, TXTExportable):
|
|
||||||
def __init__(self,config:TXTWorkflowConfig):
|
class TXTWorkflow(Workflow[TXTWorkflowConfig, Document, Document], HTMLExportable[TXT2HTMLExporterConfig],
|
||||||
|
TXTExportable[ExporterConfig]):
|
||||||
|
def __init__(self, config: TXTWorkflowConfig):
|
||||||
super().__init__(config=config)
|
super().__init__(config=config)
|
||||||
if config.logger:
|
if config.logger:
|
||||||
for sub_config in [self.config.translator_config]:
|
for sub_config in [self.config.translator_config]:
|
||||||
if sub_config and sub_config.logger is not None:
|
if sub_config:
|
||||||
sub_config.logger=config.logger
|
sub_config.logger = config.logger
|
||||||
|
|
||||||
def translate(self) -> Self:
|
def translate(self) -> Self:
|
||||||
translate_config=self.config.translator_config
|
translate_config = self.config.translator_config
|
||||||
document = self.document_original.copy()
|
document = self.document_original.copy()
|
||||||
# 翻译解析后文件
|
# 翻译解析后文件
|
||||||
translator = TXTTranslator(translate_config)
|
translator = TXTTranslator(translate_config)
|
||||||
@@ -32,7 +36,7 @@ class TXTWorkflow(Workflow[TXTWorkflowConfig,Document,Document], HTMLExportable,
|
|||||||
return self
|
return self
|
||||||
|
|
||||||
async def translate_async(self) -> Self:
|
async def translate_async(self) -> Self:
|
||||||
translate_config=self.config.translator_config
|
translate_config = self.config.translator_config
|
||||||
document = self.document_original.copy()
|
document = self.document_original.copy()
|
||||||
# 翻译解析后文件
|
# 翻译解析后文件
|
||||||
translator = TXTTranslator(translate_config)
|
translator = TXTTranslator(translate_config)
|
||||||
@@ -40,21 +44,22 @@ class TXTWorkflow(Workflow[TXTWorkflowConfig,Document,Document], HTMLExportable,
|
|||||||
self.document_translated = document
|
self.document_translated = document
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def export_to_html(self, export_config: TXT2HTMLExporterConfig=None) -> str:
|
def export_to_html(self, config: TXT2HTMLExporterConfig = None) -> str:
|
||||||
export_config=export_config or self.config.html_exporter_config
|
config = config or self.config.html_exporter_config
|
||||||
docu = self._export(TXT2HTMLExporter(export_config))
|
docu = self._export(TXT2HTMLExporter(config))
|
||||||
return docu.content.decode()
|
return docu.content.decode()
|
||||||
|
|
||||||
def export_to_txt(self) -> str:
|
def export_to_txt(self, _: ExporterConfig | None = None) -> str:
|
||||||
docu = self._export(TXT2TXTExporter())
|
docu = self._export(TXT2TXTExporter())
|
||||||
return docu.content.decode()
|
return docu.content.decode()
|
||||||
|
|
||||||
def save_as_html(self, name: str = None, output_dir: Path | str = "./output",
|
def save_as_html(self, name: str = None, output_dir: Path | str = "./output",
|
||||||
export_config: TXT2HTMLExporterConfig | None = None) -> Self:
|
config: TXT2HTMLExporterConfig | None = None) -> Self:
|
||||||
export_config=export_config or self.config.html_exporter_config
|
config = config or self.config.html_exporter_config
|
||||||
self._save(exporter=TXT2HTMLExporter(export_config), name=name, output_dir=output_dir)
|
self._save(exporter=TXT2HTMLExporter(config), name=name, output_dir=output_dir)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def save_as_txt(self, name: str = None, output_dir: Path | str = "./output", ) -> Self:
|
def save_as_txt(self, name: str = None, output_dir: Path | str = "./output",
|
||||||
|
_: ExporterConfig | None = None) -> Self:
|
||||||
self._save(exporter=TXT2TXTExporter(), name=name, output_dir=output_dir)
|
self._save(exporter=TXT2TXTExporter(), name=name, output_dir=output_dir)
|
||||||
return self
|
return self
|
||||||
|
|||||||
Reference in New Issue
Block a user