|
|
|
@@ -2,16 +2,19 @@ import asyncio
|
|
|
|
from dataclasses import dataclass
|
|
|
|
from dataclasses import dataclass
|
|
|
|
from logging import Logger
|
|
|
|
from logging import Logger
|
|
|
|
from pathlib import Path
|
|
|
|
from pathlib import Path
|
|
|
|
from typing import Self, Tuple, Any
|
|
|
|
from typing import Self, Tuple, Type
|
|
|
|
|
|
|
|
|
|
|
|
from docutranslate.cacher import md_based_convert_cacher
|
|
|
|
from docutranslate.cacher import md_based_convert_cacher
|
|
|
|
|
|
|
|
from docutranslate.exporter.base import ExporterConfig
|
|
|
|
from docutranslate.global_values.conditional_import import DOCLING_EXIST
|
|
|
|
from docutranslate.global_values.conditional_import import DOCLING_EXIST
|
|
|
|
|
|
|
|
from docutranslate.ir.document import Document
|
|
|
|
|
|
|
|
from docutranslate.ir.markdown_document import MarkdownDocument
|
|
|
|
|
|
|
|
|
|
|
|
if DOCLING_EXIST:
|
|
|
|
if DOCLING_EXIST:
|
|
|
|
from docutranslate.converter.x2md.converter_docling import ConverterDoclingConfig, ConverterDocling
|
|
|
|
from docutranslate.converter.x2md.converter_docling import ConverterDoclingConfig, ConverterDocling
|
|
|
|
from docutranslate.converter.x2md.converter_identity import ConverterIdentity
|
|
|
|
from docutranslate.converter.x2md.converter_identity import ConverterIdentity
|
|
|
|
from docutranslate.converter.x2md.converter_mineru import ConverterMineruConfig, ConverterMineru
|
|
|
|
from docutranslate.converter.x2md.converter_mineru import ConverterMineruConfig, ConverterMineru
|
|
|
|
from docutranslate.converter.x2md.base import X2MarkdownConverterConfig
|
|
|
|
from docutranslate.converter.x2md.base import X2MarkdownConverterConfig, X2MarkdownConverter
|
|
|
|
from docutranslate.exporter.md.md2html_exporter import MD2HTMLExporterConfig, MD2HTMLExporter
|
|
|
|
from docutranslate.exporter.md.md2html_exporter import MD2HTMLExporterConfig, MD2HTMLExporter
|
|
|
|
from docutranslate.exporter.md.md2md_exporter import MD2MDExporter
|
|
|
|
from docutranslate.exporter.md.md2md_exporter import MD2MDExporter
|
|
|
|
from docutranslate.exporter.md.md2mdzip_exporter import MD2MDZipExporter
|
|
|
|
from docutranslate.exporter.md.md2mdzip_exporter import MD2MDZipExporter
|
|
|
|
@@ -23,64 +26,35 @@ from docutranslate.translator.ai_translator.md_translator import MDTranslatorCon
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass(kw_only=True)
|
|
|
|
@dataclass(kw_only=True)
|
|
|
|
class MarkdownBasedWorkflowConfig(WorkflowConfig):
|
|
|
|
class MarkdownBasedWorkflowConfig(WorkflowConfig):
|
|
|
|
# X2MarkdownConverterConfig
|
|
|
|
|
|
|
|
convert_engine: ConvertEnginType | None
|
|
|
|
|
|
|
|
formula: bool = True
|
|
|
|
|
|
|
|
# ConverterDoclingConfig
|
|
|
|
|
|
|
|
code: bool = True
|
|
|
|
|
|
|
|
artifact: Path | None = None
|
|
|
|
|
|
|
|
# ConverterMineruConfig
|
|
|
|
|
|
|
|
mineru_token: str
|
|
|
|
|
|
|
|
# MDTranslatorConfig
|
|
|
|
|
|
|
|
base_url: str
|
|
|
|
|
|
|
|
api_key: str
|
|
|
|
|
|
|
|
model_id: str
|
|
|
|
|
|
|
|
to_lang: str
|
|
|
|
|
|
|
|
custom_prompt: str | None = None
|
|
|
|
|
|
|
|
temperature: float = 0.7
|
|
|
|
|
|
|
|
timeout: int = 2000
|
|
|
|
|
|
|
|
chunk_size: int = 3000
|
|
|
|
|
|
|
|
concurrent: int = 30
|
|
|
|
|
|
|
|
# MD2HTMLExporterConfig
|
|
|
|
|
|
|
|
cdn: bool = True
|
|
|
|
|
|
|
|
# general
|
|
|
|
|
|
|
|
logger: Logger | None = None
|
|
|
|
logger: Logger | None = None
|
|
|
|
|
|
|
|
convert_engine: ConvertEnginType
|
|
|
|
|
|
|
|
converter_config: X2MarkdownConverterConfig | None
|
|
|
|
|
|
|
|
translator_config: MDTranslatorConfig
|
|
|
|
|
|
|
|
html_exporter_config: MD2HTMLExporterConfig
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class MarkdownBasedWorkflow(Workflow, HTMLExportable, MDFormatsExportable):
|
|
|
|
class MarkdownBasedWorkflow(Workflow[MarkdownBasedWorkflowConfig, Document, MarkdownDocument], HTMLExportable,
|
|
|
|
|
|
|
|
MDFormatsExportable):
|
|
|
|
def __init__(self, config: MarkdownBasedWorkflowConfig):
|
|
|
|
def __init__(self, config: MarkdownBasedWorkflowConfig):
|
|
|
|
super().__init__(config=config)
|
|
|
|
super().__init__(config=config)
|
|
|
|
self._converter_factory: dict[ConvertEnginType, Tuple[Any, Any]] = {
|
|
|
|
self._converter_factory: dict[
|
|
|
|
|
|
|
|
ConvertEnginType, Tuple[Type[X2MarkdownConverter], Type[X2MarkdownConverterConfig]] | None] = {
|
|
|
|
"mineru": (ConverterMineru, ConverterMineruConfig),
|
|
|
|
"mineru": (ConverterMineru, ConverterMineruConfig),
|
|
|
|
|
|
|
|
"identity": (ConverterIdentity, None)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if DOCLING_EXIST:
|
|
|
|
if DOCLING_EXIST:
|
|
|
|
self._converter_factory["docling"] = (ConverterDocling, ConverterDoclingConfig)
|
|
|
|
self._converter_factory["docling"] = (ConverterDocling, ConverterDoclingConfig)
|
|
|
|
self.x2markdown_converter_config:X2MarkdownConverterConfig|None
|
|
|
|
|
|
|
|
if config.convert_engine is None:
|
|
|
|
|
|
|
|
self.converter_config=None
|
|
|
|
|
|
|
|
elif config.convert_engine== "mineru":
|
|
|
|
|
|
|
|
self.converter_config = ConverterMineruConfig(formula=config.formula,
|
|
|
|
|
|
|
|
mineru_token=config.mineru_token)
|
|
|
|
|
|
|
|
elif DOCLING_EXIST and config.convert_engine== "docling":
|
|
|
|
|
|
|
|
self.converter_config = ConverterDoclingConfig(code=config.code,
|
|
|
|
|
|
|
|
formula=config.formula,
|
|
|
|
|
|
|
|
artifact=config.artifact)
|
|
|
|
|
|
|
|
self.translator_config = MDTranslatorConfig(base_url=config.base_url,
|
|
|
|
|
|
|
|
api_key=config.api_key,
|
|
|
|
|
|
|
|
model_id=config.model_id,
|
|
|
|
|
|
|
|
to_lang=config.to_lang,
|
|
|
|
|
|
|
|
custom_prompt=config.custom_prompt,
|
|
|
|
|
|
|
|
temperature=config.temperature,
|
|
|
|
|
|
|
|
timeout=config.timeout,
|
|
|
|
|
|
|
|
chunk_size=config.chunk_size,
|
|
|
|
|
|
|
|
concurrent=config.concurrent,
|
|
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
self.md2html_exporter_config = MD2HTMLExporterConfig(cdn=config.cdn)
|
|
|
|
|
|
|
|
self.convert_engine = config.convert_engine
|
|
|
|
self.convert_engine = config.convert_engine
|
|
|
|
|
|
|
|
self.logger = config.logger
|
|
|
|
|
|
|
|
if self.logger:
|
|
|
|
|
|
|
|
for config in [self.config.converter_config, self.config.translator_config, self.config.html_exporter_config]:
|
|
|
|
|
|
|
|
if config is not None:
|
|
|
|
|
|
|
|
config.logger = self.logger
|
|
|
|
|
|
|
|
|
|
|
|
def _get_document_md(self,convert_engin:ConvertEnginType|None,convert_config:X2MarkdownConverterConfig):
|
|
|
|
def _get_document_md(self, convert_engin: ConvertEnginType, convert_config: X2MarkdownConverterConfig):
|
|
|
|
if self.document_original is None:
|
|
|
|
if self.document_original is None:
|
|
|
|
raise RuntimeError("file has not been read yet. Call read_path or read_bytes first.")
|
|
|
|
raise RuntimeError("file has not been read yet. Call read_path or read_bytes first.")
|
|
|
|
|
|
|
|
|
|
|
|
# 获取缓存的解析后文件
|
|
|
|
# 获取缓存的解析后文件
|
|
|
|
document_cached = md_based_convert_cacher.get_cached_result(self.document_original, convert_engin,
|
|
|
|
document_cached = md_based_convert_cacher.get_cached_result(self.document_original, convert_engin,
|
|
|
|
convert_config)
|
|
|
|
convert_config)
|
|
|
|
@@ -88,7 +62,7 @@ class MarkdownBasedWorkflow(Workflow, HTMLExportable, MDFormatsExportable):
|
|
|
|
if document_cached:
|
|
|
|
if document_cached:
|
|
|
|
document_md = document_cached
|
|
|
|
document_md = document_cached
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
if convert_engin is None or self.document_original.suffix == ".md":
|
|
|
|
if self.document_original.suffix == ".md":
|
|
|
|
converter = ConverterIdentity()
|
|
|
|
converter = ConverterIdentity()
|
|
|
|
elif convert_engin in self._converter_factory:
|
|
|
|
elif convert_engin in self._converter_factory:
|
|
|
|
converter_class, config_class = self._converter_factory[convert_engin]
|
|
|
|
converter_class, config_class = self._converter_factory[convert_engin]
|
|
|
|
@@ -103,10 +77,9 @@ class MarkdownBasedWorkflow(Workflow, HTMLExportable, MDFormatsExportable):
|
|
|
|
md_based_convert_cacher.cache_result(document_md, self.document_original, convert_engin, convert_config)
|
|
|
|
md_based_convert_cacher.cache_result(document_md, self.document_original, convert_engin, convert_config)
|
|
|
|
return document_md
|
|
|
|
return document_md
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def translate(self) -> Self:
|
|
|
|
def translate(self) -> Self:
|
|
|
|
convert_engin,convert_config=self.convert_engine,self.converter_config
|
|
|
|
convert_engin, convert_config = self.convert_engine, self.config.converter_config
|
|
|
|
translator_config=self.translator_config
|
|
|
|
translator_config = self.config.translator_config
|
|
|
|
document_md = self._get_document_md(convert_engin, convert_config)
|
|
|
|
document_md = self._get_document_md(convert_engin, convert_config)
|
|
|
|
# 翻译解析后文件
|
|
|
|
# 翻译解析后文件
|
|
|
|
translator = MDTranslator(translator_config)
|
|
|
|
translator = MDTranslator(translator_config)
|
|
|
|
@@ -115,8 +88,8 @@ class MarkdownBasedWorkflow(Workflow, HTMLExportable, MDFormatsExportable):
|
|
|
|
return self
|
|
|
|
return self
|
|
|
|
|
|
|
|
|
|
|
|
async def translate_async(self) -> Self:
|
|
|
|
async def translate_async(self) -> Self:
|
|
|
|
convert_engin,convert_config=self.convert_engine,self.converter_config
|
|
|
|
convert_engin, convert_config = self.convert_engine, self.config.converter_config
|
|
|
|
translator_config=self.translator_config
|
|
|
|
translator_config = self.config.translator_config
|
|
|
|
document_md = await asyncio.to_thread(self._get_document_md, convert_engin, convert_config)
|
|
|
|
document_md = await asyncio.to_thread(self._get_document_md, convert_engin, convert_config)
|
|
|
|
# 翻译解析后文件
|
|
|
|
# 翻译解析后文件
|
|
|
|
translator = MDTranslator(translator_config)
|
|
|
|
translator = MDTranslator(translator_config)
|
|
|
|
@@ -125,32 +98,32 @@ class MarkdownBasedWorkflow(Workflow, HTMLExportable, MDFormatsExportable):
|
|
|
|
return self
|
|
|
|
return self
|
|
|
|
|
|
|
|
|
|
|
|
def export_to_html(self, export_config: MD2HTMLExporterConfig | None = None) -> str:
|
|
|
|
def export_to_html(self, export_config: MD2HTMLExporterConfig | None = None) -> str:
|
|
|
|
export_config=export_config or self.md2html_exporter_config
|
|
|
|
export_config = export_config or self.config.html_exporter_config
|
|
|
|
docu = self._export(MD2HTMLExporter(export_config))
|
|
|
|
docu = self._export(MD2HTMLExporter(export_config))
|
|
|
|
return docu.content.decode()
|
|
|
|
return docu.content.decode()
|
|
|
|
|
|
|
|
|
|
|
|
def export_to_markdown(self, export_config: X2MarkdownConverterConfig | None = None) -> str:
|
|
|
|
def export_to_markdown(self, config: ExporterConfig | None = None) -> str:
|
|
|
|
docu = self._export(MD2MDExporter())
|
|
|
|
docu = self._export(MD2MDExporter())
|
|
|
|
return docu.content.decode()
|
|
|
|
return docu.content.decode()
|
|
|
|
|
|
|
|
|
|
|
|
def export_to_markdown_zip(self, export_config: X2MarkdownConverterConfig | None = None) -> bytes:
|
|
|
|
def export_to_markdown_zip(self, config: ExporterConfig | None = None) -> bytes:
|
|
|
|
docu = self._export(MD2MDZipExporter())
|
|
|
|
docu = self._export(MD2MDZipExporter())
|
|
|
|
return docu.content
|
|
|
|
return docu.content
|
|
|
|
|
|
|
|
|
|
|
|
def save_as_html(self, name: str = None, output_dir: Path | str = "./output",
|
|
|
|
def save_as_html(self, name: str = None, output_dir: Path | str = "./output",
|
|
|
|
export_config: MD2HTMLExporterConfig | None = None) -> Self:
|
|
|
|
export_config: MD2HTMLExporterConfig | None = None) -> Self:
|
|
|
|
export_config = export_config or self.md2html_exporter_config
|
|
|
|
export_config = export_config or self.config.html_exporter_config
|
|
|
|
self._save(exporter=MD2HTMLExporter(config=export_config), name=name, output_dir=output_dir)
|
|
|
|
self._save(exporter=MD2HTMLExporter(config=export_config), name=name, output_dir=output_dir)
|
|
|
|
return self
|
|
|
|
return self
|
|
|
|
|
|
|
|
|
|
|
|
def save_as_markdown(self, name: str = None, output_dir: Path | str = "./output",
|
|
|
|
def save_as_markdown(self, name: str = None, output_dir: Path | str = "./output",
|
|
|
|
export_config=None) -> Self:
|
|
|
|
export_config: ExporterConfig | None = None) -> Self:
|
|
|
|
|
|
|
|
|
|
|
|
self._save(exporter=MD2MDExporter(), name=name, output_dir=output_dir)
|
|
|
|
self._save(exporter=MD2MDExporter(), name=name, output_dir=output_dir)
|
|
|
|
return self
|
|
|
|
return self
|
|
|
|
|
|
|
|
|
|
|
|
def save_as_markdown_zip(self, name: str = None, output_dir: Path | str = "./output",
|
|
|
|
def save_as_markdown_zip(self, name: str = None, output_dir: Path | str = "./output",
|
|
|
|
export_config=None) -> Self:
|
|
|
|
export_config: ExporterConfig | None = None) -> Self:
|
|
|
|
|
|
|
|
|
|
|
|
self._save(exporter=MD2MDZipExporter(), name=name, output_dir=output_dir)
|
|
|
|
self._save(exporter=MD2MDZipExporter(), name=name, output_dir=output_dir)
|
|
|
|
return self
|
|
|
|
return self
|
|
|
|
|