mdbasedworkflow架构完善

This commit is contained in:
xunbu
2025-07-30 23:54:55 +08:00
parent d25f634e73
commit f4b3432f45
5 changed files with 43 additions and 70 deletions

View File

@@ -19,11 +19,11 @@ class MDBasedCovertCacher:
def get_cached_result(self, document: Document, convert_engin: str,
convert_config: ConverterConfig) -> MarkdownDocument | None:
return self.cache_dict.get(self._get_hashcode(document, convert_engin, convert_config.gethash()))
return self.cache_dict.get(self._get_hashcode(document, convert_engin, convert_config))
def cache_result(self, convert_result: MarkdownDocument, document: Document, convert_engin: str,
convert_config: ConverterConfig) -> MarkdownDocument:
hash_code = self._get_hashcode(document, convert_engin, convert_config.gethash())
hash_code = self._get_hashcode(document, convert_engin, convert_config)
if len(self.cache_dict) > int(CACHE_NUM):
self.cache_dict.popitem(last=False)
self.cache_dict[hash_code] = convert_result

View File

@@ -1,3 +1,3 @@
from typing import Literal
ConvertEnginType = Literal["mineru", "docling"]
ConvertEnginType = Literal["mineru", "docling","identity"]

View File

@@ -6,21 +6,21 @@ from typing import Self, Generic, TypeVar
from docutranslate.exporter.base import Exporter
from docutranslate.ir.document import Document
from docutranslate.logger import global_logger
@dataclass(kw_only=True)
class WorkflowConfig:
logger: Logger | None = None
T_Config = TypeVar("T_Config", bound=WorkflowConfig)
T_original = TypeVar('T_original', bound=Document)
T_Translated = TypeVar('T_Translated', bound=Document)
class Workflow(ABC, Generic[T_original, T_Translated]):
def __init__(self, logger: Logger = global_logger):
self.logger = logger
class Workflow(ABC, Generic[T_Config,T_original, T_Translated]):
def __init__(self, config:T_Config):
self.config=config
self.logger=self.config.logger
self.document_original: T_original | None = None
self.document_translated: T_Translated | None = None

View File

@@ -1,7 +1,7 @@
from pathlib import Path
from typing import Protocol, Self, TypeVar, runtime_checkable
from docutranslate.exporter.export_config import ExporterConfig
from docutranslate.exporter.base import ExporterConfig
T = TypeVar("T", bound=ExporterConfig)

View File

@@ -2,16 +2,19 @@ import asyncio
from dataclasses import dataclass
from logging import Logger
from pathlib import Path
from typing import Self, Tuple, Any
from typing import Self, Tuple, Type
from docutranslate.cacher import md_based_convert_cacher
from docutranslate.exporter.base import ExporterConfig
from docutranslate.global_values.conditional_import import DOCLING_EXIST
from docutranslate.ir.document import Document
from docutranslate.ir.markdown_document import MarkdownDocument
if DOCLING_EXIST:
from docutranslate.converter.x2md.converter_docling import ConverterDoclingConfig, ConverterDocling
from docutranslate.converter.x2md.converter_identity import ConverterIdentity
from docutranslate.converter.x2md.converter_mineru import ConverterMineruConfig, ConverterMineru
from docutranslate.converter.x2md.base import X2MarkdownConverterConfig
from docutranslate.converter.x2md.base import X2MarkdownConverterConfig, X2MarkdownConverter
from docutranslate.exporter.md.md2html_exporter import MD2HTMLExporterConfig, MD2HTMLExporter
from docutranslate.exporter.md.md2md_exporter import MD2MDExporter
from docutranslate.exporter.md.md2mdzip_exporter import MD2MDZipExporter
@@ -23,64 +26,35 @@ from docutranslate.translator.ai_translator.md_translator import MDTranslatorCon
@dataclass(kw_only=True)
class MarkdownBasedWorkflowConfig(WorkflowConfig):
# X2MarkdownConverterConfig
convert_engine: ConvertEnginType | None
formula: bool = True
# ConverterDoclingConfig
code: bool = True
artifact: Path | None = None
# ConverterMineruConfig
mineru_token: str
# MDTranslatorConfig
base_url: str
api_key: str
model_id: str
to_lang: str
custom_prompt: str | None = None
temperature: float = 0.7
timeout: int = 2000
chunk_size: int = 3000
concurrent: int = 30
# MD2HTMLExporterConfig
cdn: bool = True
# general
logger: Logger | None = None
convert_engine: ConvertEnginType
converter_config: X2MarkdownConverterConfig | None
translator_config: MDTranslatorConfig
html_exporter_config: MD2HTMLExporterConfig
class MarkdownBasedWorkflow(Workflow, HTMLExportable, MDFormatsExportable):
class MarkdownBasedWorkflow(Workflow[MarkdownBasedWorkflowConfig, Document, MarkdownDocument], HTMLExportable,
MDFormatsExportable):
def __init__(self, config: MarkdownBasedWorkflowConfig):
super().__init__(config=config)
self._converter_factory: dict[ConvertEnginType, Tuple[Any, Any]] = {
self._converter_factory: dict[
ConvertEnginType, Tuple[Type[X2MarkdownConverter], Type[X2MarkdownConverterConfig]] | None] = {
"mineru": (ConverterMineru, ConverterMineruConfig),
"identity": (ConverterIdentity, None)
}
if DOCLING_EXIST:
self._converter_factory["docling"] = (ConverterDocling, ConverterDoclingConfig)
self.x2markdown_converter_config:X2MarkdownConverterConfig|None
if config.convert_engine is None:
self.converter_config=None
elif config.convert_engine== "mineru":
self.converter_config = ConverterMineruConfig(formula=config.formula,
mineru_token=config.mineru_token)
elif DOCLING_EXIST and config.convert_engine== "docling":
self.converter_config = ConverterDoclingConfig(code=config.code,
formula=config.formula,
artifact=config.artifact)
self.translator_config = MDTranslatorConfig(base_url=config.base_url,
api_key=config.api_key,
model_id=config.model_id,
to_lang=config.to_lang,
custom_prompt=config.custom_prompt,
temperature=config.temperature,
timeout=config.timeout,
chunk_size=config.chunk_size,
concurrent=config.concurrent,
)
self.md2html_exporter_config = MD2HTMLExporterConfig(cdn=config.cdn)
self.convert_engine = config.convert_engine
self.logger = config.logger
if self.logger:
for config in [self.config.converter_config, self.config.translator_config, self.config.html_exporter_config]:
if config is not None:
config.logger = self.logger
def _get_document_md(self,convert_engin:ConvertEnginType|None,convert_config:X2MarkdownConverterConfig):
def _get_document_md(self, convert_engin: ConvertEnginType, convert_config: X2MarkdownConverterConfig):
if self.document_original is None:
raise RuntimeError("file has not been read yet. Call read_path or read_bytes first.")
# 获取缓存的解析后文件
document_cached = md_based_convert_cacher.get_cached_result(self.document_original, convert_engin,
convert_config)
@@ -88,7 +62,7 @@ class MarkdownBasedWorkflow(Workflow, HTMLExportable, MDFormatsExportable):
if document_cached:
document_md = document_cached
else:
if convert_engin is None or self.document_original.suffix == ".md":
if self.document_original.suffix == ".md":
converter = ConverterIdentity()
elif convert_engin in self._converter_factory:
converter_class, config_class = self._converter_factory[convert_engin]
@@ -103,10 +77,9 @@ class MarkdownBasedWorkflow(Workflow, HTMLExportable, MDFormatsExportable):
md_based_convert_cacher.cache_result(document_md, self.document_original, convert_engin, convert_config)
return document_md
def translate(self) -> Self:
convert_engin,convert_config=self.convert_engine,self.converter_config
translator_config=self.translator_config
convert_engin, convert_config = self.convert_engine, self.config.converter_config
translator_config = self.config.translator_config
document_md = self._get_document_md(convert_engin, convert_config)
# 翻译解析后文件
translator = MDTranslator(translator_config)
@@ -115,8 +88,8 @@ class MarkdownBasedWorkflow(Workflow, HTMLExportable, MDFormatsExportable):
return self
async def translate_async(self) -> Self:
convert_engin,convert_config=self.convert_engine,self.converter_config
translator_config=self.translator_config
convert_engin, convert_config = self.convert_engine, self.config.converter_config
translator_config = self.config.translator_config
document_md = await asyncio.to_thread(self._get_document_md, convert_engin, convert_config)
# 翻译解析后文件
translator = MDTranslator(translator_config)
@@ -125,32 +98,32 @@ class MarkdownBasedWorkflow(Workflow, HTMLExportable, MDFormatsExportable):
return self
def export_to_html(self, export_config: MD2HTMLExporterConfig | None = None) -> str:
export_config=export_config or self.md2html_exporter_config
export_config = export_config or self.config.html_exporter_config
docu = self._export(MD2HTMLExporter(export_config))
return docu.content.decode()
def export_to_markdown(self, export_config: X2MarkdownConverterConfig | None = None) -> str:
def export_to_markdown(self, config: ExporterConfig | None = None) -> str:
docu = self._export(MD2MDExporter())
return docu.content.decode()
def export_to_markdown_zip(self, export_config: X2MarkdownConverterConfig | None = None) -> bytes:
def export_to_markdown_zip(self, config: ExporterConfig | None = None) -> bytes:
docu = self._export(MD2MDZipExporter())
return docu.content
def save_as_html(self, name: str = None, output_dir: Path | str = "./output",
export_config: MD2HTMLExporterConfig | None = None) -> Self:
export_config = export_config or self.md2html_exporter_config
export_config = export_config or self.config.html_exporter_config
self._save(exporter=MD2HTMLExporter(config=export_config), name=name, output_dir=output_dir)
return self
def save_as_markdown(self, name: str = None, output_dir: Path | str = "./output",
export_config=None) -> Self:
export_config: ExporterConfig | None = None) -> Self:
self._save(exporter=MD2MDExporter(), name=name, output_dir=output_dir)
return self
def save_as_markdown_zip(self, name: str = None, output_dir: Path | str = "./output",
export_config=None) -> Self:
export_config: ExporterConfig | None = None) -> Self:
self._save(exporter=MD2MDZipExporter(), name=name, output_dir=output_dir)
return self