mdbasedworkflow架构完善

This commit is contained in:
xunbu
2025-07-30 23:54:55 +08:00
parent d25f634e73
commit f4b3432f45
5 changed files with 43 additions and 70 deletions

View File

@@ -19,11 +19,11 @@ class MDBasedCovertCacher:
def get_cached_result(self, document: Document, convert_engin: str, def get_cached_result(self, document: Document, convert_engin: str,
convert_config: ConverterConfig) -> MarkdownDocument | None: convert_config: ConverterConfig) -> MarkdownDocument | None:
return self.cache_dict.get(self._get_hashcode(document, convert_engin, convert_config.gethash())) return self.cache_dict.get(self._get_hashcode(document, convert_engin, convert_config))
def cache_result(self, convert_result: MarkdownDocument, document: Document, convert_engin: str, def cache_result(self, convert_result: MarkdownDocument, document: Document, convert_engin: str,
convert_config: ConverterConfig) -> MarkdownDocument: convert_config: ConverterConfig) -> MarkdownDocument:
hash_code = self._get_hashcode(document, convert_engin, convert_config.gethash()) hash_code = self._get_hashcode(document, convert_engin, convert_config)
if len(self.cache_dict) > int(CACHE_NUM): if len(self.cache_dict) > int(CACHE_NUM):
self.cache_dict.popitem(last=False) self.cache_dict.popitem(last=False)
self.cache_dict[hash_code] = convert_result self.cache_dict[hash_code] = convert_result

View File

@@ -1,3 +1,3 @@
from typing import Literal from typing import Literal
ConvertEnginType = Literal["mineru", "docling"] ConvertEnginType = Literal["mineru", "docling","identity"]

View File

@@ -6,21 +6,21 @@ from typing import Self, Generic, TypeVar
from docutranslate.exporter.base import Exporter from docutranslate.exporter.base import Exporter
from docutranslate.ir.document import Document from docutranslate.ir.document import Document
from docutranslate.logger import global_logger
@dataclass(kw_only=True) @dataclass(kw_only=True)
class WorkflowConfig: class WorkflowConfig:
logger: Logger | None = None logger: Logger | None = None
T_Config = TypeVar("T_Config", bound=WorkflowConfig)
T_original = TypeVar('T_original', bound=Document) T_original = TypeVar('T_original', bound=Document)
T_Translated = TypeVar('T_Translated', bound=Document) T_Translated = TypeVar('T_Translated', bound=Document)
class Workflow(ABC, Generic[T_original, T_Translated]): class Workflow(ABC, Generic[T_Config,T_original, T_Translated]):
def __init__(self, logger: Logger = global_logger): def __init__(self, config:T_Config):
self.logger = logger self.config=config
self.logger=self.config.logger
self.document_original: T_original | None = None self.document_original: T_original | None = None
self.document_translated: T_Translated | None = None self.document_translated: T_Translated | None = None

View File

@@ -1,7 +1,7 @@
from pathlib import Path from pathlib import Path
from typing import Protocol, Self, TypeVar, runtime_checkable from typing import Protocol, Self, TypeVar, runtime_checkable
from docutranslate.exporter.export_config import ExporterConfig from docutranslate.exporter.base import ExporterConfig
T = TypeVar("T", bound=ExporterConfig) T = TypeVar("T", bound=ExporterConfig)

View File

@@ -2,16 +2,19 @@ import asyncio
from dataclasses import dataclass from dataclasses import dataclass
from logging import Logger from logging import Logger
from pathlib import Path from pathlib import Path
from typing import Self, Tuple, Any from typing import Self, Tuple, Type
from docutranslate.cacher import md_based_convert_cacher from docutranslate.cacher import md_based_convert_cacher
from docutranslate.exporter.base import ExporterConfig
from docutranslate.global_values.conditional_import import DOCLING_EXIST from docutranslate.global_values.conditional_import import DOCLING_EXIST
from docutranslate.ir.document import Document
from docutranslate.ir.markdown_document import MarkdownDocument
if DOCLING_EXIST: if DOCLING_EXIST:
from docutranslate.converter.x2md.converter_docling import ConverterDoclingConfig, ConverterDocling from docutranslate.converter.x2md.converter_docling import ConverterDoclingConfig, ConverterDocling
from docutranslate.converter.x2md.converter_identity import ConverterIdentity from docutranslate.converter.x2md.converter_identity import ConverterIdentity
from docutranslate.converter.x2md.converter_mineru import ConverterMineruConfig, ConverterMineru from docutranslate.converter.x2md.converter_mineru import ConverterMineruConfig, ConverterMineru
from docutranslate.converter.x2md.base import X2MarkdownConverterConfig from docutranslate.converter.x2md.base import X2MarkdownConverterConfig, X2MarkdownConverter
from docutranslate.exporter.md.md2html_exporter import MD2HTMLExporterConfig, MD2HTMLExporter from docutranslate.exporter.md.md2html_exporter import MD2HTMLExporterConfig, MD2HTMLExporter
from docutranslate.exporter.md.md2md_exporter import MD2MDExporter from docutranslate.exporter.md.md2md_exporter import MD2MDExporter
from docutranslate.exporter.md.md2mdzip_exporter import MD2MDZipExporter from docutranslate.exporter.md.md2mdzip_exporter import MD2MDZipExporter
@@ -23,64 +26,35 @@ from docutranslate.translator.ai_translator.md_translator import MDTranslatorCon
@dataclass(kw_only=True) @dataclass(kw_only=True)
class MarkdownBasedWorkflowConfig(WorkflowConfig): class MarkdownBasedWorkflowConfig(WorkflowConfig):
# X2MarkdownConverterConfig
convert_engine: ConvertEnginType | None
formula: bool = True
# ConverterDoclingConfig
code: bool = True
artifact: Path | None = None
# ConverterMineruConfig
mineru_token: str
# MDTranslatorConfig
base_url: str
api_key: str
model_id: str
to_lang: str
custom_prompt: str | None = None
temperature: float = 0.7
timeout: int = 2000
chunk_size: int = 3000
concurrent: int = 30
# MD2HTMLExporterConfig
cdn: bool = True
# general
logger: Logger | None = None logger: Logger | None = None
convert_engine: ConvertEnginType
converter_config: X2MarkdownConverterConfig | None
translator_config: MDTranslatorConfig
html_exporter_config: MD2HTMLExporterConfig
class MarkdownBasedWorkflow(Workflow, HTMLExportable, MDFormatsExportable): class MarkdownBasedWorkflow(Workflow[MarkdownBasedWorkflowConfig, Document, MarkdownDocument], HTMLExportable,
MDFormatsExportable):
def __init__(self, config: MarkdownBasedWorkflowConfig): def __init__(self, config: MarkdownBasedWorkflowConfig):
super().__init__(config=config) super().__init__(config=config)
self._converter_factory: dict[ConvertEnginType, Tuple[Any, Any]] = { self._converter_factory: dict[
ConvertEnginType, Tuple[Type[X2MarkdownConverter], Type[X2MarkdownConverterConfig]] | None] = {
"mineru": (ConverterMineru, ConverterMineruConfig), "mineru": (ConverterMineru, ConverterMineruConfig),
"identity": (ConverterIdentity, None)
} }
if DOCLING_EXIST: if DOCLING_EXIST:
self._converter_factory["docling"] = (ConverterDocling, ConverterDoclingConfig) self._converter_factory["docling"] = (ConverterDocling, ConverterDoclingConfig)
self.x2markdown_converter_config:X2MarkdownConverterConfig|None
if config.convert_engine is None:
self.converter_config=None
elif config.convert_engine== "mineru":
self.converter_config = ConverterMineruConfig(formula=config.formula,
mineru_token=config.mineru_token)
elif DOCLING_EXIST and config.convert_engine== "docling":
self.converter_config = ConverterDoclingConfig(code=config.code,
formula=config.formula,
artifact=config.artifact)
self.translator_config = MDTranslatorConfig(base_url=config.base_url,
api_key=config.api_key,
model_id=config.model_id,
to_lang=config.to_lang,
custom_prompt=config.custom_prompt,
temperature=config.temperature,
timeout=config.timeout,
chunk_size=config.chunk_size,
concurrent=config.concurrent,
)
self.md2html_exporter_config = MD2HTMLExporterConfig(cdn=config.cdn)
self.convert_engine = config.convert_engine self.convert_engine = config.convert_engine
self.logger = config.logger
if self.logger:
for config in [self.config.converter_config, self.config.translator_config, self.config.html_exporter_config]:
if config is not None:
config.logger = self.logger
def _get_document_md(self,convert_engin:ConvertEnginType|None,convert_config:X2MarkdownConverterConfig): def _get_document_md(self, convert_engin: ConvertEnginType, convert_config: X2MarkdownConverterConfig):
if self.document_original is None: if self.document_original is None:
raise RuntimeError("file has not been read yet. Call read_path or read_bytes first.") raise RuntimeError("file has not been read yet. Call read_path or read_bytes first.")
# 获取缓存的解析后文件 # 获取缓存的解析后文件
document_cached = md_based_convert_cacher.get_cached_result(self.document_original, convert_engin, document_cached = md_based_convert_cacher.get_cached_result(self.document_original, convert_engin,
convert_config) convert_config)
@@ -88,7 +62,7 @@ class MarkdownBasedWorkflow(Workflow, HTMLExportable, MDFormatsExportable):
if document_cached: if document_cached:
document_md = document_cached document_md = document_cached
else: else:
if convert_engin is None or self.document_original.suffix == ".md": if self.document_original.suffix == ".md":
converter = ConverterIdentity() converter = ConverterIdentity()
elif convert_engin in self._converter_factory: elif convert_engin in self._converter_factory:
converter_class, config_class = self._converter_factory[convert_engin] converter_class, config_class = self._converter_factory[convert_engin]
@@ -103,10 +77,9 @@ class MarkdownBasedWorkflow(Workflow, HTMLExportable, MDFormatsExportable):
md_based_convert_cacher.cache_result(document_md, self.document_original, convert_engin, convert_config) md_based_convert_cacher.cache_result(document_md, self.document_original, convert_engin, convert_config)
return document_md return document_md
def translate(self) -> Self: def translate(self) -> Self:
convert_engin,convert_config=self.convert_engine,self.converter_config convert_engin, convert_config = self.convert_engine, self.config.converter_config
translator_config=self.translator_config translator_config = self.config.translator_config
document_md = self._get_document_md(convert_engin, convert_config) document_md = self._get_document_md(convert_engin, convert_config)
# 翻译解析后文件 # 翻译解析后文件
translator = MDTranslator(translator_config) translator = MDTranslator(translator_config)
@@ -115,8 +88,8 @@ class MarkdownBasedWorkflow(Workflow, HTMLExportable, MDFormatsExportable):
return self return self
async def translate_async(self) -> Self: async def translate_async(self) -> Self:
convert_engin,convert_config=self.convert_engine,self.converter_config convert_engin, convert_config = self.convert_engine, self.config.converter_config
translator_config=self.translator_config translator_config = self.config.translator_config
document_md = await asyncio.to_thread(self._get_document_md, convert_engin, convert_config) document_md = await asyncio.to_thread(self._get_document_md, convert_engin, convert_config)
# 翻译解析后文件 # 翻译解析后文件
translator = MDTranslator(translator_config) translator = MDTranslator(translator_config)
@@ -125,32 +98,32 @@ class MarkdownBasedWorkflow(Workflow, HTMLExportable, MDFormatsExportable):
return self return self
def export_to_html(self, export_config: MD2HTMLExporterConfig | None = None) -> str: def export_to_html(self, export_config: MD2HTMLExporterConfig | None = None) -> str:
export_config=export_config or self.md2html_exporter_config export_config = export_config or self.config.html_exporter_config
docu = self._export(MD2HTMLExporter(export_config)) docu = self._export(MD2HTMLExporter(export_config))
return docu.content.decode() return docu.content.decode()
def export_to_markdown(self, export_config: X2MarkdownConverterConfig | None = None) -> str: def export_to_markdown(self, config: ExporterConfig | None = None) -> str:
docu = self._export(MD2MDExporter()) docu = self._export(MD2MDExporter())
return docu.content.decode() return docu.content.decode()
def export_to_markdown_zip(self, export_config: X2MarkdownConverterConfig | None = None) -> bytes: def export_to_markdown_zip(self, config: ExporterConfig | None = None) -> bytes:
docu = self._export(MD2MDZipExporter()) docu = self._export(MD2MDZipExporter())
return docu.content return docu.content
def save_as_html(self, name: str = None, output_dir: Path | str = "./output", def save_as_html(self, name: str = None, output_dir: Path | str = "./output",
export_config: MD2HTMLExporterConfig | None = None) -> Self: export_config: MD2HTMLExporterConfig | None = None) -> Self:
export_config = export_config or self.md2html_exporter_config export_config = export_config or self.config.html_exporter_config
self._save(exporter=MD2HTMLExporter(config=export_config), name=name, output_dir=output_dir) self._save(exporter=MD2HTMLExporter(config=export_config), name=name, output_dir=output_dir)
return self return self
def save_as_markdown(self, name: str = None, output_dir: Path | str = "./output", def save_as_markdown(self, name: str = None, output_dir: Path | str = "./output",
export_config=None) -> Self: export_config: ExporterConfig | None = None) -> Self:
self._save(exporter=MD2MDExporter(), name=name, output_dir=output_dir) self._save(exporter=MD2MDExporter(), name=name, output_dir=output_dir)
return self return self
def save_as_markdown_zip(self, name: str = None, output_dir: Path | str = "./output", def save_as_markdown_zip(self, name: str = None, output_dir: Path | str = "./output",
export_config=None) -> Self: export_config: ExporterConfig | None = None) -> Self:
self._save(exporter=MD2MDZipExporter(), name=name, output_dir=output_dir) self._save(exporter=MD2MDZipExporter(), name=name, output_dir=output_dir)
return self return self