重构workflow

This commit is contained in:
xunbu
2025-07-30 20:48:11 +08:00
parent 8987e4ef60
commit d25f634e73
38 changed files with 351 additions and 286 deletions

View File

@@ -43,15 +43,14 @@
2. `pip install -e .`
3. `uv pip install -e .`#使用uv
# 支持的文件格式
# 翻译工作流
| 输入格式 | 输出格式 |
|----------------|--------------|
| PDF | Markdown推荐 |
| Markdown | HTML |
| HTML、XHTML | PDF(仅交互界面支持) |
| CSV | |
| DOC、DOCX部分支持 | |
| 工作流 | 代码 | 输入格式 | 输出格式 |
|-------------------------|------------------|----------------------------------------|----------------------|
| `MarkdownBasedWorkflow` | `markdown_based` | `.pdf` `.md` `.png` `.jpeg` `.docx` 等 | `.md` `.html` `.pdf` |
| `TXTWorkflow` | `txt` | `.txt` | `.txt` `.html` `.pdf` |
> 所有.pdf的输出只能通过交互式界面获取
> 如果不想通过交互界面获取 PDF,可以先下载 HTML 文件,用浏览器打开后再打印为 PDF
@@ -143,7 +142,7 @@ docutranslate -i -p 8011
## 翻译文件
```python
from docutranslate.translater import FileTranslater
from docutranslate.translator import FileTranslater
translater = FileTranslater(base_url="<baseurl>", # 大模型的baseurl
key="<api-key>", # 大模型的api-key

View File

@@ -22,7 +22,7 @@ from pydantic import BaseModel, Field
from docutranslate.global_values.conditional_import import DOCLING_EXIST
# --- 核心代码重构后的新 Imports ---
from docutranslate.workflow.base_workflow import BaseWorkflow
from docutranslate.workflow.base import Workflow
from docutranslate.workflow.interfaces import HTMLExportable, MDFormatsExportable, TXTExportable
from docutranslate.workflow.md_based_workflow import MarkdownBasedWorkflow
from docutranslate.workflow.txt_workflow import TXTWorkflow
@@ -30,16 +30,16 @@ from docutranslate.workflow.txt_workflow import TXTWorkflow
if DOCLING_EXIST or TYPE_CHECKING:
from docutranslate.converter.x2md.converter_docling import ConverterDoclingConfig
from docutranslate.converter.x2md.converter_mineru import ConverterMineruConfig
from docutranslate.exporter.md2x.md2html_exporter import MD2HTMLExportConfig
from docutranslate.exporter.txt2x.txt2html_exporter import TXT2HTMLExportConfig
from docutranslate.translater.base import AiTranslateConfig
from docutranslate.translater.md_translator import MDTranslateConfig
from docutranslate.translater.txt_translator import TXTTranslateConfig
from docutranslate.exporter.md.md2html_exporter import MD2HTMLExporterConfig
from docutranslate.exporter.txt.txt2html_exporter import TXT2HTMLExporterConfig
from docutranslate.translator.base import AiTranslateConfig
from docutranslate.translator.ai_translator.md_translator import MDTranslatorConfig
from docutranslate.translator.ai_translator.txt_translator import TXTTranslatorConfig
# ------------------------------------
from docutranslate import __version__
from docutranslate.logger import global_logger
from docutranslate.translater import default_params
from docutranslate.translator import default_params
from docutranslate.utils.resource_utils import resource_path
# --- 全局配置 (MODIFIED) ---
@@ -50,7 +50,7 @@ MAX_LOG_HISTORY = 200
httpx_client: httpx.AsyncClient
# --- [NEW] Workflow字典 ---
WORKFLOW_DICT: Dict[str, type[BaseWorkflow]] = {
WORKFLOW_DICT: Dict[str, type[Workflow]] = {
"markdown_based": MarkdownBasedWorkflow,
"txt": TXTWorkflow,
}
@@ -70,7 +70,7 @@ def _create_default_task_state() -> Dict[str, Any]:
# --- [KEPT FOR TEMP ENDPOINT] Workflow 工厂函数 (旧逻辑,仅为临时接口保留) ---
def _get_workflow_for_file(filename: str, logger: logging.Logger) -> BaseWorkflow:
def _get_workflow_for_file(filename: str, logger: logging.Logger) -> Workflow:
"""根据文件名后缀选择并返回合适的 Workflow 实例。这是扩展点。"""
suffix = Path(filename).suffix.lower()
if suffix == '.txt':
@@ -299,7 +299,7 @@ async def _perform_translation(
# 4. 根据 payload 的具体类型执行不同的翻译流程 (类型安全!)
if isinstance(payload, MarkdownWorkflowParams) and isinstance(workflow, MarkdownBasedWorkflow):
task_logger.info("执行 MarkdownBased 翻译流程。")
translate_config = MDTranslateConfig(**ai_config.__dict__)
translate_config = MDTranslatorConfig(**ai_config.__dict__)
convert_config = None
if payload.convert_engin == 'mineru':
@@ -323,7 +323,7 @@ async def _perform_translation(
elif isinstance(payload, TextWorkflowParams) and isinstance(workflow, TXTWorkflow):
task_logger.info("执行 TXT 翻译流程。")
translate_config = TXTTranslateConfig(**ai_config.__dict__)
translate_config = TXTTranslatorConfig(**ai_config.__dict__)
await workflow.translate_async(translate_config=translate_config)
else:
@@ -750,7 +750,7 @@ async def _get_content_from_workflow(task_id: str, file_type: FileType) -> tuple
if not task_state.get("download_ready") or not task_state.get("workflow_instance"):
raise HTTPException(status_code=404, detail="内容尚未准备好。")
workflow: BaseWorkflow = task_state["workflow_instance"]
workflow: Workflow = task_state["workflow_instance"]
filename_stem = task_state['original_filename_stem']
try:
@@ -759,8 +759,8 @@ async def _get_content_from_workflow(task_id: str, file_type: FileType) -> tuple
filename: str
if file_type == 'html' and isinstance(workflow, HTMLExportable):
config = MD2HTMLExportConfig(cdn=True) if isinstance(workflow,
MarkdownBasedWorkflow) else TXT2HTMLExportConfig(
config = MD2HTMLExporterConfig(cdn=True) if isinstance(workflow,
MarkdownBasedWorkflow) else TXT2HTMLExporterConfig(
cdn=True)
try:
await httpx_client.head("https://s4.zstatic.net/ajax/libs/KaTeX/0.16.9/contrib/auto-render.min.js",
@@ -1073,14 +1073,14 @@ async def temp_translate(
workflow.read_bytes(decoded_content, Path(file_name).stem, Path(file_name).suffix)
if isinstance(workflow, MarkdownBasedWorkflow):
translate_config = MDTranslateConfig(**ai_config.__dict__)
translate_config = MDTranslatorConfig(**ai_config.__dict__)
convert_config = ConverterMineruConfig(mineru_token=mineru_token) if mineru_token else None
convert_engin = 'mineru' if mineru_token else None
await workflow.translate_async(convert_engin, convert_config, translate_config)
return {"success": True, "content": workflow.export_to_markdown()}
elif isinstance(workflow, TXTWorkflow):
translate_config = TXTTranslateConfig(**ai_config.__dict__)
translate_config = TXTTranslatorConfig(**ai_config.__dict__)
await workflow.translate_async(translate_config)
return {"success": True, "content": workflow.export_to_txt()}

View File

@@ -1,7 +1,7 @@
import os
from collections import OrderedDict
from docutranslate.exporter.md2x.types import x2md_convert_config_type
from docutranslate.converter.base import ConverterConfig
from docutranslate.ir.document import Document
from docutranslate.ir.markdown_document import MarkdownDocument
@@ -13,17 +13,17 @@ class MDBasedCovertCacher:
self.cache_dict = OrderedDict()
@staticmethod
def _get_hashcode(document: Document, convert_engin: str, convert_config: x2md_convert_config_type) -> str:
obj = (document.suffix, document.content, convert_engin, convert_config)
def _get_hashcode(document: Document, convert_engin: str, convert_config: ConverterConfig) -> str:
obj = (document.suffix, document.content, convert_engin, convert_config.gethash())
return str(hash(obj))
def get_cached_result(self, document: Document, convert_engin: str,
convert_config: x2md_convert_config_type) -> MarkdownDocument | None:
return self.cache_dict.get(self._get_hashcode(document, convert_engin, convert_config))
convert_config: ConverterConfig) -> MarkdownDocument | None:
return self.cache_dict.get(self._get_hashcode(document, convert_engin, convert_config.gethash()))
def cache_result(self, convert_result: MarkdownDocument, document: Document, convert_engin: str,
convert_config: x2md_convert_config_type) -> MarkdownDocument:
hash_code = self._get_hashcode(document, convert_engin, convert_config)
convert_config: ConverterConfig) -> MarkdownDocument:
hash_code = self._get_hashcode(document, convert_engin, convert_config.gethash())
if len(self.cache_dict) > int(CACHE_NUM):
self.cache_dict.popitem(last=False)
self.cache_dict[hash_code] = convert_result

View File

@@ -0,0 +1,29 @@
from abc import ABC, abstractmethod
from dataclasses import dataclass
from logging import Logger
from typing import Hashable
from docutranslate.ir.document import Document
from docutranslate.logger import global_logger
@dataclass(kw_only=True)
class ConverterConfig(ABC):
logger: Logger | None = None
@abstractmethod
def gethash(self)->Hashable:
...
class Converter(ABC):
    """Abstract base class for document converters.

    Stores the configuration it was built with and exposes a ``logger``
    taken from that configuration, falling back to the global logger.
    """

    def __init__(self, config: ConverterConfig | None = None):
        """Remember ``config`` and resolve the logger to use.

        Args:
            config: Converter configuration, or ``None`` for no configuration.
        """
        self.config = config
        # ``config`` is optional, so its logger must not be read unguarded:
        # the previous ``config.logger`` raised AttributeError for ``None``.
        configured_logger = config.logger if config is not None else None
        self.logger = configured_logger or global_logger

    @abstractmethod
    def convert(self, document: Document) -> Document:
        """Convert ``document`` and return the resulting document."""

    async def convert_async(self, document: Document) -> Document:
        """Asynchronous counterpart of ``convert``.

        Not abstract here; this base implementation does nothing and
        returns ``None`` unless a subclass overrides it.
        """
        ...

View File

@@ -1,11 +0,0 @@
from typing import Protocol
from docutranslate.ir.document import Document
class Converter(Protocol):
    """Structural interface for objects that convert one document into another."""

    def convert(self, document: Document) -> Document:
        """Convert ``document`` synchronously."""

    async def convert_async(self, document: Document) -> Document:
        """Convert ``document`` asynchronously."""

View File

@@ -1,19 +1,28 @@
from typing import Protocol
from docutranslate.converter.interfaces import Converter
from abc import abstractmethod
from dataclasses import dataclass
from docutranslate.converter.base import Converter, ConverterConfig
from docutranslate.ir.document import Document
from docutranslate.ir.markdown_document import MarkdownDocument
@dataclass(kw_only=True)
class X2MarkdownConverterConfig(ConverterConfig):
...
class X2MarkdownConverter(Converter,Protocol):
class X2MarkdownConverter(Converter):
"""
负责将其它格式的文件转换为markdown
"""
@abstractmethod
def convert(self, document: Document) -> MarkdownDocument:
...
@abstractmethod
async def convert_async(self, document: Document) -> MarkdownDocument:
...
@abstractmethod
def support_format(self)->list[str]:
...

View File

@@ -3,7 +3,6 @@ import os
import time
from dataclasses import dataclass
from io import BytesIO
from logging import Logger
from pathlib import Path
from docling.datamodel.base_models import InputFormat
@@ -14,34 +13,34 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
from docling_core.types.doc import ImageRefMode
from huggingface_hub.errors import LocalEntryNotFoundError
from docutranslate.converter.x2md.interfaces import X2MarkdownConverter
from docutranslate.converter.x2md.base import X2MarkdownConverter, X2MarkdownConverterConfig
from docutranslate.ir.document import Document
from docutranslate.ir.markdown_document import MarkdownDocument
from docutranslate.logger import global_logger
IMAGE_RESOLUTION_SCALE = 4
@dataclass(frozen=True)
class ConverterDoclingConfig:
@dataclass(kw_only=True)
class ConverterDoclingConfig(X2MarkdownConverterConfig):
code: bool = True
formula: bool = True
artifact: Path | None = None
def gethash(self):
return self.code,self.formula
class ConverterDocling(X2MarkdownConverter):
def __init__(self, config: ConverterDoclingConfig, logger: Logger = global_logger):
self.logger = logger
self.config = config
def __init__(self, config: ConverterDoclingConfig):
super().__init__(config=config)
self.code = config.code
self.formula = config.formula
artifact=Path("./docling_artifact")
artifact = Path("./docling_artifact")
if artifact.is_dir():
self.logger.info("使用./docling_artifact的本地模型")
self.artifact=artifact
self.artifact = artifact
else:
self.artifact=config.artifact
self.artifact = config.artifact
def convert(self, document) -> MarkdownDocument:
assert isinstance(document.name, str)

View File

@@ -1,10 +1,10 @@
from docutranslate.converter.x2md.interfaces import X2MarkdownConverter
from docutranslate.converter.x2md.base import X2MarkdownConverter
from docutranslate.ir.document import Document
from docutranslate.ir.markdown_document import MarkdownDocument
class ConverterIdentity(X2MarkdownConverter):
    """Pass-through converter: re-wraps the input bytes as a ``.md`` document."""

    # TODO: support markdown_zip input
    # NOTE(review): the X2MarkdownConverter base in this change set declares
    # ``convert_async`` and ``support_format`` as abstract — confirm this
    # class implements them outside the lines visible here.
    def convert(self, document: Document) -> MarkdownDocument:
        """Return a MarkdownDocument built from ``document``'s content and stem."""
        raw_bytes = document.content
        base_name = document.stem
        return MarkdownDocument.from_bytes(content=raw_bytes, suffix=".md", stem=base_name)

View File

@@ -3,10 +3,11 @@ import time
import zipfile
from dataclasses import dataclass
from logging import Logger
from typing import Hashable
import httpx
from docutranslate.converter.x2md.interfaces import X2MarkdownConverter
from docutranslate.converter.x2md.base import X2MarkdownConverter, X2MarkdownConverterConfig
from docutranslate.ir.document import Document
from docutranslate.ir.markdown_document import MarkdownDocument
from docutranslate.logger import global_logger
@@ -15,11 +16,14 @@ from docutranslate.utils.markdown_utils import embed_inline_image_from_zip
URL = 'https://mineru.net/api/v4/file-urls/batch'
@dataclass(frozen=True)
class ConverterMineruConfig:
@dataclass(kw_only=True)
class ConverterMineruConfig(X2MarkdownConverterConfig):
mineru_token: str
formula: bool = True
def gethash(self) ->Hashable:
return self.formula
timeout = httpx.Timeout(
connect=5.0, # 连接超时 (建立连接的最长时间)
@@ -34,7 +38,7 @@ client_async = httpx.AsyncClient(trust_env=False, timeout=timeout, proxy=None, v
class ConverterMineru(X2MarkdownConverter):
def __init__(self, config: ConverterMineruConfig, logger: Logger = global_logger):
self.config = config
super().__init__(config=config)
self.mineru_token = config.mineru_token.strip()
self.formula = config.formula
self.logger = logger

View File

@@ -0,0 +1,20 @@
from abc import ABC,abstractmethod
from typing import Generic,TypeVar, Any
from dataclasses import dataclass
from docutranslate.ir.document import Document
D_in = TypeVar('D_in', bound=Document)
@dataclass(kw_only=True)
class ExporterConfig:
...
class Exporter(ABC, Generic[D_in]):
    """Abstract exporter, generic over the document type it accepts."""

    def __init__(self, config: ExporterConfig | None = None):
        """Keep the (optional) exporter configuration on the instance."""
        self.config = config

    @abstractmethod
    def export(self, document: D_in) -> Any:
        """Export ``document``; the result type is chosen by the subclass."""

View File

@@ -1,8 +0,0 @@
from dataclasses import dataclass
@dataclass
class ExportConfig:
    """Field-less base dataclass for export configurations."""

View File

@@ -1,15 +0,0 @@
from typing import Protocol, TypeVar, Any, Self
from docutranslate.exporter.export_config import ExportConfig
from docutranslate.ir.document import Document
D_in = TypeVar('D_in', bound=Document)
class Exporter(Protocol[D_in]):
    """Structural interface for exporters, generic over the input document type."""

    @classmethod
    def from_config(cls, export_config: ExportConfig | None = None) -> Self:
        """Build an exporter from an optional export configuration."""

    def export(self, document: D_in) -> Any:
        """Export ``document``; the result type is chosen by the implementer."""

View File

@@ -0,0 +1,18 @@
from dataclasses import dataclass
from docutranslate.exporter.base import Exporter, ExporterConfig
from docutranslate.ir.document import Document
from docutranslate.ir.markdown_document import MarkdownDocument
@dataclass(kw_only=True)
class MDExporterConfig(ExporterConfig):
    """Keyword-only configuration base for markdown exporters; adds no fields."""
class MDExporter(Exporter[MarkdownDocument]):
    """Base class for exporters that take a MarkdownDocument as input.

    The generic ``Exporter`` base is bound to ``MarkdownDocument`` so type
    checkers see the input type, matching how ``TXTExporter`` binds
    ``Exporter[Document]``.
    """

    def __init__(self, config: MDExporterConfig | None = None):
        """Forward the optional configuration to ``Exporter.__init__``."""
        super().__init__(config=config)

    def export(self, document: MarkdownDocument) -> Document:
        """Export ``document`` to a Document.

        NOTE(review): this override is not marked abstract, so calling it on
        ``MDExporter`` itself does nothing and returns ``None``; subclasses
        are expected to override it.
        """
        ...

View File

@@ -3,20 +3,20 @@ from dataclasses import dataclass
import jinja2
import markdown2
from docutranslate.exporter.export_config import ExportConfig
from docutranslate.exporter.md2x.interfaces import MDExporter
from docutranslate.exporter.md.base import MDExporter, MDExporterConfig
from docutranslate.ir.document import Document
from docutranslate.ir.markdown_document import MarkdownDocument
from docutranslate.utils.resource_utils import resource_path
@dataclass
class MD2HTMLExportConfig(ExportConfig):
class MD2HTMLExporterConfig(MDExporterConfig):
cdn: bool = True
class MD2HTMLExporter(MDExporter):
def __init__(self, export_config: MD2HTMLExportConfig = None):
export_config = export_config or MD2HTMLExportConfig()
self.cdn=export_config.cdn
def __init__(self, config: MD2HTMLExporterConfig = None):
config = config or MD2HTMLExporterConfig()
super().__init__(config=config)
self.cdn=config.cdn
def export(self, document: MarkdownDocument) -> Document:
cdn = self.cdn

View File

@@ -0,0 +1,8 @@
from docutranslate.exporter.md.base import MDExporter
from docutranslate.ir.markdown_document import MarkdownDocument, Document
class MD2MDExporter(MDExporter):
    """Exporter that re-emits a markdown document as a ``.md`` Document."""

    def export(self, document: MarkdownDocument) -> Document:
        """Build a ``.md`` Document from the input's content and stem."""
        payload = document.content
        base_name = document.stem
        return Document.from_bytes(suffix=".md", content=payload, stem=base_name)

View File

@@ -0,0 +1,11 @@
from docutranslate.exporter.md.base import MDExporter
from docutranslate.ir.markdown_document import MarkdownDocument, Document
from docutranslate.utils.markdown_utils import unembed_base64_images_to_zip
class MD2MDZipExporter(MDExporter):
    """Exporter that packages a markdown document as a ``.zip`` Document."""

    def export(self, document: MarkdownDocument) -> Document:
        """Decode the markdown bytes, pass them through
        ``unembed_base64_images_to_zip`` and wrap the result as ``.zip``."""
        markdown_text = document.content.decode()
        archive = unembed_base64_images_to_zip(markdown_text, markdown_name=document.name)
        return Document.from_bytes(suffix=".zip", content=archive, stem=document.stem)

View File

@@ -0,0 +1,3 @@
from typing import Literal
ConvertEnginType = Literal["mineru", "docling"]

View File

@@ -1,9 +0,0 @@
from docutranslate.exporter.interfaces import Exporter
from docutranslate.ir.document import Document
from docutranslate.ir.markdown_document import MarkdownDocument
class MDExporter(Exporter):
    """Exporter interface specialised for MarkdownDocument input."""

    def export(self, document: MarkdownDocument) -> Document:
        """Export ``document`` to a Document (no behaviour defined here)."""

View File

@@ -1,18 +0,0 @@
from dataclasses import dataclass
from docutranslate.exporter.export_config import ExportConfig
from docutranslate.exporter.md2x.interfaces import MDExporter
from docutranslate.ir.markdown_document import MarkdownDocument,Document
@dataclass
class MD2MDExportConfig(ExportConfig):
    """Configuration for the markdown-to-markdown exporter; adds no fields."""
class MD2MDExporter(MDExporter):
    """Exporter that re-emits a markdown document as a ``.md`` Document."""

    def __init__(self, export_config: MD2MDExportConfig | None = None):
        """Accept an optional configuration; it is not used."""

    def export(self, document: MarkdownDocument) -> Document:
        """Build a ``.md`` Document from the input's content and stem."""
        payload = document.content
        base_name = document.stem
        return Document.from_bytes(suffix=".md", content=payload, stem=base_name)

View File

@@ -1,21 +0,0 @@
from dataclasses import dataclass
from docutranslate.exporter.export_config import ExportConfig
from docutranslate.exporter.md2x.interfaces import MDExporter
from docutranslate.ir.markdown_document import MarkdownDocument,Document
from docutranslate.utils.markdown_utils import unembed_base64_images_to_zip
@dataclass
class MD2MDZIPExportConfig(ExportConfig):
    """Configuration for the markdown-to-zip exporter; adds no fields."""
class MD2MDZipExporter(MDExporter):
    """Exporter that packages a markdown document as a ``.zip`` Document."""

    def __init__(self, export_config: MD2MDZIPExportConfig | None = None):
        """Accept an optional configuration; it is not used."""

    def export(self, document: MarkdownDocument) -> Document:
        """Decode the markdown bytes, pass them through
        ``unembed_base64_images_to_zip`` and wrap the result as ``.zip``."""
        markdown_text = document.content.decode()
        archive = unembed_base64_images_to_zip(markdown_text, markdown_name=document.name)
        return Document.from_bytes(suffix=".zip", content=archive, stem=document.stem)

View File

@@ -1,14 +0,0 @@
from typing import Literal, TYPE_CHECKING
from docutranslate.converter.x2md.converter_mineru import ConverterMineruConfig
from docutranslate.global_values.conditional_import import DOCLING_EXIST
if DOCLING_EXIST or TYPE_CHECKING:
from docutranslate.converter.x2md.converter_docling import ConverterDoclingConfig
convert_engin_type = Literal["mineru", "docling"]
if DOCLING_EXIST or TYPE_CHECKING:
x2md_convert_config_type = ConverterDoclingConfig | ConverterMineruConfig
else:
x2md_convert_config_type = ConverterMineruConfig

View File

@@ -1,8 +1,8 @@
from docutranslate.exporter.interfaces import Exporter
from docutranslate.exporter.base import Exporter
from docutranslate.ir.document import Document
#TODO:看情况是否需要为TXT单独写一个document类型
class TXTExporter(Exporter):
class TXTExporter(Exporter[Document]):
def export(self,document:Document)->Document:
...

View File

@@ -2,21 +2,22 @@ from dataclasses import dataclass
import jinja2
from docutranslate.exporter.export_config import ExportConfig
from docutranslate.exporter.txt2x.interfaces import TXTExporter
from docutranslate.exporter.base import ExporterConfig
from docutranslate.exporter.txt.base import TXTExporter
from docutranslate.ir.document import Document
from docutranslate.utils.resource_utils import resource_path
@dataclass
class TXT2HTMLExportConfig(ExportConfig):
class TXT2HTMLExporterConfig(ExporterConfig):
cdn: bool = True
class TXT2HTMLExporter(TXTExporter):
def __init__(self, export_config: TXT2HTMLExportConfig = None):
export_config = export_config or TXT2HTMLExportConfig()
self.cdn = export_config.cdn
def __init__(self, config: TXT2HTMLExporterConfig = None):
config = config or TXT2HTMLExporterConfig()
super().__init__(config=config)
self.cdn = config.cdn
def export(self, document: Document) -> Document:
cdn = self.cdn

View File

@@ -1,10 +1,7 @@
from docutranslate.exporter.txt2x.interfaces import TXTExporter
from docutranslate.exporter.txt.base import TXTExporter
from docutranslate.ir.document import Document
class TXT2TXTExporter(TXTExporter):
    """Exporter whose output is a copy of the input text document."""

    def export(self, document: Document) -> Document:
        """Return ``document.copy()``."""
        duplicate = document.copy()
        return duplicate

View File

@@ -1,16 +0,0 @@
from dataclasses import dataclass
from logging import Logger
@dataclass
class AiTranslateConfig:
base_url: str
api_key: str
model_id: str
to_lang: str
custom_prompt: str | None = None
temperature: float = 0.7
timeout: int = 2000
chunk_size: int = 3000
concurrent: int = 30
logger: Logger | None = None

View File

@@ -1,20 +0,0 @@
from typing import Protocol, TypeVar
from docutranslate.agents import Agent
from docutranslate.ir.document import Document
T=TypeVar('T',bound=Document)
V=TypeVar('V',bound=Agent)
class Translator(Protocol[T, V]):
    """
    Translates the intermediate text in place; a Translator performs no format conversion.
    """

    def translate(self, document: T) -> Document:
        """Translate ``document`` synchronously."""

    async def translate_async(self, document: T) -> Document:
        """Translate ``document`` asynchronously."""

    def log(self, info: str):
        """Log the message ``info``."""

View File

@@ -0,0 +1,35 @@
from abc import abstractmethod
from dataclasses import dataclass
from logging import Logger
from typing import TypeVar
from docutranslate.ir.document import Document
from docutranslate.translator.base import Translator, TranslatorConfig
@dataclass(kw_only=True)
class AiTranslatorConfig(TranslatorConfig):
    """Keyword-only settings for AI-backed translators."""

    # Required fields (no defaults).
    base_url: str
    api_key: str
    model_id: str
    to_lang: str

    # Optional fields with defaults.
    custom_prompt: str | None = None
    temperature: float = 0.7
    timeout: int = 2000
    chunk_size: int = 3000
    concurrent: int = 30
T=TypeVar('T',bound=Document)
class AiTranslator(Translator[T]):
    """
    Translates the intermediate text in place; a Translator performs no format conversion.
    """

    def __init__(self, config: AiTranslatorConfig, logger: Logger | None = None):
        """Initialise the translator.

        Args:
            config: AI translator configuration.
            logger: Optional logger; when given it replaces the logger the
                base class resolved from ``config``.
        """
        # ``Translator.__init__`` only accepts ``config``; forwarding
        # ``logger=`` to it raised TypeError, so the override is applied here.
        super().__init__(config=config)
        if logger is not None:
            self.logger = logger

    @abstractmethod
    def translate(self, document: T) -> Document:
        """Translate ``document`` synchronously."""

    @abstractmethod
    async def translate_async(self, document: T) -> Document:
        """Translate ``document`` asynchronously."""

View File

@@ -5,22 +5,21 @@ from typing import Self
from docutranslate.agents import MDTranslateAgent
from docutranslate.context.md_mask_context import MDMaskUrisContext
from docutranslate.ir.markdown_document import MarkdownDocument
from docutranslate.logger import global_logger
from docutranslate.translater.base import AiTranslateConfig
from docutranslate.translater.interfaces import Translator
from docutranslate.translator.ai_translator.base import AiTranslatorConfig
from docutranslate.translator.base import Translator
from docutranslate.utils.markdown_splitter import split_markdown_text, join_markdown_texts
from docutranslate.utils.markdown_utils import clean_markdown_math_block
@dataclass
class MDTranslateConfig(AiTranslateConfig):
class MDTranslatorConfig(AiTranslatorConfig):
...
class MDTranslator(Translator):
def __init__(self, config: MDTranslateConfig):
self.logger = config.logger or global_logger
def __init__(self, config: MDTranslatorConfig):
super().__init__(config=config)
self.chunk_size = config.chunk_size
self.translate_agent = MDTranslateAgent(custom_prompt=config.custom_prompt,
to_lang=config.to_lang,

View File

@@ -3,20 +3,19 @@ from typing import Self
from docutranslate.agents.txt_agent import TXTTranslateAgent
from docutranslate.ir.document import Document
from docutranslate.logger import global_logger
from docutranslate.translater.base import AiTranslateConfig
from docutranslate.translater.interfaces import Translator
from docutranslate.translator.ai_translator.base import AiTranslatorConfig
from docutranslate.translator.base import Translator
from docutranslate.utils.markdown_splitter import split_markdown_text
@dataclass
class TXTTranslateConfig(AiTranslateConfig):
class TXTTranslatorConfig(AiTranslatorConfig):
...
class TXTTranslator(Translator):
def __init__(self, config: TXTTranslateConfig):
self.logger = config.logger or global_logger
def __init__(self, config: TXTTranslatorConfig):
super().__init__(config=config)
self.chunk_size = config.chunk_size
self.translate_agent = TXTTranslateAgent(custom_prompt=config.custom_prompt,
to_lang=config.to_lang,

View File

@@ -0,0 +1,27 @@
from dataclasses import dataclass
from logging import Logger
from typing import TypeVar,Generic
from abc import ABC,abstractmethod
from docutranslate.ir.document import Document
from docutranslate.logger import global_logger
@dataclass(kw_only=True)
class TranslatorConfig:
logger:Logger|None=None
T=TypeVar('T',bound=Document)
class Translator(ABC, Generic[T]):
    """
    Translates the intermediate text in place; a Translator performs no format conversion.
    """

    def __init__(self, config: TranslatorConfig | None = None):
        """Remember ``config`` and resolve the logger to use.

        Args:
            config: Translator configuration, or ``None`` for no configuration.
        """
        self.config = config
        # ``config`` is optional, so its logger must not be read unguarded:
        # the previous ``config.logger`` raised AttributeError for ``None``.
        configured_logger = config.logger if config is not None else None
        self.logger = configured_logger or global_logger

    @abstractmethod
    def translate(self, document: T) -> Document:
        """Translate ``document`` synchronously."""

    @abstractmethod
    async def translate_async(self, document: T) -> Document:
        """Translate ``document`` asynchronously."""

View File

@@ -1,19 +1,27 @@
from abc import ABC, abstractmethod
from dataclasses import dataclass
from logging import Logger
from pathlib import Path
from typing import Self, Generic, TypeVar
from docutranslate.exporter.interfaces import Exporter
from docutranslate.exporter.base import Exporter
from docutranslate.ir.document import Document
from docutranslate.logger import global_logger
@dataclass(kw_only=True)
class WorkflowConfig:
logger: Logger | None = None
T_original = TypeVar('T_original', bound=Document)
T_Translated = TypeVar('T_Translated', bound=Document)
class BaseWorkflow(ABC, Generic[T_Translated]):
class Workflow(ABC, Generic[T_original, T_Translated]):
def __init__(self, logger: Logger = global_logger):
self.logger = logger
self.document_original: Document | None = None
self.document_original: T_original | None = None
self.document_translated: T_Translated | None = None
def read_path(self, path: Path | str) -> Self:

View File

@@ -1,9 +1,9 @@
from pathlib import Path
from typing import Protocol, Self, TypeVar, runtime_checkable
from docutranslate.exporter.export_config import ExportConfig
from docutranslate.exporter.export_config import ExporterConfig
T = TypeVar("T", bound=ExportConfig)
T = TypeVar("T", bound=ExporterConfig)
@runtime_checkable
class HTMLExportable(Protocol[T]):

View File

@@ -1,40 +1,84 @@
import asyncio
from dataclasses import dataclass
from logging import Logger
from pathlib import Path
from typing import Self, Literal, overload, TYPE_CHECKING
from typing import Self, Tuple, Any
from docutranslate.cacher import md_based_convert_cacher
from docutranslate.global_values.conditional_import import DOCLING_EXIST
if DOCLING_EXIST or TYPE_CHECKING:
if DOCLING_EXIST:
from docutranslate.converter.x2md.converter_docling import ConverterDoclingConfig, ConverterDocling
from docutranslate.converter.x2md.converter_identity import ConverterIdentity
from docutranslate.converter.x2md.converter_mineru import ConverterMineruConfig, ConverterMineru
from docutranslate.converter.x2md.interfaces import X2MarkdownConverter
from docutranslate.exporter.md2x.md2html_exporter import MD2HTMLExportConfig, MD2HTMLExporter
from docutranslate.exporter.md2x.md2md_exporter import MD2MDExportConfig, MD2MDExporter
from docutranslate.exporter.md2x.md2mdzip_exporter import MD2MDZIPExportConfig, MD2MDZipExporter
from docutranslate.exporter.md2x.types import x2md_convert_config_type, convert_engin_type
from docutranslate.workflow.base_workflow import BaseWorkflow
from docutranslate.converter.x2md.base import X2MarkdownConverterConfig
from docutranslate.exporter.md.md2html_exporter import MD2HTMLExporterConfig, MD2HTMLExporter
from docutranslate.exporter.md.md2md_exporter import MD2MDExporter
from docutranslate.exporter.md.md2mdzip_exporter import MD2MDZipExporter
from docutranslate.exporter.md.types import ConvertEnginType
from docutranslate.workflow.base import Workflow, WorkflowConfig
from docutranslate.workflow.interfaces import MDFormatsExportable, HTMLExportable
from docutranslate.translater.md_translator import MDTranslateConfig, MDTranslator
from docutranslate.translator.ai_translator.md_translator import MDTranslatorConfig, MDTranslator
class MarkdownBasedWorkflow(BaseWorkflow, HTMLExportable, MDFormatsExportable):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
@dataclass(kw_only=True)
class MarkdownBasedWorkflowConfig(WorkflowConfig):
# X2MarkdownConverterConfig
convert_engine: ConvertEnginType | None
formula: bool = True
# ConverterDoclingConfig
code: bool = True
artifact: Path | None = None
# ConverterMineruConfig
mineru_token: str
# MDTranslatorConfig
base_url: str
api_key: str
model_id: str
to_lang: str
custom_prompt: str | None = None
temperature: float = 0.7
timeout: int = 2000
chunk_size: int = 3000
concurrent: int = 30
# MD2HTMLExporterConfig
cdn: bool = True
# general
logger: Logger | None = None
if DOCLING_EXIST or TYPE_CHECKING:
self._converter_factory: dict[str:tuple[X2MarkdownConverter, x2md_convert_config_type]] = {
"mineru": (ConverterMineru, ConverterMineruConfig),
"docling": (ConverterDocling, ConverterDoclingConfig)
}
else:
self._converter_factory: dict[str:tuple[X2MarkdownConverter, x2md_convert_config_type]] = {
class MarkdownBasedWorkflow(Workflow, HTMLExportable, MDFormatsExportable):
def __init__(self, config: MarkdownBasedWorkflowConfig):
super().__init__(config=config)
self._converter_factory: dict[ConvertEnginType, Tuple[Any, Any]] = {
"mineru": (ConverterMineru, ConverterMineruConfig),
}
if DOCLING_EXIST:
self._converter_factory["docling"] = (ConverterDocling, ConverterDoclingConfig)
self.x2markdown_converter_config:X2MarkdownConverterConfig|None
if config.convert_engine is None:
self.converter_config=None
elif config.convert_engine== "mineru":
self.converter_config = ConverterMineruConfig(formula=config.formula,
mineru_token=config.mineru_token)
elif DOCLING_EXIST and config.convert_engine== "docling":
self.converter_config = ConverterDoclingConfig(code=config.code,
formula=config.formula,
artifact=config.artifact)
self.translator_config = MDTranslatorConfig(base_url=config.base_url,
api_key=config.api_key,
model_id=config.model_id,
to_lang=config.to_lang,
custom_prompt=config.custom_prompt,
temperature=config.temperature,
timeout=config.timeout,
chunk_size=config.chunk_size,
concurrent=config.concurrent,
)
self.md2html_exporter_config = MD2HTMLExporterConfig(cdn=config.cdn)
self.convert_engine=config.convert_engine
def _get_document_md(self, convert_engin: convert_engin_type | None,
convert_config: x2md_convert_config_type | None):
def _get_document_md(self,convert_engin:ConvertEnginType|None,convert_config:X2MarkdownConverterConfig):
if self.document_original is None:
raise RuntimeError("file has not been read yet. Call read_path or read_bytes first.")
# 获取缓存的解析后文件
@@ -51,7 +95,7 @@ class MarkdownBasedWorkflow(BaseWorkflow, HTMLExportable, MDFormatsExportable):
if not isinstance(convert_config, config_class):
raise TypeError(
f"未传入正确的convert_config应传入{config_class.__name__}类型,现为{type(convert_config).__name__}类型")
converter = converter_class(convert_config, logger=self.logger)
converter = converter_class(convert_config)
else:
raise ValueError(f"不存在{convert_engin}解析引擎")
document_md = converter.convert(self.document_original)
@@ -59,67 +103,54 @@ class MarkdownBasedWorkflow(BaseWorkflow, HTMLExportable, MDFormatsExportable):
md_based_convert_cacher.cache_result(document_md, self.document_original, convert_engin, convert_config)
return document_md
@overload
def translate(self, convert_engin: None,
convert_config: x2md_convert_config_type | None, translate_config: MDTranslateConfig) -> Self:
...
@overload
def translate(self, convert_engin: Literal["docling"],
convert_config: "ConverterDoclingConfig", translate_config: MDTranslateConfig) -> Self:
...
@overload
def translate(self, convert_engin: Literal["mineru"],
convert_config: ConverterMineruConfig, translate_config: MDTranslateConfig) -> Self:
...
def translate(self, convert_engin: convert_engin_type | None,
convert_config: x2md_convert_config_type | None,
translate_config: MDTranslateConfig) -> Self:
document_md = self._get_document_md(convert_engin, convert_config)
def translate(self) -> Self:
convert_engin,convert_config=self.convert_engine,self.converter_config
translator_config=self.translator_config
document_md = self._get_document_md(convert_engin,convert_config)
# 翻译解析后文件
translator = MDTranslator(translate_config)
translator = MDTranslator(translator_config)
translator.translate(document_md)
self.document_translated = document_md
return self
# Asynchronous counterpart of translate(): obtain the Markdown document, await
# MDTranslator.translate_async on it, store it on self.document_translated, return self.
# NOTE(review): two signatures appear here - a three-argument one and a zero-argument
# one that reads its settings from self. This looks like old and new diff lines
# flattened together; confirm which one is live.
async def translate_async(self, convert_engin: Literal["mineru", "docling"] | None,
convert_config: x2md_convert_config_type | None,
translate_config: MDTranslateConfig) -> Self:
async def translate_async(self) -> Self:
convert_engin,convert_config=self.convert_engine,self.converter_config
translator_config=self.translator_config
# _get_document_md is a plain (non-async) method; asyncio.to_thread runs it in a worker
# thread so the event loop is not blocked while it executes.
document_md = await asyncio.to_thread(self._get_document_md, convert_engin, convert_config)
# Translate the parsed (converted) document.
translator = MDTranslator(translate_config)
translator = MDTranslator(translator_config)
# The awaited result is discarded and document_md is stored below, so the translator
# presumably mutates document_md in place - TODO confirm.
await translator.translate_async(document_md)
self.document_translated = document_md
return self
# Export through MD2HTMLExporter and return the result as a str.
# NOTE(review): two `def` lines appear back to back and differ only in the annotation
# (MD2HTMLExportConfig vs MD2HTMLExporterConfig) - looks like old/new diff lines; confirm
# which type name exists in the current code base.
def export_to_html(self, export_config: MD2HTMLExportConfig | None = None) -> str:
def export_to_html(self, export_config: MD2HTMLExporterConfig | None = None) -> str:
# Fall back to the workflow-level self.md2html_exporter_config when the caller passes
# nothing (or any falsy value).
export_config=export_config or self.md2html_exporter_config
docu = self._export(MD2HTMLExporter(export_config))
# decode() is called without arguments, i.e. with the default codec (UTF-8);
# assumes docu.content is bytes - TODO confirm.
return docu.content.decode()
# Export through MD2MDExporter and return the result as a str.
# NOTE(review): two `def` lines appear back to back (old/new diff lines, presumably).
# The second annotates export_config as X2MarkdownConverterConfig, which by its name is a
# converter config rather than an exporter config - possibly a rename slip; confirm.
def export_to_markdown(self, export_config: MD2MDExportConfig | None = None) -> str:
def export_to_markdown(self, export_config: X2MarkdownConverterConfig | None = None) -> str:
# export_config is not used in the visible body: the exporter is built without arguments.
docu = self._export(MD2MDExporter())
# decode() is called without arguments, i.e. with the default codec (UTF-8);
# assumes docu.content is bytes - TODO confirm.
return docu.content.decode()
# Export through MD2MDZipExporter and return the exporter's content as-is
# (annotated as bytes; no decoding is applied).
# NOTE(review): two `def` lines appear back to back (old/new diff lines, presumably).
# The second annotates export_config as X2MarkdownConverterConfig, which by its name is a
# converter config rather than an exporter config - possibly a rename slip; confirm.
def export_to_markdown_zip(self, export_config: MD2MDZIPExportConfig | None = None) -> bytes:
def export_to_markdown_zip(self, export_config: X2MarkdownConverterConfig | None = None) -> bytes:
# export_config is not used in the visible body: the exporter is built without arguments.
docu = self._export(MD2MDZipExporter())
return docu.content
# Save via self._save using an MD2HTMLExporter, then return self.
# name and output_dir are forwarded to self._save unchanged; output_dir defaults to "./output".
# NOTE(review): the signature tail and the _save call each appear twice (old/new diff
# lines, presumably). One _save call builds MD2HTMLExporter() with no config, which
# ignores export_config; the other resolves export_config (falling back to
# self.md2html_exporter_config) and passes it as config=. Confirm the second is the live one.
def save_as_html(self, name: str = None, output_dir: Path | str = "./output",
export_config: MD2HTMLExportConfig | None = None) -> Self:
self._save(exporter=MD2HTMLExporter(), name=name, output_dir=output_dir)
export_config: MD2HTMLExporterConfig | None = None) -> Self:
export_config = export_config or self.md2html_exporter_config
self._save(exporter=MD2HTMLExporter(config=export_config), name=name, output_dir=output_dir)
return self
# Save via self._save using an MD2MDExporter, then return self.
# name and output_dir are forwarded to self._save unchanged; output_dir defaults to "./output".
# NOTE(review): the signature tail appears twice - once with an MD2MDExportConfig
# annotation and once as an unannotated `export_config=None` (old/new diff lines,
# presumably). In both cases export_config is not used in the visible body.
def save_as_markdown(self, name: str = None, output_dir: Path | str = "./output",
export_config: MD2MDExportConfig | None = None) -> Self:
export_config=None) -> Self:
self._save(exporter=MD2MDExporter(), name=name, output_dir=output_dir)
return self
# Save via self._save using an MD2MDZipExporter, then return self.
# name and output_dir are forwarded to self._save unchanged; output_dir defaults to "./output".
# NOTE(review): the signature tail appears twice - once with an MD2MDZIPExportConfig
# annotation and once as an unannotated `export_config=None` (old/new diff lines,
# presumably). In both cases export_config is not used in the visible body.
def save_as_markdown_zip(self, name: str = None, output_dir: Path | str = "./output",
export_config: MD2MDZIPExportConfig | None = None) -> Self:
export_config=None) -> Self:
self._save(exporter=MD2MDZipExporter(), name=name, output_dir=output_dir)
return self

View File

@@ -1,17 +1,17 @@
from pathlib import Path
from typing import Self
from docutranslate.exporter.txt2x.txt2html_exporter import TXT2HTMLExportConfig, TXT2HTMLExporter
from docutranslate.exporter.txt2x.txt2txt_exporter import TXT2TXTExporter
from docutranslate.workflow.base_workflow import BaseWorkflow
from docutranslate.exporter.txt.txt2html_exporter import TXT2HTMLExporterConfig, TXT2HTMLExporter
from docutranslate.exporter.txt.txt2txt_exporter import TXT2TXTExporter
from docutranslate.workflow.base import Workflow
from docutranslate.workflow.interfaces import HTMLExportable, TXTExportable
from docutranslate.translater.txt_translator import TXTTranslateConfig, TXTTranslator
from docutranslate.translator.ai_translator.txt_translator import TXTTranslatorConfig, TXTTranslator
class TXTWorkflow(BaseWorkflow, HTMLExportable, TXTExportable):
class TXTWorkflow(Workflow, HTMLExportable, TXTExportable):
def translate(self, translate_config: TXTTranslateConfig) -> Self:
def translate(self, translate_config: TXTTranslatorConfig) -> Self:
document = self.document_original.copy()
# 翻译解析后文件
translator = TXTTranslator(translate_config)
@@ -19,7 +19,7 @@ class TXTWorkflow(BaseWorkflow, HTMLExportable, TXTExportable):
self.document_translated = document
return self
async def translate_async(self, translate_config: TXTTranslateConfig) -> Self:
async def translate_async(self, translate_config: TXTTranslatorConfig) -> Self:
document = self.document_original.copy()
# 翻译解析后文件
translator = TXTTranslator(translate_config)
@@ -27,7 +27,7 @@ class TXTWorkflow(BaseWorkflow, HTMLExportable, TXTExportable):
self.document_translated = document
return self
# Export through TXT2HTMLExporter and return the result as a str.
# NOTE(review): two `def` lines appear back to back and differ only in the annotation
# (TXT2HTMLExportConfig vs TXT2HTMLExporterConfig) - looks like old/new diff lines.
# Both default export_config to None although the annotation does not include None.
def export_to_html(self, export_config: TXT2HTMLExportConfig=None) -> str:
def export_to_html(self, export_config: TXT2HTMLExporterConfig=None) -> str:
# export_config is handed to the exporter unchanged, so it may be None here.
docu = self._export(TXT2HTMLExporter(export_config))
# decode() is called without arguments, i.e. with the default codec (UTF-8);
# assumes docu.content is bytes - TODO confirm.
return docu.content.decode()
@@ -36,7 +36,7 @@ class TXTWorkflow(BaseWorkflow, HTMLExportable, TXTExportable):
return docu.content.decode()
# Save via self._save using a TXT2HTMLExporter, then return self.
# name and output_dir are forwarded to self._save unchanged; output_dir defaults to "./output".
# NOTE(review): the signature tail appears twice and differs only in the annotation
# (TXT2HTMLExportConfig vs TXT2HTMLExporterConfig) - looks like old/new diff lines.
def save_as_html(self, name: str = None, output_dir: Path | str = "./output",
export_config: TXT2HTMLExportConfig | None = None) -> Self:
export_config: TXT2HTMLExporterConfig | None = None) -> Self:
# export_config is handed to the exporter unchanged, so it may be None here.
self._save(exporter=TXT2HTMLExporter(export_config), name=name, output_dir=output_dir)
return self