重构workflow
This commit is contained in:
17
README.md
17
README.md
@@ -43,15 +43,14 @@
|
|||||||
2. `pip install -e .`
|
2. `pip install -e .`
|
||||||
3. `uv pip install -e .`#使用uv
|
3. `uv pip install -e .`#使用uv
|
||||||
|
|
||||||
# 支持的文件格式
|
# 翻译工作流
|
||||||
|
|
||||||
| 输入格式 | 输出格式 |
|
| 工作流 | 代码 | 输入格式 | 输出格式 |
|
||||||
|----------------|--------------|
|
|-------------------------|------------------|----------------------------------------|----------------------|
|
||||||
| PDF | Markdown(推荐) |
|
| `MarkdownBasedWorkflow` | `markdown_based` | `.pdf` `.md` `.png` `.jpeg` `.docx`等 | `.md` `.html` `.pdf` |
|
||||||
| Markdown | HTML |
|
| `TXTWorkflow` | `txt` | `.txt` | `.txt` `.html` `.pdf` |
|
||||||
| HTML、XHTML | PDF(仅交互界面支持) |
|
|
||||||
| CSV | |
|
> 所有.pdf的输出只能通过交互式界面获取
|
||||||
| DOC、DOCX(部分支持) | |
|
|
||||||
|
|
||||||
> 如果想不使用交互界面获取pdf,可以先下载HTML文件,用浏览器打开并打印
|
> 如果想不使用交互界面获取pdf,可以先下载HTML文件,用浏览器打开并打印
|
||||||
|
|
||||||
@@ -143,7 +142,7 @@ docutranslate -i -p 8011
|
|||||||
## 翻译文件
|
## 翻译文件
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from docutranslate.translater import FileTranslater
|
from docutranslate.translator import FileTranslater
|
||||||
|
|
||||||
translater = FileTranslater(base_url="<baseurl>", # 大模型的baseurl
|
translater = FileTranslater(base_url="<baseurl>", # 大模型的baseurl
|
||||||
key="<api-key>", # 大模型的api-key
|
key="<api-key>", # 大模型的api-key
|
||||||
|
|||||||
@@ -22,7 +22,7 @@ from pydantic import BaseModel, Field
|
|||||||
|
|
||||||
from docutranslate.global_values.conditional_import import DOCLING_EXIST
|
from docutranslate.global_values.conditional_import import DOCLING_EXIST
|
||||||
# --- 核心代码重构后的新 Imports ---
|
# --- 核心代码重构后的新 Imports ---
|
||||||
from docutranslate.workflow.base_workflow import BaseWorkflow
|
from docutranslate.workflow.base import Workflow
|
||||||
from docutranslate.workflow.interfaces import HTMLExportable, MDFormatsExportable, TXTExportable
|
from docutranslate.workflow.interfaces import HTMLExportable, MDFormatsExportable, TXTExportable
|
||||||
from docutranslate.workflow.md_based_workflow import MarkdownBasedWorkflow
|
from docutranslate.workflow.md_based_workflow import MarkdownBasedWorkflow
|
||||||
from docutranslate.workflow.txt_workflow import TXTWorkflow
|
from docutranslate.workflow.txt_workflow import TXTWorkflow
|
||||||
@@ -30,16 +30,16 @@ from docutranslate.workflow.txt_workflow import TXTWorkflow
|
|||||||
if DOCLING_EXIST or TYPE_CHECKING:
|
if DOCLING_EXIST or TYPE_CHECKING:
|
||||||
from docutranslate.converter.x2md.converter_docling import ConverterDoclingConfig
|
from docutranslate.converter.x2md.converter_docling import ConverterDoclingConfig
|
||||||
from docutranslate.converter.x2md.converter_mineru import ConverterMineruConfig
|
from docutranslate.converter.x2md.converter_mineru import ConverterMineruConfig
|
||||||
from docutranslate.exporter.md2x.md2html_exporter import MD2HTMLExportConfig
|
from docutranslate.exporter.md.md2html_exporter import MD2HTMLExporterConfig
|
||||||
from docutranslate.exporter.txt2x.txt2html_exporter import TXT2HTMLExportConfig
|
from docutranslate.exporter.txt.txt2html_exporter import TXT2HTMLExporterConfig
|
||||||
from docutranslate.translater.base import AiTranslateConfig
|
from docutranslate.translator.base import AiTranslateConfig
|
||||||
from docutranslate.translater.md_translator import MDTranslateConfig
|
from docutranslate.translator.ai_translator.md_translator import MDTranslatorConfig
|
||||||
from docutranslate.translater.txt_translator import TXTTranslateConfig
|
from docutranslate.translator.ai_translator.txt_translator import TXTTranslatorConfig
|
||||||
# ------------------------------------
|
# ------------------------------------
|
||||||
|
|
||||||
from docutranslate import __version__
|
from docutranslate import __version__
|
||||||
from docutranslate.logger import global_logger
|
from docutranslate.logger import global_logger
|
||||||
from docutranslate.translater import default_params
|
from docutranslate.translator import default_params
|
||||||
from docutranslate.utils.resource_utils import resource_path
|
from docutranslate.utils.resource_utils import resource_path
|
||||||
|
|
||||||
# --- 全局配置 (MODIFIED) ---
|
# --- 全局配置 (MODIFIED) ---
|
||||||
@@ -50,7 +50,7 @@ MAX_LOG_HISTORY = 200
|
|||||||
httpx_client: httpx.AsyncClient
|
httpx_client: httpx.AsyncClient
|
||||||
|
|
||||||
# --- [NEW] Workflow字典 ---
|
# --- [NEW] Workflow字典 ---
|
||||||
WORKFLOW_DICT: Dict[str, type[BaseWorkflow]] = {
|
WORKFLOW_DICT: Dict[str, type[Workflow]] = {
|
||||||
"markdown_based": MarkdownBasedWorkflow,
|
"markdown_based": MarkdownBasedWorkflow,
|
||||||
"txt": TXTWorkflow,
|
"txt": TXTWorkflow,
|
||||||
}
|
}
|
||||||
@@ -70,7 +70,7 @@ def _create_default_task_state() -> Dict[str, Any]:
|
|||||||
|
|
||||||
|
|
||||||
# --- [KEPT FOR TEMP ENDPOINT] Workflow 工厂函数 (旧逻辑,仅为临时接口保留) ---
|
# --- [KEPT FOR TEMP ENDPOINT] Workflow 工厂函数 (旧逻辑,仅为临时接口保留) ---
|
||||||
def _get_workflow_for_file(filename: str, logger: logging.Logger) -> BaseWorkflow:
|
def _get_workflow_for_file(filename: str, logger: logging.Logger) -> Workflow:
|
||||||
"""根据文件名后缀选择并返回合适的 Workflow 实例。这是扩展点。"""
|
"""根据文件名后缀选择并返回合适的 Workflow 实例。这是扩展点。"""
|
||||||
suffix = Path(filename).suffix.lower()
|
suffix = Path(filename).suffix.lower()
|
||||||
if suffix == '.txt':
|
if suffix == '.txt':
|
||||||
@@ -299,7 +299,7 @@ async def _perform_translation(
|
|||||||
# 4. 根据 payload 的具体类型执行不同的翻译流程 (类型安全!)
|
# 4. 根据 payload 的具体类型执行不同的翻译流程 (类型安全!)
|
||||||
if isinstance(payload, MarkdownWorkflowParams) and isinstance(workflow, MarkdownBasedWorkflow):
|
if isinstance(payload, MarkdownWorkflowParams) and isinstance(workflow, MarkdownBasedWorkflow):
|
||||||
task_logger.info("执行 MarkdownBased 翻译流程。")
|
task_logger.info("执行 MarkdownBased 翻译流程。")
|
||||||
translate_config = MDTranslateConfig(**ai_config.__dict__)
|
translate_config = MDTranslatorConfig(**ai_config.__dict__)
|
||||||
|
|
||||||
convert_config = None
|
convert_config = None
|
||||||
if payload.convert_engin == 'mineru':
|
if payload.convert_engin == 'mineru':
|
||||||
@@ -323,7 +323,7 @@ async def _perform_translation(
|
|||||||
|
|
||||||
elif isinstance(payload, TextWorkflowParams) and isinstance(workflow, TXTWorkflow):
|
elif isinstance(payload, TextWorkflowParams) and isinstance(workflow, TXTWorkflow):
|
||||||
task_logger.info("执行 TXT 翻译流程。")
|
task_logger.info("执行 TXT 翻译流程。")
|
||||||
translate_config = TXTTranslateConfig(**ai_config.__dict__)
|
translate_config = TXTTranslatorConfig(**ai_config.__dict__)
|
||||||
await workflow.translate_async(translate_config=translate_config)
|
await workflow.translate_async(translate_config=translate_config)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
@@ -750,7 +750,7 @@ async def _get_content_from_workflow(task_id: str, file_type: FileType) -> tuple
|
|||||||
if not task_state.get("download_ready") or not task_state.get("workflow_instance"):
|
if not task_state.get("download_ready") or not task_state.get("workflow_instance"):
|
||||||
raise HTTPException(status_code=404, detail="内容尚未准备好。")
|
raise HTTPException(status_code=404, detail="内容尚未准备好。")
|
||||||
|
|
||||||
workflow: BaseWorkflow = task_state["workflow_instance"]
|
workflow: Workflow = task_state["workflow_instance"]
|
||||||
filename_stem = task_state['original_filename_stem']
|
filename_stem = task_state['original_filename_stem']
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -759,8 +759,8 @@ async def _get_content_from_workflow(task_id: str, file_type: FileType) -> tuple
|
|||||||
filename: str
|
filename: str
|
||||||
|
|
||||||
if file_type == 'html' and isinstance(workflow, HTMLExportable):
|
if file_type == 'html' and isinstance(workflow, HTMLExportable):
|
||||||
config = MD2HTMLExportConfig(cdn=True) if isinstance(workflow,
|
config = MD2HTMLExporterConfig(cdn=True) if isinstance(workflow,
|
||||||
MarkdownBasedWorkflow) else TXT2HTMLExportConfig(
|
MarkdownBasedWorkflow) else TXT2HTMLExporterConfig(
|
||||||
cdn=True)
|
cdn=True)
|
||||||
try:
|
try:
|
||||||
await httpx_client.head("https://s4.zstatic.net/ajax/libs/KaTeX/0.16.9/contrib/auto-render.min.js",
|
await httpx_client.head("https://s4.zstatic.net/ajax/libs/KaTeX/0.16.9/contrib/auto-render.min.js",
|
||||||
@@ -1073,14 +1073,14 @@ async def temp_translate(
|
|||||||
workflow.read_bytes(decoded_content, Path(file_name).stem, Path(file_name).suffix)
|
workflow.read_bytes(decoded_content, Path(file_name).stem, Path(file_name).suffix)
|
||||||
|
|
||||||
if isinstance(workflow, MarkdownBasedWorkflow):
|
if isinstance(workflow, MarkdownBasedWorkflow):
|
||||||
translate_config = MDTranslateConfig(**ai_config.__dict__)
|
translate_config = MDTranslatorConfig(**ai_config.__dict__)
|
||||||
convert_config = ConverterMineruConfig(mineru_token=mineru_token) if mineru_token else None
|
convert_config = ConverterMineruConfig(mineru_token=mineru_token) if mineru_token else None
|
||||||
convert_engin = 'mineru' if mineru_token else None
|
convert_engin = 'mineru' if mineru_token else None
|
||||||
await workflow.translate_async(convert_engin, convert_config, translate_config)
|
await workflow.translate_async(convert_engin, convert_config, translate_config)
|
||||||
return {"success": True, "content": workflow.export_to_markdown()}
|
return {"success": True, "content": workflow.export_to_markdown()}
|
||||||
|
|
||||||
elif isinstance(workflow, TXTWorkflow):
|
elif isinstance(workflow, TXTWorkflow):
|
||||||
translate_config = TXTTranslateConfig(**ai_config.__dict__)
|
translate_config = TXTTranslatorConfig(**ai_config.__dict__)
|
||||||
await workflow.translate_async(translate_config)
|
await workflow.translate_async(translate_config)
|
||||||
return {"success": True, "content": workflow.export_to_txt()}
|
return {"success": True, "content": workflow.export_to_txt()}
|
||||||
|
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
import os
|
import os
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
|
|
||||||
from docutranslate.exporter.md2x.types import x2md_convert_config_type
|
from docutranslate.converter.base import ConverterConfig
|
||||||
from docutranslate.ir.document import Document
|
from docutranslate.ir.document import Document
|
||||||
from docutranslate.ir.markdown_document import MarkdownDocument
|
from docutranslate.ir.markdown_document import MarkdownDocument
|
||||||
|
|
||||||
@@ -13,17 +13,17 @@ class MDBasedCovertCacher:
|
|||||||
self.cache_dict = OrderedDict()
|
self.cache_dict = OrderedDict()
|
||||||
|
|
||||||
@staticmethod
def _get_hashcode(document: Document, convert_engin: str, convert_config: ConverterConfig) -> str:
    """Build the cache key for one conversion request.

    The key is the string form of the hash of
    ``(document.suffix, document.content, convert_engin, <config key>)``.

    Args:
        document: Source document; its ``suffix`` and ``content`` are hashed.
        convert_engin: Name of the conversion engine.
        convert_config: Converter config. Its ``gethash()`` value is used as
            the config part of the key. ``None`` and values that are already
            a plain hashable (no ``gethash`` method) are used as-is.

    Returns:
        The hash of the key tuple, rendered as a string.
    """
    # Reduce the config to a hashable key. Guard the call so that a missing
    # config (None) or an already-reduced hash value does not raise
    # AttributeError.
    gethash = getattr(convert_config, "gethash", None)
    config_key = gethash() if callable(gethash) else convert_config
    obj = (document.suffix, document.content, convert_engin, config_key)
    return str(hash(obj))
|
||||||
|
|
||||||
def get_cached_result(self, document: Document, convert_engin: str,
                      convert_config: ConverterConfig) -> MarkdownDocument | None:
    """Look up a previously cached conversion result.

    Args:
        document: Source document of the conversion request.
        convert_engin: Name of the conversion engine.
        convert_config: Converter config of the request.

    Returns:
        The cached ``MarkdownDocument`` for this request, or ``None`` when
        nothing is cached under its key.
    """
    # _get_hashcode() is responsible for reducing the config to a hashable
    # key, so it must receive the config object itself, not a pre-computed
    # hash of it.
    cache_key = self._get_hashcode(document, convert_engin, convert_config)
    return self.cache_dict.get(cache_key)
|
||||||
|
|
||||||
def cache_result(self, convert_result: MarkdownDocument, document: Document, convert_engin: str,
|
def cache_result(self, convert_result: MarkdownDocument, document: Document, convert_engin: str,
|
||||||
convert_config: x2md_convert_config_type) -> MarkdownDocument:
|
convert_config: ConverterConfig) -> MarkdownDocument:
|
||||||
hash_code = self._get_hashcode(document, convert_engin, convert_config)
|
hash_code = self._get_hashcode(document, convert_engin, convert_config.gethash())
|
||||||
if len(self.cache_dict) > int(CACHE_NUM):
|
if len(self.cache_dict) > int(CACHE_NUM):
|
||||||
self.cache_dict.popitem(last=False)
|
self.cache_dict.popitem(last=False)
|
||||||
self.cache_dict[hash_code] = convert_result
|
self.cache_dict[hash_code] = convert_result
|
||||||
|
|||||||
29
docutranslate/converter/base.py
Normal file
29
docutranslate/converter/base.py
Normal file
@@ -0,0 +1,29 @@
|
|||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from logging import Logger
|
||||||
|
from typing import Hashable
|
||||||
|
|
||||||
|
from docutranslate.ir.document import Document
|
||||||
|
from docutranslate.logger import global_logger
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(kw_only=True)
|
||||||
|
class ConverterConfig(ABC):
|
||||||
|
logger: Logger | None = None
|
||||||
|
@abstractmethod
|
||||||
|
def gethash(self)->Hashable:
|
||||||
|
...
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class Converter(ABC):
    """Abstract base class for document converters.

    Stores the optional configuration and the logger used by the converter.
    """

    def __init__(self, config: ConverterConfig | None = None):
        """Store *config* and select the logger.

        Args:
            config: Optional converter configuration. When it is ``None`` or
                its ``logger`` is unset, ``global_logger`` is used.
        """
        self.config = config
        # config is optional per the signature, so guard the attribute access
        # instead of dereferencing None.
        config_logger = config.logger if config is not None else None
        self.logger = config_logger or global_logger

    @abstractmethod
    def convert(self, document: Document) -> Document:
        """Convert *document* synchronously; must be implemented by subclasses."""
        ...

    async def convert_async(self, document: Document) -> Document:
        """Asynchronous counterpart of :meth:`convert`.

        Not abstract: this base version has an empty body (it returns
        ``None``); presumably subclasses override it.
        """
        ...
|
||||||
@@ -1,11 +0,0 @@
|
|||||||
from typing import Protocol
|
|
||||||
|
|
||||||
from docutranslate.ir.document import Document
|
|
||||||
|
|
||||||
|
|
||||||
class Converter(Protocol):
|
|
||||||
def convert(self, document: Document) -> Document:
|
|
||||||
...
|
|
||||||
|
|
||||||
async def convert_async(self, document: Document) -> Document:
|
|
||||||
...
|
|
||||||
@@ -1,19 +1,28 @@
|
|||||||
from typing import Protocol
|
from abc import abstractmethod
|
||||||
from docutranslate.converter.interfaces import Converter
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
from docutranslate.converter.base import Converter, ConverterConfig
|
||||||
from docutranslate.ir.document import Document
|
from docutranslate.ir.document import Document
|
||||||
from docutranslate.ir.markdown_document import MarkdownDocument
|
from docutranslate.ir.markdown_document import MarkdownDocument
|
||||||
|
|
||||||
|
@dataclass(kw_only=True)
class X2MarkdownConverterConfig(ConverterConfig):
    """Base configuration for X2MarkdownConverter implementations; adds no fields."""
|
||||||
|
|
||||||
|
|
||||||
class X2MarkdownConverter(Converter):
    """
    Converts files in other formats into Markdown.
    """

    @abstractmethod
    def convert(self, document: Document) -> MarkdownDocument:
        """Convert *document* to a MarkdownDocument (synchronous)."""

    @abstractmethod
    async def convert_async(self, document: Document) -> MarkdownDocument:
        """Convert *document* to a MarkdownDocument (asynchronous)."""

    @abstractmethod
    def support_format(self) -> list[str]:
        """Return the supported formats as a list of strings.

        NOTE(review): presumably file suffixes or format names - confirm
        against the concrete converters.
        """
|
||||||
@@ -3,7 +3,6 @@ import os
|
|||||||
import time
|
import time
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from logging import Logger
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
@@ -14,34 +13,34 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
|
|||||||
from docling_core.types.doc import ImageRefMode
|
from docling_core.types.doc import ImageRefMode
|
||||||
from huggingface_hub.errors import LocalEntryNotFoundError
|
from huggingface_hub.errors import LocalEntryNotFoundError
|
||||||
|
|
||||||
from docutranslate.converter.x2md.interfaces import X2MarkdownConverter
|
from docutranslate.converter.x2md.base import X2MarkdownConverter, X2MarkdownConverterConfig
|
||||||
from docutranslate.ir.document import Document
|
from docutranslate.ir.document import Document
|
||||||
from docutranslate.ir.markdown_document import MarkdownDocument
|
from docutranslate.ir.markdown_document import MarkdownDocument
|
||||||
from docutranslate.logger import global_logger
|
|
||||||
|
|
||||||
IMAGE_RESOLUTION_SCALE = 4
|
IMAGE_RESOLUTION_SCALE = 4
|
||||||
|
|
||||||
|
|
||||||
@dataclass(kw_only=True)
class ConverterDoclingConfig(X2MarkdownConverterConfig):
    """Keyword-only configuration for ConverterDocling."""

    # Flags copied onto the converter instance by ConverterDocling.__init__.
    code: bool = True
    formula: bool = True
    # Fallback artifact path, used by ConverterDocling when ./docling_artifact
    # is not a directory.
    artifact: Path | None = None

    def gethash(self):
        """Return ``(code, formula)``; ``artifact`` is not part of the hash."""
        return (self.code, self.formula)
|
||||||
|
|
||||||
|
|
||||||
class ConverterDocling(X2MarkdownConverter):
|
class ConverterDocling(X2MarkdownConverter):
|
||||||
def __init__(self, config: ConverterDoclingConfig):
    """Initialise the converter from *config*.

    The base initialiser stores the config and sets ``self.logger``. A local
    ``./docling_artifact`` directory, when present, takes precedence over
    ``config.artifact``.
    """
    super().__init__(config=config)
    self.code = config.code
    self.formula = config.formula
    local_models = Path("./docling_artifact")
    if not local_models.is_dir():
        # No local model directory: fall back to the configured path.
        self.artifact = config.artifact
        return
    self.logger.info("使用./docling_artifact的本地模型")
    self.artifact = local_models
|
||||||
|
|
||||||
|
|
||||||
def convert(self, document) -> MarkdownDocument:
|
def convert(self, document) -> MarkdownDocument:
|
||||||
assert isinstance(document.name, str)
|
assert isinstance(document.name, str)
|
||||||
|
|||||||
@@ -1,10 +1,10 @@
|
|||||||
from docutranslate.converter.x2md.interfaces import X2MarkdownConverter
|
from docutranslate.converter.x2md.base import X2MarkdownConverter
|
||||||
from docutranslate.ir.document import Document
|
from docutranslate.ir.document import Document
|
||||||
from docutranslate.ir.markdown_document import MarkdownDocument
|
from docutranslate.ir.markdown_document import MarkdownDocument
|
||||||
|
|
||||||
|
|
||||||
class ConverterIdentity(X2MarkdownConverter):
|
class ConverterIdentity(X2MarkdownConverter):
|
||||||
|
#TODO:支持markdown_zip格式输入
|
||||||
def convert(self, document: Document) -> MarkdownDocument:
|
def convert(self, document: Document) -> MarkdownDocument:
|
||||||
return MarkdownDocument.from_bytes(content=document.content, suffix=".md", stem=document.stem)
|
return MarkdownDocument.from_bytes(content=document.content, suffix=".md", stem=document.stem)
|
||||||
|
|
||||||
|
|||||||
@@ -3,10 +3,11 @@ import time
|
|||||||
import zipfile
|
import zipfile
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from logging import Logger
|
from logging import Logger
|
||||||
|
from typing import Hashable
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
|
|
||||||
from docutranslate.converter.x2md.interfaces import X2MarkdownConverter
|
from docutranslate.converter.x2md.base import X2MarkdownConverter, X2MarkdownConverterConfig
|
||||||
from docutranslate.ir.document import Document
|
from docutranslate.ir.document import Document
|
||||||
from docutranslate.ir.markdown_document import MarkdownDocument
|
from docutranslate.ir.markdown_document import MarkdownDocument
|
||||||
from docutranslate.logger import global_logger
|
from docutranslate.logger import global_logger
|
||||||
@@ -15,11 +16,14 @@ from docutranslate.utils.markdown_utils import embed_inline_image_from_zip
|
|||||||
URL = 'https://mineru.net/api/v4/file-urls/batch'
|
URL = 'https://mineru.net/api/v4/file-urls/batch'
|
||||||
|
|
||||||
|
|
||||||
@dataclass(kw_only=True)
class ConverterMineruConfig(X2MarkdownConverterConfig):
    """Keyword-only configuration for ConverterMineru."""

    # Required token; ConverterMineru strips surrounding whitespace before use.
    mineru_token: str
    # Flag copied onto the converter instance by ConverterMineru.__init__.
    formula: bool = True

    def gethash(self) -> Hashable:
        """Return ``formula`` as the hash value; the token is not included."""
        hash_value = self.formula
        return hash_value
|
||||||
|
|
||||||
|
|
||||||
timeout = httpx.Timeout(
|
timeout = httpx.Timeout(
|
||||||
connect=5.0, # 连接超时 (建立连接的最长时间)
|
connect=5.0, # 连接超时 (建立连接的最长时间)
|
||||||
@@ -34,7 +38,7 @@ client_async = httpx.AsyncClient(trust_env=False, timeout=timeout, proxy=None, v
|
|||||||
|
|
||||||
class ConverterMineru(X2MarkdownConverter):
|
class ConverterMineru(X2MarkdownConverter):
|
||||||
def __init__(self, config: ConverterMineruConfig, logger: Logger = global_logger):
|
def __init__(self, config: ConverterMineruConfig, logger: Logger = global_logger):
|
||||||
self.config = config
|
super().__init__(config=config)
|
||||||
self.mineru_token = config.mineru_token.strip()
|
self.mineru_token = config.mineru_token.strip()
|
||||||
self.formula = config.formula
|
self.formula = config.formula
|
||||||
self.logger = logger
|
self.logger = logger
|
||||||
|
|||||||
20
docutranslate/exporter/base.py
Normal file
20
docutranslate/exporter/base.py
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
from abc import ABC,abstractmethod
|
||||||
|
from typing import Generic,TypeVar, Any
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
from docutranslate.ir.document import Document
|
||||||
|
|
||||||
|
D_in = TypeVar('D_in', bound=Document)
|
||||||
|
|
||||||
|
@dataclass(kw_only=True)
|
||||||
|
class ExporterConfig:
|
||||||
|
...
|
||||||
|
|
||||||
|
class Exporter(ABC, Generic[D_in]):
    """Abstract exporter, generic over the input document type ``D_in``."""

    def __init__(self, config: ExporterConfig | None = None):
        """Keep the optional exporter configuration on the instance."""
        self.config = config

    @abstractmethod
    def export(self, document: D_in) -> Any:
        """Export *document*; concrete exporters define the result."""
|
||||||
@@ -1,8 +0,0 @@
|
|||||||
from dataclasses import dataclass
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class ExportConfig:
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@@ -1,15 +0,0 @@
|
|||||||
from typing import Protocol, TypeVar, Any, Self
|
|
||||||
|
|
||||||
from docutranslate.exporter.export_config import ExportConfig
|
|
||||||
from docutranslate.ir.document import Document
|
|
||||||
|
|
||||||
D_in = TypeVar('D_in', bound=Document)
|
|
||||||
|
|
||||||
|
|
||||||
class Exporter(Protocol[D_in]):
|
|
||||||
@classmethod
|
|
||||||
def from_config(cls, export_config: ExportConfig | None = None) -> Self:
|
|
||||||
...
|
|
||||||
|
|
||||||
def export(self, document: D_in) -> Any:
|
|
||||||
...
|
|
||||||
18
docutranslate/exporter/md/base.py
Normal file
18
docutranslate/exporter/md/base.py
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
from docutranslate.exporter.base import Exporter, ExporterConfig
|
||||||
|
from docutranslate.ir.document import Document
|
||||||
|
from docutranslate.ir.markdown_document import MarkdownDocument
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(kw_only=True)
class MDExporterConfig(ExporterConfig):
    """Base configuration for Markdown exporters; adds no fields."""
|
||||||
|
|
||||||
|
|
||||||
|
class MDExporter(Exporter[MarkdownDocument]):
    """Base class for exporters that take a MarkdownDocument as input.

    The input type parameter of ``Exporter`` is bound to ``MarkdownDocument``,
    matching the ``document`` parameter of :meth:`export`.
    """

    def __init__(self, config: MDExporterConfig | None = None):
        """Forward the optional configuration to ``Exporter.__init__``."""
        super().__init__(config=config)

    def export(self, document: MarkdownDocument) -> Document:
        """Export *document*; this base version has an empty body and returns ``None``."""
        ...
|
||||||
@@ -3,20 +3,20 @@ from dataclasses import dataclass
|
|||||||
import jinja2
|
import jinja2
|
||||||
import markdown2
|
import markdown2
|
||||||
|
|
||||||
from docutranslate.exporter.export_config import ExportConfig
|
from docutranslate.exporter.md.base import MDExporter, MDExporterConfig
|
||||||
from docutranslate.exporter.md2x.interfaces import MDExporter
|
|
||||||
from docutranslate.ir.document import Document
|
from docutranslate.ir.document import Document
|
||||||
from docutranslate.ir.markdown_document import MarkdownDocument
|
from docutranslate.ir.markdown_document import MarkdownDocument
|
||||||
from docutranslate.utils.resource_utils import resource_path
|
from docutranslate.utils.resource_utils import resource_path
|
||||||
|
|
||||||
@dataclass
class MD2HTMLExporterConfig(MDExporterConfig):
    """Configuration for MD2HTMLExporter."""

    # Copied to MD2HTMLExporter.cdn.
    # NOTE(review): presumably selects CDN-hosted assets in the generated HTML - confirm in export().
    cdn: bool = True
|
||||||
|
|
||||||
class MD2HTMLExporter(MDExporter):
|
class MD2HTMLExporter(MDExporter):
|
||||||
def __init__(self, config: MD2HTMLExporterConfig = None):
    """Initialise the exporter, using a default config when *config* is falsy."""
    effective_config = config if config else MD2HTMLExporterConfig()
    super().__init__(config=effective_config)
    self.cdn = effective_config.cdn
|
||||||
|
|
||||||
def export(self, document: MarkdownDocument) -> Document:
|
def export(self, document: MarkdownDocument) -> Document:
|
||||||
cdn = self.cdn
|
cdn = self.cdn
|
||||||
8
docutranslate/exporter/md/md2md_exporter.py
Normal file
8
docutranslate/exporter/md/md2md_exporter.py
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
from docutranslate.exporter.md.base import MDExporter
|
||||||
|
from docutranslate.ir.markdown_document import MarkdownDocument, Document
|
||||||
|
|
||||||
|
|
||||||
|
class MD2MDExporter(MDExporter):
    """Exports a MarkdownDocument as a plain ``.md`` Document."""

    def export(self, document: MarkdownDocument) -> Document:
        """Return a new ``.md`` Document with the same content and stem as *document*."""
        payload = document.content
        stem = document.stem
        return Document.from_bytes(suffix=".md", content=payload, stem=stem)
|
||||||
11
docutranslate/exporter/md/md2mdzip_exporter.py
Normal file
11
docutranslate/exporter/md/md2mdzip_exporter.py
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
from docutranslate.exporter.md.base import MDExporter
|
||||||
|
from docutranslate.ir.markdown_document import MarkdownDocument, Document
|
||||||
|
from docutranslate.utils.markdown_utils import unembed_base64_images_to_zip
|
||||||
|
|
||||||
|
|
||||||
|
class MD2MDZipExporter(MDExporter):
    """Exports a MarkdownDocument as a ``.zip`` Document."""

    def export(self, document: MarkdownDocument) -> Document:
        """Return a ``.zip`` Document built from *document*.

        The decoded Markdown text is passed to
        ``unembed_base64_images_to_zip`` and the result becomes the content
        of the returned Document.
        """
        markdown_text = document.content.decode()
        zip_payload = unembed_base64_images_to_zip(markdown_text, markdown_name=document.name)
        return Document.from_bytes(suffix=".zip", content=zip_payload, stem=document.stem)
|
||||||
3
docutranslate/exporter/md/types.py
Normal file
3
docutranslate/exporter/md/types.py
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
from typing import Literal
|
||||||
|
|
||||||
|
# Literal type of the accepted conversion-engine names.
ConvertEnginType = Literal["mineru", "docling"]
|
||||||
@@ -1,9 +0,0 @@
|
|||||||
from docutranslate.exporter.interfaces import Exporter
|
|
||||||
from docutranslate.ir.document import Document
|
|
||||||
from docutranslate.ir.markdown_document import MarkdownDocument
|
|
||||||
|
|
||||||
|
|
||||||
class MDExporter(Exporter):
|
|
||||||
|
|
||||||
def export(self,document:MarkdownDocument)->Document:
|
|
||||||
...
|
|
||||||
@@ -1,18 +0,0 @@
|
|||||||
from dataclasses import dataclass
|
|
||||||
|
|
||||||
from docutranslate.exporter.export_config import ExportConfig
|
|
||||||
from docutranslate.exporter.md2x.interfaces import MDExporter
|
|
||||||
from docutranslate.ir.markdown_document import MarkdownDocument,Document
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class MD2MDExportConfig(ExportConfig):
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
class MD2MDExporter(MDExporter):
|
|
||||||
def __init__(self, export_config: MD2MDExportConfig | None=None):
|
|
||||||
pass
|
|
||||||
|
|
||||||
def export(self,document:MarkdownDocument)->Document:
|
|
||||||
return Document.from_bytes(suffix=".md",content=document.content,stem=document.stem)
|
|
||||||
@@ -1,21 +0,0 @@
|
|||||||
from dataclasses import dataclass
|
|
||||||
|
|
||||||
from docutranslate.exporter.export_config import ExportConfig
|
|
||||||
from docutranslate.exporter.md2x.interfaces import MDExporter
|
|
||||||
from docutranslate.ir.markdown_document import MarkdownDocument,Document
|
|
||||||
from docutranslate.utils.markdown_utils import unembed_base64_images_to_zip
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class MD2MDZIPExportConfig(ExportConfig):
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
class MD2MDZipExporter(MDExporter):
|
|
||||||
def __init__(self, export_config: MD2MDZIPExportConfig | None=None):
|
|
||||||
pass
|
|
||||||
|
|
||||||
def export(self,document:MarkdownDocument)->Document:
|
|
||||||
return Document.from_bytes(suffix=".zip",content=unembed_base64_images_to_zip(document.content.decode(), markdown_name=document.name),stem=document.stem)
|
|
||||||
|
|
||||||
|
|
||||||
@@ -1,14 +0,0 @@
|
|||||||
from typing import Literal, TYPE_CHECKING
|
|
||||||
|
|
||||||
from docutranslate.converter.x2md.converter_mineru import ConverterMineruConfig
|
|
||||||
from docutranslate.global_values.conditional_import import DOCLING_EXIST
|
|
||||||
|
|
||||||
if DOCLING_EXIST or TYPE_CHECKING:
|
|
||||||
from docutranslate.converter.x2md.converter_docling import ConverterDoclingConfig
|
|
||||||
|
|
||||||
convert_engin_type = Literal["mineru", "docling"]
|
|
||||||
|
|
||||||
if DOCLING_EXIST or TYPE_CHECKING:
|
|
||||||
x2md_convert_config_type = ConverterDoclingConfig | ConverterMineruConfig
|
|
||||||
else:
|
|
||||||
x2md_convert_config_type = ConverterMineruConfig
|
|
||||||
@@ -1,8 +1,8 @@
|
|||||||
from docutranslate.exporter.interfaces import Exporter
|
from docutranslate.exporter.base import Exporter
|
||||||
from docutranslate.ir.document import Document
|
from docutranslate.ir.document import Document
|
||||||
|
|
||||||
#TODO:看情况是否需要为TXT单独写一个document类型
|
#TODO:看情况是否需要为TXT单独写一个document类型
|
||||||
class TXTExporter(Exporter[Document]):
    """Base class for exporters whose input is a plain Document."""

    def export(self, document: Document) -> Document:
        """Export *document*; this base version has an empty body and returns ``None``."""
|
||||||
@@ -2,21 +2,22 @@ from dataclasses import dataclass
|
|||||||
|
|
||||||
import jinja2
|
import jinja2
|
||||||
|
|
||||||
from docutranslate.exporter.export_config import ExportConfig
|
from docutranslate.exporter.base import ExporterConfig
|
||||||
from docutranslate.exporter.txt2x.interfaces import TXTExporter
|
from docutranslate.exporter.txt.base import TXTExporter
|
||||||
from docutranslate.ir.document import Document
|
from docutranslate.ir.document import Document
|
||||||
from docutranslate.utils.resource_utils import resource_path
|
from docutranslate.utils.resource_utils import resource_path
|
||||||
|
|
||||||
|
|
||||||
@dataclass
class TXT2HTMLExporterConfig(ExporterConfig):
    """Configuration for TXT2HTMLExporter."""

    # Copied to TXT2HTMLExporter.cdn.
    # NOTE(review): presumably selects CDN-hosted assets in the generated HTML - confirm in export().
    cdn: bool = True
|
||||||
|
|
||||||
|
|
||||||
class TXT2HTMLExporter(TXTExporter):
|
class TXT2HTMLExporter(TXTExporter):
|
||||||
def __init__(self, config: TXT2HTMLExporterConfig = None):
    """Initialise the exporter, using a default config when *config* is falsy."""
    effective_config = config if config else TXT2HTMLExporterConfig()
    super().__init__(config=effective_config)
    self.cdn = effective_config.cdn
|
||||||
|
|
||||||
def export(self, document: Document) -> Document:
|
def export(self, document: Document) -> Document:
|
||||||
cdn = self.cdn
|
cdn = self.cdn
|
||||||
@@ -1,10 +1,7 @@
|
|||||||
from docutranslate.exporter.txt2x.interfaces import TXTExporter
|
from docutranslate.exporter.txt.base import TXTExporter
|
||||||
from docutranslate.ir.document import Document
|
from docutranslate.ir.document import Document
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class TXT2TXTExporter(TXTExporter):
    """Exporter that outputs the text document unchanged."""

    def export(self, document: Document) -> Document:
        """Return the result of ``document.copy()``."""
        duplicate = document.copy()
        return duplicate
|
||||||
@@ -1,16 +0,0 @@
|
|||||||
from dataclasses import dataclass
|
|
||||||
from logging import Logger
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class AiTranslateConfig:
|
|
||||||
base_url: str
|
|
||||||
api_key: str
|
|
||||||
model_id: str
|
|
||||||
to_lang: str
|
|
||||||
custom_prompt: str | None = None
|
|
||||||
temperature: float = 0.7
|
|
||||||
timeout: int = 2000
|
|
||||||
chunk_size: int = 3000
|
|
||||||
concurrent: int = 30
|
|
||||||
logger: Logger | None = None
|
|
||||||
@@ -1,20 +0,0 @@
|
|||||||
from typing import Protocol, TypeVar
|
|
||||||
|
|
||||||
from docutranslate.agents import Agent
|
|
||||||
from docutranslate.ir.document import Document
|
|
||||||
|
|
||||||
T=TypeVar('T',bound=Document)
|
|
||||||
V=TypeVar('V',bound=Agent)
|
|
||||||
|
|
||||||
class Translator(Protocol[T,V]):
|
|
||||||
"""
|
|
||||||
翻译中间文本(原地替换),Translator不做格式转换
|
|
||||||
"""
|
|
||||||
def translate(self, document:T) -> Document:
|
|
||||||
...
|
|
||||||
|
|
||||||
async def translate_async(self, document: T) -> Document:
|
|
||||||
...
|
|
||||||
|
|
||||||
def log(self,info:str):
|
|
||||||
...
|
|
||||||
0
docutranslate/translator/ai_translator/__init__.py
Normal file
0
docutranslate/translator/ai_translator/__init__.py
Normal file
35
docutranslate/translator/ai_translator/base.py
Normal file
35
docutranslate/translator/ai_translator/base.py
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
from abc import abstractmethod
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from logging import Logger
|
||||||
|
from typing import TypeVar
|
||||||
|
|
||||||
|
from docutranslate.ir.document import Document
|
||||||
|
from docutranslate.translator.base import Translator, TranslatorConfig
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(kw_only=True)
class AiTranslatorConfig(TranslatorConfig):
    """Keyword-only settings for translators built on ``AiTranslator``.

    The first four fields have no default and must always be supplied; the
    remaining fields fall back to the defaults shown below.
    """

    # --- required ---
    base_url: str
    api_key: str
    model_id: str
    to_lang: str

    # --- optional ---
    custom_prompt: str | None = None
    temperature: float = 0.7
    timeout: int = 2000
    chunk_size: int = 3000
    concurrent: int = 30
|
||||||
|
|
||||||
|
T=TypeVar('T',bound=Document)
|
||||||
|
|
||||||
|
class AiTranslator(Translator[T]):
    """Abstract base for translators configured with ``AiTranslatorConfig``.

    Translates the intermediate text in place; a translator performs no
    format conversion.
    """

    def __init__(self, config: AiTranslatorConfig, logger: Logger | None = None):
        """Initialise the base translator and apply an optional logger override.

        Args:
            config: Settings for this translator.
            logger: When given, replaces the logger selected by the base
                class from ``config``.
        """
        # The base initialiser accepts only ``config``; forwarding ``logger``
        # as a keyword raised TypeError, so the override is applied here.
        super().__init__(config=config)
        if logger is not None:
            self.logger = logger

    @abstractmethod
    def translate(self, document: T) -> Document:
        """Translate ``document`` synchronously."""
        ...

    @abstractmethod
    async def translate_async(self, document: T) -> Document:
        """Translate ``document`` asynchronously."""
        ...
|
||||||
@@ -5,22 +5,21 @@ from typing import Self
|
|||||||
from docutranslate.agents import MDTranslateAgent
|
from docutranslate.agents import MDTranslateAgent
|
||||||
from docutranslate.context.md_mask_context import MDMaskUrisContext
|
from docutranslate.context.md_mask_context import MDMaskUrisContext
|
||||||
from docutranslate.ir.markdown_document import MarkdownDocument
|
from docutranslate.ir.markdown_document import MarkdownDocument
|
||||||
from docutranslate.logger import global_logger
|
from docutranslate.translator.ai_translator.base import AiTranslatorConfig
|
||||||
from docutranslate.translater.base import AiTranslateConfig
|
from docutranslate.translator.base import Translator
|
||||||
from docutranslate.translater.interfaces import Translator
|
|
||||||
from docutranslate.utils.markdown_splitter import split_markdown_text, join_markdown_texts
|
from docutranslate.utils.markdown_splitter import split_markdown_text, join_markdown_texts
|
||||||
from docutranslate.utils.markdown_utils import clean_markdown_math_block
|
from docutranslate.utils.markdown_utils import clean_markdown_math_block
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class MDTranslateConfig(AiTranslateConfig):
|
class MDTranslatorConfig(AiTranslatorConfig):
|
||||||
...
|
...
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class MDTranslator(Translator):
|
class MDTranslator(Translator):
|
||||||
def __init__(self, config: MDTranslateConfig):
|
def __init__(self, config: MDTranslatorConfig):
|
||||||
self.logger = config.logger or global_logger
|
super().__init__(config=config)
|
||||||
self.chunk_size = config.chunk_size
|
self.chunk_size = config.chunk_size
|
||||||
self.translate_agent = MDTranslateAgent(custom_prompt=config.custom_prompt,
|
self.translate_agent = MDTranslateAgent(custom_prompt=config.custom_prompt,
|
||||||
to_lang=config.to_lang,
|
to_lang=config.to_lang,
|
||||||
@@ -3,20 +3,19 @@ from typing import Self
|
|||||||
|
|
||||||
from docutranslate.agents.txt_agent import TXTTranslateAgent
|
from docutranslate.agents.txt_agent import TXTTranslateAgent
|
||||||
from docutranslate.ir.document import Document
|
from docutranslate.ir.document import Document
|
||||||
from docutranslate.logger import global_logger
|
from docutranslate.translator.ai_translator.base import AiTranslatorConfig
|
||||||
from docutranslate.translater.base import AiTranslateConfig
|
from docutranslate.translator.base import Translator
|
||||||
from docutranslate.translater.interfaces import Translator
|
|
||||||
from docutranslate.utils.markdown_splitter import split_markdown_text
|
from docutranslate.utils.markdown_splitter import split_markdown_text
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class TXTTranslateConfig(AiTranslateConfig):
|
class TXTTranslatorConfig(AiTranslatorConfig):
|
||||||
...
|
...
|
||||||
|
|
||||||
|
|
||||||
class TXTTranslator(Translator):
|
class TXTTranslator(Translator):
|
||||||
def __init__(self, config: TXTTranslateConfig):
|
def __init__(self, config: TXTTranslatorConfig):
|
||||||
self.logger = config.logger or global_logger
|
super().__init__(config=config)
|
||||||
self.chunk_size = config.chunk_size
|
self.chunk_size = config.chunk_size
|
||||||
self.translate_agent = TXTTranslateAgent(custom_prompt=config.custom_prompt,
|
self.translate_agent = TXTTranslateAgent(custom_prompt=config.custom_prompt,
|
||||||
to_lang=config.to_lang,
|
to_lang=config.to_lang,
|
||||||
27
docutranslate/translator/base.py
Normal file
27
docutranslate/translator/base.py
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
from dataclasses import dataclass
|
||||||
|
from logging import Logger
|
||||||
|
from typing import TypeVar,Generic
|
||||||
|
from abc import ABC,abstractmethod
|
||||||
|
from docutranslate.ir.document import Document
|
||||||
|
from docutranslate.logger import global_logger
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(kw_only=True)
|
||||||
|
class TranslatorConfig:
|
||||||
|
logger:Logger|None=None
|
||||||
|
|
||||||
|
T=TypeVar('T',bound=Document)
|
||||||
|
|
||||||
|
class Translator(ABC, Generic[T]):
    """Abstract base for translators that rewrite a document's text in place.

    A translator only translates the intermediate text; it performs no
    format conversion.
    """

    def __init__(self, config: TranslatorConfig | None = None):
        """Store the configuration and choose the logger.

        Args:
            config: Translator settings. When omitted, ``self.config`` is
                ``None`` and the global logger is used.
        """
        self.config = config
        # ``config`` is optional, so its logger may only be read when a
        # config was actually supplied; otherwise use the global logger.
        configured_logger = config.logger if config is not None else None
        self.logger = configured_logger or global_logger

    @abstractmethod
    def translate(self, document: T) -> Document:
        """Translate ``document`` synchronously."""
        ...

    @abstractmethod
    async def translate_async(self, document: T) -> Document:
        """Translate ``document`` asynchronously."""
        ...
|
||||||
@@ -1,19 +1,27 @@
|
|||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
|
from dataclasses import dataclass
|
||||||
from logging import Logger
|
from logging import Logger
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Self, Generic, TypeVar
|
from typing import Self, Generic, TypeVar
|
||||||
|
|
||||||
from docutranslate.exporter.interfaces import Exporter
|
from docutranslate.exporter.base import Exporter
|
||||||
from docutranslate.ir.document import Document
|
from docutranslate.ir.document import Document
|
||||||
from docutranslate.logger import global_logger
|
from docutranslate.logger import global_logger
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(kw_only=True)
|
||||||
|
class WorkflowConfig:
|
||||||
|
logger: Logger | None = None
|
||||||
|
|
||||||
|
|
||||||
|
T_original = TypeVar('T_original', bound=Document)
|
||||||
T_Translated = TypeVar('T_Translated', bound=Document)
|
T_Translated = TypeVar('T_Translated', bound=Document)
|
||||||
|
|
||||||
|
|
||||||
class BaseWorkflow(ABC, Generic[T_Translated]):
|
class Workflow(ABC, Generic[T_original, T_Translated]):
|
||||||
def __init__(self, logger: Logger = global_logger):
|
def __init__(self, logger: Logger = global_logger):
|
||||||
self.logger = logger
|
self.logger = logger
|
||||||
self.document_original: Document | None = None
|
self.document_original: T_original | None = None
|
||||||
self.document_translated: T_Translated | None = None
|
self.document_translated: T_Translated | None = None
|
||||||
|
|
||||||
def read_path(self, path: Path | str) -> Self:
|
def read_path(self, path: Path | str) -> Self:
|
||||||
@@ -1,9 +1,9 @@
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Protocol, Self, TypeVar, runtime_checkable
|
from typing import Protocol, Self, TypeVar, runtime_checkable
|
||||||
|
|
||||||
from docutranslate.exporter.export_config import ExportConfig
|
from docutranslate.exporter.export_config import ExporterConfig
|
||||||
|
|
||||||
T = TypeVar("T", bound=ExportConfig)
|
T = TypeVar("T", bound=ExporterConfig)
|
||||||
|
|
||||||
@runtime_checkable
|
@runtime_checkable
|
||||||
class HTMLExportable(Protocol[T]):
|
class HTMLExportable(Protocol[T]):
|
||||||
|
|||||||
@@ -1,40 +1,84 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from logging import Logger
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Self, Literal, overload, TYPE_CHECKING
|
from typing import Self, Tuple, Any
|
||||||
|
|
||||||
from docutranslate.cacher import md_based_convert_cacher
|
from docutranslate.cacher import md_based_convert_cacher
|
||||||
from docutranslate.global_values.conditional_import import DOCLING_EXIST
|
from docutranslate.global_values.conditional_import import DOCLING_EXIST
|
||||||
|
|
||||||
if DOCLING_EXIST or TYPE_CHECKING:
|
if DOCLING_EXIST:
|
||||||
from docutranslate.converter.x2md.converter_docling import ConverterDoclingConfig, ConverterDocling
|
from docutranslate.converter.x2md.converter_docling import ConverterDoclingConfig, ConverterDocling
|
||||||
from docutranslate.converter.x2md.converter_identity import ConverterIdentity
|
from docutranslate.converter.x2md.converter_identity import ConverterIdentity
|
||||||
from docutranslate.converter.x2md.converter_mineru import ConverterMineruConfig, ConverterMineru
|
from docutranslate.converter.x2md.converter_mineru import ConverterMineruConfig, ConverterMineru
|
||||||
from docutranslate.converter.x2md.interfaces import X2MarkdownConverter
|
from docutranslate.converter.x2md.base import X2MarkdownConverterConfig
|
||||||
from docutranslate.exporter.md2x.md2html_exporter import MD2HTMLExportConfig, MD2HTMLExporter
|
from docutranslate.exporter.md.md2html_exporter import MD2HTMLExporterConfig, MD2HTMLExporter
|
||||||
from docutranslate.exporter.md2x.md2md_exporter import MD2MDExportConfig, MD2MDExporter
|
from docutranslate.exporter.md.md2md_exporter import MD2MDExporter
|
||||||
from docutranslate.exporter.md2x.md2mdzip_exporter import MD2MDZIPExportConfig, MD2MDZipExporter
|
from docutranslate.exporter.md.md2mdzip_exporter import MD2MDZipExporter
|
||||||
from docutranslate.exporter.md2x.types import x2md_convert_config_type, convert_engin_type
|
from docutranslate.exporter.md.types import ConvertEnginType
|
||||||
from docutranslate.workflow.base_workflow import BaseWorkflow
|
from docutranslate.workflow.base import Workflow, WorkflowConfig
|
||||||
from docutranslate.workflow.interfaces import MDFormatsExportable, HTMLExportable
|
from docutranslate.workflow.interfaces import MDFormatsExportable, HTMLExportable
|
||||||
from docutranslate.translater.md_translator import MDTranslateConfig, MDTranslator
|
from docutranslate.translator.ai_translator.md_translator import MDTranslatorConfig, MDTranslator
|
||||||
|
|
||||||
|
|
||||||
class MarkdownBasedWorkflow(BaseWorkflow, HTMLExportable, MDFormatsExportable):
|
@dataclass(kw_only=True)
|
||||||
def __init__(self, *args, **kwargs):
|
class MarkdownBasedWorkflowConfig(WorkflowConfig):
|
||||||
super().__init__(*args, **kwargs)
|
# X2MarkdownConverterConfig
|
||||||
|
convert_engine: ConvertEnginType | None
|
||||||
|
formula: bool = True
|
||||||
|
# ConverterDoclingConfig
|
||||||
|
code: bool = True
|
||||||
|
artifact: Path | None = None
|
||||||
|
# ConverterMineruConfig
|
||||||
|
mineru_token: str
|
||||||
|
# MDTranslatorConfig
|
||||||
|
base_url: str
|
||||||
|
api_key: str
|
||||||
|
model_id: str
|
||||||
|
to_lang: str
|
||||||
|
custom_prompt: str | None = None
|
||||||
|
temperature: float = 0.7
|
||||||
|
timeout: int = 2000
|
||||||
|
chunk_size: int = 3000
|
||||||
|
concurrent: int = 30
|
||||||
|
# MD2HTMLExporterConfig
|
||||||
|
cdn: bool = True
|
||||||
|
# general
|
||||||
|
logger: Logger | None = None
|
||||||
|
|
||||||
if DOCLING_EXIST or TYPE_CHECKING:
|
|
||||||
self._converter_factory: dict[str:tuple[X2MarkdownConverter, x2md_convert_config_type]] = {
|
|
||||||
"mineru": (ConverterMineru, ConverterMineruConfig),
|
|
||||||
"docling": (ConverterDocling, ConverterDoclingConfig)
|
|
||||||
}
|
|
||||||
else:
|
|
||||||
self._converter_factory: dict[str:tuple[X2MarkdownConverter, x2md_convert_config_type]] = {
|
|
||||||
"mineru": (ConverterMineru, ConverterMineruConfig),
|
|
||||||
}
|
|
||||||
|
|
||||||
def _get_document_md(self, convert_engin: convert_engin_type | None,
|
class MarkdownBasedWorkflow(Workflow, HTMLExportable, MDFormatsExportable):
|
||||||
convert_config: x2md_convert_config_type | None):
|
def __init__(self, config: MarkdownBasedWorkflowConfig):
    """Split the flat workflow ``config`` into per-component settings.

    Args:
        config: Workflow configuration whose fields are distributed to the
            converter, translator and HTML exporter config objects stored
            on this instance.

    Raises:
        ValueError: If ``config.convert_engine`` is neither ``None`` nor an
            engine usable here (``"docling"`` is only usable when
            ``DOCLING_EXIST`` is true).
    """
    # The base initialiser takes ``logger`` rather than ``config``; leave the
    # argument out when no logger is configured so the base default applies.
    if config.logger is None:
        super().__init__()
    else:
        super().__init__(logger=config.logger)

    # Maps an engine name to its (converter class, converter config class).
    self._converter_factory: dict[ConvertEnginType, Tuple[Any, Any]] = {
        "mineru": (ConverterMineru, ConverterMineruConfig),
    }
    if DOCLING_EXIST:
        self._converter_factory["docling"] = (ConverterDocling, ConverterDoclingConfig)

    engine = config.convert_engine
    self.converter_config: X2MarkdownConverterConfig | None
    if engine is None:
        self.converter_config = None
    elif engine == "mineru":
        self.converter_config = ConverterMineruConfig(
            formula=config.formula,
            mineru_token=config.mineru_token,
        )
    elif DOCLING_EXIST and engine == "docling":
        self.converter_config = ConverterDoclingConfig(
            code=config.code,
            formula=config.formula,
            artifact=config.artifact,
        )
    else:
        # Fail here instead of leaving ``converter_config`` unset, which
        # would only surface later as an AttributeError.
        raise ValueError(f"不存在{engine}解析引擎")

    self.translator_config = MDTranslatorConfig(
        base_url=config.base_url,
        api_key=config.api_key,
        model_id=config.model_id,
        to_lang=config.to_lang,
        custom_prompt=config.custom_prompt,
        temperature=config.temperature,
        timeout=config.timeout,
        chunk_size=config.chunk_size,
        concurrent=config.concurrent,
        # Pass the configured logger on to the translator config as well.
        logger=config.logger,
    )
    self.md2html_exporter_config = MD2HTMLExporterConfig(cdn=config.cdn)
    self.convert_engine = engine
|
||||||
|
|
||||||
|
def _get_document_md(self,convert_engin:ConvertEnginType|None,convert_config:X2MarkdownConverterConfig):
|
||||||
if self.document_original is None:
|
if self.document_original is None:
|
||||||
raise RuntimeError("file has not been read yet. Call read_path or read_bytes first.")
|
raise RuntimeError("file has not been read yet. Call read_path or read_bytes first.")
|
||||||
# 获取缓存的解析后文件
|
# 获取缓存的解析后文件
|
||||||
@@ -51,7 +95,7 @@ class MarkdownBasedWorkflow(BaseWorkflow, HTMLExportable, MDFormatsExportable):
|
|||||||
if not isinstance(convert_config, config_class):
|
if not isinstance(convert_config, config_class):
|
||||||
raise TypeError(
|
raise TypeError(
|
||||||
f"未传入正确的convert_config,应传入{config_class.__name__}类型,现为{type(convert_config).__name__}类型")
|
f"未传入正确的convert_config,应传入{config_class.__name__}类型,现为{type(convert_config).__name__}类型")
|
||||||
converter = converter_class(convert_config, logger=self.logger)
|
converter = converter_class(convert_config)
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"不存在{convert_engin}解析引擎")
|
raise ValueError(f"不存在{convert_engin}解析引擎")
|
||||||
document_md = converter.convert(self.document_original)
|
document_md = converter.convert(self.document_original)
|
||||||
@@ -59,67 +103,54 @@ class MarkdownBasedWorkflow(BaseWorkflow, HTMLExportable, MDFormatsExportable):
|
|||||||
md_based_convert_cacher.cache_result(document_md, self.document_original, convert_engin, convert_config)
|
md_based_convert_cacher.cache_result(document_md, self.document_original, convert_engin, convert_config)
|
||||||
return document_md
|
return document_md
|
||||||
|
|
||||||
@overload
|
|
||||||
def translate(self, convert_engin: None,
|
|
||||||
convert_config: x2md_convert_config_type | None, translate_config: MDTranslateConfig) -> Self:
|
|
||||||
...
|
|
||||||
|
|
||||||
@overload
|
def translate(self) -> Self:
|
||||||
def translate(self, convert_engin: Literal["docling"],
|
convert_engin,convert_config=self.convert_engine,self.converter_config
|
||||||
convert_config: "ConverterDoclingConfig", translate_config: MDTranslateConfig) -> Self:
|
translator_config=self.translator_config
|
||||||
...
|
document_md = self._get_document_md(convert_engin,convert_config)
|
||||||
|
|
||||||
@overload
|
|
||||||
def translate(self, convert_engin: Literal["mineru"],
|
|
||||||
convert_config: ConverterMineruConfig, translate_config: MDTranslateConfig) -> Self:
|
|
||||||
...
|
|
||||||
|
|
||||||
def translate(self, convert_engin: convert_engin_type | None,
|
|
||||||
convert_config: x2md_convert_config_type | None,
|
|
||||||
translate_config: MDTranslateConfig) -> Self:
|
|
||||||
document_md = self._get_document_md(convert_engin, convert_config)
|
|
||||||
# 翻译解析后文件
|
# 翻译解析后文件
|
||||||
translator = MDTranslator(translate_config)
|
translator = MDTranslator(translator_config)
|
||||||
translator.translate(document_md)
|
translator.translate(document_md)
|
||||||
self.document_translated = document_md
|
self.document_translated = document_md
|
||||||
return self
|
return self
|
||||||
|
|
||||||
async def translate_async(self, convert_engin: Literal["mineru", "docling"] | None,
|
async def translate_async(self) -> Self:
|
||||||
convert_config: x2md_convert_config_type | None,
|
convert_engin,convert_config=self.convert_engine,self.converter_config
|
||||||
translate_config: MDTranslateConfig) -> Self:
|
translator_config=self.translator_config
|
||||||
|
|
||||||
document_md = await asyncio.to_thread(self._get_document_md, convert_engin, convert_config)
|
document_md = await asyncio.to_thread(self._get_document_md, convert_engin, convert_config)
|
||||||
# 翻译解析后文件
|
# 翻译解析后文件
|
||||||
translator = MDTranslator(translate_config)
|
translator = MDTranslator(translator_config)
|
||||||
await translator.translate_async(document_md)
|
await translator.translate_async(document_md)
|
||||||
self.document_translated = document_md
|
self.document_translated = document_md
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def export_to_html(self, export_config: MD2HTMLExportConfig | None = None) -> str:
|
def export_to_html(self, export_config: MD2HTMLExporterConfig | None = None) -> str:
|
||||||
|
export_config=export_config or self.md2html_exporter_config
|
||||||
docu = self._export(MD2HTMLExporter(export_config))
|
docu = self._export(MD2HTMLExporter(export_config))
|
||||||
return docu.content.decode()
|
return docu.content.decode()
|
||||||
|
|
||||||
def export_to_markdown(self, export_config: MD2MDExportConfig | None = None) -> str:
|
def export_to_markdown(self, export_config: X2MarkdownConverterConfig | None = None) -> str:
|
||||||
docu = self._export(MD2MDExporter())
|
docu = self._export(MD2MDExporter())
|
||||||
return docu.content.decode()
|
return docu.content.decode()
|
||||||
|
|
||||||
def export_to_markdown_zip(self, export_config: MD2MDZIPExportConfig | None = None) -> bytes:
|
def export_to_markdown_zip(self, export_config: X2MarkdownConverterConfig | None = None) -> bytes:
|
||||||
docu = self._export(MD2MDZipExporter())
|
docu = self._export(MD2MDZipExporter())
|
||||||
return docu.content
|
return docu.content
|
||||||
|
|
||||||
def save_as_html(self, name: str = None, output_dir: Path | str = "./output",
|
def save_as_html(self, name: str = None, output_dir: Path | str = "./output",
|
||||||
export_config: MD2HTMLExportConfig | None = None) -> Self:
|
export_config: MD2HTMLExporterConfig | None = None) -> Self:
|
||||||
self._save(exporter=MD2HTMLExporter(), name=name, output_dir=output_dir)
|
export_config = export_config or self.md2html_exporter_config
|
||||||
|
self._save(exporter=MD2HTMLExporter(config=export_config), name=name, output_dir=output_dir)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def save_as_markdown(self, name: str = None, output_dir: Path | str = "./output",
|
def save_as_markdown(self, name: str = None, output_dir: Path | str = "./output",
|
||||||
export_config: MD2MDExportConfig | None = None) -> Self:
|
export_config=None) -> Self:
|
||||||
|
|
||||||
self._save(exporter=MD2MDExporter(), name=name, output_dir=output_dir)
|
self._save(exporter=MD2MDExporter(), name=name, output_dir=output_dir)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def save_as_markdown_zip(self, name: str = None, output_dir: Path | str = "./output",
|
def save_as_markdown_zip(self, name: str = None, output_dir: Path | str = "./output",
|
||||||
export_config: MD2MDZIPExportConfig | None = None) -> Self:
|
export_config=None) -> Self:
|
||||||
|
|
||||||
self._save(exporter=MD2MDZipExporter(), name=name, output_dir=output_dir)
|
self._save(exporter=MD2MDZipExporter(), name=name, output_dir=output_dir)
|
||||||
return self
|
return self
|
||||||
|
|||||||
@@ -1,17 +1,17 @@
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Self
|
from typing import Self
|
||||||
|
|
||||||
from docutranslate.exporter.txt2x.txt2html_exporter import TXT2HTMLExportConfig, TXT2HTMLExporter
|
from docutranslate.exporter.txt.txt2html_exporter import TXT2HTMLExporterConfig, TXT2HTMLExporter
|
||||||
from docutranslate.exporter.txt2x.txt2txt_exporter import TXT2TXTExporter
|
from docutranslate.exporter.txt.txt2txt_exporter import TXT2TXTExporter
|
||||||
from docutranslate.workflow.base_workflow import BaseWorkflow
|
from docutranslate.workflow.base import Workflow
|
||||||
from docutranslate.workflow.interfaces import HTMLExportable, TXTExportable
|
from docutranslate.workflow.interfaces import HTMLExportable, TXTExportable
|
||||||
from docutranslate.translater.txt_translator import TXTTranslateConfig, TXTTranslator
|
from docutranslate.translator.ai_translator.txt_translator import TXTTranslatorConfig, TXTTranslator
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class TXTWorkflow(BaseWorkflow, HTMLExportable, TXTExportable):
|
class TXTWorkflow(Workflow, HTMLExportable, TXTExportable):
|
||||||
|
|
||||||
def translate(self, translate_config: TXTTranslateConfig) -> Self:
|
def translate(self, translate_config: TXTTranslatorConfig) -> Self:
|
||||||
document = self.document_original.copy()
|
document = self.document_original.copy()
|
||||||
# 翻译解析后文件
|
# 翻译解析后文件
|
||||||
translator = TXTTranslator(translate_config)
|
translator = TXTTranslator(translate_config)
|
||||||
@@ -19,7 +19,7 @@ class TXTWorkflow(BaseWorkflow, HTMLExportable, TXTExportable):
|
|||||||
self.document_translated = document
|
self.document_translated = document
|
||||||
return self
|
return self
|
||||||
|
|
||||||
async def translate_async(self, translate_config: TXTTranslateConfig) -> Self:
|
async def translate_async(self, translate_config: TXTTranslatorConfig) -> Self:
|
||||||
document = self.document_original.copy()
|
document = self.document_original.copy()
|
||||||
# 翻译解析后文件
|
# 翻译解析后文件
|
||||||
translator = TXTTranslator(translate_config)
|
translator = TXTTranslator(translate_config)
|
||||||
@@ -27,7 +27,7 @@ class TXTWorkflow(BaseWorkflow, HTMLExportable, TXTExportable):
|
|||||||
self.document_translated = document
|
self.document_translated = document
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def export_to_html(self, export_config: TXT2HTMLExportConfig=None) -> str:
|
def export_to_html(self, export_config: TXT2HTMLExporterConfig=None) -> str:
|
||||||
docu = self._export(TXT2HTMLExporter(export_config))
|
docu = self._export(TXT2HTMLExporter(export_config))
|
||||||
return docu.content.decode()
|
return docu.content.decode()
|
||||||
|
|
||||||
@@ -36,7 +36,7 @@ class TXTWorkflow(BaseWorkflow, HTMLExportable, TXTExportable):
|
|||||||
return docu.content.decode()
|
return docu.content.decode()
|
||||||
|
|
||||||
def save_as_html(self, name: str = None, output_dir: Path | str = "./output",
|
def save_as_html(self, name: str = None, output_dir: Path | str = "./output",
|
||||||
export_config: TXT2HTMLExportConfig | None = None) -> Self:
|
export_config: TXT2HTMLExporterConfig | None = None) -> Self:
|
||||||
self._save(exporter=TXT2HTMLExporter(export_config), name=name, output_dir=output_dir)
|
self._save(exporter=TXT2HTMLExporter(export_config), name=name, output_dir=output_dir)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user