update
This commit is contained in:
@@ -1,5 +1,6 @@
|
|||||||
# SPDX-FileCopyrightText: 2025 QinHan
|
# SPDX-FileCopyrightText: 2025 QinHan
|
||||||
# SPDX-License-Identifier: MPL-2.0
|
# SPDX-License-Identifier: MPL-2.0
|
||||||
|
# docutranslate.app.py
|
||||||
import asyncio
|
import asyncio
|
||||||
import base64
|
import base64
|
||||||
import binascii
|
import binascii
|
||||||
|
|||||||
@@ -1,4 +1,6 @@
|
|||||||
# --- 新增: 通用 Workflow 工厂函数 ---
|
# SPDX-FileCopyrightText: 2025 QinHan
|
||||||
|
# SPDX-License-Identifier: MPL-2.0
|
||||||
|
# docutranslate.core.factory.py
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
|
|||||||
320
docutranslate/sdk.py
Normal file
320
docutranslate/sdk.py
Normal file
@@ -0,0 +1,320 @@
|
|||||||
|
# SPDX-FileCopyrightText: 2025 QinHan
|
||||||
|
# SPDX-License-Identifier: MPL-2.0
|
||||||
|
# docutranslate/sdk.py
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import base64
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional, Literal, Dict, Any, List, Union
|
||||||
|
|
||||||
|
from pydantic import TypeAdapter
|
||||||
|
|
||||||
|
from docutranslate.core.schemas import TranslatePayload, GlossaryAgentConfigPayload
|
||||||
|
from docutranslate.core.factory import create_workflow_from_payload
|
||||||
|
from docutranslate.translator import default_params
|
||||||
|
from docutranslate.global_values.conditional_import import DOCLING_EXIST
|
||||||
|
|
||||||
|
# --- Type definitions ---
WorkflowType = Literal[
    "auto",
    "markdown_based",
    "txt",
    "json",
    "xlsx",
    "docx",
    "srt",
    "epub",
    "html",
    "ass",
    "pptx",
]
ThinkingMode = Literal["default", "enable", "disable"]
InsertMode = Literal["replace", "append", "prepend"]
ProviderType = Literal[
    "ollama",
    "open.bigmodel.cn",
    "dashscope.aliyuncs.com",
    "ark.cn-beijing.volces.com",
    "generativelanguage.googleapis.com",
    "api.siliconflow.cn",
    "api.302.ai",
]

# --- Mapping configuration ---
# Workflow type -> names of the workflow methods looked up for the "save"
# and "export" actions. Every entry follows the save_as_<suffix> /
# export_to_<suffix> naming scheme; only "markdown_based" uses a suffix
# that differs from its workflow type.
_WORKFLOW_MAPPINGS = {
    workflow: {"save": f"save_as_{suffix}", "export": f"export_to_{suffix}"}
    for workflow, suffix in (
        ("markdown_based", "markdown_zip"),
        ("docx", "docx"),
        ("xlsx", "xlsx"),
        ("pptx", "pptx"),
        ("epub", "epub"),
        ("txt", "txt"),
        ("json", "json"),
        ("srt", "srt"),
        ("ass", "ass"),
        ("html", "html"),
    )
}
|
||||||
|
|
||||||
|
|
||||||
|
class TranslationResult:
    """
    Wrap a workflow that has been run and expose its output.

    The workflow type selects, through ``_WORKFLOW_MAPPINGS``, the names of the
    workflow methods that :meth:`save` and :meth:`export` delegate to.
    """

    def __init__(self, workflow: Any, workflow_type: str, original_filename: str):
        """
        :param workflow: Workflow instance whose result should be exposed.
        :param workflow_type: Key looked up in ``_WORKFLOW_MAPPINGS``; an
            unknown key leaves the result without save/export support.
        :param original_filename: Name of the translated source file.
        """
        self._workflow = workflow
        self._workflow_type = workflow_type
        # Kept so callers can derive output names from the input file.
        self._original_filename = original_filename
        self._mapping = _WORKFLOW_MAPPINGS.get(workflow_type)

    def _resolve_method(self, action: str):
        """
        Return the bound workflow method registered for ``action``.

        :param action: ``"save"`` or ``"export"``.
        :raises AttributeError: if the workflow lacks the mapped method.
        """
        method = self._mapping[action]
        if not hasattr(self._workflow, method):
            raise AttributeError(f"Workflow 缺少方法 {method}")
        return getattr(self._workflow, method)

    def save(self, output_dir: str = "./output", name: Optional[str] = None) -> str:
        """
        Save the result to the file system.

        :param output_dir: Output directory; created if it does not exist.
        :param name: File name (e.g. ``'result.docx'``). When ``None`` the
            workflow picks the name itself.
        :return: ``output_dir`` joined with ``name`` when a name was given,
            otherwise a descriptive placeholder string (for reference only).
        :raises ValueError: if the workflow type has no save mapping.
        :raises AttributeError: if the workflow lacks the mapped save method.
        """
        if not self._mapping:
            raise ValueError(f"工作流 {self._workflow_type} 不支持自动保存")

        # Resolve the method before touching the file system so that an
        # unsupported workflow does not leave an empty directory behind.
        saver = self._resolve_method("save")

        Path(output_dir).mkdir(parents=True, exist_ok=True)
        saver(name=name, output_dir=output_dir)
        if name:
            return os.path.join(output_dir, name)
        return f"{output_dir} (Auto named)"

    def export(self) -> str:
        """
        Export the result as a Base64-encoded string.

        Intended for API transport or other cases where nothing should be
        written to disk. Text content is UTF-8 encoded before Base64 encoding.

        :raises ValueError: if the workflow type has no export mapping.
        :raises AttributeError: if the workflow lacks the mapped export method.
        """
        if not self._mapping:
            raise ValueError(f"工作流 {self._workflow_type} 不支持导出")

        content = self._resolve_method("export")()
        if isinstance(content, str):
            content = content.encode('utf-8')
        return base64.b64encode(content).decode('utf-8')

    @property
    def workflow(self):
        """Underlying workflow instance, for access to advanced features (e.g. attachments)."""
        return self._workflow

    @property
    def original_filename(self) -> str:
        """Name of the source file this result was produced from."""
        return self._original_filename
|
||||||
|
|
||||||
|
|
||||||
|
class DocuTranslate:
    """
    DocuTranslate SDK.

    Holds instance-wide default parameters and runs a translation workflow
    for a single file through :meth:`translate` or :meth:`translate_async`.
    """

    # File extension (without dot, lower case) -> workflow type used when
    # ``workflow_type`` is "auto". Unlisted extensions fall back to "txt".
    _SUFFIX_TO_WORKFLOW = {
        "md": "markdown_based",
        "pdf": "markdown_based",
        "png": "markdown_based",
        "jpg": "markdown_based",
        "jpeg": "markdown_based",
        "xlsx": "xlsx",
        "csv": "xlsx",
        "xls": "xlsx",
        "docx": "docx",
        "doc": "docx",
        "html": "html",
        "htm": "html",
        "pptx": "pptx",
        "ppt": "pptx",
        "txt": "txt",
        "json": "json",
        "srt": "srt",
        "epub": "epub",
        "ass": "ass",
    }

    # Suffixes routed to "markdown_based" that are not text and therefore
    # get a converting engine instead of "identity" by default.
    _CONVERTED_SUFFIXES = frozenset({".pdf", ".png", ".jpg", ".jpeg"})

    def __init__(
            self,
            api_key: Optional[str] = None,
            base_url: Optional[str] = None,
            model_id: Optional[str] = None,
            to_lang: str = "中文",
            concurrent: int = default_params["concurrent"],
            timeout: int = default_params["timeout"],
            retry: int = default_params["retry"],
            thinking: ThinkingMode = default_params["thinking"],
            system_proxy_enable: bool = default_params["system_proxy_enable"],
            convert_engine: Optional[Literal["mineru", "docling", "identity", "mineru_deploy"]] = None,
            mineru_token: Optional[str] = None,
            **kwargs
    ):
        """
        Initialise the SDK instance.

        The parameters given here (including any extra ``kwargs``) become
        instance-wide defaults; each can be overridden per call in
        :meth:`translate` / :meth:`translate_async`. ``None`` values are
        dropped so they never shadow a lower-priority default.
        """
        candidates = {
            "api_key": api_key, "base_url": base_url, "model_id": model_id,
            "to_lang": to_lang, "concurrent": concurrent, "timeout": timeout,
            "retry": retry, "thinking": thinking,
            "system_proxy_enable": system_proxy_enable,
            "convert_engine": convert_engine, "mineru_token": mineru_token,
            **kwargs
        }
        self.defaults = {k: v for k, v in candidates.items() if v is not None}

    def translate(
            self,
            file_path: str,
            *,
            # --- Every parameter is listed explicitly so IDEs can offer completion ---
            api_key: Optional[str] = None,
            base_url: Optional[str] = None,
            model_id: Optional[str] = None,
            to_lang: Optional[str] = None,
            workflow_type: Optional[WorkflowType] = None,
            skip_translate: Optional[bool] = None,
            concurrent: Optional[int] = None,
            chunk_size: Optional[int] = None,
            temperature: Optional[float] = None,
            timeout: Optional[int] = None,
            retry: Optional[int] = None,
            thinking: Optional[ThinkingMode] = None,
            custom_prompt: Optional[str] = None,
            system_proxy_enable: Optional[bool] = None,
            force_json: Optional[bool] = None,
            rpm: Optional[int] = None,
            tpm: Optional[int] = None,
            provider: Optional[Union[ProviderType, str]] = None,
            insert_mode: Optional[InsertMode] = None,
            separator: Optional[str] = None,
            segment_mode: Optional[Literal["line", "paragraph", "none"]] = None,
            translate_regions: Optional[List[str]] = None,
            convert_engine: Optional[Literal["mineru", "docling", "identity", "mineru_deploy"]] = None,
            mineru_token: Optional[str] = None,
            model_version: Optional[Literal["pipeline", "vlm"]] = None,
            formula_ocr: Optional[bool] = None,
            code_ocr: Optional[bool] = None,
            mineru_deploy_base_url: Optional[str] = None,
            mineru_deploy_backend: Optional[str] = None,
            mineru_deploy_formula_enable: Optional[bool] = None,
            mineru_deploy_start_page_id: Optional[int] = None,
            mineru_deploy_end_page_id: Optional[int] = None,
            mineru_deploy_lang_list: Optional[List[str]] = None,
            mineru_deploy_server_url: Optional[str] = None,
            json_paths: Optional[List[str]] = None,
            glossary_generate_enable: Optional[bool] = None,
            glossary_dict: Optional[Dict[str, str]] = None,
            glossary_agent_config: Optional[Union[GlossaryAgentConfigPayload, Dict[str, Any]]] = None,
            **kwargs
    ) -> TranslationResult:
        """
        Run a translation synchronously.

        Thin wrapper that forwards every argument to :meth:`translate_async`
        (see it for parameter documentation) and drives it with
        ``asyncio.run``. Because of that it cannot be called from code that is
        already running inside an event loop; await :meth:`translate_async`
        there instead.
        """
        # Must be the first statement: snapshot the arguments before any
        # other local name exists.
        forwarded = dict(locals())
        forwarded.pop("self")
        # Flatten the extras so they are passed as individual keywords
        # rather than as a single ``kwargs=...`` argument.
        forwarded.update(forwarded.pop("kwargs"))

        return asyncio.run(self.translate_async(**forwarded))

    async def translate_async(
            self,
            file_path: str,
            *,
            # --- Core overrides ---
            api_key: Optional[str] = None,
            base_url: Optional[str] = None,
            model_id: Optional[str] = None,
            to_lang: Optional[str] = None,

            # --- Flow control ---
            workflow_type: Optional[WorkflowType] = None,
            skip_translate: Optional[bool] = None,

            # --- LLM parameters ---
            concurrent: Optional[int] = None,
            chunk_size: Optional[int] = None,
            temperature: Optional[float] = None,
            timeout: Optional[int] = None,
            retry: Optional[int] = None,
            thinking: Optional[ThinkingMode] = None,
            custom_prompt: Optional[str] = None,
            system_proxy_enable: Optional[bool] = None,
            force_json: Optional[bool] = None,
            rpm: Optional[int] = None,
            tpm: Optional[int] = None,
            provider: Optional[Union[ProviderType, str]] = None,

            # --- Format parameters (Docx/Excel/Txt) ---
            insert_mode: Optional[InsertMode] = None,
            separator: Optional[str] = None,
            segment_mode: Optional[Literal["line", "paragraph", "none"]] = None,
            translate_regions: Optional[List[str]] = None,

            # --- Parsing engine (PDF/OCR) ---
            convert_engine: Optional[Literal["mineru", "docling", "identity", "mineru_deploy"]] = None,
            mineru_token: Optional[str] = None,
            model_version: Optional[Literal["pipeline", "vlm"]] = None,
            formula_ocr: Optional[bool] = None,
            code_ocr: Optional[bool] = None,

            # --- Locally deployed Mineru parameters ---
            mineru_deploy_base_url: Optional[str] = None,
            mineru_deploy_backend: Optional[str] = None,
            mineru_deploy_formula_enable: Optional[bool] = None,
            mineru_deploy_start_page_id: Optional[int] = None,
            mineru_deploy_end_page_id: Optional[int] = None,
            mineru_deploy_lang_list: Optional[List[str]] = None,
            mineru_deploy_server_url: Optional[str] = None,

            # --- JSON / glossary ---
            json_paths: Optional[List[str]] = None,
            glossary_generate_enable: Optional[bool] = None,
            glossary_dict: Optional[Dict[str, str]] = None,
            glossary_agent_config: Optional[Union[GlossaryAgentConfigPayload, Dict[str, Any]]] = None,

            **kwargs
    ) -> TranslationResult:
        """
        Run a translation task asynchronously.

        Parameters are resolved in increasing priority: ``default_params``,
        the instance defaults given to ``__init__``, then the arguments of
        this call. An argument left as ``None`` counts as "not given" and
        does not override a lower-priority value.

        :param file_path: Path of the input file (required).
        :param workflow_type: Workflow type. When not given anywhere, or when
            "auto", it is detected from the file extension.
        :param skip_translate: If True, only parse/OCR without calling the
            LLM. Defaults to False when not given anywhere.
        :param concurrent: Number of concurrent LLM requests.
        :param json_paths: [JSON only] list of JsonPath expressions
            (e.g. '$.data.*').
        :param translate_regions: [Excel only] regions to translate
            (e.g. 'Sheet1!A1:B10').
        :param insert_mode: [Docx/Xlsx/Txt] how the translation is inserted
            (replace, append, prepend).
        :param convert_engine: [PDF/OCR] parsing engine (mineru, docling).
        :param mineru_token: [Mineru Cloud] API token.
        :param mineru_deploy_base_url: [Mineru Local] local service address.
        :raises FileNotFoundError: if ``file_path`` does not exist.
        :raises ValueError: if the merged parameters fail payload validation;
            the original validation error is chained as the cause.
        """
        # 1. Collect the arguments of this call. Must be the first statement
        #    so the snapshot contains nothing but the parameters.
        supplied = dict(locals())
        for reserved in ("self", "file_path", "kwargs"):
            supplied.pop(reserved)
        supplied.update(kwargs)
        call_params = {k: v for k, v in supplied.items() if v is not None}

        # 2. Layered merge: library defaults < instance defaults < this call.
        final_params = {**default_params, **self.defaults, **call_params}
        # Restore the effective values the former "auto" / False signature
        # defaults provided when no layer supplies them.
        final_params.setdefault("workflow_type", "auto")
        final_params.setdefault("skip_translate", False)

        # 3. Input file check.
        path_obj = Path(file_path)
        if not path_obj.exists():
            raise FileNotFoundError(f"文件不存在: {file_path}")

        # 4. Detect the workflow type from the extension when left on "auto".
        if final_params["workflow_type"] == "auto":
            final_params["workflow_type"] = self._detect_workflow(path_obj)

        # 5. Fill in workflow-specific defaults.
        self._patch_defaults(final_params, path_obj)

        # 6. Pydantic validation.
        try:
            payload = TypeAdapter(TranslatePayload).validate_python(final_params)
        except Exception as e:
            raise ValueError(f"参数配置校验失败: {e}") from e

        # 7. Build the workflow.
        workflow = create_workflow_from_payload(payload)

        # 8. Run it.
        workflow.read_path(str(path_obj))
        await workflow.translate_async()

        return TranslationResult(workflow, final_params["workflow_type"], path_obj.name)

    def _detect_workflow(self, path: Path) -> str:
        """
        Map a file's extension to a workflow type.

        The extension is compared case-insensitively; unknown or missing
        extensions fall back to "txt".
        """
        ext = path.suffix.lower().lstrip(".")
        return DocuTranslate._SUFFIX_TO_WORKFLOW.get(ext, "txt")

    def _patch_defaults(self, params: Dict[str, Any], path: Path) -> None:
        """
        Fill in workflow-specific defaults in ``params`` (modified in place).

        - "json" workflow without ``json_paths``: select every node ('$..*').
        - "markdown_based" workflow without ``convert_engine``: PDFs and
          images get "docling" when it is installed, otherwise "mineru";
          any other file gets "identity".
        """
        wf = params.get("workflow_type")
        if wf == "json" and not params.get("json_paths"):
            params["json_paths"] = ["$..*"]
        if wf == "markdown_based" and "convert_engine" not in params:
            if path.suffix.lower() in DocuTranslate._CONVERTED_SUFFIXES:
                params["convert_engine"] = "docling" if DOCLING_EXIST else "mineru"
            else:
                # NOTE(review): assumes "identity" passes text input through
                # unchanged, so it is only a sensible default for files that
                # are already text — confirm against the converter.
                params["convert_engine"] = "identity"
|
||||||
|
|
||||||
Reference in New Issue
Block a user