From ac6fcebe2454a909b44dfd9e8ea7a102a2d56cba Mon Sep 17 00:00:00 2001 From: xunbu Date: Wed, 24 Sep 2025 10:37:42 +0800 Subject: [PATCH] =?UTF-8?q?=E5=89=8D=E5=90=8E=E7=AB=AF=E5=A2=9E=E5=8A=A0sy?= =?UTF-8?q?stem=5Fproxy=5Fenable=E9=80=89=E9=A1=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docutranslate/agents/agent.py | 129 +- docutranslate/app.py | 1446 ++++++++++++----- docutranslate/global_values/__init__.py | 2 +- docutranslate/static/index.html | 2 +- .../translator/ai_translator/base.py | 29 +- docutranslate/utils/utils.py | 3 + 6 files changed, 1106 insertions(+), 505 deletions(-) diff --git a/docutranslate/agents/agent.py b/docutranslate/agents/agent.py index a276290..1cf948a 100644 --- a/docutranslate/agents/agent.py +++ b/docutranslate/agents/agent.py @@ -112,14 +112,14 @@ def extract_token_info(response_data: dict) -> tuple[int, int, int, int]: # 尝试从不同格式获取cached_tokens # 格式1: input_tokens_details.cached_tokens if ( - "input_tokens_details" in usage - and "cached_tokens" in usage["input_tokens_details"] + "input_tokens_details" in usage + and "cached_tokens" in usage["input_tokens_details"] ): cached_tokens = usage["input_tokens_details"]["cached_tokens"] # 格式2: prompt_tokens_details.cached_tokens elif ( - "prompt_tokens_details" in usage - and "cached_tokens" in usage["prompt_tokens_details"] + "prompt_tokens_details" in usage + and "cached_tokens" in usage["prompt_tokens_details"] ): cached_tokens = usage["prompt_tokens_details"]["cached_tokens"] # 格式3: prompt_cache_hit_tokens (直接在usage下) @@ -129,14 +129,14 @@ def extract_token_info(response_data: dict) -> tuple[int, int, int, int]: # 尝试从不同格式获取reasoning_tokens # 格式1: output_tokens_details.reasoning_tokens if ( - "output_tokens_details" in usage - and "reasoning_tokens" in usage["output_tokens_details"] + "output_tokens_details" in usage + and "reasoning_tokens" in usage["output_tokens_details"] ): reasoning_tokens = usage["output_tokens_details"]["reasoning_tokens"] # 格式2: completion_tokens_details.reasoning_tokens elif ( - "completion_tokens_details" in usage - and "reasoning_tokens" in usage["completion_tokens_details"] + "completion_tokens_details" in usage + and "reasoning_tokens" in usage["completion_tokens_details"] ): reasoning_tokens = usage["completion_tokens_details"]["reasoning_tokens"] return input_tokens, cached_tokens, output_tokens, reasoning_tokens @@ -156,11 +156,11 @@ class TokenCounter: self.logger = logger def add( - self, - input_tokens: int, - cached_tokens: int, - output_tokens: int, - reasoning_tokens: int, + self, + input_tokens: int, + cached_tokens: int, + output_tokens: int, + reasoning_tokens: int, ): with self.lock: self.input_tokens += input_tokens @@ -249,7 +249,8 @@ class Agent: self.retry = config.retry - self.system_proxy_enable=config.system_proxy_enable + self.system_proxy_enable = config.system_proxy_enable + def _add_thinking_mode(self, data: dict): if self.domain not in self._think_factory: return @@ -260,7 +261,7 @@ class Agent: data[field_thinking] = val_disable def _prepare_request_data( - self, prompt: str, system_prompt: str, temperature=None, top_p=0.9 + self, prompt: str, system_prompt: str, temperature=None, top_p=0.9 ): if temperature is None: temperature = self.temperature @@ -282,16 +283,16 @@ class Agent: return headers, data async def send_async( - self, - client: httpx.AsyncClient, - prompt: str, - system_prompt: None | str = None, - retry=True, - retry_count=0, - pre_send_handler: PreSendHandlerType = None, - result_handler: ResultHandlerType = None, - error_result_handler: ErrorResultHandlerType = None, - best_partial_result: dict | None = None, + self, + client: httpx.AsyncClient, + prompt: str, + system_prompt: None | str = None, + retry=True, + retry_count=0, + pre_send_handler: PreSendHandlerType = None, + result_handler: ResultHandlerType = None, + error_result_handler: ErrorResultHandlerType = None, + best_partial_result: dict | None = None, ) -> Any: if system_prompt is None: system_prompt = self.system_prompt @@ -432,24 +433,24 @@ class Agent: ) async def send_prompts_async( - self, - prompts: list[str], - system_prompt: str | None = None, - max_concurrent: int | None = None, - pre_send_handler: PreSendHandlerType = None, - result_handler: ResultHandlerType = None, - error_result_handler: ErrorResultHandlerType = None, + self, + prompts: list[str], + system_prompt: str | None = None, + max_concurrent: int | None = None, + pre_send_handler: PreSendHandlerType = None, + result_handler: ResultHandlerType = None, + error_result_handler: ErrorResultHandlerType = None, ) -> list[Any]: max_concurrent = ( self.max_concurrent if max_concurrent is None else max_concurrent ) total = len(prompts) self.logger.info( - f"base-url:{self.baseurl},model-id:{self.model_id},concurrent:{max_concurrent},temperature:{self.temperature}" + f"base-url:{self.baseurl},model-id:{self.model_id},concurrent:{max_concurrent},temperature:{self.temperature},system_proxy:{self.system_proxy_enable}" ) self.logger.info(f"预计发送{total}个请求,并发请求数:{max_concurrent}") self.total_error_counter.max_errors_count = ( - len(prompts) // MAX_REQUESTS_PER_ERROR + len(prompts) // MAX_REQUESTS_PER_ERROR ) # 新增:在每次批量发送前重置计数器 @@ -469,7 +470,7 @@ class Agent: ) async with httpx.AsyncClient( - trust_env=False, proxies=proxies, verify=False, limits=limits + trust_env=False, proxies=proxies, verify=False, limits=limits ) as client: async def send_with_semaphore(p_text: str): @@ -500,7 +501,7 @@ class Agent: # 新增:打印token使用统计 token_stats = self.token_counter.get_stats() - if token_stats['input_tokens'] < 0: + if token_stats["input_tokens"] < 0: self.logger.info("Token统计失败") else: self.logger.info( @@ -512,16 +513,16 @@ class Agent: return results def send( - self, - client: httpx.Client, - prompt: str, - system_prompt: None | str = None, - retry=True, - retry_count=0, - pre_send_handler=None, - result_handler=None, - error_result_handler=None, - best_partial_result: dict | None = None, + self, + client: httpx.Client, + prompt: str, + system_prompt: None | str = None, + retry=True, + retry_count=0, + pre_send_handler=None, + result_handler=None, + error_result_handler=None, + best_partial_result: dict | None = None, ) -> Any: if system_prompt is None: system_prompt = self.system_prompt @@ -658,14 +659,14 @@ class Agent: ) def _send_prompt_count( - self, - client: httpx.Client, - prompt: str, - system_prompt: None | str, - count: PromptsCounter, - pre_send_handler, - result_handler, - error_result_handler, + self, + client: httpx.Client, + prompt: str, + system_prompt: None | str, + count: PromptsCounter, + pre_send_handler, + result_handler, + error_result_handler, ) -> Any: result = self.send( client, @@ -679,21 +680,21 @@ class Agent: return result def send_prompts( - self, - prompts: list[str], - system_prompt: str | None = None, - pre_send_handler: PreSendHandlerType = None, - result_handler: ResultHandlerType = None, - error_result_handler: ErrorResultHandlerType = None, + self, + prompts: list[str], + system_prompt: str | None = None, + pre_send_handler: PreSendHandlerType = None, + result_handler: ResultHandlerType = None, + error_result_handler: ErrorResultHandlerType = None, ) -> list[Any]: self.logger.info( - f"base-url:{self.baseurl},model-id:{self.model_id},concurrent:{self.max_concurrent},temperature:{self.temperature}" + f"base-url:{self.baseurl},model-id:{self.model_id},concurrent:{self.max_concurrent},temperature:{self.temperature},system_proxy:{self.system_proxy_enable}" ) self.logger.info( f"预计发送{len(prompts)}个请求,并发请求数:{self.max_concurrent}" ) self.total_error_counter.max_errors_count = ( - len(prompts) // MAX_REQUESTS_PER_ERROR + len(prompts) // MAX_REQUESTS_PER_ERROR ) # 新增:在每次批量发送前重置计数器 @@ -714,7 +715,7 @@ class Agent: ) proxies = get_httpx_proxies() if self.system_proxy_enable else None with httpx.Client( - trust_env=False, proxies=proxies, verify=False, limits=limits + trust_env=False, proxies=proxies, verify=False, limits=limits ) as client: clients = itertools.repeat(client, len(prompts)) with ThreadPoolExecutor(max_workers=self.max_concurrent) as executor: @@ -737,7 +738,7 @@ class Agent: # 新增:打印token使用统计 token_stats = self.token_counter.get_stats() - if token_stats['input_tokens'] < 0: + if token_stats["input_tokens"] < 0: self.logger.info("Token统计失败") else: self.logger.info( diff --git a/docutranslate/app.py b/docutranslate/app.py index 01461ea..7a33340 100644 --- a/docutranslate/app.py +++ b/docutranslate/app.py @@ -12,12 +12,26 @@ import time import uuid from contextlib import asynccontextmanager, closing from pathlib import Path -from typing import List, Dict, Any, Optional, Literal, Union, Annotated, TYPE_CHECKING, Type +from typing import ( + List, + Dict, + Any, + Optional, + Literal, + Union, + Annotated, + TYPE_CHECKING, + Type, +) import httpx import uvicorn from fastapi import FastAPI, HTTPException, APIRouter, Body, Path as FastApiPath -from fastapi.openapi.docs import get_swagger_ui_html, get_swagger_ui_oauth2_redirect_html, get_redoc_html +from fastapi.openapi.docs import ( + get_swagger_ui_html, + get_swagger_ui_oauth2_redirect_html, + get_redoc_html, +) from fastapi.responses import HTMLResponse, JSONResponse, FileResponse from fastapi.staticfiles import StaticFiles from pydantic import BaseModel, Field, field_validator, model_validator, AliasChoices @@ -26,6 +40,8 @@ from docutranslate import __version__ from docutranslate.agents.agent import ThinkingMode from docutranslate.agents.glossary_agent import GlossaryAgentConfig from docutranslate.exporter.md.types import ConvertEngineType +from docutranslate.global_values import USE_PROXY + # --- 核心代码 Imports --- from docutranslate.global_values.conditional_import import DOCLING_EXIST from docutranslate.workflow.ass_workflow import AssWorkflow, AssWorkflowConfig @@ -34,10 +50,21 @@ from docutranslate.workflow.docx_workflow import DocxWorkflow, DocxWorkflowConfi from docutranslate.workflow.epub_workflow import EpubWorkflow, EpubWorkflowConfig from docutranslate.workflow.html_workflow import HtmlWorkflow, HtmlWorkflowConfig from docutranslate.workflow.interfaces import DocxExportable, EpubExportable -from docutranslate.workflow.interfaces import HTMLExportable, MDFormatsExportable, TXTExportable, JsonExportable, \ - XlsxExportable, SrtExportable, CsvExportable, AssExportable +from docutranslate.workflow.interfaces import ( + HTMLExportable, + MDFormatsExportable, + TXTExportable, + JsonExportable, + XlsxExportable, + SrtExportable, + CsvExportable, + AssExportable, +) from docutranslate.workflow.json_workflow import JsonWorkflow, JsonWorkflowConfig -from docutranslate.workflow.md_based_workflow import MarkdownBasedWorkflow, MarkdownBasedWorkflowConfig +from docutranslate.workflow.md_based_workflow import ( + MarkdownBasedWorkflow, + MarkdownBasedWorkflowConfig, +) from docutranslate.workflow.srt_workflow import SrtWorkflow, SrtWorkflowConfig from docutranslate.workflow.txt_workflow import TXTWorkflow, TXTWorkflowConfig from docutranslate.workflow.xlsx_workflow import XlsxWorkflow, XlsxWorkflowConfig @@ -62,6 +89,7 @@ from docutranslate.exporter.epub.epub2html_exporter import Epub2HTMLExporterConf from docutranslate.translator.ai_translator.html_translator import HtmlTranslatorConfig from docutranslate.translator.ai_translator.ass_translator import AssTranslatorConfig from docutranslate.exporter.ass.ass2html_exporter import Ass2HTMLExporterConfig + # ------------------------------------ from docutranslate.logger import global_logger @@ -108,11 +136,15 @@ MEDIA_TYPES = { def _create_default_task_state() -> Dict[str, Any]: """创建新的默认任务状态,存储 workflow 实例而不是具体内容""" return { - "is_processing": False, "status_message": "空闲", "error_flag": False, + "is_processing": False, + "status_message": "空闲", + "error_flag": False, "download_ready": False, "workflow_instance": None, # 仅在处理期间使用 - "original_filename_stem": None, "task_start_time": 0, - "task_end_time": 0, "current_task_ref": None, + "original_filename_stem": None, + "task_start_time": 0, + "task_end_time": 0, + "current_task_ref": None, "original_filename": None, "temp_dir": None, # 用于存储临时文件的目录 "downloadable_files": {}, # 存储可下载文件的路径和名称 @@ -122,7 +154,13 @@ def _create_default_task_state() -> Dict[str, Any]: # --- 日志处理器 --- class QueueAndHistoryHandler(logging.Handler): - def __init__(self, queue_ref: asyncio.Queue, history_list_ref: List[str], max_history_items: int, task_id: str): + def __init__( + self, + queue_ref: asyncio.Queue, + history_list_ref: List[str], + max_history_items: int, + task_id: str, + ): super().__init__() self.queue = queue_ref self.history_list = history_list_ref @@ -134,7 +172,7 @@ class QueueAndHistoryHandler(logging.Handler): print(f"[{self.task_id}] {log_entry}") self.history_list.append(log_entry) if len(self.history_list) > self.max_history: - del self.history_list[:len(self.history_list) - self.max_history] + del self.history_list[: len(self.history_list) - self.max_history] if self.queue is not None: try: main_loop = getattr(app.state, "main_event_loop", None) @@ -145,7 +183,9 @@ class QueueAndHistoryHandler(logging.Handler): except asyncio.QueueFull: print(f"[{self.task_id}] Log queue is full. Log dropped: {log_entry}") except Exception as e: - print(f"[{self.task_id}] Error putting log to queue: {e}. Log: {log_entry}") + print( + f"[{self.task_id}] Error putting log to queue: {e}. Log: {log_entry}" + ) # --- 应用生命周期事件 --- @@ -190,7 +230,6 @@ tags_metadata = [ "name": "Temp", "description": "测试用接口。", }, - ] app = FastAPI( @@ -228,164 +267,253 @@ app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static") # --- Pydantic Models for Service API --- # =================================================================== + class GlossaryAgentConfigPayload(BaseModel): - base_url: str = Field(..., validation_alias=AliasChoices('base_url', 'baseurl'), - description="用于术语表生成的Agent的LLM API基础URL。", examples=["https://api.openai.com/v1"]) - api_key: str = Field(..., validation_alias=AliasChoices('api_key', 'key'), - description="用于术语表生成的Agent的LLM API密钥。", examples=["sk-agent-api-key"]) - model_id: str = Field(..., description="用于术语表生成的Agent的模型ID。", examples=["gpt-4-turbo"]) - to_lang: str = Field(..., description="术语表生成的目标语言。", examples=["简体中文", "English"]) - temperature: float = Field(default=0.7, description="用于术语表生成的Agent的温度参数。") + base_url: str = Field( + ..., + validation_alias=AliasChoices("base_url", "baseurl"), + description="用于术语表生成的Agent的LLM API基础URL。", + examples=["https://api.openai.com/v1"], + ) + api_key: str = Field( + ..., + validation_alias=AliasChoices("api_key", "key"), + description="用于术语表生成的Agent的LLM API密钥。", + examples=["sk-agent-api-key"], + ) + model_id: str = Field( + ..., description="用于术语表生成的Agent的模型ID。", examples=["gpt-4-turbo"] + ) + to_lang: str = Field( + ..., description="术语表生成的目标语言。", examples=["简体中文", "English"] + ) + temperature: float = Field( + default=0.7, description="用于术语表生成的Agent的温度参数。" + ) concurrent: int = Field(default=30, description="Agent的最大并发请求数。") - timeout: int = Field(default=default_params["timeout"], description="等待API回复的时间(秒)。") + timeout: int = Field( + default=default_params["timeout"], description="等待API回复的时间(秒)。" + ) thinking: ThinkingMode = Field(default="default", description="Agent的思考模式。") - retry: int = Field(default=default_params["retry"], description="分块失败后的最大重试次数。") + retry: int = Field( + default=default_params["retry"], description="分块失败后的最大重试次数。" + ) + system_proxy_enable: bool = Field( + default=USE_PROXY, description="是否使用系统代理", examples=[True, False] + ) # 1. 定义所有工作流共享的基础参数 class BaseWorkflowParams(BaseModel): - skip_translate: bool = Field(default=False, description="是否跳过翻译步骤。如果为True,则仅执行文档解析和格式转换。") - base_url: Optional[str] = Field(default=None, validation_alias=AliasChoices('base_url', 'baseurl'), - description="LLM API的基础URL。当 `skip_translate` 为 `False` 时必填。", - examples=["https://api.openai.com/v1"]) - api_key: Optional[str] = Field(default=None, validation_alias=AliasChoices('api_key', 'key'), - description="LLM API的密钥(可选)。", - examples=["sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxx"]) - model_id: Optional[str] = Field(default=None, - description="要使用的LLM模型ID。当 `skip_translate` 为 `False` 时必填。", - examples=["gpt-4o"]) - to_lang: str = Field(default="中文", description="目标翻译语言。", examples=["简体中文", "English"]) - chunk_size: int = Field(default=default_params["chunk_size"], description="文本分割的块大小(字符)。") - concurrent: int = Field(default=default_params["concurrent"], description="并发请求数。") - temperature: float = Field(default=default_params["temperature"], description="LLM温度参数。") - timeout: int = Field(default=default_params["timeout"], description="等待API回复的时间(秒)。") - thinking: ThinkingMode = Field(default=default_params["thinking"], description="Agent的思考模式。", - examples=["default", "enable", "disable"]) - retry: int = Field(default=default_params["retry"], description="某个分块翻译失败后的最大重试次数。") - custom_prompt: Optional[str] = Field(None, description="用户自定义的翻译Prompt。", alias="custom_prompt") - glossary_dict: Optional[Dict[str, str]] = Field(None, description="术语表字典,key为原文,value为译文。") - glossary_generate_enable: bool = Field(default=False, description="是否开启术语表自动生成。") - glossary_agent_config: Optional[GlossaryAgentConfigPayload] = Field(None, - description="用于术语表生成的Agent的配置。如果 `glossary_generate_enable` 为 `True`,此项必填。") + skip_translate: bool = Field( + default=False, + description="是否跳过翻译步骤。如果为True,则仅执行文档解析和格式转换。", + ) + base_url: Optional[str] = Field( + default=None, + validation_alias=AliasChoices("base_url", "baseurl"), + description="LLM API的基础URL。当 `skip_translate` 为 `False` 时必填。", + examples=["https://api.openai.com/v1"], + ) + api_key: Optional[str] = Field( + default=None, + validation_alias=AliasChoices("api_key", "key"), + description="LLM API的密钥(可选)。", + examples=["sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxx"], + ) + model_id: Optional[str] = Field( + default=None, + description="要使用的LLM模型ID。当 `skip_translate` 为 `False` 时必填。", + examples=["gpt-4o"], + ) + to_lang: str = Field( + default="中文", description="目标翻译语言。", examples=["简体中文", "English"] + ) + chunk_size: int = Field( + default=default_params["chunk_size"], description="文本分割的块大小(字符)。" + ) + concurrent: int = Field( + default=default_params["concurrent"], description="并发请求数。" + ) + temperature: float = Field( + default=default_params["temperature"], description="LLM温度参数。" + ) + timeout: int = Field( + default=default_params["timeout"], description="等待API回复的时间(秒)。" + ) + thinking: ThinkingMode = Field( + default=default_params["thinking"], + description="Agent的思考模式。", + examples=["default", "enable", "disable"], + ) + retry: int = Field( + default=default_params["retry"], + description="某个分块翻译失败后的最大重试次数。", + ) + system_proxy_enable: bool = Field( + default=USE_PROXY, description="是否使用系统代理", examples=[True, False] + ) + custom_prompt: Optional[str] = Field( + None, description="用户自定义的翻译Prompt。", alias="custom_prompt" + ) + glossary_dict: Optional[Dict[str, str]] = Field( + None, description="术语表字典,key为原文,value为译文。" + ) + glossary_generate_enable: bool = Field( + default=False, description="是否开启术语表自动生成。" + ) + glossary_agent_config: Optional[GlossaryAgentConfigPayload] = Field( + None, + description="用于术语表生成的Agent的配置。如果 `glossary_generate_enable` 为 `True`,此项必填。", + ) - @model_validator(mode='before') + @model_validator(mode="before") @classmethod def check_translation_fields(cls, values): # 如果不跳过翻译 (值为False或字段不存在),则验证相关字段必须存在且不为空 - if not values.get('skip_translate'): + if not values.get("skip_translate"): # Check for standard keys or their aliases - if not (values.get('base_url') or values.get('baseurl')): - raise ValueError("当 `skip_translate` 为 `False` 时, `base_url` 或 `baseurl` 字段是必须的。") - if not values.get('model_id'): - raise ValueError("当 `skip_translate` 为 `False` 时, `model_id` 字段是必须的。") + if not (values.get("base_url") or values.get("baseurl")): + raise ValueError( + "当 `skip_translate` 为 `False` 时, `base_url` 或 `baseurl` 字段是必须的。" + ) + if not values.get("model_id"): + raise ValueError( + "当 `skip_translate` 为 `False` 时, `model_id` 字段是必须的。" + ) # 如果跳过翻译,则不进行任何检查,允许 base_url 等字段为空 return values # 2. 为每个工作流创建独立的参数模型 class MarkdownWorkflowParams(BaseWorkflowParams): - workflow_type: Literal['markdown_based'] = Field(..., description="指定使用基于Markdown的翻译工作流。") + workflow_type: Literal["markdown_based"] = Field( + ..., description="指定使用基于Markdown的翻译工作流。" + ) convert_engine: ConvertEngineType = Field( "identity", description="选择将文件解析为markdown的引擎。如果输入文件是.md,此项可为`null`或不传。", - examples=["identity", "mineru", "docling"] + examples=["identity", "mineru", "docling"], + ) + mineru_token: Optional[str] = Field( + None, description="当 `convert_engine` 为 'mineru' 时必填的API令牌。" + ) + formula_ocr: bool = Field( + True, description="是否对公式进行OCR识别。对 `mineru` 和 `docling` 均有效。" + ) + code_ocr: bool = Field( + True, description="是否对代码块进行OCR识别。仅 `docling` 引擎有效。" + ) + model_version: Literal["pipeline", "vlm"] = Field( + "vlm", description="Mineru模型的版本,'vlm'是更新的版本。仅 `mineru` 引擎有效。" ) - mineru_token: Optional[str] = Field(None, description="当 `convert_engine` 为 'mineru' 时必填的API令牌。") - formula_ocr: bool = Field(True, description="是否对公式进行OCR识别。对 `mineru` 和 `docling` 均有效。") - code_ocr: bool = Field(True, description="是否对代码块进行OCR识别。仅 `docling` 引擎有效。") - model_version: Literal["pipeline", "vlm"] = Field("vlm", - description="Mineru模型的版本,'vlm'是更新的版本。仅 `mineru` 引擎有效。") - @field_validator('mineru_token') + @field_validator("mineru_token") def check_mineru_token(cls, v, values): - if values.data.get('convert_engine') == 'mineru' and not v: - raise ValueError("当 `convert_engine` 为 'mineru' 时,`mineru_token` 字段是必须的。") + if values.data.get("convert_engine") == "mineru" and not v: + raise ValueError( + "当 `convert_engine` 为 'mineru' 时,`mineru_token` 字段是必须的。" + ) return v class TextWorkflowParams(BaseWorkflowParams): - workflow_type: Literal['txt'] = Field(..., description="指定使用纯文本的翻译工作流。") + workflow_type: Literal["txt"] = Field( + ..., description="指定使用纯文本的翻译工作流。" + ) insert_mode: Literal["replace", "append", "prepend"] = Field( "replace", - description="翻译文本的插入模式。'replace':替换原文,'append':附加到原文后,'prepend':附加到原文前。" + description="翻译文本的插入模式。'replace':替换原文,'append':附加到原文后,'prepend':附加到原文前。", ) separator: str = Field( "\n", - description="当 insert_mode 为 'append' 或 'prepend' 时,用于分隔原文和译文的分隔符。" + description="当 insert_mode 为 'append' 或 'prepend' 时,用于分隔原文和译文的分隔符。", ) class JsonWorkflowParams(BaseWorkflowParams): - workflow_type: Literal['json'] = Field(..., description="指定使用JSON的翻译工作流。") + workflow_type: Literal["json"] = Field( + ..., description="指定使用JSON的翻译工作流。" + ) json_paths: List[str] = Field( ..., description="一个jsonpath-ng表达式列表,用于指定需要翻译的JSON字段。", - examples=[["$.product.name", "$.product.description", "$.features[*]"]] + examples=[["$.product.name", "$.product.description", "$.features[*]"]], ) class XlsxWorkflowParams(BaseWorkflowParams): - workflow_type: Literal['xlsx'] = Field(..., description="指定使用XLSX的翻译工作流。") + workflow_type: Literal["xlsx"] = Field( + ..., description="指定使用XLSX的翻译工作流。" + ) insert_mode: Literal["replace", "append", "prepend"] = Field( "replace", - description="翻译文本的插入模式。'replace':替换原文,'append':附加到原文后,'prepend':附加到原文前。" + description="翻译文本的插入模式。'replace':替换原文,'append':附加到原文后,'prepend':附加到原文前。", ) separator: str = Field( "\n", - description="当 insert_mode 为 'append' 或 'prepend' 时,用于分隔原文和译文的分隔符。" + description="当 insert_mode 为 'append' 或 'prepend' 时,用于分隔原文和译文的分隔符。", ) translate_regions: Optional[List[str]] = Field( None, - description="指定翻译区域列表。示例: ['Sheet1!A1:B10', 'C:D', 'E5']。如果不指定表名 (如 'C:D'),则应用于所有表。如果为 None,则翻译整个文件中的所有文本。" + description="指定翻译区域列表。示例: ['Sheet1!A1:B10', 'C:D', 'E5']。如果不指定表名 (如 'C:D'),则应用于所有表。如果为 None,则翻译整个文件中的所有文本。", ) class DocxWorkflowParams(BaseWorkflowParams): - workflow_type: Literal['docx'] = Field(..., description="指定使用DOCX的翻译工作流。") + workflow_type: Literal["docx"] = Field( + ..., description="指定使用DOCX的翻译工作流。" + ) insert_mode: Literal["replace", "append", "prepend"] = Field( "replace", - description="翻译文本的插入模式。'replace':替换原文,'append':附加到原文后,'prepend':附加到原文前。" + description="翻译文本的插入模式。'replace':替换原文,'append':附加到原文后,'prepend':附加到原文前。", ) separator: str = Field( "\n", - description="当 insert_mode 为 'append' 或 'prepend' 时,用于分隔原文和译文的分隔符。" + description="当 insert_mode 为 'append' 或 'prepend' 时,用于分隔原文和译文的分隔符。", ) class SrtWorkflowParams(BaseWorkflowParams): - workflow_type: Literal['srt'] = Field(..., description="指定使用SRT字幕的翻译工作流。") + workflow_type: Literal["srt"] = Field( + ..., description="指定使用SRT字幕的翻译工作流。" + ) insert_mode: Literal["replace", "append", "prepend"] = Field( "replace", - description="翻译文本的插入模式。'replace':替换原文,'append':附加到原文后,'prepend':附加到原文前。" + description="翻译文本的插入模式。'replace':替换原文,'append':附加到原文后,'prepend':附加到原文前。", ) separator: str = Field( "\n", - description="当 insert_mode 为 'append' 或 'prepend' 时,用于分隔原文和译文的分隔符。" + description="当 insert_mode 为 'append' 或 'prepend' 时,用于分隔原文和译文的分隔符。", ) class EpubWorkflowParams(BaseWorkflowParams): - workflow_type: Literal['epub'] = Field(..., description="指定使用EPUB的翻译工作流。") + workflow_type: Literal["epub"] = Field( + ..., description="指定使用EPUB的翻译工作流。" + ) insert_mode: Literal["replace", "append", "prepend"] = Field( "replace", - description="翻译文本的插入模式。'replace':替换原文,'append':附加到原文后,'prepend':附加到原文前。" + description="翻译文本的插入模式。'replace':替换原文,'append':附加到原文后,'prepend':附加到原文前。", ) separator: str = Field( "\n", - description="当 insert_mode 为 'append' 或 'prepend' 时,用于分隔原文和译文的分隔符。" + description="当 insert_mode 为 'append' 或 'prepend' 时,用于分隔原文和译文的分隔符。", ) # --- HTML WORKFLOW PARAMS START --- class HtmlWorkflowParams(BaseWorkflowParams): - workflow_type: Literal['html'] = Field(..., description="指定使用HTML的翻译工作流。") + workflow_type: Literal["html"] = Field( + ..., description="指定使用HTML的翻译工作流。" + ) insert_mode: Literal["replace", "append", "prepend"] = Field( "replace", - description="翻译文本的插入模式。'replace':替换原文,'append':附加到原文后,'prepend':附加到原文前。" + description="翻译文本的插入模式。'replace':替换原文,'append':附加到原文后,'prepend':附加到原文前。", ) separator: str = Field( " ", - description="当 insert_mode 为 'append' 或 'prepend' 时,用于分隔原文和译文的分隔符。" + description="当 insert_mode 为 'append' 或 'prepend' 时,用于分隔原文和译文的分隔符。", ) @@ -394,14 +522,16 @@ class HtmlWorkflowParams(BaseWorkflowParams): # --- ASS WORKFLOW PARAMS START --- class AssWorkflowParams(BaseWorkflowParams): - workflow_type: Literal['ass'] = Field(..., description="指定使用ASS字幕的翻译工作流。") + workflow_type: Literal["ass"] = Field( + ..., description="指定使用ASS字幕的翻译工作流。" + ) insert_mode: Literal["replace", "append", "prepend"] = Field( "replace", - description="翻译文本的插入模式。'replace':替换原文,'append':附加到原文后,'prepend':附加到原文前。" + description="翻译文本的插入模式。'replace':替换原文,'append':附加到原文后,'prepend':附加到原文前。", ) separator: str = Field( "\\N", - description="当 insert_mode 为 'append' 或 'prepend' 时,用于分隔原文和译文的分隔符。ASS格式通常使用 \\N 作为换行符。" + description="当 insert_mode 为 'append' 或 'prepend' 时,用于分隔原文和译文的分隔符。ASS格式通常使用 \\N 作为换行符。", ) @@ -411,18 +541,41 @@ class AssWorkflowParams(BaseWorkflowParams): # 3. 使用可辨识联合类型(Discriminated Union)将它们组合起来 TranslatePayload = Annotated[ Union[ - MarkdownWorkflowParams, TextWorkflowParams, JsonWorkflowParams, XlsxWorkflowParams, DocxWorkflowParams, SrtWorkflowParams, EpubWorkflowParams, HtmlWorkflowParams, AssWorkflowParams], - Field(discriminator='workflow_type') + MarkdownWorkflowParams, + TextWorkflowParams, + JsonWorkflowParams, + XlsxWorkflowParams, + DocxWorkflowParams, + SrtWorkflowParams, + EpubWorkflowParams, + HtmlWorkflowParams, + AssWorkflowParams, + ], + Field(discriminator="workflow_type"), ] # 4. 创建最终的请求体模型 class TranslateServiceRequest(BaseModel): - file_name: str = Field(..., description="上传的原始文件名,含扩展名。", - examples=["my_paper.pdf", "chapter1.txt", "data.xlsx", "video.srt", "my_book.epub", - "index.html", "dialogue.ass"]) - file_content: str = Field(..., description="Base64编码的文件内容。", examples=["JVBERi0xLjQK..."]) - payload: TranslatePayload = Field(..., description="包含工作流类型和相应参数的载荷。") + file_name: str = Field( + ..., + description="上传的原始文件名,含扩展名。", + examples=[ + "my_paper.pdf", + "chapter1.txt", + "data.xlsx", + "video.srt", + "my_book.epub", + "index.html", + "dialogue.ass", + ], + ) + file_content: str = Field( + ..., description="Base64编码的文件内容。", examples=["JVBERi0xLjQK..."] + ) + payload: TranslatePayload = Field( + ..., description="包含工作流类型和相应参数的载荷。" + ) class Config: json_schema_extra = { @@ -447,8 +600,8 @@ class TranslateServiceRequest(BaseModel): "convert_engine": "mineru", "mineru_token": "your-mineru-token-if-any", "formula_ocr": True, - "model_version": "vlm" - } + "model_version": "vlm", + }, }, { "file_name": "product_info.json", @@ -467,8 +620,12 @@ class TranslateServiceRequest(BaseModel): "thinking": "default", "retry": default_params["retry"], "glossary_generate_enable": False, - "json_paths": ["$.product.name", "$.product.description", "$.features[*]"], - } + "json_paths": [ + "$.product.name", + "$.product.description", + "$.features[*]", + ], + }, }, { "file_name": "product_list.xlsx", @@ -492,9 +649,9 @@ class TranslateServiceRequest(BaseModel): "translate_regions": ["Sheet1!A1:B10", "C:D"], "glossary_dict": { "OpenAI": "开放人工智能", - "LLM": "大语言模型" - } - } + "LLM": "大语言模型", + }, + }, }, { "file_name": "complex_terms.xlsx", @@ -516,9 +673,9 @@ class TranslateServiceRequest(BaseModel): "concurrent": 30, "timeout": default_params["timeout"], "thinking": "default", - "retry": default_params["retry"] - } - } + "retry": default_params["retry"], + }, + }, }, { "file_name": "contract.docx", @@ -538,7 +695,7 @@ class TranslateServiceRequest(BaseModel): "timeout": default_params["timeout"], "thinking": "default", "retry": default_params["retry"], - } + }, }, { "file_name": "movie.srt", @@ -558,7 +715,7 @@ class TranslateServiceRequest(BaseModel): "timeout": default_params["timeout"], "thinking": "default", "retry": default_params["retry"], - } + }, }, { "file_name": "my_book.epub", @@ -578,7 +735,7 @@ class TranslateServiceRequest(BaseModel): "timeout": default_params["timeout"], "thinking": "default", "retry": default_params["retry"], - } + }, }, { "file_name": "company_about_us.html", @@ -598,7 +755,7 @@ class TranslateServiceRequest(BaseModel): "timeout": default_params["timeout"], "thinking": "default", "retry": default_params["retry"], - } + }, }, { "file_name": "dialogue.ass", @@ -618,18 +775,18 @@ class TranslateServiceRequest(BaseModel): "timeout": default_params["timeout"], "thinking": "default", "retry": default_params["retry"], - } - } + }, + }, ] } # --- Background Task Logic --- async def _perform_translation( - task_id: str, - payload: TranslatePayload, - file_contents: bytes, - original_filename: str + task_id: str, + payload: TranslatePayload, + file_contents: bytes, + original_filename: str, ): task_state = tasks_state[task_id] log_queue = tasks_log_queues[task_id] @@ -640,11 +797,17 @@ async def _perform_translation( task_logger.propagate = False if task_logger.hasHandlers(): task_logger.handlers.clear() - task_handler = QueueAndHistoryHandler(log_queue, log_history, MAX_LOG_HISTORY, task_id=task_id) - task_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')) + task_handler = QueueAndHistoryHandler( + log_queue, log_history, MAX_LOG_HISTORY, task_id=task_id + ) + task_handler.setFormatter( + logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") + ) task_logger.addHandler(task_handler) - task_logger.info(f"后台翻译任务开始: 文件 '{original_filename}', 工作流: '{payload.workflow_type}'") + task_logger.info( + f"后台翻译任务开始: 文件 '{original_filename}', 工作流: '{payload.workflow_type}'" + ) task_state["status_message"] = f"正在处理 '{original_filename}'..." temp_dir = None @@ -661,165 +824,316 @@ async def _perform_translation( if payload.glossary_generate_enable and payload.glossary_agent_config: agent_payload = payload.glossary_agent_config return GlossaryAgentConfig( - logger=task_logger, - **agent_payload.model_dump() + logger=task_logger, **agent_payload.model_dump() ) return None # 2. 根据 payload 的具体类型构建配置并实例化 workflow if isinstance(payload, MarkdownWorkflowParams): task_logger.info("构建 MarkdownBasedWorkflow 配置。") - translator_args = payload.model_dump(include={ - 'skip_translate', 'base_url', 'api_key', 'model_id', 'to_lang', 'custom_prompt', - 'temperature', 'thinking', 'chunk_size', 'concurrent', 'glossary_dict', 'timeout', 'retry' - }, exclude_none=True) - translator_args['glossary_generate_enable'] = payload.glossary_generate_enable - translator_args['glossary_agent_config'] = build_glossary_agent_config() + translator_args = payload.model_dump( + include={ + "skip_translate", + "base_url", + "api_key", + "model_id", + "to_lang", + "custom_prompt", + "temperature", + "thinking", + "chunk_size", + "concurrent", + "glossary_dict", + "timeout", + "retry", + "system_proxy_enable", + }, + exclude_none=True, + ) + translator_args["glossary_generate_enable"] = ( + payload.glossary_generate_enable + ) + translator_args["glossary_agent_config"] = build_glossary_agent_config() translator_config = MDTranslatorConfig(**translator_args) converter_config = None - if payload.convert_engine == 'mineru': - converter_config = ConverterMineruConfig(logger=task_logger, mineru_token=payload.mineru_token, - formula_ocr=payload.formula_ocr, - model_version=payload.model_version) - elif payload.convert_engine == 'docling' and DOCLING_EXIST: - converter_config = ConverterDoclingConfig(logger=task_logger, code_ocr=payload.code_ocr, - formula_ocr=payload.formula_ocr) + if payload.convert_engine == "mineru": + converter_config = ConverterMineruConfig( + logger=task_logger, + mineru_token=payload.mineru_token, + formula_ocr=payload.formula_ocr, + model_version=payload.model_version, + ) + elif payload.convert_engine == "docling" and DOCLING_EXIST: + converter_config = ConverterDoclingConfig( + logger=task_logger, + code_ocr=payload.code_ocr, + formula_ocr=payload.formula_ocr, + ) html_exporter_config = MD2HTMLExporterConfig(cdn=True) workflow_config = MarkdownBasedWorkflowConfig( - convert_engine=payload.convert_engine, converter_config=converter_config, - translator_config=translator_config, html_exporter_config=html_exporter_config, - logger=task_logger + convert_engine=payload.convert_engine, + converter_config=converter_config, + translator_config=translator_config, + html_exporter_config=html_exporter_config, + logger=task_logger, ) workflow = MarkdownBasedWorkflow(config=workflow_config) elif isinstance(payload, TextWorkflowParams): task_logger.info("构建 TXTWorkflow 配置。") - translator_args = payload.model_dump(include={ - 'skip_translate', 'base_url', 'api_key', 'model_id', 'to_lang', 'custom_prompt', - 'temperature', 'thinking', 'chunk_size', 'concurrent', 'glossary_dict', - 'insert_mode', 'separator', 'timeout', 'retry' - }, exclude_none=True) - translator_args['glossary_generate_enable'] = payload.glossary_generate_enable - translator_args['glossary_agent_config'] = build_glossary_agent_config() + translator_args = payload.model_dump( + include={ + "skip_translate", + "base_url", + "api_key", + "model_id", + "to_lang", + "custom_prompt", + "temperature", + "thinking", + "chunk_size", + "concurrent", + "glossary_dict", + "insert_mode", + "separator", + "timeout", + "retry", + "system_proxy_enable", + }, + exclude_none=True, + ) + translator_args["glossary_generate_enable"] = ( + payload.glossary_generate_enable + ) + translator_args["glossary_agent_config"] = build_glossary_agent_config() translator_config = TXTTranslatorConfig(**translator_args) html_exporter_config = TXT2HTMLExporterConfig(cdn=True) workflow_config = TXTWorkflowConfig( - translator_config=translator_config, html_exporter_config=html_exporter_config, - logger=task_logger + translator_config=translator_config, + html_exporter_config=html_exporter_config, + logger=task_logger, ) workflow = TXTWorkflow(config=workflow_config) elif isinstance(payload, JsonWorkflowParams): task_logger.info("构建 JsonWorkflow 配置。") - translator_args = payload.model_dump(include={ - 'skip_translate', 'base_url', 'api_key', 'model_id', 'to_lang', 'custom_prompt', - 'temperature', 'thinking', 'chunk_size', 'concurrent', 'glossary_dict', - 'json_paths', 'timeout', 'retry' - }, exclude_none=True) - translator_args['glossary_generate_enable'] = payload.glossary_generate_enable - translator_args['glossary_agent_config'] = build_glossary_agent_config() + translator_args = payload.model_dump( + include={ + "skip_translate", + "base_url", + "api_key", + "model_id", + "to_lang", + "custom_prompt", + "temperature", + "thinking", + "chunk_size", + "concurrent", + "glossary_dict", + "json_paths", + "timeout", + "retry", + "system_proxy_enable", + }, + exclude_none=True, + ) + translator_args["glossary_generate_enable"] = ( + payload.glossary_generate_enable + ) + translator_args["glossary_agent_config"] = build_glossary_agent_config() translator_config = JsonTranslatorConfig(**translator_args) html_exporter_config = Json2HTMLExporterConfig(cdn=True) workflow_config = JsonWorkflowConfig( - translator_config=translator_config, html_exporter_config=html_exporter_config, - logger=task_logger + translator_config=translator_config, + html_exporter_config=html_exporter_config, + logger=task_logger, ) workflow = JsonWorkflow(config=workflow_config) elif isinstance(payload, XlsxWorkflowParams): task_logger.info("构建 XlsxWorkflow 配置。") - translator_args = payload.model_dump(include={ - 'skip_translate', 'base_url', 'api_key', 'model_id', 'to_lang', 'custom_prompt', - 'temperature', 'thinking', 'chunk_size', 'concurrent', - 'insert_mode', 'separator', 'translate_regions', 'glossary_dict', 'timeout', 'retry' - }, exclude_none=True) - translator_args['glossary_generate_enable'] = payload.glossary_generate_enable - translator_args['glossary_agent_config'] = build_glossary_agent_config() + translator_args = payload.model_dump( + include={ + "skip_translate", + "base_url", + "api_key", + "model_id", + "to_lang", + "custom_prompt", + "temperature", + "thinking", + "chunk_size", + "concurrent", + "insert_mode", + "separator", + "translate_regions", + "glossary_dict", + "timeout", + "retry", + "system_proxy_enable", + }, + exclude_none=True, + ) + translator_args["glossary_generate_enable"] = ( + payload.glossary_generate_enable + ) + translator_args["glossary_agent_config"] = build_glossary_agent_config() translator_config = XlsxTranslatorConfig(**translator_args) html_exporter_config = Xlsx2HTMLExporterConfig(cdn=True) workflow_config = XlsxWorkflowConfig( translator_config=translator_config, html_exporter_config=html_exporter_config, - logger=task_logger + logger=task_logger, ) workflow = XlsxWorkflow(config=workflow_config) elif isinstance(payload, DocxWorkflowParams): task_logger.info("构建 DocxWorkflow 配置。") - translator_args = payload.model_dump(include={ - 'skip_translate', 'base_url', 'api_key', 'model_id', 'to_lang', 'custom_prompt', - 'temperature', 'thinking', 'chunk_size', 'concurrent', - 'insert_mode', 'separator', 'glossary_dict', 'timeout', 'retry' - }, exclude_none=True) - translator_args['glossary_generate_enable'] = payload.glossary_generate_enable - translator_args['glossary_agent_config'] = build_glossary_agent_config() + translator_args = payload.model_dump( + include={ + "skip_translate", + "base_url", + "api_key", + "model_id", + "to_lang", + "custom_prompt", + "temperature", + "thinking", + "chunk_size", + "concurrent", + "insert_mode", + "separator", + "glossary_dict", + "timeout", + "retry", + "system_proxy_enable", + }, + exclude_none=True, + ) + translator_args["glossary_generate_enable"] = ( + payload.glossary_generate_enable + ) + translator_args["glossary_agent_config"] = build_glossary_agent_config() translator_config = DocxTranslatorConfig(**translator_args) html_exporter_config = Docx2HTMLExporterConfig(cdn=True) workflow_config = DocxWorkflowConfig( translator_config=translator_config, html_exporter_config=html_exporter_config, - logger=task_logger + logger=task_logger, ) workflow = DocxWorkflow(config=workflow_config) elif isinstance(payload, SrtWorkflowParams): task_logger.info("构建 SrtWorkflow 配置。") - translator_args = payload.model_dump(include={ - 'skip_translate', 'base_url', 'api_key', 'model_id', 'to_lang', 'custom_prompt', - 'temperature', 'thinking', 'chunk_size', 'concurrent', - 'insert_mode', 'separator', 'glossary_dict', 'timeout', 'retry' - }, exclude_none=True) - translator_args['glossary_generate_enable'] = payload.glossary_generate_enable - translator_args['glossary_agent_config'] = build_glossary_agent_config() + translator_args = payload.model_dump( + include={ + "skip_translate", + "base_url", + "api_key", + "model_id", + "to_lang", + "custom_prompt", + "temperature", + "thinking", + "chunk_size", + "concurrent", + "insert_mode", + "separator", + "glossary_dict", + "timeout", + "retry", + "system_proxy_enable", + }, + exclude_none=True, + ) + translator_args["glossary_generate_enable"] = ( + payload.glossary_generate_enable + ) + translator_args["glossary_agent_config"] = build_glossary_agent_config() translator_config = SrtTranslatorConfig(**translator_args) html_exporter_config = Srt2HTMLExporterConfig(cdn=True) workflow_config = SrtWorkflowConfig( translator_config=translator_config, html_exporter_config=html_exporter_config, - logger=task_logger + logger=task_logger, ) workflow = SrtWorkflow(config=workflow_config) elif isinstance(payload, EpubWorkflowParams): task_logger.info("构建 EpubWorkflow 配置。") - translator_args = payload.model_dump(include={ - 'skip_translate', 'base_url', 'api_key', 'model_id', 'to_lang', 'custom_prompt', - 'temperature', 'thinking', 'chunk_size', 'concurrent', - 'insert_mode', 'separator', 'glossary_dict', 'timeout', 'retry' - }, exclude_none=True) - translator_args['glossary_generate_enable'] = payload.glossary_generate_enable - translator_args['glossary_agent_config'] = build_glossary_agent_config() + translator_args = payload.model_dump( + include={ + "skip_translate", + "base_url", + "api_key", + "model_id", + "to_lang", + "custom_prompt", + "temperature", + "thinking", + "chunk_size", + "concurrent", + "insert_mode", + "separator", + "glossary_dict", + "timeout", + "retry", + "system_proxy_enable", + }, + exclude_none=True, + ) + translator_args["glossary_generate_enable"] = ( + payload.glossary_generate_enable + ) + translator_args["glossary_agent_config"] = build_glossary_agent_config() translator_config = EpubTranslatorConfig(**translator_args) html_exporter_config = Epub2HTMLExporterConfig(cdn=True) workflow_config = EpubWorkflowConfig( translator_config=translator_config, html_exporter_config=html_exporter_config, - logger=task_logger + logger=task_logger, ) workflow = EpubWorkflow(config=workflow_config) # --- HTML WORKFLOW LOGIC START --- elif isinstance(payload, HtmlWorkflowParams): task_logger.info("构建 HtmlWorkflow 配置。") - translator_args = payload.model_dump(include={ - 'skip_translate', 'base_url', 'api_key', 'model_id', 'to_lang', 'custom_prompt', - 'temperature', 'thinking', 'chunk_size', 'concurrent', - 'insert_mode', 'separator', 'glossary_dict', 'timeout', 'retry' - }, exclude_none=True) - translator_args['glossary_generate_enable'] = payload.glossary_generate_enable - translator_args['glossary_agent_config'] = build_glossary_agent_config() + translator_args = payload.model_dump( + include={ + "skip_translate", + "base_url", + "api_key", + "model_id", + "to_lang", + "custom_prompt", + "temperature", + "thinking", + "chunk_size", + "concurrent", + "insert_mode", + "separator", + "glossary_dict", + "timeout", + "retry", + "system_proxy_enable", + }, + exclude_none=True, + ) + translator_args["glossary_generate_enable"] = ( + payload.glossary_generate_enable + ) + translator_args["glossary_agent_config"] = build_glossary_agent_config() translator_config = HtmlTranslatorConfig(**translator_args) workflow_config = HtmlWorkflowConfig( - translator_config=translator_config, - logger=task_logger + translator_config=translator_config, logger=task_logger ) workflow = HtmlWorkflow(config=workflow_config) # --- HTML WORKFLOW LOGIC END --- @@ -827,20 +1141,38 @@ async def _perform_translation( # --- ASS WORKFLOW LOGIC START --- elif isinstance(payload, AssWorkflowParams): task_logger.info("构建 AssWorkflow 配置。") - translator_args = payload.model_dump(include={ - 'skip_translate', 'base_url', 'api_key', 'model_id', 'to_lang', 'custom_prompt', - 'temperature', 'thinking', 'chunk_size', 'concurrent', - 'insert_mode', 'separator', 'glossary_dict', 'timeout', 'retry' - }, exclude_none=True) - translator_args['glossary_generate_enable'] = payload.glossary_generate_enable - translator_args['glossary_agent_config'] = build_glossary_agent_config() + translator_args = payload.model_dump( + include={ + "skip_translate", + "base_url", + "api_key", + "model_id", + "to_lang", + "custom_prompt", + "temperature", + "thinking", + "chunk_size", + "concurrent", + "insert_mode", + "separator", + "glossary_dict", + "timeout", + "retry", + "system_proxy_enable", + }, + exclude_none=True, + ) + translator_args["glossary_generate_enable"] = ( + payload.glossary_generate_enable + ) + translator_args["glossary_agent_config"] = build_glossary_agent_config() translator_config = AssTranslatorConfig(**translator_args) html_exporter_config = Ass2HTMLExporterConfig(cdn=True) workflow_config = AssWorkflowConfig( translator_config=translator_config, html_exporter_config=html_exporter_config, - logger=task_logger + logger=task_logger, ) workflow = AssWorkflow(config=workflow_config) # --- ASS WORKFLOW LOGIC END --- @@ -859,13 +1191,15 @@ async def _perform_translation( temp_dir = tempfile.mkdtemp(prefix=f"docutranslate_{task_id}_") task_state["temp_dir"] = temp_dir downloadable_files = {} - filename_stem = task_state['original_filename_stem'] + filename_stem = task_state["original_filename_stem"] # 检查CDN可用性 is_cdn_available = True try: - await httpx_client.head("https://s4.zstatic.net/ajax/libs/KaTeX/0.16.9/contrib/auto-render.min.js", - timeout=3) + await httpx_client.head( + "https://s4.zstatic.net/ajax/libs/KaTeX/0.16.9/contrib/auto-render.min.js", + timeout=3, + ) except (httpx.TimeoutException, httpx.RequestError): is_cdn_available = False task_logger.warning("CDN连接失败,将使用本地JS进行渲染。") @@ -892,46 +1226,96 @@ async def _perform_translation( html_config = Epub2HTMLExporterConfig(cdn=is_cdn_available) elif isinstance(workflow, AssWorkflow): html_config = Ass2HTMLExporterConfig(cdn=is_cdn_available) - export_map['html'] = (lambda: workflow.export_to_html(html_config), f"{filename_stem}_translated.html", - True) + export_map["html"] = ( + lambda: workflow.export_to_html(html_config), + f"{filename_stem}_translated.html", + True, + ) if isinstance(workflow, MDFormatsExportable): - export_map['markdown'] = (workflow.export_to_markdown, f"{filename_stem}_translated.md", True) - export_map['markdown_zip'] = (workflow.export_to_markdown_zip, f"{filename_stem}_translated.zip", False) + export_map["markdown"] = ( + workflow.export_to_markdown, + f"{filename_stem}_translated.md", + True, + ) + export_map["markdown_zip"] = ( + workflow.export_to_markdown_zip, + f"{filename_stem}_translated.zip", + False, + ) if isinstance(workflow, TXTExportable): - export_map['txt'] = (workflow.export_to_txt, f"{filename_stem}_translated.txt", True) + export_map["txt"] = ( + workflow.export_to_txt, + f"{filename_stem}_translated.txt", + True, + ) if isinstance(workflow, JsonExportable): - export_map['json'] = (workflow.export_to_json, f"{filename_stem}_translated.json", True) + export_map["json"] = ( + workflow.export_to_json, + f"{filename_stem}_translated.json", + True, + ) if isinstance(workflow, XlsxExportable): - export_map['xlsx'] = (workflow.export_to_xlsx, f"{filename_stem}_translated.xlsx", False) + export_map["xlsx"] = ( + workflow.export_to_xlsx, + f"{filename_stem}_translated.xlsx", + False, + ) if isinstance(workflow, CsvExportable): - export_map['csv'] = (workflow.export_to_csv, f"{filename_stem}_translated.csv", False) + export_map["csv"] = ( + workflow.export_to_csv, + f"{filename_stem}_translated.csv", + False, + ) if isinstance(workflow, DocxExportable): - export_map['docx'] = (workflow.export_to_docx, f"{filename_stem}_translated.docx", False) + export_map["docx"] = ( + workflow.export_to_docx, + f"{filename_stem}_translated.docx", + False, + ) if isinstance(workflow, SrtExportable): - export_map['srt'] = (workflow.export_to_srt, f"{filename_stem}_translated.srt", True) + export_map["srt"] = ( + workflow.export_to_srt, + f"{filename_stem}_translated.srt", + True, + ) if isinstance(workflow, EpubExportable): - export_map['epub'] = (workflow.export_to_epub, f"{filename_stem}_translated.epub", False) + export_map["epub"] = ( + workflow.export_to_epub, + f"{filename_stem}_translated.epub", + False, + ) if isinstance(workflow, AssExportable): - export_map['ass'] = (workflow.export_to_ass, f"{filename_stem}_translated.ass", True) + export_map["ass"] = ( + workflow.export_to_ass, + f"{filename_stem}_translated.ass", + True, + ) # 循环生成文件 for file_type, (export_func, filename, is_string_output) in export_map.items(): try: content = await asyncio.to_thread(export_func) - content_bytes = content.encode('utf-8') if is_string_output else content + content_bytes = content.encode("utf-8") if is_string_output else content file_path = os.path.join(temp_dir, filename) with open(file_path, "wb") as f: f.write(content_bytes) - downloadable_files[file_type] = {"path": file_path, "filename": filename} + downloadable_files[file_type] = { + "path": file_path, + "filename": filename, + } task_logger.info(f"成功生成 {file_type} 文件") except Exception as export_error: - task_logger.error(f"生成 {file_type} 文件时出错: {export_error}", exc_info=True) + task_logger.error( + f"生成 {file_type} 文件时出错: {export_error}", exc_info=True + ) # 处理附件文件 attachment_files = {} attachment_object = workflow.get_attachment() if attachment_object and attachment_object.attachment_dict: - task_logger.info(f"发现 {len(attachment_object.attachment_dict)} 个附件,正在处理...") + task_logger.info( + f"发现 {len(attachment_object.attachment_dict)} 个附件,正在处理..." + ) for identifier, doc in attachment_object.attachment_dict.items(): try: # 'doc' is a Document object @@ -939,42 +1323,61 @@ async def _perform_translation( attachment_path = os.path.join(temp_dir, attachment_filename) with open(attachment_path, "wb") as f: f.write(doc.content) - attachment_files[identifier] = {"path": attachment_path, "filename": attachment_filename} - task_logger.info(f"成功生成附件 '{identifier}' 文件: {attachment_filename}") + attachment_files[identifier] = { + "path": attachment_path, + "filename": attachment_filename, + } + task_logger.info( + f"成功生成附件 '{identifier}' 文件: {attachment_filename}" + ) except Exception as attachment_error: - task_logger.error(f"生成附件 '{identifier}' 文件时出错: {attachment_error}", exc_info=True) + task_logger.error( + f"生成附件 '{identifier}' 文件时出错: {attachment_error}", + exc_info=True, + ) # 5. 任务成功,更新最终状态 end_time = time.time() duration = end_time - task_state["task_start_time"] - task_state.update({ - "status_message": f"翻译成功!用时 {duration:.2f} 秒。", - "download_ready": True, - "error_flag": False, - "task_end_time": end_time, - "downloadable_files": downloadable_files, - "attachment_files": attachment_files, - }) + task_state.update( + { + "status_message": f"翻译成功!用时 {duration:.2f} 秒。", + "download_ready": True, + "error_flag": False, + "task_end_time": end_time, + "downloadable_files": downloadable_files, + "attachment_files": attachment_files, + } + ) task_logger.info(f"翻译成功完成,用时 {duration:.2f} 秒。") except asyncio.CancelledError: end_time = time.time() duration = end_time - task_state["task_start_time"] - task_logger.info(f"翻译任务 '{original_filename}' 已被取消 (用时 {duration:.2f} 秒).") - task_state.update({ - "status_message": f"翻译任务已取消 (用时 {duration:.2f} 秒).", "error_flag": False, "download_ready": False, - "task_end_time": end_time, - }) + task_logger.info( + f"翻译任务 '{original_filename}' 已被取消 (用时 {duration:.2f} 秒)." + ) + task_state.update( + { + "status_message": f"翻译任务已取消 (用时 {duration:.2f} 秒).", + "error_flag": False, + "download_ready": False, + "task_end_time": end_time, + } + ) except Exception as e: end_time = time.time() duration = end_time - task_state["task_start_time"] error_message = f"翻译失败: {e}" task_logger.error(error_message, exc_info=True) - task_state.update({ - "status_message": f"翻译过程中发生错误 (用时 {duration:.2f} 秒): {e}", "error_flag": True, - "download_ready": False, - "task_end_time": end_time, - }) + task_state.update( + { + "status_message": f"翻译过程中发生错误 (用时 {duration:.2f} 秒): {e}", + "error_flag": True, + "download_ready": False, + "task_end_time": end_time, + } + ) finally: # 无论成功失败,都清理内存中的 workflow 实例和临时目录(如果失败) task_state["workflow_instance"] = None @@ -992,10 +1395,10 @@ async def _perform_translation( # --- 核心任务启动逻辑 --- async def _start_translation_task( - task_id: str, - payload: TranslatePayload, - file_contents: bytes, - original_filename: str + task_id: str, + payload: TranslatePayload, + file_contents: bytes, + original_filename: str, ): if task_id not in tasks_state: tasks_state[task_id] = _create_default_task_state() @@ -1003,22 +1406,36 @@ async def _start_translation_task( tasks_log_histories[task_id] = [] task_state = tasks_state[task_id] - if task_state["is_processing"] and task_state["current_task_ref"] and not task_state["current_task_ref"].done(): - raise HTTPException(status_code=429, detail=f"任务ID '{task_id}' 正在进行中,请稍后再试。") + if ( + task_state["is_processing"] + and task_state["current_task_ref"] + and not task_state["current_task_ref"].done() + ): + raise HTTPException( + status_code=429, detail=f"任务ID '{task_id}' 正在进行中,请稍后再试。" + ) # 如果存在旧的临时文件,先清理 if task_state.get("temp_dir") and os.path.isdir(task_state["temp_dir"]): shutil.rmtree(task_state["temp_dir"]) - task_state.update({ - "is_processing": True, - "status_message": "任务初始化中...", "error_flag": False, "download_ready": False, - "workflow_instance": None, - "original_filename_stem": Path(original_filename).stem, - "original_filename": original_filename, - "task_start_time": time.time(), "task_end_time": 0, "current_task_ref": None, - "temp_dir": None, "downloadable_files": {}, "attachment_files": {}, - }) + task_state.update( + { + "is_processing": True, + "status_message": "任务初始化中...", + "error_flag": False, + "download_ready": False, + "workflow_instance": None, + "original_filename_stem": Path(original_filename).stem, + "original_filename": original_filename, + "task_start_time": time.time(), + "task_end_time": 0, + "current_task_ref": None, + "temp_dir": None, + "downloadable_files": {}, + "attachment_files": {}, + } + ) log_history = tasks_log_histories[task_id] log_queue = tasks_log_queues[task_id] @@ -1036,12 +1453,24 @@ async def _start_translation_task( try: loop = asyncio.get_running_loop() - task = loop.create_task(_perform_translation(task_id, payload, file_contents, original_filename)) + task = loop.create_task( + _perform_translation(task_id, payload, file_contents, original_filename) + ) task_state["current_task_ref"] = task - return {"task_started": True, "task_id": task_id, "message": "翻译任务已成功启动,请稍候..."} + return { + "task_started": True, + "task_id": task_id, + "message": "翻译任务已成功启动,请稍候...", + } except Exception as e: - task_state.update({"is_processing": False, "status_message": f"启动任务失败: {e}", "error_flag": True, - "current_task_ref": None}) + task_state.update( + { + "is_processing": False, + "status_message": f"启动任务失败: {e}", + "error_flag": True, + "current_task_ref": None, + } + ) raise HTTPException(status_code=500, detail=f"启动翻译任务时出错: {e}") @@ -1050,8 +1479,14 @@ def _cancel_translation_logic(task_id: str): task_state = tasks_state.get(task_id) if not task_state: raise HTTPException(status_code=404, detail=f"找不到任务ID '{task_id}'。") - if not task_state or not task_state["is_processing"] or not task_state["current_task_ref"]: - raise HTTPException(status_code=400, detail=f"任务ID '{task_id}' 没有正在进行的翻译任务可取消。") + if ( + not task_state + or not task_state["is_processing"] + or not task_state["current_task_ref"] + ): + raise HTTPException( + status_code=400, detail=f"任务ID '{task_id}' 没有正在进行的翻译任务可取消。" + ) task_to_cancel: Optional[asyncio.Task] = task_state["current_task_ref"] if not task_to_cancel or task_to_cancel.done(): @@ -1069,6 +1504,7 @@ def _cancel_translation_logic(task_id: str): # --- Service Endpoints (/service) --- # =================================================================== + @service_router.post( "/translate", summary="提交翻译任务 (统一入口)", @@ -1082,15 +1518,28 @@ def _cancel_translation_logic(task_id: str): responses={ 200: { "description": "翻译任务已成功启动。", - "content": {"application/json": { - "example": {"task_started": True, "task_id": "a1b2c3d4", "message": "翻译任务已成功启动,请稍候..."}}} + "content": { + "application/json": { + "example": { + "task_started": True, + "task_id": "a1b2c3d4", + "message": "翻译任务已成功启动,请稍候...", + } + } + }, }, 400: {"description": "请求体无效,例如Base64解码失败。"}, - 429: {"description": "服务器已有一个同ID的任务在处理中(理论上不应发生,因为ID是新生成的)。"}, + 429: { + "description": "服务器已有一个同ID的任务在处理中(理论上不应发生,因为ID是新生成的)。" + }, 500: {"description": "启动后台任务时发生未知错误。"}, - } + }, ) -async def service_translate(request: TranslateServiceRequest = Body(..., description="翻译任务的详细参数和文件内容。")): +async def service_translate( + request: TranslateServiceRequest = Body( + ..., description="翻译任务的详细参数和文件内容。" + ) +): task_id = uuid.uuid4().hex[:8] try: @@ -1103,21 +1552,27 @@ async def service_translate(request: TranslateServiceRequest = Body(..., descrip task_id=task_id, payload=request.payload, file_contents=file_contents, - original_filename=request.file_name + original_filename=request.file_name, ) return JSONResponse(content=response_data) except HTTPException as e: if e.status_code == 429: - return JSONResponse(status_code=e.status_code, content={"task_started": False, "message": e.detail}) + return JSONResponse( + status_code=e.status_code, + content={"task_started": False, "message": e.detail}, + ) if e.status_code == 500: - return JSONResponse(status_code=e.status_code, content={"task_started": False, "message": e.detail}) + return JSONResponse( + status_code=e.status_code, + content={"task_started": False, "message": e.detail}, + ) raise e @service_router.post( "/cancel/{task_id}", summary="取消翻译任务", - description="""根据任务ID取消一个正在进行中的翻译任务。如果任务已经完成、失败或已经被取消,则会返回错误。""" + description="""根据任务ID取消一个正在进行中的翻译任务。如果任务已经完成、失败或已经被取消,则会返回错误。""", ) async def service_cancel_translate(task_id: str): return _cancel_translation_logic(task_id) @@ -1126,14 +1581,21 @@ async def service_cancel_translate(task_id: str): @service_router.post( "/release/{task_id}", summary="释放任务资源", - description="""根据任务ID释放其在服务器上占用的所有资源,包括状态、日志和缓存的翻译结果文件。如果任务正在进行,会先尝试取消该任务。此操作不可逆。""" + description="""根据任务ID释放其在服务器上占用的所有资源,包括状态、日志和缓存的翻译结果文件。如果任务正在进行,会先尝试取消该任务。此操作不可逆。""", ) async def service_release_task(task_id: str): if task_id not in tasks_state: - return JSONResponse(status_code=404, content={"released": False, "message": f"找不到任务ID '{task_id}'。"}) + return JSONResponse( + status_code=404, + content={"released": False, "message": f"找不到任务ID '{task_id}'。"}, + ) task_state = tasks_state.get(task_id) message_parts = [] - if task_state and task_state.get("is_processing") and task_state.get("current_task_ref"): + if ( + task_state + and task_state.get("is_processing") + and task_state.get("current_task_ref") + ): try: print(f"[{task_id}] 任务正在进行中,将在释放前尝试取消。") _cancel_translation_logic(task_id) @@ -1174,47 +1636,59 @@ async def service_release_task(task_id: str): "processing": { "summary": "进行中", "value": { - "task_id": "a1b2c3d4", "is_processing": True, + "task_id": "a1b2c3d4", + "is_processing": True, "status_message": "正在处理 'annual_report.pdf'...", - "error_flag": False, "download_ready": False, "original_filename_stem": "annual_report", - "original_filename": "annual_report.pdf", "task_start_time": 1678889400.0, - "task_end_time": 0, "downloads": {}, "attachment": {} - } + "error_flag": False, + "download_ready": False, + "original_filename_stem": "annual_report", + "original_filename": "annual_report.pdf", + "task_start_time": 1678889400.0, + "task_end_time": 0, + "downloads": {}, + "attachment": {}, + }, }, "completed_markdown": { "summary": "已完成 (Markdown)", "value": { - "task_id": "b2865b93", "is_processing": False, + "task_id": "b2865b93", + "is_processing": False, "status_message": "翻译成功!用时 123.45 秒。", - "error_flag": False, "download_ready": True, "original_filename_stem": "my_paper", - "original_filename": "my_paper.pdf", "task_start_time": 1678889400.123, + "error_flag": False, + "download_ready": True, + "original_filename_stem": "my_paper", + "original_filename": "my_paper.pdf", + "task_start_time": 1678889400.123, "task_end_time": 1678889523.573, "downloads": { "html": "/service/download/b2865b93/html", "markdown": "/service/download/b2865b93/markdown", - "markdown_zip": "/service/download/b2865b93/markdown_zip" + "markdown_zip": "/service/download/b2865b93/markdown_zip", }, - "attachment": {} - } + "attachment": {}, + }, }, "completed_with_attachment": { "summary": "已完成 (带附件)", "value": { - "task_id": "g1h2i3j4", "is_processing": False, + "task_id": "g1h2i3j4", + "is_processing": False, "status_message": "翻译成功!用时 125.00 秒。", - "error_flag": False, "download_ready": True, + "error_flag": False, + "download_ready": True, "original_filename_stem": "complex_document", "original_filename": "complex_document.docx", "task_start_time": 1678891000.0, "task_end_time": 1678891125.0, "downloads": { "docx": "/service/download/g1h2i3j4/docx", - "html": "/service/download/g1h2i3j4/html" + "html": "/service/download/g1h2i3j4/html", }, "attachment": { "glossary": "/service/attachment/g1h2i3j4/glossary" - } - } + }, + }, }, "completed_xlsx": { "summary": "已完成 (XLSX)", @@ -1231,93 +1705,118 @@ async def service_release_task(task_id: str): "downloads": { "xlsx": "/service/download/d7e8f9a0/xlsx", "csv": "/service/download/d7e8f9a0/csv", - "html": "/service/download/d7e8f9a0/html" + "html": "/service/download/d7e8f9a0/html", }, - "attachment": {} - } + "attachment": {}, + }, }, "completed_docx": { "summary": "已完成 (DOCX)", "value": { - "task_id": "f8a9c1b2", "is_processing": False, + "task_id": "f8a9c1b2", + "is_processing": False, "status_message": "翻译成功!用时 25.10 秒。", - "error_flag": False, "download_ready": True, "original_filename_stem": "contract", - "original_filename": "contract.docx", "task_start_time": 1678889500.123, + "error_flag": False, + "download_ready": True, + "original_filename_stem": "contract", + "original_filename": "contract.docx", + "task_start_time": 1678889500.123, "task_end_time": 1678889525.223, "downloads": { "docx": "/service/download/f8a9c1b2/docx", - "html": "/service/download/f8a9c1b2/html" + "html": "/service/download/f8a9c1b2/html", }, - "attachment": {} - } + "attachment": {}, + }, }, "completed_epub": { "summary": "已完成 (EPUB)", "value": { - "task_id": "e9b8d7c6", "is_processing": False, + "task_id": "e9b8d7c6", + "is_processing": False, "status_message": "翻译成功!用时 45.32 秒。", - "error_flag": False, "download_ready": True, "original_filename_stem": "my_book", - "original_filename": "my_book.epub", "task_start_time": 1678890000.0, + "error_flag": False, + "download_ready": True, + "original_filename_stem": "my_book", + "original_filename": "my_book.epub", + "task_start_time": 1678890000.0, "task_end_time": 1678890045.32, "downloads": { "epub": "/service/download/e9b8d7c6/epub", - "html": "/service/download/e9b8d7c6/html" + "html": "/service/download/e9b8d7c6/html", }, - "attachment": {} - } + "attachment": {}, + }, }, # --- HTML STATUS EXAMPLE START --- "completed_html": { "summary": "已完成 (HTML)", "value": { - "task_id": "a1b2c3d4", "is_processing": False, + "task_id": "a1b2c3d4", + "is_processing": False, "status_message": "翻译成功!用时 15.78 秒。", - "error_flag": False, "download_ready": True, "original_filename_stem": "about_us", - "original_filename": "about_us.html", "task_start_time": 1678890100.0, + "error_flag": False, + "download_ready": True, + "original_filename_stem": "about_us", + "original_filename": "about_us.html", + "task_start_time": 1678890100.0, "task_end_time": 1678890115.78, "downloads": { "html": "/service/download/a1b2c3d4/html" }, - "attachment": {} - } + "attachment": {}, + }, }, # --- HTML STATUS EXAMPLE END --- # --- ASS STATUS EXAMPLE START --- "completed_ass": { "summary": "已完成 (ASS)", "value": { - "task_id": "a1b2c3d5", "is_processing": False, + "task_id": "a1b2c3d5", + "is_processing": False, "status_message": "翻译成功!用时 12.34 秒。", - "error_flag": False, "download_ready": True, "original_filename_stem": "dialogue", - "original_filename": "dialogue.ass", "task_start_time": 1678890200.0, + "error_flag": False, + "download_ready": True, + "original_filename_stem": "dialogue", + "original_filename": "dialogue.ass", + "task_start_time": 1678890200.0, "task_end_time": 1678890212.34, "downloads": { "ass": "/service/download/a1b2c3d5/ass", - "html": "/service/download/a1b2c3d5/html" + "html": "/service/download/a1b2c3d5/html", }, - "attachment": {} - } + "attachment": {}, + }, }, # --- ASS STATUS EXAMPLE END --- "error": { "summary": "失败", "value": { - "task_id": "c3d4e5f6", "is_processing": False, + "task_id": "c3d4e5f6", + "is_processing": False, "status_message": "翻译过程中发生错误: LLM API key is invalid", - "error_flag": True, "download_ready": False, "original_filename_stem": "bad_config", - "original_filename": "bad_config.json", "task_start_time": 1678889600.0, - "task_end_time": 1678889610.0, "downloads": {}, "attachment": {} - } - } + "error_flag": True, + "download_ready": False, + "original_filename_stem": "bad_config", + "original_filename": "bad_config.json", + "task_start_time": 1678889600.0, + "task_end_time": 1678889610.0, + "downloads": {}, + "attachment": {}, + }, + }, } } - } + }, }, 404: {"description": "指定的任务ID不存在。"}, - } + }, ) async def service_get_status( - task_id: str = FastApiPath(..., description="要查询状态的任务的ID", examples=["b2865b93"])): + task_id: str = FastApiPath( + ..., description="要查询状态的任务的ID", examples=["b2865b93"] + ) +): task_state = tasks_state.get(task_id) if not task_state: raise HTTPException(status_code=404, detail=f"找不到任务ID '{task_id}'。") @@ -1332,29 +1831,33 @@ async def service_get_status( for identifier in task_state["attachment_files"].keys(): attachments[identifier] = f"/service/attachment/{task_id}/{identifier}" - return JSONResponse(content={ - "task_id": task_id, - "is_processing": task_state["is_processing"], - "status_message": task_state["status_message"], - "error_flag": task_state["error_flag"], - "download_ready": task_state["download_ready"], - "original_filename_stem": task_state["original_filename_stem"], - "original_filename": task_state.get("original_filename"), - "task_start_time": task_state["task_start_time"], - "task_end_time": task_state["task_end_time"], - "downloads": downloads, - "attachment": attachments - }) + return JSONResponse( + content={ + "task_id": task_id, + "is_processing": task_state["is_processing"], + "status_message": task_state["status_message"], + "error_flag": task_state["error_flag"], + "download_ready": task_state["download_ready"], + "original_filename_stem": task_state["original_filename_stem"], + "original_filename": task_state.get("original_filename"), + "task_start_time": task_state["task_start_time"], + "task_end_time": task_state["task_end_time"], + "downloads": downloads, + "attachment": attachments, + } + ) @service_router.get( "/logs/{task_id}", summary="获取任务增量日志", - description="""以流式方式获取任务的增量日志。客户端每次调用此接口,都会返回自上次调用以来产生的新日志行。这对于实时展示翻译进度非常有用。如果任务ID不存在,则返回404。""" + description="""以流式方式获取任务的增量日志。客户端每次调用此接口,都会返回自上次调用以来产生的新日志行。这对于实时展示翻译进度非常有用。如果任务ID不存在,则返回404。""", ) async def service_get_logs(task_id: str): if task_id not in tasks_log_queues: - raise HTTPException(status_code=404, detail=f"找不到任务ID '{task_id}' 的日志队列。") + raise HTTPException( + status_code=404, detail=f"找不到任务ID '{task_id}' 的日志队列。" + ) log_queue = tasks_log_queues[task_id] new_logs = [] while not log_queue.empty(): @@ -1366,7 +1869,19 @@ async def service_get_logs(task_id: str): return JSONResponse(content={"logs": new_logs}) -FileType = Literal["markdown", "markdown_zip", "html", "txt", "json", "xlsx", "csv", "docx", "srt", "epub", "ass"] +FileType = Literal[ + "markdown", + "markdown_zip", + "html", + "txt", + "json", + "xlsx", + "csv", + "docx", + "srt", + "epub", + "ass", +] @service_router.get( @@ -1383,21 +1898,31 @@ FileType = Literal["markdown", "markdown_zip", "html", "txt", "json", "xlsx", "c "application/zip": {"schema": {"type": "string", "format": "binary"}}, "application/json": {"schema": {"type": "string", "format": "binary"}}, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": { - "schema": {"type": "string", "format": "binary"}}, + "schema": {"type": "string", "format": "binary"} + }, "application/vnd.openxmlformats-officedocument.wordprocessingml.document": { - "schema": {"type": "string", "format": "binary"}}, + "schema": {"type": "string", "format": "binary"} + }, "application/epub+zip": { - "schema": {"type": "string", "format": "binary"}}, - } + "schema": {"type": "string", "format": "binary"} + }, + }, }, - 404: {"description": "任务ID不存在,或该任务不支持所请求的文件类型,或临时文件已丢失。"}, - 500: {"description": "在服务器上读取文件时发生内部错误。"} - } + 404: { + "description": "任务ID不存在,或该任务不支持所请求的文件类型,或临时文件已丢失。" + }, + 500: {"description": "在服务器上读取文件时发生内部错误。"}, + }, ) async def service_download_file( - task_id: str = FastApiPath(..., description="已完成任务的ID", examples=["b2865b93"]), - file_type: FileType = FastApiPath(..., description="要下载的文件类型。", - examples=["html", "json", "csv", "docx", "srt", "epub", "ass"]) + task_id: str = FastApiPath( + ..., description="已完成任务的ID", examples=["b2865b93"] + ), + file_type: FileType = FastApiPath( + ..., + description="要下载的文件类型。", + examples=["html", "json", "csv", "docx", "srt", "epub", "ass"], + ), ): task_state = tasks_state.get(task_id) if not task_state: @@ -1405,8 +1930,10 @@ async def service_download_file( file_info = task_state.get("downloadable_files", {}).get(file_type) if not file_info or not os.path.exists(file_info.get("path")): - raise HTTPException(status_code=404, - detail=f"任务 '{task_id}' 不支持下载 '{file_type}' 类型的文件,或文件已丢失。") + raise HTTPException( + status_code=404, + detail=f"任务 '{task_id}' 不支持下载 '{file_type}' 类型的文件,或文件已丢失。", + ) file_path = file_info["path"] filename = file_info["filename"] @@ -1423,15 +1950,23 @@ async def service_download_file( 200: { "description": "成功返回文件流。文件名通过 Content-Disposition 头指定。", "content": { - "application/octet-stream": {"schema": {"type": "string", "format": "binary"}}, - } + "application/octet-stream": { + "schema": {"type": "string", "format": "binary"} + }, + }, }, - 404: {"description": "任务ID不存在,或该任务没有指定的附件,或临时文件已丢失。"}, - } + 404: { + "description": "任务ID不存在,或该任务没有指定的附件,或临时文件已丢失。" + }, + }, ) async def service_download_attachment( - task_id: str = FastApiPath(..., description="已完成任务的ID", examples=["g1h2i3j4"]), - identifier: str = FastApiPath(..., description="要下载的附件的标识符。", examples=["glossary"]) + task_id: str = FastApiPath( + ..., description="已完成任务的ID", examples=["g1h2i3j4"] + ), + identifier: str = FastApiPath( + ..., description="要下载的附件的标识符。", examples=["glossary"] + ), ): task_state = tasks_state.get(task_id) if not task_state: @@ -1439,8 +1974,10 @@ async def service_download_attachment( attachment_info = task_state.get("attachment_files", {}).get(identifier) if not attachment_info or not os.path.exists(attachment_info.get("path")): - raise HTTPException(status_code=404, - detail=f"任务 '{task_id}' 不存在标识符为 '{identifier}' 的附件,或文件已丢失。") + raise HTTPException( + status_code=404, + detail=f"任务 '{task_id}' 不存在标识符为 '{identifier}' 的附件,或文件已丢失。", + ) file_path = attachment_info["path"] filename = attachment_info["filename"] @@ -1463,41 +2000,52 @@ async def service_download_attachment( responses={ 200: { "description": "成功返回文件内容。", - "content": {"application/json": {"examples": { - "html_base64": { - "summary": "HTML 内容 (Base64)", - "value": { - "file_type": "html", - "filename": "my_doc_translated.html", - "content": "PGh0bWw+PGhlYWQ+..." - } - }, - "docx_base64": { - "summary": "DOCX 内容 (Base64)", - "value": { - "file_type": "docx", - "filename": "my_doc_translated.docx", - "content": "UEsDBBQAAAAIA... (base64-encoded string)" - } - }, - "epub_base64": { - "summary": "EPUB 内容 (Base64)", - "value": { - "file_type": "epub", - "filename": "my_book_translated.epub", - "content": "UEsDBBQAAAAIA... (base64-encoded string)" + "content": { + "application/json": { + "examples": { + "html_base64": { + "summary": "HTML 内容 (Base64)", + "value": { + "file_type": "html", + "filename": "my_doc_translated.html", + "content": "PGh0bWw+PGhlYWQ+...", + }, + }, + "docx_base64": { + "summary": "DOCX 内容 (Base64)", + "value": { + "file_type": "docx", + "filename": "my_doc_translated.docx", + "content": "UEsDBBQAAAAIA... (base64-encoded string)", + }, + }, + "epub_base64": { + "summary": "EPUB 内容 (Base64)", + "value": { + "file_type": "epub", + "filename": "my_book_translated.epub", + "content": "UEsDBBQAAAAIA... (base64-encoded string)", + }, + }, } } - }}} + }, }, - 404: {"description": "任务ID不存在,或该任务不支持所请求的文件类型,或临时文件已丢失。"}, - 500: {"description": "在服务器上读取文件时发生内部错误。"} - } + 404: { + "description": "任务ID不存在,或该任务不支持所请求的文件类型,或临时文件已丢失。" + }, + 500: {"description": "在服务器上读取文件时发生内部错误。"}, + }, ) async def service_content( - task_id: str = FastApiPath(..., description="已完成任务的ID", examples=["b2865b93"]), - file_type: FileType = FastApiPath(..., description="要获取内容的文件类型。", - examples=["html", "json", "csv", "docx", "srt", "epub", "ass"]) + task_id: str = FastApiPath( + ..., description="已完成任务的ID", examples=["b2865b93"] + ), + file_type: FileType = FastApiPath( + ..., + description="要获取内容的文件类型。", + examples=["html", "json", "csv", "docx", "srt", "epub", "ass"], + ), ): task_state = tasks_state.get(task_id) if not task_state: @@ -1505,8 +2053,10 @@ async def service_content( file_info = task_state.get("downloadable_files", {}).get(file_type) if not file_info or not os.path.exists(file_info.get("path")): - raise HTTPException(status_code=404, - detail=f"任务 '{task_id}' 不支持获取 '{file_type}' 类型的内容,或文件已丢失。") + raise HTTPException( + status_code=404, + detail=f"任务 '{task_id}' 不支持获取 '{file_type}' 类型的内容,或文件已丢失。", + ) file_path = file_info["path"] filename = file_info["filename"] @@ -1514,12 +2064,14 @@ async def service_content( try: with open(file_path, "rb") as f: content_bytes = f.read() - final_content = base64.b64encode(content_bytes).decode('utf-8') - return JSONResponse(content={ - "file_type": file_type, - "filename": filename, - "content": final_content - }) + final_content = base64.b64encode(content_bytes).decode("utf-8") + return JSONResponse( + content={ + "file_type": file_type, + "filename": filename, + "content": final_content, + } + ) except Exception as e: raise HTTPException(status_code=500, detail=f"读取文件时发生内部错误: {e}") @@ -1527,40 +2079,58 @@ async def service_content( # =================================================================== # --- 应用主路由和启动 --- # =================================================================== -@service_router.get("/engin-list", tags=["Application"], description="返回正在进行的可用的转换引擎") +@service_router.get( + "/engin-list", tags=["Application"], description="返回正在进行的可用的转换引擎" +) async def service_get_engin_list(): engin_list = ["mineru"] - if DOCLING_EXIST: engin_list.append("docling") + if DOCLING_EXIST: + engin_list.append("docling") return JSONResponse(content=engin_list) -@service_router.get("/task-list", tags=["Application"], description="返回正在进行的task_id列表") -async def service_get_task_list(): return JSONResponse(content=list(tasks_state.keys())) +@service_router.get( + "/task-list", tags=["Application"], description="返回正在进行的task_id列表" +) +async def service_get_task_list(): + return JSONResponse(content=list(tasks_state.keys())) -@service_router.get("/default-params", tags=["Application"], description="返回一些默认参数") -def service_get_default_params(): return JSONResponse(content=default_params) +@service_router.get( + "/default-params", tags=["Application"], description="返回一些默认参数" +) +def service_get_default_params(): + return JSONResponse(content=default_params) @service_router.get("/meta", tags=["Application"], description="返回软件版本号") -async def service_get_app_version(): return JSONResponse(content={"version": __version__}) +async def service_get_app_version(): + return JSONResponse(content={"version": __version__}) @app.get("/", response_class=HTMLResponse, include_in_schema=False) async def main_page(): index_path = Path(STATIC_DIR) / "index.html" - if not index_path.exists(): raise HTTPException(status_code=404, detail="index.html not found") - no_cache_headers = {"Cache-Control": "no-store, no-cache, must-revalidate, max-age=0", "Pragma": "no-cache", - "Expires": "0"} + if not index_path.exists(): + raise HTTPException(status_code=404, detail="index.html not found") + no_cache_headers = { + "Cache-Control": "no-store, no-cache, must-revalidate, max-age=0", + "Pragma": "no-cache", + "Expires": "0", + } return FileResponse(index_path, headers=no_cache_headers) @app.get("/admin", response_class=HTMLResponse, include_in_schema=False) async def main_page_admin(): index_path = Path(STATIC_DIR) / "index.html" - if not index_path.exists(): raise HTTPException(status_code=404, detail="index.html not found") - no_cache_headers = {"Cache-Control": "no-store, no-cache, must-revalidate, max-age=0", "Pragma": "no-cache", - "Expires": "0"} + if not index_path.exists(): + raise HTTPException(status_code=404, detail="index.html not found") + no_cache_headers = { + "Cache-Control": "no-store, no-cache, must-revalidate, max-age=0", + "Pragma": "no-cache", + "Expires": "0", + } return FileResponse(index_path, headers=no_cache_headers) @@ -1591,32 +2161,50 @@ async def redoc_html(): @app.post("/temp/translate", tags=["Temp"]) async def temp_translate( - base_url: str = Body(...), api_key: str = Body(...), model_id: str = Body(...), - mineru_token: Optional[str] = Body(None), file_name: str = Body(...), file_content: str = Body(...), - to_lang: str = Body("中文"), concurrent: int = Body(default_params["concurrent"]), - temperature: float = Body(default_params["temperature"]), - thinking: ThinkingMode = Body(default_params["thinking"]), - chunk_size: int = Body(default_params["chunk_size"]), custom_prompt: Optional[str] = Body(None), - model_version: Literal["pipeline", "vlm"] = Body("vlm"), - glossary_dict: Optional[Dict[str, str]] = Body(None), + base_url: str = Body(...), + api_key: str = Body(...), + model_id: str = Body(...), + mineru_token: Optional[str] = Body(None), + file_name: str = Body(...), + file_content: str = Body(...), + to_lang: str = Body("中文"), + concurrent: int = Body(default_params["concurrent"]), + temperature: float = Body(default_params["temperature"]), + thinking: ThinkingMode = Body(default_params["thinking"]), + chunk_size: int = Body(default_params["chunk_size"]), + custom_prompt: Optional[str] = Body(None), + model_version: Literal["pipeline", "vlm"] = Body("vlm"), + glossary_dict: Optional[Dict[str, str]] = Body(None), ): file_name = Path(file_name) try: decoded_content = base64.b64decode(file_content) except (ValueError, binascii.Error): - decoded_content = file_content.encode('utf-8') + decoded_content = file_content.encode("utf-8") try: workflow_config = MarkdownBasedWorkflowConfig( convert_engine="mineru", - converter_config=ConverterMineruConfig(mineru_token=mineru_token, model_version=model_version), - translator_config=MDTranslatorConfig(base_url=base_url, api_key=api_key, model_id=model_id, - to_lang=to_lang, custom_prompt=custom_prompt, temperature=temperature, - thinking=thinking, chunk_size=chunk_size, concurrent=concurrent, - glossary_dict=glossary_dict), - html_exporter_config=MD2HTMLExporterConfig() + converter_config=ConverterMineruConfig( + mineru_token=mineru_token, model_version=model_version + ), + translator_config=MDTranslatorConfig( + base_url=base_url, + api_key=api_key, + model_id=model_id, + to_lang=to_lang, + custom_prompt=custom_prompt, + temperature=temperature, + thinking=thinking, + chunk_size=chunk_size, + concurrent=concurrent, + glossary_dict=glossary_dict, + ), + html_exporter_config=MD2HTMLExporterConfig(), ) workflow = MarkdownBasedWorkflow(workflow_config) - workflow.read_bytes(content=decoded_content, stem=file_name.stem, suffix=file_name.suffix) + workflow.read_bytes( + content=decoded_content, stem=file_name.stem, suffix=file_name.suffix + ) await workflow.translate_async() return {"success": True, "content": workflow.export_to_markdown()} except Exception as e: @@ -1631,7 +2219,8 @@ def find_free_port(start_port): port = start_port while True: with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: - if sock.connect_ex(('127.0.0.1', port)) != 0: return port + if sock.connect_ex(("127.0.0.1", port)) != 0: + return port port += 1 @@ -1639,7 +2228,8 @@ def run_app(port: int | None = None): initial_port = port or int(os.environ.get("DOCUTRANSLATE_PORT", 8010)) try: port_to_use = find_free_port(initial_port) - if port_to_use != initial_port: print(f"端口 {initial_port} 被占用,将使用端口 {port_to_use} 代替") + if port_to_use != initial_port: + print(f"端口 {initial_port} 被占用,将使用端口 {port_to_use} 代替") print(f"正在启动 DocuTranslate WebUI 版本号:{__version__}") app.state.port_to_use = port_to_use uvicorn.run(app, host=None, port=port_to_use, workers=1) diff --git a/docutranslate/global_values/__init__.py b/docutranslate/global_values/__init__.py index 6104af5..4e7d9c1 100644 --- a/docutranslate/global_values/__init__.py +++ b/docutranslate/global_values/__init__.py @@ -7,4 +7,4 @@ from .conditional_import import available_packages, conditional_import USE_PROXY = True if (os.getenv("DOCUTRANSLATE_PROXY_ENABLED") and os.getenv( "DOCUTRANSLATE_PROXY_ENABLED").lower() == "true") else False if USE_PROXY: - print(f"USE_PROXY:{USE_PROXY}") + print(f"USE_PROXY:{USE_PROXY}") \ No newline at end of file diff --git a/docutranslate/static/index.html b/docutranslate/static/index.html index c47b49d..853b963 100644 --- a/docutranslate/static/index.html +++ b/docutranslate/static/index.html @@ -1 +1 @@ - DocuTranslate - 交互式文档翻译

DocuTranslate

如果上传的文件本身是.md格式,此项可不选。
mineru VLM是更新的内测模型。

Base URL:

选择一个或多个CSV文件。文件需包含'src'和'dst'两列标题,分别代表原文和译文。

GitHub主页(欢迎star❤):
https://github.com/xunbu/docutranslate

交流QQ群: 1047781902

version:

任务列表

LOGO

当前没有任务,点击“新建任务”开始吧!

预览
原文
译文
\ No newline at end of file + DocuTranslate - 交互式文档翻译

DocuTranslate

如果上传的文件本身是.md格式,此项可不选。
mineru VLM是更新的内测模型。

Base URL:

选择一个或多个CSV文件。文件需包含'src'和'dst'两列标题,分别代表原文和译文。

GitHub主页(欢迎star❤):
https://github.com/xunbu/docutranslate

交流QQ群: 1047781902

version:

任务列表

LOGO

当前没有任务,点击“新建任务”开始吧!

预览
原文
译文
\ No newline at end of file diff --git a/docutranslate/translator/ai_translator/base.py b/docutranslate/translator/ai_translator/base.py index f63bcd9..978c575 100644 --- a/docutranslate/translator/ai_translator/base.py +++ b/docutranslate/translator/ai_translator/base.py @@ -12,9 +12,13 @@ from docutranslate.translator.base import Translator, TranslatorConfig @dataclass(kw_only=True) class AiTranslatorConfig(TranslatorConfig, AgentConfig): - base_url: str | None = field(default=None, - metadata={"description": "OpenAI兼容地址,当skip_translate为False时为必填项"}) - model_id: str | None = field(default=None, metadata={"description": "当skip_translate为False时为必填项"}) + base_url: str | None = field( + default=None, + metadata={"description": "OpenAI兼容地址,当skip_translate为False时为必填项"}, + ) + model_id: str | None = field( + default=None, metadata={"description": "当skip_translate为False时为必填项"} + ) to_lang: str = "简体中文" custom_prompt: str | None = None chunk_size: int = 3000 @@ -24,7 +28,7 @@ class AiTranslatorConfig(TranslatorConfig, AgentConfig): skip_translate: bool = False # 当skip_translate为False时base_url、model_id为必填项 -T = TypeVar('T', bound=Document) +T = TypeVar("T", bound=Document) class AiTranslator(Translator[T]): @@ -37,8 +41,12 @@ class AiTranslator(Translator[T]): self.skip_translate = config.skip_translate self.glossary_agent = None self.glossary_dict_gen = None - if not self.skip_translate and (config.base_url is None or config.api_key is None or config.model_id is None): - raise ValueError("skip_translate不为false时,base_url、api_key、model_id为必填项") + if not self.skip_translate and ( + config.base_url is None or config.api_key is None or config.model_id is None + ): + raise ValueError( + "skip_translate不为false时,base_url、api_key、model_id为必填项" + ) if config.glossary_generate_enable: if config.glossary_agent_config: @@ -54,14 +62,13 @@ class AiTranslator(Translator[T]): concurrent=config.concurrent, timeout=config.timeout, logger=self.logger, - retry=config.retry + retry=config.retry, + system_proxy_enable=config.system_proxy_enable, ) self.glossary_agent = GlossaryAgent(glossary_agent_config) @abstractmethod - def translate(self, document: T) -> Document: - ... + def translate(self, document: T) -> Document: ... @abstractmethod - async def translate_async(self, document: T) -> Document: - ... + async def translate_async(self, document: T) -> Document: ... diff --git a/docutranslate/utils/utils.py b/docutranslate/utils/utils.py index 6575c84..0763706 100644 --- a/docutranslate/utils/utils.py +++ b/docutranslate/utils/utils.py @@ -12,3 +12,6 @@ def get_httpx_proxies(): if http_proxy: proxies["http://"] = http_proxy return proxies + +if __name__ == '__main__': + print(get_httpx_proxies()) \ No newline at end of file