diff --git a/.idea/workspace.xml b/.idea/workspace.xml index fd2a3c0..eef37cb 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -5,9 +5,17 @@ + - + + + + + + + + @@ -33,59 +41,59 @@ - { - "keyToString": { - "DefaultHtmlFileTemplate": "HTML File", - "JavaScript 调试.output.html (1).executor": "Run", - "JavaScript 调试.output.html.executor": "Run", - "JavaScript 调试.regex.md_中文.html.executor": "Run", - "JavaScript 调试.regex_中文.html.executor": "Run", - "JavaScript 调试.test2.html.executor": "Run", - "JavaScript 调试.test2_英文.html.executor": "Run", - "JavaScript 调试.test4-1_中文.html.executor": "Run", - "JavaScript 调试.互联网认证授权机制.html.executor": "Run", - "JavaScript 调试.互联网认证授权机制_英文.html.executor": "Run", - "JavaScript 调试.毕业论文_英文.html.executor": "Run", - "ModuleVcsDetector.initialDetectionPerformed": "true", - "Python 测试.Python 测试 (markdown_mask.py 内).executor": "Run", - "Python 测试.markdown_mask.Test.test_basic_link_masking 的 Python 测试.executor": "Run", - "Python.PDFtranslater (1).executor": "Run", - "Python.PDFtranslater (2).executor": "Run", - "Python.agent.executor": "Debug", - "Python.agent_utils.executor": "Run", - "Python.app (1).executor": "Run", - "Python.app.executor": "Run", - "Python.app2.executor": "Run", - "Python.app_test (1).executor": "Run", - "Python.convert.executor": "Run", - "Python.markdown_splitter.executor": "Debug", - "Python.markdown_utils.executor": "Run", - "Python.test.executor": "Run", - "Python.test1.executor": "Run", - "Python.test2.executor": "Run", - "Python.test3.executor": "Run", - "Python.test4.executor": "Run", - "Python.translater.executor": "Run", - "Python.切分测试.executor": "Run", - "RunOnceActivity.ShowReadmeOnStart": "true", - "RunOnceActivity.TerminalTabsStorage.copyFrom.TerminalArrangementManager": "true", - "RunOnceActivity.git.unshallow": "true", - "git-widget-placeholder": "main", - "last_opened_file_path": "C:/Users/jxgm/Desktop/FileTranslate/dist/DocuTranslate", - "node.js.detected.package.eslint": "true", - "node.js.detected.package.tslint": "true", - "node.js.selected.package.eslint": "(autodetect)", - "node.js.selected.package.tslint": "(autodetect)", - "nodejs_package_manager_path": "npm", - "settings.editor.selected.configurable": "preferences.pluginManager", - "vue.rearranger.settings.migration": "true" + +}]]> + - @@ -97,7 +105,7 @@ - + @@ -329,29 +337,6 @@ - - - - - - - - - - - - - - - - - - - - - - - @@ -421,6 +406,29 @@ + + + + + + + + + + + + + + + + + + + + + + + @@ -543,11 +551,11 @@ - + + - @@ -616,7 +624,9 @@ - + + + @@ -624,8 +634,8 @@ - - + + @@ -645,7 +655,7 @@ - + diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..671b116 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 International Business Machines + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/docutranslate/agents/__init__.py b/docutranslate/agents/__init__.py index c34968a..d918a04 100644 --- a/docutranslate/agents/__init__.py +++ b/docutranslate/agents/__init__.py @@ -1,2 +1,2 @@ -from .agent_async import Agent, AgentArgs +from .agent import Agent, AgentArgs from .markdown_agent import MDRefineAgent, MDTranslateAgent diff --git a/docutranslate/agents/agent_async.py b/docutranslate/agents/agent.py similarity index 62% rename from docutranslate/agents/agent_async.py rename to docutranslate/agents/agent.py index 73a9e3f..24a9e89 100644 --- a/docutranslate/agents/agent_async.py +++ b/docutranslate/agents/agent.py @@ -1,4 +1,6 @@ import asyncio +from concurrent.futures import ThreadPoolExecutor +from threading import Lock from typing import TypedDict import httpx @@ -16,17 +18,32 @@ class AgentArgs(TypedDict, total=False): timeout: int +# 仅使用多线程时用以计数 +class PromptsCount: + def __init__(self, total: int): + self.lock = Lock() + self.count = 0 + self.total = total + + def add(self): + self.lock.acquire() + self.count += 1 + translater_logger.info(f"多线程-已完成:{self.count}/{self.total}") + self.lock.release() + + TIMEOUT = 500 class Agent: def __init__(self, baseurl: str = "", key: str = "xx", model_id: str = "", system_prompt: str = "", temperature=0.7, - max_concurrent=6, timeout: int = TIMEOUT): + max_concurrent=15, timeout: int = TIMEOUT): self.baseurl = baseurl.strip() self.key = key.strip() self.model_id = model_id.strip() self.system_prompt = system_prompt self.temperature = temperature + self.client = httpx.Client() self.client_async = httpx.AsyncClient() self.max_concurrent = max_concurrent self.timeout = timeout @@ -65,12 +82,6 @@ class Agent: ) response.raise_for_status() result = response.json()["choices"][0]["message"]["content"] - # pattern = r".*【SSS】(.*)" - # match = re.search(pattern, result, re.DOTALL) - # if match is None: - # print("检测开头`【SSS】`失败") - # else: - # result = match.group(1) return result except httpx.HTTPStatusError as e: raise Exception(f"AI请求错误 (async): {e.response.status_code} - {e.response.text}") from e @@ -83,8 +94,9 @@ class Agent: self, prompts: list[str], system_prompt: str | None = None, - max_concurrent: int = 5 # 新增参数,默认并发数为5 + max_concurrent: int | None = None # 新增参数,默认并发数为5 ) -> list[str]: + max_concurrent = self.max_concurrent if max_concurrent is None else max_concurrent total = len(prompts) count = 0 semaphore = asyncio.Semaphore(max_concurrent) @@ -109,14 +121,48 @@ class Agent: results = await asyncio.gather(*tasks, return_exceptions=False) return results + def send(self, prompt: str, system_prompt: None | str = None) -> str: + if system_prompt is None: + system_prompt = self.system_prompt + + """Sends a single prompt asynchronously.""" + headers, data = self._prepare_request_data(prompt, system_prompt) + if self.baseurl.endswith("/"): + self.baseurl = self.baseurl[:-1] + try: + response = self.client.post( + f"{self.baseurl}/chat/completions", + json=data, + headers=headers, + timeout=self.timeout + ) + response.raise_for_status() + result = response.json()["choices"][0]["message"]["content"] + return result + except httpx.HTTPStatusError as e: + raise Exception(f"AI请求错误 (async): {e.response.status_code} - {e.response.text}") from e + except httpx.RequestError as e: + raise Exception(f"AI请求连接错误 (async): {e}") from e + except (KeyError, IndexError) as e: + raise Exception(f"AI响应格式错误 (async): {e}") from e + + def _send_prompt_count(self, prompt: str, system_prompt: None | str, count: PromptsCount) -> str: + result = self.send(prompt, system_prompt) + count.add() + return result + def send_prompts( self, prompts: list[str], system_prompt: str | None = None, ) -> list[str]: - - result = asyncio.run(self.send_prompts_async(prompts, system_prompt, self.max_concurrent)) - return result + system_prompts = [system_prompt] * len(prompts) + counts = [PromptsCount(len(prompts))] * len(prompts) + output_list = [] + with ThreadPoolExecutor(max_workers=self.max_concurrent) as executor: + results_iterator = executor.map(self._send_prompt_count, prompts, system_prompts, counts) + output_list = list(results_iterator) + return output_list if __name__ == '__main__': diff --git a/docutranslate/agents/agent_sync.py b/docutranslate/agents/agent_sync.py deleted file mode 100644 index 72ef583..0000000 --- a/docutranslate/agents/agent_sync.py +++ /dev/null @@ -1,89 +0,0 @@ -from typing import TypedDict -from docutranslate.logger import translater_logger -import httpx - - -class AgentArgs(TypedDict, total=False): - baseurl: str - key: str - model_id: str - system_prompt: str - temperature: float - max_concurrent: int - timeout: int - - -TIMEOUT = 500 - - - -class Agent: - def __init__(self, baseurl: str = "", key: str = "xx", model_id: str = "", system_prompt: str = "", temperature=0.7, - max_concurrent=6, timeout: int = TIMEOUT): - self.baseurl = baseurl.strip() - self.key = key.strip() - self.model_id = model_id.strip() - self.system_prompt = system_prompt - self.temperature = temperature - self.client = httpx.Client() - self.max_concurrent = max_concurrent - self.timeout = timeout - - def _prepare_request_data(self, prompt: str, system_prompt: str, temperature=None, top_p=0.9): - if temperature is None: - temperature = self.temperature - headers = {"Content-Type": "application/json", - "Authorization": f"Bearer {self.key}"} - data = { - "model": self.model_id, - "messages": [ - {"role": "system", "content": system_prompt}, - # {"role": "system", "content": "所有回复必须以【SSS】开头(这是最高规则,适用于之后的所有例子)。示例:【SSS】这是示例回答\n"+system_prompt}, - {"role": "user", "content": prompt} - ], - "temperature": temperature, - "top_p": top_p - } - return headers, data - - def send(self, prompt: str, system_prompt: None | str = None) -> str: - if system_prompt is None: - system_prompt = self.system_prompt - - """Sends a single prompt asynchronously.""" - headers, data = self._prepare_request_data(prompt, system_prompt) - if self.baseurl.endswith("/"): - self.baseurl = self.baseurl[:-1] - try: - response = self.client.post( - f"{self.baseurl}/chat/completions", - json=data, - headers=headers, - timeout=self.timeout - ) - response.raise_for_status() - result = response.json()["choices"][0]["message"]["content"] - return result - except httpx.HTTPStatusError as e: - raise Exception(f"AI请求错误 (async): {e.response.status_code} - {e.response.text}") from e - except httpx.RequestError as e: - raise Exception(f"AI请求连接错误 (async): {e}") from e - except (KeyError, IndexError) as e: - raise Exception(f"AI响应格式错误 (async): {e}") from e - - - def send_prompts( - self, - prompts: list[str], - system_prompt: str | None = None, - ) -> list[str]: - result=[] - for prompt in prompts: - result.append(self.send(prompt,system_prompt)) - translater_logger.info(f"单线程-已完成{len(result)}/{len(prompts)}") - return result - - - -if __name__ == '__main__': - pass diff --git a/docutranslate/agents/agent_thread.py b/docutranslate/agents/agent_thread.py deleted file mode 100644 index d8ed1df..0000000 --- a/docutranslate/agents/agent_thread.py +++ /dev/null @@ -1,109 +0,0 @@ -from concurrent.futures import ThreadPoolExecutor -from threading import Lock -from typing import TypedDict -from docutranslate.logger import translater_logger -import httpx - - -class AgentArgs(TypedDict, total=False): - baseurl: str - key: str - model_id: str - system_prompt: str - temperature: float - max_concurrent: int - timeout: int - - -TIMEOUT = 500 - - -class PromptsCount(): - def __init__(self,max:int): - self.lock=Lock() - self.count=0 - self.max=max - - def add(self): - self.lock.acquire() - self.count+=1 - translater_logger.info(f"多线程-已完成:{self.count}/{self.max}") - self.lock.release() - - -class Agent: - def __init__(self, baseurl: str = "", key: str = "xx", model_id: str = "", system_prompt: str = "", temperature=0.7, - max_concurrent=6, timeout: int = TIMEOUT): - self.baseurl = baseurl.strip() - self.key = key.strip() - self.model_id = model_id.strip() - self.system_prompt = system_prompt - self.temperature = temperature - self.client = httpx.Client() - self.max_concurrent = max_concurrent - self.timeout = timeout - - def _prepare_request_data(self, prompt: str, system_prompt: str, temperature=None, top_p=0.9): - if temperature is None: - temperature = self.temperature - headers = {"Content-Type": "application/json", - "Authorization": f"Bearer {self.key}"} - data = { - "model": self.model_id, - "messages": [ - {"role": "system", "content": system_prompt}, - # {"role": "system", "content": "所有回复必须以【SSS】开头(这是最高规则,适用于之后的所有例子)。示例:【SSS】这是示例回答\n"+system_prompt}, - {"role": "user", "content": prompt} - ], - "temperature": temperature, - "top_p": top_p - } - return headers, data - - def send(self, prompt: str, system_prompt: None | str = None) -> str: - if system_prompt is None: - system_prompt = self.system_prompt - - """Sends a single prompt asynchronously.""" - headers, data = self._prepare_request_data(prompt, system_prompt) - if self.baseurl.endswith("/"): - self.baseurl = self.baseurl[:-1] - try: - response = self.client.post( - f"{self.baseurl}/chat/completions", - json=data, - headers=headers, - timeout=self.timeout - ) - response.raise_for_status() - result = response.json()["choices"][0]["message"]["content"] - return result - except httpx.HTTPStatusError as e: - raise Exception(f"AI请求错误 (async): {e.response.status_code} - {e.response.text}") from e - except httpx.RequestError as e: - raise Exception(f"AI请求连接错误 (async): {e}") from e - except (KeyError, IndexError) as e: - raise Exception(f"AI响应格式错误 (async): {e}") from e - - def _send_prompt_count(self,prompt: str, system_prompt:None | str,count:PromptsCount)->str: - result=self.send(prompt,system_prompt) - count.add() - return result - - - def send_prompts( - self, - prompts: list[str], - system_prompt: str | None = None, - ) -> list[str]: - system_prompts = [system_prompt] * len(prompts) - counts=[PromptsCount(len(prompts))]* len(prompts) - output_list = [] - with ThreadPoolExecutor(max_workers=self.max_concurrent) as executor: - results_iterator = executor.map(self._send_prompt_count, prompts, system_prompts,counts) - output_list = list(results_iterator) - return output_list - - -if __name__ == '__main__': - pass diff --git a/docutranslate/agents/markdown_agent.py b/docutranslate/agents/markdown_agent.py index bc65882..54f12ad 100644 --- a/docutranslate/agents/markdown_agent.py +++ b/docutranslate/agents/markdown_agent.py @@ -1,6 +1,6 @@ from typing import Unpack -from .agent_async import Agent, AgentArgs +from .agent import (Agent, AgentArgs) class MDRefineAgent(Agent): def __init__(self,**kwargs:Unpack[AgentArgs]): @@ -19,7 +19,7 @@ class MDRefineAgent(Agent): 形如的占位符不要改变 code、latex和HTML保持结构 所有公式(包括短公式)都应该是latex公式 -修复不正确的latex公式,要用$正确包裹以构造合法latex表达式 +修复不正确的latex公式,行内公式要用$正确包裹以构造合法latex表达式 # 输出 修正后的markdown纯文本(不是markdown代码块) # 示例 @@ -29,11 +29,13 @@ code、latex和HTML保持结构 你叫 输出: 你叫什么名字 -## 去掉异常字词与修正公式(优先使用$包裹) +## 去掉异常字词与修正公式(行内公式使用$包裹) 输入: 一道\题@#目:c_0+1=2,\(c 0\)等于几 +{c_0,c_1,c^2}是一个集合 输出: 一道题目:$c_0+1=2$,$c_0$等于几 +{$c_0$,$c_1$,$c^2$}是一个集合 \no_think""" @@ -53,7 +55,7 @@ class MDTranslateAgent(Agent): 引用的参考文献和其作者不要翻译 形如的占位符不要改变 code、latex和HTML只翻译说明文字,其余保持原文 -公式必须表示为合法的latex公式,且被$正确包裹 +公式必须表示为合法的latex公式,行内公式需被$正确包裹 # 输出 翻译后的markdown纯文本(不是markdown代码块) # 示例 @@ -62,11 +64,13 @@ code、latex和HTML只翻译说明文字,其余保持原文 hello, what's your name? 输出: 你好,你叫什么名字? -## 公式要为合法latex(优先使用$包裹) +## 公式要为合法latex(行内公式使用$包裹) 输入: -c_0+1=2 +The equation is E=mc 2. This is famous. +{{c_0,c_1,c^2}}is a set. 输出: -$c_0+1=2$ +这个方程是 $E=mc^2$。这很有名。 +{{$c_0$,$c_1$,$c^2$}}是一个集合。 ## 引用的参考文献要保持原文不要翻译 输入:【假设目标语言为中文】 [2] M. Castro, B. Liskov, et al. Practical byzantine fault tolerance. In OSDI, diff --git a/docutranslate/app.py b/docutranslate/app.py index 8f1759b..1945701 100644 --- a/docutranslate/app.py +++ b/docutranslate/app.py @@ -2,9 +2,9 @@ import asyncio import io import logging import time -from pathlib import Path -from typing import AsyncGenerator, List, Dict, Any import traceback +from pathlib import Path +from typing import List, Dict, Any import uvicorn from fastapi import FastAPI, File, Form, UploadFile, Request, HTTPException, BackgroundTasks @@ -14,6 +14,641 @@ from fastapi.templating import Jinja2Templates from docutranslate import FileTranslater from docutranslate.logger import translater_logger +# --- HTML模板 (JS part needs modification) --- +# language=HTML +HTML_TEMPLATE = """ + + + + + + DocuTranslate + + + + + + + DocuTranslate + + + + API 配置 + + + AI 平台 + + 自定义接口 + OpenAI + 智谱AI + DeepSeek + 阿里云百炼 + + DMXAPI + OpenRouter + 火山引擎 + 硅基流动 + + + + API 地址 (Base URL) + + + + + API 密钥 + + + + 模型 ID + + + + + 文档选择 + + + + + 目标语言 + + 中文 (Chinese) + 英文 (English) + 日语 (Japanese) + 韩语 (Korean) + 法语 (French) + 德语 (German) + 西班牙语 (Spanish) + 意大利语 (Italian) + 葡萄牙语 (Portuguese) + 俄语 (Russian) + 阿拉伯语 (Arabic) + 印地语 (Hindi) + + + + 高级选项 + + 公式识别 + 代码识别 + 修正文本(耗时,有概率修复文本流和公式识别错误) + + + + 开始翻译 + + + + + 翻译结果 + 下载 Markdown + 下载 HTML + 下载 PDF + 预览 + + + 运行日志 + + + + + × + HTML 预览 + + + 打印/保存为PDF + 关闭 + + + + + + + + \ + """ + app = FastAPI() # --- 全局配置 --- @@ -71,410 +706,6 @@ async def startup_event(): translater_logger.info("应用启动完成,日志队列/历史处理器已配置。") -# --- HTML模板 (JS part needs modification) --- -# language=HTML -HTML_TEMPLATE = """ - - - - - - DocuTranslate - - - - - - - DocuTranslate - - - - API 配置 - - - AI 平台 - - 自定义接口 - OpenAI - 智谱AI - DeepSeek - 阿里云百炼 - DMXAPI - OpenRouter - 火山引擎 - 硅基流动 - - - - API 地址 (Base URL) - - - - - API 密钥 - - - - 模型 ID - - - - - 文档选择 - - - - - 目标语言 - - 中文 (Chinese) - 英文 (English) - 日语 (Japanese) - 韩语 (Korean) - 法语 (French) - 德语 (German) - 西班牙语 (Spanish) - 意大利语 (Italian) - 葡萄牙语 (Portuguese) - 俄语 (Russian) - 阿拉伯语 (Arabic) - 印地语 (Hindi) - - - - 高级选项 - - 公式识别 - 代码识别 - 修正文本(耗时) - - - - 开始翻译 - - - - - 翻译结果 - 下载 Markdown - 下载 HTML - 下载 PDF - 预览 - - - 运行日志 - - - - - × - HTML 预览 - - - 打印/保存为PDF - 关闭 - - - - - - - - -""" - - # --- Background Task Logic --- async def _perform_translation(params: Dict[str, Any], file_contents: bytes, original_filename: str): start_time = time.time() @@ -503,7 +734,8 @@ async def _perform_translation(params: Dict[str, Any], file_contents: bytes, ori try: translater_logger.info(f"使用 Base URL: {params['base_url']}, Model: {params['model_id']}") translater_logger.info(f"文件大小: {len(file_contents)} 字节。目标语言: {params['to_lang']}") - translater_logger.info(f"选项 - 公式: {params['formula_ocr']}, 代码: {params['code_ocr']}, 修正: {params['refine_markdown']}") + translater_logger.info( + f"选项 - 公式: {params['formula_ocr']}, 代码: {params['code_ocr']}, 修正: {params['refine_markdown']}") ft = FileTranslater( base_url=params['base_url'], @@ -511,8 +743,7 @@ async def _perform_translation(params: Dict[str, Any], file_contents: bytes, ori model_id=params['model_id'], tips=False ) - await asyncio.to_thread( - ft.translate_bytes, + await ft.translate_bytes_async( name=original_filename, file=file_contents, to_lang=params['to_lang'], @@ -521,6 +752,16 @@ async def _perform_translation(params: Dict[str, Any], file_contents: bytes, ori refine=params['refine_markdown'], save=False ) + # await asyncio.to_thread( + # ft.translate_bytes, + # name=original_filename, + # file=file_contents, + # to_lang=params['to_lang'], + # formula=params['formula_ocr'], + # code=params['code_ocr'], + # refine=params['refine_markdown'], + # save=False + # ) md_content = ft.export_to_markdown() html_content = ft.export_to_html(title=file_stem) end_time = time.time() @@ -625,8 +866,10 @@ async def get_status(): "error_flag": current_state["error_flag"], "download_ready": current_state["download_ready"], "original_filename_stem": current_state["original_filename_stem"], - "markdown_url": f"/download/markdown/{current_state['original_filename_stem']}_translated.md" if current_state["download_ready"] else None, - "html_url": f"/download/html/{current_state['original_filename_stem']}_translated.html" if current_state["download_ready"] else None, + "markdown_url": f"/download/markdown/{current_state['original_filename_stem']}_translated.md" if current_state[ + "download_ready"] else None, + "html_url": f"/download/html/{current_state['original_filename_stem']}_translated.html" if current_state[ + "download_ready"] else None, "task_start_time": current_state["task_start_time"], "task_end_time": current_state["task_end_time"], } @@ -643,11 +886,12 @@ async def get_logs(since: int = 0): @app.get("/download/markdown/{filename_with_ext}") async def download_markdown(filename_with_ext: str): - if not current_state["download_ready"] or not current_state["markdown_content"] or not current_state["original_filename_stem"]: + if not current_state["download_ready"] or not current_state["markdown_content"] or not current_state[ + "original_filename_stem"]: raise HTTPException(status_code=404, detail="Markdown 内容尚未准备好或不可用。") requested_stem = Path(filename_with_ext).stem.replace("_translated", "") if requested_stem != current_state["original_filename_stem"]: - raise HTTPException(status_code=404, detail="请求的文件名与当前结果不符。") + raise HTTPException(status_code=404, detail="请求的文件名与当前结果不符。") actual_filename = f"{current_state['original_filename_stem']}_translated.md" return StreamingResponse( io.StringIO(current_state["markdown_content"]), @@ -658,11 +902,12 @@ async def download_markdown(filename_with_ext: str): @app.get("/download/html/{filename_with_ext}") async def download_html(filename_with_ext: str): - if not current_state["download_ready"] or not current_state["html_content"] or not current_state["original_filename_stem"]: + if not current_state["download_ready"] or not current_state["html_content"] or not current_state[ + "original_filename_stem"]: raise HTTPException(status_code=404, detail="HTML 内容尚未准备好或不可用。") requested_stem = Path(filename_with_ext).stem.replace("_translated", "") if requested_stem != current_state["original_filename_stem"]: - raise HTTPException(status_code=404, detail="请求的文件名与当前结果不符。") + raise HTTPException(status_code=404, detail="请求的文件名与当前结果不符。") actual_filename = f"{current_state['original_filename_stem']}_translated.html" return HTMLResponse( content=current_state["html_content"], @@ -670,10 +915,12 @@ async def download_html(filename_with_ext: str): headers={"Content-Disposition": f"attachment; filename=\"{actual_filename}\""} ) + def run_app(): print("正在启动 DocuTranslate") print("请访问 http://127.0.0.1:8010") uvicorn.run(app, host="127.0.0.1", port=8010, workers=1) + if __name__ == "__main__": - run_app() \ No newline at end of file + run_app() diff --git a/docutranslate/translater.py b/docutranslate/translater.py index 26f0eb5..976df70 100644 --- a/docutranslate/translater.py +++ b/docutranslate/translater.py @@ -1,3 +1,4 @@ +import asyncio from io import BytesIO from pathlib import Path from typing import Literal @@ -15,7 +16,7 @@ from docutranslate.logger import translater_logger class FileTranslater: - def __init__(self, file_path: Path | str | None = None, chunksize: int = 3500, base_url="", key=None, + def __init__(self, file_path: Path | str | None = None, chunksize: int = 2000, base_url="", key=None, model_id="", temperature=0.7, max_concurrent=15, docling_artifact: Path | str | None = None, timeout=2000, tips=True): if isinstance(file_path, str): @@ -145,6 +146,31 @@ class FileTranslater: translater_logger.info("翻译完成") return self.markdown + + async def refine_markdown_by_agent_async(self, refine_agent: Agent | None = None) -> str: + translater_logger.info("正在修正markdown") + self._mask_uris_in_markdown() + chuncks = self._split_markdown_into_chunks() + if refine_agent is None: + refine_agent = MDRefineAgent(**self.default_agent_params()) + result: list[str] = await refine_agent.send_prompts_async(chuncks) + self.markdown = "\n\n".join(result) + self._unmask_uris_in_markdown() + translater_logger.info("markdown已修正") + return self.markdown + + async def translate_markdown_by_agent_async(self, translate_agent: Agent | None = None, to_lang="中文"): + translater_logger.info("正在翻译markdown") + self._mask_uris_in_markdown() + chuncks = self._split_markdown_into_chunks() + if translate_agent is None: + translate_agent = MDTranslateAgent(to_lang=to_lang, **self.default_agent_params()) + result: list[str] = await translate_agent.send_prompts_async(chuncks) + self.markdown = "\n\n".join(result) + self._unmask_uris_in_markdown() + translater_logger.info("翻译完成") + return self.markdown + def save_as_markdown(self, filename: str | Path | None = None, output_dir: str | Path = "./output"): if isinstance(filename, str): filename = Path(filename) @@ -191,7 +217,7 @@ class FileTranslater: def export_to_html(self, title="title") -> str: markdowner = markdown2.Markdown(extras=['tables', 'fenced-code-blocks', 'mermaid', "code-friendly"]) - + # language=html html = f""" @@ -206,6 +232,7 @@ class FileTranslater: