增加async的完全支持

This commit is contained in:
xunbu
2025-05-16 18:15:12 +08:00
parent 9c45a673f9
commit a1d6725321
11 changed files with 933 additions and 723 deletions

168
.idea/workspace.xml generated
View File

@@ -5,9 +5,17 @@
</component>
<component name="ChangeListManager">
<list default="true" id="6b18b44a-df57-4212-a857-9e291ebe5dd2" name="更改" comment="">
<change afterPath="$PROJECT_DIR$/LICENSE" afterDir="false" />
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/README.md" beforeDir="false" afterPath="$PROJECT_DIR$/README.md" afterDir="false" />
<change beforePath="$PROJECT_DIR$/docutranslate/agents/__init__.py" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/agents/__init__.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/docutranslate/agents/agent_async.py" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/agents/agent.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/docutranslate/agents/agent_sync.py" beforeDir="false" />
<change beforePath="$PROJECT_DIR$/docutranslate/agents/agent_thread.py" beforeDir="false" />
<change beforePath="$PROJECT_DIR$/docutranslate/agents/markdown_agent.py" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/agents/markdown_agent.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/docutranslate/app.py" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/app.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/docutranslate/translater.py" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/translater.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/docutranslate/utils/convert.py" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/utils/convert.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/pyproject.toml" beforeDir="false" afterPath="$PROJECT_DIR$/pyproject.toml" afterDir="false" />
</list>
<option name="SHOW_DIALOG" value="false" />
<option name="HIGHLIGHT_CONFLICTS" value="true" />
@@ -33,59 +41,59 @@
<option name="hideEmptyMiddlePackages" value="true" />
<option name="showLibraryContents" value="true" />
</component>
<component name="PropertiesComponent">{
&quot;keyToString&quot;: {
&quot;DefaultHtmlFileTemplate&quot;: &quot;HTML File&quot;,
&quot;JavaScript 调试.output.html (1).executor&quot;: &quot;Run&quot;,
&quot;JavaScript 调试.output.html.executor&quot;: &quot;Run&quot;,
&quot;JavaScript 调试.regex.md_中文.html.executor&quot;: &quot;Run&quot;,
&quot;JavaScript 调试.regex_中文.html.executor&quot;: &quot;Run&quot;,
&quot;JavaScript 调试.test2.html.executor&quot;: &quot;Run&quot;,
&quot;JavaScript 调试.test2_英文.html.executor&quot;: &quot;Run&quot;,
&quot;JavaScript 调试.test4-1_中文.html.executor&quot;: &quot;Run&quot;,
&quot;JavaScript 调试.互联网认证授权机制.html.executor&quot;: &quot;Run&quot;,
&quot;JavaScript 调试.互联网认证授权机制_英文.html.executor&quot;: &quot;Run&quot;,
&quot;JavaScript 调试.毕业论文_英文.html.executor&quot;: &quot;Run&quot;,
&quot;ModuleVcsDetector.initialDetectionPerformed&quot;: &quot;true&quot;,
&quot;Python 测试.Python 测试 (markdown_mask.py 内).executor&quot;: &quot;Run&quot;,
&quot;Python 测试.markdown_mask.Test.test_basic_link_masking 的 Python 测试.executor&quot;: &quot;Run&quot;,
&quot;Python.PDFtranslater (1).executor&quot;: &quot;Run&quot;,
&quot;Python.PDFtranslater (2).executor&quot;: &quot;Run&quot;,
&quot;Python.agent.executor&quot;: &quot;Debug&quot;,
&quot;Python.agent_utils.executor&quot;: &quot;Run&quot;,
&quot;Python.app (1).executor&quot;: &quot;Run&quot;,
&quot;Python.app.executor&quot;: &quot;Run&quot;,
&quot;Python.app2.executor&quot;: &quot;Run&quot;,
&quot;Python.app_test (1).executor&quot;: &quot;Run&quot;,
&quot;Python.convert.executor&quot;: &quot;Run&quot;,
&quot;Python.markdown_splitter.executor&quot;: &quot;Debug&quot;,
&quot;Python.markdown_utils.executor&quot;: &quot;Run&quot;,
&quot;Python.test.executor&quot;: &quot;Run&quot;,
&quot;Python.test1.executor&quot;: &quot;Run&quot;,
&quot;Python.test2.executor&quot;: &quot;Run&quot;,
&quot;Python.test3.executor&quot;: &quot;Run&quot;,
&quot;Python.test4.executor&quot;: &quot;Run&quot;,
&quot;Python.translater.executor&quot;: &quot;Run&quot;,
&quot;Python.切分测试.executor&quot;: &quot;Run&quot;,
&quot;RunOnceActivity.ShowReadmeOnStart&quot;: &quot;true&quot;,
&quot;RunOnceActivity.TerminalTabsStorage.copyFrom.TerminalArrangementManager&quot;: &quot;true&quot;,
&quot;RunOnceActivity.git.unshallow&quot;: &quot;true&quot;,
&quot;git-widget-placeholder&quot;: &quot;main&quot;,
&quot;last_opened_file_path&quot;: &quot;C:/Users/jxgm/Desktop/FileTranslate/dist/DocuTranslate&quot;,
&quot;node.js.detected.package.eslint&quot;: &quot;true&quot;,
&quot;node.js.detected.package.tslint&quot;: &quot;true&quot;,
&quot;node.js.selected.package.eslint&quot;: &quot;(autodetect)&quot;,
&quot;node.js.selected.package.tslint&quot;: &quot;(autodetect)&quot;,
&quot;nodejs_package_manager_path&quot;: &quot;npm&quot;,
&quot;settings.editor.selected.configurable&quot;: &quot;preferences.pluginManager&quot;,
&quot;vue.rearranger.settings.migration&quot;: &quot;true&quot;
<component name="PropertiesComponent"><![CDATA[{
"keyToString": {
"DefaultHtmlFileTemplate": "HTML File",
"JavaScript 调试.output.html (1).executor": "Run",
"JavaScript 调试.output.html.executor": "Run",
"JavaScript 调试.regex.md_中文.html.executor": "Run",
"JavaScript 调试.regex_中文.html.executor": "Run",
"JavaScript 调试.test2.html.executor": "Run",
"JavaScript 调试.test2_英文.html.executor": "Run",
"JavaScript 调试.test4-1_中文.html.executor": "Run",
"JavaScript 调试.互联网认证授权机制.html.executor": "Run",
"JavaScript 调试.互联网认证授权机制_英文.html.executor": "Run",
"JavaScript 调试.毕业论文_英文.html.executor": "Run",
"ModuleVcsDetector.initialDetectionPerformed": "true",
"Python 测试.Python 测试 (markdown_mask.py 内).executor": "Run",
"Python 测试.markdown_mask.Test.test_basic_link_masking 的 Python 测试.executor": "Run",
"Python.PDFtranslater (1).executor": "Run",
"Python.PDFtranslater (2).executor": "Run",
"Python.agent.executor": "Debug",
"Python.agent_utils.executor": "Run",
"Python.app (1).executor": "Run",
"Python.app.executor": "Run",
"Python.app2.executor": "Run",
"Python.app_test (1).executor": "Run",
"Python.convert.executor": "Run",
"Python.markdown_splitter.executor": "Debug",
"Python.markdown_utils.executor": "Run",
"Python.test.executor": "Run",
"Python.test1.executor": "Run",
"Python.test2.executor": "Run",
"Python.test3.executor": "Run",
"Python.test4.executor": "Run",
"Python.translater.executor": "Run",
"Python.切分测试.executor": "Run",
"RunOnceActivity.ShowReadmeOnStart": "true",
"RunOnceActivity.TerminalTabsStorage.copyFrom.TerminalArrangementManager": "true",
"RunOnceActivity.git.unshallow": "true",
"git-widget-placeholder": "main",
"last_opened_file_path": "C:/Users/jxgm/Desktop/FileTranslate/docutranslate/agents",
"node.js.detected.package.eslint": "true",
"node.js.detected.package.tslint": "true",
"node.js.selected.package.eslint": "(autodetect)",
"node.js.selected.package.tslint": "(autodetect)",
"nodejs_package_manager_path": "npm",
"settings.editor.selected.configurable": "preferences.pluginManager",
"vue.rearranger.settings.migration": "true"
}
}</component>
}]]></component>
<component name="RecentsManager">
<key name="CopyFile.RECENT_KEYS">
<recent name="C:\Users\jxgm\Desktop\FileTranslate\docutranslate\agents" />
<recent name="C:\Users\jxgm\Desktop\FileTranslate\dist\DocuTranslate" />
<recent name="C:\Users\jxgm\Desktop\FileTranslate\dist" />
<recent name="C:\Users\jxgm\Desktop\FileTranslate\docutranslate\agents" />
<recent name="C:\Users\jxgm\Desktop\FileTranslate\dist\app" />
<recent name="C:\Users\jxgm\Desktop\FileTranslate\tests\files" />
</key>
@@ -97,7 +105,7 @@
<recent name="C:\Users\jxgm\Desktop\FileTranslate\tests\resource" />
</key>
</component>
<component name="RunManager" selected="Python.test">
<component name="RunManager" selected="Python.app_test (1)">
<configuration default="true" type="DjangoTestsConfigurationType">
<module name="filetranslate" />
<option name="ENV_FILES" value="" />
@@ -329,29 +337,6 @@
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="app" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="FileTranslate" />
<option name="ENV_FILES" value="" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/docutranslate" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/docutranslate/app.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="app_test (1)" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="FileTranslate" />
<option name="ENV_FILES" value="" />
@@ -421,6 +406,29 @@
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="test2" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="FileTranslate" />
<option name="ENV_FILES" value="" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/tests" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/tests/test2.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="test" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="FileTranslate" />
<option name="ENV_FILES" value="" />
@@ -543,11 +551,11 @@
</configuration>
<recent_temporary>
<list>
<item itemvalue="Python.test" />
<item itemvalue="Python.app_test (1)" />
<item itemvalue="Python.test2" />
<item itemvalue="Python.test" />
<item itemvalue="Python.切分测试" />
<item itemvalue="Python.app_test" />
<item itemvalue="Python.app" />
</list>
</recent_temporary>
</component>
@@ -616,7 +624,9 @@
<workItem from="1747146670281" duration="64000" />
<workItem from="1747185217844" duration="6194000" />
<workItem from="1747297470216" duration="347000" />
<workItem from="1747299661166" duration="1977000" />
<workItem from="1747299661166" duration="4649000" />
<workItem from="1747311432043" duration="2883000" />
<workItem from="1747380029603" duration="10381000" />
</task>
<servers />
</component>
@@ -624,8 +634,8 @@
<option name="version" value="3" />
</component>
<component name="com.intellij.coverage.CoverageDataManagerImpl">
<SUITE FILE_PATH="coverage/filetranslate$app_test__1_.coverage" NAME="app_test (1) 覆盖结果" MODIFIED="1747189112668" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
<SUITE FILE_PATH="coverage/filetranslate$test.coverage" NAME="test 覆盖结果" MODIFIED="1747300796373" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
<SUITE FILE_PATH="coverage/filetranslate$app_test__1_.coverage" NAME="app_test (1) 覆盖结果" MODIFIED="1747390450384" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
<SUITE FILE_PATH="coverage/filetranslate$test.coverage" NAME="test 覆盖结果" MODIFIED="1747301959211" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
<SUITE FILE_PATH="coverage/filetranslate$convert.coverage" NAME="convert 覆盖结果" MODIFIED="1746963490689" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/utils" />
<SUITE FILE_PATH="coverage/PDFtranslate$PDFtranslater__1_.coverage" NAME="PDFtranslater (1) 覆盖结果" MODIFIED="1746633258205" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages" />
<SUITE FILE_PATH="coverage/PDFtranslate$agent_utils.coverage" NAME="agent_utils 覆盖结果" MODIFIED="1746617703678" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages/utils" />
@@ -645,7 +655,7 @@
<SUITE FILE_PATH="coverage/filetranslate$test4.coverage" NAME="test4 覆盖结果" MODIFIED="1746887036353" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
<SUITE FILE_PATH="coverage/PDFtranslate$translater.coverage" NAME="translater 覆盖结果" MODIFIED="1746600434803" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages" />
<SUITE FILE_PATH="coverage/PDFtranslate$markdown_utils.coverage" NAME="markdown_utils 覆盖结果" MODIFIED="1746598797872" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages/utils" />
<SUITE FILE_PATH="coverage/filetranslate$test2.coverage" NAME="test2 覆盖结果" MODIFIED="1747008834523" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
<SUITE FILE_PATH="coverage/filetranslate$test2.coverage" NAME="test2 覆盖结果" MODIFIED="1747383231002" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
<SUITE FILE_PATH="coverage/filetranslate$translater.coverage" NAME="translater 覆盖结果" MODIFIED="1746843159560" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate" />
<SUITE FILE_PATH="coverage/PDFtranslate$.coverage" NAME=" 覆盖结果" MODIFIED="1746588350286" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/decorator" />
</component>

21
LICENSE Normal file
View File

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2024 International Business Machines
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@@ -1,2 +1,2 @@
from .agent_async import Agent, AgentArgs
from .agent import Agent, AgentArgs
from .markdown_agent import MDRefineAgent, MDTranslateAgent

View File

@@ -1,4 +1,6 @@
import asyncio
from concurrent.futures import ThreadPoolExecutor
from threading import Lock
from typing import TypedDict
import httpx
@@ -16,17 +18,32 @@ class AgentArgs(TypedDict, total=False):
timeout: int
# Progress counter used only by the multi-threaded (thread pool) code path.
class PromptsCount:
    """Lock-protected counter that logs how many prompts have completed.

    Attributes:
        lock: ``threading.Lock`` guarding ``count``.
        count: Number of prompts finished so far.
        total: Number of prompts in the batch.
    """

    def __init__(self, total: int):
        """Create a counter for a batch of ``total`` prompts."""
        self.lock = Lock()
        self.count = 0
        self.total = total

    def add(self):
        """Record one finished prompt and log the running total.

        The lock is held through a ``with`` block so it is released even when
        the logging call raises; a bare acquire()/release() pair would leave
        the lock held and block every later caller.
        """
        with self.lock:
            self.count += 1
            translater_logger.info(f"多线程-已完成:{self.count}/{self.total}")
# Default for Agent's ``timeout`` argument; the stored value is passed as the
# ``timeout=`` of each HTTP POST (presumably seconds, per httpx -- confirm).
TIMEOUT = 500
class Agent:
def __init__(self, baseurl: str = "", key: str = "xx", model_id: str = "", system_prompt: str = "", temperature=0.7,
max_concurrent=6, timeout: int = TIMEOUT):
max_concurrent=15, timeout: int = TIMEOUT):
self.baseurl = baseurl.strip()
self.key = key.strip()
self.model_id = model_id.strip()
self.system_prompt = system_prompt
self.temperature = temperature
self.client = httpx.Client()
self.client_async = httpx.AsyncClient()
self.max_concurrent = max_concurrent
self.timeout = timeout
@@ -65,12 +82,6 @@ class Agent:
)
response.raise_for_status()
result = response.json()["choices"][0]["message"]["content"]
# pattern = r".*【SSS】(.*)"
# match = re.search(pattern, result, re.DOTALL)
# if match is None:
# print("检测开头`【SSS】`失败")
# else:
# result = match.group(1)
return result
except httpx.HTTPStatusError as e:
raise Exception(f"AI请求错误 (async): {e.response.status_code} - {e.response.text}") from e
@@ -83,8 +94,9 @@ class Agent:
self,
prompts: list[str],
system_prompt: str | None = None,
max_concurrent: int = 5 # 新增参数默认并发数为5
max_concurrent: int | None = None # 新增参数默认并发数为5
) -> list[str]:
max_concurrent = self.max_concurrent if max_concurrent is None else max_concurrent
total = len(prompts)
count = 0
semaphore = asyncio.Semaphore(max_concurrent)
@@ -109,14 +121,48 @@ class Agent:
results = await asyncio.gather(*tasks, return_exceptions=False)
return results
def send(self, prompt: str, system_prompt: None | str = None) -> str:
    """Send a single prompt with the blocking client and return the reply text.

    Args:
        prompt: Content of the user message.
        system_prompt: Content of the system message; ``self.system_prompt``
            is used when this is ``None``.

    Returns:
        ``choices[0].message.content`` from the JSON response.

    Raises:
        Exception: For an HTTP error status, a request/connection error, or a
            response body that is not JSON or lacks the expected structure.
            The original exception is chained as ``__cause__``.
    """
    if system_prompt is None:
        system_prompt = self.system_prompt
    headers, data = self._prepare_request_data(prompt, system_prompt)
    # Drop one trailing slash so the URL below does not contain "//".
    self.baseurl = self.baseurl.removesuffix("/")
    try:
        response = self.client.post(
            f"{self.baseurl}/chat/completions",
            json=data,
            headers=headers,
            timeout=self.timeout,
        )
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"]
    except httpx.HTTPStatusError as e:
        raise Exception(f"AI请求错误 (sync): {e.response.status_code} - {e.response.text}") from e
    except httpx.RequestError as e:
        raise Exception(f"AI请求连接错误 (sync): {e}") from e
    except (KeyError, IndexError, TypeError, ValueError) as e:
        # ValueError: body is not valid JSON. TypeError: e.g. "choices" is null.
        raise Exception(f"AI响应格式错误 (sync): {e}") from e
def _send_prompt_count(self, prompt: str, system_prompt: None | str, count: PromptsCount) -> str:
    """Send one prompt through ``send`` and then advance the progress counter.

    ``count.add()`` is reached only when ``send`` returned without raising.
    Returns the reply text produced by ``send``.
    """
    reply = self.send(prompt, system_prompt)
    count.add()
    return reply
def send_prompts(
self,
prompts: list[str],
system_prompt: str | None = None,
) -> list[str]:
result = asyncio.run(self.send_prompts_async(prompts, system_prompt, self.max_concurrent))
return result
system_prompts = [system_prompt] * len(prompts)
counts = [PromptsCount(len(prompts))] * len(prompts)
output_list = []
with ThreadPoolExecutor(max_workers=self.max_concurrent) as executor:
results_iterator = executor.map(self._send_prompt_count, prompts, system_prompts, counts)
output_list = list(results_iterator)
return output_list
if __name__ == '__main__':

View File

@@ -1,89 +0,0 @@
from typing import TypedDict
from docutranslate.logger import translater_logger
import httpx
class AgentArgs(TypedDict, total=False):
    """Keyword arguments matching ``Agent.__init__``; every key is optional (``total=False``)."""

    # Base URL; "/chat/completions" is appended to it for each request.
    baseurl: str
    # Credential sent in the "Authorization: Bearer ..." header.
    key: str
    # Value placed in the "model" field of the request body.
    model_id: str
    # Default system-role message used when a call does not pass one.
    system_prompt: str
    # Value placed in the "temperature" field of the request body.
    temperature: float
    # Stored on the agent as ``max_concurrent``.
    max_concurrent: int
    # Passed as ``timeout=`` to the HTTP client on each request.
    timeout: int
# Default for Agent's ``timeout`` argument; the stored value is passed as the
# ``timeout=`` of each HTTP POST (presumably seconds, per httpx -- confirm).
TIMEOUT = 500
class Agent:
    """Blocking client that posts chat prompts to ``{baseurl}/chat/completions``."""

    def __init__(self, baseurl: str = "", key: str = "xx", model_id: str = "", system_prompt: str = "", temperature=0.7,
                 max_concurrent=6, timeout: int = TIMEOUT):
        """Store the connection settings and create the ``httpx.Client``.

        ``baseurl``, ``key`` and ``model_id`` have surrounding whitespace
        stripped; the other arguments are stored as given.
        """
        self.baseurl = baseurl.strip()
        self.key = key.strip()
        self.model_id = model_id.strip()
        self.system_prompt = system_prompt
        self.temperature = temperature
        self.client = httpx.Client()
        self.max_concurrent = max_concurrent
        self.timeout = timeout

    def _prepare_request_data(self, prompt: str, system_prompt: str, temperature=None, top_p=0.9):
        """Build the ``(headers, data)`` pair for one request.

        ``temperature`` falls back to ``self.temperature`` when ``None``. The
        body carries the model id, a system message followed by a user
        message, ``temperature`` and ``top_p``.
        """
        if temperature is None:
            temperature = self.temperature
        headers = {"Content-Type": "application/json",
                   "Authorization": f"Bearer {self.key}"}
        data = {
            "model": self.model_id,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt}
            ],
            "temperature": temperature,
            "top_p": top_p
        }
        return headers, data

    def send(self, prompt: str, system_prompt: None | str = None) -> str:
        """Send a single prompt with the blocking client and return the reply text.

        Args:
            prompt: Content of the user message.
            system_prompt: Content of the system message; ``self.system_prompt``
                is used when this is ``None``.

        Returns:
            ``choices[0].message.content`` from the JSON response.

        Raises:
            Exception: For an HTTP error status, a request/connection error,
                or a response body that is not JSON or lacks the expected
                structure. The original exception is chained as ``__cause__``.
        """
        if system_prompt is None:
            system_prompt = self.system_prompt
        headers, data = self._prepare_request_data(prompt, system_prompt)
        # Drop one trailing slash so the URL below does not contain "//".
        self.baseurl = self.baseurl.removesuffix("/")
        try:
            response = self.client.post(
                f"{self.baseurl}/chat/completions",
                json=data,
                headers=headers,
                timeout=self.timeout,
            )
            response.raise_for_status()
            return response.json()["choices"][0]["message"]["content"]
        except httpx.HTTPStatusError as e:
            raise Exception(f"AI请求错误 (sync): {e.response.status_code} - {e.response.text}") from e
        except httpx.RequestError as e:
            raise Exception(f"AI请求连接错误 (sync): {e}") from e
        except (KeyError, IndexError, TypeError, ValueError) as e:
            # ValueError: body is not valid JSON. TypeError: e.g. "choices" is null.
            raise Exception(f"AI响应格式错误 (sync): {e}") from e

    def send_prompts(
            self,
            prompts: list[str],
            system_prompt: str | None = None,
    ) -> list[str]:
        """Send the prompts one after another and return the replies in input order.

        Progress is logged after each reply. An exception from ``send``
        propagates immediately and the remaining prompts are not sent.
        """
        result = []
        for prompt in prompts:
            result.append(self.send(prompt, system_prompt))
            translater_logger.info(f"单线程-已完成{len(result)}/{len(prompts)}")
        return result
if __name__ == '__main__':
pass

View File

@@ -1,109 +0,0 @@
from concurrent.futures import ThreadPoolExecutor
from threading import Lock
from typing import TypedDict
from docutranslate.logger import translater_logger
import httpx
class AgentArgs(TypedDict, total=False):
    """Keyword arguments matching ``Agent.__init__``; every key is optional (``total=False``)."""

    # Base URL; "/chat/completions" is appended to it for each request.
    baseurl: str
    # Credential sent in the "Authorization: Bearer ..." header.
    key: str
    # Value placed in the "model" field of the request body.
    model_id: str
    # Default system-role message used when a call does not pass one.
    system_prompt: str
    # Value placed in the "temperature" field of the request body.
    temperature: float
    # Used as ``max_workers`` of the thread pool in ``Agent.send_prompts``.
    max_concurrent: int
    # Passed as ``timeout=`` to the HTTP client on each request.
    timeout: int
# Default for Agent's ``timeout`` argument; the stored value is passed as the
# ``timeout=`` of each HTTP POST (presumably seconds, per httpx -- confirm).
TIMEOUT = 500
class PromptsCount():
    """Lock-protected counter that logs how many prompts have completed.

    Attributes:
        lock: ``threading.Lock`` guarding ``count``.
        count: Number of prompts finished so far.
        max: Number of prompts in the batch.
    """

    # The parameter keeps its original name ``max`` (it shadows the builtin only
    # inside __init__) so existing keyword callers keep working.
    def __init__(self, max: int):
        """Create a counter for a batch of ``max`` prompts."""
        self.lock = Lock()
        self.count = 0
        self.max = max

    def add(self):
        """Record one finished prompt and log the running total.

        The lock is held through a ``with`` block so it is released even when
        the logging call raises; a bare acquire()/release() pair would leave
        the lock held and block every later caller.
        """
        with self.lock:
            self.count += 1
            translater_logger.info(f"多线程-已完成:{self.count}/{self.max}")
class Agent:
    """Client that posts chat prompts to ``{baseurl}/chat/completions``, fanning batches out over threads."""

    def __init__(self, baseurl: str = "", key: str = "xx", model_id: str = "", system_prompt: str = "", temperature=0.7,
                 max_concurrent=6, timeout: int = TIMEOUT):
        """Store the connection settings and create the ``httpx.Client``.

        ``baseurl``, ``key`` and ``model_id`` have surrounding whitespace
        stripped; the other arguments are stored as given.
        """
        self.baseurl = baseurl.strip()
        self.key = key.strip()
        self.model_id = model_id.strip()
        self.system_prompt = system_prompt
        self.temperature = temperature
        self.client = httpx.Client()
        self.max_concurrent = max_concurrent
        self.timeout = timeout

    def _prepare_request_data(self, prompt: str, system_prompt: str, temperature=None, top_p=0.9):
        """Build the ``(headers, data)`` pair for one request.

        ``temperature`` falls back to ``self.temperature`` when ``None``. The
        body carries the model id, a system message followed by a user
        message, ``temperature`` and ``top_p``.
        """
        if temperature is None:
            temperature = self.temperature
        headers = {"Content-Type": "application/json",
                   "Authorization": f"Bearer {self.key}"}
        data = {
            "model": self.model_id,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt}
            ],
            "temperature": temperature,
            "top_p": top_p
        }
        return headers, data

    def send(self, prompt: str, system_prompt: None | str = None) -> str:
        """Send a single prompt with the blocking client and return the reply text.

        Args:
            prompt: Content of the user message.
            system_prompt: Content of the system message; ``self.system_prompt``
                is used when this is ``None``.

        Returns:
            ``choices[0].message.content`` from the JSON response.

        Raises:
            Exception: For an HTTP error status, a request/connection error,
                or a response body that is not JSON or lacks the expected
                structure. The original exception is chained as ``__cause__``.
        """
        if system_prompt is None:
            system_prompt = self.system_prompt
        headers, data = self._prepare_request_data(prompt, system_prompt)
        # Drop one trailing slash so the URL below does not contain "//".
        self.baseurl = self.baseurl.removesuffix("/")
        try:
            response = self.client.post(
                f"{self.baseurl}/chat/completions",
                json=data,
                headers=headers,
                timeout=self.timeout,
            )
            response.raise_for_status()
            return response.json()["choices"][0]["message"]["content"]
        except httpx.HTTPStatusError as e:
            raise Exception(f"AI请求错误 (sync): {e.response.status_code} - {e.response.text}") from e
        except httpx.RequestError as e:
            raise Exception(f"AI请求连接错误 (sync): {e}") from e
        except (KeyError, IndexError, TypeError, ValueError) as e:
            # ValueError: body is not valid JSON. TypeError: e.g. "choices" is null.
            raise Exception(f"AI响应格式错误 (sync): {e}") from e

    def _send_prompt_count(self, prompt: str, system_prompt: None | str, count: PromptsCount) -> str:
        """Send one prompt via ``send`` and, once it has returned, call ``count.add()``."""
        result = self.send(prompt, system_prompt)
        count.add()
        return result

    def send_prompts(
            self,
            prompts: list[str],
            system_prompt: str | None = None,
    ) -> list[str]:
        """Send all prompts on a thread pool and return the replies in input order.

        Up to ``self.max_concurrent`` worker threads are used. One
        ``PromptsCount`` is shared by every task so the log shows overall
        progress. The first exception raised by a task surfaces when the
        results are collected.
        """
        counter = PromptsCount(len(prompts))

        def _task(prompt: str) -> str:
            return self._send_prompt_count(prompt, system_prompt, counter)

        with ThreadPoolExecutor(max_workers=self.max_concurrent) as executor:
            # executor.map yields results in the order of ``prompts``.
            return list(executor.map(_task, prompts))
if __name__ == '__main__':
pass

View File

@@ -1,6 +1,6 @@
from typing import Unpack
from .agent_async import Agent, AgentArgs
from .agent import (Agent, AgentArgs)
class MDRefineAgent(Agent):
def __init__(self,**kwargs:Unpack[AgentArgs]):
@@ -19,7 +19,7 @@ class MDRefineAgent(Agent):
形如<ph-abc123>的占位符不要改变
code、latex和HTML保持结构
所有公式包括短公式都应该是latex公式
修复不正确的latex公式要用$正确包裹以构造合法latex表达式
修复不正确的latex公式行内公式要用$正确包裹以构造合法latex表达式
# 输出
修正后的markdown纯文本不是markdown代码块
# 示例
@@ -29,11 +29,13 @@ code、latex和HTML保持结构
你叫
输出:
你叫什么名字
## 去掉异常字词与修正公式(优先使用$包裹)
## 去掉异常字词与修正公式(行内公式使用$包裹)
输入:
一道\题@#目<ph-12asd2>:c_0+1=2\(c 0\)等于几
{c_0,c_1,c^2}是一个集合
输出:
一道题目<ph-12asd2>:$c_0+1=2$$c_0$等于几
{$c_0$,$c_1$,$c^2$}是一个集合
\no_think"""
@@ -53,7 +55,7 @@ class MDTranslateAgent(Agent):
引用的参考文献和其作者不要翻译
形如<ph-abc123>的占位符不要改变
code、latex和HTML只翻译说明文字其余保持原文
公式必须表示为合法的latex公式,被$正确包裹
公式必须表示为合法的latex公式,行内公式需被$正确包裹
# 输出
翻译后的markdown纯文本不是markdown代码块
# 示例
@@ -62,11 +64,13 @@ code、latex和HTML只翻译说明文字其余保持原文
hello<ph-aaaaaa>, what's your name?
输出:
你好<ph-aaaaaa>,你叫什么名字?
## 公式要为合法latex优先使用$包裹)
## 公式要为合法latex行内公式使用$包裹)
输入:
c_0+1=2
The equation is E=mc 2. This is famous.
{{c_0,c_1,c^2}}is a set.
输出:
$c_0+1=2$
这个方程是 $E=mc^2$。这很有名。
{{$c_0$,$c_1$,$c^2$}}是一个集合。
## 引用的参考文献要保持原文不要翻译
输入:【假设目标语言为中文】
[2] M. Castro, B. Liskov, et al. Practical byzantine fault tolerance. In OSDI,

File diff suppressed because it is too large Load Diff

View File

@@ -1,3 +1,4 @@
import asyncio
from io import BytesIO
from pathlib import Path
from typing import Literal
@@ -15,7 +16,7 @@ from docutranslate.logger import translater_logger
class FileTranslater:
def __init__(self, file_path: Path | str | None = None, chunksize: int = 3500, base_url="", key=None,
def __init__(self, file_path: Path | str | None = None, chunksize: int = 2000, base_url="", key=None,
model_id="", temperature=0.7, max_concurrent=15, docling_artifact: Path | str | None = None,
timeout=2000, tips=True):
if isinstance(file_path, str):
@@ -145,6 +146,31 @@ class FileTranslater:
translater_logger.info("翻译完成")
return self.markdown
async def refine_markdown_by_agent_async(self, refine_agent: Agent | None = None) -> str:
    """Run every markdown chunk through a refine agent and store the joined result.

    Calls ``_mask_uris_in_markdown()`` before splitting and
    ``_unmask_uris_in_markdown()`` after the replies are joined with blank
    lines. When ``refine_agent`` is ``None`` an ``MDRefineAgent`` built from
    ``self.default_agent_params()`` is used.

    Returns:
        The updated ``self.markdown``.
    """
    translater_logger.info("正在修正markdown")
    self._mask_uris_in_markdown()
    chunks = self._split_markdown_into_chunks()
    agent = MDRefineAgent(**self.default_agent_params()) if refine_agent is None else refine_agent
    refined: list[str] = await agent.send_prompts_async(chunks)
    self.markdown = "\n\n".join(refined)
    self._unmask_uris_in_markdown()
    translater_logger.info("markdown已修正")
    return self.markdown
async def translate_markdown_by_agent_async(self, translate_agent: Agent | None = None, to_lang="中文"):
    """Run every markdown chunk through a translate agent and store the joined result.

    Calls ``_mask_uris_in_markdown()`` before splitting and
    ``_unmask_uris_in_markdown()`` after the replies are joined with blank
    lines. When ``translate_agent`` is ``None`` an ``MDTranslateAgent`` is
    built from ``to_lang`` and ``self.default_agent_params()``; ``to_lang`` is
    not used when an agent is supplied.

    Returns:
        The updated ``self.markdown``.
    """
    translater_logger.info("正在翻译markdown")
    self._mask_uris_in_markdown()
    chunks = self._split_markdown_into_chunks()
    if translate_agent is None:
        agent = MDTranslateAgent(to_lang=to_lang, **self.default_agent_params())
    else:
        agent = translate_agent
    translated: list[str] = await agent.send_prompts_async(chunks)
    self.markdown = "\n\n".join(translated)
    self._unmask_uris_in_markdown()
    translater_logger.info("翻译完成")
    return self.markdown
def save_as_markdown(self, filename: str | Path | None = None, output_dir: str | Path = "./output"):
if isinstance(filename, str):
filename = Path(filename)
@@ -191,7 +217,7 @@ class FileTranslater:
def export_to_html(self, title="title") -> str:
markdowner = markdown2.Markdown(extras=['tables', 'fenced-code-blocks', 'mermaid', "code-friendly"])
# language=html
html = f"""<!DOCTYPE html>
<html lang="en">
<head>
@@ -206,6 +232,7 @@ class FileTranslater:
</style>
<script type="text/x-mathjax-config">
MathJax.Hub.Config({{
messageStyle: "none",
tex2jax: {{
inlineMath: [ ['$','$'], ["\\\\(","\\\\)"] ],
processEscapes: true
@@ -264,7 +291,32 @@ class FileTranslater:
filename = f"{file_path.stem}_{to_lang}.html"
self.save_as_html(filename=filename, output_dir=output_dir)
return self
async def translate_file_async(self, file_path: Path | str | None = None, to_lang="中文", output_dir="./output",
                               formula=True,
                               code=True, output_format: Literal["markdown", "html"] = "markdown", refine=False,
                               refine_agent: Agent | None = None, translate_agent: Agent | None = None, save=True):
    """Read a file, optionally refine it, translate it and optionally save the result.

    Args:
        file_path: File to translate; ``self.file_path`` is used when ``None``.
        to_lang: Target language, also used in the output file name.
        output_dir: Directory handed to the save method.
        formula, code: Forwarded to ``read_file``.
        output_format: ``"markdown"`` or ``"html"``; any other value saves nothing.
        refine: When true, run the refine step before translating.
        refine_agent, translate_agent: Optional agents for the two steps.
        save: When false, skip writing the output file.

    Returns:
        ``self``.

    Raises:
        AssertionError: If neither ``file_path`` nor ``self.file_path`` is set.
    """
    if file_path is None:
        # Raised explicitly instead of via ``assert`` so the check is not
        # stripped under ``python -O``; exception type and message are unchanged.
        if self.file_path is None:
            raise AssertionError("未输入文件路径")
        file_path = self.file_path
    if isinstance(file_path, str):
        file_path = Path(file_path)
    # read_file is synchronous; run it in a worker thread so the event loop stays free.
    await asyncio.to_thread(
        self.read_file,
        file_path,
        formula=formula,
        code=code
    )
    if refine:
        await self.refine_markdown_by_agent_async(refine_agent)
    await self.translate_markdown_by_agent_async(translate_agent, to_lang=to_lang)
    if save:
        if output_format == "markdown":
            filename = f"{file_path.stem}_{to_lang}.md"
            self.save_as_markdown(filename=filename, output_dir=output_dir)
        elif output_format == "html":
            filename = f"{file_path.stem}_{to_lang}.html"
            self.save_as_html(filename=filename, output_dir=output_dir)
    return self
def translate_bytes(self, name:str,file: bytes, to_lang="中文", output_dir="./output",
formula=True,
code=True, output_format: Literal["markdown", "html"] = "markdown", refine=False,
@@ -281,3 +333,26 @@ class FileTranslater:
filename = f"{name}_{to_lang}.html"
self.save_as_html(filename=filename, output_dir=output_dir)
return self
async def translate_bytes_async(self, name: str, file: bytes, to_lang="中文", output_dir="./output",
                                formula=True,
                                code=True, output_format: Literal["markdown", "html"] = "markdown", refine=False,
                                refine_agent: Agent | None = None, translate_agent: Agent | None = None, save=True):
    """Read in-memory file bytes, optionally refine, translate and optionally save.

    ``read_bytes`` is run in a worker thread through ``asyncio.to_thread``.
    The output file is named ``{name}_{to_lang}.md`` or
    ``{name}_{to_lang}.html`` depending on ``output_format``; any other format
    value saves nothing. Returns ``self``.
    """
    await asyncio.to_thread(self.read_bytes, name=name, file=file, formula=formula, code=code)
    if refine:
        await self.refine_markdown_by_agent_async(refine_agent)
    await self.translate_markdown_by_agent_async(translate_agent, to_lang=to_lang)
    if not save:
        return self
    if output_format == "markdown":
        self.save_as_markdown(filename=f"{name}_{to_lang}.md", output_dir=output_dir)
    elif output_format == "html":
        self.save_as_html(filename=f"{name}_{to_lang}.html", output_dir=output_dir)
    return self

View File

@@ -1,3 +1,4 @@
import asyncio
import os
from huggingface_hub.errors import LocalEntryNotFoundError
from docling.datamodel.base_models import InputFormat
@@ -6,8 +7,7 @@ from docling_core.types.doc import ImageRefMode
from pathlib import Path
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.document import DocumentStream
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
from docling.datamodel.settings import settings
from docutranslate.logger import translater_logger
IMAGE_RESOLUTION_SCALE = 4
@@ -22,20 +22,25 @@ def file2markdown_embed_images(file_path: Path | str|DocumentStream, formula=Fal
pipeline_options.do_formula_enrichment=True
if code:
pipeline_options.do_code_enrichment=True
pipeline_options.accelerator_options= AcceleratorOptions(
num_threads=8, device=AcceleratorDevice.AUTO
)
# pipeline_options.accelerator_options= AcceleratorOptions(
# num_threads=4, device=AcceleratorDevice.AUTO
# )
#打印时间
settings.debug.profile_pipeline_timings=True
converter = DocumentConverter(format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
})
try:
result = converter.convert(file_path).document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED)
conversion_result = converter.convert(file_path)
result = conversion_result.document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED)
except LocalEntryNotFoundError:
translater_logger.info(f"无法连接huggingface正在尝试换源")
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
result = converter.convert(file_path).document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED)
conversion_result = converter.convert(file_path)
result=conversion_result.document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED)
translater_logger.info(f"已转换为markdown")
translater_logger.info(f"pdf转换耗时: {conversion_result.timings["pipeline_total"].times}")
return result
if __name__ == '__main__':

View File

@@ -1,6 +1,6 @@
[project]
name = "docutranslate"
version = "0.2.6.post1"
version = "0.2.7"
description = "文件翻译工具"
readme = "README.md"
requires-python = ">=3.10"