1.改进翻译Prompt 2.改进urimask

This commit is contained in:
xunbu
2025-05-08 21:52:43 +08:00
parent fba57fd9b5
commit 621cdcb8ea
6 changed files with 94 additions and 61 deletions

111
.idea/workspace.xml generated
View File

@@ -6,7 +6,11 @@
<component name="ChangeListManager">
<list default="true" id="6b18b44a-df57-4212-a857-9e291ebe5dd2" name="更改" comment="">
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/README.md" beforeDir="false" afterPath="$PROJECT_DIR$/README.md" afterDir="false" />
<change beforePath="$PROJECT_DIR$/docutranslate/translater.py" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/translater.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/docutranslate/utils/agent_utils.py" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/utils/agent_utils.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/docutranslate/utils/markdown_utils.py" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/utils/markdown_utils.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/pyproject.toml" beforeDir="false" afterPath="$PROJECT_DIR$/pyproject.toml" afterDir="false" />
</list>
<option name="SHOW_DIALOG" value="false" />
<option name="HIGHLIGHT_CONFLICTS" value="true" />
@@ -32,38 +36,40 @@
<option name="hideEmptyMiddlePackages" value="true" />
<option name="showLibraryContents" value="true" />
</component>
<component name="PropertiesComponent">{
&quot;keyToString&quot;: {
&quot;DefaultHtmlFileTemplate&quot;: &quot;HTML File&quot;,
&quot;JavaScript 调试.output.html (1).executor&quot;: &quot;Run&quot;,
&quot;JavaScript 调试.output.html.executor&quot;: &quot;Run&quot;,
&quot;JavaScript 调试.regex_中文.html.executor&quot;: &quot;Run&quot;,
&quot;JavaScript 调试.test2_英文.html.executor&quot;: &quot;Run&quot;,
&quot;ModuleVcsDetector.initialDetectionPerformed&quot;: &quot;true&quot;,
&quot;Python 测试.Python 测试 (markdown_mask.py 内).executor&quot;: &quot;Run&quot;,
&quot;Python 测试.markdown_mask.Test.test_basic_link_masking 的 Python 测试.executor&quot;: &quot;Run&quot;,
&quot;Python.PDFtranslater (1).executor&quot;: &quot;Run&quot;,
&quot;Python.PDFtranslater (2).executor&quot;: &quot;Run&quot;,
&quot;Python.agent_utils.executor&quot;: &quot;Run&quot;,
&quot;Python.convert.executor&quot;: &quot;Run&quot;,
&quot;Python.markdown_splitter.executor&quot;: &quot;Run&quot;,
&quot;Python.markdown_utils.executor&quot;: &quot;Run&quot;,
&quot;Python.test.executor&quot;: &quot;Run&quot;,
&quot;Python.test1.executor&quot;: &quot;Run&quot;,
&quot;Python.translater.executor&quot;: &quot;Debug&quot;,
&quot;RunOnceActivity.ShowReadmeOnStart&quot;: &quot;true&quot;,
&quot;RunOnceActivity.git.unshallow&quot;: &quot;true&quot;,
&quot;git-widget-placeholder&quot;: &quot;main&quot;,
&quot;last_opened_file_path&quot;: &quot;C:/Users/jxgm/Desktop/FileTranslate/tests/resource&quot;,
&quot;node.js.detected.package.eslint&quot;: &quot;true&quot;,
&quot;node.js.detected.package.tslint&quot;: &quot;true&quot;,
&quot;node.js.selected.package.eslint&quot;: &quot;(autodetect)&quot;,
&quot;node.js.selected.package.tslint&quot;: &quot;(autodetect)&quot;,
&quot;nodejs_package_manager_path&quot;: &quot;npm&quot;,
&quot;settings.editor.selected.configurable&quot;: &quot;Errors&quot;,
&quot;vue.rearranger.settings.migration&quot;: &quot;true&quot;
<component name="PropertiesComponent"><![CDATA[{
"keyToString": {
"DefaultHtmlFileTemplate": "HTML File",
"JavaScript 调试.output.html (1).executor": "Run",
"JavaScript 调试.output.html.executor": "Run",
"JavaScript 调试.regex_中文.html.executor": "Run",
"JavaScript 调试.test2_英文.html.executor": "Run",
"JavaScript 调试.互联网认证授权机制_英文.html.executor": "Run",
"ModuleVcsDetector.initialDetectionPerformed": "true",
"Python 测试.Python 测试 (markdown_mask.py 内).executor": "Run",
"Python 测试.markdown_mask.Test.test_basic_link_masking 的 Python 测试.executor": "Run",
"Python.PDFtranslater (1).executor": "Run",
"Python.PDFtranslater (2).executor": "Run",
"Python.agent_utils.executor": "Run",
"Python.convert.executor": "Run",
"Python.markdown_splitter.executor": "Run",
"Python.markdown_utils.executor": "Run",
"Python.test.executor": "Run",
"Python.test1.executor": "Run",
"Python.test2.executor": "Run",
"Python.translater.executor": "Debug",
"RunOnceActivity.ShowReadmeOnStart": "true",
"RunOnceActivity.git.unshallow": "true",
"git-widget-placeholder": "main",
"last_opened_file_path": "C:/Users/jxgm/Desktop/FileTranslate/tests/resource",
"node.js.detected.package.eslint": "true",
"node.js.detected.package.tslint": "true",
"node.js.selected.package.eslint": "(autodetect)",
"node.js.selected.package.tslint": "(autodetect)",
"nodejs_package_manager_path": "npm",
"settings.editor.selected.configurable": "Errors",
"vue.rearranger.settings.migration": "true"
}
}</component>
}]]></component>
<component name="RecentsManager">
<key name="CopyFile.RECENT_KEYS">
<recent name="C:\Users\jxgm\Desktop\FileTranslate\tests\resource" />
@@ -76,7 +82,7 @@
<recent name="C:\Users\jxgm\Desktop\PDFtranslate\pdf" />
</key>
</component>
<component name="RunManager" selected="Python.test1">
<component name="RunManager" selected="JavaScript 调试.regex_中文.html">
<configuration default="true" type="DjangoTestsConfigurationType">
<module name="filetranslate" />
<option name="ENV_FILES" value="" />
@@ -98,10 +104,7 @@
<option name="OPTIONS" value="" />
<method v="2" />
</configuration>
<configuration name="output.html (1)" type="JavascriptDebugType" temporary="true" nameIsGenerated="true" uri="http://localhost:63342/filetranslate/tests/output/output.html" useBuiltInWebServerPort="true">
<method v="2" />
</configuration>
<configuration name="output.html" type="JavascriptDebugType" temporary="true" nameIsGenerated="true" uri="http://localhost:63342/filetranslate/filetranslate_packages/output/output.html" useBuiltInWebServerPort="true">
<configuration name="互联网认证授权机制_英文.html" type="JavascriptDebugType" temporary="true" nameIsGenerated="true" uri="http://localhost:63342/filetranslate/tests/output/互联网认证授权机制_英文.html" useBuiltInWebServerPort="true">
<method v="2" />
</configuration>
<configuration name="regex_中文.html" type="JavascriptDebugType" temporary="true" nameIsGenerated="true" uri="http://localhost:63342/filetranslate/tests/output/regex_中文.html" useBuiltInWebServerPort="true">
@@ -176,6 +179,29 @@
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="test2" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="FileTranslate" />
<option name="ENV_FILES" value="" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/tests" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/tests/test2.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration default="true" type="Python.FlaskServer">
<module name="filetranslate" />
<option name="ENV_FILES" value="" />
@@ -275,11 +301,11 @@
</configuration>
<recent_temporary>
<list>
<item itemvalue="Python.test1" />
<item itemvalue="JavaScript 调试.regex_中文.html" />
<item itemvalue="Python.test1" />
<item itemvalue="Python.test2" />
<item itemvalue="Python.agent_utils" />
<item itemvalue="JavaScript 调试.output.html (1)" />
<item itemvalue="JavaScript 调试.output.html" />
<item itemvalue="JavaScript 调试.互联网认证授权机制_英文.html" />
</list>
</recent_temporary>
</component>
@@ -308,7 +334,7 @@
<workItem from="1746697418777" duration="120000" />
<workItem from="1746699070198" duration="51000" />
<workItem from="1746700998566" duration="4012000" />
<workItem from="1746705145591" duration="2321000" />
<workItem from="1746705145591" duration="7086000" />
</task>
<servers />
</component>
@@ -317,8 +343,8 @@
</component>
<component name="com.intellij.coverage.CoverageDataManagerImpl">
<SUITE FILE_PATH="coverage/filetranslate$test.coverage" NAME="test 覆盖结果" MODIFIED="1746677277745" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
<SUITE FILE_PATH="coverage/filetranslate$agent_utils.coverage" NAME="agent_utils 覆盖结果" MODIFIED="1746706572495" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/utils" />
<SUITE FILE_PATH="coverage/filetranslate$test1.coverage" NAME="test1 覆盖结果" MODIFIED="1746707472556" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
<SUITE FILE_PATH="coverage/filetranslate$agent_utils.coverage" NAME="agent_utils 覆盖结果" MODIFIED="1746708534311" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/utils" />
<SUITE FILE_PATH="coverage/filetranslate$test1.coverage" NAME="test1 覆盖结果" MODIFIED="1746712139296" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
<SUITE FILE_PATH="coverage/PDFtranslate$PDFtranslater__1_.coverage" NAME="PDFtranslater (1) 覆盖结果" MODIFIED="1746633258205" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages" />
<SUITE FILE_PATH="coverage/PDFtranslate$convert.coverage" NAME="convert 覆盖结果" MODIFIED="1746596984213" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages/utils" />
<SUITE FILE_PATH="coverage/PDFtranslate$agent_utils.coverage" NAME="agent_utils 覆盖结果" MODIFIED="1746617703678" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages/utils" />
@@ -327,6 +353,7 @@
<SUITE FILE_PATH="coverage/PDFtranslate$markdown_utils.coverage" NAME="markdown_utils 覆盖结果" MODIFIED="1746598797872" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages/utils" />
<SUITE FILE_PATH="coverage/PDFtranslate$test.coverage" NAME="test 覆盖结果" MODIFIED="1746629433597" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
<SUITE FILE_PATH="coverage/filetranslate$PDFtranslater__2_.coverage" NAME="PDFtranslater (2) 覆盖结果" MODIFIED="1746679546680" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/filetranslate_packages" />
<SUITE FILE_PATH="coverage/filetranslate$test2.coverage" NAME="test2 覆盖结果" MODIFIED="1746710994589" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
<SUITE FILE_PATH="coverage/PDFtranslate$.coverage" NAME=" 覆盖结果" MODIFIED="1746588350286" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/decorator" />
</component>
</project>

View File

@@ -15,7 +15,7 @@
# 前置条件获取大模型平台的baseurl、key、model-id
由于需要使用大语言模型进行markdown调整与翻译,所以需要预先获取模型的baseurl、key、model-id。
常见的大模型平台baseurl与api获取方式可见[常用ai平台](#常用ai平台)
> 比较推荐的模型有阿里云的qwen-plus、智谱的glm-z1-flash等。免费的智谱glm-4-flash可以用,但效果欠佳。
# 使用方式
## 翻译文件

View File

@@ -84,9 +84,9 @@ Blockchain's origination was Bitcoin, the most successful of the digital currenc
你是一个翻译markdown文本的专家。
# 工作
将输入的markdown文本翻译成{0}
尽量忠实于原文。
尽量忠实于原文(如空行)
形如<ph-abc123>的占位符不要改变。
latex不要改变
code和formula保持原文
# 输出
翻译后的markdown纯文本
# 示例

View File

@@ -24,7 +24,7 @@ class Agent:
data = {
"model": self.model_id,
"messages": [
{"role": "system", "content": "回复必须严格以BEGIN>>开头,包括简短回答。\n示例:BEGIN>>这是示例回答\n"+system_prompt},
{"role": "system", "content": "回复必须以【SSS】开头该规则适用于之后的所有例子必须遵守。示例【SSS】这是示例回答\n"+system_prompt},
{"role": "user", "content": prompt}
],
"temperature": temperature,
@@ -32,19 +32,13 @@ class Agent:
}
return headers, data
# def send_prompt(self,prompt,system_prompt=None,timeout=TIMEOUT):
# if system_prompt is None:
# system_prompt=self.system_prompt
# headers,data=self._prepare_request_data(prompt,system_prompt)
# response=self.client.post(f"{self.baseurl}/chat/completions",json=data,headers=headers,timeout=timeout)
# response.raise_for_status()
# return response.json()["choices"][0]["message"]["content"].lstrip()
async def send_async(self, prompt: str, system_prompt: None | str = None, timeout: int = TIMEOUT) -> str:
if system_prompt is None:
system_prompt = self.system_prompt
"""Sends a single prompt asynchronously."""
headers, data = self._prepare_request_data(prompt, system_prompt)
if self.baseurl.endswith("/"):
self.baseurl=self.baseurl[:-1]
try:
response = await self.client_async.post(
f"{self.baseurl}/chat/completions",
@@ -54,10 +48,10 @@ class Agent:
)
response.raise_for_status()
result=response.json()["choices"][0]["message"]["content"]
pattern=r"BEGIN>>(.*)"
pattern=r".*【SSS】(.*)"
match= re.search(pattern,result, re.DOTALL)
if match is None:
print("检测开头`BEGIN>>`失败")
print("检测开头`【SSS】`失败")
else:
result=match.group(1)
return result

View File

@@ -32,16 +32,28 @@ class MaskDict:
def __contains__(self, item):
with self._lock:
return item in self._dict
# def uris2placeholder(markdown:str, mask_dict:MaskDict):
##替换整个uri
# def uri2placeholder(match: re.Match):
# id = mask_dict.create_id()
# mask_dict.set(id, match.group())
# return f"<ph-{id}>"
#
# uri_pattern = r'!?\[.*?\]\(.*?\)'
# markdown = re.sub(uri_pattern, uri2placeholder, markdown)
# return markdown
def uris2placeholder(markdown:str, mask_dict:MaskDict):
##只替换uri里的链接部分保留标题
def uri2placeholder(match: re.Match):
id = mask_dict.create_id()
mask_dict.set(id, match.group())
return f"<ph-{id}>"
mask_dict.set(id, match.group(2))
return f"{match.group(1)}(<ph-{id}>)"
uri_pattern = r'!?\[.*?\]\(.*?\)'
uri_pattern = r'(!?\[.*?\])\((.*?)\)'
markdown = re.sub(uri_pattern, uri2placeholder, markdown)
return markdown
def placeholder2_uris(markdown:str, mask_dict:MaskDict):
def placeholder2uri(match:re.Match):
id=match.group(1)

View File

@@ -1,6 +1,6 @@
[project]
name = "docutranslate"
version = "0.0.4"
version = "0.0.5"
description = "能翻译pdf和markdown的软件"
readme = "README.md"
requires-python = ">=3.10"