diff --git a/.idea/workspace.xml b/.idea/workspace.xml
index 3dc4da5..46036cf 100644
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@@ -6,7 +6,11 @@
+
+
+
+
@@ -32,38 +36,40 @@
- {
- "keyToString": {
- "DefaultHtmlFileTemplate": "HTML File",
- "JavaScript 调试.output.html (1).executor": "Run",
- "JavaScript 调试.output.html.executor": "Run",
- "JavaScript 调试.regex_中文.html.executor": "Run",
- "JavaScript 调试.test2_英文.html.executor": "Run",
- "ModuleVcsDetector.initialDetectionPerformed": "true",
- "Python 测试.Python 测试 (markdown_mask.py 内).executor": "Run",
- "Python 测试.markdown_mask.Test.test_basic_link_masking 的 Python 测试.executor": "Run",
- "Python.PDFtranslater (1).executor": "Run",
- "Python.PDFtranslater (2).executor": "Run",
- "Python.agent_utils.executor": "Run",
- "Python.convert.executor": "Run",
- "Python.markdown_splitter.executor": "Run",
- "Python.markdown_utils.executor": "Run",
- "Python.test.executor": "Run",
- "Python.test1.executor": "Run",
- "Python.translater.executor": "Debug",
- "RunOnceActivity.ShowReadmeOnStart": "true",
- "RunOnceActivity.git.unshallow": "true",
- "git-widget-placeholder": "main",
- "last_opened_file_path": "C:/Users/jxgm/Desktop/FileTranslate/tests/resource",
- "node.js.detected.package.eslint": "true",
- "node.js.detected.package.tslint": "true",
- "node.js.selected.package.eslint": "(autodetect)",
- "node.js.selected.package.tslint": "(autodetect)",
- "nodejs_package_manager_path": "npm",
- "settings.editor.selected.configurable": "Errors",
- "vue.rearranger.settings.migration": "true"
+
+}]]>
@@ -76,7 +82,7 @@
-
+
@@ -98,10 +104,7 @@
-
-
-
-
+
@@ -176,6 +179,29 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
@@ -275,11 +301,11 @@
-
+
+
-
-
+
@@ -308,7 +334,7 @@
-
+
@@ -317,8 +343,8 @@
-
-
+
+
@@ -327,6 +353,7 @@
+
\ No newline at end of file
diff --git a/README.md b/README.md
index 9e130bf..1ea9d06 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,7 @@
# 前置条件(获取大模型平台的baseurl、key、model-id)
由于需要使用大语言模型进行markdown调整与翻译,所以需要预先获取模型的baseurl、key、model-id
常见的大模型平台baseurl与api获取方式可见[常用ai平台](#常用ai平台)
-
+> 比较推荐的模型有阿里云的qwen-plus、智谱的glm-z1-flash等。免费的智谱glm-4-flash可以用但效果欠佳
# 使用方式
## 翻译文件
diff --git a/docutranslate/translater.py b/docutranslate/translater.py
index bcc420f..0d2b333 100644
--- a/docutranslate/translater.py
+++ b/docutranslate/translater.py
@@ -84,9 +84,9 @@ Blockchain's origination was Bitcoin, the most successful of the digital currenc
你是一个翻译markdown文本的专家。
# 工作
将输入的markdown文本翻译成{0}。
-尽量忠实于原文。
+尽量忠实于原文(如空行)。
形如的占位符不要改变。
-latex不要改变。
+code和formula保持原文。
# 输出
翻译后的markdown纯文本
# 示例
diff --git a/docutranslate/utils/agent_utils.py b/docutranslate/utils/agent_utils.py
index 8edeed0..0b0efaa 100644
--- a/docutranslate/utils/agent_utils.py
+++ b/docutranslate/utils/agent_utils.py
@@ -24,7 +24,7 @@ class Agent:
data = {
"model": self.model_id,
"messages": [
- {"role": "system", "content": "回复必须严格以BEGIN>>开头,包括简短回答。\n示例:BEGIN>>这是示例回答\n"+system_prompt},
+ {"role": "system", "content": "回复必须以【SSS】开头(该规则适用于之后的所有例子),必须遵守。示例:【SSS】这是示例回答\n"+system_prompt},
{"role": "user", "content": prompt}
],
"temperature": temperature,
@@ -32,19 +32,13 @@ class Agent:
}
return headers, data
- # def send_prompt(self,prompt,system_prompt=None,timeout=TIMEOUT):
- # if system_prompt is None:
- # system_prompt=self.system_prompt
- # headers,data=self._prepare_request_data(prompt,system_prompt)
- # response=self.client.post(f"{self.baseurl}/chat/completions",json=data,headers=headers,timeout=timeout)
- # response.raise_for_status()
- # return response.json()["choices"][0]["message"]["content"].lstrip()
-
async def send_async(self, prompt: str, system_prompt: None | str = None, timeout: int = TIMEOUT) -> str:
if system_prompt is None:
system_prompt = self.system_prompt
"""Sends a single prompt asynchronously."""
headers, data = self._prepare_request_data(prompt, system_prompt)
+ if self.baseurl.endswith("/"):
+ self.baseurl=self.baseurl[:-1]
try:
response = await self.client_async.post(
f"{self.baseurl}/chat/completions",
@@ -54,10 +48,10 @@ class Agent:
)
response.raise_for_status()
result=response.json()["choices"][0]["message"]["content"]
- pattern=r"BEGIN>>(.*)"
+ pattern=r".*【SSS】(.*)"
match= re.search(pattern,result, re.DOTALL)
if match is None:
- print("检测开头`BEGIN>>`失败")
+ print("检测开头`【SSS】`失败")
else:
result=match.group(1)
return result
diff --git a/docutranslate/utils/markdown_utils.py b/docutranslate/utils/markdown_utils.py
index 96174e3..0098356 100644
--- a/docutranslate/utils/markdown_utils.py
+++ b/docutranslate/utils/markdown_utils.py
@@ -32,16 +32,28 @@ class MaskDict:
def __contains__(self, item):
with self._lock:
return item in self._dict
+
+# def uris2placeholder(markdown:str, mask_dict:MaskDict):
+ ##替换整个uri
+# def uri2placeholder(match: re.Match):
+# id = mask_dict.create_id()
+# mask_dict.set(id, match.group())
+# return f""
+#
+# uri_pattern = r'!?\[.*?\]\(.*?\)'
+# markdown = re.sub(uri_pattern, uri2placeholder, markdown)
+# return markdown
+
def uris2placeholder(markdown:str, mask_dict:MaskDict):
+ ##只替换uri里的链接部分,保留标题
def uri2placeholder(match: re.Match):
id = mask_dict.create_id()
- mask_dict.set(id, match.group())
- return f""
+ mask_dict.set(id, match.group(2))
+ return f"{match.group(1)}()"
- uri_pattern = r'!?\[.*?\]\(.*?\)'
+ uri_pattern = r'(!?\[.*?\])\((.*?)\)'
markdown = re.sub(uri_pattern, uri2placeholder, markdown)
return markdown
-
def placeholder2_uris(markdown:str, mask_dict:MaskDict):
def placeholder2uri(match:re.Match):
id=match.group(1)
diff --git a/pyproject.toml b/pyproject.toml
index 488f1c8..dddc98b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "docutranslate"
-version = "0.0.4"
+version = "0.0.5"
description = "能翻译pdf和markdown的软件"
readme = "README.md"
requires-python = ">=3.10"