diff --git a/.idea/workspace.xml b/.idea/workspace.xml index 3dc4da5..46036cf 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -6,7 +6,11 @@ + + + + - { - "keyToString": { - "DefaultHtmlFileTemplate": "HTML File", - "JavaScript 调试.output.html (1).executor": "Run", - "JavaScript 调试.output.html.executor": "Run", - "JavaScript 调试.regex_中文.html.executor": "Run", - "JavaScript 调试.test2_英文.html.executor": "Run", - "ModuleVcsDetector.initialDetectionPerformed": "true", - "Python 测试.Python 测试 (markdown_mask.py 内).executor": "Run", - "Python 测试.markdown_mask.Test.test_basic_link_masking 的 Python 测试.executor": "Run", - "Python.PDFtranslater (1).executor": "Run", - "Python.PDFtranslater (2).executor": "Run", - "Python.agent_utils.executor": "Run", - "Python.convert.executor": "Run", - "Python.markdown_splitter.executor": "Run", - "Python.markdown_utils.executor": "Run", - "Python.test.executor": "Run", - "Python.test1.executor": "Run", - "Python.translater.executor": "Debug", - "RunOnceActivity.ShowReadmeOnStart": "true", - "RunOnceActivity.git.unshallow": "true", - "git-widget-placeholder": "main", - "last_opened_file_path": "C:/Users/jxgm/Desktop/FileTranslate/tests/resource", - "node.js.detected.package.eslint": "true", - "node.js.detected.package.tslint": "true", - "node.js.selected.package.eslint": "(autodetect)", - "node.js.selected.package.tslint": "(autodetect)", - "nodejs_package_manager_path": "npm", - "settings.editor.selected.configurable": "Errors", - "vue.rearranger.settings.migration": "true" + +}]]> @@ -76,7 +82,7 @@ - + - - - - + @@ -176,6 +179,29 @@ + + + - + + - - + @@ -308,7 +334,7 @@ - + @@ -317,8 +343,8 @@ - - + + @@ -327,6 +353,7 @@ + \ No newline at end of file diff --git a/README.md b/README.md index 9e130bf..1ea9d06 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ # 前置条件(获取大模型平台的baseurl、key、model-id) 由于需要使用大语言模型进行markdown调整与翻译,所以需要预先获取模型的baseurl、key、model-id 常见的大模型平台baseurl与api获取方式可见[常用ai平台](#常用ai平台) - +> 比较推荐的模型有阿里云的qwen-plus、智谱的glm-z1-flash等。免费的智谱glm-4-flash可以用但效果欠佳 # 使用方式 ## 翻译文件 diff --git a/docutranslate/translater.py b/docutranslate/translater.py index bcc420f..0d2b333 100644 --- a/docutranslate/translater.py +++ b/docutranslate/translater.py @@ -84,9 +84,9 @@ Blockchain's origination was Bitcoin, the most successful of the digital currenc 你是一个翻译markdown文本的专家。 # 工作 将输入的markdown文本翻译成{0}。 -尽量忠实于原文。 +尽量忠实于原文(如空行)。 形如的占位符不要改变。 -latex不要改变。 +code和formula保持原文。 # 输出 翻译后的markdown纯文本 # 示例 diff --git a/docutranslate/utils/agent_utils.py b/docutranslate/utils/agent_utils.py index 8edeed0..0b0efaa 100644 --- a/docutranslate/utils/agent_utils.py +++ b/docutranslate/utils/agent_utils.py @@ -24,7 +24,7 @@ class Agent: data = { "model": self.model_id, "messages": [ - {"role": "system", "content": "回复必须严格以BEGIN>>开头,包括简短回答。\n示例:BEGIN>>这是示例回答\n"+system_prompt}, + {"role": "system", "content": "回复必须以【SSS】开头(该规则适用于之后的所有例子),必须遵守。示例:【SSS】这是示例回答\n"+system_prompt}, {"role": "user", "content": prompt} ], "temperature": temperature, @@ -32,19 +32,13 @@ class Agent: } return headers, data - # def send_prompt(self,prompt,system_prompt=None,timeout=TIMEOUT): - # if system_prompt is None: - # system_prompt=self.system_prompt - # headers,data=self._prepare_request_data(prompt,system_prompt) - # response=self.client.post(f"{self.baseurl}/chat/completions",json=data,headers=headers,timeout=timeout) - # response.raise_for_status() - # return response.json()["choices"][0]["message"]["content"].lstrip() - async def send_async(self, prompt: str, system_prompt: None | str = None, timeout: int = TIMEOUT) -> str: if system_prompt is None: system_prompt = self.system_prompt """Sends a single prompt asynchronously.""" headers, data = self._prepare_request_data(prompt, system_prompt) + if self.baseurl.endswith("/"): + self.baseurl=self.baseurl[:-1] try: response = await self.client_async.post( f"{self.baseurl}/chat/completions", @@ -54,10 +48,10 @@ class Agent: ) response.raise_for_status() result=response.json()["choices"][0]["message"]["content"] - pattern=r"BEGIN>>(.*)" + pattern=r".*【SSS】(.*)" match= re.search(pattern,result, re.DOTALL) if match is None: - print("检测开头`BEGIN>>`失败") + print("检测开头`【SSS】`失败") else: result=match.group(1) return result diff --git a/docutranslate/utils/markdown_utils.py b/docutranslate/utils/markdown_utils.py index 96174e3..0098356 100644 --- a/docutranslate/utils/markdown_utils.py +++ b/docutranslate/utils/markdown_utils.py @@ -32,16 +32,28 @@ class MaskDict: def __contains__(self, item): with self._lock: return item in self._dict + +# def uris2placeholder(markdown:str, mask_dict:MaskDict): + ##替换整个uri +# def uri2placeholder(match: re.Match): +# id = mask_dict.create_id() +# mask_dict.set(id, match.group()) +# return f"" +# +# uri_pattern = r'!?\[.*?\]\(.*?\)' +# markdown = re.sub(uri_pattern, uri2placeholder, markdown) +# return markdown + def uris2placeholder(markdown:str, mask_dict:MaskDict): + ##只替换uri里的链接部分,保留标题 def uri2placeholder(match: re.Match): id = mask_dict.create_id() - mask_dict.set(id, match.group()) - return f"" + mask_dict.set(id, match.group(2)) + return f"{match.group(1)}()" - uri_pattern = r'!?\[.*?\]\(.*?\)' + uri_pattern = r'(!?\[.*?\])\((.*?)\)' markdown = re.sub(uri_pattern, uri2placeholder, markdown) return markdown - def placeholder2_uris(markdown:str, mask_dict:MaskDict): def placeholder2uri(match:re.Match): id=match.group(1) diff --git a/pyproject.toml b/pyproject.toml index 488f1c8..dddc98b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "docutranslate" -version = "0.0.4" +version = "0.0.5" description = "能翻译pdf和markdown的软件" readme = "README.md" requires-python = ">=3.10"