提示词改为英文，网络请求现在经过代理

2025-08-18 15:20:56 +08:00
parent b612c9e67e
commit 0cd049e707
6 changed files with 69 additions and 59 deletions
--- a/docutranslate/agents/agent.py
+++ b/docutranslate/agents/agent.py
@@ -86,8 +86,10 @@ class Agent:
        self.model_id = config.model_id.strip()
        self.system_prompt = config.system_prompt or ""
        self.temperature = config.temperature
-        self.client = httpx.Client(trust_env=False, proxy=None, verify=False)
-        self.client_async = httpx.AsyncClient(trust_env=False, proxy=None, verify=False)
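+        # Use httpx defaults (trust_env=True) so proxies from HTTP_PROXY/HTTPS_PROXY/ALL_PROXY are honoured; the old trust_env=False, proxy=None clients bypassed them.
+        # NOTE(review): verify=False still disables TLS certificate verification - confirm this is intended now that requests may traverse a proxy.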
+        self.client = httpx.Client(verify=False)
+        self.client_async = httpx.AsyncClient(verify=False)
        self.max_concurrent = config.max_concurrent
        self.timeout = config.timeout
        self.thinking = config.thinking
--- a/docutranslate/agents/markdown_agent.py
+++ b/docutranslate/agents/markdown_agent.py
@@ -11,32 +11,37 @@ class MDTranslateAgent(Agent):
    def __init__(self,config:MDTranslateAgentConfig):
        super().__init__(config)
        self.system_prompt = f"""
-# 角色
-你是一个专业的机器翻译引擎
-# 工作
-翻译输入的markdown文本
-目标语言{config.to_lang}
-# 要求
-翻译要求专业准确
-不输出任何解释和注释
-不能改变形如<ph-xxxxxx>的占位符
-code、latex和HTML只翻译说明文字，其余保持原文
-所有公式无论长短必须表示为能被解析的合法latex公式，公式需被$或\\(\\)或$$正确包裹，如不正确则进行修正
-去除、修正明显异常的字符、但不能改变原意
-引用参考文献时请严格保持原文，不要翻译。参考文献格式示例如下：
-[1] Author A, Author B. "Original Title". Journal, 2023.  
-[2] 作者C. 《中文标题》. 期刊, 2022.
-# 输出
-翻译后的markdown译文纯文本（不是markdown代码块，无任何多余文字）
-# 示例
-## 目标语言为中文
-输入：
+# Role
+You are a professional machine translation engine.
+
+# Task
+Translate the input markdown text.
+Target language: {config.to_lang}
+
+# Requirements
+- The translation must be professional and accurate.
+- Do not output any explanations or annotations.
+- Do not change placeholders in the format of `<ph-xxxxxx>`.
+- For `code`, `LaTeX`, and `HTML`, only translate the descriptive text (e.g., comments, captions); keep the rest of the content in its original form.
+- All formulas, regardless of length, must be represented as valid, parsable LaTeX. They must be correctly enclosed by `$`, `\\(\\)`, or `$$`. If a formula is not formatted correctly, you must fix it.
+- Remove or correct any obviously abnormal characters, but without altering the original meaning.
+- When citing references, strictly preserve the original text; do not translate them. Examples of reference formats are as follows:
+  [1] Author A, Author B. "Original Title". Journal, 2023.
+  [2] 作者C. 《中文标题》. 期刊, 2022.
+
+# Output
+The translated markdown text as plain text (not in a markdown code block, with no extraneous text).
+
+# Example
+## Target language is Chinese
+Input:
 hello, what's your nam*@e?
 ![photo title](<ph-abcdde>)
 The equation is E=mc 2. This is famous.
 1+1=2$$
 (c_0,c_1_1,c_2^2)is a coordinate.
-输出：
+
+Output:
 你好，你叫什么名字？
 ![图像标题](<ph-abcdde>)
 这个方程是 $E=mc^2$。这很有名。
--- a/docutranslate/agents/segments_agent.py
+++ b/docutranslate/agents/segments_agent.py
@@ -19,26 +19,26 @@ class SegmentsTranslateAgent(Agent):
    def __init__(self, config: SegmentsTranslateAgentConfig):
        super().__init__(config)
        self.system_prompt = f"""
-# 角色
-你是一个专业的机器翻译引擎
-# 工作
-你接收一个待翻译片段的序列，以json格式表示。其中键是待片段的编号，值是待翻译片段。
-你需要将待翻译片段翻译成目标语言。
-目标语言:{config.to_lang}
-# 要求
-翻译要求专业准确
-不输出任何解释和注释
-翻译后的片段应该与源格式尽量相同
-如果待翻译片段已经是目标语言，则保持原样
-# 输出
-翻译后的片段序列，以json文本表示（注意不是代码块）。其中键是片段编号，值是翻译后的片段。
-返回的json文本必须能被json.loads转换为形如{{"片段编号":"译文"}}的字典。
-# 示例
-## 输入
+Role
+You are a professional machine translation engine.
+Task
+You will receive a sequence of segments to be translated, represented in JSON format. The keys are the segment IDs, and the values are the segments for translation.
+You need to translate these segments into the target language.
+Target language: {config.to_lang}
+Requirements
+The translation must be professional and accurate.
+Do not output any explanations or annotations.
+The format of the translated segments should be as close as possible to the source format.
+If a segment is already in the target language, keep it as is.
+Output
+The translated sequence of segments, represented as JSON text (note: not a code block). The keys are the segment IDs, and the values are the translated segments.
+The returned JSON text must be parsable by json.loads into a dictionary of the form {r'{"segment_id": "translation"}'}.
+Example
+Input
 {r'{"0":"hello","1":"apple","2":true,"3":"false"}'}
-## 输出
+Output
 {r'{"0":"你好","1":"苹果","2":true,"3":"错误"}'}
-警告：绝不要将整个JSON对象用引号包裹成一个字符串。
+Warning: Never wrap the entire JSON object in quotes to make it a single string. Never wrap the JSON text in ```.
 """
        if config.custom_prompt:
            self.system_prompt += "\n# 重要规则或背景【非常重要】\n" + config.custom_prompt + '\n'
--- a/docutranslate/agents/txt_agent.py
+++ b/docutranslate/agents/txt_agent.py
@@ -13,17 +13,20 @@ class TXTTranslateAgent(Agent):
    def __init__(self, config: TXTTranslateAgentConfig):
        super().__init__(config)
        self.system_prompt = f"""
-# 角色
-你是一个专业的机器翻译引擎
-# 工作
-翻译输入的txt文本
-目标语言{config.to_lang}
-# 要求
-翻译要求专业准确
-不输出任何解释和注释
-不能改变形如<ph-xxxxxx>的占位符
-# 输出
-翻译后的txt译文纯文本
+# Role
+You are a professional machine translation engine.
+
+# Task
+Translate the input txt text.
+Target language: {config.to_lang}
+
+# Requirements
+- The translation must be professional and accurate.
+- Do not output any explanations or annotations.
+- Do not change placeholders in the format of `<ph-xxxxxx>`.
+
+# Output
+The translated txt text as plain text.
 """
        if config.custom_prompt:
            self.system_prompt += "\n# 重要规则或背景【非常重要】\n" + config.custom_prompt + '\n'
--- a/docutranslate/converter/x2md/converter_mineru.py
+++ b/docutranslate/converter/x2md/converter_mineru.py
@@ -2,7 +2,6 @@ import asyncio
 import time
 import zipfile
 from dataclasses import dataclass
-from logging import Logger
 from typing import Hashable

 import httpx
@@ -10,7 +9,6 @@ import httpx
 from docutranslate.converter.x2md.base import X2MarkdownConverter, X2MarkdownConverterConfig
 from docutranslate.ir.document import Document
 from docutranslate.ir.markdown_document import MarkdownDocument
-from docutranslate.logger import global_logger
 from docutranslate.utils.markdown_utils import embed_inline_image_from_zip

 URL = 'https://mineru.net/api/v4/file-urls/batch'
@@ -21,7 +19,7 @@ class ConverterMineruConfig(X2MarkdownConverterConfig):
    mineru_token: str
    formula_ocr: bool = True

-    def gethash(self) ->Hashable:
+    def gethash(self) -> Hashable:
        return self.formula_ocr


@@ -32,8 +30,10 @@ timeout = httpx.Timeout(
    pool=1.0  # 从连接池获取连接的超时时间
 )

-client = httpx.Client(trust_env=False, timeout=timeout, proxy=None, verify=False)
-client_async = httpx.AsyncClient(trust_env=False, timeout=timeout, proxy=None, verify=False)
+# Build the shared clients with httpx defaults (trust_env=True) so proxy settings from the environment are honoured.
+# The previous clients passed trust_env=False, proxy=None, which bypassed any configured proxy.
+client = httpx.Client(timeout=timeout, verify=False)
+client_async = httpx.AsyncClient(timeout=timeout, verify=False)


 class ConverterMineru(X2MarkdownConverter):
--- a/docutranslate/exporter/docx/docx2html_exporter.py
+++ b/docutranslate/exporter/docx/docx2html_exporter.py
@@ -4,7 +4,7 @@ from io import BytesIO
 import mammoth

 from docutranslate.exporter.base import ExporterConfig
-from docutranslate.exporter.xlsx.base import XlsxExporter
+from docutranslate.exporter.docx.base import DocxExporter
 from docutranslate.ir.document import Document


@@ -13,7 +13,7 @@ class Docx2HTMLExporterConfig(ExporterConfig):
    cdn: bool = True


-class Docx2HTMLExporter(XlsxExporter):
+class Docx2HTMLExporter(DocxExporter):
    def __init__(self, config: Docx2HTMLExporterConfig = None):
        config = config or Docx2HTMLExporterConfig()
        super().__init__(config=config)