优化segments_agent提示词

2025-10-16 20:50:18 +08:00
parent 14afb0eb6d
commit 324ad77a2e
2 changed files with 71 additions and 54 deletions
--- a/docutranslate/agents/agent.py
+++ b/docutranslate/agents/agent.py
@@ -292,7 +292,7 @@ class Agent:
            response.raise_for_status()
            # print(f"【测试】resp:\n{response.json()}")
            result = response.json()["choices"][0]["message"]["content"]
-
+            # print(f"【测试】\nprompt:\n{prompt}\nresp:\n{result}")
            # 获取token使用情况
            response_data = response.json()
            input_tokens, cached_tokens, output_tokens, reasoning_tokens = (
--- a/docutranslate/agents/segments_agent.py
+++ b/docutranslate/agents/segments_agent.py
@@ -3,6 +3,7 @@
 import asyncio
 import json
 import re
 from dataclasses import dataclass
 from json import JSONDecodeError
 from logging import Logger
@@ -15,6 +16,62 @@ from docutranslate.glossary.glossary import Glossary
 from docutranslate.utils.json_utils import segments2json_chunks, fix_json_string
 def generate_prompt(json_segments: str, to_lang: str):
    return f"""
 You will receive a sequence of original text segments to be translated, represented in JSON format. The keys are segment IDs, and the values are the text content to be translated.    
 Here is the input:
 <input>
 ```json
 {json_segments}
 ```
 </input>
 For each Key-Value Pair in the JSON, translate the contents of the value into {to_lang}, Write the translation back into the value for that JSON.
 > (Very important) The original text segments and translated segments must strictly correspond one-to-one. It is strictly forbidden for the IDs of the translated segments to differ from those of the original segments.
 > The segment IDs in the output must exactly match those in the input. And all segment IDs in input must appear in the output.
 Here is an example of the expected format:
 <example>
 Input:
 ```json
 {{
 3:source,
 4:source,
 }}
 ```
 Output(target language: {to_lang}):
 ```json
 {{
 3:translation,
 4:translation,
 }}
 ```
 </example>
 Please return the translated JSON directly without including any additional information and preserve special tags or untranslatable elements (such as code, brand names, technical terms) as they are.
 """
 def get_original_segments(prompt:str):
    match = re.search(r'<input>(.*)</input>', prompt, re.DOTALL)
    if match:
        return match.group(1)
    else:
        raise ValueError("无法从prompt中提取初始文本")
 def get_target_segments(result:str):
    match = re.search(r'```json(.*)```', result, re.DOTALL)
    if match:
        return match.group(1)
    else:
        return result
@dataclass
 class SegmentsTranslateAgentConfig(AgentConfig):
    to_lang: str
@@ -25,53 +82,10 @@ class SegmentsTranslateAgentConfig(AgentConfig):
 class SegmentsTranslateAgent(Agent):
    def __init__(self, config: SegmentsTranslateAgentConfig):
        super().__init__(config)
        self.to_lang = config.to_lang
        self.system_prompt = f"""
 # Role
- You are a text segment translation engine that needs to translate received original text segments into target language text segments.
+- You are a professional, authentic machine translation engine.
 # Task
 - You will receive a sequence of original text segments to be translated, represented in JSON format. The keys are segment IDs, and the values are the text content to be translated.
 - You need to translate these text segments into the target language.
 - Target language: {config.to_lang}
 # Requirements
 - Translations must be professional and accurate.
 - Do not output any explanations or comments but only the {config.to_lang} translations.
 - Use the most common translations for personal names and proper nouns.
 - Preserve special tags or untranslatable elements (such as code, brand names, technical terms) as they are.
 - (Very important) The original text segments and translated segments must strictly correspond one-to-one. It is strictly forbidden for the IDs of the translated segments to differ from those of the original segments.
 # Input Specification
 {{
 "<segment ID>": "<text to be translated>"
 }}
 # Output Specification
 {{
 "<segment ID>": "<translated text>"
 }}
 - The response must be a **valid** JSON object
 - Escape the double quotes within the JSON string.
 - (very important) The segment IDs in the output must exactly match those in the input. And all segment IDs in input must appear in the output.
 # Example (assuming the target language in this example is English, {config.to_lang} is the actual target language)
 ## Input
 {{
 "8": "然后呢？我们",
 "9": "就可以看到这个界面了",
 "10": "乔布斯在上海吃泡面",
 "11": "汤姆说：“你好”"
 }}
 ## Correct Output
 {{
 "8": "And then? We",
 "9": "can then see this interface",
 "10": "Steve Jobs ate instant noodles in Shanghai.",
 "11": "Tom says:\\\"hello\\\""
 }}
 """
        self.custom_prompt = config.custom_prompt
        if config.custom_prompt:
@@ -91,13 +105,15 @@ class SegmentsTranslateAgent(Agent):
        - 如果键不匹配，构造一个部分成功的结果，并通过 PartialTranslationError 异常抛出，以触发重试。
        - 其他错误（如JSON解析失败、模型偷懒）则抛出普通 ValueError 触发重试。
        """
        original_segments=get_original_segments(origin_prompt)
        result = get_target_segments(result)
        if result == "":
-            if origin_prompt.strip() != "":
+            if original_segments.strip() != "":
                raise AgentResultError("result为空值但原文不为空")
            return {}
        try:
            result = fix_json_string(result)
-            original_chunk = json.loads(origin_prompt)
+            original_chunk = json_repair.loads(original_segments)
            repaired_result = json_repair.loads(result)
            if not isinstance(repaired_result, dict):
@@ -144,22 +160,23 @@ class SegmentsTranslateAgent(Agent):
        处理在所有重试后仍然失败的请求。
        作为备用方案，返回原文内容，并将所有值转换为字符串。
        """
-        if origin_prompt == "":
+        original_segments=get_original_segments(origin_prompt)
        if original_segments == "":
            return {}
        try:
-            original_chunk = json.loads(origin_prompt)
+            original_chunk = json_repair.loads(original_segments)
            # 此处逻辑保留，作为最终的兜底方案
            for key, value in original_chunk.items():
                original_chunk[key] = f"{value}"
            return original_chunk
        except (RuntimeError, JSONDecodeError):
-            logger.error(f"原始prompt也不是有效的json格式: {origin_prompt}")
+            logger.error(f"原始prompt也不是有效的json格式: {original_segments}")
            # 如果原始prompt本身也无效，返回一个清晰的错误对象
-            return {"error": f"{origin_prompt}"}
+            return {"error": f"{original_segments}"}
    def send_segments(self, segments: list[str], chunk_size: int) -> list[str]:
        indexed_originals, chunks, merged_indices_list = segments2json_chunks(segments, chunk_size)
-        prompts = [json.dumps(chunk, ensure_ascii=False, indent=0) for chunk in chunks]
+        prompts = [generate_prompt(json.dumps(chunk, ensure_ascii=False, indent=0), self.to_lang) for chunk in chunks]
        translated_chunks = super().send_prompts(prompts=prompts, pre_send_handler=self._pre_send_handler,
                                                 result_handler=self._result_handler,
@@ -197,7 +214,7 @@ class SegmentsTranslateAgent(Agent):
    async def send_segments_async(self, segments: list[str], chunk_size: int) -> list[str]:
        indexed_originals, chunks, merged_indices_list = await asyncio.to_thread(segments2json_chunks, segments,
                                                                                 chunk_size)
-        prompts = [json.dumps(chunk, ensure_ascii=False, indent=0) for chunk in chunks]
+        prompts = [generate_prompt(json.dumps(chunk, ensure_ascii=False, indent=0), self.to_lang) for chunk in chunks]
        translated_chunks = await super().send_prompts_async(prompts=prompts, pre_send_handler=self._pre_send_handler,
                                                             result_handler=self._result_handler,