diff --git a/docutranslate/agents/agent.py b/docutranslate/agents/agent.py
index c329ed8..f000850 100644
--- a/docutranslate/agents/agent.py
+++ b/docutranslate/agents/agent.py
@@ -292,7 +292,7 @@ class Agent:
response.raise_for_status()
# print(f"【测试】resp:\n{response.json()}")
result = response.json()["choices"][0]["message"]["content"]
-
+ # print(f"【测试】\nprompt:\n{prompt}\nresp:\n{result}")
# 获取token使用情况
response_data = response.json()
input_tokens, cached_tokens, output_tokens, reasoning_tokens = (
diff --git a/docutranslate/agents/segments_agent.py b/docutranslate/agents/segments_agent.py
index bed7312..a1f35a0 100644
--- a/docutranslate/agents/segments_agent.py
+++ b/docutranslate/agents/segments_agent.py
@@ -3,6 +3,7 @@
import asyncio
import json
+import re
from dataclasses import dataclass
from json import JSONDecodeError
from logging import Logger
@@ -15,6 +16,62 @@ from docutranslate.glossary.glossary import Glossary
from docutranslate.utils.json_utils import segments2json_chunks, fix_json_string
+def generate_prompt(json_segments: str, to_lang: str):
+ return f"""
+You will receive a sequence of original text segments to be translated, represented in JSON format. The keys are segment IDs, and the values are the text content to be translated.
+Here is the input:
+
+
+
+```json
+{json_segments}
+```
+
+
+
+For each Key-Value Pair in the JSON, translate the contents of the value into {to_lang}, Write the translation back into the value for that JSON.
+> (Very important) The original text segments and translated segments must strictly correspond one-to-one. It is strictly forbidden for the IDs of the translated segments to differ from those of the original segments.
+> The segment IDs in the output must exactly match those in the input. And all segment IDs in input must appear in the output.
+Here is an example of the expected format:
+
+
+
+Input:
+
+```json
+{{
+3:source,
+4:source,
+}}
+```
+
+Output(target language: {to_lang}):
+
+```json
+{{
+3:translation,
+4:translation,
+}}
+```
+
+
+Please return the translated JSON directly without including any additional information and preserve special tags or untranslatable elements (such as code, brand names, technical terms) as they are.
+"""
+
+def get_original_segments(prompt:str):
+ match = re.search(r'(.*)', prompt, re.DOTALL)
+ if match:
+ return match.group(1)
+ else:
+ raise ValueError("无法从prompt中提取初始文本")
+
+def get_target_segments(result:str):
+ match = re.search(r'```json(.*)```', result, re.DOTALL)
+ if match:
+ return match.group(1)
+ else:
+ return result
+
@dataclass
class SegmentsTranslateAgentConfig(AgentConfig):
to_lang: str
@@ -25,53 +82,10 @@ class SegmentsTranslateAgentConfig(AgentConfig):
class SegmentsTranslateAgent(Agent):
def __init__(self, config: SegmentsTranslateAgentConfig):
super().__init__(config)
+ self.to_lang = config.to_lang
self.system_prompt = f"""
# Role
-- You are a text segment translation engine that needs to translate received original text segments into target language text segments.
-
-# Task
-- You will receive a sequence of original text segments to be translated, represented in JSON format. The keys are segment IDs, and the values are the text content to be translated.
-- You need to translate these text segments into the target language.
-- Target language: {config.to_lang}
-
-# Requirements
-- Translations must be professional and accurate.
-- Do not output any explanations or comments but only the {config.to_lang} translations.
-- Use the most common translations for personal names and proper nouns.
-- Preserve special tags or untranslatable elements (such as code, brand names, technical terms) as they are.
-- (Very important) The original text segments and translated segments must strictly correspond one-to-one. It is strictly forbidden for the IDs of the translated segments to differ from those of the original segments.
-
-# Input Specification
-{{
-"": ""
-}}
-
-# Output Specification
-{{
-"": ""
-}}
-- The response must be a **valid** JSON object
-- Escape the double quotes within the JSON string.
-- (very important) The segment IDs in the output must exactly match those in the input. And all segment IDs in input must appear in the output.
-
-# Example (assuming the target language in this example is English, {config.to_lang} is the actual target language)
-
-## Input
-{{
-"8": "然后呢?我们",
-"9": "就可以看到这个界面了",
-"10": "乔布斯在上海吃泡面",
-"11": "汤姆说:“你好”"
-}}
-
-## Correct Output
-{{
-"8": "And then? We",
-"9": "can then see this interface",
-"10": "Steve Jobs ate instant noodles in Shanghai.",
-"11": "Tom says:\\\"hello\\\""
-}}
-
+- You are a professional, authentic machine translation engine.
"""
self.custom_prompt = config.custom_prompt
if config.custom_prompt:
@@ -91,13 +105,15 @@ class SegmentsTranslateAgent(Agent):
- 如果键不匹配,构造一个部分成功的结果,并通过 PartialTranslationError 异常抛出,以触发重试。
- 其他错误(如JSON解析失败、模型偷懒)则抛出普通 ValueError 触发重试。
"""
+ original_segments=get_original_segments(origin_prompt)
+ result = get_target_segments(result)
if result == "":
- if origin_prompt.strip() != "":
+ if original_segments.strip() != "":
raise AgentResultError("result为空值但原文不为空")
return {}
try:
result = fix_json_string(result)
- original_chunk = json.loads(origin_prompt)
+ original_chunk = json_repair.loads(original_segments)
repaired_result = json_repair.loads(result)
if not isinstance(repaired_result, dict):
@@ -144,22 +160,23 @@ class SegmentsTranslateAgent(Agent):
处理在所有重试后仍然失败的请求。
作为备用方案,返回原文内容,并将所有值转换为字符串。
"""
- if origin_prompt == "":
+ original_segments=get_original_segments(origin_prompt)
+ if original_segments == "":
return {}
try:
- original_chunk = json.loads(origin_prompt)
+ original_chunk = json_repair.loads(original_segments)
# 此处逻辑保留,作为最终的兜底方案
for key, value in original_chunk.items():
original_chunk[key] = f"{value}"
return original_chunk
except (RuntimeError, JSONDecodeError):
- logger.error(f"原始prompt也不是有效的json格式: {origin_prompt}")
+ logger.error(f"原始prompt也不是有效的json格式: {original_segments}")
# 如果原始prompt本身也无效,返回一个清晰的错误对象
- return {"error": f"{origin_prompt}"}
+ return {"error": f"{original_segments}"}
def send_segments(self, segments: list[str], chunk_size: int) -> list[str]:
indexed_originals, chunks, merged_indices_list = segments2json_chunks(segments, chunk_size)
- prompts = [json.dumps(chunk, ensure_ascii=False, indent=0) for chunk in chunks]
+ prompts = [generate_prompt(json.dumps(chunk, ensure_ascii=False, indent=0), self.to_lang) for chunk in chunks]
translated_chunks = super().send_prompts(prompts=prompts, pre_send_handler=self._pre_send_handler,
result_handler=self._result_handler,
@@ -197,7 +214,7 @@ class SegmentsTranslateAgent(Agent):
async def send_segments_async(self, segments: list[str], chunk_size: int) -> list[str]:
indexed_originals, chunks, merged_indices_list = await asyncio.to_thread(segments2json_chunks, segments,
chunk_size)
- prompts = [json.dumps(chunk, ensure_ascii=False, indent=0) for chunk in chunks]
+ prompts = [generate_prompt(json.dumps(chunk, ensure_ascii=False, indent=0), self.to_lang) for chunk in chunks]
translated_chunks = await super().send_prompts_async(prompts=prompts, pre_send_handler=self._pre_send_handler,
result_handler=self._result_handler,