fix: MT模式改为逐条翻译,彻底消除标记依赖和29.4%内容丢失
- MT模式不再打包segment为<<<SEG:n>>>标记格式
- 改为每个segment一次纯文本API调用,1:1映射,零标记零解析
- 删除_chunk_to_mt_prompt/_parse_mt_response/_result_handler_mt等~100行死代码
- 新增_mt_simple_result_handler/_mt_simple_error_handler
- 非MT模式(JSON批处理)不受影响

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -15,10 +15,6 @@ from docutranslate.agents.agent import PartialAgentResultError, AgentResultError
|
||||
from docutranslate.glossary.glossary import Glossary
|
||||
from docutranslate.utils.json_utils import segments2json_chunks, fix_json_string
|
||||
|
||||
# MT mode plain-text segment marker — designed to survive machine translation unchanged
|
||||
MT_SEG_MARKER_RE = re.compile(r'<<<SEG:(\d+)>>>\s*\n(.*?)(?=<<<SEG:\d+>>>|\Z)', re.DOTALL)
|
||||
|
||||
|
||||
def generate_prompt(json_segments: str, to_lang: str):
|
||||
return f"""
|
||||
You will receive a sequence of original text segments to be translated, represented in JSON format. The keys are segment IDs, and the values are the text content to be translated.
|
||||
@@ -95,44 +91,6 @@ def get_target_segments(result: str):
|
||||
return result
|
||||
|
||||
|
||||
def _chunk_to_mt_prompt(chunk: dict) -> str:
    """Render a segment chunk as marker-delimited plain text.

    Every entry of ``chunk`` (for example ``{'0': 'text1', '1': 'text2'}``)
    becomes a ``<<<SEG:key>>>`` header line followed by its text. Entries are
    emitted in ascending numeric key order and joined with single newlines.
    """
    ordered_ids = sorted(chunk, key=int)
    return "\n".join(f"<<<SEG:{seg_id}>>>\n{chunk[seg_id]}" for seg_id in ordered_ids)
|
||||
|
||||
|
||||
def _parse_mt_prompt_to_dict(mt_prompt: str) -> dict:
    """Recover the ``{segment_id: text}`` dict from a marker-delimited MT prompt.

    Each ``<<<SEG:n>>>`` section found by ``MT_SEG_MARKER_RE`` contributes one
    entry whose value is the section text with surrounding whitespace stripped.
    If an id occurs more than once, the last occurrence wins.
    """
    segments = {
        found.group(1): found.group(2).strip()
        for found in MT_SEG_MARKER_RE.finditer(mt_prompt)
    }
    if segments:
        return segments
    # No marker matched at all: treat the whole prompt as one segment with id "0".
    return {"0": mt_prompt}
|
||||
|
||||
|
||||
def _parse_mt_response(text: str, original_chunk: dict) -> dict:
    """Split a marker-delimited MT response into a ``{segment_id: text}`` dict.

    Only ids that also exist in ``original_chunk`` are kept; sections with an
    unknown id are ignored. Every id of ``original_chunk`` that the response
    did not provide is present in the result with an empty string, so the
    returned dict always contains all keys of ``original_chunk``.
    """
    recovered = {}
    for found in MT_SEG_MARKER_RE.finditer(text):
        seg_id = found.group(1)
        if seg_id not in original_chunk:
            continue
        recovered[seg_id] = found.group(2).strip()

    # Pad ids the response omitted with "" (not with the original text).
    for seg_id in original_chunk:
        recovered.setdefault(seg_id, "")

    return recovered
|
||||
|
||||
|
||||
@dataclass(kw_only=True)
|
||||
class SegmentsTranslateAgentConfig(AgentConfig):
|
||||
to_lang: str
|
||||
@@ -162,14 +120,8 @@ class SegmentsTranslateAgent(Agent):
|
||||
return system_prompt, prompt
|
||||
|
||||
def _result_handler(self, result: str, origin_prompt: str, logger: Logger):
|
||||
"""
|
||||
处理成功的API响应。
|
||||
MT模式下使用 <<<SEG:n>>> 标记解析纯文本响应,避免JSON格式不兼容问题。
|
||||
"""
|
||||
if self.is_mt_mode:
|
||||
return self._result_handler_mt(result, origin_prompt, logger)
|
||||
|
||||
# --- Non-MT mode (JSON-based) ---
|
||||
"""处理非MT模式的JSON翻译响应。"""
|
||||
# --- JSON-based ---
|
||||
original_segments = get_original_segments(origin_prompt)
|
||||
result = get_target_segments(result)
|
||||
if result == "":
|
||||
@@ -216,77 +168,8 @@ class SegmentsTranslateAgent(Agent):
|
||||
except (RuntimeError, JSONDecodeError) as e:
|
||||
raise AgentResultError(f"结果处理失败: {e.__repr__()}")
|
||||
|
||||
def _result_handler_mt(self, result: str, origin_prompt: str, logger: Logger) -> dict:
    """Turn a marker-delimited plain-text MT response into ``{segment_id: text}``.

    Strategy, in order:

    1. Parse ``<<<SEG:n>>>`` markers. When at least one segment comes back
       non-empty, the result is accepted only if every non-empty original
       segment also has a non-empty translation. Otherwise a
       ``PartialAgentResultError`` is raised that carries the recovered
       segments, with the original text substituted for the missing ones.
    2. When no marker produced any text, map the non-empty response lines
       one-to-one onto the original segment ids (ascending numeric order),
       provided the counts match.
    3. Otherwise store the whole response under the first segment id and
       leave the remaining ids empty.

    Args:
        result: Raw text returned by the translation backend.
        origin_prompt: The marker-delimited prompt that was sent.
        logger: Not used by this handler.

    Returns:
        A dict keyed by the segment ids parsed from ``origin_prompt``; an empty
        dict when both the response and the prompt are blank.

    Raises:
        AgentResultError: The response is blank for a non-blank prompt, the
            result is identical to the original text, or nothing usable could
            be extracted.
        PartialAgentResultError: Some, but not all, segments were translated.
    """
    result_clean = result.strip()
    if not result_clean:
        if origin_prompt.strip():
            raise AgentResultError("result为空值但原文不为空")
        return {}

    original_chunk = _parse_mt_prompt_to_dict(origin_prompt)
    # Ids come from the marker regex (digits only) or the "0" fallback, so
    # numeric ordering is always possible.
    ordered_keys = sorted(original_chunk, key=int)

    def _same_as_original(candidate: dict) -> bool:
        # True when every segment is textually unchanged, i.e. nothing was translated.
        return all(
            candidate.get(key, "").strip() == str(original_chunk.get(key, "")).strip()
            for key in ordered_keys
        )

    # First attempt: parse the <<<SEG:n>>> markers.
    parsed = _parse_mt_response(result_clean, original_chunk)

    if any(value.strip() for value in parsed.values()):
        # _parse_mt_response pads every omitted id with "", so comparing key
        # sets can never reveal a dropped segment. Compare content instead:
        # an empty translation of a non-empty original counts as missing.
        missing = {
            key for key in ordered_keys
            if not parsed.get(key, "").strip() and str(original_chunk[key]).strip()
        }
        if missing:
            final_chunk = {
                key: str(original_chunk[key]) if key in missing else parsed.get(key, "")
                for key in ordered_keys
            }
            raise PartialAgentResultError(
                "MT模式键不匹配,触发重试",
                partial_result=final_chunk,
                append_prompt="\nPreserve all <<<SEG:n>>> markers exactly as they appear.\n"
            )
        if _same_as_original(parsed):
            raise AgentResultError("翻译结果与原文完全相同,疑似翻译失败,将进行重试。")
        return parsed

    # Fallback: the backend may have removed the markers; try a line-by-line mapping.
    non_empty_lines = [line.strip() for line in result_clean.split('\n') if line.strip()]

    if len(non_empty_lines) == len(original_chunk):
        # Key the lines by the chunk's real ids rather than assuming "0".."n-1".
        repaired = dict(zip(ordered_keys, non_empty_lines))
        if _same_as_original(repaired):
            raise AgentResultError("翻译结果与原文完全相同(逐行),疑似翻译失败,将进行重试。")
        return repaired

    # Last fallback: assign the whole response text to the first segment id.
    if non_empty_lines:
        repaired = {key: "" for key in ordered_keys}
        repaired[ordered_keys[0]] = "\n".join(non_empty_lines)
        return repaired

    raise AgentResultError("MT模式无法解析响应")
|
||||
|
||||
def _error_result_handler(self, origin_prompt: str, logger: Logger):
|
||||
"""
|
||||
处理在所有重试后仍然失败的请求。
|
||||
作为备用方案,返回原文内容。
|
||||
"""
|
||||
if self.is_mt_mode:
|
||||
original_chunk = _parse_mt_prompt_to_dict(origin_prompt)
|
||||
for key in list(original_chunk.keys()):
|
||||
original_chunk[key] = f"{original_chunk[key]}"
|
||||
return original_chunk
|
||||
|
||||
"""非MT模式: 所有重试失败后返回原文。"""
|
||||
original_segments = get_original_segments(origin_prompt)
|
||||
if original_segments == "":
|
||||
return {}
|
||||
@@ -299,11 +182,25 @@ class SegmentsTranslateAgent(Agent):
|
||||
logger.error(f"原始prompt也不是有效的json格式: {original_segments}")
|
||||
return {"error": f"{original_segments}"}
|
||||
|
||||
def _mt_simple_result_handler(self, result: str, origin_prompt: str, logger: Logger) -> str:
    """MT mode: hand back the translation text with surrounding whitespace removed.

    No marker or JSON parsing takes place; ``origin_prompt`` and ``logger``
    are not used here.
    """
    translated = result.strip()
    return translated
|
||||
|
||||
def _mt_simple_error_handler(self, origin_prompt: str, logger: Logger) -> str:
    """MT mode error fallback: give back the original text unchanged.

    ``logger`` is not used here.
    """
    fallback_text = origin_prompt
    return fallback_text
|
||||
|
||||
def send_segments(self, segments: list[str], chunk_size: int) -> list[str]:
|
||||
indexed_originals, chunks, merged_indices_list = segments2json_chunks(segments, chunk_size)
|
||||
if self.is_mt_mode:
|
||||
prompts = [_chunk_to_mt_prompt(chunk) for chunk in chunks]
|
||||
else:
|
||||
# MT mode: send each segment individually as plain text, no markers, no batching
|
||||
return super().send_prompts(
|
||||
prompts=segments,
|
||||
result_handler=self._mt_simple_result_handler,
|
||||
error_result_handler=self._mt_simple_error_handler,
|
||||
)
|
||||
|
||||
# Non-MT mode: JSON batch translation
|
||||
indexed_originals, chunks, merged_indices_list = segments2json_chunks(segments, chunk_size)
|
||||
prompts = [generate_prompt(json.dumps(chunk, ensure_ascii=False, indent=0), self.to_lang) for chunk in chunks]
|
||||
translated_chunks = super().send_prompts(prompts=prompts, json_format=self.force_json,
|
||||
pre_send_handler=self._pre_send_handler,
|
||||
@@ -339,13 +236,18 @@ class SegmentsTranslateAgent(Agent):
|
||||
return result
|
||||
|
||||
async def send_segments_async(self, segments: list[str], chunk_size: int) -> list[str]:
|
||||
if self.is_mt_mode:
|
||||
# MT mode: send each segment individually as plain text, no markers, no batching
|
||||
return await super().send_prompts_async(
|
||||
prompts=segments,
|
||||
result_handler=self._mt_simple_result_handler,
|
||||
error_result_handler=self._mt_simple_error_handler,
|
||||
)
|
||||
|
||||
# Non-MT mode: JSON batch translation
|
||||
indexed_originals, chunks, merged_indices_list = await asyncio.to_thread(segments2json_chunks, segments,
|
||||
chunk_size)
|
||||
if self.is_mt_mode:
|
||||
prompts = [_chunk_to_mt_prompt(chunk) for chunk in chunks]
|
||||
else:
|
||||
prompts = [generate_prompt(json.dumps(chunk, ensure_ascii=False, indent=0), self.to_lang) for chunk in chunks]
|
||||
|
||||
translated_chunks = await super().send_prompts_async(prompts=prompts, force_json=self.force_json,
|
||||
pre_send_handler=self._pre_send_handler,
|
||||
result_handler=self._result_handler,
|
||||
|
||||
Reference in New Issue
Block a user