From 2d39c009cf3cb54007195fd595a7d1d7c76e0d32 Mon Sep 17 00:00:00 2001
From: Leon <leon@langcore.cn>
Date: Mon, 8 Jun 2026 16:44:34 +0800
Subject: [PATCH] =?UTF-8?q?fix:=20MT=E6=A8=A1=E5=BC=8F=E6=94=B9=E7=94=A8JS?=
 =?UTF-8?q?ON=E6=89=B9=E5=A4=84=E7=90=86+=E9=80=90=E6=9D=A1=E5=9B=9E?=
 =?UTF-8?q?=E9=80=80=EF=BC=8C=E5=85=BC=E9=A1=BE=E9=80=9F=E5=BA=A6=E5=92=8C?=
 =?UTF-8?q?=E5=8F=AF=E9=9D=A0=E6=80=A7?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- MT模式: JSON批处理(616段→~12 chunks)
- 自动检测未翻译段(原文==译文)，逐条回退重译
- 比纯逐条快约10倍，比纯批处理可靠

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 docutranslate/agents/segments_agent.py | 129 ++++++++++++++++++++++---
 1 file changed, 118 insertions(+), 11 deletions(-)

diff --git a/docutranslate/agents/segments_agent.py b/docutranslate/agents/segments_agent.py
index 5392e19..01ce83a 100644
--- a/docutranslate/agents/segments_agent.py
+++ b/docutranslate/agents/segments_agent.py
@@ -182,6 +182,30 @@ class SegmentsTranslateAgent(Agent):
             logger.error(f"原始prompt也不是有效的json格式: {original_segments}")
             return {"error": f"{original_segments}"}
 
+    def _mt_json_result_handler(self, result: str, origin_prompt: str, logger: Logger) -> dict:
+        """MT JSON batch: parse the JSON reply into {key: translation}, one entry per request key.
+
+        Raises AgentResultError if either payload cannot be parsed or the reply is not a dict.
+        """
+        try:
+            original_chunk = json_repair.loads(origin_prompt)
+            repaired = json_repair.loads(result)
+        except (RuntimeError, JSONDecodeError) as exc:
+            raise AgentResultError("MT JSON parse failed") from exc
+        if not isinstance(repaired, dict):
+            raise AgentResultError(f"MT JSON result not dict: {type(repaired)}")
+        # A missing key or JSON null keeps the source text; str(None) would emit "None".
+        translated = {k: v for k, v in repaired.items() if v is not None}
+        return {key: str(translated.get(key, source)) for key, source in original_chunk.items()}
+
+    def _mt_json_error_handler(self, origin_prompt: str, logger: Logger) -> dict:
+        """MT JSON batch failure: return the source texts untranslated, keyed like the chunk."""
+        try:
+            source_items = json_repair.loads(origin_prompt).items()
+            return {key: str(text) for key, text in source_items}
+        except Exception:  # best-effort: an unreadable prompt becomes a single "0" entry
+            return {"0": origin_prompt}
+
     def _mt_individual_result_handler(self, result: str, origin_prompt: str, logger: Logger) -> str:
         """MT individual: 直接返回翻译结果。"""
         return result.strip()
@@ -235,16 +259,56 @@ class SegmentsTranslateAgent(Agent):
 
     def send_segments(self, segments: list[str], chunk_size: int) -> list[str]:
         if self.is_mt_mode:
-            # MT models (qwen-mt-*) destroy ALL text separators — batching is impossible.
-            # Each segment must be sent individually for 100% reliable 1:1 mapping.
             if not segments:
                 return []
-            return super().send_prompts(
-                prompts=segments,
-                result_handler=self._mt_individual_result_handler,
-                error_result_handler=self._mt_individual_error_handler,
+            # JSON batching: 616 segments → ~12 JSON chunks
+            indexed_originals, chunks, merged_indices_list = segments2json_chunks(segments, chunk_size)
+            prompts = [json.dumps(chunk, ensure_ascii=False) for chunk in chunks]
+            translated_chunks = super().send_prompts(
+                prompts=prompts,
+                result_handler=self._mt_json_result_handler,
+                error_result_handler=self._mt_json_error_handler,
             )
 
+            # Find segments the batch left untranslated (missing, or echoed back unchanged).
+            # Chunk keys index indexed_originals, not `segments`, so retries use the chunk text.
+            indexed_translated = indexed_originals.copy()
+            retry_sources = {}
+            for chunk_result, chunk_original in zip(translated_chunks, chunks):
+                if not isinstance(chunk_result, dict):
+                    retry_sources.update(chunk_original)
+                    continue
+                for key, source in chunk_original.items():
+                    val = chunk_result.get(key)
+                    if not isinstance(val, str) or val.strip() == str(source).strip():
+                        retry_sources[key] = source
+                    else:
+                        indexed_translated[key] = val
+
+            # Retry failed segments individually
+            if retry_sources:
+                self.logger.info(
+                    f"MT JSON batch: {len(retry_sources)}/{len(indexed_originals)} segments unchanged, retrying individually"
+                )
+                retry_results = super().send_prompts(
+                    prompts=[str(source) for source in retry_sources.values()],
+                    result_handler=self._mt_individual_result_handler,
+                    error_result_handler=self._mt_individual_error_handler,
+                )
+                for key, trans in zip(retry_sources, retry_results):
+                    indexed_translated[key] = trans
+
+            # Reconstruct result list: each merged_indices_list range is joined into one entry
+            result = []
+            last_end = 0
+            ls = list(indexed_translated.values())
+            for start, end in merged_indices_list:
+                result.extend(ls[last_end:start])
+                result.append("".join(map(str, ls[start:end])))
+                last_end = end
+            result.extend(ls[last_end:])
+            return result
+
         # Non-MT mode: JSON batch translation
         indexed_originals, chunks, merged_indices_list = segments2json_chunks(segments, chunk_size)
         prompts = [generate_prompt(json.dumps(chunk, ensure_ascii=False, indent=0), self.to_lang) for chunk in chunks]
@@ -283,14 +347,57 @@ class SegmentsTranslateAgent(Agent):
 
     async def send_segments_async(self, segments: list[str], chunk_size: int) -> list[str]:
         if self.is_mt_mode:
-            # MT models destroy ALL text separators — each segment must be sent individually.
             if not segments:
                 return []
-            return await super().send_prompts_async(
-                prompts=segments,
-                result_handler=self._mt_individual_result_handler,
-                error_result_handler=self._mt_individual_error_handler,
+            # JSON batching: 616 segments → ~12 JSON chunks
+            indexed_originals, chunks, merged_indices_list = await asyncio.to_thread(
+                segments2json_chunks, segments, chunk_size
             )
+            prompts = [json.dumps(chunk, ensure_ascii=False) for chunk in chunks]
+            translated_chunks = await super().send_prompts_async(
+                prompts=prompts,
+                result_handler=self._mt_json_result_handler,
+                error_result_handler=self._mt_json_error_handler,
+            )
+
+            # Find segments the batch left untranslated (missing, or echoed back unchanged).
+            # Chunk keys index indexed_originals, not `segments`, so retries use the chunk text.
+            indexed_translated = indexed_originals.copy()
+            retry_sources = {}
+            for chunk_result, chunk_original in zip(translated_chunks, chunks):
+                if not isinstance(chunk_result, dict):
+                    retry_sources.update(chunk_original)
+                    continue
+                for key, source in chunk_original.items():
+                    val = chunk_result.get(key)
+                    if not isinstance(val, str) or val.strip() == str(source).strip():
+                        retry_sources[key] = source
+                    else:
+                        indexed_translated[key] = val
+
+            # Retry failed segments individually
+            if retry_sources:
+                self.logger.info(
+                    f"MT JSON batch: {len(retry_sources)}/{len(indexed_originals)} segments unchanged, retrying individually"
+                )
+                retry_results = await super().send_prompts_async(
+                    prompts=[str(source) for source in retry_sources.values()],
+                    result_handler=self._mt_individual_result_handler,
+                    error_result_handler=self._mt_individual_error_handler,
+                )
+                for key, trans in zip(retry_sources, retry_results):
+                    indexed_translated[key] = trans
+
+            # Reconstruct result list: each merged_indices_list range is joined into one entry
+            result = []
+            last_end = 0
+            ls = list(indexed_translated.values())
+            for start, end in merged_indices_list:
+                result.extend(ls[last_end:start])
+                result.append("".join(map(str, ls[start:end])))
+                last_end = end
+            result.extend(ls[last_end:])
+            return result
 
         # Non-MT mode: JSON batch translation
         indexed_originals, chunks, merged_indices_list = await asyncio.to_thread(segments2json_chunks, segments,