From 4893163221ff58a2aa0fd636c6b4f1441d069f94 Mon Sep 17 00:00:00 2001 From: Leon Date: Mon, 8 Jun 2026 16:39:47 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20=E5=BD=BB=E5=BA=95=E7=A7=BB=E9=99=A4MT?= =?UTF-8?q?=E6=89=B9=E5=A4=84=E7=90=86=EF=BC=8Cqwen-mt=E6=A8=A1=E5=9E=8B?= =?UTF-8?q?=E4=BC=9A=E7=A0=B4=E5=9D=8F=E6=89=80=E6=9C=89=E6=96=87=E6=9C=AC?= =?UTF-8?q?=E5=88=86=E9=9A=94=E7=AC=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 日志证实: \n\n---\n\n分隔符被MT模型破坏率82%(28/34批次) MT模型逐条翻译是唯一可靠方案,无法批处理 Co-Authored-By: Claude Opus 4.7 --- docutranslate/agents/segments_agent.py | 82 ++++---------------------- 1 file changed, 11 insertions(+), 71 deletions(-) diff --git a/docutranslate/agents/segments_agent.py b/docutranslate/agents/segments_agent.py index a70d733..5392e19 100644 --- a/docutranslate/agents/segments_agent.py +++ b/docutranslate/agents/segments_agent.py @@ -182,47 +182,6 @@ class SegmentsTranslateAgent(Agent): logger.error(f"原始prompt也不是有效的json格式: {original_segments}") return {"error": f"{original_segments}"} - MT_BATCH_SEP = "\n\n---\n\n" - - def _batch_segments_for_mt(self, segments: list[str], chunk_size: int) -> tuple[list[str], list[list[int]]]: - """将 segments 按字符数分批,用分隔符连接。返回(批文本列表, 每批的索引列表)。""" - batches = [] - index_groups = [] - current_parts = [] - current_indices = [] - current_size = 0 - sep = self.MT_BATCH_SEP - sep_size = len(sep.encode('utf-8')) - - for i, seg in enumerate(segments): - seg_size = len(seg.encode('utf-8')) - add_size = (sep_size if current_parts else 0) + seg_size - - if current_parts and current_size + add_size > chunk_size: - batches.append(sep.join(current_parts)) - index_groups.append(current_indices) - current_parts = [seg] - current_indices = [i] - current_size = seg_size - else: - current_parts.append(seg) - current_indices.append(i) - current_size += add_size - - if current_parts: - batches.append(sep.join(current_parts)) - index_groups.append(current_indices) - - return batches, index_groups - - def _mt_batch_result_handler(self, result: str, origin_prompt: str, logger: Logger) -> list[str]: - """MT batch: 按分隔符拆分翻译结果。""" - return [p.strip() for p in result.strip().split(self.MT_BATCH_SEP)] - - def _mt_batch_error_handler(self, origin_prompt: str, logger: Logger) -> list[str]: - """MT batch error: 返回原文各段。""" - return [p.strip() for p in origin_prompt.split(self.MT_BATCH_SEP)] - def _mt_individual_result_handler(self, result: str, origin_prompt: str, logger: Logger) -> str: """MT individual: 直接返回翻译结果。""" return result.strip() @@ -276,22 +235,15 @@ class SegmentsTranslateAgent(Agent): def send_segments(self, segments: list[str], chunk_size: int) -> list[str]: if self.is_mt_mode: + # MT models (qwen-mt-*) destroy ALL text separators — batching is impossible. + # Each segment must be sent individually for 100% reliable 1:1 mapping. if not segments: return [] - batch_texts, batch_indices = self._batch_segments_for_mt(segments, chunk_size) - batch_results = super().send_prompts( - prompts=batch_texts, - result_handler=self._mt_batch_result_handler, - error_result_handler=self._mt_batch_error_handler, + return super().send_prompts( + prompts=segments, + result_handler=self._mt_individual_result_handler, + error_result_handler=self._mt_individual_error_handler, ) - all_translated, mismatched = self._apply_mt_batch_results( - segments, batch_results, batch_indices - ) - if mismatched: - retranslated = self._retranslate_mismatched(segments, mismatched) - for idx, trans in retranslated.items(): - all_translated[idx] = trans - return all_translated # Non-MT mode: JSON batch translation indexed_originals, chunks, merged_indices_list = segments2json_chunks(segments, chunk_size) @@ -331,26 +283,14 @@ class SegmentsTranslateAgent(Agent): async def send_segments_async(self, segments: list[str], chunk_size: int) -> list[str]: if self.is_mt_mode: + # MT models destroy ALL text separators — each segment must be sent individually. if not segments: return [] - batch_texts, batch_indices = await asyncio.to_thread( - self._batch_segments_for_mt, segments, chunk_size + return await super().send_prompts_async( + prompts=segments, + result_handler=self._mt_individual_result_handler, + error_result_handler=self._mt_individual_error_handler, ) - batch_results = await super().send_prompts_async( - prompts=batch_texts, - result_handler=self._mt_batch_result_handler, - error_result_handler=self._mt_batch_error_handler, - ) - all_translated, mismatched = self._apply_mt_batch_results( - segments, batch_results, batch_indices - ) - if mismatched: - retranslated = await asyncio.to_thread( - self._retranslate_mismatched, segments, mismatched - ) - for idx, trans in retranslated.items(): - all_translated[idx] = trans - return all_translated # Non-MT mode: JSON batch translation indexed_originals, chunks, merged_indices_list = await asyncio.to_thread(segments2json_chunks, segments,