fix: 彻底移除MT批处理,qwen-mt模型会破坏所有文本分隔符
日志证实:`\n\n---\n\n` 分隔符被MT模型破坏的比例为82%(28/34批次)。逐条翻译是MT模型唯一可靠的方案,无法批处理。
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -182,47 +182,6 @@ class SegmentsTranslateAgent(Agent):
|
|||||||
logger.error(f"原始prompt也不是有效的json格式: {original_segments}")
|
logger.error(f"原始prompt也不是有效的json格式: {original_segments}")
|
||||||
return {"error": f"{original_segments}"}
|
return {"error": f"{original_segments}"}
|
||||||
|
|
||||||
MT_BATCH_SEP = "\n\n---\n\n"
|
|
||||||
|
|
||||||
def _batch_segments_for_mt(self, segments: list[str], chunk_size: int) -> tuple[list[str], list[list[int]]]:
|
|
||||||
"""将 segments 按字符数分批,用分隔符连接。返回(批文本列表, 每批的索引列表)。"""
|
|
||||||
batches = []
|
|
||||||
index_groups = []
|
|
||||||
current_parts = []
|
|
||||||
current_indices = []
|
|
||||||
current_size = 0
|
|
||||||
sep = self.MT_BATCH_SEP
|
|
||||||
sep_size = len(sep.encode('utf-8'))
|
|
||||||
|
|
||||||
for i, seg in enumerate(segments):
|
|
||||||
seg_size = len(seg.encode('utf-8'))
|
|
||||||
add_size = (sep_size if current_parts else 0) + seg_size
|
|
||||||
|
|
||||||
if current_parts and current_size + add_size > chunk_size:
|
|
||||||
batches.append(sep.join(current_parts))
|
|
||||||
index_groups.append(current_indices)
|
|
||||||
current_parts = [seg]
|
|
||||||
current_indices = [i]
|
|
||||||
current_size = seg_size
|
|
||||||
else:
|
|
||||||
current_parts.append(seg)
|
|
||||||
current_indices.append(i)
|
|
||||||
current_size += add_size
|
|
||||||
|
|
||||||
if current_parts:
|
|
||||||
batches.append(sep.join(current_parts))
|
|
||||||
index_groups.append(current_indices)
|
|
||||||
|
|
||||||
return batches, index_groups
|
|
||||||
|
|
||||||
def _mt_batch_result_handler(self, result: str, origin_prompt: str, logger: Logger) -> list[str]:
|
|
||||||
"""MT batch: 按分隔符拆分翻译结果。"""
|
|
||||||
return [p.strip() for p in result.strip().split(self.MT_BATCH_SEP)]
|
|
||||||
|
|
||||||
def _mt_batch_error_handler(self, origin_prompt: str, logger: Logger) -> list[str]:
|
|
||||||
"""MT batch error: 返回原文各段。"""
|
|
||||||
return [p.strip() for p in origin_prompt.split(self.MT_BATCH_SEP)]
|
|
||||||
|
|
||||||
def _mt_individual_result_handler(self, result: str, origin_prompt: str, logger: Logger) -> str:
|
def _mt_individual_result_handler(self, result: str, origin_prompt: str, logger: Logger) -> str:
|
||||||
"""MT individual: 直接返回翻译结果。"""
|
"""MT individual: 直接返回翻译结果。"""
|
||||||
return result.strip()
|
return result.strip()
|
||||||
@@ -276,22 +235,15 @@ class SegmentsTranslateAgent(Agent):
|
|||||||
|
|
||||||
def send_segments(self, segments: list[str], chunk_size: int) -> list[str]:
|
def send_segments(self, segments: list[str], chunk_size: int) -> list[str]:
|
||||||
if self.is_mt_mode:
|
if self.is_mt_mode:
|
||||||
|
# MT models (qwen-mt-*) destroy ALL text separators — batching is impossible.
|
||||||
|
# Each segment must be sent individually for 100% reliable 1:1 mapping.
|
||||||
if not segments:
|
if not segments:
|
||||||
return []
|
return []
|
||||||
batch_texts, batch_indices = self._batch_segments_for_mt(segments, chunk_size)
|
return super().send_prompts(
|
||||||
batch_results = super().send_prompts(
|
prompts=segments,
|
||||||
prompts=batch_texts,
|
result_handler=self._mt_individual_result_handler,
|
||||||
result_handler=self._mt_batch_result_handler,
|
error_result_handler=self._mt_individual_error_handler,
|
||||||
error_result_handler=self._mt_batch_error_handler,
|
|
||||||
)
|
)
|
||||||
all_translated, mismatched = self._apply_mt_batch_results(
|
|
||||||
segments, batch_results, batch_indices
|
|
||||||
)
|
|
||||||
if mismatched:
|
|
||||||
retranslated = self._retranslate_mismatched(segments, mismatched)
|
|
||||||
for idx, trans in retranslated.items():
|
|
||||||
all_translated[idx] = trans
|
|
||||||
return all_translated
|
|
||||||
|
|
||||||
# Non-MT mode: JSON batch translation
|
# Non-MT mode: JSON batch translation
|
||||||
indexed_originals, chunks, merged_indices_list = segments2json_chunks(segments, chunk_size)
|
indexed_originals, chunks, merged_indices_list = segments2json_chunks(segments, chunk_size)
|
||||||
@@ -331,26 +283,14 @@ class SegmentsTranslateAgent(Agent):
|
|||||||
|
|
||||||
async def send_segments_async(self, segments: list[str], chunk_size: int) -> list[str]:
|
async def send_segments_async(self, segments: list[str], chunk_size: int) -> list[str]:
|
||||||
if self.is_mt_mode:
|
if self.is_mt_mode:
|
||||||
|
# MT models destroy ALL text separators — each segment must be sent individually.
|
||||||
if not segments:
|
if not segments:
|
||||||
return []
|
return []
|
||||||
batch_texts, batch_indices = await asyncio.to_thread(
|
return await super().send_prompts_async(
|
||||||
self._batch_segments_for_mt, segments, chunk_size
|
prompts=segments,
|
||||||
|
result_handler=self._mt_individual_result_handler,
|
||||||
|
error_result_handler=self._mt_individual_error_handler,
|
||||||
)
|
)
|
||||||
batch_results = await super().send_prompts_async(
|
|
||||||
prompts=batch_texts,
|
|
||||||
result_handler=self._mt_batch_result_handler,
|
|
||||||
error_result_handler=self._mt_batch_error_handler,
|
|
||||||
)
|
|
||||||
all_translated, mismatched = self._apply_mt_batch_results(
|
|
||||||
segments, batch_results, batch_indices
|
|
||||||
)
|
|
||||||
if mismatched:
|
|
||||||
retranslated = await asyncio.to_thread(
|
|
||||||
self._retranslate_mismatched, segments, mismatched
|
|
||||||
)
|
|
||||||
for idx, trans in retranslated.items():
|
|
||||||
all_translated[idx] = trans
|
|
||||||
return all_translated
|
|
||||||
|
|
||||||
# Non-MT mode: JSON batch translation
|
# Non-MT mode: JSON batch translation
|
||||||
indexed_originals, chunks, merged_indices_list = await asyncio.to_thread(segments2json_chunks, segments,
|
indexed_originals, chunks, merged_indices_list = await asyncio.to_thread(segments2json_chunks, segments,
|
||||||
|
|||||||
Reference in New Issue
Block a user