From 52bb8858c8fd812415af94de83d962da7fac71c1 Mon Sep 17 00:00:00 2001 From: Leon Date: Mon, 8 Jun 2026 15:00:31 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20=E7=BF=BB=E8=AF=91=E5=89=8D=E5=90=88?= =?UTF-8?q?=E5=B9=B6=E7=9B=B8=E9=82=BB=E5=90=8C=E6=A0=BC=E5=BC=8FRun?= =?UTF-8?q?=EF=BC=8C=E8=A7=A3=E5=86=B3Word=E5=BE=AE=E8=A7=82Run=E7=A2=8E?= =?UTF-8?q?=E7=89=87=E5=AF=BC=E8=87=B4=E7=9A=84=E6=A0=BC=E5=BC=8F=E6=B7=B7?= =?UTF-8?q?=E4=B9=B1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 新增_run_format_key(): 生成Run格式签名(bold/italic/size/color/font) - 新增_merge_adjacent_runs(): 合并相邻同格式Run,减少Run碎片 - P91从42个Run合并为4个(2格式交替),P92从50个合并为1个 - 合并后翻译比例分配不再丢失内容和格式 Co-Authored-By: Claude Opus 4.7 --- .../ai_translator/docx_translator.py | 53 ++++++++++++++++--- 1 file changed, 47 insertions(+), 6 deletions(-) diff --git a/docutranslate/translator/ai_translator/docx_translator.py b/docutranslate/translator/ai_translator/docx_translator.py index 006c9a5..dbb8787 100644 --- a/docutranslate/translator/ai_translator/docx_translator.py +++ b/docutranslate/translator/ai_translator/docx_translator.py @@ -91,11 +91,52 @@ class DocxTranslator(AiTranslator): self.insert_mode = config.insert_mode self.separator = config.separator + @staticmethod + def _run_format_key(run: Run): + """生成 Run 的格式签名,用于合并相同格式的 Run。""" + return ( + run.bold, + run.italic, + run.underline, + run.font.size if run.font.size else None, + str(run.font.color.rgb) if run.font.color and run.font.color.rgb else None, + run.font.name or None, + ) + + def _merge_adjacent_runs(self, runs: List[Run]) -> List[Run]: + """ + 合并相邻的、格式完全相同的 Run,同时累积文本到第一个 Run。 + 解决 Word 因修订历史/变更追踪产生的微观 Run 碎片问题(单个字符一个 Run)。 + """ + if len(runs) <= 1: + return runs + + merged = [] + group_start = 0 + for i in range(1, len(runs)): + if self._run_format_key(runs[i]) != self._run_format_key(runs[group_start]): + # Format boundary: finalize the current group + if i - group_start > 1: + # Merge: accumulate all text into first run, delete the rest + runs[group_start].text = "".join(r.text for r in runs[group_start:i]) + for r in runs[group_start + 1:i]: + self._remove_run_element(r) + merged.append(runs[group_start]) + group_start = i + + # Final group + if len(runs) - group_start > 1: + runs[group_start].text = "".join(r.text for r in runs[group_start:]) + for r in runs[group_start + 1:]: + self._remove_run_element(r) + merged.append(runs[group_start]) + + return merged + def _process_paragraph(self, para: Paragraph, elements: List[Dict[str, Any]], texts: List[str], top_level_para: Paragraph = None): """ - 简化版段落处理:将段落内所有文本 Run 收集为一个翻译单元,完整保留格式结构。 - 不再按格式变化切分 segment,翻译质量更好、格式保留 100%。 + 段落级翻译处理:收集所有文本 Run → 合并相邻同格式 Run → 整段翻译 → 按比例分配。 """ if top_level_para is None: top_level_para = para @@ -103,14 +144,14 @@ class DocxTranslator(AiTranslator): text_runs = [] for run in para.runs: if is_image_run(run): - continue # 跳过图片 - if not run.text.strip(): - # 保留带格式的空 Run(如下划线空格),但不加入文本 continue - + if not run.text.strip(): + continue text_runs.append(run) if text_runs: + # Merge adjacent runs with identical formatting to reduce fragmentation + text_runs = self._merge_adjacent_runs(text_runs) full_text = "".join(r.text for r in text_runs) if full_text.strip(): elements.append({