fix: 翻译前合并相邻同格式Run,解决Word微观Run碎片导致的格式混乱
- 新增_run_format_key(): 生成Run格式签名(bold/italic/underline/size/color/font)
- 新增_merge_adjacent_runs(): 合并相邻同格式Run,减少Run碎片
- P91从42个Run合并为4个(2格式交替),P92从50个合并为1个
- 合并后翻译比例分配不再丢失内容和格式

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -91,11 +91,52 @@ class DocxTranslator(AiTranslator):
|
|||||||
self.insert_mode = config.insert_mode
|
self.insert_mode = config.insert_mode
|
||||||
self.separator = config.separator
|
self.separator = config.separator
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _run_format_key(run: Run):
|
||||||
|
"""生成 Run 的格式签名,用于合并相同格式的 Run。"""
|
||||||
|
return (
|
||||||
|
run.bold,
|
||||||
|
run.italic,
|
||||||
|
run.underline,
|
||||||
|
run.font.size if run.font.size else None,
|
||||||
|
str(run.font.color.rgb) if run.font.color and run.font.color.rgb else None,
|
||||||
|
run.font.name or None,
|
||||||
|
)
|
||||||
|
|
||||||
|
def _merge_adjacent_runs(self, runs: List[Run]) -> List[Run]:
|
||||||
|
"""
|
||||||
|
合并相邻的、格式完全相同的 Run,同时累积文本到第一个 Run。
|
||||||
|
解决 Word 因修订历史/变更追踪产生的微观 Run 碎片问题(单个字符一个 Run)。
|
||||||
|
"""
|
||||||
|
if len(runs) <= 1:
|
||||||
|
return runs
|
||||||
|
|
||||||
|
merged = []
|
||||||
|
group_start = 0
|
||||||
|
for i in range(1, len(runs)):
|
||||||
|
if self._run_format_key(runs[i]) != self._run_format_key(runs[group_start]):
|
||||||
|
# Format boundary: finalize the current group
|
||||||
|
if i - group_start > 1:
|
||||||
|
# Merge: accumulate all text into first run, delete the rest
|
||||||
|
runs[group_start].text = "".join(r.text for r in runs[group_start:i])
|
||||||
|
for r in runs[group_start + 1:i]:
|
||||||
|
self._remove_run_element(r)
|
||||||
|
merged.append(runs[group_start])
|
||||||
|
group_start = i
|
||||||
|
|
||||||
|
# Final group
|
||||||
|
if len(runs) - group_start > 1:
|
||||||
|
runs[group_start].text = "".join(r.text for r in runs[group_start:])
|
||||||
|
for r in runs[group_start + 1:]:
|
||||||
|
self._remove_run_element(r)
|
||||||
|
merged.append(runs[group_start])
|
||||||
|
|
||||||
|
return merged
|
||||||
|
|
||||||
def _process_paragraph(self, para: Paragraph, elements: List[Dict[str, Any]], texts: List[str],
|
def _process_paragraph(self, para: Paragraph, elements: List[Dict[str, Any]], texts: List[str],
|
||||||
top_level_para: Paragraph = None):
|
top_level_para: Paragraph = None):
|
||||||
"""
|
"""
|
||||||
简化版段落处理:将段落内所有文本 Run 收集为一个翻译单元,完整保留格式结构。
|
段落级翻译处理:收集所有文本 Run → 合并相邻同格式 Run → 整段翻译 → 按比例分配。
|
||||||
不再按格式变化切分 segment,翻译质量更好、格式保留 100%。
|
|
||||||
"""
|
"""
|
||||||
if top_level_para is None:
|
if top_level_para is None:
|
||||||
top_level_para = para
|
top_level_para = para
|
||||||
@@ -103,14 +144,14 @@ class DocxTranslator(AiTranslator):
|
|||||||
text_runs = []
|
text_runs = []
|
||||||
for run in para.runs:
|
for run in para.runs:
|
||||||
if is_image_run(run):
|
if is_image_run(run):
|
||||||
continue # 跳过图片
|
|
||||||
if not run.text.strip():
|
|
||||||
# 保留带格式的空 Run(如下划线空格),但不加入文本
|
|
||||||
continue
|
continue
|
||||||
|
if not run.text.strip():
|
||||||
|
continue
|
||||||
text_runs.append(run)
|
text_runs.append(run)
|
||||||
|
|
||||||
if text_runs:
|
if text_runs:
|
||||||
|
# Merge adjacent runs with identical formatting to reduce fragmentation
|
||||||
|
text_runs = self._merge_adjacent_runs(text_runs)
|
||||||
full_text = "".join(r.text for r in text_runs)
|
full_text = "".join(r.text for r in text_runs)
|
||||||
if full_text.strip():
|
if full_text.strip():
|
||||||
elements.append({
|
elements.append({
|
||||||
|
|||||||
Reference in New Issue
Block a user