优化docx附加译文效果v1.0

This commit is contained in:
xunbu
2025-10-25 11:04:47 +08:00
parent 6ec5747e30
commit fc72a370a3

View File

@@ -1,6 +1,8 @@
# SPDX-FileCopyrightText: 2025 QinHan # SPDX-FileCopyrightText: 2025 QinHan
# SPDX-License-Identifier: MPL-2.0 # SPDX-License-Identifier: MPL-2.0
import asyncio import asyncio
from collections import defaultdict
from copy import deepcopy
from dataclasses import dataclass from dataclasses import dataclass
from io import BytesIO from io import BytesIO
from typing import Self, Literal, List, Dict, Any, Tuple from typing import Self, Literal, List, Dict, Any, Tuple
@@ -8,6 +10,7 @@ from typing import Self, Literal, List, Dict, Any, Tuple
import docx import docx
from docx.document import Document as DocumentObject from docx.document import Document as DocumentObject
from docx.opc.part import Part from docx.opc.part import Part
from docx.oxml import OxmlElement
from docx.oxml.ns import qn from docx.oxml.ns import qn
from docx.oxml.text.run import CT_R from docx.oxml.text.run import CT_R
from docx.section import _Header, _Footer from docx.section import _Header, _Footer
@@ -161,7 +164,8 @@ class DocxTranslator(AiTranslator):
return return
full_text = "".join(r.text for r in current_runs) full_text = "".join(r.text for r in current_runs)
if full_text.strip(): if full_text.strip():
elements.append({"type": "text_runs", "runs": list(current_runs)}) # 在 elements 中增加对父段落的引用
elements.append({"type": "text_runs", "runs": list(current_runs), "paragraph": parent_paragraph})
texts.append(full_text) texts.append(full_text)
state['current_runs'].clear() state['current_runs'].clear()
@@ -232,7 +236,7 @@ class DocxTranslator(AiTranslator):
if current_runs: if current_runs:
full_text = "".join(r.text for r in current_runs) full_text = "".join(r.text for r in current_runs)
if full_text.strip(): if full_text.strip():
elements.append({"type": "text_runs", "runs": list(current_runs)}) elements.append({"type": "text_runs", "runs": list(current_runs), "paragraph": para})
texts.append(full_text) texts.append(full_text)
current_runs.clear() current_runs.clear()
@@ -306,6 +310,9 @@ class DocxTranslator(AiTranslator):
# 找到第一个可以写入文本的run # 找到第一个可以写入文本的run
for i, run in enumerate(runs): for i, run in enumerate(runs):
if run.element.getparent() is not None: if run.element.getparent() is not None:
# 如果 run 是副本的一部分,其 _parent 可能仍然指向原始文档的段落
# 但我们需要确保它与 element_info["paragraph"] 同步
run._parent = element_info["paragraph"]
run.text = final_text run.text = final_text
first_real_run_index = i first_real_run_index = i
break break
@@ -327,6 +334,7 @@ class DocxTranslator(AiTranslator):
self.logger.debug(f"尝试删除一个不存在的run元素。这通常是安全的。") self.logger.debug(f"尝试删除一个不存在的run元素。这通常是安全的。")
pass pass
# ---------- 代码修改部分:重写 _after_translate 方法 ----------
def _after_translate(self, doc: DocumentObject, elements: List[Dict[str, Any]], translated: List[str], def _after_translate(self, doc: DocumentObject, elements: List[Dict[str, Any]], translated: List[str],
originals: List[str]) -> bytes: originals: List[str]) -> bytes:
if len(elements) != len(translated): if len(elements) != len(translated):
@@ -335,21 +343,69 @@ class DocxTranslator(AiTranslator):
min_len = min(len(elements), len(translated), len(originals)) min_len = min(len(elements), len(translated), len(originals))
elements, translated, originals = elements[:min_len], translated[:min_len], originals[:min_len] elements, translated, originals = elements[:min_len], translated[:min_len], originals[:min_len]
for info, orig, trans in zip(elements, originals, translated): if self.insert_mode == "replace":
if self.insert_mode == "replace": for info, trans in zip(elements, translated):
final_text = trans self._apply_translation(info, trans)
elif self.insert_mode == "append": else:
final_text = orig + self.separator + trans # 1. 按段落对所有翻译信息进行分组
elif self.insert_mode == "prepend": paragraph_segments = defaultdict(list)
final_text = trans + self.separator + orig for i, info in enumerate(elements):
else: # 'paragraph' 是在 _process_paragraph 中添加的
final_text = trans paragraph = info["paragraph"]
self._apply_translation(info, final_text) para_id = id(paragraph._p)
# 保存元素索引和对应的译文
paragraph_segments[para_id].append({"index": i, "translation": translated[i]})
# 2. 遍历每个需要翻译的段落
processed_paragraphs = set()
for info in elements:
paragraph = info["paragraph"]
p_element = paragraph._p
para_id = id(p_element)
if para_id in processed_paragraphs:
continue
processed_paragraphs.add(para_id)
# 3. 创建一个原始段落的深层XML副本
translated_p_element = deepcopy(p_element)
# 为这个副本创建一个临时的 Paragraph 对象
# 父级容器(如 _body, _tc, etc.)对于应用翻译是必要的
translated_paragraph_obj = Paragraph(translated_p_element, paragraph._parent)
# 4. 在副本上执行“替换”操作
segments_for_this_para = paragraph_segments[para_id]
# 需要将属于这个副本的文本块 (elements) 找出来并应用翻译
for seg_info in segments_for_this_para:
element_index = seg_info["index"]
translation = seg_info["translation"]
# 关键:创建一个指向副本内部元素的 element_info
# 我们不能直接用原始的 info,因为它指向原始段落
# 但我们可以重用 runs 的结构,然后更新其父级(注意:r.element 仍是原始段落中的 run 元素,deepcopy 得到的副本拥有各自独立的元素;需先映射到副本中对应的 run,否则译文会写入原始段落而不是副本)
original_element_info = elements[element_index]
translated_element_info = {
"type": "text_runs",
"runs": [Run(r.element, translated_paragraph_obj) for r in original_element_info["runs"]],
"paragraph": translated_paragraph_obj
}
self._apply_translation(translated_element_info, translation)
# 5. 将样式完美的译文段落副本插入到原始段落旁边
if self.insert_mode == "append":
p_element.addnext(translated_p_element)
elif self.insert_mode == "prepend":
p_element.addprevious(translated_p_element)
doc_output_stream = BytesIO() doc_output_stream = BytesIO()
doc.save(doc_output_stream) doc.save(doc_output_stream)
return doc_output_stream.getvalue() return doc_output_stream.getvalue()
# ---------------------- 修改结束 ----------------------
def translate(self, document: Document) -> Self: def translate(self, document: Document) -> Self:
doc, elements, originals = self._pre_translate(document) doc, elements, originals = self._pre_translate(document)
if not originals: if not originals: