优化docx附加译文效果v1.0
This commit is contained in:
@@ -1,6 +1,8 @@
|
|||||||
# SPDX-FileCopyrightText: 2025 QinHan
|
# SPDX-FileCopyrightText: 2025 QinHan
|
||||||
# SPDX-License-Identifier: MPL-2.0
|
# SPDX-License-Identifier: MPL-2.0
|
||||||
import asyncio
|
import asyncio
|
||||||
|
from collections import defaultdict
|
||||||
|
from copy import deepcopy
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from typing import Self, Literal, List, Dict, Any, Tuple
|
from typing import Self, Literal, List, Dict, Any, Tuple
|
||||||
@@ -8,6 +10,7 @@ from typing import Self, Literal, List, Dict, Any, Tuple
|
|||||||
import docx
|
import docx
|
||||||
from docx.document import Document as DocumentObject
|
from docx.document import Document as DocumentObject
|
||||||
from docx.opc.part import Part
|
from docx.opc.part import Part
|
||||||
|
from docx.oxml import OxmlElement
|
||||||
from docx.oxml.ns import qn
|
from docx.oxml.ns import qn
|
||||||
from docx.oxml.text.run import CT_R
|
from docx.oxml.text.run import CT_R
|
||||||
from docx.section import _Header, _Footer
|
from docx.section import _Header, _Footer
|
||||||
@@ -161,7 +164,8 @@ class DocxTranslator(AiTranslator):
|
|||||||
return
|
return
|
||||||
full_text = "".join(r.text for r in current_runs)
|
full_text = "".join(r.text for r in current_runs)
|
||||||
if full_text.strip():
|
if full_text.strip():
|
||||||
elements.append({"type": "text_runs", "runs": list(current_runs)})
|
# 在 elements 中增加对父段落的引用
|
||||||
|
elements.append({"type": "text_runs", "runs": list(current_runs), "paragraph": parent_paragraph})
|
||||||
texts.append(full_text)
|
texts.append(full_text)
|
||||||
state['current_runs'].clear()
|
state['current_runs'].clear()
|
||||||
|
|
||||||
@@ -232,7 +236,7 @@ class DocxTranslator(AiTranslator):
|
|||||||
if current_runs:
|
if current_runs:
|
||||||
full_text = "".join(r.text for r in current_runs)
|
full_text = "".join(r.text for r in current_runs)
|
||||||
if full_text.strip():
|
if full_text.strip():
|
||||||
elements.append({"type": "text_runs", "runs": list(current_runs)})
|
elements.append({"type": "text_runs", "runs": list(current_runs), "paragraph": para})
|
||||||
texts.append(full_text)
|
texts.append(full_text)
|
||||||
current_runs.clear()
|
current_runs.clear()
|
||||||
|
|
||||||
@@ -306,6 +310,9 @@ class DocxTranslator(AiTranslator):
|
|||||||
# 找到第一个可以写入文本的run
|
# 找到第一个可以写入文本的run
|
||||||
for i, run in enumerate(runs):
|
for i, run in enumerate(runs):
|
||||||
if run.element.getparent() is not None:
|
if run.element.getparent() is not None:
|
||||||
|
# 如果 run 是副本的一部分,其 _parent 可能仍然指向原始文档的段落
|
||||||
|
# 但我们需要确保它与 element_info["paragraph"] 同步
|
||||||
|
run._parent = element_info["paragraph"]
|
||||||
run.text = final_text
|
run.text = final_text
|
||||||
first_real_run_index = i
|
first_real_run_index = i
|
||||||
break
|
break
|
||||||
@@ -327,6 +334,7 @@ class DocxTranslator(AiTranslator):
|
|||||||
self.logger.debug(f"尝试删除一个不存在的run元素。这通常是安全的。")
|
self.logger.debug(f"尝试删除一个不存在的run元素。这通常是安全的。")
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
# ---------- 代码修改部分:重写 _after_translate 方法 ----------
|
||||||
def _after_translate(self, doc: DocumentObject, elements: List[Dict[str, Any]], translated: List[str],
|
def _after_translate(self, doc: DocumentObject, elements: List[Dict[str, Any]], translated: List[str],
|
||||||
originals: List[str]) -> bytes:
|
originals: List[str]) -> bytes:
|
||||||
if len(elements) != len(translated):
|
if len(elements) != len(translated):
|
||||||
@@ -335,21 +343,69 @@ class DocxTranslator(AiTranslator):
|
|||||||
min_len = min(len(elements), len(translated), len(originals))
|
min_len = min(len(elements), len(translated), len(originals))
|
||||||
elements, translated, originals = elements[:min_len], translated[:min_len], originals[:min_len]
|
elements, translated, originals = elements[:min_len], translated[:min_len], originals[:min_len]
|
||||||
|
|
||||||
for info, orig, trans in zip(elements, originals, translated):
|
if self.insert_mode == "replace":
|
||||||
if self.insert_mode == "replace":
|
for info, trans in zip(elements, translated):
|
||||||
final_text = trans
|
self._apply_translation(info, trans)
|
||||||
elif self.insert_mode == "append":
|
else:
|
||||||
final_text = orig + self.separator + trans
|
# 1. 按段落对所有翻译信息进行分组
|
||||||
elif self.insert_mode == "prepend":
|
paragraph_segments = defaultdict(list)
|
||||||
final_text = trans + self.separator + orig
|
for i, info in enumerate(elements):
|
||||||
else:
|
# 'paragraph' 是在 _process_paragraph 中添加的
|
||||||
final_text = trans
|
paragraph = info["paragraph"]
|
||||||
self._apply_translation(info, final_text)
|
para_id = id(paragraph._p)
|
||||||
|
# 保存元素索引和对应的译文
|
||||||
|
paragraph_segments[para_id].append({"index": i, "translation": translated[i]})
|
||||||
|
|
||||||
|
# 2. 遍历每个需要翻译的段落
|
||||||
|
processed_paragraphs = set()
|
||||||
|
for info in elements:
|
||||||
|
paragraph = info["paragraph"]
|
||||||
|
p_element = paragraph._p
|
||||||
|
para_id = id(p_element)
|
||||||
|
|
||||||
|
if para_id in processed_paragraphs:
|
||||||
|
continue
|
||||||
|
processed_paragraphs.add(para_id)
|
||||||
|
|
||||||
|
# 3. 创建一个原始段落的深层XML副本
|
||||||
|
translated_p_element = deepcopy(p_element)
|
||||||
|
|
||||||
|
# 为这个副本创建一个临时的 Paragraph 对象
|
||||||
|
# 父级容器(如 _body, _tc, etc.)对于应用翻译是必要的
|
||||||
|
translated_paragraph_obj = Paragraph(translated_p_element, paragraph._parent)
|
||||||
|
|
||||||
|
# 4. 在副本上执行“替换”操作
|
||||||
|
segments_for_this_para = paragraph_segments[para_id]
|
||||||
|
|
||||||
|
# 需要将属于这个副本的文本块 (elements) 找出来并应用翻译
|
||||||
|
for seg_info in segments_for_this_para:
|
||||||
|
element_index = seg_info["index"]
|
||||||
|
translation = seg_info["translation"]
|
||||||
|
|
||||||
|
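# Intent: build an element_info whose runs belong to the copied paragraph.
|
||||||
|
# The original info cannot be passed as-is because it refers to the original paragraph.
|
||||||
|
# NOTE(review): the runs below reuse r.element from the ORIGINAL runs and only swap the parent object; deepcopy produced separate nodes, so confirm the text is actually written into the copy rather than the original paragraph.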
|
||||||
|
original_element_info = elements[element_index]
|
||||||
|
translated_element_info = {
|
||||||
|
"type": "text_runs",
|
||||||
|
"runs": [Run(r.element, translated_paragraph_obj) for r in original_element_info["runs"]],
|
||||||
|
"paragraph": translated_paragraph_obj
|
||||||
|
}
|
||||||
|
|
||||||
|
self._apply_translation(translated_element_info, translation)
|
||||||
|
|
||||||
|
# 5. 将样式完美的译文段落副本插入到原始段落旁边
|
||||||
|
if self.insert_mode == "append":
|
||||||
|
p_element.addnext(translated_p_element)
|
||||||
|
elif self.insert_mode == "prepend":
|
||||||
|
p_element.addprevious(translated_p_element)
|
||||||
|
|
||||||
doc_output_stream = BytesIO()
|
doc_output_stream = BytesIO()
|
||||||
doc.save(doc_output_stream)
|
doc.save(doc_output_stream)
|
||||||
return doc_output_stream.getvalue()
|
return doc_output_stream.getvalue()
|
||||||
|
|
||||||
|
# ---------------------- 修改结束 ----------------------
|
||||||
|
|
||||||
def translate(self, document: Document) -> Self:
|
def translate(self, document: Document) -> Self:
|
||||||
doc, elements, originals = self._pre_translate(document)
|
doc, elements, originals = self._pre_translate(document)
|
||||||
if not originals:
|
if not originals:
|
||||||
|
|||||||
Reference in New Issue
Block a user