From 9676fdaf99dd46f6267382aca3583a3412c4966d Mon Sep 17 00:00:00 2001 From: xunbu Date: Mon, 13 Oct 2025 20:51:53 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BC=98=E5=8C=96docx=E3=80=81=E6=8F=90?= =?UTF-8?q?=E7=A4=BA=E8=AF=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docutranslate/agents/agent.py | 2 +- docutranslate/agents/segments_agent.py | 1 + .../ai_translator/docx_translator.py | 137 +++++++++++++----- 3 files changed, 100 insertions(+), 40 deletions(-) diff --git a/docutranslate/agents/agent.py b/docutranslate/agents/agent.py index 1c1cfef..fed47a0 100644 --- a/docutranslate/agents/agent.py +++ b/docutranslate/agents/agent.py @@ -319,7 +319,7 @@ class Agent: should_retry = True # 专门捕获部分翻译错误(软错误) except PartialAgentResultError as e: - # print(f"【测试】\nprompt:\n{prompt}\nresp:\n{result}") + print(f"【测试】\nprompt:\n{prompt}\nresp:\n{result}") self.logger.error(f"收到部分返回结果,将尝试重试: {e}") current_partial_result = e.partial_result should_retry = True diff --git a/docutranslate/agents/segments_agent.py b/docutranslate/agents/segments_agent.py index ccabbaf..fb0c243 100644 --- a/docutranslate/agents/segments_agent.py +++ b/docutranslate/agents/segments_agent.py @@ -52,6 +52,7 @@ class SegmentsTranslateAgent(Agent): "": "" }} - The response must be a **valid** JSON object +- Escape the double quotes within the JSON string. - (very important) The segment IDs in the output must exactly match those in the input. And all segment IDs in input must appear in the output. # Example (assuming the target language in this example is English, {config.to_lang} is the actual target language) diff --git a/docutranslate/translator/ai_translator/docx_translator.py b/docutranslate/translator/ai_translator/docx_translator.py index 4213897..4cbcf21 100644 --- a/docutranslate/translator/ai_translator/docx_translator.py +++ b/docutranslate/translator/ai_translator/docx_translator.py @@ -55,9 +55,29 @@ class DocxTranslator(AiTranslator): """ 用于翻译 .docx 文件的高级翻译器,能够高精度保留样式、处理超链接、 域代码(如图注),并支持翻译脚注、尾注等。 - [v3.1 - 采纳新规则:将带格式的空文本视为分割点] + [v3.6 - 引入递归解析以处理嵌套内容标签,如 smartTag] """ + # 包含所有应被解析器完全忽略的、不影响文本内容的元数据标签 + IGNORED_TAGS = { + qn('w:proofErr'), # 拼写和语法错误标记 + qn('w:lastRenderedPageBreak'), # 上次渲染的分页符位置 + qn('w:bookmarkStart'), # 书签开始 + qn('w:bookmarkEnd'), # 书签结束 + qn('w:commentRangeStart'), # 批注范围开始 + qn('w:commentRangeEnd'), # 批注范围结束 + qn('w:del'), # 修订:删除 + qn('w:ins'), # 修订:插入 + qn('w:moveFrom'), # 修订:移动源 + qn('w:moveTo'), # 修订:移动目标 + } + + # 包含应递归处理其内部内容的容器标签 + RECURSIVE_CONTAINER_TAGS = { + qn('w:smartTag'), # 智能标记 (包含文本) + qn('w:sdtContent'), # 结构化文档标签内容 (包含文本) + } + def __init__(self, config: DocxTranslatorConfig): super().__init__(config=config) self.chunk_size = config.chunk_size @@ -104,16 +124,13 @@ class DocxTranslator(AiTranslator): pass return False - def _process_paragraph(self, para: Paragraph, elements: List[Dict[str, Any]], texts: List[str]): + def _process_element_children(self, element, elements: List[Dict[str, Any]], texts: List[str], + state: Dict[str, Any]): """ - 使用状态机处理段落,将连续的文本Run合并为一个翻译单元, - 同时将图片、超链接、带格式的空文本等视为分割点。 + [新函数] 递归处理任何给定XML元素的子节点。 + 'state' 字典用于跨递归调用传递状态,如 current_runs 和 is_inside_field。 """ - if not para.text.strip(): - return - - current_runs = [] - is_inside_field = False + current_runs = state['current_runs'] def flush_segment(): nonlocal current_runs @@ -124,43 +141,73 @@ class DocxTranslator(AiTranslator): elements.append({"type": "text_runs", "runs": current_runs}) texts.append(full_text) current_runs = [] + state['current_runs'] = current_runs - for child in para._p: - if self._is_seq_field(child): - flush_segment() + for child in element: + if child.tag in self.IGNORED_TAGS: continue - if child.tag == qn('w:fldChar'): - fld_type = child.get(qn('w:fldCharType')) - if fld_type == 'begin': - flush_segment() - is_inside_field = True - elif fld_type == 'end': - is_inside_field = False + if child.tag in self.RECURSIVE_CONTAINER_TAGS: + self._process_element_children(child, elements, texts, state) continue - if is_inside_field: - continue - - if isinstance(child, CT_R): - run = Run(child, para) - # 如果是图片、制表符,或者【带格式的空文本】,则视为分割点 - if is_image_run(run) or run.element.find(qn('w:tab')) is not None or is_formatting_only_run(run): - flush_segment() - # 这个 run 本身被保留,不参与翻译 - else: - # 否则,它是一个普通的文本 Run,收集起来 - current_runs.append(run) - elif child.tag == qn('w:hyperlink'): + if child.tag == qn('w:hyperlink'): flush_segment() hyperlink_text = self._extract_hyperlink_text(child) if hyperlink_text.strip(): elements.append({"type": "hyperlink", "element": child}) texts.append(hyperlink_text) + continue + + field_char_element = None + if child.tag == qn('w:fldChar'): + field_char_element = child + elif isinstance(child, CT_R): + field_char_element = child.find(qn('w:fldChar')) + + if field_char_element is not None: + flush_segment() + fld_type = field_char_element.get(qn('w:fldCharType')) + if fld_type == 'begin': + state['is_inside_field'] = True + elif fld_type == 'end': + state['is_inside_field'] = False + continue + + if state.get('is_inside_field', False): + continue + + if self._is_seq_field(child): + flush_segment() + continue + + if isinstance(child, CT_R): + run = Run(child, None) + if is_image_run(run) or run.element.find(qn('w:tab')) is not None or is_formatting_only_run(run): + flush_segment() + else: + current_runs.append(run) else: flush_segment() - flush_segment() + state['current_runs'] = current_runs + + def _process_paragraph(self, para: Paragraph, elements: List[Dict[str, Any]], texts: List[str]): + """ + [重构] 作为递归处理器的入口点,初始化状态并调用递归函数。 + """ + if not para.text.strip(): + return + + state = {'current_runs': [], 'is_inside_field': False} + self._process_element_children(para._p, elements, texts, state) + + current_runs = state['current_runs'] + if current_runs: + full_text = "".join(r.text for r in current_runs) + if full_text.strip(): + elements.append({"type": "text_runs", "runs": current_runs}) + texts.append(full_text) def _process_container(self, container, elements: List[Dict[str, Any]], texts: List[str]): """递归处理包含段落和表格的容器(如文档、单元格、页眉)。""" @@ -204,14 +251,23 @@ class DocxTranslator(AiTranslator): return doc, elements, texts def _apply_translation(self, element_info: Dict[str, Any], final_text: str): - """将翻译后的文本写回对应的 OXML 元素。对于多Run的文本段,写入第一个Run并清空其余。""" + """ + 将翻译后的文本写回对应的 OXML 元素。 + 对于多Run的文本段,写入第一个Run并【删除】其余,以避免产生方框占位符。 + """ el_type = element_info["type"] if el_type == "text_runs": runs = element_info["runs"] - if runs: - runs[0].text = final_text - for run in runs[1:]: - run.text = "" + if not runs: + return + + runs[0].text = final_text + + for run in runs[1:]: + p_element = run.element.getparent() + if p_element is not None: + p_element.remove(run.element) + elif el_type == "hyperlink": hyperlink = element_info["element"] r_elements = hyperlink.findall(f'.//{qn("w:r")}') @@ -219,10 +275,12 @@ class DocxTranslator(AiTranslator): first_r = r_elements[0] for t in first_r.findall(f'.//{qn("w:t")}'): first_r.remove(t) + new_t = OxmlElement('w:t') new_t.text = final_text new_t.set(qn('xml:space'), 'preserve') first_r.append(new_t) + for other_r in r_elements[1:]: if (parent := other_r.getparent()) is not None: parent.remove(other_r) @@ -230,7 +288,8 @@ class DocxTranslator(AiTranslator): def _after_translate(self, doc: DocumentObject, elements: List[Dict[str, Any]], translated: List[str], originals: List[str]) -> bytes: if len(elements) != len(translated): - self.logger.error(f"Translation count mismatch! Originals: {len(originals)}, Translated: {len(translated)}. Processing common part only.") + self.logger.error( + f"Translation count mismatch! Originals: {len(originals)}, Translated: {len(translated)}. Processing common part only.") min_len = min(len(elements), len(translated), len(originals)) elements, translated, originals = elements[:min_len], translated[:min_len], originals[:min_len]