优化docx、提示词
This commit is contained in:
@@ -319,7 +319,7 @@ class Agent:
|
|||||||
should_retry = True
|
should_retry = True
|
||||||
# 专门捕获部分翻译错误(软错误)
|
# 专门捕获部分翻译错误(软错误)
|
||||||
except PartialAgentResultError as e:
|
except PartialAgentResultError as e:
|
||||||
# print(f"【测试】\nprompt:\n{prompt}\nresp:\n{result}")
|
print(f"【测试】\nprompt:\n{prompt}\nresp:\n{result}")
|
||||||
self.logger.error(f"收到部分返回结果,将尝试重试: {e}")
|
self.logger.error(f"收到部分返回结果,将尝试重试: {e}")
|
||||||
current_partial_result = e.partial_result
|
current_partial_result = e.partial_result
|
||||||
should_retry = True
|
should_retry = True
|
||||||
|
|||||||
@@ -52,6 +52,7 @@ class SegmentsTranslateAgent(Agent):
|
|||||||
"<segment ID>": "<translated text>"
|
"<segment ID>": "<translated text>"
|
||||||
}}
|
}}
|
||||||
- The response must be a **valid** JSON object
|
- The response must be a **valid** JSON object
|
||||||
|
- Escape the double quotes within the JSON string.
|
||||||
- (very important) The segment IDs in the output must exactly match those in the input. And all segment IDs in input must appear in the output.
|
- (very important) The segment IDs in the output must exactly match those in the input. And all segment IDs in input must appear in the output.
|
||||||
|
|
||||||
# Example (assuming the target language in this example is English, {config.to_lang} is the actual target language)
|
# Example (assuming the target language in this example is English, {config.to_lang} is the actual target language)
|
||||||
|
|||||||
@@ -55,9 +55,29 @@ class DocxTranslator(AiTranslator):
|
|||||||
"""
|
"""
|
||||||
用于翻译 .docx 文件的高级翻译器,能够高精度保留样式、处理超链接、
|
用于翻译 .docx 文件的高级翻译器,能够高精度保留样式、处理超链接、
|
||||||
域代码(如图注),并支持翻译脚注、尾注等。
|
域代码(如图注),并支持翻译脚注、尾注等。
|
||||||
[v3.1 - 采纳新规则:将带格式的空文本视为分割点]
|
[v3.6 - 引入递归解析以处理嵌套内容标签,如 smartTag]
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# 包含所有应被解析器完全忽略的、不影响文本内容的元数据标签
|
||||||
|
IGNORED_TAGS = {
|
||||||
|
qn('w:proofErr'), # 拼写和语法错误标记
|
||||||
|
qn('w:lastRenderedPageBreak'), # 上次渲染的分页符位置
|
||||||
|
qn('w:bookmarkStart'), # 书签开始
|
||||||
|
qn('w:bookmarkEnd'), # 书签结束
|
||||||
|
qn('w:commentRangeStart'), # 批注范围开始
|
||||||
|
qn('w:commentRangeEnd'), # 批注范围结束
|
||||||
|
qn('w:del'), # 修订:删除
|
||||||
|
qn('w:ins'), # 修订:插入
|
||||||
|
qn('w:moveFrom'), # 修订:移动源
|
||||||
|
qn('w:moveTo'), # 修订:移动目标
|
||||||
|
}
|
||||||
|
|
||||||
|
# 包含应递归处理其内部内容的容器标签
|
||||||
|
RECURSIVE_CONTAINER_TAGS = {
|
||||||
|
qn('w:smartTag'), # 智能标记 (包含文本)
|
||||||
|
qn('w:sdtContent'), # 结构化文档标签内容 (包含文本)
|
||||||
|
}
|
||||||
|
|
||||||
def __init__(self, config: DocxTranslatorConfig):
|
def __init__(self, config: DocxTranslatorConfig):
|
||||||
super().__init__(config=config)
|
super().__init__(config=config)
|
||||||
self.chunk_size = config.chunk_size
|
self.chunk_size = config.chunk_size
|
||||||
@@ -104,16 +124,13 @@ class DocxTranslator(AiTranslator):
|
|||||||
pass
|
pass
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def _process_paragraph(self, para: Paragraph, elements: List[Dict[str, Any]], texts: List[str]):
|
def _process_element_children(self, element, elements: List[Dict[str, Any]], texts: List[str],
|
||||||
|
state: Dict[str, Any]):
|
||||||
"""
|
"""
|
||||||
使用状态机处理段落,将连续的文本Run合并为一个翻译单元,
|
[新函数] 递归处理任何给定XML元素的子节点。
|
||||||
同时将图片、超链接、带格式的空文本等视为分割点。
|
'state' 字典用于跨递归调用传递状态,如 current_runs 和 is_inside_field。
|
||||||
"""
|
"""
|
||||||
if not para.text.strip():
|
current_runs = state['current_runs']
|
||||||
return
|
|
||||||
|
|
||||||
current_runs = []
|
|
||||||
is_inside_field = False
|
|
||||||
|
|
||||||
def flush_segment():
|
def flush_segment():
|
||||||
nonlocal current_runs
|
nonlocal current_runs
|
||||||
@@ -124,43 +141,73 @@ class DocxTranslator(AiTranslator):
|
|||||||
elements.append({"type": "text_runs", "runs": current_runs})
|
elements.append({"type": "text_runs", "runs": current_runs})
|
||||||
texts.append(full_text)
|
texts.append(full_text)
|
||||||
current_runs = []
|
current_runs = []
|
||||||
|
state['current_runs'] = current_runs
|
||||||
|
|
||||||
for child in para._p:
|
for child in element:
|
||||||
if self._is_seq_field(child):
|
if child.tag in self.IGNORED_TAGS:
|
||||||
flush_segment()
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if child.tag == qn('w:fldChar'):
|
if child.tag in self.RECURSIVE_CONTAINER_TAGS:
|
||||||
fld_type = child.get(qn('w:fldCharType'))
|
self._process_element_children(child, elements, texts, state)
|
||||||
if fld_type == 'begin':
|
|
||||||
flush_segment()
|
|
||||||
is_inside_field = True
|
|
||||||
elif fld_type == 'end':
|
|
||||||
is_inside_field = False
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if is_inside_field:
|
if child.tag == qn('w:hyperlink'):
|
||||||
continue
|
|
||||||
|
|
||||||
if isinstance(child, CT_R):
|
|
||||||
run = Run(child, para)
|
|
||||||
# 如果是图片、制表符,或者【带格式的空文本】,则视为分割点
|
|
||||||
if is_image_run(run) or run.element.find(qn('w:tab')) is not None or is_formatting_only_run(run):
|
|
||||||
flush_segment()
|
|
||||||
# 这个 run 本身被保留,不参与翻译
|
|
||||||
else:
|
|
||||||
# 否则,它是一个普通的文本 Run,收集起来
|
|
||||||
current_runs.append(run)
|
|
||||||
elif child.tag == qn('w:hyperlink'):
|
|
||||||
flush_segment()
|
flush_segment()
|
||||||
hyperlink_text = self._extract_hyperlink_text(child)
|
hyperlink_text = self._extract_hyperlink_text(child)
|
||||||
if hyperlink_text.strip():
|
if hyperlink_text.strip():
|
||||||
elements.append({"type": "hyperlink", "element": child})
|
elements.append({"type": "hyperlink", "element": child})
|
||||||
texts.append(hyperlink_text)
|
texts.append(hyperlink_text)
|
||||||
|
continue
|
||||||
|
|
||||||
|
field_char_element = None
|
||||||
|
if child.tag == qn('w:fldChar'):
|
||||||
|
field_char_element = child
|
||||||
|
elif isinstance(child, CT_R):
|
||||||
|
field_char_element = child.find(qn('w:fldChar'))
|
||||||
|
|
||||||
|
if field_char_element is not None:
|
||||||
|
flush_segment()
|
||||||
|
fld_type = field_char_element.get(qn('w:fldCharType'))
|
||||||
|
if fld_type == 'begin':
|
||||||
|
state['is_inside_field'] = True
|
||||||
|
elif fld_type == 'end':
|
||||||
|
state['is_inside_field'] = False
|
||||||
|
continue
|
||||||
|
|
||||||
|
if state.get('is_inside_field', False):
|
||||||
|
continue
|
||||||
|
|
||||||
|
if self._is_seq_field(child):
|
||||||
|
flush_segment()
|
||||||
|
continue
|
||||||
|
|
||||||
|
if isinstance(child, CT_R):
|
||||||
|
run = Run(child, None)
|
||||||
|
if is_image_run(run) or run.element.find(qn('w:tab')) is not None or is_formatting_only_run(run):
|
||||||
|
flush_segment()
|
||||||
|
else:
|
||||||
|
current_runs.append(run)
|
||||||
else:
|
else:
|
||||||
flush_segment()
|
flush_segment()
|
||||||
|
|
||||||
flush_segment()
|
state['current_runs'] = current_runs
|
||||||
|
|
||||||
|
def _process_paragraph(self, para: Paragraph, elements: List[Dict[str, Any]], texts: List[str]):
|
||||||
|
"""
|
||||||
|
[重构] 作为递归处理器的入口点,初始化状态并调用递归函数。
|
||||||
|
"""
|
||||||
|
if not para.text.strip():
|
||||||
|
return
|
||||||
|
|
||||||
|
state = {'current_runs': [], 'is_inside_field': False}
|
||||||
|
self._process_element_children(para._p, elements, texts, state)
|
||||||
|
|
||||||
|
current_runs = state['current_runs']
|
||||||
|
if current_runs:
|
||||||
|
full_text = "".join(r.text for r in current_runs)
|
||||||
|
if full_text.strip():
|
||||||
|
elements.append({"type": "text_runs", "runs": current_runs})
|
||||||
|
texts.append(full_text)
|
||||||
|
|
||||||
def _process_container(self, container, elements: List[Dict[str, Any]], texts: List[str]):
|
def _process_container(self, container, elements: List[Dict[str, Any]], texts: List[str]):
|
||||||
"""递归处理包含段落和表格的容器(如文档、单元格、页眉)。"""
|
"""递归处理包含段落和表格的容器(如文档、单元格、页眉)。"""
|
||||||
@@ -204,14 +251,23 @@ class DocxTranslator(AiTranslator):
|
|||||||
return doc, elements, texts
|
return doc, elements, texts
|
||||||
|
|
||||||
def _apply_translation(self, element_info: Dict[str, Any], final_text: str):
|
def _apply_translation(self, element_info: Dict[str, Any], final_text: str):
|
||||||
"""将翻译后的文本写回对应的 OXML 元素。对于多Run的文本段,写入第一个Run并清空其余。"""
|
"""
|
||||||
|
将翻译后的文本写回对应的 OXML 元素。
|
||||||
|
对于多Run的文本段,写入第一个Run并【删除】其余,以避免产生方框占位符。
|
||||||
|
"""
|
||||||
el_type = element_info["type"]
|
el_type = element_info["type"]
|
||||||
if el_type == "text_runs":
|
if el_type == "text_runs":
|
||||||
runs = element_info["runs"]
|
runs = element_info["runs"]
|
||||||
if runs:
|
if not runs:
|
||||||
runs[0].text = final_text
|
return
|
||||||
for run in runs[1:]:
|
|
||||||
run.text = ""
|
runs[0].text = final_text
|
||||||
|
|
||||||
|
for run in runs[1:]:
|
||||||
|
p_element = run.element.getparent()
|
||||||
|
if p_element is not None:
|
||||||
|
p_element.remove(run.element)
|
||||||
|
|
||||||
elif el_type == "hyperlink":
|
elif el_type == "hyperlink":
|
||||||
hyperlink = element_info["element"]
|
hyperlink = element_info["element"]
|
||||||
r_elements = hyperlink.findall(f'.//{qn("w:r")}')
|
r_elements = hyperlink.findall(f'.//{qn("w:r")}')
|
||||||
@@ -219,10 +275,12 @@ class DocxTranslator(AiTranslator):
|
|||||||
first_r = r_elements[0]
|
first_r = r_elements[0]
|
||||||
for t in first_r.findall(f'.//{qn("w:t")}'):
|
for t in first_r.findall(f'.//{qn("w:t")}'):
|
||||||
first_r.remove(t)
|
first_r.remove(t)
|
||||||
|
|
||||||
new_t = OxmlElement('w:t')
|
new_t = OxmlElement('w:t')
|
||||||
new_t.text = final_text
|
new_t.text = final_text
|
||||||
new_t.set(qn('xml:space'), 'preserve')
|
new_t.set(qn('xml:space'), 'preserve')
|
||||||
first_r.append(new_t)
|
first_r.append(new_t)
|
||||||
|
|
||||||
for other_r in r_elements[1:]:
|
for other_r in r_elements[1:]:
|
||||||
if (parent := other_r.getparent()) is not None:
|
if (parent := other_r.getparent()) is not None:
|
||||||
parent.remove(other_r)
|
parent.remove(other_r)
|
||||||
@@ -230,7 +288,8 @@ class DocxTranslator(AiTranslator):
|
|||||||
def _after_translate(self, doc: DocumentObject, elements: List[Dict[str, Any]], translated: List[str],
|
def _after_translate(self, doc: DocumentObject, elements: List[Dict[str, Any]], translated: List[str],
|
||||||
originals: List[str]) -> bytes:
|
originals: List[str]) -> bytes:
|
||||||
if len(elements) != len(translated):
|
if len(elements) != len(translated):
|
||||||
self.logger.error(f"Translation count mismatch! Originals: {len(originals)}, Translated: {len(translated)}. Processing common part only.")
|
self.logger.error(
|
||||||
|
f"Translation count mismatch! Originals: {len(originals)}, Translated: {len(translated)}. Processing common part only.")
|
||||||
min_len = min(len(elements), len(translated), len(originals))
|
min_len = min(len(elements), len(translated), len(originals))
|
||||||
elements, translated, originals = elements[:min_len], translated[:min_len], originals[:min_len]
|
elements, translated, originals = elements[:min_len], translated[:min_len], originals[:min_len]
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user