修复html分隔符不能换行的问题

This commit is contained in:
xunbu
2025-10-18 15:15:03 +08:00
parent c95d6316a7
commit ec41df5226
3 changed files with 121 additions and 93 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -150,6 +150,7 @@ class HtmlTranslator(AiTranslator):
translated_texts: list[str], original_texts: list[str]) -> bytes:
"""
将翻译后的文本写回到BeautifulSoup对象中对应的节点或属性并返回最终的HTML字节流。
【版本 3.0: 修正了对HTML分隔符的支持】
"""
if len(translatable_items) != len(translated_texts):
self.logger.error("翻译前后的文本片段数量不匹配 (%d vs %d),跳过写入操作以防损坏文件。",
@@ -160,36 +161,59 @@ class HtmlTranslator(AiTranslator):
translated_text = translated_texts[i]
original_text = original_texts[i]
new_content = ""
if self.insert_mode == "replace":
if item['type'] == 'node':
# 对于文本节点,保留原文前后的空白字符,这对维持内联元素的间距至关重要。
leading_space = original_text[:len(original_text) - len(original_text.lstrip())]
trailing_space = original_text[len(original_text.rstrip()):]
new_content = leading_space + translated_text + trailing_space
else: # 属性
new_content = translated_text
elif self.insert_mode == "append":
new_content = original_text + self.separator + translated_text
elif self.insert_mode == "prepend":
new_content = translated_text + self.separator + original_text
else:
self.logger.error(f"不正确的HtmlTranslatorConfig参数: insert_mode='{self.insert_mode}'")
new_content = original_text # 出错时恢复原文
# 根据类型将内容写回
if item['type'] == 'node':
node = item['object']
# 检查节点是否仍然在解析树中,以防在处理过程中被移动或删除
if node.parent:
node.replace_with(NavigableString(new_content))
if not node.parent: # 确保节点仍然在树中
continue
# --- 构造包含HTML的新内容字符串 ---
new_content_str = ""
if self.insert_mode == "replace":
leading_space = original_text[:len(original_text) - len(original_text.lstrip())]
trailing_space = original_text[len(original_text.rstrip()):]
new_content_str = leading_space + translated_text + trailing_space
elif self.insert_mode == "append":
new_content_str = original_text + self.separator + translated_text
elif self.insert_mode == "prepend":
new_content_str = translated_text + self.separator + original_text
else:
self.logger.error(f"不正确的HtmlTranslatorConfig参数: insert_mode='{self.insert_mode}'")
new_content_str = original_text
# --- 核心修改正确地将HTML字符串片段插入DOM ---
# 1. 使用一个临时的父标签(如此处的'div'来解析HTML片段
# 这是在BeautifulSoup中处理片段的标准做法避免了自动添加<html><body>。
temp_soup = BeautifulSoup(f"<div>{new_content_str}</div>", 'html.parser')
new_elements = temp_soup.div.contents
# 2. 将解析出的新元素(可能是文本节点和<br>标签的混合)
# 以相反的顺序插入到原始节点之后。这样做可以保持它们的原始顺序。
for element in reversed(new_elements):
node.insert_after(element)
# 3. 移除原始的文本节点
node.decompose()
elif item['type'] == 'attribute':
# --- 属性逻辑保持不变因为属性值不支持HTML ---
tag = item['tag']
attr = item['attribute']
tag[attr] = new_content
new_attr_value = ""
# 在属性值中,<br>将被视为普通文本,这是正确的行为
separator_for_attr = self.separator.replace('<br>', ' ').replace('<br/>', ' ')
if self.insert_mode == "replace":
new_attr_value = translated_text
elif self.insert_mode == "append":
new_attr_value = original_text + separator_for_attr + translated_text
elif self.insert_mode == "prepend":
new_attr_value = translated_text + separator_for_attr + original_text
else:
new_attr_value = original_text
tag[attr] = new_attr_value.strip()
# 将修改后的BeautifulSoup对象编码为utf-8字节流
return soup.encode('utf-8')
def translate(self, document: Document) -> Self: