修复epub分隔符不能换行的问题

This commit is contained in:
xunbu
2025-10-18 00:06:24 +08:00
parent f611eb8501
commit 4805577338

View File

@@ -6,7 +6,7 @@ import xml.etree.ElementTree as ET
import zipfile import zipfile
from dataclasses import dataclass from dataclasses import dataclass
from io import BytesIO from io import BytesIO
from typing import Self, Literal, List, Dict, Any from typing import Self, Literal, List, Dict, Any, Tuple
from bs4 import BeautifulSoup, Tag from bs4 import BeautifulSoup, Tag
@@ -18,7 +18,8 @@ from docutranslate.translator.ai_translator.base import AiTranslatorConfig, AiTr
@dataclass @dataclass
class EpubTranslatorConfig(AiTranslatorConfig): class EpubTranslatorConfig(AiTranslatorConfig):
insert_mode: Literal["replace", "append", "prepend"] = "replace" insert_mode: Literal["replace", "append", "prepend"] = "replace"
separator: str = "\n" # 建议使用 <br />,它在 XHTML (EPUB标准) 中更规范
separator: str = "<br />"
class EpubTranslator(AiTranslator): class EpubTranslator(AiTranslator):
@@ -33,7 +34,7 @@ class EpubTranslator(AiTranslator):
self.translate_agent = None self.translate_agent = None
if not self.skip_translate: if not self.skip_translate:
agent_config = SegmentsTranslateAgentConfig( agent_config = SegmentsTranslateAgentConfig(
custom_prompt=config.custom_prompt, # 使用优化后的prompt custom_prompt=config.custom_prompt,
to_lang=config.to_lang, to_lang=config.to_lang,
base_url=config.base_url, base_url=config.base_url,
api_key=config.api_key, api_key=config.api_key,
@@ -52,11 +53,15 @@ class EpubTranslator(AiTranslator):
self.separator = config.separator self.separator = config.separator
def _pre_translate(self, document: Document) -> tuple[ def _pre_translate(self, document: Document) -> tuple[
Dict[str, bytes], List[Dict[str, Any]], List[str] Dict[str, bytes], # all_files: 原始文件内容
Dict[str, BeautifulSoup], # soups: 解析后的HTML对象
List[Dict[str, Any]], # items_to_translate: 待翻译项
List[str] # original_texts: 原始HTML片段
]: ]:
all_files = {} all_files = {}
soups = {} # << [关键修改] 存储解析后的BS对象
items_to_translate = [] items_to_translate = []
original_texts = [] # 现在这里存储的是HTML片段 original_texts = []
with zipfile.ZipFile(BytesIO(document.content), 'r') as zf: with zipfile.ZipFile(BytesIO(document.content), 'r') as zf:
for filename in zf.namelist(): for filename in zf.namelist():
@@ -94,41 +99,38 @@ class EpubTranslator(AiTranslator):
self.logger.warning(f"在 EPUB 中找不到文件: {file_path}") self.logger.warning(f"在 EPUB 中找不到文件: {file_path}")
continue continue
soup = BeautifulSoup(content_bytes, "html.parser") # << [关键修改] 解析一次并存储
if file_path not in soups:
soups[file_path] = BeautifulSoup(content_bytes, "html.parser")
soup = soups[file_path]
for tag in soup.find_all(TAGS_TO_TRANSLATE): for tag in soup.find_all(TAGS_TO_TRANSLATE):
# 获取标签内部的HTML而不是纯文本
inner_html = tag.decode_contents() inner_html = tag.decode_contents()
# 只有在内部有实际内容(不仅仅是空白)时才翻译
if inner_html and not inner_html.isspace(): if inner_html and not inner_html.isspace():
item_info = { item_info = {
"file_path": file_path, "file_path": file_path,
"tag": tag, "tag": tag, # 这个tag是soups[file_path]中的活引用
"original_html": inner_html, "original_html": inner_html,
} }
items_to_translate.append(item_info) items_to_translate.append(item_info)
original_texts.append(inner_html) original_texts.append(inner_html)
return all_files, items_to_translate, original_texts return all_files, soups, items_to_translate, original_texts
def _after_translate( def _after_translate(
self, self,
all_files: Dict[str, bytes], all_files: Dict[str, bytes],
soups: Dict[str, BeautifulSoup], # << [关键修改] 接收解析好的BS对象
items_to_translate: List[Dict[str, Any]], items_to_translate: List[Dict[str, Any]],
translated_texts: List[str], translated_texts: List[str],
original_texts: List[str], # 这里是 original_htmls original_texts: List[str],
) -> bytes: ) -> bytes:
modified_soups = {}
for i, item_info in enumerate(items_to_translate): for i, item_info in enumerate(items_to_translate):
file_path = item_info["file_path"] # << [关键修改] 直接使用 item_info 中的活引用 tag，它属于 soups 字典中的一个对象
tag: Tag = item_info["tag"] tag: Tag = item_info["tag"]
translated_html = translated_texts[i] translated_html = translated_texts[i]
original_html = original_texts[i] original_html = original_texts[i]
if file_path not in modified_soups:
modified_soups[file_path] = tag.find_parent('html')
# [修改] 处理 insert_mode，现在操作的是HTML片段
if self.insert_mode == "replace": if self.insert_mode == "replace":
final_html = translated_html final_html = translated_html
elif self.insert_mode == "append": elif self.insert_mode == "append":
@@ -138,15 +140,24 @@ class EpubTranslator(AiTranslator):
else: else:
final_html = translated_html final_html = translated_html
# [修改] 清空旧内容并追加解析后的新HTML内容 # 清空旧内容
tag.clear() tag.clear()
# 解析AI返回的HTML字符串，'html.parser'对此有很好的容错性
new_content = BeautifulSoup(final_html, 'html.parser')
# 将解析后的所有子节点追加到原tag中
for child in new_content.contents:
tag.append(child.extract()) # extract()会从原文档树中移除节点,避免重复
for file_path, soup in modified_soups.items(): # 解析新的HTML片段
new_content_soup = BeautifulSoup(final_html, 'html.parser')
# << [关键修复] 将新片段的*内容*(而不是整个文档)移动到原始标签中
# 使用 list() 创建副本以安全地迭代和修改
if new_content_soup.body:
nodes_to_insert = list(new_content_soup.body.children)
else:
nodes_to_insert = list(new_content_soup.children)
for node in nodes_to_insert:
tag.append(node.extract()) # .extract() 从旧树中移除并返回节点
# << [关键修改] 从修改后的soups对象生成新的文件内容
for file_path, soup in soups.items():
all_files[file_path] = str(soup).encode('utf-8') all_files[file_path] = str(soup).encode('utf-8')
output_buffer = BytesIO() output_buffer = BytesIO()
@@ -158,11 +169,10 @@ class EpubTranslator(AiTranslator):
zf_out.writestr(filename, content, compress_type=zipfile.ZIP_DEFLATED) zf_out.writestr(filename, content, compress_type=zipfile.ZIP_DEFLATED)
return output_buffer.getvalue() return output_buffer.getvalue()
# translate 和 translate_async 方法无需修改，因为它们调用的_pre_translate和_after_translate已经被更新了
def translate(self, document: Document) -> Self: def translate(self, document: Document) -> Self:
all_files, items_to_translate, original_texts = self._pre_translate(document) all_files, soups, items_to_translate, original_texts = self._pre_translate(document)
if not items_to_translate: if not items_to_translate:
self.logger.info("\n文件中没有找到需要翻译的纯文本内容。") self.logger.info("\n文件中没有找到需要翻译的内容。")
return self return self
if self.glossary_agent: if self.glossary_agent:
self.glossary_dict_gen = self.glossary_agent.send_segments(original_texts, self.chunk_size) self.glossary_dict_gen = self.glossary_agent.send_segments(original_texts, self.chunk_size)
@@ -172,17 +182,18 @@ class EpubTranslator(AiTranslator):
translated_texts = self.translate_agent.send_segments(original_texts, self.chunk_size) translated_texts = self.translate_agent.send_segments(original_texts, self.chunk_size)
else: else:
translated_texts = original_texts translated_texts = original_texts
# << [关键修改] 传递 soups 对象
document.content = self._after_translate( document.content = self._after_translate(
all_files, items_to_translate, translated_texts, original_texts all_files, soups, items_to_translate, translated_texts, original_texts
) )
return self return self
async def translate_async(self, document: Document) -> Self: async def translate_async(self, document: Document) -> Self:
all_files, items_to_translate, original_texts = await asyncio.to_thread( all_files, soups, items_to_translate, original_texts = await asyncio.to_thread(
self._pre_translate, document self._pre_translate, document
) )
if not items_to_translate: if not items_to_translate:
self.logger.info("\n文件中没有找到需要翻译的纯文本内容。") self.logger.info("\n文件中没有找到需要翻译的内容。")
return self return self
if self.glossary_agent: if self.glossary_agent:
@@ -195,7 +206,8 @@ class EpubTranslator(AiTranslator):
) )
else: else:
translated_texts = original_texts translated_texts = original_texts
# << [关键修改] 传递 soups 对象
document.content = await asyncio.to_thread( document.content = await asyncio.to_thread(
self._after_translate, all_files, items_to_translate, translated_texts, original_texts self._after_translate, all_files, soups, items_to_translate, translated_texts, original_texts
) )
return self return self