优化epub翻译效果

This commit is contained in:
xunbu
2025-10-09 17:21:34 +08:00
parent 40d4aa4315
commit 6fb3bae5a5

View File

@@ -8,7 +8,7 @@ from dataclasses import dataclass
from io import BytesIO from io import BytesIO
from typing import Self, Literal, List, Dict, Any from typing import Self, Literal, List, Dict, Any
from bs4 import BeautifulSoup from bs4 import BeautifulSoup, Tag
from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent
from docutranslate.ir.document import Document from docutranslate.ir.document import Document
@@ -24,7 +24,7 @@ class EpubTranslatorConfig(AiTranslatorConfig):
class EpubTranslator(AiTranslator): class EpubTranslator(AiTranslator):
""" """
一个用于翻译 EPUB 文件中内容的翻译器。 一个用于翻译 EPUB 文件中内容的翻译器。
此版本使用内置的 `zipfile` 和 `xml` 库,不依赖 `ebooklib` 【高级版】此版本直接翻译HTML内容以保留内联格式
""" """
def __init__(self, config: EpubTranslatorConfig): def __init__(self, config: EpubTranslatorConfig):
@@ -33,7 +33,7 @@ class EpubTranslator(AiTranslator):
self.translate_agent = None self.translate_agent = None
if not self.skip_translate: if not self.skip_translate:
agent_config = SegmentsTranslateAgentConfig( agent_config = SegmentsTranslateAgentConfig(
custom_prompt=config.custom_prompt, custom_prompt=config.custom_prompt, # 使用优化后的prompt
to_lang=config.to_lang, to_lang=config.to_lang,
base_url=config.base_url, base_url=config.base_url,
api_key=config.api_key, api_key=config.api_key,
@@ -54,35 +54,25 @@ class EpubTranslator(AiTranslator):
def _pre_translate(self, document: Document) -> tuple[ def _pre_translate(self, document: Document) -> tuple[
Dict[str, bytes], List[Dict[str, Any]], List[str] Dict[str, bytes], List[Dict[str, Any]], List[str]
]: ]:
"""
预处理 EPUB 文件,提取所有需要翻译的文本。
"""
all_files = {} all_files = {}
items_to_translate = [] items_to_translate = []
original_texts = [] original_texts = [] # 现在这里存储的是HTML片段
# --- 步骤 1: 使用 zipfile 读取 EPUB 内容到内存 ---
with zipfile.ZipFile(BytesIO(document.content), 'r') as zf: with zipfile.ZipFile(BytesIO(document.content), 'r') as zf:
for filename in zf.namelist(): for filename in zf.namelist():
all_files[filename] = zf.read(filename) all_files[filename] = zf.read(filename)
# --- 步骤 2: 解析元数据以找到内容文件 ---
# 2.1: 解析 container.xml 找到 .opf 文件的路径
container_xml = all_files.get('META-INF/container.xml') container_xml = all_files.get('META-INF/container.xml')
if not container_xml: if not container_xml:
raise ValueError("无效的 EPUB找不到 META-INF/container.xml") raise ValueError("无效的 EPUB找不到 META-INF/container.xml")
root = ET.fromstring(container_xml) root = ET.fromstring(container_xml)
# XML 命名空间,解析时必须使用
ns = {'cn': 'urn:oasis:names:tc:opendocument:xmlns:container'} ns = {'cn': 'urn:oasis:names:tc:opendocument:xmlns:container'}
opf_path = root.find('cn:rootfiles/cn:rootfile', ns).get('full-path') opf_path = root.find('cn:rootfiles/cn:rootfile', ns).get('full-path')
opf_dir = os.path.dirname(opf_path) opf_dir = os.path.dirname(opf_path)
# 2.2: 解析 .opf 文件找到 manifest 和 spine
opf_xml = all_files.get(opf_path) opf_xml = all_files.get(opf_path)
if not opf_xml: if not opf_xml:
raise ValueError(f"无效的 EPUB找不到 {opf_path}") raise ValueError(f"无效的 EPUB找不到 {opf_path}")
opf_root = ET.fromstring(opf_xml) opf_root = ET.fromstring(opf_xml)
ns_opf = {'opf': 'http://www.idpf.org/2007/opf'} ns_opf = {'opf': 'http://www.idpf.org/2007/opf'}
@@ -90,14 +80,11 @@ class EpubTranslator(AiTranslator):
for item in opf_root.findall('opf:manifest/opf:item', ns_opf): for item in opf_root.findall('opf:manifest/opf:item', ns_opf):
item_id = item.get('id') item_id = item.get('id')
href = item.get('href') href = item.get('href')
# 路径需要相对于 .opf 文件的位置
full_href = os.path.join(opf_dir, href).replace('\\', '/') full_href = os.path.join(opf_dir, href).replace('\\', '/')
manifest_items[item_id] = {'href': full_href, 'media_type': item.get('media-type')} manifest_items[item_id] = {'href': full_href, 'media_type': item.get('media-type')}
spine_itemrefs = [item.get('idref') for item in opf_root.findall('opf:spine/opf:itemref', ns_opf)] TAGS_TO_TRANSLATE = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'div']
# --- 步骤 3: 提取可翻译内容 ---
# 我们这里简单地翻译 manifest 中所有的 xhtml/html 文件
for item_id, item_data in manifest_items.items(): for item_id, item_data in manifest_items.items():
media_type = item_data['media_type'] media_type = item_data['media_type']
if media_type in ['application/xhtml+xml', 'text/html']: if media_type in ['application/xhtml+xml', 'text/html']:
@@ -108,20 +95,18 @@ class EpubTranslator(AiTranslator):
continue continue
soup = BeautifulSoup(content_bytes, "html.parser") soup = BeautifulSoup(content_bytes, "html.parser")
for text_node in soup.find_all(string=True): for tag in soup.find_all(TAGS_TO_TRANSLATE):
if ( # 获取标签内部的HTML而不是纯文本
text_node.parent.name not in ['style', 'script', 'head', 'title', 'meta', '[document]'] inner_html = tag.decode_contents()
and not text_node.isspace() # 只有在内部有实际内容(不仅仅是空白)时才翻译
): if inner_html and not inner_html.isspace():
text = text_node.get_text(strip=True) item_info = {
if text: "file_path": file_path,
item_info = { "tag": tag,
"file_path": file_path, "original_html": inner_html,
"text_node": text_node, }
"original_text": text, items_to_translate.append(item_info)
} original_texts.append(inner_html)
items_to_translate.append(item_info)
original_texts.append(text)
return all_files, items_to_translate, original_texts return all_files, items_to_translate, original_texts
@@ -130,57 +115,51 @@ class EpubTranslator(AiTranslator):
all_files: Dict[str, bytes], all_files: Dict[str, bytes],
items_to_translate: List[Dict[str, Any]], items_to_translate: List[Dict[str, Any]],
translated_texts: List[str], translated_texts: List[str],
original_texts: List[str], original_texts: List[str], # 这里是 original_htmls
) -> bytes: ) -> bytes:
""" modified_soups = {}
将翻译后的文本写回,并重新打包成 EPUB 文件。
"""
modified_soups = {} # 缓存每个文件的 soup 对象
for i, item_info in enumerate(items_to_translate): for i, item_info in enumerate(items_to_translate):
file_path = item_info["file_path"] file_path = item_info["file_path"]
text_node = item_info["text_node"] tag: Tag = item_info["tag"]
translated_text = translated_texts[i] translated_html = translated_texts[i]
original_text = original_texts[i] original_html = original_texts[i]
# 获取或创建该文件的 soup 对象
if file_path not in modified_soups: if file_path not in modified_soups:
# 找到该节点所属的根 soup 对象 modified_soups[file_path] = tag.find_parent('html')
modified_soups[file_path] = text_node.find_parent('html')
# [修改] 处理 insert_mode现在操作的是HTML片段
if self.insert_mode == "replace": if self.insert_mode == "replace":
new_text = translated_text final_html = translated_html
elif self.insert_mode == "append": elif self.insert_mode == "append":
new_text = original_text + self.separator + translated_text final_html = original_html + self.separator + translated_html
elif self.insert_mode == "prepend": elif self.insert_mode == "prepend":
new_text = translated_text + self.separator + original_text final_html = translated_html + self.separator + original_html
else: else:
new_text = translated_text final_html = translated_html
text_node.replace_with(new_text) # [修改] 清空旧内容并追加解析后的新HTML内容
tag.clear()
# 解析AI返回的HTML字符串'html.parser'对此有很好的容错性
new_content = BeautifulSoup(final_html, 'html.parser')
# 将解析后的所有子节点追加到原tag中
for child in new_content.contents:
tag.append(child.extract()) # extract()会从原文档树中移除节点,避免重复
# 将修改后的 soup 对象转换回字节串
for file_path, soup in modified_soups.items(): for file_path, soup in modified_soups.items():
all_files[file_path] = str(soup).encode('utf-8') all_files[file_path] = str(soup).encode('utf-8')
# --- 步骤 4: 创建新的 EPUB (ZIP) 文件 ---
output_buffer = BytesIO() output_buffer = BytesIO()
with zipfile.ZipFile(output_buffer, 'w') as zf_out: with zipfile.ZipFile(output_buffer, 'w') as zf_out:
# 关键mimetype 必须是第一个文件且不能压缩
if 'mimetype' in all_files: if 'mimetype' in all_files:
zf_out.writestr('mimetype', all_files['mimetype'], compress_type=zipfile.ZIP_STORED) zf_out.writestr('mimetype', all_files['mimetype'], compress_type=zipfile.ZIP_STORED)
# 写入其他所有文件
for filename, content in all_files.items(): for filename, content in all_files.items():
if filename != 'mimetype': if filename != 'mimetype':
zf_out.writestr(filename, content, compress_type=zipfile.ZIP_DEFLATED) zf_out.writestr(filename, content, compress_type=zipfile.ZIP_DEFLATED)
return output_buffer.getvalue() return output_buffer.getvalue()
# translate 和 translate_async 方法无需修改因为它们调用的_pre_translate和_after_translate已经被更新了
def translate(self, document: Document) -> Self: def translate(self, document: Document) -> Self:
"""
同步翻译 EPUB 文档。
"""
all_files, items_to_translate, original_texts = self._pre_translate(document) all_files, items_to_translate, original_texts = self._pre_translate(document)
if not items_to_translate: if not items_to_translate:
self.logger.info("\n文件中没有找到需要翻译的纯文本内容。") self.logger.info("\n文件中没有找到需要翻译的纯文本内容。")
@@ -199,9 +178,6 @@ class EpubTranslator(AiTranslator):
return self return self
async def translate_async(self, document: Document) -> Self: async def translate_async(self, document: Document) -> Self:
"""
异步翻译 EPUB 文档。
"""
all_files, items_to_translate, original_texts = await asyncio.to_thread( all_files, items_to_translate, original_texts = await asyncio.to_thread(
self._pre_translate, document self._pre_translate, document
) )
@@ -222,4 +198,4 @@ class EpubTranslator(AiTranslator):
document.content = await asyncio.to_thread( document.content = await asyncio.to_thread(
self._after_translate, all_files, items_to_translate, translated_texts, original_texts self._after_translate, all_files, items_to_translate, translated_texts, original_texts
) )
return self return self