epub按段落翻译

This commit is contained in:
xunbu
2025-10-20 11:12:42 +08:00
parent 9f9a6837c3
commit ff4a88c1fb

View File

@@ -2,8 +2,10 @@
# SPDX-License-Identifier: MPL-2.0 # SPDX-License-Identifier: MPL-2.0
import asyncio import asyncio
import os import os
import re
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
import zipfile import zipfile
from collections import defaultdict
from dataclasses import dataclass from dataclasses import dataclass
from io import BytesIO from io import BytesIO
from typing import Self, Literal, List, Dict, Any, Tuple from typing import Self, Literal, List, Dict, Any, Tuple
@@ -88,6 +90,7 @@ class EpubTranslator(AiTranslator):
full_href = os.path.join(opf_dir, href).replace('\\', '/') full_href = os.path.join(opf_dir, href).replace('\\', '/')
manifest_items[item_id] = {'href': full_href, 'media_type': item.get('media-type')} manifest_items[item_id] = {'href': full_href, 'media_type': item.get('media-type')}
# TAGS_TO_TRANSLATE 定义了哪些块级标签的内容需要被翻译
TAGS_TO_TRANSLATE = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'div'] TAGS_TO_TRANSLATE = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'div']
for item_id, item_data in manifest_items.items(): for item_id, item_data in manifest_items.items():
@@ -99,21 +102,35 @@ class EpubTranslator(AiTranslator):
self.logger.warning(f"在 EPUB 中找不到文件: {file_path}") self.logger.warning(f"在 EPUB 中找不到文件: {file_path}")
continue continue
# << [关键修改] 解析一次并存储
if file_path not in soups: if file_path not in soups:
soups[file_path] = BeautifulSoup(content_bytes, "html.parser") soups[file_path] = BeautifulSoup(content_bytes, "html.parser")
soup = soups[file_path] soup = soups[file_path]
for tag in soup.find_all(TAGS_TO_TRANSLATE): for tag in soup.find_all(TAGS_TO_TRANSLATE):
inner_html = tag.decode_contents() inner_html = tag.decode_contents()
if inner_html and not inner_html.isspace(): if not inner_html or inner_html.isspace():
continue
# 使用正则表达式按 <br> 标签分割内容,同时保留 <br> 标签本身
html_parts = re.split(r'(<br\s*/?>)', inner_html, flags=re.IGNORECASE)
is_split = len(html_parts) > 1
for part in html_parts:
part_stripped = part.strip()
# 判断当前部分是否是 <br> 标签
is_br_tag = re.fullmatch(r'<br\s*/?>', part_stripped, flags=re.IGNORECASE)
# 我们只翻译那些不是 <br> 标签且有实际内容的片段
if not is_br_tag and part_stripped:
item_info = { item_info = {
"file_path": file_path, "file_path": file_path,
"tag": tag, # 这个tag是soups[file_path]中的活引用 "tag": tag, # 父标签的引用
"original_html": inner_html, "original_html": part, # 这部分是需要翻译的原文
"original_full_html": inner_html if is_split else None # 仅在分割时保存完整原文
} }
items_to_translate.append(item_info) items_to_translate.append(item_info)
original_texts.append(inner_html) original_texts.append(part)
return all_files, soups, items_to_translate, original_texts return all_files, soups, items_to_translate, original_texts
@@ -125,38 +142,61 @@ class EpubTranslator(AiTranslator):
translated_texts: List[str], translated_texts: List[str],
original_texts: List[str], original_texts: List[str],
) -> bytes: ) -> bytes:
# 由于一个父标签可能被<br>分割成多个翻译块,我们需要重构替换逻辑
# 按父标签通过其对象id对所有翻译块进行分组
tag_reconstruction_map = defaultdict(lambda: {'new_html': None, 'chunks': []})
# 1. 初始化每个父标签的重建信息
for i, item_info in enumerate(items_to_translate): for i, item_info in enumerate(items_to_translate):
# << [关键修改] 直接使用 item_info 中的活引用 tag它属于 soups 字典中的一个对象 tag = item_info["tag"]
tag: Tag = item_info["tag"] tag_id = id(tag)
translated_html = translated_texts[i] if tag_reconstruction_map[tag_id]['new_html'] is None:
original_html = original_texts[i] # 如果有分割,使用保存的完整原文;否则,使用当前块的原文(因为就这一个块)
original_full_html = item_info.get("original_full_html") or item_info["original_html"]
tag_reconstruction_map[tag_id]['new_html'] = original_full_html
tag_reconstruction_map[tag_id]['tag_obj'] = tag
# 2. 为每个父标签准备好所有原始块和翻译块的对应关系
for i, item_info in enumerate(items_to_translate):
tag = item_info["tag"]
tag_id = id(tag)
original_chunk = original_texts[i]
translated_chunk = translated_texts[i]
if self.insert_mode == "replace": if self.insert_mode == "replace":
final_html = translated_html final_chunk = translated_chunk
elif self.insert_mode == "append": elif self.insert_mode == "append":
final_html = original_html + self.separator + translated_html final_chunk = original_chunk + self.separator + translated_chunk
elif self.insert_mode == "prepend": elif self.insert_mode == "prepend":
final_html = translated_html + self.separator + original_html final_chunk = translated_chunk + self.separator + original_chunk
else: else:
final_html = translated_html final_chunk = translated_chunk
# 清空旧内容 tag_reconstruction_map[tag_id]['chunks'].append({'original': original_chunk, 'final': final_chunk})
# 3. 对每个父标签,用其所有的翻译块重建完整内容
for tag_id, data in tag_reconstruction_map.items():
tag: Tag = data['tag_obj']
reconstructed_html = data['new_html']
for chunk_info in data['chunks']:
# 使用replace函数进行替换。为避免错误替换处理原始文本中的特殊字符
# 这个方法在原始文本块在父标签中重复出现时可能出错,但对于大多数情况是有效的
reconstructed_html = reconstructed_html.replace(chunk_info['original'], chunk_info['final'], 1)
# 4. 更新父标签的内容
tag.clear() tag.clear()
new_content_soup = BeautifulSoup(reconstructed_html, 'html.parser')
# 解析新的HTML片段
new_content_soup = BeautifulSoup(final_html, 'html.parser')
# << [关键修复] 将新片段的*内容*(而不是整个文档)移动到原始标签中
# 使用 list() 创建副本以安全地迭代和修改
if new_content_soup.body: if new_content_soup.body:
nodes_to_insert = list(new_content_soup.body.children) nodes_to_insert = list(new_content_soup.body.children)
else: else:
nodes_to_insert = list(new_content_soup.children) nodes_to_insert = list(new_content_soup.children)
for node in nodes_to_insert: for node in nodes_to_insert:
tag.append(node.extract()) # .extract() 从旧树中移除并返回节点 tag.append(node.extract())
# << [关键修改] 从修改后的soups对象生成新的文件内容 # 从修改后的soups对象生成新的文件内容
for file_path, soup in soups.items(): for file_path, soup in soups.items():
all_files[file_path] = str(soup).encode('utf-8') all_files[file_path] = str(soup).encode('utf-8')
@@ -182,7 +222,6 @@ class EpubTranslator(AiTranslator):
translated_texts = self.translate_agent.send_segments(original_texts, self.chunk_size) translated_texts = self.translate_agent.send_segments(original_texts, self.chunk_size)
else: else:
translated_texts = original_texts translated_texts = original_texts
# << [关键修改] 传递 soups 对象
document.content = self._after_translate( document.content = self._after_translate(
all_files, soups, items_to_translate, translated_texts, original_texts all_files, soups, items_to_translate, translated_texts, original_texts
) )
@@ -206,7 +245,6 @@ class EpubTranslator(AiTranslator):
) )
else: else:
translated_texts = original_texts translated_texts = original_texts
# << [关键修改] 传递 soups 对象
document.content = await asyncio.to_thread( document.content = await asyncio.to_thread(
self._after_translate, all_files, soups, items_to_translate, translated_texts, original_texts self._after_translate, all_files, soups, items_to_translate, translated_texts, original_texts
) )