diff --git a/docutranslate/translator/ai_translator/epub_translator.py b/docutranslate/translator/ai_translator/epub_translator.py
index cdbf056..ad6c0c6 100644
--- a/docutranslate/translator/ai_translator/epub_translator.py
+++ b/docutranslate/translator/ai_translator/epub_translator.py
@@ -2,8 +2,10 @@
# SPDX-License-Identifier: MPL-2.0
import asyncio
import os
+import re
import xml.etree.ElementTree as ET
import zipfile
+from collections import defaultdict
from dataclasses import dataclass
from io import BytesIO
from typing import Self, Literal, List, Dict, Any, Tuple
@@ -88,6 +90,7 @@ class EpubTranslator(AiTranslator):
full_href = os.path.join(opf_dir, href).replace('\\', '/')
manifest_items[item_id] = {'href': full_href, 'media_type': item.get('media-type')}
+ # TAGS_TO_TRANSLATE 定义了哪些块级标签的内容需要被翻译
TAGS_TO_TRANSLATE = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'div']
for item_id, item_data in manifest_items.items():
@@ -99,21 +102,35 @@ class EpubTranslator(AiTranslator):
self.logger.warning(f"在 EPUB 中找不到文件: {file_path}")
continue
- # << [关键修改] 解析一次并存储
if file_path not in soups:
soups[file_path] = BeautifulSoup(content_bytes, "html.parser")
soup = soups[file_path]
for tag in soup.find_all(TAGS_TO_TRANSLATE):
inner_html = tag.decode_contents()
- if inner_html and not inner_html.isspace():
- item_info = {
- "file_path": file_path,
- "tag": tag, # 这个tag是soups[file_path]中的活引用
- "original_html": inner_html,
- }
- items_to_translate.append(item_info)
- original_texts.append(inner_html)
+ if not inner_html or inner_html.isspace():
+ continue
+
+ # 使用正则表达式按 <br> 标签分割内容,同时保留 <br> 标签本身
+ html_parts = re.split(r'(
)', inner_html, flags=re.IGNORECASE)
+
+ is_split = len(html_parts) > 1
+
+ for part in html_parts:
+ part_stripped = part.strip()
+ # 判断当前部分是否是 <br> 标签
+ is_br_tag = re.fullmatch(r'
', part_stripped, flags=re.IGNORECASE)
+
+ # 我们只翻译那些不是 <br> 标签且有实际内容的片段
+ if not is_br_tag and part_stripped:
+ item_info = {
+ "file_path": file_path,
+ "tag": tag, # 父标签的引用
+ "original_html": part, # 这部分是需要翻译的原文
+ "original_full_html": inner_html if is_split else None # 仅在分割时保存完整原文
+ }
+ items_to_translate.append(item_info)
+ original_texts.append(part)
return all_files, soups, items_to_translate, original_texts
@@ -125,38 +142,61 @@ class EpubTranslator(AiTranslator):
translated_texts: List[str],
original_texts: List[str],
) -> bytes:
+ # 由于一个父标签可能被 <br> 分割成多个翻译块,我们需要重构替换逻辑
+ # 按父标签(通过其对象id)对所有翻译块进行分组
+ tag_reconstruction_map = defaultdict(lambda: {'new_html': None, 'chunks': []})
+
+ # 1. 初始化每个父标签的重建信息
for i, item_info in enumerate(items_to_translate):
- # << [关键修改] 直接使用 item_info 中的活引用 tag,它属于 soups 字典中的一个对象
- tag: Tag = item_info["tag"]
- translated_html = translated_texts[i]
- original_html = original_texts[i]
+ tag = item_info["tag"]
+ tag_id = id(tag)
+ if tag_reconstruction_map[tag_id]['new_html'] is None:
+ # 如果有分割,使用保存的完整原文;否则,使用当前块的原文(因为就这一个块)
+ original_full_html = item_info.get("original_full_html") or item_info["original_html"]
+ tag_reconstruction_map[tag_id]['new_html'] = original_full_html
+ tag_reconstruction_map[tag_id]['tag_obj'] = tag
+
+ # 2. 为每个父标签准备好所有原始块和翻译块的对应关系
+ for i, item_info in enumerate(items_to_translate):
+ tag = item_info["tag"]
+ tag_id = id(tag)
+ original_chunk = original_texts[i]
+ translated_chunk = translated_texts[i]
if self.insert_mode == "replace":
- final_html = translated_html
+ final_chunk = translated_chunk
elif self.insert_mode == "append":
- final_html = original_html + self.separator + translated_html
+ final_chunk = original_chunk + self.separator + translated_chunk
elif self.insert_mode == "prepend":
- final_html = translated_html + self.separator + original_html
+ final_chunk = translated_chunk + self.separator + original_chunk
else:
- final_html = translated_html
+ final_chunk = translated_chunk
- # 清空旧内容
+ tag_reconstruction_map[tag_id]['chunks'].append({'original': original_chunk, 'final': final_chunk})
+
+ # 3. 对每个父标签,用其所有的翻译块重建完整内容
+ for tag_id, data in tag_reconstruction_map.items():
+ tag: Tag = data['tag_obj']
+ reconstructed_html = data['new_html']
+
+ for chunk_info in data['chunks']:
+ # 使用replace函数进行替换。为避免错误替换,处理原始文本中的特殊字符
+ # 这个方法在原始文本块在父标签中重复出现时可能出错,但对于大多数情况是有效的
+ reconstructed_html = reconstructed_html.replace(chunk_info['original'], chunk_info['final'], 1)
+
+ # 4. 更新父标签的内容
tag.clear()
+ new_content_soup = BeautifulSoup(reconstructed_html, 'html.parser')
- # 解析新的HTML片段
- new_content_soup = BeautifulSoup(final_html, 'html.parser')
-
- # << [关键修复] 将新片段的*内容*(而不是整个文档)移动到原始标签中
- # 使用 list() 创建副本以安全地迭代和修改
if new_content_soup.body:
nodes_to_insert = list(new_content_soup.body.children)
else:
nodes_to_insert = list(new_content_soup.children)
for node in nodes_to_insert:
- tag.append(node.extract()) # .extract() 从旧树中移除并返回节点
+ tag.append(node.extract())
- # << [关键修改] 从修改后的soups对象生成新的文件内容
+ # 从修改后的soups对象生成新的文件内容
for file_path, soup in soups.items():
all_files[file_path] = str(soup).encode('utf-8')
@@ -182,7 +222,6 @@ class EpubTranslator(AiTranslator):
translated_texts = self.translate_agent.send_segments(original_texts, self.chunk_size)
else:
translated_texts = original_texts
- # << [关键修改] 传递 soups 对象
document.content = self._after_translate(
all_files, soups, items_to_translate, translated_texts, original_texts
)
@@ -206,7 +245,6 @@ class EpubTranslator(AiTranslator):
)
else:
translated_texts = original_texts
- # << [关键修改] 传递 soups 对象
document.content = await asyncio.to_thread(
self._after_translate, all_files, soups, items_to_translate, translated_texts, original_texts
)