diff --git a/docutranslate/translator/ai_translator/epub_translator.py b/docutranslate/translator/ai_translator/epub_translator.py
index 1deb0a1..cc4dcee 100644
--- a/docutranslate/translator/ai_translator/epub_translator.py
+++ b/docutranslate/translator/ai_translator/epub_translator.py
@@ -10,7 +10,7 @@ from dataclasses import dataclass
from io import BytesIO
from typing import Self, Literal, List, Dict, Any, Tuple
-from bs4 import BeautifulSoup, Tag
+from bs4 import BeautifulSoup, Tag, NavigableString
from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent
from docutranslate.ir.document import Document
@@ -27,7 +27,7 @@ class EpubTranslatorConfig(AiTranslatorConfig):
class EpubTranslator(AiTranslator):
"""
一个用于翻译 EPUB 文件中内容的翻译器。
- 【高级版】此版本直接翻译HTML内容,以保留内联格式。
+ 【高级版】此版本直接翻译HTML内容,以保留内联格式,并支持表格翻译。
"""
def __init__(self, config: EpubTranslatorConfig):
@@ -55,10 +55,10 @@ class EpubTranslator(AiTranslator):
self.separator = config.separator
def _pre_translate(self, document: Document) -> tuple[
- Dict[str, bytes], # all_files: 原始文件内容
- Dict[str, BeautifulSoup], # soups: 解析后的HTML对象
- List[Dict[str, Any]], # items_to_translate: 待翻译项
- List[str] # original_texts: 原始HTML片段
+ Dict[str, bytes],
+ Dict[str, BeautifulSoup],
+ List[Dict[str, Any]],
+ List[str]
]:
all_files = {}
soups = {}
@@ -90,8 +90,8 @@ class EpubTranslator(AiTranslator):
full_href = os.path.join(opf_dir, href).replace('\\', '/')
manifest_items[item_id] = {'href': full_href, 'media_type': item.get('media-type')}
- TAGS_TO_TRANSLATE = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'div']
-
+ # ==================== 代码修改 1: 添加表格相关的标签 ====================
+ TAGS_TO_TRANSLATE = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'div', 'td', 'th']
# 定义一个正则表达式,用于按 <br> 和 <img> 标签分割内容
split_pattern = re.compile(r'(
|
]*>)', re.IGNORECASE)
@@ -109,18 +109,12 @@ class EpubTranslator(AiTranslator):
soup = soups[file_path]
- # ==================== 关键代码修改 ====================
- # 采用“Bottom-Up”逻辑:只选择不包含其他可翻译块级标签的“叶子”标签。
- # 这种方法能准确地选取段落,并自动忽略像 <div> 这样的父容器。
-
- # 1. 找到所有可能的翻译标签
all_potential_tags = soup.find_all(TAGS_TO_TRANSLATE)
- all_potential_tags_set = set(all_potential_tags) # 用于快速查找
+ all_potential_tags_set = set(all_potential_tags)
tags_to_process = []
for tag in all_potential_tags:
- # 2. 检查当前标签内部是否还包含其他需要翻译的标签
- # 如果没有,说明它是一个“叶子”节点,是我们要找的翻译单元。
+ # 采用“Bottom-Up”逻辑,只选择不包含其他可翻译块级标签的“叶子”标签。
contains_other_block = tag.find(
lambda child_tag: child_tag in all_potential_tags_set and child_tag is not tag
)
@@ -133,34 +127,50 @@ class EpubTranslator(AiTranslator):
if not inner_html or inner_html.isspace():
continue
- # 使用正则表达式分割内容,同时保留 <br> 和 <img> 标签
+ # ==================== 代码修改 2: 增加对表格标签的特殊处理 ====================
+ # 对于表格单元格(td, th),我们希望直接翻译其内容,并替换。
+ # 这样做可以避免在单元格内部错误地插入 <br> 导致表格布局破坏。
+ # 其他标签(如 p, div)则可以继续使用分割逻辑,以支持段内换行。
+ is_table_cell = tag.name in ['td', 'th']
+ # ==================== 修改结束 ====================
+
html_parts = split_pattern.split(inner_html)
- is_split = len(html_parts) > 1
+ # 如果不是表格单元格,且存在 <br> 或 <img>,则按片段处理
+ is_split = len(html_parts) > 1 and not is_table_cell
- for part in html_parts:
- part_stripped = part.strip()
- if not part_stripped:
- continue
+ if is_split:
+ # 逻辑保持不变:处理被 <br> 或 <img> 分割的段落
+ for part in html_parts:
+ part_stripped = part.strip()
+ if not part_stripped:
+ continue
- # 判断当前部分是否是 <br> 或 <img> 分隔符标签
- is_separator_tag = split_pattern.fullmatch(part_stripped)
+ is_separator_tag = split_pattern.fullmatch(part_stripped)
+ plain_text = BeautifulSoup(part, 'html.parser').get_text(strip=True)
- # ==================== 关键代码修改 ====================
- # 检查片段是否包含实际可翻译的文本内容,而不仅仅是空白、&nbsp; 或空的HTML标签
- plain_text = BeautifulSoup(part, 'html.parser').get_text(strip=True)
-
- # 我们只翻译那些不是分隔符标签(如 <br>, <img>)且含有实际文本内容的片段
- if not is_separator_tag and plain_text:
+ if not is_separator_tag and plain_text:
+ item_info = {
+ "file_path": file_path,
+ "tag": tag,
+ "original_html": part,
+ "original_full_html": inner_html,
+ "is_split": True
+ }
+ items_to_translate.append(item_info)
+ original_texts.append(part)
+ else:
+ # 对于完整的标签内容(或表格单元格),我们整体处理
+ plain_text = tag.get_text(strip=True)
+ if plain_text:
item_info = {
"file_path": file_path,
"tag": tag,
- "original_html": part,
- "original_full_html": inner_html if is_split else None
+ "original_html": inner_html,
+ "is_split": False
}
items_to_translate.append(item_info)
- original_texts.append(part)
- # ==================== 修改结束 ====================
+ original_texts.append(inner_html)
return all_files, soups, items_to_translate, original_texts
@@ -172,50 +182,85 @@ class EpubTranslator(AiTranslator):
translated_texts: List[str],
original_texts: List[str],
) -> bytes:
- tag_reconstruction_map = defaultdict(lambda: {'new_html': None, 'chunks': []})
+
+ # ==================== 代码修改 3: 重构 _after_translate 逻辑 ====================
+ # 使用一个更清晰的 defaultdict 来处理内容的重构
+ # key 是每个独立 tag 对象的 id,value 是待处理的信息
+ tag_reconstruction_map = defaultdict(
+ lambda: {'chunks': [], 'is_split': False, 'original_full_html': None, 'tag_obj': None})
for i, item_info in enumerate(items_to_translate):
tag = item_info["tag"]
tag_id = id(tag)
- if tag_reconstruction_map[tag_id]['new_html'] is None:
- original_full_html = item_info.get("original_full_html") or item_info["original_html"]
- tag_reconstruction_map[tag_id]['new_html'] = original_full_html
- tag_reconstruction_map[tag_id]['tag_obj'] = tag
- for i, item_info in enumerate(items_to_translate):
- tag = item_info["tag"]
- tag_id = id(tag)
- original_chunk = original_texts[i]
- translated_chunk = translated_texts[i]
+ tag_reconstruction_map[tag_id]['is_split'] = item_info['is_split']
+ tag_reconstruction_map[tag_id]['tag_obj'] = tag
+ if item_info['is_split']:
+ tag_reconstruction_map[tag_id]['original_full_html'] = item_info['original_full_html']
- if self.insert_mode == "replace":
- final_chunk = translated_chunk
- elif self.insert_mode == "append":
- final_chunk = original_chunk + self.separator + translated_chunk
- elif self.insert_mode == "prepend":
- final_chunk = translated_chunk + self.separator + original_chunk
- else:
- final_chunk = translated_chunk
-
- tag_reconstruction_map[tag_id]['chunks'].append({'original': original_chunk, 'final': final_chunk})
+ tag_reconstruction_map[tag_id]['chunks'].append({
+ 'original': original_texts[i],
+ 'translated': translated_texts[i]
+ })
for tag_id, data in tag_reconstruction_map.items():
tag: Tag = data['tag_obj']
- reconstructed_html = data['new_html']
+ final_html = ""
- for chunk_info in data['chunks']:
- reconstructed_html = reconstructed_html.replace(chunk_info['original'], chunk_info['final'], 1)
+ if data['is_split']:
+ # 如果是分割的段落,我们需要重组它
+ reconstructed_html = data['original_full_html']
+ for chunk in data['chunks']:
+ original_chunk = chunk['original']
+ translated_chunk = chunk['translated']
- tag.clear()
- new_content_soup = BeautifulSoup(reconstructed_html, 'html.parser')
+ if self.insert_mode == "replace":
+ final_chunk = translated_chunk
+ elif self.insert_mode == "append":
+ final_chunk = original_chunk + self.separator + translated_chunk
+ else: # prepend
+ final_chunk = translated_chunk + self.separator + original_chunk
- if new_content_soup.body:
- nodes_to_insert = list(new_content_soup.body.children)
+ # 使用带计数的替换,确保只替换第一个匹配项
+ reconstructed_html = reconstructed_html.replace(original_chunk, final_chunk, 1)
+
+ final_html = reconstructed_html
else:
- nodes_to_insert = list(new_content_soup.children)
+ # 如果是完整的标签内容(包括表格单元格),则直接处理
+ chunk = data['chunks'][0]
+ original_chunk = chunk['original']
+ translated_chunk = chunk['translated']
+
+ if self.insert_mode == "replace":
+ final_html = translated_chunk
+ elif self.insert_mode == "append":
+ # 对于表格,即使是 append 模式,直接拼接也可能破坏格式。
+ # 因此,对于 td/th,我们强制在内部用 separator 分隔,而不是在标签外。
+ if tag.name in ['td', 'th']:
+ final_html = f"{original_chunk}{self.separator}{translated_chunk}"
+ else:
+ final_html = original_chunk + self.separator + translated_chunk
+ else: # prepend
+ if tag.name in ['td', 'th']:
+ final_html = f"{translated_chunk}{self.separator}{original_chunk}"
+ else:
+ final_html = translated_chunk + self.separator + original_chunk
+
+ # 清空旧内容并插入新内容
+ tag.clear()
+ # 使用 BeautifulSoup 解析最终的 HTML 片段,以正确处理嵌套标签
+ new_content_soup = BeautifulSoup(final_html, 'html.parser')
+
+ # new_content_soup.body 可能不存在,如果 final_html 不含 body 标签。
+ # 我们需要从 soup 的顶层子节点开始插入。
+ nodes_to_insert = list(new_content_soup.children)
+ if len(nodes_to_insert) == 1 and nodes_to_insert[0].name == 'html':
+ nodes_to_insert = list(nodes_to_insert[0].body.children)
for node in nodes_to_insert:
+ # .extract() 会将节点从原文档树中移除,这样可以直接 append 到新位置
tag.append(node.extract())
+ # ==================== 修改结束 ====================
for file_path, soup in soups.items():
all_files[file_path] = str(soup).encode('utf-8')