diff --git a/docutranslate/translator/ai_translator/epub_translator.py b/docutranslate/translator/ai_translator/epub_translator.py index 1deb0a1..cc4dcee 100644 --- a/docutranslate/translator/ai_translator/epub_translator.py +++ b/docutranslate/translator/ai_translator/epub_translator.py @@ -10,7 +10,7 @@ from dataclasses import dataclass from io import BytesIO from typing import Self, Literal, List, Dict, Any, Tuple -from bs4 import BeautifulSoup, Tag +from bs4 import BeautifulSoup, Tag, NavigableString from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent from docutranslate.ir.document import Document @@ -27,7 +27,7 @@ class EpubTranslatorConfig(AiTranslatorConfig): class EpubTranslator(AiTranslator): """ 一个用于翻译 EPUB 文件中内容的翻译器。 - 【高级版】此版本直接翻译HTML内容,以保留内联格式。 + 【高级版】此版本直接翻译HTML内容,以保留内联格式,并支持表格翻译。 """ def __init__(self, config: EpubTranslatorConfig): @@ -55,10 +55,10 @@ class EpubTranslator(AiTranslator): self.separator = config.separator def _pre_translate(self, document: Document) -> tuple[ - Dict[str, bytes], # all_files: 原始文件内容 - Dict[str, BeautifulSoup], # soups: 解析后的HTML对象 - List[Dict[str, Any]], # items_to_translate: 待翻译项 - List[str] # original_texts: 原始HTML片段 + Dict[str, bytes], + Dict[str, BeautifulSoup], + List[Dict[str, Any]], + List[str] ]: all_files = {} soups = {} @@ -90,8 +90,8 @@ class EpubTranslator(AiTranslator): full_href = os.path.join(opf_dir, href).replace('\\', '/') manifest_items[item_id] = {'href': full_href, 'media_type': item.get('media-type')} - TAGS_TO_TRANSLATE = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'div'] - + # ==================== 代码修改 1: 添加表格相关的标签 ==================== + TAGS_TO_TRANSLATE = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'div', 'td', 'th'] # 定义一个正则表达式,用于按
标签分割内容 split_pattern = re.compile(r'(|]*>)', re.IGNORECASE) @@ -109,18 +109,12 @@ class EpubTranslator(AiTranslator): soup = soups[file_path] - # ==================== 关键代码修改 ==================== - # 采用“Bottom-Up”逻辑:只选择不包含其他可翻译块级标签的“叶子”标签。 - # 这种方法能准确地选取段落,并自动忽略像
这样的父容器。 - - # 1. 找到所有可能的翻译标签 all_potential_tags = soup.find_all(TAGS_TO_TRANSLATE) - all_potential_tags_set = set(all_potential_tags) # 用于快速查找 + all_potential_tags_set = set(all_potential_tags) tags_to_process = [] for tag in all_potential_tags: - # 2. 检查当前标签内部是否还包含其他需要翻译的标签 - # 如果没有,说明它是一个“叶子”节点,是我们要找的翻译单元。 + # 采用“Bottom-Up”逻辑,只选择不包含其他可翻译块级标签的“叶子”标签。 contains_other_block = tag.find( lambda child_tag: child_tag in all_potential_tags_set and child_tag is not tag ) @@ -133,34 +127,50 @@ class EpubTranslator(AiTranslator): if not inner_html or inner_html.isspace(): continue - # 使用正则表达式分割内容,同时保留
标签 + # ==================== 代码修改 2: 增加对表格标签的特殊处理 ==================== + # 对于表格单元格(td, th),我们希望直接翻译其内容,并替换。 + # 这样做可以避免在单元格内部错误地插入
导致表格布局破坏。 + # 其他标签(如 p, div)则可以继续使用分割逻辑,以支持段内换行。 + is_table_cell = tag.name in ['td', 'th'] + # ==================== 修改结束 ==================== + html_parts = split_pattern.split(inner_html) - is_split = len(html_parts) > 1 + # 如果不是表格单元格,且存在
,则按片段处理 + is_split = len(html_parts) > 1 and not is_table_cell - for part in html_parts: - part_stripped = part.strip() - if not part_stripped: - continue + if is_split: + # 逻辑保持不变:处理被
分割的段落 + for part in html_parts: + part_stripped = part.strip() + if not part_stripped: + continue - # 判断当前部分是否是
分隔符标签 - is_separator_tag = split_pattern.fullmatch(part_stripped) + is_separator_tag = split_pattern.fullmatch(part_stripped) + plain_text = BeautifulSoup(part, 'html.parser').get_text(strip=True) - # ==================== 关键代码修改 ==================== - # 检查片段是否包含实际可翻译的文本内容,而不仅仅是空白、 或空的HTML标签 - plain_text = BeautifulSoup(part, 'html.parser').get_text(strip=True) - - # 我们只翻译那些不是分隔符标签(如
, )且含有实际文本内容的片段 - if not is_separator_tag and plain_text: + if not is_separator_tag and plain_text: + item_info = { + "file_path": file_path, + "tag": tag, + "original_html": part, + "original_full_html": inner_html, + "is_split": True + } + items_to_translate.append(item_info) + original_texts.append(part) + else: + # 对于完整的标签内容(或表格单元格),我们整体处理 + plain_text = tag.get_text(strip=True) + if plain_text: item_info = { "file_path": file_path, "tag": tag, - "original_html": part, - "original_full_html": inner_html if is_split else None + "original_html": inner_html, + "is_split": False } items_to_translate.append(item_info) - original_texts.append(part) - # ==================== 修改结束 ==================== + original_texts.append(inner_html) return all_files, soups, items_to_translate, original_texts @@ -172,50 +182,85 @@ class EpubTranslator(AiTranslator): translated_texts: List[str], original_texts: List[str], ) -> bytes: - tag_reconstruction_map = defaultdict(lambda: {'new_html': None, 'chunks': []}) + + # ==================== 代码修改 3: 重构 _after_translate 逻辑 ==================== + # 使用一个更清晰的 defaultdict 来处理内容的重构 + # key 是每个独立 tag 对象的 id,value 是待处理的信息 + tag_reconstruction_map = defaultdict( + lambda: {'chunks': [], 'is_split': False, 'original_full_html': None, 'tag_obj': None}) for i, item_info in enumerate(items_to_translate): tag = item_info["tag"] tag_id = id(tag) - if tag_reconstruction_map[tag_id]['new_html'] is None: - original_full_html = item_info.get("original_full_html") or item_info["original_html"] - tag_reconstruction_map[tag_id]['new_html'] = original_full_html - tag_reconstruction_map[tag_id]['tag_obj'] = tag - for i, item_info in enumerate(items_to_translate): - tag = item_info["tag"] - tag_id = id(tag) - original_chunk = original_texts[i] - translated_chunk = translated_texts[i] + tag_reconstruction_map[tag_id]['is_split'] = item_info['is_split'] + tag_reconstruction_map[tag_id]['tag_obj'] = tag + if item_info['is_split']: + tag_reconstruction_map[tag_id]['original_full_html'] = item_info['original_full_html'] - if self.insert_mode == "replace": - final_chunk = translated_chunk - elif self.insert_mode == "append": - final_chunk = original_chunk + self.separator + translated_chunk - elif self.insert_mode == "prepend": - final_chunk = translated_chunk + self.separator + original_chunk - else: - final_chunk = translated_chunk - - tag_reconstruction_map[tag_id]['chunks'].append({'original': original_chunk, 'final': final_chunk}) + tag_reconstruction_map[tag_id]['chunks'].append({ + 'original': original_texts[i], + 'translated': translated_texts[i] + }) for tag_id, data in tag_reconstruction_map.items(): tag: Tag = data['tag_obj'] - reconstructed_html = data['new_html'] + final_html = "" - for chunk_info in data['chunks']: - reconstructed_html = reconstructed_html.replace(chunk_info['original'], chunk_info['final'], 1) + if data['is_split']: + # 如果是分割的段落,我们需要重组它 + reconstructed_html = data['original_full_html'] + for chunk in data['chunks']: + original_chunk = chunk['original'] + translated_chunk = chunk['translated'] - tag.clear() - new_content_soup = BeautifulSoup(reconstructed_html, 'html.parser') + if self.insert_mode == "replace": + final_chunk = translated_chunk + elif self.insert_mode == "append": + final_chunk = original_chunk + self.separator + translated_chunk + else: # prepend + final_chunk = translated_chunk + self.separator + original_chunk - if new_content_soup.body: - nodes_to_insert = list(new_content_soup.body.children) + # 使用带计数的替换,确保只替换第一个匹配项 + reconstructed_html = reconstructed_html.replace(original_chunk, final_chunk, 1) + + final_html = reconstructed_html else: - nodes_to_insert = list(new_content_soup.children) + # 如果是完整的标签内容(包括表格单元格),则直接处理 + chunk = data['chunks'][0] + original_chunk = chunk['original'] + translated_chunk = chunk['translated'] + + if self.insert_mode == "replace": + final_html = translated_chunk + elif self.insert_mode == "append": + # 对于表格,即使是 append 模式,直接拼接也可能破坏格式。 + # 因此,对于 td/th,我们强制在内部用 separator 分隔,而不是在标签外。 + if tag.name in ['td', 'th']: + final_html = f"{original_chunk}{self.separator}{translated_chunk}" + else: + final_html = original_chunk + self.separator + translated_chunk + else: # prepend + if tag.name in ['td', 'th']: + final_html = f"{translated_chunk}{self.separator}{original_chunk}" + else: + final_html = translated_chunk + self.separator + original_chunk + + # 清空旧内容并插入新内容 + tag.clear() + # 使用 BeautifulSoup 解析最终的 HTML 片段,以正确处理嵌套标签 + new_content_soup = BeautifulSoup(final_html, 'html.parser') + + # new_content_soup.body 可能不存在,如果 final_html 不含 body 标签。 + # 我们需要从 soup 的顶层子节点开始插入。 + nodes_to_insert = list(new_content_soup.children) + if len(nodes_to_insert) == 1 and nodes_to_insert[0].name == 'html': + nodes_to_insert = list(nodes_to_insert[0].body.children) for node in nodes_to_insert: + # .extract() 会将节点从原文档树中移除,这样可以直接 append 到新位置 tag.append(node.extract()) + # ==================== 修改结束 ==================== for file_path, soup in soups.items(): all_files[file_path] = str(soup).encode('utf-8')