支持表格翻译

This commit is contained in:
xunbu
2025-10-22 23:56:18 +08:00
parent 057b0f2456
commit 6ec5747e30

View File

@@ -10,7 +10,7 @@ from dataclasses import dataclass
from io import BytesIO from io import BytesIO
from typing import Self, Literal, List, Dict, Any, Tuple from typing import Self, Literal, List, Dict, Any, Tuple
from bs4 import BeautifulSoup, Tag from bs4 import BeautifulSoup, Tag, NavigableString
from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent
from docutranslate.ir.document import Document from docutranslate.ir.document import Document
@@ -27,7 +27,7 @@ class EpubTranslatorConfig(AiTranslatorConfig):
class EpubTranslator(AiTranslator): class EpubTranslator(AiTranslator):
""" """
一个用于翻译 EPUB 文件中内容的翻译器。 一个用于翻译 EPUB 文件中内容的翻译器。
【高级版】此版本直接翻译HTML内容以保留内联格式。 【高级版】此版本直接翻译HTML内容以保留内联格式,并支持表格翻译
""" """
def __init__(self, config: EpubTranslatorConfig): def __init__(self, config: EpubTranslatorConfig):
@@ -55,10 +55,10 @@ class EpubTranslator(AiTranslator):
self.separator = config.separator self.separator = config.separator
def _pre_translate(self, document: Document) -> tuple[ def _pre_translate(self, document: Document) -> tuple[
Dict[str, bytes], # all_files: 原始文件内容 Dict[str, bytes],
Dict[str, BeautifulSoup], # soups: 解析后的HTML对象 Dict[str, BeautifulSoup],
List[Dict[str, Any]], # items_to_translate: 待翻译项 List[Dict[str, Any]],
List[str] # original_texts: 原始HTML片段 List[str]
]: ]:
all_files = {} all_files = {}
soups = {} soups = {}
@@ -90,8 +90,8 @@ class EpubTranslator(AiTranslator):
full_href = os.path.join(opf_dir, href).replace('\\', '/') full_href = os.path.join(opf_dir, href).replace('\\', '/')
manifest_items[item_id] = {'href': full_href, 'media_type': item.get('media-type')} manifest_items[item_id] = {'href': full_href, 'media_type': item.get('media-type')}
TAGS_TO_TRANSLATE = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'div'] # ==================== 代码修改 1: 添加表格相关的标签 ====================
TAGS_TO_TRANSLATE = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'div', 'td', 'th']
# 定义一个正则表达式,用于按 <br> 和 <img> 标签分割内容 # 定义一个正则表达式,用于按 <br> 和 <img> 标签分割内容
split_pattern = re.compile(r'(<br\s*/?>|<img[^>]*>)', re.IGNORECASE) split_pattern = re.compile(r'(<br\s*/?>|<img[^>]*>)', re.IGNORECASE)
@@ -109,18 +109,12 @@ class EpubTranslator(AiTranslator):
soup = soups[file_path] soup = soups[file_path]
# ==================== 关键代码修改 ====================
# 采用“Bottom-Up”逻辑,只选择不包含其他可翻译块级标签的“叶子”标签。
# 这种方法能准确地选取段落,并自动忽略像 <div class="calibre1"> 这样的父容器。
# 1. 找到所有可能的翻译标签
all_potential_tags = soup.find_all(TAGS_TO_TRANSLATE) all_potential_tags = soup.find_all(TAGS_TO_TRANSLATE)
all_potential_tags_set = set(all_potential_tags) # 用于快速查找 all_potential_tags_set = set(all_potential_tags)
tags_to_process = [] tags_to_process = []
for tag in all_potential_tags: for tag in all_potential_tags:
# 2. 检查当前标签内部是否还包含其他需要翻译的标签 # 采用“Bottom-Up”逻辑,只选择不包含其他可翻译块级标签的“叶子”标签
# 如果没有,说明它是一个“叶子”节点,是我们要找的翻译单元。
contains_other_block = tag.find( contains_other_block = tag.find(
lambda child_tag: child_tag in all_potential_tags_set and child_tag is not tag lambda child_tag: child_tag in all_potential_tags_set and child_tag is not tag
) )
@@ -133,34 +127,50 @@ class EpubTranslator(AiTranslator):
if not inner_html or inner_html.isspace(): if not inner_html or inner_html.isspace():
continue continue
# 使用正则表达式分割内容,同时保留 <br> 和 <img> 标签 # ==================== 代码修改 2: 增加对表格标签的特殊处理 ====================
# 对于表格单元格(td, th),我们希望直接翻译其内容并替换。
# 这样做可以避免在单元格内部错误地插入 <br> 导致表格布局破坏。
# 其他标签(如 p, div)则可以继续使用分割逻辑,以支持段内换行。
is_table_cell = tag.name in ['td', 'th']
# ==================== 修改结束 ====================
html_parts = split_pattern.split(inner_html) html_parts = split_pattern.split(inner_html)
is_split = len(html_parts) > 1 # 如果不是表格单元格,且存在 <br> 或 <img>,则按片段处理
is_split = len(html_parts) > 1 and not is_table_cell
for part in html_parts: if is_split:
part_stripped = part.strip() # 逻辑保持不变:处理被 <br> 或 <img> 分割的段落
if not part_stripped: for part in html_parts:
continue part_stripped = part.strip()
if not part_stripped:
continue
# 判断当前部分是否是 <br> 或 <img> 分隔符标签 is_separator_tag = split_pattern.fullmatch(part_stripped)
is_separator_tag = split_pattern.fullmatch(part_stripped) plain_text = BeautifulSoup(part, 'html.parser').get_text(strip=True)
# ==================== 关键代码修改 ==================== if not is_separator_tag and plain_text:
# 检查片段是否包含实际可翻译的文本内容,而不仅仅是空白、&nbsp;或空的HTML标签 item_info = {
plain_text = BeautifulSoup(part, 'html.parser').get_text(strip=True) "file_path": file_path,
"tag": tag,
# 我们只翻译那些不是分隔符标签(如<br>, <img>)且含有实际文本内容的片段 "original_html": part,
if not is_separator_tag and plain_text: "original_full_html": inner_html,
"is_split": True
}
items_to_translate.append(item_info)
original_texts.append(part)
else:
# 对于完整的标签内容(或表格单元格),我们整体处理
plain_text = tag.get_text(strip=True)
if plain_text:
item_info = { item_info = {
"file_path": file_path, "file_path": file_path,
"tag": tag, "tag": tag,
"original_html": part, "original_html": inner_html,
"original_full_html": inner_html if is_split else None "is_split": False
} }
items_to_translate.append(item_info) items_to_translate.append(item_info)
original_texts.append(part) original_texts.append(inner_html)
# ==================== 修改结束 ====================
return all_files, soups, items_to_translate, original_texts return all_files, soups, items_to_translate, original_texts
@@ -172,50 +182,85 @@ class EpubTranslator(AiTranslator):
translated_texts: List[str], translated_texts: List[str],
original_texts: List[str], original_texts: List[str],
) -> bytes: ) -> bytes:
tag_reconstruction_map = defaultdict(lambda: {'new_html': None, 'chunks': []})
# ==================== 代码修改 3: 重构 _after_translate 逻辑 ====================
# 使用一个更清晰的 defaultdict 来处理内容的重构
# key 是每个独立 tag 对象的 id,value 是待处理的信息
tag_reconstruction_map = defaultdict(
lambda: {'chunks': [], 'is_split': False, 'original_full_html': None, 'tag_obj': None})
for i, item_info in enumerate(items_to_translate): for i, item_info in enumerate(items_to_translate):
tag = item_info["tag"] tag = item_info["tag"]
tag_id = id(tag) tag_id = id(tag)
if tag_reconstruction_map[tag_id]['new_html'] is None:
original_full_html = item_info.get("original_full_html") or item_info["original_html"]
tag_reconstruction_map[tag_id]['new_html'] = original_full_html
tag_reconstruction_map[tag_id]['tag_obj'] = tag
for i, item_info in enumerate(items_to_translate): tag_reconstruction_map[tag_id]['is_split'] = item_info['is_split']
tag = item_info["tag"] tag_reconstruction_map[tag_id]['tag_obj'] = tag
tag_id = id(tag) if item_info['is_split']:
original_chunk = original_texts[i] tag_reconstruction_map[tag_id]['original_full_html'] = item_info['original_full_html']
translated_chunk = translated_texts[i]
if self.insert_mode == "replace": tag_reconstruction_map[tag_id]['chunks'].append({
final_chunk = translated_chunk 'original': original_texts[i],
elif self.insert_mode == "append": 'translated': translated_texts[i]
final_chunk = original_chunk + self.separator + translated_chunk })
elif self.insert_mode == "prepend":
final_chunk = translated_chunk + self.separator + original_chunk
else:
final_chunk = translated_chunk
tag_reconstruction_map[tag_id]['chunks'].append({'original': original_chunk, 'final': final_chunk})
for tag_id, data in tag_reconstruction_map.items(): for tag_id, data in tag_reconstruction_map.items():
tag: Tag = data['tag_obj'] tag: Tag = data['tag_obj']
reconstructed_html = data['new_html'] final_html = ""
for chunk_info in data['chunks']: if data['is_split']:
reconstructed_html = reconstructed_html.replace(chunk_info['original'], chunk_info['final'], 1) # 如果是分割的段落,我们需要重组它
reconstructed_html = data['original_full_html']
for chunk in data['chunks']:
original_chunk = chunk['original']
translated_chunk = chunk['translated']
tag.clear() if self.insert_mode == "replace":
new_content_soup = BeautifulSoup(reconstructed_html, 'html.parser') final_chunk = translated_chunk
elif self.insert_mode == "append":
final_chunk = original_chunk + self.separator + translated_chunk
else: # prepend
final_chunk = translated_chunk + self.separator + original_chunk
if new_content_soup.body: # 使用带计数的替换,确保只替换第一个匹配项
nodes_to_insert = list(new_content_soup.body.children) reconstructed_html = reconstructed_html.replace(original_chunk, final_chunk, 1)
final_html = reconstructed_html
else: else:
nodes_to_insert = list(new_content_soup.children) # 如果是完整的标签内容(包括表格单元格),则直接处理
chunk = data['chunks'][0]
original_chunk = chunk['original']
translated_chunk = chunk['translated']
if self.insert_mode == "replace":
final_html = translated_chunk
elif self.insert_mode == "append":
# 对于表格,即使是 append 模式,直接拼接也可能破坏格式。
# 因此,对于 td/th,我们强制在内部用 separator 分隔,而不是在标签外。
if tag.name in ['td', 'th']:
final_html = f"{original_chunk}{self.separator}{translated_chunk}"
else:
final_html = original_chunk + self.separator + translated_chunk
else: # prepend
if tag.name in ['td', 'th']:
final_html = f"{translated_chunk}{self.separator}{original_chunk}"
else:
final_html = translated_chunk + self.separator + original_chunk
# 清空旧内容并插入新内容
tag.clear()
# 使用 BeautifulSoup 解析最终的 HTML 片段,以正确处理嵌套标签
new_content_soup = BeautifulSoup(final_html, 'html.parser')
# new_content_soup.body 可能不存在,如果 final_html 不含 body 标签。
# 我们需要从 soup 的顶层子节点开始插入。
nodes_to_insert = list(new_content_soup.children)
if len(nodes_to_insert) == 1 and nodes_to_insert[0].name == 'html':
nodes_to_insert = list(nodes_to_insert[0].body.children)
for node in nodes_to_insert: for node in nodes_to_insert:
# .extract() 会将节点从原文档树中移除,这样可以直接 append 到新位置
tag.append(node.extract()) tag.append(node.extract())
# ==================== 修改结束 ====================
for file_path, soup in soups.items(): for file_path, soup in soups.items():
all_files[file_path] = str(soup).encode('utf-8') all_files[file_path] = str(soup).encode('utf-8')