修复epub排版问题

This commit is contained in:
xunbu
2025-12-17 21:00:36 +08:00
parent 49b7954e0f
commit c9ffa8f195
2 changed files with 68 additions and 24 deletions

View File

@@ -1,3 +1,3 @@
# SPDX-FileCopyrightText: 2025 QinHan # SPDX-FileCopyrightText: 2025 QinHan
# SPDX-License-Identifier: MPL-2.0 # SPDX-License-Identifier: MPL-2.0
__version__="1.5.6a1" __version__="1.5.6a2"

View File

@@ -18,7 +18,7 @@ from docutranslate.translator.ai_translator.base import AiTranslatorConfig, AiTr
@dataclass @dataclass
class EpubTranslatorConfig(AiTranslatorConfig): class EpubTranslatorConfig(AiTranslatorConfig):
insert_mode: Literal["replace", "append", "prepend"] = "replace" insert_mode: Literal["replace", "append", "prepend"] = "replace"
# 建议使用 \n，代码会将其转换为 <br />，更灵活 # 建议使用 \n，代码会将其转换为 <br /> 或 <span> 换行，更灵活
separator: str = "\n" separator: str = "\n"
@@ -26,8 +26,10 @@ class EpubTranslator(AiTranslator):
""" """
一个用于翻译 EPUB 文件中内容的翻译器。 一个用于翻译 EPUB 文件中内容的翻译器。
【高级版】此版本直接翻译HTML内容以保留内联格式并支持表格翻译。 【高级版】此版本直接翻译HTML内容以保留内联格式并支持表格翻译。
【结构化修改版 v2】借鉴 DocxTranslator 的实现,在 append/prepend 模式下, 【结构化修改版 v3】
对常规块级元素创建新标签存放译文,对表格单元格则在内部追加内容,以保证文档结构的正确性 1. 修复了 BeautifulSoup 自动添加 <html><body> 导致嵌套错误的问题。
2. 对 li, div, td, th 采用内部追加模式,保护文档结构。
3. 复制标签时自动移除 ID 属性,防止锚点冲突。
""" """
def __init__(self, config: EpubTranslatorConfig): def __init__(self, config: EpubTranslatorConfig):
@@ -66,10 +68,12 @@ class EpubTranslator(AiTranslator):
items_to_translate = [] items_to_translate = []
original_texts = [] original_texts = []
# 读取 Zip 内容
with zipfile.ZipFile(BytesIO(document.content), 'r') as zf: with zipfile.ZipFile(BytesIO(document.content), 'r') as zf:
for filename in zf.namelist(): for filename in zf.namelist():
all_files[filename] = zf.read(filename) all_files[filename] = zf.read(filename)
# 解析 container.xml 寻找 OPF
container_xml = all_files.get('META-INF/container.xml') container_xml = all_files.get('META-INF/container.xml')
if not container_xml: if not container_xml:
raise ValueError("无效的 EPUB找不到 META-INF/container.xml") raise ValueError("无效的 EPUB找不到 META-INF/container.xml")
@@ -78,6 +82,7 @@ class EpubTranslator(AiTranslator):
opf_path = root.find('cn:rootfiles/cn:rootfile', ns).get('full-path') opf_path = root.find('cn:rootfiles/cn:rootfile', ns).get('full-path')
opf_dir = os.path.dirname(opf_path) opf_dir = os.path.dirname(opf_path)
# 解析 OPF 获取文件清单
opf_xml = all_files.get(opf_path) opf_xml = all_files.get(opf_path)
if not opf_xml: if not opf_xml:
raise ValueError(f"无效的 EPUB找不到 {opf_path}") raise ValueError(f"无效的 EPUB找不到 {opf_path}")
@@ -91,6 +96,7 @@ class EpubTranslator(AiTranslator):
full_href = os.path.join(opf_dir, href).replace('\\', '/') full_href = os.path.join(opf_dir, href).replace('\\', '/')
manifest_items[item_id] = {'href': full_href, 'media_type': item.get('media-type')} manifest_items[item_id] = {'href': full_href, 'media_type': item.get('media-type')}
# 定义需要翻译的标签
TAGS_TO_TRANSLATE = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'div', 'td', 'th'] TAGS_TO_TRANSLATE = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'div', 'td', 'th']
for item_id, item_data in manifest_items.items(): for item_id, item_data in manifest_items.items():
@@ -112,6 +118,7 @@ class EpubTranslator(AiTranslator):
tags_to_process = [] tags_to_process = []
for tag in all_potential_tags: for tag in all_potential_tags:
# 仅处理叶子节点块(不包含其他待翻译块的块)
contains_other_block = tag.find( contains_other_block = tag.find(
lambda child_tag: child_tag in all_potential_tags_set and child_tag is not tag lambda child_tag: child_tag in all_potential_tags_set and child_tag is not tag
) )
@@ -147,34 +154,54 @@ class EpubTranslator(AiTranslator):
original_html = original_texts[i] original_html = original_texts[i]
translated_html = translated_texts[i] translated_html = translated_texts[i]
# --- 关键逻辑:根据标签类型选择不同的处理策略 --- # --- 核心修复:解析并剥离 HTML/BODY 外壳 ---
is_table_cell = original_tag.name in ['td', 'th'] # BeautifulSoup(html, 'lxml') 会自动补全 <html><body>,必须剥离
new_content_soup = BeautifulSoup(translated_html, 'lxml')
content_nodes = []
if new_content_soup.body:
content_nodes = list(new_content_soup.body.contents)
elif new_content_soup.html:
content_nodes = list(new_content_soup.html.contents)
else:
content_nodes = list(new_content_soup.contents)
# 策略 A: 替换模式Replace
if self.insert_mode == "replace": if self.insert_mode == "replace":
original_tag.clear() original_tag.clear()
new_content_soup = BeautifulSoup(translated_html, 'lxml') for node in content_nodes:
for node in list(new_content_soup.children):
original_tag.append(node.extract()) original_tag.append(node.extract())
continue
elif is_table_cell: # 策略 B: 容器型元素 (td, th, li, div) -> 在内部追加
# --- 表格单元格处理:在标签内部组合内容 --- # 这样可以保护 ul/ol 结构不被打断div 布局不被破坏
is_container_node = original_tag.name in ['td', 'th', 'li', 'div']
if is_container_node:
original_tag.clear() original_tag.clear()
# 解析HTML片段 # 解析原文 (注意防范原文被解析出多余标签)
original_nodes = BeautifulSoup(original_html, 'lxml').contents orig_soup = BeautifulSoup(original_html, 'lxml')
translated_nodes = BeautifulSoup(translated_html, 'lxml').contents original_nodes = list(orig_soup.body.contents) if orig_soup.body else list(orig_soup.contents)
# 创建分隔符节点 # 译文节点使用上面剥离好的 content_nodes
translated_nodes = content_nodes
# 构建分隔符 (容器内部使用 span/br不使用 p/div 以防破坏行内流)
separator_nodes = [] separator_nodes = []
if self.separator: if self.separator:
lines = self.separator.split('\n') lines = self.separator.split('\n')
for j, line in enumerate(lines): for j, line in enumerate(lines):
if line: if line:
separator_nodes.append(NavigableString(line)) sep_span = soup.new_tag('span', attrs={'class': 'translate-separator'})
sep_span.string = line
separator_nodes.append(sep_span)
if j < len(lines) - 1: if j < len(lines) - 1:
separator_nodes.append(soup.new_tag('br')) separator_nodes.append(soup.new_tag('br'))
# 额外加一个换行区分原文和译文
separator_nodes.append(soup.new_tag('br'))
# 根据模式按顺序重新填充 # 组装内容
if self.insert_mode == "append": if self.insert_mode == "append":
nodes_order = [original_nodes, separator_nodes, translated_nodes] nodes_order = [original_nodes, separator_nodes, translated_nodes]
else: # prepend else: # prepend
@@ -182,18 +209,32 @@ class EpubTranslator(AiTranslator):
for node_list in nodes_order: for node_list in nodes_order:
for node in node_list: for node in node_list:
original_tag.append(node.extract() if isinstance(node, Tag) else node) if node:
original_tag.append(node.extract() if isinstance(node, Tag) else node)
# 策略 C: 独立文本块 (p, h1-h6) -> 创建兄弟标签
else: else:
# --- 常规块级元素处理:创建新标签 --- # 复制属性,但必须删除 ID 以防冲突
translated_tag = soup.new_tag(original_tag.name, attrs=original_tag.attrs) new_attrs = dict(original_tag.attrs)
new_content_soup = BeautifulSoup(translated_html, 'lxml') if 'id' in new_attrs:
for node in list(new_content_soup.children): del new_attrs['id']
translated_tag.append(node.extract())
translated_tag = soup.new_tag(original_tag.name, attrs=new_attrs)
# 填充译文
for node in content_nodes:
# 额外检查:防止 <p> 里面套 <p>
# 如果译文被识别为 <p>翻译</p>,而外层容器也是 <p>,则只取内部文本
if isinstance(node, Tag) and node.name == original_tag.name and node.name == 'p':
for inner_child in list(node.contents):
translated_tag.append(inner_child.extract())
else:
translated_tag.append(node.extract())
# 创建块级分隔符
separator_tag = None separator_tag = None
if self.separator: if self.separator:
separator_tag = soup.new_tag('p') separator_tag = soup.new_tag('div', attrs={'class': 'translate-separator'})
lines = self.separator.split('\n') lines = self.separator.split('\n')
for j, line in enumerate(lines): for j, line in enumerate(lines):
if line: if line:
@@ -202,16 +243,19 @@ class EpubTranslator(AiTranslator):
separator_tag.append(soup.new_tag('br')) separator_tag.append(soup.new_tag('br'))
if self.insert_mode == "append": if self.insert_mode == "append":
# 插入顺序:原文 -> 分隔符 -> 译文
current_node = original_tag current_node = original_tag
if separator_tag: if separator_tag:
current_node.insert_after(separator_tag) current_node.insert_after(separator_tag)
current_node = separator_tag current_node = separator_tag
current_node.insert_after(translated_tag) current_node.insert_after(translated_tag)
elif self.insert_mode == "prepend": elif self.insert_mode == "prepend":
# 插入顺序:译文 -> 分隔符 -> 原文
original_tag.insert_before(translated_tag) original_tag.insert_before(translated_tag)
if separator_tag: if separator_tag:
translated_tag.insert_after(separator_tag) translated_tag.insert_after(separator_tag)
# 重新打包 EPUB
for file_path, soup in soups.items(): for file_path, soup in soups.items():
all_files[file_path] = str(soup).encode('utf-8') all_files[file_path] = str(soup).encode('utf-8')