修复epub分隔符不能换行的问题
This commit is contained in:
@@ -6,7 +6,7 @@ import xml.etree.ElementTree as ET
|
|||||||
import zipfile
|
import zipfile
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from typing import Self, Literal, List, Dict, Any
|
from typing import Self, Literal, List, Dict, Any, Tuple
|
||||||
|
|
||||||
from bs4 import BeautifulSoup, Tag
|
from bs4 import BeautifulSoup, Tag
|
||||||
|
|
||||||
@@ -18,7 +18,8 @@ from docutranslate.translator.ai_translator.base import AiTranslatorConfig, AiTr
|
|||||||
@dataclass
|
@dataclass
|
||||||
class EpubTranslatorConfig(AiTranslatorConfig):
|
class EpubTranslatorConfig(AiTranslatorConfig):
|
||||||
insert_mode: Literal["replace", "append", "prepend"] = "replace"
|
insert_mode: Literal["replace", "append", "prepend"] = "replace"
|
||||||
separator: str = "\n"
|
# 建议使用 <br />,它在 XHTML (EPUB标准) 中更规范
|
||||||
|
separator: str = "<br />"
|
||||||
|
|
||||||
|
|
||||||
class EpubTranslator(AiTranslator):
|
class EpubTranslator(AiTranslator):
|
||||||
@@ -33,7 +34,7 @@ class EpubTranslator(AiTranslator):
|
|||||||
self.translate_agent = None
|
self.translate_agent = None
|
||||||
if not self.skip_translate:
|
if not self.skip_translate:
|
||||||
agent_config = SegmentsTranslateAgentConfig(
|
agent_config = SegmentsTranslateAgentConfig(
|
||||||
custom_prompt=config.custom_prompt, # 使用优化后的prompt
|
custom_prompt=config.custom_prompt,
|
||||||
to_lang=config.to_lang,
|
to_lang=config.to_lang,
|
||||||
base_url=config.base_url,
|
base_url=config.base_url,
|
||||||
api_key=config.api_key,
|
api_key=config.api_key,
|
||||||
@@ -52,11 +53,15 @@ class EpubTranslator(AiTranslator):
|
|||||||
self.separator = config.separator
|
self.separator = config.separator
|
||||||
|
|
||||||
def _pre_translate(self, document: Document) -> tuple[
|
def _pre_translate(self, document: Document) -> tuple[
|
||||||
Dict[str, bytes], List[Dict[str, Any]], List[str]
|
Dict[str, bytes], # all_files: 原始文件内容
|
||||||
|
Dict[str, BeautifulSoup], # soups: 解析后的HTML对象
|
||||||
|
List[Dict[str, Any]], # items_to_translate: 待翻译项
|
||||||
|
List[str] # original_texts: 原始HTML片段
|
||||||
]:
|
]:
|
||||||
all_files = {}
|
all_files = {}
|
||||||
|
soups = {} # << [关键修改] 存储解析后的BS对象
|
||||||
items_to_translate = []
|
items_to_translate = []
|
||||||
original_texts = [] # 现在这里存储的是HTML片段
|
original_texts = []
|
||||||
|
|
||||||
with zipfile.ZipFile(BytesIO(document.content), 'r') as zf:
|
with zipfile.ZipFile(BytesIO(document.content), 'r') as zf:
|
||||||
for filename in zf.namelist():
|
for filename in zf.namelist():
|
||||||
@@ -94,41 +99,38 @@ class EpubTranslator(AiTranslator):
|
|||||||
self.logger.warning(f"在 EPUB 中找不到文件: {file_path}")
|
self.logger.warning(f"在 EPUB 中找不到文件: {file_path}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
soup = BeautifulSoup(content_bytes, "html.parser")
|
# << [关键修改] 解析一次并存储
|
||||||
|
if file_path not in soups:
|
||||||
|
soups[file_path] = BeautifulSoup(content_bytes, "html.parser")
|
||||||
|
|
||||||
|
soup = soups[file_path]
|
||||||
for tag in soup.find_all(TAGS_TO_TRANSLATE):
|
for tag in soup.find_all(TAGS_TO_TRANSLATE):
|
||||||
# 获取标签内部的HTML,而不是纯文本
|
|
||||||
inner_html = tag.decode_contents()
|
inner_html = tag.decode_contents()
|
||||||
# 只有在内部有实际内容(不仅仅是空白)时才翻译
|
|
||||||
if inner_html and not inner_html.isspace():
|
if inner_html and not inner_html.isspace():
|
||||||
item_info = {
|
item_info = {
|
||||||
"file_path": file_path,
|
"file_path": file_path,
|
||||||
"tag": tag,
|
"tag": tag, # 这个tag是soups[file_path]中的活引用
|
||||||
"original_html": inner_html,
|
"original_html": inner_html,
|
||||||
}
|
}
|
||||||
items_to_translate.append(item_info)
|
items_to_translate.append(item_info)
|
||||||
original_texts.append(inner_html)
|
original_texts.append(inner_html)
|
||||||
|
|
||||||
return all_files, items_to_translate, original_texts
|
return all_files, soups, items_to_translate, original_texts
|
||||||
|
|
||||||
def _after_translate(
|
def _after_translate(
|
||||||
self,
|
self,
|
||||||
all_files: Dict[str, bytes],
|
all_files: Dict[str, bytes],
|
||||||
|
soups: Dict[str, BeautifulSoup], # << [关键修改] 接收解析好的BS对象
|
||||||
items_to_translate: List[Dict[str, Any]],
|
items_to_translate: List[Dict[str, Any]],
|
||||||
translated_texts: List[str],
|
translated_texts: List[str],
|
||||||
original_texts: List[str], # 这里是 original_htmls
|
original_texts: List[str],
|
||||||
) -> bytes:
|
) -> bytes:
|
||||||
modified_soups = {}
|
|
||||||
|
|
||||||
for i, item_info in enumerate(items_to_translate):
|
for i, item_info in enumerate(items_to_translate):
|
||||||
file_path = item_info["file_path"]
|
# << [关键修改] 直接使用 item_info 中的活引用 tag,它属于 soups 字典中的一个对象
|
||||||
tag: Tag = item_info["tag"]
|
tag: Tag = item_info["tag"]
|
||||||
translated_html = translated_texts[i]
|
translated_html = translated_texts[i]
|
||||||
original_html = original_texts[i]
|
original_html = original_texts[i]
|
||||||
|
|
||||||
if file_path not in modified_soups:
|
|
||||||
modified_soups[file_path] = tag.find_parent('html')
|
|
||||||
|
|
||||||
# [修改] 处理 insert_mode,现在操作的是HTML片段
|
|
||||||
if self.insert_mode == "replace":
|
if self.insert_mode == "replace":
|
||||||
final_html = translated_html
|
final_html = translated_html
|
||||||
elif self.insert_mode == "append":
|
elif self.insert_mode == "append":
|
||||||
@@ -138,15 +140,24 @@ class EpubTranslator(AiTranslator):
|
|||||||
else:
|
else:
|
||||||
final_html = translated_html
|
final_html = translated_html
|
||||||
|
|
||||||
# [修改] 清空旧内容,并追加解析后的新HTML内容
|
# 清空旧内容
|
||||||
tag.clear()
|
tag.clear()
|
||||||
# 解析AI返回的HTML字符串,'html.parser'对此有很好的容错性
|
|
||||||
new_content = BeautifulSoup(final_html, 'html.parser')
|
|
||||||
# 将解析后的所有子节点追加到原tag中
|
|
||||||
for child in new_content.contents:
|
|
||||||
tag.append(child.extract()) # extract()会从原文档树中移除节点,避免重复
|
|
||||||
|
|
||||||
for file_path, soup in modified_soups.items():
|
# 解析新的HTML片段
|
||||||
|
new_content_soup = BeautifulSoup(final_html, 'html.parser')
|
||||||
|
|
||||||
|
# << [关键修复] 将新片段的*内容*(而不是整个文档)移动到原始标签中
|
||||||
|
# 使用 list() 创建副本以安全地迭代和修改
|
||||||
|
if new_content_soup.body:
|
||||||
|
nodes_to_insert = list(new_content_soup.body.children)
|
||||||
|
else:
|
||||||
|
nodes_to_insert = list(new_content_soup.children)
|
||||||
|
|
||||||
|
for node in nodes_to_insert:
|
||||||
|
tag.append(node.extract()) # .extract() 从旧树中移除并返回节点
|
||||||
|
|
||||||
|
# << [关键修改] 从修改后的soups对象生成新的文件内容
|
||||||
|
for file_path, soup in soups.items():
|
||||||
all_files[file_path] = str(soup).encode('utf-8')
|
all_files[file_path] = str(soup).encode('utf-8')
|
||||||
|
|
||||||
output_buffer = BytesIO()
|
output_buffer = BytesIO()
|
||||||
@@ -158,11 +169,10 @@ class EpubTranslator(AiTranslator):
|
|||||||
zf_out.writestr(filename, content, compress_type=zipfile.ZIP_DEFLATED)
|
zf_out.writestr(filename, content, compress_type=zipfile.ZIP_DEFLATED)
|
||||||
return output_buffer.getvalue()
|
return output_buffer.getvalue()
|
||||||
|
|
||||||
# translate 和 translate_async 方法无需修改,因为它们调用的_pre_translate和_after_translate已经被更新了
|
|
||||||
def translate(self, document: Document) -> Self:
|
def translate(self, document: Document) -> Self:
|
||||||
all_files, items_to_translate, original_texts = self._pre_translate(document)
|
all_files, soups, items_to_translate, original_texts = self._pre_translate(document)
|
||||||
if not items_to_translate:
|
if not items_to_translate:
|
||||||
self.logger.info("\n文件中没有找到需要翻译的纯文本内容。")
|
self.logger.info("\n文件中没有找到需要翻译的内容。")
|
||||||
return self
|
return self
|
||||||
if self.glossary_agent:
|
if self.glossary_agent:
|
||||||
self.glossary_dict_gen = self.glossary_agent.send_segments(original_texts, self.chunk_size)
|
self.glossary_dict_gen = self.glossary_agent.send_segments(original_texts, self.chunk_size)
|
||||||
@@ -172,17 +182,18 @@ class EpubTranslator(AiTranslator):
|
|||||||
translated_texts = self.translate_agent.send_segments(original_texts, self.chunk_size)
|
translated_texts = self.translate_agent.send_segments(original_texts, self.chunk_size)
|
||||||
else:
|
else:
|
||||||
translated_texts = original_texts
|
translated_texts = original_texts
|
||||||
|
# << [关键修改] 传递 soups 对象
|
||||||
document.content = self._after_translate(
|
document.content = self._after_translate(
|
||||||
all_files, items_to_translate, translated_texts, original_texts
|
all_files, soups, items_to_translate, translated_texts, original_texts
|
||||||
)
|
)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
async def translate_async(self, document: Document) -> Self:
|
async def translate_async(self, document: Document) -> Self:
|
||||||
all_files, items_to_translate, original_texts = await asyncio.to_thread(
|
all_files, soups, items_to_translate, original_texts = await asyncio.to_thread(
|
||||||
self._pre_translate, document
|
self._pre_translate, document
|
||||||
)
|
)
|
||||||
if not items_to_translate:
|
if not items_to_translate:
|
||||||
self.logger.info("\n文件中没有找到需要翻译的纯文本内容。")
|
self.logger.info("\n文件中没有找到需要翻译的内容。")
|
||||||
return self
|
return self
|
||||||
|
|
||||||
if self.glossary_agent:
|
if self.glossary_agent:
|
||||||
@@ -195,7 +206,8 @@ class EpubTranslator(AiTranslator):
|
|||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
translated_texts = original_texts
|
translated_texts = original_texts
|
||||||
|
# << [关键修改] 传递 soups 对象
|
||||||
document.content = await asyncio.to_thread(
|
document.content = await asyncio.to_thread(
|
||||||
self._after_translate, all_files, items_to_translate, translated_texts, original_texts
|
self._after_translate, all_files, soups, items_to_translate, translated_texts, original_texts
|
||||||
)
|
)
|
||||||
return self
|
return self
|
||||||
Reference in New Issue
Block a user