Files
docutranslate/docutranslate/translator/ai_translator/html_translator.py
2026-01-09 18:00:16 +08:00

242 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# SPDX-FileCopyrightText: 2025 QinHan
# SPDX-License-Identifier: MPL-2.0
import asyncio
from dataclasses import dataclass
from typing import Self, Literal, Set, Dict, List, Tuple
from bs4 import BeautifulSoup, NavigableString, Comment, Tag
from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent
from docutranslate.ir.document import Document
from docutranslate.translator.ai_translator.base import AiTranslatorConfig, AiTranslator
# --- 规则定义 ---
# 1. 不可翻译标签(黑名单)
NON_TRANSLATABLE_TAGS: Set[str] = {
'script', 'style', 'pre', 'code', 'kbd', 'samp', 'var', 'noscript', 'meta', 'link', 'head',
}
# 2. 可作为独立翻译单元的块级标签(白名单)
# 这些标签将被视为一个整体进行翻译并且在append/prepend模式下会触发结构化操作。
TRANSLATABLE_BLOCK_TAGS: Set[str] = {
'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'blockquote', 'q', 'caption',
'td', 'th', 'button', 'legend', 'figcaption', 'summary', 'details', 'div',
}
# 3. 可翻译属性(白名单)
SAFE_ATTRIBUTES: Dict[str, List[str]] = {
'img': ['alt', 'title'],
'a': ['title'],
'input': ['placeholder', 'title'],
'textarea': ['placeholder', 'title'],
'abbr': ['title'],
'area': ['alt'],
'*': ['title']
}
@dataclass
class HtmlTranslatorConfig(AiTranslatorConfig):
insert_mode: Literal["replace", "append", "prepend"] = "replace"
separator: str = "\n"
class HtmlTranslator(AiTranslator):
"""
一个用于翻译 HTML 文件内容的翻译器。
【结构化修改版】: 借鉴 Docx/EpubTranslator 的实现,将块级元素作为整体翻译单元。
在 append/prepend 模式下,对常规块级元素创建新标签存放译文,对表格单元格则在内部追加内容,
以保证文档结构的清晰和样式的绝对一致性,同时保留了强大的黑白名单安全规则。
"""
def __init__(self, config: HtmlTranslatorConfig):
super().__init__(config=config)
self.chunk_size = config.chunk_size
self.translate_agent = None
glossary_dict = self.glossary.glossary_dict if self.glossary else None
if not self.skip_translate:
agent_config = SegmentsTranslateAgentConfig(
custom_prompt=config.custom_prompt,
to_lang=config.to_lang,
base_url=config.base_url,
api_key=config.api_key,
model_id=config.model_id,
temperature=config.temperature,
thinking=config.thinking,
concurrent=config.concurrent,
timeout=config.timeout,
logger=self.logger,
glossary_dict=glossary_dict,
retry=config.retry,
system_proxy_enable=config.system_proxy_enable,
force_json=config.force_json,
rpm=config.rpm,
tpm=config.tpm,
provider=config.provider,
)
self.translate_agent = SegmentsTranslateAgent(agent_config)
self.insert_mode = config.insert_mode
self.separator = config.separator
def _pre_translate(self, document: Document) -> Tuple[BeautifulSoup, List[Dict], List[str]]:
soup = BeautifulSoup(document.content, 'lxml')
for tag in soup.find_all(NON_TRANSLATABLE_TAGS):
tag.decompose()
translatable_items = []
original_texts = []
# --- 1. 提取块级标签进行翻译 ---
all_potential_blocks = soup.find_all(TRANSLATABLE_BLOCK_TAGS)
all_potential_blocks_set = set(all_potential_blocks)
tags_to_process = []
for tag in all_potential_blocks:
# 采用“Bottom-Up”逻辑只选择不包含其他可翻译块级标签的“叶子”标签。
contains_other_block = tag.find(
lambda child_tag: child_tag in all_potential_blocks_set and child_tag is not tag
)
if not contains_other_block:
tags_to_process.append(tag)
for tag in tags_to_process:
if tag.get_text(strip=True):
translatable_items.append({'type': 'block_tag', 'tag': tag})
original_texts.append(tag.decode_contents())
# --- 2. 提取安全属性进行翻译 ---
for tag in soup.find_all(True):
attributes_to_check = SAFE_ATTRIBUTES.get(tag.name, []) + SAFE_ATTRIBUTES.get('*', [])
for attr in set(attributes_to_check):
if tag.has_attr(attr) and tag[attr].strip():
translatable_items.append({'type': 'attribute', 'tag': tag, 'attribute': attr})
original_texts.append(tag[attr])
return soup, translatable_items, original_texts
def _after_translate(self, soup: BeautifulSoup, translatable_items: list,
translated_texts: list[str], original_texts: list[str]) -> bytes:
if len(translatable_items) != len(translated_texts):
self.logger.error("翻译前后的文本片段数量不匹配 (%d vs %d),跳过写入操作以防损坏文件。",
len(translatable_items), len(translated_texts))
return soup.encode('utf-8')
for i, item in enumerate(translatable_items):
original_text = original_texts[i]
translated_text = translated_texts[i]
tag = item['tag']
# --- 分类处理:属性 vs. 块级标签 ---
if item['type'] == 'attribute':
attr = item['attribute']
separator_for_attr = self.separator.replace('\n', ' ').strip()
new_attr_value = ""
if self.insert_mode == "replace":
new_attr_value = translated_text
elif self.insert_mode == "append":
new_attr_value = f"{original_text}{separator_for_attr}{translated_text}"
elif self.insert_mode == "prepend":
new_attr_value = f"{translated_text}{separator_for_attr}{original_text}"
tag[attr] = new_attr_value.strip()
elif item['type'] == 'block_tag':
is_table_cell = tag.name in ['td', 'th']
if self.insert_mode == "replace":
tag.clear()
new_content_soup = BeautifulSoup(translated_text, 'html.parser')
for node in list(new_content_soup.children):
tag.append(node.extract())
elif is_table_cell:
# 表格单元格:在内部组合内容
tag.clear()
original_nodes = BeautifulSoup(original_text, 'html.parser').contents
translated_nodes = BeautifulSoup(translated_text, 'html.parser').contents
separator_nodes = []
if self.separator:
lines = self.separator.split('\n')
for j, line in enumerate(lines):
if line: separator_nodes.append(NavigableString(line))
if j < len(lines) - 1: separator_nodes.append(soup.new_tag('br'))
order = [original_nodes, separator_nodes, translated_nodes] if self.insert_mode == "append" else [
translated_nodes, separator_nodes, original_nodes]
for node_list in order:
for node in node_list:
tag.append(node.extract() if isinstance(node, Tag) else node)
else:
# 常规块级元素:创建新标签
translated_tag = soup.new_tag(tag.name, attrs=tag.attrs)
new_content_soup = BeautifulSoup(translated_text, 'html.parser')
for node in list(new_content_soup.children):
translated_tag.append(node.extract())
separator_tag = None
if self.separator:
separator_tag = soup.new_tag('p')
lines = self.separator.split('\n')
for j, line in enumerate(lines):
if line: separator_tag.append(NavigableString(line))
if j < len(lines) - 1: separator_tag.append(soup.new_tag('br'))
if self.insert_mode == "append":
current_node = tag
if separator_tag:
current_node.insert_after(separator_tag)
current_node = separator_tag
current_node.insert_after(translated_tag)
elif self.insert_mode == "prepend":
tag.insert_before(translated_tag)
if separator_tag:
translated_tag.insert_after(separator_tag)
return soup.encode('utf-8')
def translate(self, document: Document) -> Self:
soup, translatable_items, original_texts = self._pre_translate(document)
if not translatable_items:
self.logger.info("\nHTML文件中没有找到符合安全规则的可翻译内容。")
document.content = soup.encode('utf-8')
return self
if self.glossary_agent:
glossary_dict_gen = self.glossary_agent.send_segments(original_texts, self.chunk_size)
if self.glossary:
self.glossary.update(glossary_dict_gen)
if self.translate_agent:
self.translate_agent.update_glossary_dict(glossary_dict_gen)
if self.translate_agent:
translated_texts = self.translate_agent.send_segments(original_texts, self.chunk_size)
else:
translated_texts = original_texts
document.content = self._after_translate(soup, translatable_items, translated_texts, original_texts)
return self
async def translate_async(self, document: Document) -> Self:
soup, translatable_items, original_texts = await asyncio.to_thread(self._pre_translate, document)
if not translatable_items:
self.logger.info("\nHTML文件中没有找到符合安全规则的可翻译内容。")
document.content = await asyncio.to_thread(soup.encode, 'utf-8')
return self
if self.glossary_agent:
glossary_dict_gen = await self.glossary_agent.send_segments_async(original_texts, self.chunk_size)
if self.glossary:
self.glossary.update(glossary_dict_gen)
if self.translate_agent:
self.translate_agent.update_glossary_dict(glossary_dict_gen)
if self.translate_agent:
translated_texts = await self.translate_agent.send_segments_async(original_texts, self.chunk_size)
else:
translated_texts = original_texts
document.content = await asyncio.to_thread(
self._after_translate, soup, translatable_items, translated_texts, original_texts
)
return self