增加html工作流
This commit is contained in:
@@ -89,8 +89,10 @@ class Agent:
|
|||||||
self.temperature = config.temperature
|
self.temperature = config.temperature
|
||||||
# self.client = httpx.Client(trust_env=False, proxy=None, verify=False)
|
# self.client = httpx.Client(trust_env=False, proxy=None, verify=False)
|
||||||
# self.client_async = httpx.AsyncClient(trust_env=False, proxy=None, verify=False)
|
# self.client_async = httpx.AsyncClient(trust_env=False, proxy=None, verify=False)
|
||||||
self.client = httpx.Client(verify=False) if USE_PROXY else httpx.Client(proxy=None, verify=False)
|
self.client = httpx.Client(verify=False) if USE_PROXY else httpx.Client(trust_env=False, proxy=None,
|
||||||
self.client_async = httpx.AsyncClient(verify=False) if USE_PROXY else httpx.AsyncClient(proxy=None,
|
verify=False)
|
||||||
|
self.client_async = httpx.AsyncClient(verify=False) if USE_PROXY else httpx.AsyncClient(trust_env=False,
|
||||||
|
proxy=None,
|
||||||
verify=False)
|
verify=False)
|
||||||
self.max_concurrent = config.max_concurrent
|
self.max_concurrent = config.max_concurrent
|
||||||
self.timeout = config.timeout
|
self.timeout = config.timeout
|
||||||
|
|||||||
@@ -28,6 +28,9 @@ from docutranslate.global_values.conditional_import import DOCLING_EXIST
|
|||||||
from docutranslate.workflow.base import Workflow
|
from docutranslate.workflow.base import Workflow
|
||||||
from docutranslate.workflow.docx_workflow import DocxWorkflow, DocxWorkflowConfig
|
from docutranslate.workflow.docx_workflow import DocxWorkflow, DocxWorkflowConfig
|
||||||
from docutranslate.workflow.epub_workflow import EpubWorkflow, EpubWorkflowConfig
|
from docutranslate.workflow.epub_workflow import EpubWorkflow, EpubWorkflowConfig
|
||||||
|
# --- HTML WORKFLOW IMPORT START ---
|
||||||
|
from docutranslate.workflow.html_workflow import HtmlWorkflow, HtmlWorkflowConfig
|
||||||
|
# --- HTML WORKFLOW IMPORT END ---
|
||||||
from docutranslate.workflow.interfaces import DocxExportable, EpubExportable
|
from docutranslate.workflow.interfaces import DocxExportable, EpubExportable
|
||||||
from docutranslate.workflow.interfaces import HTMLExportable, MDFormatsExportable, TXTExportable, JsonExportable, \
|
from docutranslate.workflow.interfaces import HTMLExportable, MDFormatsExportable, TXTExportable, JsonExportable, \
|
||||||
XlsxExportable, SrtExportable
|
XlsxExportable, SrtExportable
|
||||||
@@ -54,6 +57,9 @@ from docutranslate.translator.ai_translator.srt_translator import SrtTranslatorC
|
|||||||
from docutranslate.exporter.srt.srt2html_exporter import Srt2HTMLExporterConfig
|
from docutranslate.exporter.srt.srt2html_exporter import Srt2HTMLExporterConfig
|
||||||
from docutranslate.translator.ai_translator.epub_translator import EpubTranslatorConfig
|
from docutranslate.translator.ai_translator.epub_translator import EpubTranslatorConfig
|
||||||
from docutranslate.exporter.epub.epub2html_exporter import Epub2HTMLExporterConfig
|
from docutranslate.exporter.epub.epub2html_exporter import Epub2HTMLExporterConfig
|
||||||
|
# --- HTML TRANSLATOR IMPORT START ---
|
||||||
|
from docutranslate.translator.ai_translator.html_translator import HtmlTranslatorConfig
|
||||||
|
# --- HTML TRANSLATOR IMPORT END ---
|
||||||
# ------------------------------------
|
# ------------------------------------
|
||||||
|
|
||||||
from docutranslate.logger import global_logger
|
from docutranslate.logger import global_logger
|
||||||
@@ -76,6 +82,7 @@ WORKFLOW_DICT: Dict[str, Type[Workflow]] = {
|
|||||||
"docx": DocxWorkflow,
|
"docx": DocxWorkflow,
|
||||||
"srt": SrtWorkflow,
|
"srt": SrtWorkflow,
|
||||||
"epub": EpubWorkflow,
|
"epub": EpubWorkflow,
|
||||||
|
"html": HtmlWorkflow,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -282,10 +289,24 @@ class EpubWorkflowParams(BaseWorkflowParams):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# --- HTML WORKFLOW PARAMS START ---
|
||||||
|
class HtmlWorkflowParams(BaseWorkflowParams):
    # Request parameters for the HTML translation workflow. The literal
    # `workflow_type` value is what the discriminated union keys on.
    # (Documented with comments rather than a docstring so the generated
    # schema description is left untouched.)
    workflow_type: Literal['html'] = Field(..., description="指定使用HTML的翻译工作流。")

    # How the translated text is combined with the source text.
    insert_mode: Literal["replace", "append", "prepend"] = Field(
        default="replace",
        description="翻译文本的插入模式。'replace':替换原文,'append':附加到原文后,'prepend':附加到原文前。",
    )

    # Only meaningful for the 'append' / 'prepend' insert modes.
    separator: str = Field(
        default=" ",
        description="当 insert_mode 为 'append' 或 'prepend' 时,用于分隔原文和译文的分隔符。",
    )
|
||||||
|
# --- HTML WORKFLOW PARAMS END ---
|
||||||
|
|
||||||
|
|
||||||
# 3. 使用可辨识联合类型(Discriminated Union)将它们组合起来
|
# 3. 使用可辨识联合类型(Discriminated Union)将它们组合起来
|
||||||
TranslatePayload = Annotated[
|
TranslatePayload = Annotated[
|
||||||
Union[
|
Union[
|
||||||
MarkdownWorkflowParams, TextWorkflowParams, JsonWorkflowParams, XlsxWorkflowParams, DocxWorkflowParams, SrtWorkflowParams, EpubWorkflowParams],
|
MarkdownWorkflowParams, TextWorkflowParams, JsonWorkflowParams, XlsxWorkflowParams, DocxWorkflowParams, SrtWorkflowParams, EpubWorkflowParams, HtmlWorkflowParams],
|
||||||
Field(discriminator='workflow_type')
|
Field(discriminator='workflow_type')
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -293,7 +314,7 @@ TranslatePayload = Annotated[
|
|||||||
# 4. 创建最终的请求体模型
|
# 4. 创建最终的请求体模型
|
||||||
class TranslateServiceRequest(BaseModel):
|
class TranslateServiceRequest(BaseModel):
|
||||||
file_name: str = Field(..., description="上传的原始文件名,含扩展名。",
|
file_name: str = Field(..., description="上传的原始文件名,含扩展名。",
|
||||||
examples=["my_paper.pdf", "chapter1.txt", "data.xlsx", "video.srt", "my_book.epub"])
|
examples=["my_paper.pdf", "chapter1.txt", "data.xlsx", "video.srt", "my_book.epub", "index.html"])
|
||||||
file_content: str = Field(..., description="Base64编码的文件内容。", examples=["JVBERi0xLjQK..."])
|
file_content: str = Field(..., description="Base64编码的文件内容。", examples=["JVBERi0xLjQK..."])
|
||||||
payload: TranslatePayload = Field(..., description="包含工作流类型和相应参数的载荷。")
|
payload: TranslatePayload = Field(..., description="包含工作流类型和相应参数的载荷。")
|
||||||
|
|
||||||
@@ -393,7 +414,24 @@ class TranslateServiceRequest(BaseModel):
|
|||||||
"insert_mode": "replace",
|
"insert_mode": "replace",
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
},
|
||||||
|
# --- HTML EXAMPLE START ---
|
||||||
|
{
|
||||||
|
"summary": "HTML 工作流示例",
|
||||||
|
"value": {
|
||||||
|
"file_name": "company_about_us.html",
|
||||||
|
"file_content": "PGh0bWw+PGhlYWQ+PHRpdGxlPkFib3V0IFVzPC90aXRsZT48L2hlYWQ+PGJvZHk+PGgxPk91ciBDb21wYW55PC9oMT48cD5XZSBhcmUgYSBsZWFkaW5nIHByb3ZpZGVyIG9mIGlubm92YXRpdmUgc29sdXRpb25zLjwvcD48L2JvZHk+PC9odG1sPg==",
|
||||||
|
"payload": {
|
||||||
|
"workflow_type": "html",
|
||||||
|
"base_url": "https://api.openai.com/v1",
|
||||||
|
"api_key": "sk-your-api-key-here",
|
||||||
|
"model_id": "gpt-4o",
|
||||||
|
"to_lang": "简体中文",
|
||||||
|
"insert_mode": "replace"
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
# --- HTML EXAMPLE END ---
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -552,6 +590,23 @@ async def _perform_translation(
|
|||||||
)
|
)
|
||||||
workflow = EpubWorkflow(config=workflow_config)
|
workflow = EpubWorkflow(config=workflow_config)
|
||||||
|
|
||||||
|
# --- HTML WORKFLOW LOGIC START ---
|
||||||
|
elif isinstance(payload, HtmlWorkflowParams):
|
||||||
|
task_logger.info("构建 HtmlWorkflow 配置。")
|
||||||
|
translator_config = HtmlTranslatorConfig(
|
||||||
|
**payload.model_dump(include={
|
||||||
|
'base_url', 'api_key', 'model_id', 'to_lang', 'custom_prompt',
|
||||||
|
'temperature', 'thinking', 'chunk_size', 'concurrent',
|
||||||
|
'insert_mode', 'separator'
|
||||||
|
}, exclude_none=True)
|
||||||
|
)
|
||||||
|
workflow_config = HtmlWorkflowConfig(
|
||||||
|
translator_config=translator_config,
|
||||||
|
logger=task_logger
|
||||||
|
)
|
||||||
|
workflow = HtmlWorkflow(config=workflow_config)
|
||||||
|
# --- HTML WORKFLOW LOGIC END ---
|
||||||
|
|
||||||
else:
|
else:
|
||||||
raise TypeError(f"工作流类型 '{payload.workflow_type}' 的处理逻辑未实现。")
|
raise TypeError(f"工作流类型 '{payload.workflow_type}' 的处理逻辑未实现。")
|
||||||
|
|
||||||
@@ -678,7 +733,7 @@ def _cancel_translation_logic(task_id: str):
|
|||||||
description="""
|
description="""
|
||||||
接收一个包含文件内容(Base64编码)和工作流参数的JSON请求,启动一个后台翻译任务。
|
接收一个包含文件内容(Base64编码)和工作流参数的JSON请求,启动一个后台翻译任务。
|
||||||
|
|
||||||
- **工作流选择**: 请求体中的 `payload.workflow_type` 字段决定了本次任务的类型(如 `markdown_based`, `txt`, `json`, `xlsx`, `docx`, `srt`, `epub`)。
|
- **工作流选择**: 请求体中的 `payload.workflow_type` 字段决定了本次任务的类型(如 `markdown_based`, `txt`, `json`, `xlsx`, `docx`, `srt`, `epub`, `html`)。
|
||||||
- **动态参数**: 根据所选工作流,API需要不同的参数集。请参考下面的Schema或示例。
|
- **动态参数**: 根据所选工作流,API需要不同的参数集。请参考下面的Schema或示例。
|
||||||
- **异步处理**: 此端点会立即返回任务ID,客户端需轮询状态接口获取进度。
|
- **异步处理**: 此端点会立即返回任务ID,客户端需轮询状态接口获取进度。
|
||||||
""",
|
""",
|
||||||
@@ -815,6 +870,21 @@ async def service_release_task(task_id: str):
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
# --- HTML STATUS EXAMPLE START ---
|
||||||
|
"completed_html": {
|
||||||
|
"summary": "已完成 (HTML)",
|
||||||
|
"value": {
|
||||||
|
"task_id": "a1b2c3d4", "is_processing": False,
|
||||||
|
"status_message": "翻译成功!用时 15.78 秒。",
|
||||||
|
"error_flag": False, "download_ready": True, "original_filename_stem": "about_us",
|
||||||
|
"original_filename": "about_us.html", "task_start_time": 1678890100.0,
|
||||||
|
"task_end_time": 1678890115.78,
|
||||||
|
"downloads": {
|
||||||
|
"html": "/service/download/a1b2c3d4/html"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
# --- HTML STATUS EXAMPLE END ---
|
||||||
"error": {
|
"error": {
|
||||||
"summary": "失败",
|
"summary": "失败",
|
||||||
"value": {
|
"value": {
|
||||||
@@ -935,6 +1005,7 @@ async def _get_content_from_workflow(task_id: str, file_type: FileType) -> tuple
|
|||||||
html_config = Srt2HTMLExporterConfig(cdn=is_cdn_available)
|
html_config = Srt2HTMLExporterConfig(cdn=is_cdn_available)
|
||||||
elif isinstance(workflow, EpubWorkflow):
|
elif isinstance(workflow, EpubWorkflow):
|
||||||
html_config = Epub2HTMLExporterConfig(cdn=is_cdn_available)
|
html_config = Epub2HTMLExporterConfig(cdn=is_cdn_available)
|
||||||
|
# No special html_config for HtmlWorkflow as it doesn't use these preview-oriented features
|
||||||
|
|
||||||
if file_type == 'html' and isinstance(workflow, HTMLExportable):
|
if file_type == 'html' and isinstance(workflow, HTMLExportable):
|
||||||
content_str = await asyncio.to_thread(workflow.export_to_html, html_config)
|
content_str = await asyncio.to_thread(workflow.export_to_html, html_config)
|
||||||
@@ -1191,4 +1262,4 @@ def run_app(port: int | None = None):
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
run_app()
|
run_app()
|
||||||
@@ -38,7 +38,6 @@ else:
|
|||||||
client_async = httpx.AsyncClient(trust_env=False, timeout=timeout, proxy=None, verify=False)
|
client_async = httpx.AsyncClient(trust_env=False, timeout=timeout, proxy=None, verify=False)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class ConverterMineru(X2MarkdownConverter):
|
class ConverterMineru(X2MarkdownConverter):
|
||||||
def __init__(self, config: ConverterMineruConfig):
|
def __init__(self, config: ConverterMineruConfig):
|
||||||
super().__init__(config=config)
|
super().__init__(config=config)
|
||||||
|
|||||||
0
docutranslate/exporter/html/__init__.py
Normal file
0
docutranslate/exporter/html/__init__.py
Normal file
8
docutranslate/exporter/html/base.py
Normal file
8
docutranslate/exporter/html/base.py
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
from docutranslate.exporter.base import Exporter
|
||||||
|
from docutranslate.ir.document import Document
|
||||||
|
|
||||||
|
# TODO: decide whether a dedicated document type is needed here.
# NOTE(review): the original note referred to "json" — presumably carried over
# from the JSON exporter; confirm what was intended for HTML.
class HtmlExporter(Exporter[Document]):
    """Base class for exporters that consume an HTML ``Document``."""

    def export(self, document: Document) -> Document:
        """Export ``document``; this base implementation is an empty placeholder.

        Subclasses are expected to override it. As written it returns ``None``.
        """
|
||||||
11
docutranslate/exporter/html/html2html_exporter.py
Normal file
11
docutranslate/exporter/html/html2html_exporter.py
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
from docutranslate.exporter.base import ExporterConfig
|
||||||
|
from docutranslate.exporter.html.base import HtmlExporter
|
||||||
|
from docutranslate.ir.document import Document
|
||||||
|
|
||||||
|
|
||||||
|
class Html2HtmlExporter(HtmlExporter):
    """Exporter that re-wraps an HTML document's bytes as a ``.html`` document."""

    def __init__(self, config: ExporterConfig | None = None):
        """Forward the optional exporter configuration to the base class."""
        super().__init__(config=config)

    def export(self, document: Document) -> Document:
        """Build a new ``Document`` from ``document``'s content and stem.

        The content is passed through unchanged; only the suffix is fixed to
        ``.html``.
        """
        exported = Document.from_bytes(
            content=document.content,
            suffix=".html",
            stem=document.stem,
        )
        return exported
|
||||||
@@ -1,8 +1,6 @@
|
|||||||
import os
|
import os
|
||||||
|
|
||||||
from .conditional_import import available_packages,conditional_import
|
from .conditional_import import available_packages, conditional_import
|
||||||
|
|
||||||
|
USE_PROXY = True if (os.getenv("DOCUTRANSLATE_USE_PROXY") and os.getenv(
|
||||||
USE_PROXY=False
|
"DOCUTRANSLATE_USE_PROXY").lower() == "true") else False
|
||||||
if os.getenv("DOCUTRANSLATE_USE_PROXY") and os.getenv("DOCUTRANSLATE_USE_PROXY").lower()=="true":
|
|
||||||
USE_PROXY=True
|
|
||||||
|
|||||||
@@ -12,6 +12,7 @@
|
|||||||
"workflowOptionXlsx": "XLSX翻译 (.xlsx)",
|
"workflowOptionXlsx": "XLSX翻译 (.xlsx)",
|
||||||
"workflowOptionSrt": "SRT字幕翻译 (.srt)",
|
"workflowOptionSrt": "SRT字幕翻译 (.srt)",
|
||||||
"workflowOptionEpub": "EPUB翻译 (.epub)",
|
"workflowOptionEpub": "EPUB翻译 (.epub)",
|
||||||
|
"workflowOptionHtml": "HTML翻译 (.html)",
|
||||||
"autoWorkflowLabel": "自动选择工作流",
|
"autoWorkflowLabel": "自动选择工作流",
|
||||||
"docxSettingsTitleText": "DOCX翻译选项",
|
"docxSettingsTitleText": "DOCX翻译选项",
|
||||||
"insertModeLabel": "插入模式",
|
"insertModeLabel": "插入模式",
|
||||||
@@ -29,6 +30,8 @@
|
|||||||
"insertModeHelpSrt": "选择如何将翻译后的文本插入。",
|
"insertModeHelpSrt": "选择如何将翻译后的文本插入。",
|
||||||
"epubSettingsTitleText": "EPUB翻译选项",
|
"epubSettingsTitleText": "EPUB翻译选项",
|
||||||
"insertModeHelpEpub": "选择如何将翻译后的文本插入。",
|
"insertModeHelpEpub": "选择如何将翻译后的文本插入。",
|
||||||
|
"htmlSettingsTitleText": "HTML翻译选项",
|
||||||
|
"insertModeHelpHtml": "选择如何将翻译后的文本插入。",
|
||||||
"jsonSettingsTitleText": "JSON路径配置",
|
"jsonSettingsTitleText": "JSON路径配置",
|
||||||
"jsonPathLabel": "需要翻译的JSON路径",
|
"jsonPathLabel": "需要翻译的JSON路径",
|
||||||
"jsonPathPlaceholder": "每行一个路径, 例如:\n$.name\n$.*",
|
"jsonPathPlaceholder": "每行一个路径, 例如:\n$.name\n$.*",
|
||||||
@@ -145,6 +148,7 @@
|
|||||||
"workflowOptionXlsx": "XLSX (.xlsx)",
|
"workflowOptionXlsx": "XLSX (.xlsx)",
|
||||||
"workflowOptionSrt": "SRT Subtitle (.srt)",
|
"workflowOptionSrt": "SRT Subtitle (.srt)",
|
||||||
"workflowOptionEpub": "EPUB (.epub)",
|
"workflowOptionEpub": "EPUB (.epub)",
|
||||||
|
"workflowOptionHtml": "HTML (.html)",
|
||||||
"autoWorkflowLabel": "Auto-select workflow",
|
"autoWorkflowLabel": "Auto-select workflow",
|
||||||
"docxSettingsTitleText": "DOCX Translation Options",
|
"docxSettingsTitleText": "DOCX Translation Options",
|
||||||
"insertModeLabel": "Insert Mode",
|
"insertModeLabel": "Insert Mode",
|
||||||
@@ -162,6 +166,8 @@
|
|||||||
"insertModeHelpSrt": "Choose how to insert the translated text.",
|
"insertModeHelpSrt": "Choose how to insert the translated text.",
|
||||||
"epubSettingsTitleText": "EPUB Translation Options",
|
"epubSettingsTitleText": "EPUB Translation Options",
|
||||||
"insertModeHelpEpub": "Choose how to insert the translated text.",
|
"insertModeHelpEpub": "Choose how to insert the translated text.",
|
||||||
|
"htmlSettingsTitleText": "HTML Translation Options",
|
||||||
|
"insertModeHelpHtml": "Choose how to insert the translated text.",
|
||||||
"jsonSettingsTitleText": "JSON Path Configuration",
|
"jsonSettingsTitleText": "JSON Path Configuration",
|
||||||
"jsonPathLabel": "JSON paths to translate",
|
"jsonPathLabel": "JSON paths to translate",
|
||||||
"jsonPathPlaceholder": "One path per line, e.g.:\n$.name\n$.*",
|
"jsonPathPlaceholder": "One path per line, e.g.:\n$.name\n$.*",
|
||||||
|
|||||||
File diff suppressed because one or more lines are too long
220
docutranslate/translator/ai_translator/html_translator.py
Normal file
220
docutranslate/translator/ai_translator/html_translator.py
Normal file
@@ -0,0 +1,220 @@
|
|||||||
|
import asyncio
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Self, Literal, Set, Dict, List, Tuple
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup, NavigableString, Tag, Comment
|
||||||
|
|
||||||
|
from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent
|
||||||
|
from docutranslate.ir.document import Document
|
||||||
|
from docutranslate.translator.ai_translator.base import AiTranslatorConfig
|
||||||
|
from docutranslate.translator.base import Translator
|
||||||
|
|
||||||
|
# --- Rule definitions ---

# 1. Non-translatable tags (blacklist).
# These tags usually hold code, styling or metadata, so neither they nor
# anything nested inside them should ever be translated.
NON_TRANSLATABLE_TAGS: Set[str] = {
    'script',    # JavaScript code
    'style',     # CSS rules
    'pre',       # preformatted text, typically code blocks
    'code',      # inline code
    'kbd',       # keyboard input
    'samp',      # sample output
    'var',       # variables
    'noscript',  # fallback content shown when scripting is disabled
    'meta',      # metadata
    'link',      # external resource links
    'head',      # document head; usually no visible translatable content
}

# 2. Translatable tags (whitelist).
# Tags whose *direct* text content is considered safe to translate. Used
# together with the blacklist above as a second layer of protection.
SAFE_TAGS: Set[str] = {
    # block-level text
    'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    'li', 'blockquote', 'q', 'caption',
    # inline text
    'span', 'a', 'strong', 'em', 'b', 'i', 'u',
    # table cells
    'td', 'th',
    # form / interactive labels
    'button', 'label', 'legend', 'option',
    'figcaption', 'summary', 'details',
    # div is very generic, but only its top-level text nodes are extracted,
    # which keeps it relatively safe.
    'div',
}

# 3. Translatable attributes (whitelist).
# Attributes whose values are normally human-readable text.
# Shape: {'tag_name': ['attr1', 'attr2'], ...}
SAFE_ATTRIBUTES: Dict[str, List[str]] = {
    'img': ['alt', 'title'],
    'a': ['title'],
    'input': ['placeholder', 'title'],
    'textarea': ['placeholder', 'title'],
    'abbr': ['title'],
    'area': ['alt'],
    # The title attribute is generally translatable on any tag.
    '*': ['title'],
}
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class HtmlTranslatorConfig(AiTranslatorConfig):
    """Configuration for the HTML translator.

    Attributes:
        insert_mode: How the translated text is written back.
            - "replace": the translation replaces the source text.
            - "append": the translation is added after the source text.
            - "prepend": the translation is added before the source text.
        separator: String placed between source text and translation in
            "append" / "prepend" mode.
    """
    insert_mode: Literal["replace", "append", "prepend"] = "replace"
    # A single space is likely the most suitable default separator for HTML.
    separator: str = " "
|
||||||
|
|
||||||
|
|
||||||
|
class HtmlTranslator(Translator):
    """Translate the human-readable text of an HTML document.

    A blacklist and two whitelists are combined so that page structure,
    scripts, styles and comments survive the translation untouched:

    1. Blacklist: tags in ``NON_TRANSLATABLE_TAGS`` (script, style, code, ...)
       and everything nested inside them are never extracted for translation.
       They are left in the document exactly as parsed rather than being
       removed, so the exported page keeps its head, styling, scripts and
       code samples.
    2. Whitelists: only direct text nodes of ``SAFE_TAGS`` and the attribute
       values listed in ``SAFE_ATTRIBUTES`` are sent to the translate agent.
    3. Comments: HTML comments are skipped explicitly.
    """

    def __init__(self, config: HtmlTranslatorConfig):
        """Create the translator and the segment-translation agent it uses.

        Args:
            config: Translator settings; the AI-related fields are forwarded
                to ``SegmentsTranslateAgentConfig``.
        """
        super().__init__(config=config)
        self.chunk_size = config.chunk_size
        agent_config = SegmentsTranslateAgentConfig(
            custom_prompt=config.custom_prompt,
            to_lang=config.to_lang,
            baseurl=config.base_url,
            key=config.api_key,
            model_id=config.model_id,
            system_prompt=None,
            temperature=config.temperature,
            thinking=config.thinking,
            max_concurrent=config.concurrent,
            timeout=config.timeout,
            logger=self.logger
        )
        self.translate_agent = SegmentsTranslateAgent(agent_config)
        self.insert_mode = config.insert_mode
        self.separator = config.separator

    @staticmethod
    def _is_protected(tag: Tag) -> bool:
        """Return True if ``tag`` is, or is nested inside, a blacklisted tag."""
        if tag.name in NON_TRANSLATABLE_TAGS:
            return True
        return any(ancestor.name in NON_TRANSLATABLE_TAGS for ancestor in tag.parents)

    def _pre_translate(self, document: Document) -> Tuple[BeautifulSoup, List[Dict], List[str]]:
        """Parse the document and collect every translatable text and attribute.

        Args:
            document: Document whose ``content`` holds the HTML to parse.

        Returns:
            A tuple ``(soup, translatable_items, original_texts)``. Both lists
            are index-aligned: ``translatable_items[i]`` describes where
            ``original_texts[i]`` came from, either
            ``{'type': 'node', 'object': <text node>}`` or
            ``{'type': 'attribute', 'tag': <tag>, 'attribute': <name>}``.
        """
        soup = BeautifulSoup(document.content, 'lxml')

        translatable_items: List[Dict] = []
        original_texts: List[str] = []
        wildcard_attributes = SAFE_ATTRIBUTES.get('*', [])

        for tag in soup.find_all(True):
            # Blacklisted subtrees stay in the tree but are never touched.
            if self._is_protected(tag):
                continue

            # --- Text nodes that are direct children of a safe tag ---
            # Only direct children are taken; nested tags are visited on their
            # own, which is what keeps inline markup intact.
            if tag.name in SAFE_TAGS:
                for child in list(tag.children):
                    # Comment is a NavigableString subclass, so exclude it
                    # explicitly; whitespace-only nodes are ignored as well.
                    if not isinstance(child, NavigableString) or isinstance(child, Comment):
                        continue
                    if not child.strip():
                        continue
                    translatable_items.append({'type': 'node', 'object': child})
                    original_texts.append(str(child))

            # --- Whitelisted attributes (tag-specific plus the '*' entry) ---
            # dict.fromkeys removes duplicates while keeping a stable order, so
            # the segment order does not depend on string hash randomisation.
            tag_attributes = SAFE_ATTRIBUTES.get(tag.name, [])
            for attr in dict.fromkeys(tag_attributes + wildcard_attributes):
                value = tag.get(attr)
                # The parser may return a list for multi-valued attributes;
                # only plain, non-blank strings are translated.
                if isinstance(value, str) and value.strip():
                    translatable_items.append({'type': 'attribute', 'tag': tag, 'attribute': attr})
                    original_texts.append(value)

        return soup, translatable_items, original_texts

    def _compose_content(self, item_type: str, original_text: str, translated_text: str) -> str:
        """Combine source and translated text according to ``self.insert_mode``."""
        if self.insert_mode == "replace":
            if item_type == 'node':
                # Keep the source's leading/trailing whitespace: it provides
                # the spacing between neighbouring inline elements.
                stripped_left = original_text.lstrip()
                leading_space = original_text[:len(original_text) - len(stripped_left)]
                trailing_space = original_text[len(original_text.rstrip()):]
                return leading_space + translated_text + trailing_space
            return translated_text
        if self.insert_mode == "append":
            return original_text + self.separator + translated_text
        if self.insert_mode == "prepend":
            return translated_text + self.separator + original_text

        self.logger.error(f"不正确的HtmlTranslatorConfig参数: insert_mode='{self.insert_mode}'")
        return original_text  # fall back to the source text on a bad mode

    def _after_translate(self, soup: BeautifulSoup, translatable_items: list,
                         translated_texts: list[str], original_texts: list[str]) -> bytes:
        """Write the translations back into ``soup`` and serialise it.

        Args:
            soup: Parsed document returned by ``_pre_translate``.
            translatable_items: Item descriptors from ``_pre_translate``.
            translated_texts: Translations, index-aligned with the items.
            original_texts: Source texts, index-aligned with the items.

        Returns:
            The document encoded as UTF-8 bytes. If the number of translations
            does not match the number of items, nothing is written back and
            the document is returned as parsed.
        """
        if len(translatable_items) != len(translated_texts):
            self.logger.error("翻译前后的文本片段数量不匹配 (%d vs %d),跳过写入操作以防损坏文件。",
                              len(translatable_items), len(translated_texts))
            return soup.encode('utf-8')

        for item, translated_text, original_text in zip(translatable_items, translated_texts, original_texts):
            item_type = item['type']
            new_content = self._compose_content(item_type, original_text, translated_text)

            if item_type == 'node':
                node = item['object']
                # Skip nodes that are no longer attached to the tree.
                if node.parent is not None:
                    node.replace_with(NavigableString(new_content))
            elif item_type == 'attribute':
                item['tag'][item['attribute']] = new_content

        return soup.encode('utf-8')

    def translate(self, document: Document) -> Self:
        """Translate ``document`` synchronously, updating ``document.content``.

        Returns:
            This translator, to allow call chaining.
        """
        soup, translatable_items, original_texts = self._pre_translate(document)
        if not translatable_items:
            self.logger.info("\nHTML文件中没有找到符合安全规则的可翻译内容。")
            # Nothing to translate: store the re-serialised parse result.
            document.content = soup.encode('utf-8')
            return self

        translated_texts = self.translate_agent.send_segments(original_texts, self.chunk_size)
        document.content = self._after_translate(soup, translatable_items, translated_texts, original_texts)
        return self

    async def translate_async(self, document: Document) -> Self:
        """Translate ``document`` asynchronously, updating ``document.content``.

        Parsing and write-back are run via ``asyncio.to_thread``.

        Returns:
            This translator, to allow call chaining.
        """
        soup, translatable_items, original_texts = await asyncio.to_thread(self._pre_translate, document)

        if not translatable_items:
            self.logger.info("\nHTML文件中没有找到符合安全规则的可翻译内容。")
            document.content = await asyncio.to_thread(soup.encode, 'utf-8')
            return self

        translated_texts = await self.translate_agent.send_segments_async(original_texts, self.chunk_size)
        document.content = await asyncio.to_thread(
            self._after_translate, soup, translatable_items, translated_texts, original_texts
        )
        return self
|
||||||
55
docutranslate/workflow/html_workflow.py
Normal file
55
docutranslate/workflow/html_workflow.py
Normal file
@@ -0,0 +1,55 @@
|
|||||||
|
from dataclasses import dataclass
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Self
|
||||||
|
|
||||||
|
from docutranslate.exporter.base import ExporterConfig
|
||||||
|
from docutranslate.exporter.html.html2html_exporter import Html2HtmlExporter
|
||||||
|
|
||||||
|
from docutranslate.ir.document import Document
|
||||||
|
from docutranslate.translator.ai_translator.html_translator import HtmlTranslatorConfig, HtmlTranslator
|
||||||
|
from docutranslate.workflow.base import Workflow, WorkflowConfig
|
||||||
|
from docutranslate.workflow.interfaces import HTMLExportable
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(kw_only=True)
class HtmlWorkflowConfig(WorkflowConfig):
    """Configuration for ``HtmlWorkflow``; all fields are keyword-only."""
    # Settings handed to the HtmlTranslator created by the workflow.
    translator_config: HtmlTranslatorConfig
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class HtmlWorkflow(Workflow[HtmlWorkflowConfig, Document, Document], HTMLExportable):
    """Workflow that translates an HTML document and exports it as HTML."""

    def __init__(self, config: HtmlWorkflowConfig):
        """Store the configuration and share the workflow logger.

        When ``config.logger`` is set, it is also assigned to the translator
        configuration so both report through the same logger.
        """
        super().__init__(config=config)
        if not config.logger:
            return
        translator_config = self.config.translator_config
        if translator_config:
            translator_config.logger = config.logger

    def _pre_translate(self, document_original: Document):
        """Return a working copy of the document and a fresh translator."""
        working_copy = document_original.copy()
        translator = HtmlTranslator(self.config.translator_config)
        return working_copy, translator

    def translate(self) -> Self:
        """Translate a copy of the original document synchronously.

        The result is stored in ``self.document_translated``.
        """
        working_copy, translator = self._pre_translate(self.document_original)
        translator.translate(working_copy)
        self.document_translated = working_copy
        return self

    async def translate_async(self) -> Self:
        """Translate a copy of the original document asynchronously.

        The result is stored in ``self.document_translated``.
        """
        working_copy, translator = self._pre_translate(self.document_original)
        await translator.translate_async(working_copy)
        self.document_translated = working_copy
        return self

    def export_to_html(self, _: ExporterConfig = None) -> str:
        """Return the exported HTML as text.

        The exporter-config argument is accepted but not used. The bytes are
        decoded with ``bytes.decode()``'s default encoding (UTF-8).
        """
        exported = self._export(Html2HtmlExporter())
        return exported.content.decode()

    def save_as_html(self, name: str = None, output_dir: Path | str = "./output",
                     _: ExporterConfig | None = None) -> Self:
        """Save the exported HTML through ``self._save``.

        Args:
            name: Output file name, forwarded to ``_save``.
            output_dir: Target directory, forwarded to ``_save``.
            _: Accepted but not used.
        """
        self._save(exporter=Html2HtmlExporter(), name=name, output_dir=output_dir)
        return self
|
||||||
Reference in New Issue
Block a user