增加html工作流

This commit is contained in:
xunbu
2025-08-18 18:44:08 +08:00
parent 943c540edc
commit fd061960f0
11 changed files with 383 additions and 13 deletions

View File

@@ -89,8 +89,10 @@ class Agent:
self.temperature = config.temperature
# self.client = httpx.Client(trust_env=False, proxy=None, verify=False)
# self.client_async = httpx.AsyncClient(trust_env=False, proxy=None, verify=False)
self.client = httpx.Client(verify=False) if USE_PROXY else httpx.Client(proxy=None, verify=False)
self.client_async = httpx.AsyncClient(verify=False) if USE_PROXY else httpx.AsyncClient(proxy=None,
self.client = httpx.Client(verify=False) if USE_PROXY else httpx.Client(trust_env=False, proxy=None,
verify=False)
self.client_async = httpx.AsyncClient(verify=False) if USE_PROXY else httpx.AsyncClient(trust_env=False,
proxy=None,
verify=False)
self.max_concurrent = config.max_concurrent
self.timeout = config.timeout

View File

@@ -28,6 +28,9 @@ from docutranslate.global_values.conditional_import import DOCLING_EXIST
from docutranslate.workflow.base import Workflow
from docutranslate.workflow.docx_workflow import DocxWorkflow, DocxWorkflowConfig
from docutranslate.workflow.epub_workflow import EpubWorkflow, EpubWorkflowConfig
# --- HTML WORKFLOW IMPORT START ---
from docutranslate.workflow.html_workflow import HtmlWorkflow, HtmlWorkflowConfig
# --- HTML WORKFLOW IMPORT END ---
from docutranslate.workflow.interfaces import DocxExportable, EpubExportable
from docutranslate.workflow.interfaces import HTMLExportable, MDFormatsExportable, TXTExportable, JsonExportable, \
XlsxExportable, SrtExportable
@@ -54,6 +57,9 @@ from docutranslate.translator.ai_translator.srt_translator import SrtTranslatorC
from docutranslate.exporter.srt.srt2html_exporter import Srt2HTMLExporterConfig
from docutranslate.translator.ai_translator.epub_translator import EpubTranslatorConfig
from docutranslate.exporter.epub.epub2html_exporter import Epub2HTMLExporterConfig
# --- HTML TRANSLATOR IMPORT START ---
from docutranslate.translator.ai_translator.html_translator import HtmlTranslatorConfig
# --- HTML TRANSLATOR IMPORT END ---
# ------------------------------------
from docutranslate.logger import global_logger
@@ -76,6 +82,7 @@ WORKFLOW_DICT: Dict[str, Type[Workflow]] = {
"docx": DocxWorkflow,
"srt": SrtWorkflow,
"epub": EpubWorkflow,
"html": HtmlWorkflow,
}
@@ -282,10 +289,24 @@ class EpubWorkflowParams(BaseWorkflowParams):
)
# --- HTML WORKFLOW PARAMS START ---
class HtmlWorkflowParams(BaseWorkflowParams):
    # Request parameters for the HTML translation workflow. The literal
    # `workflow_type` value is what selects this model inside the
    # discriminated `TranslatePayload` union.
    workflow_type: Literal['html'] = Field(
        ...,
        description="指定使用HTML的翻译工作流。",
    )
    # Where the translation goes relative to the source text.
    insert_mode: Literal["replace", "append", "prepend"] = Field(
        default="replace",
        description="翻译文本的插入模式。'replace':替换原文,'append':附加到原文后,'prepend':附加到原文前。",
    )
    # Text placed between source and translation in append/prepend mode.
    separator: str = Field(
        default=" ",
        description="当 insert_mode 为 'append''prepend' 时,用于分隔原文和译文的分隔符。",
    )
# --- HTML WORKFLOW PARAMS END ---
# 3. 使用可辨识联合类型Discriminated Union将它们组合起来
TranslatePayload = Annotated[
Union[
MarkdownWorkflowParams, TextWorkflowParams, JsonWorkflowParams, XlsxWorkflowParams, DocxWorkflowParams, SrtWorkflowParams, EpubWorkflowParams],
MarkdownWorkflowParams, TextWorkflowParams, JsonWorkflowParams, XlsxWorkflowParams, DocxWorkflowParams, SrtWorkflowParams, EpubWorkflowParams, HtmlWorkflowParams],
Field(discriminator='workflow_type')
]
@@ -293,7 +314,7 @@ TranslatePayload = Annotated[
# 4. 创建最终的请求体模型
class TranslateServiceRequest(BaseModel):
file_name: str = Field(..., description="上传的原始文件名,含扩展名。",
examples=["my_paper.pdf", "chapter1.txt", "data.xlsx", "video.srt", "my_book.epub"])
examples=["my_paper.pdf", "chapter1.txt", "data.xlsx", "video.srt", "my_book.epub", "index.html"])
file_content: str = Field(..., description="Base64编码的文件内容。", examples=["JVBERi0xLjQK..."])
payload: TranslatePayload = Field(..., description="包含工作流类型和相应参数的载荷。")
@@ -393,7 +414,24 @@ class TranslateServiceRequest(BaseModel):
"insert_mode": "replace",
}
}
},
# --- HTML EXAMPLE START ---
{
"summary": "HTML 工作流示例",
"value": {
"file_name": "company_about_us.html",
"file_content": "PGh0bWw+PGhlYWQ+PHRpdGxlPkFib3V0IFVzPC90aXRsZT48L2hlYWQ+PGJvZHk+PGgxPk91ciBDb21wYW55PC9oMT48cD5XZSBhcmUgYSBsZWFkaW5nIHByb3ZpZGVyIG9mIGlubm92YXRpdmUgc29sdXRpb25zLjwvcD48L2JvZHk+PC9odG1sPg==",
"payload": {
"workflow_type": "html",
"base_url": "https://api.openai.com/v1",
"api_key": "sk-your-api-key-here",
"model_id": "gpt-4o",
"to_lang": "简体中文",
"insert_mode": "replace"
}
}
}
# --- HTML EXAMPLE END ---
]
}
@@ -552,6 +590,23 @@ async def _perform_translation(
)
workflow = EpubWorkflow(config=workflow_config)
# --- HTML WORKFLOW LOGIC START ---
elif isinstance(payload, HtmlWorkflowParams):
task_logger.info("构建 HtmlWorkflow 配置。")
translator_config = HtmlTranslatorConfig(
**payload.model_dump(include={
'base_url', 'api_key', 'model_id', 'to_lang', 'custom_prompt',
'temperature', 'thinking', 'chunk_size', 'concurrent',
'insert_mode', 'separator'
}, exclude_none=True)
)
workflow_config = HtmlWorkflowConfig(
translator_config=translator_config,
logger=task_logger
)
workflow = HtmlWorkflow(config=workflow_config)
# --- HTML WORKFLOW LOGIC END ---
else:
raise TypeError(f"工作流类型 '{payload.workflow_type}' 的处理逻辑未实现。")
@@ -678,7 +733,7 @@ def _cancel_translation_logic(task_id: str):
description="""
接收一个包含文件内容Base64编码和工作流参数的JSON请求启动一个后台翻译任务。
- **工作流选择**: 请求体中的 `payload.workflow_type` 字段决定了本次任务的类型(如 `markdown_based`, `txt`, `json`, `xlsx`, `docx`, `srt`, `epub`)。
- **工作流选择**: 请求体中的 `payload.workflow_type` 字段决定了本次任务的类型(如 `markdown_based`, `txt`, `json`, `xlsx`, `docx`, `srt`, `epub`, `html`)。
- **动态参数**: 根据所选工作流API需要不同的参数集。请参考下面的Schema或示例。
- **异步处理**: 此端点会立即返回任务ID客户端需轮询状态接口获取进度。
""",
@@ -815,6 +870,21 @@ async def service_release_task(task_id: str):
}
}
},
# --- HTML STATUS EXAMPLE START ---
"completed_html": {
"summary": "已完成 (HTML)",
"value": {
"task_id": "a1b2c3d4", "is_processing": False,
"status_message": "翻译成功!用时 15.78 秒。",
"error_flag": False, "download_ready": True, "original_filename_stem": "about_us",
"original_filename": "about_us.html", "task_start_time": 1678890100.0,
"task_end_time": 1678890115.78,
"downloads": {
"html": "/service/download/a1b2c3d4/html"
}
}
},
# --- HTML STATUS EXAMPLE END ---
"error": {
"summary": "失败",
"value": {
@@ -935,6 +1005,7 @@ async def _get_content_from_workflow(task_id: str, file_type: FileType) -> tuple
html_config = Srt2HTMLExporterConfig(cdn=is_cdn_available)
elif isinstance(workflow, EpubWorkflow):
html_config = Epub2HTMLExporterConfig(cdn=is_cdn_available)
# No special html_config for HtmlWorkflow as it doesn't use these preview-oriented features
if file_type == 'html' and isinstance(workflow, HTMLExportable):
content_str = await asyncio.to_thread(workflow.export_to_html, html_config)

View File

@@ -38,7 +38,6 @@ else:
client_async = httpx.AsyncClient(trust_env=False, timeout=timeout, proxy=None, verify=False)
class ConverterMineru(X2MarkdownConverter):
def __init__(self, config: ConverterMineruConfig):
super().__init__(config=config)

View File

View File

@@ -0,0 +1,8 @@
from docutranslate.exporter.base import Exporter
from docutranslate.ir.document import Document
# TODO: decide whether HTML needs its own dedicated Document type.
class HtmlExporter(Exporter[Document]):
    """Common base for exporters that take an HTML ``Document``.

    ``export`` is a placeholder here; concrete exporters override it.
    """

    def export(self, document: Document) -> Document:
        ...

View File

@@ -0,0 +1,11 @@
from docutranslate.exporter.base import ExporterConfig
from docutranslate.exporter.html.base import HtmlExporter
from docutranslate.ir.document import Document
class Html2HtmlExporter(HtmlExporter):
    """Exporter that re-wraps an HTML document as an ``.html`` document."""

    def __init__(self, config: ExporterConfig | None = None):
        super().__init__(config=config)

    def export(self, document: Document) -> Document:
        """Return a new ``Document`` with the same bytes and stem, suffix ``.html``."""
        payload = document.content
        stem = document.stem
        return Document.from_bytes(content=payload, suffix=".html", stem=stem)

View File

@@ -1,8 +1,6 @@
import os
from .conditional_import import available_packages,conditional_import
from .conditional_import import available_packages, conditional_import
USE_PROXY=False
if os.getenv("DOCUTRANSLATE_USE_PROXY") and os.getenv("DOCUTRANSLATE_USE_PROXY").lower()=="true":
USE_PROXY=True
# True only when DOCUTRANSLATE_USE_PROXY is set to "true" (case-insensitive);
# an unset, empty, or any other value yields False.
USE_PROXY = os.getenv("DOCUTRANSLATE_USE_PROXY", "").lower() == "true"

View File

@@ -12,6 +12,7 @@
"workflowOptionXlsx": "XLSX翻译 (.xlsx)",
"workflowOptionSrt": "SRT字幕翻译 (.srt)",
"workflowOptionEpub": "EPUB翻译 (.epub)",
"workflowOptionHtml": "HTML翻译 (.html)",
"autoWorkflowLabel": "自动选择工作流",
"docxSettingsTitleText": "DOCX翻译选项",
"insertModeLabel": "插入模式",
@@ -29,6 +30,8 @@
"insertModeHelpSrt": "选择如何将翻译后的文本插入。",
"epubSettingsTitleText": "EPUB翻译选项",
"insertModeHelpEpub": "选择如何将翻译后的文本插入。",
"htmlSettingsTitleText": "HTML翻译选项",
"insertModeHelpHtml": "选择如何将翻译后的文本插入。",
"jsonSettingsTitleText": "JSON路径配置",
"jsonPathLabel": "需要翻译的JSON路径",
"jsonPathPlaceholder": "每行一个路径, 例如:\n$.name\n$.*",
@@ -145,6 +148,7 @@
"workflowOptionXlsx": "XLSX (.xlsx)",
"workflowOptionSrt": "SRT Subtitle (.srt)",
"workflowOptionEpub": "EPUB (.epub)",
"workflowOptionHtml": "HTML (.html)",
"autoWorkflowLabel": "Auto-select workflow",
"docxSettingsTitleText": "DOCX Translation Options",
"insertModeLabel": "Insert Mode",
@@ -162,6 +166,8 @@
"insertModeHelpSrt": "Choose how to insert the translated text.",
"epubSettingsTitleText": "EPUB Translation Options",
"insertModeHelpEpub": "Choose how to insert the translated text.",
"htmlSettingsTitleText": "HTML Translation Options",
"insertModeHelpHtml": "Choose how to insert the translated text.",
"jsonSettingsTitleText": "JSON Path Configuration",
"jsonPathLabel": "JSON paths to translate",
"jsonPathPlaceholder": "One path per line, e.g.:\n$.name\n$.*",

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,220 @@
import asyncio
from dataclasses import dataclass
from typing import Self, Literal, Set, Dict, List, Tuple
from bs4 import BeautifulSoup, NavigableString, Tag, Comment
from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent
from docutranslate.ir.document import Document
from docutranslate.translator.ai_translator.base import AiTranslatorConfig
from docutranslate.translator.base import Translator
# --- Rule definitions ---

# 1. Non-translatable tags (blacklist).
# These elements usually hold code, styling or metadata. They are kept in the
# output untouched: neither they nor anything nested inside them is ever
# offered for translation.
NON_TRANSLATABLE_TAGS: Set[str] = {
    'script',    # JavaScript code
    'style',     # CSS rules
    'pre',       # preformatted text, typically code blocks
    'code',      # inline code
    'kbd',       # keyboard input
    'samp',      # sample output
    'var',       # variables
    'noscript',  # fallback content when scripting is disabled
    'meta',      # metadata
    'link',      # external resource links
    'head',      # document head, normally no visible translatable content
}

# 2. Translatable tags (whitelist).
# Only text nodes that are *direct* children of these tags are translated.
# Combined with the blacklist above this gives two layers of protection.
SAFE_TAGS: Set[str] = {
    'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    'li', 'blockquote', 'q', 'caption',
    'span', 'a', 'strong', 'em', 'b', 'i', 'u',
    'td', 'th',
    'button', 'label', 'legend', 'option',
    'figcaption', 'summary', 'details',
    'div',  # generic container; only its top-level text nodes are extracted
}

# 3. Translatable attributes (whitelist).
# Attributes whose values are normally human-readable text.
# Format: {'tag_name': ['attr1', 'attr2'], ...}
SAFE_ATTRIBUTES: Dict[str, List[str]] = {
    'img': ['alt', 'title'],
    'a': ['title'],
    'input': ['placeholder', 'title'],
    'textarea': ['placeholder', 'title'],
    'abbr': ['title'],
    'area': ['alt'],
    # The title attribute is treated as translatable on every tag.
    '*': ['title'],
}


@dataclass
class HtmlTranslatorConfig(AiTranslatorConfig):
    """Configuration for :class:`HtmlTranslator`.

    Attributes:
        insert_mode: How the translation is inserted.
            - "replace": the translation replaces the source text.
            - "append": the translation is added after the source text.
            - "prepend": the translation is added before the source text.
        separator: String placed between source text and translation in
            "append" / "prepend" mode.
    """
    insert_mode: Literal["replace", "append", "prepend"] = "replace"
    separator: str = " "  # a plain space is the default separator


class HtmlTranslator(Translator):
    """Translator for the textual content of an HTML document.

    Strategy:
    1. Blacklist: elements named in ``NON_TRANSLATABLE_TAGS`` (and everything
       inside them) are skipped, but stay in the document unchanged.
    2. Whitelist: in the rest of the tree, only direct text children of
       ``SAFE_TAGS`` and the attributes listed in ``SAFE_ATTRIBUTES`` are
       extracted for translation.
    3. Comments: HTML comments are never extracted.
    """

    def __init__(self, config: HtmlTranslatorConfig):
        super().__init__(config=config)
        self.chunk_size = config.chunk_size
        agent_config = SegmentsTranslateAgentConfig(
            custom_prompt=config.custom_prompt,
            to_lang=config.to_lang,
            baseurl=config.base_url,
            key=config.api_key,
            model_id=config.model_id,
            system_prompt=None,
            temperature=config.temperature,
            thinking=config.thinking,
            max_concurrent=config.concurrent,
            timeout=config.timeout,
            logger=self.logger
        )
        self.translate_agent = SegmentsTranslateAgent(agent_config)
        self.insert_mode = config.insert_mode
        self.separator = config.separator

    @staticmethod
    def _is_protected(tag: Tag) -> bool:
        """Return True if *tag* is a blacklisted element or nested inside one."""
        if tag.name in NON_TRANSLATABLE_TAGS:
            return True
        return any(parent.name in NON_TRANSLATABLE_TAGS for parent in tag.parents)

    def _pre_translate(self, document: Document) -> Tuple[BeautifulSoup, List[Dict], List[str]]:
        """Parse the document and collect every translatable text node and attribute.

        Blacklisted elements are left in place and merely skipped. Removing
        them (as an earlier version did) stripped <head>, styles, scripts and
        code blocks from the translated output.

        Returns:
            A tuple ``(soup, translatable_items, original_texts)``; the two
            lists are index-aligned. Each item is either
            ``{'type': 'node', 'object': <NavigableString>}`` or
            ``{'type': 'attribute', 'tag': <Tag>, 'attribute': <name>}``.
        """
        soup = BeautifulSoup(document.content, 'lxml')

        translatable_items: List[Dict] = []
        original_texts: List[str] = []

        for tag in soup.find_all(True):
            # Blacklist: never touch protected elements or their descendants.
            if self._is_protected(tag):
                continue

            # --- Text nodes directly inside a whitelisted tag ---
            if tag.name in SAFE_TAGS:
                # Only direct children are considered, so nested markup
                # (and therefore styling) is preserved as-is.
                for child in list(tag.children):
                    # Comment is a NavigableString subclass and must be excluded explicitly.
                    if (isinstance(child, NavigableString)
                            and not isinstance(child, Comment)
                            and child.strip()):
                        translatable_items.append({'type': 'node', 'object': child})
                        original_texts.append(str(child))

            # --- Whitelisted attributes ---
            # dict.fromkeys de-duplicates while keeping a deterministic order,
            # so the segment order sent to the agent is stable between runs.
            candidate_attrs = dict.fromkeys(
                SAFE_ATTRIBUTES.get(tag.name, []) + SAFE_ATTRIBUTES.get('*', [])
            )
            for attr in candidate_attrs:
                value = tag.get(attr)
                # Skip missing, blank, or non-string (multi-valued) attribute values.
                if isinstance(value, str) and value.strip():
                    translatable_items.append({'type': 'attribute', 'tag': tag, 'attribute': attr})
                    original_texts.append(value)

        return soup, translatable_items, original_texts

    def _after_translate(self, soup: BeautifulSoup, translatable_items: list,
                         translated_texts: list[str], original_texts: list[str]) -> bytes:
        """Write the translations back into *soup* and return it as UTF-8 bytes.

        If the number of translations does not match the number of extracted
        items, nothing is written and the parsed document is returned as-is.
        """
        if len(translatable_items) != len(translated_texts):
            self.logger.error("翻译前后的文本片段数量不匹配 (%d vs %d),跳过写入操作以防损坏文件。",
                              len(translatable_items), len(translated_texts))
            return soup.encode('utf-8')

        for item, translated_text, original_text in zip(
                translatable_items, translated_texts, original_texts):
            if self.insert_mode == "replace":
                if item['type'] == 'node':
                    # Keep the source's surrounding whitespace: it controls the
                    # spacing between neighbouring inline elements.
                    leading_space = original_text[:len(original_text) - len(original_text.lstrip())]
                    trailing_space = original_text[len(original_text.rstrip()):]
                    new_content = leading_space + translated_text + trailing_space
                else:  # attribute
                    new_content = translated_text
            elif self.insert_mode == "append":
                new_content = original_text + self.separator + translated_text
            elif self.insert_mode == "prepend":
                new_content = translated_text + self.separator + original_text
            else:
                self.logger.error(f"不正确的HtmlTranslatorConfig参数: insert_mode='{self.insert_mode}'")
                new_content = original_text  # fall back to the source text

            # Write the content back according to the item type.
            if item['type'] == 'node':
                node = item['object']
                # Only replace nodes that are still attached to the tree.
                if node.parent is not None:
                    node.replace_with(NavigableString(new_content))
            elif item['type'] == 'attribute':
                item['tag'][item['attribute']] = new_content

        return soup.encode('utf-8')

    def translate(self, document: Document) -> Self:
        """Translate *document* in place (synchronous) and return ``self``."""
        soup, translatable_items, original_texts = self._pre_translate(document)
        if not translatable_items:
            self.logger.info("\nHTML文件中没有找到符合安全规则的可翻译内容。")
            # Nothing to translate: store the parsed document re-serialised as UTF-8.
            document.content = soup.encode('utf-8')
            return self

        translated_texts = self.translate_agent.send_segments(original_texts, self.chunk_size)
        document.content = self._after_translate(soup, translatable_items, translated_texts, original_texts)
        return self

    async def translate_async(self, document: Document) -> Self:
        """Translate *document* in place (asynchronous) and return ``self``.

        Parsing and serialisation run in worker threads via
        ``asyncio.to_thread``.
        """
        soup, translatable_items, original_texts = await asyncio.to_thread(self._pre_translate, document)

        if not translatable_items:
            self.logger.info("\nHTML文件中没有找到符合安全规则的可翻译内容。")
            document.content = await asyncio.to_thread(soup.encode, 'utf-8')
            return self

        translated_texts = await self.translate_agent.send_segments_async(original_texts, self.chunk_size)
        document.content = await asyncio.to_thread(
            self._after_translate, soup, translatable_items, translated_texts, original_texts
        )
        return self

View File

@@ -0,0 +1,55 @@
from dataclasses import dataclass
from pathlib import Path
from typing import Self
from docutranslate.exporter.base import ExporterConfig
from docutranslate.exporter.html.html2html_exporter import Html2HtmlExporter
from docutranslate.ir.document import Document
from docutranslate.translator.ai_translator.html_translator import HtmlTranslatorConfig, HtmlTranslator
from docutranslate.workflow.base import Workflow, WorkflowConfig
from docutranslate.workflow.interfaces import HTMLExportable
@dataclass(kw_only=True)
class HtmlWorkflowConfig(WorkflowConfig):
    """Configuration for ``HtmlWorkflow``; carries the HTML translator settings."""

    translator_config: HtmlTranslatorConfig
class HtmlWorkflow(Workflow[HtmlWorkflowConfig, Document, Document], HTMLExportable):
    """Workflow that translates an HTML document and exports it as HTML."""

    def __init__(self, config: HtmlWorkflowConfig):
        super().__init__(config=config)
        # Without a workflow-level logger there is nothing to propagate.
        if not config.logger:
            return
        translator_config = self.config.translator_config
        if translator_config:
            translator_config.logger = config.logger

    def _pre_translate(self, document_original: Document):
        """Return a copy of the source document plus a fresh translator for it."""
        working_copy = document_original.copy()
        return working_copy, HtmlTranslator(self.config.translator_config)

    def translate(self) -> Self:
        """Translate a copy of the original document and store the result."""
        working_copy, html_translator = self._pre_translate(self.document_original)
        html_translator.translate(working_copy)
        self.document_translated = working_copy
        return self

    async def translate_async(self) -> Self:
        """Asynchronous counterpart of :meth:`translate`."""
        working_copy, html_translator = self._pre_translate(self.document_original)
        await html_translator.translate_async(working_copy)
        self.document_translated = working_copy
        return self

    def export_to_html(self, _: ExporterConfig = None) -> str:
        """Return the exported document as text; the config argument is ignored."""
        return self._export(Html2HtmlExporter()).content.decode()

    def save_as_html(self, name: str = None, output_dir: Path | str = "./output",
                     _: ExporterConfig | None = None) -> Self:
        """Save the exported HTML via ``_save``; the config argument is ignored."""
        self._save(exporter=Html2HtmlExporter(), name=name, output_dir=output_dir)
        return self