From fd061960f0ced06d2d6b49ce5b8ddb38d30c0e73 Mon Sep 17 00:00:00 2001 From: xunbu Date: Mon, 18 Aug 2025 18:44:08 +0800 Subject: [PATCH] =?UTF-8?q?=E5=A2=9E=E5=8A=A0html=E5=B7=A5=E4=BD=9C?= =?UTF-8?q?=E6=B5=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docutranslate/agents/agent.py | 6 +- docutranslate/app.py | 79 ++++++- .../converter/x2md/converter_mineru.py | 1 - docutranslate/exporter/html/__init__.py | 0 docutranslate/exporter/html/base.py | 8 + .../exporter/html/html2html_exporter.py | 11 + docutranslate/global_values/__init__.py | 8 +- docutranslate/static/i18nData.json | 6 + docutranslate/static/index.html | 2 +- .../ai_translator/html_translator.py | 220 ++++++++++++++++++ docutranslate/workflow/html_workflow.py | 55 +++++ 11 files changed, 383 insertions(+), 13 deletions(-) create mode 100644 docutranslate/exporter/html/__init__.py create mode 100644 docutranslate/exporter/html/base.py create mode 100644 docutranslate/exporter/html/html2html_exporter.py create mode 100644 docutranslate/translator/ai_translator/html_translator.py create mode 100644 docutranslate/workflow/html_workflow.py diff --git a/docutranslate/agents/agent.py b/docutranslate/agents/agent.py index 69c63e1..f29d873 100644 --- a/docutranslate/agents/agent.py +++ b/docutranslate/agents/agent.py @@ -89,8 +89,10 @@ class Agent: self.temperature = config.temperature # self.client = httpx.Client(trust_env=False, proxy=None, verify=False) # self.client_async = httpx.AsyncClient(trust_env=False, proxy=None, verify=False) - self.client = httpx.Client(verify=False) if USE_PROXY else httpx.Client(proxy=None, verify=False) - self.client_async = httpx.AsyncClient(verify=False) if USE_PROXY else httpx.AsyncClient(proxy=None, + self.client = httpx.Client(verify=False) if USE_PROXY else httpx.Client(trust_env=False, proxy=None, + verify=False) + self.client_async = httpx.AsyncClient(verify=False) if USE_PROXY else httpx.AsyncClient(trust_env=False, + proxy=None, verify=False) self.max_concurrent = config.max_concurrent self.timeout = config.timeout diff --git a/docutranslate/app.py b/docutranslate/app.py index a008fd4..e5c264e 100644 --- a/docutranslate/app.py +++ b/docutranslate/app.py @@ -28,6 +28,9 @@ from docutranslate.global_values.conditional_import import DOCLING_EXIST from docutranslate.workflow.base import Workflow from docutranslate.workflow.docx_workflow import DocxWorkflow, DocxWorkflowConfig from docutranslate.workflow.epub_workflow import EpubWorkflow, EpubWorkflowConfig +# --- HTML WORKFLOW IMPORT START --- +from docutranslate.workflow.html_workflow import HtmlWorkflow, HtmlWorkflowConfig +# --- HTML WORKFLOW IMPORT END --- from docutranslate.workflow.interfaces import DocxExportable, EpubExportable from docutranslate.workflow.interfaces import HTMLExportable, MDFormatsExportable, TXTExportable, JsonExportable, \ XlsxExportable, SrtExportable @@ -54,6 +57,9 @@ from docutranslate.translator.ai_translator.srt_translator import SrtTranslatorC from docutranslate.exporter.srt.srt2html_exporter import Srt2HTMLExporterConfig from docutranslate.translator.ai_translator.epub_translator import EpubTranslatorConfig from docutranslate.exporter.epub.epub2html_exporter import Epub2HTMLExporterConfig +# --- HTML TRANSLATOR IMPORT START --- +from docutranslate.translator.ai_translator.html_translator import HtmlTranslatorConfig +# --- HTML TRANSLATOR IMPORT END --- # ------------------------------------ from docutranslate.logger import global_logger @@ -76,6 +82,7 @@ WORKFLOW_DICT: Dict[str, Type[Workflow]] = { "docx": DocxWorkflow, "srt": SrtWorkflow, "epub": EpubWorkflow, + "html": HtmlWorkflow, } @@ -282,10 +289,24 @@ class EpubWorkflowParams(BaseWorkflowParams): ) +# --- HTML WORKFLOW PARAMS START --- +class HtmlWorkflowParams(BaseWorkflowParams): + workflow_type: Literal['html'] = Field(..., description="指定使用HTML的翻译工作流。") + insert_mode: Literal["replace", "append", "prepend"] = Field( + "replace", + description="翻译文本的插入模式。'replace':替换原文,'append':附加到原文后,'prepend':附加到原文前。" + ) + separator: str = Field( + " ", + description="当 insert_mode 为 'append' 或 'prepend' 时,用于分隔原文和译文的分隔符。" + ) +# --- HTML WORKFLOW PARAMS END --- + + # 3. 使用可辨识联合类型(Discriminated Union)将它们组合起来 TranslatePayload = Annotated[ Union[ - MarkdownWorkflowParams, TextWorkflowParams, JsonWorkflowParams, XlsxWorkflowParams, DocxWorkflowParams, SrtWorkflowParams, EpubWorkflowParams], + MarkdownWorkflowParams, TextWorkflowParams, JsonWorkflowParams, XlsxWorkflowParams, DocxWorkflowParams, SrtWorkflowParams, EpubWorkflowParams, HtmlWorkflowParams], Field(discriminator='workflow_type') ] @@ -293,7 +314,7 @@ TranslatePayload = Annotated[ # 4. 创建最终的请求体模型 class TranslateServiceRequest(BaseModel): file_name: str = Field(..., description="上传的原始文件名,含扩展名。", - examples=["my_paper.pdf", "chapter1.txt", "data.xlsx", "video.srt", "my_book.epub"]) + examples=["my_paper.pdf", "chapter1.txt", "data.xlsx", "video.srt", "my_book.epub", "index.html"]) file_content: str = Field(..., description="Base64编码的文件内容。", examples=["JVBERi0xLjQK..."]) payload: TranslatePayload = Field(..., description="包含工作流类型和相应参数的载荷。") @@ -393,7 +414,24 @@ class TranslateServiceRequest(BaseModel): "insert_mode": "replace", } } + }, + # --- HTML EXAMPLE START --- + { + "summary": "HTML 工作流示例", + "value": { + "file_name": "company_about_us.html", + "file_content": "PGh0bWw+PGhlYWQ+PHRpdGxlPkFib3V0IFVzPC90aXRsZT48L2hlYWQ+PGJvZHk+PGgxPk91ciBDb21wYW55PC9oMT48cD5XZSBhcmUgYSBsZWFkaW5nIHByb3ZpZGVyIG9mIGlubm92YXRpdmUgc29sdXRpb25zLjwvcD48L2JvZHk+PC9odG1sPg==", + "payload": { + "workflow_type": "html", + "base_url": "https://api.openai.com/v1", + "api_key": "sk-your-api-key-here", + "model_id": "gpt-4o", + "to_lang": "简体中文", + "insert_mode": "replace" + } + } } + # --- HTML EXAMPLE END --- ] } @@ -552,6 +590,23 @@ async def _perform_translation( ) workflow = EpubWorkflow(config=workflow_config) + # --- HTML WORKFLOW LOGIC START --- + elif isinstance(payload, HtmlWorkflowParams): + task_logger.info("构建 HtmlWorkflow 配置。") + translator_config = HtmlTranslatorConfig( + **payload.model_dump(include={ + 'base_url', 'api_key', 'model_id', 'to_lang', 'custom_prompt', + 'temperature', 'thinking', 'chunk_size', 'concurrent', + 'insert_mode', 'separator' + }, exclude_none=True) + ) + workflow_config = HtmlWorkflowConfig( + translator_config=translator_config, + logger=task_logger + ) + workflow = HtmlWorkflow(config=workflow_config) + # --- HTML WORKFLOW LOGIC END --- + else: raise TypeError(f"工作流类型 '{payload.workflow_type}' 的处理逻辑未实现。") @@ -678,7 +733,7 @@ def _cancel_translation_logic(task_id: str): description=""" 接收一个包含文件内容(Base64编码)和工作流参数的JSON请求,启动一个后台翻译任务。 -- **工作流选择**: 请求体中的 `payload.workflow_type` 字段决定了本次任务的类型(如 `markdown_based`, `txt`, `json`, `xlsx`, `docx`, `srt`, `epub`)。 +- **工作流选择**: 请求体中的 `payload.workflow_type` 字段决定了本次任务的类型(如 `markdown_based`, `txt`, `json`, `xlsx`, `docx`, `srt`, `epub`, `html`)。 - **动态参数**: 根据所选工作流,API需要不同的参数集。请参考下面的Schema或示例。 - **异步处理**: 此端点会立即返回任务ID,客户端需轮询状态接口获取进度。 """, @@ -815,6 +870,21 @@ async def service_release_task(task_id: str): } } }, + # --- HTML STATUS EXAMPLE START --- + "completed_html": { + "summary": "已完成 (HTML)", + "value": { + "task_id": "a1b2c3d4", "is_processing": False, + "status_message": "翻译成功!用时 15.78 秒。", + "error_flag": False, "download_ready": True, "original_filename_stem": "about_us", + "original_filename": "about_us.html", "task_start_time": 1678890100.0, + "task_end_time": 1678890115.78, + "downloads": { + "html": "/service/download/a1b2c3d4/html" + } + } + }, + # --- HTML STATUS EXAMPLE END --- "error": { "summary": "失败", "value": { @@ -935,6 +1005,7 @@ async def _get_content_from_workflow(task_id: str, file_type: FileType) -> tuple html_config = Srt2HTMLExporterConfig(cdn=is_cdn_available) elif isinstance(workflow, EpubWorkflow): html_config = Epub2HTMLExporterConfig(cdn=is_cdn_available) + # No special html_config for HtmlWorkflow as it doesn't use these preview-oriented features if file_type == 'html' and isinstance(workflow, HTMLExportable): content_str = await asyncio.to_thread(workflow.export_to_html, html_config) @@ -1191,4 +1262,4 @@ def run_app(port: int | None = None): if __name__ == "__main__": - run_app() + run_app() \ No newline at end of file diff --git a/docutranslate/converter/x2md/converter_mineru.py b/docutranslate/converter/x2md/converter_mineru.py index 69ab5c9..658111b 100644 --- a/docutranslate/converter/x2md/converter_mineru.py +++ b/docutranslate/converter/x2md/converter_mineru.py @@ -38,7 +38,6 @@ else: client_async = httpx.AsyncClient(trust_env=False, timeout=timeout, proxy=None, verify=False) - class ConverterMineru(X2MarkdownConverter): def __init__(self, config: ConverterMineruConfig): super().__init__(config=config) diff --git a/docutranslate/exporter/html/__init__.py b/docutranslate/exporter/html/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/docutranslate/exporter/html/base.py b/docutranslate/exporter/html/base.py new file mode 100644 index 0000000..6f594b0 --- /dev/null +++ b/docutranslate/exporter/html/base.py @@ -0,0 +1,8 @@ +from docutranslate.exporter.base import Exporter +from docutranslate.ir.document import Document + +#TODO:看情况是否需要为json单独写一个document类型 +class HtmlExporter(Exporter[Document]): + + def export(self,document:Document)->Document: + ... \ No newline at end of file diff --git a/docutranslate/exporter/html/html2html_exporter.py b/docutranslate/exporter/html/html2html_exporter.py new file mode 100644 index 0000000..6e376e0 --- /dev/null +++ b/docutranslate/exporter/html/html2html_exporter.py @@ -0,0 +1,11 @@ +from docutranslate.exporter.base import ExporterConfig +from docutranslate.exporter.html.base import HtmlExporter +from docutranslate.ir.document import Document + + +class Html2HtmlExporter(HtmlExporter): + def __init__(self, config: ExporterConfig|None = None): + super().__init__(config=config) + + def export(self, document: Document) -> Document: + return Document.from_bytes(content=document.content, suffix=".html", stem=document.stem) diff --git a/docutranslate/global_values/__init__.py b/docutranslate/global_values/__init__.py index 1aa2bf0..5d1ae46 100644 --- a/docutranslate/global_values/__init__.py +++ b/docutranslate/global_values/__init__.py @@ -1,8 +1,6 @@ import os -from .conditional_import import available_packages,conditional_import +from .conditional_import import available_packages, conditional_import - -USE_PROXY=False -if os.getenv("DOCUTRANSLATE_USE_PROXY") and os.getenv("DOCUTRANSLATE_USE_PROXY").lower()=="true": - USE_PROXY=True +USE_PROXY = True if (os.getenv("DOCUTRANSLATE_USE_PROXY") and os.getenv( + "DOCUTRANSLATE_USE_PROXY").lower() == "true") else False diff --git a/docutranslate/static/i18nData.json b/docutranslate/static/i18nData.json index 4239774..cb88949 100644 --- a/docutranslate/static/i18nData.json +++ b/docutranslate/static/i18nData.json @@ -12,6 +12,7 @@ "workflowOptionXlsx": "XLSX翻译 (.xlsx)", "workflowOptionSrt": "SRT字幕翻译 (.srt)", "workflowOptionEpub": "EPUB翻译 (.epub)", + "workflowOptionHtml": "HTML翻译 (.html)", "autoWorkflowLabel": "自动选择工作流", "docxSettingsTitleText": "DOCX翻译选项", "insertModeLabel": "插入模式", @@ -29,6 +30,8 @@ "insertModeHelpSrt": "选择如何将翻译后的文本插入。", "epubSettingsTitleText": "EPUB翻译选项", "insertModeHelpEpub": "选择如何将翻译后的文本插入。", + "htmlSettingsTitleText": "HTML翻译选项", + "insertModeHelpHtml": "选择如何将翻译后的文本插入。", "jsonSettingsTitleText": "JSON路径配置", "jsonPathLabel": "需要翻译的JSON路径", "jsonPathPlaceholder": "每行一个路径, 例如:\n$.name\n$.*", @@ -145,6 +148,7 @@ "workflowOptionXlsx": "XLSX (.xlsx)", "workflowOptionSrt": "SRT Subtitle (.srt)", "workflowOptionEpub": "EPUB (.epub)", + "workflowOptionHtml": "HTML (.html)", "autoWorkflowLabel": "Auto-select workflow", "docxSettingsTitleText": "DOCX Translation Options", "insertModeLabel": "Insert Mode", @@ -162,6 +166,8 @@ "insertModeHelpSrt": "Choose how to insert the translated text.", "epubSettingsTitleText": "EPUB Translation Options", "insertModeHelpEpub": "Choose how to insert the translated text.", + "htmlSettingsTitleText": "HTML Translation Options", + "insertModeHelpHtml": "Choose how to insert the translated text.", "jsonSettingsTitleText": "JSON Path Configuration", "jsonPathLabel": "JSON paths to translate", "jsonPathPlaceholder": "One path per line, e.g.:\n$.name\n$.*", diff --git a/docutranslate/static/index.html b/docutranslate/static/index.html index 3633645..f046344 100644 --- a/docutranslate/static/index.html +++ b/docutranslate/static/index.html @@ -1 +1 @@ - DocuTranslate - 交互式文档翻译

DocuTranslate

如果上传的文件本身是.md格式,此项可不选。

GitHub主页(欢迎star❤):
https://github.com/xunbu/docutranslate

交流QQ群: 1047781902

任务列表

当前没有任务,点击“新建任务”开始吧!

预览
原文
译文
\ No newline at end of file + DocuTranslate - 交互式文档翻译

DocuTranslate

如果上传的文件本身是.md格式,此项可不选。

GitHub主页(欢迎star❤):
https://github.com/xunbu/docutranslate

交流QQ群: 1047781902

任务列表

当前没有任务,点击“新建任务”开始吧!

预览
原文
译文
\ No newline at end of file diff --git a/docutranslate/translator/ai_translator/html_translator.py b/docutranslate/translator/ai_translator/html_translator.py new file mode 100644 index 0000000..a6a867f --- /dev/null +++ b/docutranslate/translator/ai_translator/html_translator.py @@ -0,0 +1,220 @@ +import asyncio +from dataclasses import dataclass +from typing import Self, Literal, Set, Dict, List, Tuple + +from bs4 import BeautifulSoup, NavigableString, Tag, Comment + +from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent +from docutranslate.ir.document import Document +from docutranslate.translator.ai_translator.base import AiTranslatorConfig +from docutranslate.translator.base import Translator + +# --- 规则定义 --- + +# 1. 不可翻译标签(黑名单) +# 这些标签及其内容在任何情况下都不应被翻译,因为它们通常包含代码、样式或元数据。 +# 在预处理阶段,这些标签及其所有子元素将被直接从文档中移除,以确保它们不会被意外修改。 +NON_TRANSLATABLE_TAGS: Set[str] = { + 'script', # JavaScript代码 + 'style', # CSS样式 + 'pre', # 预格式化文本,通常用于代码块 + 'code', # 行内代码 + 'kbd', # 键盘输入 + 'samp', # 示例输出 + 'var', # 变量 + 'noscript',# script未启用时的内容 + 'meta', # 元数据 + 'link', # 外部资源链接 + 'head', # 文档头部,通常不包含可见的可翻译内容 +} + +# 2. 可翻译标签(白名单) +# 定义一组被认为是“安全”的HTML标签,这些标签中的直接文本内容适合被翻译。 +# 这种白名单策略与上面的黑名单结合,提供了双重保障。 +SAFE_TAGS: Set[str] = { + 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', + 'li', 'blockquote', 'q', 'caption', + 'span', 'a', 'strong', 'em', 'b', 'i', 'u', + 'td', 'th', + 'button', 'label', 'legend', 'option', + 'figcaption', 'summary', 'details', + 'div', # div 比较通用,但我们的逻辑只提取其顶层文本节点,相对安全 +} + +# 3. 可翻译属性(白名单) +# 定义一组“安全”的属性,这些属性的值通常是给用户看的可读文本。 +# 格式为: { 'tag_name': ['attr1', 'attr2'], ... } +SAFE_ATTRIBUTES: Dict[str, List[str]] = { + 'img': ['alt', 'title'], + 'a': ['title'], + 'input': ['placeholder', 'title'], + 'textarea': ['placeholder', 'title'], + 'abbr': ['title'], + 'area': ['alt'], + # 对于所有标签,title属性通常是可翻译的 + '*': ['title'] +} + + +@dataclass +class HtmlTranslatorConfig(AiTranslatorConfig): + """ + HTML翻译器的配置类。 + + Attributes: + insert_mode (Literal["replace", "append", "prepend"]): + 指定如何插入翻译文本。 + - "replace": 用译文替换原文。 + - "append": 在原文后追加译文。 + - "prepend": 在原文前追加译文。 + separator (str): 在 "append" 或 "prepend" 模式下,用于分隔原文和译文的字符串。 + """ + insert_mode: Literal["replace", "append", "prepend"] = "replace" + separator: str = " " # HTML中用空格作为默认分隔符可能更合适 + + +class HtmlTranslator(Translator): + """ + 一个用于翻译 HTML 文件内容的翻译器。 + 它采用黑白名单结合的策略,以最大程度地保留页面样式和功能: + 1. 黑名单:首先,完全移除 script, style, code 等明确不可翻译的标签及其内容。 + 2. 白名单:然后,在剩余的HTML中,只提取和翻译指定安全标签和属性中的文本内容。 + 3. 注释保护:显式地跳过HTML注释,确保它们不被翻译。 + 这种方法能有效避免破坏页面结构、脚本、样式和注释。 + """ + + def __init__(self, config: HtmlTranslatorConfig): + super().__init__(config=config) + self.chunk_size = config.chunk_size + agent_config = SegmentsTranslateAgentConfig( + custom_prompt=config.custom_prompt, + to_lang=config.to_lang, + baseurl=config.base_url, + key=config.api_key, + model_id=config.model_id, + system_prompt=None, + temperature=config.temperature, + thinking=config.thinking, + max_concurrent=config.concurrent, + timeout=config.timeout, + logger=self.logger + ) + self.translate_agent = SegmentsTranslateAgent(agent_config) + self.insert_mode = config.insert_mode + self.separator = config.separator + + def _pre_translate(self, document: Document) -> Tuple[BeautifulSoup, List[Dict], List[str]]: + """ + 解析HTML文档,根据规则提取所有需要翻译的文本节点和属性。 + 步骤: + 1. 使用黑名单移除所有不可翻译的标签,从根本上防止它们被处理。 + 2. 遍历剩余的HTML元素,根据白名单提取可翻译的文本和属性值,同时跳过注释。 + """ + soup = BeautifulSoup(document.content, 'lxml') + + # 步骤 1: 移除所有不可翻译的标签及其内容 + for tag in soup.find_all(NON_TRANSLATABLE_TAGS): + tag.decompose() + + translatable_items = [] + original_texts = [] + + # 步骤 2: 遍历所有剩余标签,提取可翻译内容 + for tag in soup.find_all(True): + # --- 2a. 翻译安全标签内的文本节点 --- + if tag.name in SAFE_TAGS: + # 只处理标签的直接子节点中的文本,这是保留样式的关键。 + for child in list(tag.children): + # 【关键修改】确保处理的是纯文本节点,而不是注释(Comment是NavigableString的子类) + if isinstance(child, NavigableString) and not isinstance(child, Comment) and child.strip(): + text = str(child) + translatable_items.append({'type': 'node', 'object': child}) + original_texts.append(text) + + # --- 2b. 翻译安全标签内的安全属性 --- + attributes_to_check = SAFE_ATTRIBUTES.get(tag.name, []) + SAFE_ATTRIBUTES.get('*', []) + for attr in set(attributes_to_check): # 使用set去重 + if tag.has_attr(attr) and tag[attr].strip(): + value = tag[attr] + translatable_items.append({'type': 'attribute', 'tag': tag, 'attribute': attr}) + original_texts.append(value) + + return soup, translatable_items, original_texts + + def _after_translate(self, soup: BeautifulSoup, translatable_items: list, + translated_texts: list[str], original_texts: list[str]) -> bytes: + """ + 将翻译后的文本写回到BeautifulSoup对象中对应的节点或属性,并返回最终的HTML字节流。 + """ + if len(translatable_items) != len(translated_texts): + self.logger.error("翻译前后的文本片段数量不匹配 (%d vs %d),跳过写入操作以防损坏文件。", + len(translatable_items), len(translated_texts)) + return soup.encode('utf-8') + + for i, item in enumerate(translatable_items): + translated_text = translated_texts[i] + original_text = original_texts[i] + + new_content = "" + if self.insert_mode == "replace": + if item['type'] == 'node': + # 对于文本节点,保留原文前后的空白字符,这对维持内联元素的间距至关重要。 + leading_space = original_text[:len(original_text) - len(original_text.lstrip())] + trailing_space = original_text[len(original_text.rstrip()):] + new_content = leading_space + translated_text + trailing_space + else: # 属性 + new_content = translated_text + + elif self.insert_mode == "append": + new_content = original_text + self.separator + translated_text + elif self.insert_mode == "prepend": + new_content = translated_text + self.separator + original_text + else: + self.logger.error(f"不正确的HtmlTranslatorConfig参数: insert_mode='{self.insert_mode}'") + new_content = original_text # 出错时恢复原文 + + # 根据类型将内容写回 + if item['type'] == 'node': + node = item['object'] + # 检查节点是否仍然在解析树中,以防在处理过程中被移动或删除 + if node.parent: + node.replace_with(NavigableString(new_content)) + elif item['type'] == 'attribute': + tag = item['tag'] + attr = item['attribute'] + tag[attr] = new_content + + # 将修改后的BeautifulSoup对象编码为utf-8字节流 + return soup.encode('utf-8') + + def translate(self, document: Document) -> Self: + """ + 同步翻译HTML文档。 + """ + soup, translatable_items, original_texts = self._pre_translate(document) + if not translatable_items: + self.logger.info("\nHTML文件中没有找到符合安全规则的可翻译内容。") + # 即使没有翻译内容,也返回经过清理(移除非翻译标签)的文档内容 + document.content = soup.encode('utf-8') + return self + + translated_texts = self.translate_agent.send_segments(original_texts, self.chunk_size) + document.content = self._after_translate(soup, translatable_items, translated_texts, original_texts) + return self + + async def translate_async(self, document: Document) -> Self: + """ + 异步翻译HTML文档。 + """ + soup, translatable_items, original_texts = await asyncio.to_thread(self._pre_translate, document) + + if not translatable_items: + self.logger.info("\nHTML文件中没有找到符合安全规则的可翻译内容。") + document.content = await asyncio.to_thread(soup.encode, 'utf-8') + return self + + translated_texts = await self.translate_agent.send_segments_async(original_texts, self.chunk_size) + document.content = await asyncio.to_thread( + self._after_translate, soup, translatable_items, translated_texts, original_texts + ) + return self \ No newline at end of file diff --git a/docutranslate/workflow/html_workflow.py b/docutranslate/workflow/html_workflow.py new file mode 100644 index 0000000..54644e1 --- /dev/null +++ b/docutranslate/workflow/html_workflow.py @@ -0,0 +1,55 @@ +from dataclasses import dataclass +from pathlib import Path +from typing import Self + +from docutranslate.exporter.base import ExporterConfig +from docutranslate.exporter.html.html2html_exporter import Html2HtmlExporter + +from docutranslate.ir.document import Document +from docutranslate.translator.ai_translator.html_translator import HtmlTranslatorConfig, HtmlTranslator +from docutranslate.workflow.base import Workflow, WorkflowConfig +from docutranslate.workflow.interfaces import HTMLExportable + + +@dataclass(kw_only=True) +class HtmlWorkflowConfig(WorkflowConfig): + translator_config: HtmlTranslatorConfig + + + +class HtmlWorkflow(Workflow[HtmlWorkflowConfig, Document, Document], HTMLExportable): + def __init__(self, config: HtmlWorkflowConfig): + super().__init__(config=config) + if config.logger: + for sub_config in [self.config.translator_config]: + if sub_config: + sub_config.logger = config.logger + + def _pre_translate(self, document_original: Document): + document = document_original.copy() + translate_config = self.config.translator_config + translator = HtmlTranslator(translate_config) + return document, translator + + def translate(self) -> Self: + document, translator = self._pre_translate(self.document_original) + translator.translate(document) + self.document_translated = document + return self + + async def translate_async(self) -> Self: + document, translator = self._pre_translate(self.document_original) + await translator.translate_async(document) + self.document_translated = document + return self + + def export_to_html(self, _: ExporterConfig = None) -> str: + + docu = self._export(Html2HtmlExporter()) + return docu.content.decode() + + + def save_as_html(self, name: str = None, output_dir: Path | str = "./output", + _: ExporterConfig | None = None) -> Self: + self._save(exporter=Html2HtmlExporter(), name=name, output_dir=output_dir) + return self