From caf1fa17ebfb4c92f7f9aff73817d801f0235bd4 Mon Sep 17 00:00:00 2001
From: xunbu
Date: Tue, 5 Aug 2025 11:24:18 +0800
Subject: [PATCH] =?UTF-8?q?=E5=B0=9D=E8=AF=95=E5=A2=9E=E5=8A=A0=E5=AF=B9do?=
 =?UTF-8?q?cx=E7=9A=84=E6=94=AF=E6=8C=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 docutranslate/exporter/docx/__init__.py       |   0
 docutranslate/exporter/docx/base.py           |   8 +
 .../exporter/docx/docx2docx_exporter.py       |   7 +
 .../exporter/docx/docx2html_exporter.py       |  25 +++
 .../exporter/xlsx/xlsx2html_exporter.py       |   3 -
 .../ai_translator/docx_translator.py          | 157 ++++++++++++++++++
 docutranslate/workflow/docx_workflow.py       |  66 ++++++++
 docutranslate/workflow/interfaces.py          |   8 +
 pyproject.toml                                |   1 +
 uv.lock                                       |  23 +++
 10 files changed, 295 insertions(+), 3 deletions(-)
 create mode 100644 docutranslate/exporter/docx/__init__.py
 create mode 100644 docutranslate/exporter/docx/base.py
 create mode 100644 docutranslate/exporter/docx/docx2docx_exporter.py
 create mode 100644 docutranslate/exporter/docx/docx2html_exporter.py
 create mode 100644 docutranslate/translator/ai_translator/docx_translator.py
 create mode 100644 docutranslate/workflow/docx_workflow.py

diff --git a/docutranslate/exporter/docx/__init__.py b/docutranslate/exporter/docx/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/docutranslate/exporter/docx/base.py b/docutranslate/exporter/docx/base.py
new file mode 100644
index 0000000..18b628d
--- /dev/null
+++ b/docutranslate/exporter/docx/base.py
@@ -0,0 +1,8 @@
+from docutranslate.exporter.base import Exporter
+from docutranslate.ir.document import Document
+
+# TODO: decide whether json needs its own dedicated document type (this note looks carried over from the JSON exporter - confirm it applies to docx)
+class DocxExporter(Exporter[Document]):
+
+    def export(self,document:Document)->Document:
+        ...
\ No newline at end of file
diff --git a/docutranslate/exporter/docx/docx2docx_exporter.py b/docutranslate/exporter/docx/docx2docx_exporter.py
new file mode 100644
index 0000000..e13efe5
--- /dev/null
+++ b/docutranslate/exporter/docx/docx2docx_exporter.py
@@ -0,0 +1,10 @@
+from docutranslate.exporter.docx.base import DocxExporter
+from docutranslate.ir.document import Document
+
+
+class Docx2DocxExporter(DocxExporter):
+    """Exporter for the docx -> docx path; performs no format conversion."""
+
+    def export(self, document: Document) -> Document:
+        """Return the result of ``document.copy()``."""
+        return document.copy()
diff --git a/docutranslate/exporter/docx/docx2html_exporter.py b/docutranslate/exporter/docx/docx2html_exporter.py
new file mode 100644
index 0000000..e3ef2c3
--- /dev/null
+++ b/docutranslate/exporter/docx/docx2html_exporter.py
@@ -0,0 +1,38 @@
+from dataclasses import dataclass
+from io import BytesIO
+
+import mammoth
+
+from docutranslate.exporter.base import ExporterConfig
+from docutranslate.exporter.docx.base import DocxExporter
+from docutranslate.ir.document import Document
+
+
+@dataclass
+class Docx2HTMLExporterConfig(ExporterConfig):
+    """Configuration for Docx2HTMLExporter."""
+
+    # Copied onto the exporter as ``self.cdn``; ``export`` below does not read it.
+    cdn: bool = True
+
+
+class Docx2HTMLExporter(DocxExporter):
+    """Exporter that converts .docx content to HTML with mammoth."""
+
+    def __init__(self, config: Docx2HTMLExporterConfig | None = None):
+        """Use ``config`` when given, otherwise a default Docx2HTMLExporterConfig."""
+        config = config or Docx2HTMLExporterConfig()
+        super().__init__(config=config)
+        self.cdn = config.cdn
+
+    def export(self, document: Document) -> Document:
+        """Convert the .docx bytes in ``document.content`` to an HTML document.
+
+        :param document: Document whose ``content`` holds the .docx bytes.
+        :return: Document built from the UTF-8 encoded HTML, with suffix ``.html`` and the input's stem.
+        """
+        # mammoth.convert_to_html returns a result object rather than a string;
+        # the generated markup is in its ``value`` attribute.
+        html_content = mammoth.convert_to_html(BytesIO(document.content)).value
+
+        return Document.from_bytes(content=html_content.encode("utf-8"), suffix=".html", stem=document.stem)
diff --git a/docutranslate/exporter/xlsx/xlsx2html_exporter.py b/docutranslate/exporter/xlsx/xlsx2html_exporter.py
index 8d04705..1a0c746 100644
--- a/docutranslate/exporter/xlsx/xlsx2html_exporter.py
+++ b/docutranslate/exporter/xlsx/xlsx2html_exporter.py
@@ -1,14 +1,11 @@
 from dataclasses import dataclass
 from io import BytesIO
 
-import jinja2
-import openpyxl
 from xlsx2html import xlsx2html
 
 from docutranslate.exporter.base import ExporterConfig
 from docutranslate.exporter.xlsx.base
import XlsxExporter
 from docutranslate.ir.document import Document
-from docutranslate.utils.resource_utils import resource_path
 
 
 @dataclass
diff --git a/docutranslate/translator/ai_translator/docx_translator.py b/docutranslate/translator/ai_translator/docx_translator.py
new file mode 100644
index 0000000..4f21d71
--- /dev/null
+++ b/docutranslate/translator/ai_translator/docx_translator.py
@@ -0,0 +1,174 @@
+import asyncio
+from dataclasses import dataclass
+from io import BytesIO
+from typing import Self, Literal, List, Dict, Any, Tuple
+
+# NOTE(review): "docx" is provided by python-docx, which is not among the dependency lines
+# visible in this patch (only mammoth is added) - confirm it is declared in pyproject.toml.
+import docx
+from docx.document import Document as DocumentObject
+from docx.table import _Cell
+
+from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent
+from docutranslate.ir.document import Document
+from docutranslate.translator.ai_translator.base import AiTranslatorConfig
+from docutranslate.translator.base import Translator
+
+
+@dataclass
+class DocxTranslatorConfig(AiTranslatorConfig):
+    """
+    Configuration for DocxTranslator.
+    """
+    # How a translation is written back: "replace" keeps only the translation, "append" writes
+    # original + separator + translation, "prepend" writes translation + separator + original.
+    insert_mode: Literal["replace", "append", "prepend"] = "replace"
+    # Text placed between the original and the translation in "append" / "prepend" mode.
+    separator: str = "\n"
+
+
+class DocxTranslator(Translator):
+    """
+    Translator for .docx files.
+    """
+
+    def __init__(self, config: DocxTranslatorConfig):
+        """Build the segment-translation agent from ``config`` and keep the write-back settings."""
+        super().__init__(config=config)
+        self.chunk_size = config.chunk_size
+        agent_config = SegmentsTranslateAgentConfig(
+            custom_prompt=config.custom_prompt,
+            to_lang=config.to_lang,
+            baseurl=config.base_url,
+            key=config.api_key,
+            model_id=config.model_id,
+            system_prompt=None,
+            temperature=config.temperature,
+            thinking=config.thinking,
+            max_concurrent=config.concurrent,
+            timeout=config.timeout,
+            logger=self.logger
+        )
+        self.translate_agent = SegmentsTranslateAgent(agent_config)
+        self.insert_mode = config.insert_mode
+        self.separator = config.separator
+
+    def _pre_translate(self, document: Document) -> Tuple[DocumentObject, List[Dict[str, Any]], List[str]]:
+        """
+        Parse the .docx content and collect every text element that needs translating.
+
+        :param document: Document whose ``content`` holds the .docx bytes.
+        :return: a tuple of
+            - the parsed docx document object
+            - a list of ``{"type", "element"}`` dicts, one per paragraph or table cell
+            - the source text of each element, in the same order
+        """
+        doc = docx.Document(BytesIO(document.content))
+        elements_to_translate = []
+        original_texts = []
+
+        # Paragraphs exposed by ``doc.paragraphs``; whitespace-only paragraphs are skipped.
+        for para in doc.paragraphs:
+            if para.text.strip():
+                elements_to_translate.append({"type": "paragraph", "element": para})
+                original_texts.append(para.text)
+
+        # Table cells; whitespace-only cells are skipped.
+        # NOTE(review): python-docx reports a merged cell once per grid position, so merged
+        # cells are probably collected (and translated) more than once - confirm.
+        for table in doc.tables:
+            for row in table.rows:
+                for cell in row.cells:
+                    if cell.text.strip():
+                        elements_to_translate.append({"type": "cell", "element": cell})
+                        original_texts.append(cell.text)
+
+        return doc, elements_to_translate, original_texts
+
+    def _compose_text(self, original_text: str, translated_text: str) -> str | None:
+        """
+        Combine original and translated text according to ``self.insert_mode``.
+
+        :param original_text: source text of the element.
+        :param translated_text: translation of ``original_text``.
+        :return: the text to write back, or None when ``insert_mode`` is not a known mode.
+        """
+        if self.insert_mode == "replace":
+            return translated_text
+        if self.insert_mode == "append":
+            return original_text + self.separator + translated_text
+        if self.insert_mode == "prepend":
+            return translated_text + self.separator + original_text
+        return None
+
+    def _after_translate(self, doc: DocumentObject, elements_to_translate: List[Dict[str, Any]],
+                         translated_texts: List[str], original_texts: List[str]) -> bytes:
+        """
+        Write the translated texts back into the docx object and serialise it.
+
+        :param doc: the parsed docx document object.
+        :param elements_to_translate: element dicts produced by ``_pre_translate``.
+        :param translated_texts: translations, index-aligned with ``elements_to_translate``.
+        :param original_texts: source texts, index-aligned with ``elements_to_translate``.
+        :return: bytes of the updated .docx file.
+        """
+        for i, element_info in enumerate(elements_to_translate):
+            element = element_info["element"]
+            new_text = self._compose_text(original_texts[i], translated_texts[i])
+            if new_text is None:
+                # Unknown insert_mode: report it and leave the element as it is. Clearing the
+                # paragraph before checking the mode would silently delete its text.
+                self.logger.error("不正确的DocxTranslatorConfig参数")
+                continue
+
+            if isinstance(element, docx.text.paragraph.Paragraph):
+                # Remove the paragraph's existing content, then add the new text as one run.
+                element.clear()
+                element.add_run(new_text)
+            elif isinstance(element, _Cell):
+                element.text = new_text
+
+        # Serialise the modified document into an in-memory buffer.
+        doc_output_stream = BytesIO()
+        doc.save(doc_output_stream)
+        return doc_output_stream.getvalue()
+
+    def translate(self, document: Document) -> Self:
+        """
+        Translate a .docx document synchronously, replacing ``document.content`` in place.
+
+        :param document: Document whose ``content`` holds the .docx bytes.
+        :return: this translator.
+        """
+        doc, elements_to_translate, original_texts = self._pre_translate(document)
+        if not elements_to_translate:
+            print("\n文件中没有找到需要翻译的文本内容。")
+            return self
+
+        # Send the collected texts to the translation agent.
+        translated_texts = self.translate_agent.send_segments(original_texts, self.chunk_size)
+
+        # Write the translations back into the document.
+        document.content = self._after_translate(doc, elements_to_translate, translated_texts, original_texts)
+        return self
+
+    async def translate_async(self, document: Document) -> Self:
+        """
+        Translate a .docx document asynchronously, replacing ``document.content`` in place.
+
+        :param document: Document whose ``content`` holds the .docx bytes.
+        :return: this translator.
+        """
+        # Run the parsing step in a worker thread via asyncio.to_thread.
+        doc, elements_to_translate, original_texts = await asyncio.to_thread(self._pre_translate, document)
+        if not elements_to_translate:
+            print("\n文件中没有找到需要翻译的文本内容。")
+            return self
+
+        # Send the collected texts to the translation agent.
+        translated_texts = await self.translate_agent.send_segments_async(original_texts, self.chunk_size)
+
+        # Write the translations back into the document, again in a worker thread.
+        document.content = await asyncio.to_thread(self._after_translate, doc, elements_to_translate, translated_texts,
+                                                   original_texts)
+        return self
\ No newline at end of file
diff --git a/docutranslate/workflow/docx_workflow.py b/docutranslate/workflow/docx_workflow.py
new file mode 100644
index 0000000..91b5c37
--- /dev/null
+++ b/docutranslate/workflow/docx_workflow.py
@@ -0,0 +1,98 @@
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Self
+
+from docutranslate.exporter.base import ExporterConfig
+from docutranslate.exporter.docx.docx2docx_exporter import Docx2DocxExporter
+from docutranslate.exporter.docx.docx2html_exporter import Docx2HTMLExporterConfig, Docx2HTMLExporter
+
+from docutranslate.ir.document import Document
+from docutranslate.translator.ai_translator.docx_translator import DocxTranslatorConfig, DocxTranslator
+from docutranslate.workflow.base import Workflow, WorkflowConfig
+from docutranslate.workflow.interfaces import HTMLExportable, DocxExportable
+
+
+@dataclass(kw_only=True)
+class DocxWorkflowConfig(WorkflowConfig):
+    """Configuration for DocxWorkflow."""
+
+    # Settings passed to DocxTranslator.
+    translator_config: DocxTranslatorConfig
+    # Default HTML export settings, used when a call does not supply its own config.
+    html_exporter_config: Docx2HTMLExporterConfig
+
+
+class DocxWorkflow(Workflow[DocxWorkflowConfig, Document, Document], HTMLExportable[Docx2HTMLExporterConfig],
+                   DocxExportable[ExporterConfig]):
+    """Workflow that translates a .docx document and exports the result as .docx or HTML."""
+
+    def __init__(self, config: DocxWorkflowConfig):
+        """Initialise the workflow and hand the workflow logger to the translator config."""
+        super().__init__(config=config)
+        if config.logger:
+            for sub_config in [self.config.translator_config]:
+                if sub_config:
+                    sub_config.logger = config.logger
+
+    def _pre_translate(self, document_original: Document) -> tuple[Document, DocxTranslator]:
+        """Return a copy of the source document together with a translator to run on it."""
+        document = document_original.copy()
+        translate_config = self.config.translator_config
+        translator = DocxTranslator(translate_config)
+        return document, translator
+
+    def translate(self) -> Self:
+        """Translate a copy of ``self.document_original`` and store it as ``self.document_translated``."""
+        document, translator = self._pre_translate(self.document_original)
+        translator.translate(document)
+        self.document_translated = document
+        return self
+
+    async def translate_async(self) -> Self:
+        """Asynchronous counterpart of ``translate``."""
+        document, translator = self._pre_translate(self.document_original)
+        await translator.translate_async(document)
+        self.document_translated = document
+        return self
+
+    def export_to_html(self, config: Docx2HTMLExporterConfig | None = None) -> str:
+        """Export through Docx2HTMLExporter and return the HTML text.
+
+        :param config: exporter settings; defaults to ``self.config.html_exporter_config``.
+        """
+        config = config or self.config.html_exporter_config
+        docu = self._export(Docx2HTMLExporter(config))
+        return docu.content.decode()
+
+    def export_to_docx(self, config: ExporterConfig | None = None) -> bytes:
+        """Export through Docx2DocxExporter and return the .docx bytes.
+
+        Named ``export_to_docx`` so the class provides the method DocxExportable declares.
+
+        :param config: accepted to match DocxExportable; not used.
+        """
+        docu = self._export(Docx2DocxExporter())
+        return docu.content
+
+    def save_as_html(self, name: str = None, output_dir: Path | str = "./output",
+                     config: Docx2HTMLExporterConfig | None = None) -> Self:
+        """Save through Docx2HTMLExporter.
+
+        :param name: forwarded to ``self._save``.
+        :param output_dir: forwarded to ``self._save``.
+        :param config: exporter settings; defaults to ``self.config.html_exporter_config``.
+        """
+        config = config or self.config.html_exporter_config
+        self._save(exporter=Docx2HTMLExporter(config), name=name, output_dir=output_dir)
+        return self
+
+    def save_as_docx(self, name: str = None, output_dir: Path | str = "./output",
+                     config: ExporterConfig | None = None) -> Self:
+        """Save through Docx2DocxExporter.
+
+        :param name: forwarded to ``self._save``.
+        :param output_dir: forwarded to ``self._save``.
+        :param config: accepted to match DocxExportable; not used.
+        """
+        self._save(exporter=Docx2DocxExporter(), name=name, output_dir=output_dir)
+        return self
diff --git a/docutranslate/workflow/interfaces.py b/docutranslate/workflow/interfaces.py
index 255d991..a971765 100644
--- a/docutranslate/workflow/interfaces.py
+++ b/docutranslate/workflow/interfaces.py
@@ -63,3 +63,13 @@ class XlsxExportable(Protocol[T_ExporterConfig]):
 
     def save_as_xlsx(self, name: str, output_dir: Path | str, config: T_ExporterConfig | None = None) -> Self:
         ...
+
+@runtime_checkable
+class DocxExportable(Protocol[T_ExporterConfig]):
+    """Structural interface for workflows that can export to and save as .docx."""
+
+    def export_to_docx(self, config: T_ExporterConfig | None = None) -> bytes:
+        ...
+
+    def save_as_docx(self, name: str, output_dir: Path | str, config: T_ExporterConfig | None = None) -> Self:
+        ...
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 08c886f..71a83c2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,6 +11,7 @@ dependencies = [
     "openpyxl>=3.1.5",
     "xlsx2html>=0.6.2",
     "json-repair>=0.48.0",
+    "mammoth>=1.10.0",
 ]
 dynamic = ["version"]
 
diff --git a/uv.lock b/uv.lock
index 1ccda02..ec27b35 100644
--- a/uv.lock
+++ b/uv.lock
@@ -142,6 +142,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/7e/d4/7ebdbd03970677812aac39c869717059dbb71a4cfc033ca6e5221787892c/click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2", size = 98188 },
 ]
 
+[[package]]
+name = "cobble"
+version = "0.1.4"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/54/7a/a507c709be2c96e1bb6102eb7b7f4026c5e5e223ef7d745a17d239e9d844/cobble-0.1.4.tar.gz", hash = "sha256:de38be1539992c8a06e569630717c485a5f91be2192c461ea2b220607dfa78aa", size = 3805 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d5/e1/3714a2f371985215c219c2a70953d38e3eed81ef165aed061d21de0e998b/cobble-0.1.4-py3-none-any.whl", hash =
"sha256:36c91b1655e599fd428e2b95fdd5f0da1ca2e9f1abb0bc871dec21a0e78a2b44", size = 3984 }, +] + [[package]] name = "colorama" version = "0.4.6" @@ -294,6 +303,7 @@ dependencies = [ { name = "httpx" }, { name = "json-repair" }, { name = "jsonpath-ng" }, + { name = "mammoth" }, { name = "markdown2" }, { name = "openpyxl" }, { name = "xlsx2html" }, @@ -319,6 +329,7 @@ requires-dist = [ { name = "httpx", specifier = "==0.27.2" }, { name = "json-repair", specifier = ">=0.48.0" }, { name = "jsonpath-ng", specifier = ">=1.7.0" }, + { name = "mammoth", specifier = ">=1.10.0" }, { name = "markdown2", specifier = ">=2.5.3" }, { name = "opencv-python", marker = "extra == 'docling'", specifier = ">=4.11.0.86" }, { name = "openpyxl", specifier = ">=3.1.5" }, @@ -727,6 +738,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d1/5d/c059c180c84f7962db0aeae7c3b9303ed1d73d76f2bfbc32bc231c8be314/macholib-1.16.3-py2.py3-none-any.whl", hash = "sha256:0e315d7583d38b8c77e815b1ecbdbf504a8258d8b3e17b61165c6feb60d18f2c", size = 38094 }, ] +[[package]] +name = "mammoth" +version = "1.10.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cobble" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/89/0d/2ab86f37021b4c50fe72354acd226b1e31a10497e51f6cbd7e3d1eca1181/mammoth-1.10.0.tar.gz", hash = "sha256:cb6fbba41ccf8b5502859c457177d87a833fef0e0b1d4e6fd23ec372fe892c30", size = 52285 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a6/67/36eeb3a8726df3b282ba99ec126323871cffdbcf3b7a1db64ca9bbe4abc1/mammoth-1.10.0-py2.py3-none-any.whl", hash = "sha256:a1c87d5b98ca30230394267f98614b58b14b50f8031dc33ac9a535c6ab04eb99", size = 53823 }, +] + [[package]] name = "markdown-it-py" version = "3.0.0"