尝试增加对docx的支持

This commit is contained in:
xunbu
2025-08-05 11:24:18 +08:00
parent 8e2f9e65da
commit caf1fa17eb
10 changed files with 295 additions and 3 deletions

View File

View File

@@ -0,0 +1,8 @@
from docutranslate.exporter.base import Exporter
from docutranslate.ir.document import Document
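# TODO: decide whether docx needs its own dedicated Document type (the original note said "json", apparently copied from the JSON exporter)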
class DocxExporter(Exporter[Document]):
    """Common base for exporters in the docx exporter package.

    The base class only fixes the ``Document -> Document`` signature;
    subclasses are expected to override :meth:`export`.
    """

    def export(self, document: Document) -> Document:
        """Export ``document``.

        This base implementation has no body and therefore returns ``None``.
        """
        ...

View File

@@ -0,0 +1,7 @@
from docutranslate.exporter.docx.base import DocxExporter
from docutranslate.ir.document import Document
class Docx2DocxExporter(DocxExporter):
    """Exporter whose output is simply a copy of its input document."""

    def export(self, document: Document) -> Document:
        """Return ``document.copy()``; no conversion is performed."""
        duplicate = document.copy()
        return duplicate

View File

@@ -0,0 +1,25 @@
from dataclasses import dataclass
from io import BytesIO
import mammoth
from docutranslate.exporter.base import ExporterConfig
from docutranslate.exporter.xlsx.base import XlsxExporter
from docutranslate.ir.document import Document
@dataclass
class Docx2HTMLExporterConfig(ExporterConfig):
    """Options accepted by :class:`Docx2HTMLExporter`."""

    # Copied onto the exporter as ``self.cdn``; enabled by default.
    cdn: bool = True
class Docx2HTMLExporter(XlsxExporter):
    """Exporter that converts .docx bytes to an HTML document using mammoth.

    NOTE(review): the base class is ``XlsxExporter``, which looks like a
    copy-paste from the xlsx exporter; ``DocxExporter`` is presumably the
    intended base. It is left unchanged here because that name is not
    imported in this module.
    """

    def __init__(self, config: Docx2HTMLExporterConfig | None = None):
        """Initialise the exporter.

        :param config: exporter options; a default
            ``Docx2HTMLExporterConfig`` is created when omitted.
        """
        config = config or Docx2HTMLExporterConfig()
        super().__init__(config=config)
        self.cdn = config.cdn

    def export(self, document: Document) -> Document:
        """Render ``document`` (raw .docx bytes in ``content``) as HTML.

        :param document: source document holding the .docx file content.
        :return: a new ``Document`` with suffix ``.html``, the same stem as
            the input and the UTF-8 encoded markup as content.
        """
        # mammoth.convert_to_html returns a result object, not a string:
        # the generated markup is in ``.value`` (conversion warnings are in
        # ``.messages``). Encoding the result object itself would raise
        # AttributeError.
        result = mammoth.convert_to_html(BytesIO(document.content))
        html_content = result.value
        return Document.from_bytes(content=html_content.encode("utf-8"), suffix=".html", stem=document.stem)

View File

@@ -1,14 +1,11 @@
from dataclasses import dataclass from dataclasses import dataclass
from io import BytesIO from io import BytesIO
import jinja2
import openpyxl
from xlsx2html import xlsx2html from xlsx2html import xlsx2html
from docutranslate.exporter.base import ExporterConfig from docutranslate.exporter.base import ExporterConfig
from docutranslate.exporter.xlsx.base import XlsxExporter from docutranslate.exporter.xlsx.base import XlsxExporter
from docutranslate.ir.document import Document from docutranslate.ir.document import Document
from docutranslate.utils.resource_utils import resource_path
@dataclass @dataclass

View File

@@ -0,0 +1,157 @@
import asyncio
from dataclasses import dataclass
from io import BytesIO
from typing import Self, Literal, List, Dict, Any, Tuple
import docx
from docx.document import Document as DocumentObject
from docx.table import _Cell
from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent
from docutranslate.ir.document import Document
from docutranslate.translator.ai_translator.base import AiTranslatorConfig
from docutranslate.translator.base import Translator
@dataclass
class DocxTranslatorConfig(AiTranslatorConfig):
    """Configuration for :class:`DocxTranslator`."""

    # How the translation is combined with the source text:
    #   "replace" - keep only the translation,
    #   "append"  - original + separator + translation,
    #   "prepend" - translation + separator + original.
    insert_mode: Literal["replace", "append", "prepend"] = "replace"
    # Placed between original and translated text in append/prepend modes.
    separator: str = "\n"
class DocxTranslator(Translator):
    """Translator for .docx files.

    Text is collected from ``doc.paragraphs`` and from every cell of
    ``doc.tables``, sent to a ``SegmentsTranslateAgent`` and written back
    according to ``insert_mode``. Only those two collections are visited.

    NOTE(review): ``row.cells`` presumably yields a merged cell once per
    grid position, so merged cells may be translated more than once -
    confirm against python-docx behaviour.
    """

    def __init__(self, config: DocxTranslatorConfig):
        """Build the translation agent from ``config``.

        :param config: translator settings (model, credentials, chunk size,
            insert mode and separator).
        """
        super().__init__(config=config)
        self.chunk_size = config.chunk_size
        agent_config = SegmentsTranslateAgentConfig(
            custom_prompt=config.custom_prompt,
            to_lang=config.to_lang,
            baseurl=config.base_url,
            key=config.api_key,
            model_id=config.model_id,
            system_prompt=None,
            temperature=config.temperature,
            thinking=config.thinking,
            max_concurrent=config.concurrent,
            timeout=config.timeout,
            logger=self.logger
        )
        self.translate_agent = SegmentsTranslateAgent(agent_config)
        self.insert_mode = config.insert_mode
        self.separator = config.separator

    def _pre_translate(self, document: Document) -> Tuple[DocumentObject, List[Dict[str, Any]], List[str]]:
        """Parse the .docx content and collect every text to translate.

        :param document: ``Document`` whose ``content`` is the .docx bytes.
        :return: a tuple of
            - the parsed ``docx`` document object,
            - a list of ``{"type": ..., "element": ...}`` records
              (``"paragraph"`` or ``"cell"``),
            - the source texts, in the same order as the records.
        """
        doc = docx.Document(BytesIO(document.content))
        elements_to_translate = []
        original_texts = []

        # Body paragraphs; whitespace-only paragraphs are skipped.
        for para in doc.paragraphs:
            if para.text.strip():
                elements_to_translate.append({"type": "paragraph", "element": para})
                original_texts.append(para.text)

        # Table cells; whitespace-only cells are skipped.
        for table in doc.tables:
            for row in table.rows:
                for cell in row.cells:
                    if cell.text.strip():
                        elements_to_translate.append({"type": "cell", "element": cell})
                        original_texts.append(cell.text)

        return doc, elements_to_translate, original_texts

    def _compose_text(self, original_text: str, translated_text: str) -> str | None:
        """Combine original and translated text according to ``insert_mode``.

        :return: the text to write back, or ``None`` when ``insert_mode`` is
            not one of ``"replace"``, ``"append"`` or ``"prepend"``.
        """
        if self.insert_mode == "replace":
            return translated_text
        if self.insert_mode == "append":
            return original_text + self.separator + translated_text
        if self.insert_mode == "prepend":
            return translated_text + self.separator + original_text
        return None

    def _after_translate(self, doc: DocumentObject, elements_to_translate: List[Dict[str, Any]],
                         translated_texts: List[str], original_texts: List[str]) -> bytes:
        """Write the translations back into ``doc`` and serialise it.

        :param doc: parsed ``docx`` document object to modify in place.
        :param elements_to_translate: records produced by ``_pre_translate``.
        :param translated_texts: translations, aligned with the records.
        :param original_texts: source texts, aligned with the records.
        :return: the bytes of the updated .docx file.
        :raises IndexError: if either text list is shorter than the records.
        """
        for i, element_info in enumerate(elements_to_translate):
            element = element_info["element"]
            original_text = original_texts[i]
            translated_text = translated_texts[i]

            # Work out the replacement BEFORE touching the element, so that an
            # invalid insert_mode leaves the original content intact instead
            # of wiping the paragraph.
            new_text = self._compose_text(original_text, translated_text)
            if new_text is None:
                self.logger.error("不正确的DocxTranslatorConfig参数")
                continue

            kind = element_info["type"]
            if kind == "paragraph":
                # The paragraph is rebuilt as a single run.
                # NOTE(review): this presumably discards run-level formatting.
                element.clear()
                element.add_run(new_text)
            elif kind == "cell":
                element.text = new_text

        # Serialise the modified document into memory.
        doc_output_stream = BytesIO()
        doc.save(doc_output_stream)
        return doc_output_stream.getvalue()

    def translate(self, document: Document) -> Self:
        """Translate ``document`` in place (synchronous).

        ``document.content`` is replaced with the translated .docx bytes.
        When no translatable text is found the document is left untouched.

        :return: ``self``.
        """
        doc, elements_to_translate, original_texts = self._pre_translate(document)
        if not elements_to_translate:
            # Reported through the translator's logger rather than stdout.
            self.logger.info("文件中没有找到需要翻译的文本内容。")
            return self

        translated_texts = self.translate_agent.send_segments(original_texts, self.chunk_size)
        document.content = self._after_translate(doc, elements_to_translate, translated_texts, original_texts)
        return self

    async def translate_async(self, document: Document) -> Self:
        """Translate ``document`` in place (asynchronous).

        Parsing and write-back run in worker threads via
        ``asyncio.to_thread``; the agent call is awaited directly.

        :return: ``self``.
        """
        doc, elements_to_translate, original_texts = await asyncio.to_thread(self._pre_translate, document)
        if not elements_to_translate:
            # Reported through the translator's logger rather than stdout.
            self.logger.info("文件中没有找到需要翻译的文本内容。")
            return self

        translated_texts = await self.translate_agent.send_segments_async(original_texts, self.chunk_size)
        document.content = await asyncio.to_thread(self._after_translate, doc, elements_to_translate, translated_texts,
                                                   original_texts)
        return self

View File

@@ -0,0 +1,66 @@
from dataclasses import dataclass
from pathlib import Path
from typing import Self
from docutranslate.exporter.base import ExporterConfig
from docutranslate.exporter.docx.docx2docx_exporter import Docx2DocxExporter
from docutranslate.exporter.docx.docx2html_exporter import Docx2HTMLExporterConfig, Docx2HTMLExporter
from docutranslate.ir.document import Document
from docutranslate.translator.ai_translator.docx_translator import DocxTranslatorConfig, DocxTranslator
from docutranslate.workflow.base import Workflow, WorkflowConfig
from docutranslate.workflow.interfaces import HTMLExportable, DocxExportable
@dataclass(kw_only=True)
class DocxWorkflowConfig(WorkflowConfig):
    """Keyword-only settings bundle for the docx workflow."""

    # Settings for the docx translator.
    translator_config: DocxTranslatorConfig
    # Settings for the docx-to-HTML exporter.
    html_exporter_config: Docx2HTMLExporterConfig
class DocxWorkflow(Workflow[DocxWorkflowConfig, Document, Document], HTMLExportable[Docx2HTMLExporterConfig],
                   DocxExportable[ExporterConfig]):
    """Workflow that translates a .docx document and exports it as HTML or .docx."""

    def __init__(self, config: DocxWorkflowConfig):
        """Store ``config`` and share its logger with the translator config."""
        super().__init__(config=config)
        if config.logger:
            for sub_config in [self.config.translator_config]:
                if sub_config:
                    sub_config.logger = config.logger

    def _pre_translate(self, document_original: Document):
        """Return a working copy of the document and a fresh translator."""
        document = document_original.copy()
        translate_config = self.config.translator_config
        translator = DocxTranslator(translate_config)
        return document, translator

    def translate(self) -> Self:
        """Translate a copy of the original document and store the result."""
        document, translator = self._pre_translate(self.document_original)
        translator.translate(document)
        self.document_translated = document
        return self

    async def translate_async(self) -> Self:
        """Asynchronous counterpart of :meth:`translate`."""
        document, translator = self._pre_translate(self.document_original)
        await translator.translate_async(document)
        self.document_translated = document
        return self

    def export_to_html(self, config: Docx2HTMLExporterConfig | None = None) -> str:
        """Export as HTML text.

        :param config: exporter options; falls back to
            ``self.config.html_exporter_config`` when omitted.
        """
        config = config or self.config.html_exporter_config
        docu = self._export(Docx2HTMLExporter(config))
        return docu.content.decode()

    def export_to_docx(self, config: ExporterConfig | None = None) -> bytes:
        """Export as .docx bytes (the ``DocxExportable`` protocol method).

        :param config: accepted for protocol compatibility; not used.
        """
        docu = self._export(Docx2DocxExporter())
        return docu.content

    def export_to_xlsx(self, _: ExporterConfig | None = None) -> bytes:
        """Backward-compatible alias of :meth:`export_to_docx`.

        The name is a leftover from the xlsx workflow; the returned bytes
        are a .docx file. Prefer :meth:`export_to_docx`.
        """
        return self.export_to_docx()

    def save_as_html(self, name: str = None, output_dir: Path | str = "./output",
                     config: Docx2HTMLExporterConfig | None = None) -> Self:
        """Save the HTML export.

        :param name: output file name, passed through to ``_save``.
        :param output_dir: target directory.
        :param config: exporter options; falls back to
            ``self.config.html_exporter_config`` when omitted.
        """
        config = config or self.config.html_exporter_config
        self._save(exporter=Docx2HTMLExporter(config), name=name, output_dir=output_dir)
        return self

    def save_as_docx(self, name: str = None, output_dir: Path | str = "./output",
                     config: ExporterConfig | None = None) -> Self:
        """Save the .docx export (the ``DocxExportable`` protocol method).

        :param name: output file name, passed through to ``_save``.
        :param output_dir: target directory.
        :param config: accepted for protocol compatibility; not used.
        """
        self._save(exporter=Docx2DocxExporter(), name=name, output_dir=output_dir)
        return self

    def save_as_xlsx(self, name: str = None, output_dir: Path | str = "./output",
                     _: ExporterConfig | None = None) -> Self:
        """Backward-compatible alias of :meth:`save_as_docx`.

        The name is a leftover from the xlsx workflow; the file written is
        a .docx file. Prefer :meth:`save_as_docx`.
        """
        return self.save_as_docx(name=name, output_dir=output_dir)

View File

@@ -63,3 +63,11 @@ class XlsxExportable(Protocol[T_ExporterConfig]):
def save_as_xlsx(self, name: str, output_dir: Path | str, config: T_ExporterConfig | None = None) -> Self: def save_as_xlsx(self, name: str, output_dir: Path | str, config: T_ExporterConfig | None = None) -> Self:
... ...
@runtime_checkable
class DocxExportable(Protocol[T_ExporterConfig]):
    """Structural interface for objects that can produce .docx output."""

    def export_to_docx(self, config: T_ExporterConfig | None = None) -> bytes:
        """Return the exported document as bytes."""
        ...

    def save_as_docx(self, name: str, output_dir: Path | str, config: T_ExporterConfig | None = None) -> Self:
        """Save the exported document using ``name`` and ``output_dir``; return ``self``."""
        ...

View File

@@ -11,6 +11,7 @@ dependencies = [
"openpyxl>=3.1.5", "openpyxl>=3.1.5",
"xlsx2html>=0.6.2", "xlsx2html>=0.6.2",
"json-repair>=0.48.0", "json-repair>=0.48.0",
"mammoth>=1.10.0",
] ]
dynamic = ["version"] dynamic = ["version"]

23
uv.lock generated
View File

@@ -142,6 +142,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/7e/d4/7ebdbd03970677812aac39c869717059dbb71a4cfc033ca6e5221787892c/click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2", size = 98188 }, { url = "https://files.pythonhosted.org/packages/7e/d4/7ebdbd03970677812aac39c869717059dbb71a4cfc033ca6e5221787892c/click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2", size = 98188 },
] ]
[[package]]
name = "cobble"
version = "0.1.4"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/54/7a/a507c709be2c96e1bb6102eb7b7f4026c5e5e223ef7d745a17d239e9d844/cobble-0.1.4.tar.gz", hash = "sha256:de38be1539992c8a06e569630717c485a5f91be2192c461ea2b220607dfa78aa", size = 3805 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/d5/e1/3714a2f371985215c219c2a70953d38e3eed81ef165aed061d21de0e998b/cobble-0.1.4-py3-none-any.whl", hash = "sha256:36c91b1655e599fd428e2b95fdd5f0da1ca2e9f1abb0bc871dec21a0e78a2b44", size = 3984 },
]
[[package]] [[package]]
name = "colorama" name = "colorama"
version = "0.4.6" version = "0.4.6"
@@ -294,6 +303,7 @@ dependencies = [
{ name = "httpx" }, { name = "httpx" },
{ name = "json-repair" }, { name = "json-repair" },
{ name = "jsonpath-ng" }, { name = "jsonpath-ng" },
{ name = "mammoth" },
{ name = "markdown2" }, { name = "markdown2" },
{ name = "openpyxl" }, { name = "openpyxl" },
{ name = "xlsx2html" }, { name = "xlsx2html" },
@@ -319,6 +329,7 @@ requires-dist = [
{ name = "httpx", specifier = "==0.27.2" }, { name = "httpx", specifier = "==0.27.2" },
{ name = "json-repair", specifier = ">=0.48.0" }, { name = "json-repair", specifier = ">=0.48.0" },
{ name = "jsonpath-ng", specifier = ">=1.7.0" }, { name = "jsonpath-ng", specifier = ">=1.7.0" },
{ name = "mammoth", specifier = ">=1.10.0" },
{ name = "markdown2", specifier = ">=2.5.3" }, { name = "markdown2", specifier = ">=2.5.3" },
{ name = "opencv-python", marker = "extra == 'docling'", specifier = ">=4.11.0.86" }, { name = "opencv-python", marker = "extra == 'docling'", specifier = ">=4.11.0.86" },
{ name = "openpyxl", specifier = ">=3.1.5" }, { name = "openpyxl", specifier = ">=3.1.5" },
@@ -727,6 +738,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/d1/5d/c059c180c84f7962db0aeae7c3b9303ed1d73d76f2bfbc32bc231c8be314/macholib-1.16.3-py2.py3-none-any.whl", hash = "sha256:0e315d7583d38b8c77e815b1ecbdbf504a8258d8b3e17b61165c6feb60d18f2c", size = 38094 }, { url = "https://files.pythonhosted.org/packages/d1/5d/c059c180c84f7962db0aeae7c3b9303ed1d73d76f2bfbc32bc231c8be314/macholib-1.16.3-py2.py3-none-any.whl", hash = "sha256:0e315d7583d38b8c77e815b1ecbdbf504a8258d8b3e17b61165c6feb60d18f2c", size = 38094 },
] ]
[[package]]
name = "mammoth"
version = "1.10.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "cobble" },
]
sdist = { url = "https://files.pythonhosted.org/packages/89/0d/2ab86f37021b4c50fe72354acd226b1e31a10497e51f6cbd7e3d1eca1181/mammoth-1.10.0.tar.gz", hash = "sha256:cb6fbba41ccf8b5502859c457177d87a833fef0e0b1d4e6fd23ec372fe892c30", size = 52285 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/a6/67/36eeb3a8726df3b282ba99ec126323871cffdbcf3b7a1db64ca9bbe4abc1/mammoth-1.10.0-py2.py3-none-any.whl", hash = "sha256:a1c87d5b98ca30230394267f98614b58b14b50f8031dc33ac9a535c6ab04eb99", size = 53823 },
]
[[package]] [[package]]
name = "markdown-it-py" name = "markdown-it-py"
version = "3.0.0" version = "3.0.0"