尝试增加对docx的支持
This commit is contained in:
0
docutranslate/exporter/docx/__init__.py
Normal file
0
docutranslate/exporter/docx/__init__.py
Normal file
8
docutranslate/exporter/docx/base.py
Normal file
8
docutranslate/exporter/docx/base.py
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
from docutranslate.exporter.base import Exporter
|
||||||
|
from docutranslate.ir.document import Document
|
||||||
|
|
||||||
|
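# TODO: decide whether docx needs its own dedicated Document type (the original note said "json" -- likely a copy-paste leftover, confirm)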
|
||||||
|
class DocxExporter(Exporter[Document]):
    """Common base class for exporters that take a .docx ``Document`` as input."""

    def export(self, document: Document) -> Document:
        """Export ``document``.

        Placeholder only: this base implementation has no body (it returns
        ``None``); concrete exporters provide the real conversion.

        :param document: the document to export.
        """
|
||||||
7
docutranslate/exporter/docx/docx2docx_exporter.py
Normal file
7
docutranslate/exporter/docx/docx2docx_exporter.py
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
from docutranslate.exporter.docx.base import DocxExporter
|
||||||
|
from docutranslate.ir.document import Document
|
||||||
|
|
||||||
|
|
||||||
|
class Docx2DocxExporter(DocxExporter):
    """Exporter whose output format equals its input format (.docx to .docx)."""

    def export(self, document: Document) -> Document:
        """Return a copy of ``document``; no conversion is performed.

        :param document: the .docx document to export.
        :return: the result of ``document.copy()``.
        """
        duplicate = document.copy()
        return duplicate
|
||||||
25
docutranslate/exporter/docx/docx2html_exporter.py
Normal file
25
docutranslate/exporter/docx/docx2html_exporter.py
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
from dataclasses import dataclass
|
||||||
|
from io import BytesIO
|
||||||
|
|
||||||
|
import mammoth
|
||||||
|
|
||||||
|
from docutranslate.exporter.base import ExporterConfig
|
||||||
|
from docutranslate.exporter.xlsx.base import XlsxExporter
|
||||||
|
from docutranslate.ir.document import Document
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class Docx2HTMLExporterConfig(ExporterConfig):
    """Options accepted by ``Docx2HTMLExporter``."""

    # Copied onto the exporter instance as ``self.cdn``; defaults to True.
    cdn: bool = True
|
||||||
|
|
||||||
|
|
||||||
|
class Docx2HTMLExporter(XlsxExporter):
    """Exporter that renders a .docx document as HTML using ``mammoth``.

    NOTE(review): the base class is ``XlsxExporter``, which looks like a
    copy-paste leftover from the xlsx exporter; ``DocxExporter`` is presumably
    the intended base -- confirm before changing, since that needs a new import.
    """

    def __init__(self, config: Docx2HTMLExporterConfig | None = None):
        """Create the exporter.

        :param config: exporter options; a default ``Docx2HTMLExporterConfig``
            is created when this is omitted or falsy.
        """
        config = config or Docx2HTMLExporterConfig()
        super().__init__(config=config)
        # Kept on the instance; export() below does not read it.
        self.cdn = config.cdn

    def export(self, document: Document) -> Document:
        """Convert the .docx bytes in ``document.content`` to an HTML document.

        :param document: document whose ``content`` holds the raw .docx bytes.
        :return: a new ``Document`` with suffix ``.html``, the same stem, and
            the generated markup encoded as UTF-8.
        """
        # mammoth returns a result object, not a string: the generated markup
        # lives in ``.value`` (conversion warnings are in ``.messages``).
        # Calling ``.encode`` on the result itself raises AttributeError.
        result = mammoth.convert_to_html(BytesIO(document.content))
        html_content = result.value

        return Document.from_bytes(content=html_content.encode("utf-8"), suffix=".html", stem=document.stem)
|
||||||
@@ -1,14 +1,11 @@
|
|||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
|
|
||||||
import jinja2
|
|
||||||
import openpyxl
|
|
||||||
from xlsx2html import xlsx2html
|
from xlsx2html import xlsx2html
|
||||||
|
|
||||||
from docutranslate.exporter.base import ExporterConfig
|
from docutranslate.exporter.base import ExporterConfig
|
||||||
from docutranslate.exporter.xlsx.base import XlsxExporter
|
from docutranslate.exporter.xlsx.base import XlsxExporter
|
||||||
from docutranslate.ir.document import Document
|
from docutranslate.ir.document import Document
|
||||||
from docutranslate.utils.resource_utils import resource_path
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
|
|||||||
157
docutranslate/translator/ai_translator/docx_translator.py
Normal file
157
docutranslate/translator/ai_translator/docx_translator.py
Normal file
@@ -0,0 +1,157 @@
|
|||||||
|
import asyncio
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from io import BytesIO
|
||||||
|
from typing import Self, Literal, List, Dict, Any, Tuple
|
||||||
|
|
||||||
|
import docx
|
||||||
|
from docx.document import Document as DocumentObject
|
||||||
|
from docx.table import _Cell
|
||||||
|
|
||||||
|
from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent
|
||||||
|
from docutranslate.ir.document import Document
|
||||||
|
from docutranslate.translator.ai_translator.base import AiTranslatorConfig
|
||||||
|
from docutranslate.translator.base import Translator
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class DocxTranslatorConfig(AiTranslatorConfig):
    """Configuration for ``DocxTranslator``."""

    # How the translation is combined with the source text of each element:
    #   "replace" - write only the translation,
    #   "append"  - write source + separator + translation,
    #   "prepend" - write translation + separator + source.
    insert_mode: Literal["replace", "append", "prepend"] = "replace"
    # Text placed between source and translation in "append"/"prepend" mode.
    separator: str = "\n"
|
||||||
|
|
||||||
|
|
||||||
|
class DocxTranslator(Translator):
    """Translator for .docx files.

    The text of every non-blank body paragraph and table cell is collected,
    sent to a ``SegmentsTranslateAgent``, and written back into the document
    according to ``insert_mode``.
    """

    # Insert modes understood by _compose_text; same values as the Literal on
    # DocxTranslatorConfig.insert_mode.
    _INSERT_MODES = ("replace", "append", "prepend")

    def __init__(self, config: DocxTranslatorConfig):
        """Build the translation agent from ``config`` and remember the insert options.

        :param config: translator configuration.
        """
        super().__init__(config=config)
        self.chunk_size = config.chunk_size
        agent_config = SegmentsTranslateAgentConfig(
            custom_prompt=config.custom_prompt,
            to_lang=config.to_lang,
            baseurl=config.base_url,
            key=config.api_key,
            model_id=config.model_id,
            system_prompt=None,
            temperature=config.temperature,
            thinking=config.thinking,
            max_concurrent=config.concurrent,
            timeout=config.timeout,
            logger=self.logger,
        )
        self.translate_agent = SegmentsTranslateAgent(agent_config)
        self.insert_mode = config.insert_mode
        self.separator = config.separator

    def _pre_translate(self, document: Document) -> tuple[DocumentObject, list[dict[str, Any]], list[str]]:
        """Parse the .docx bytes and collect every piece of text to translate.

        :param document: ``Document`` whose ``content`` holds the .docx bytes.
        :return: a tuple of
            - the parsed ``docx`` document object,
            - a list of ``{"type": ..., "element": ...}`` records, one per
              paragraph or table cell that has non-blank text,
            - the source texts, in the same order as the records.
        """
        doc = docx.Document(BytesIO(document.content))
        elements_to_translate = []
        original_texts = []

        # Body paragraphs first; whitespace-only paragraphs are skipped.
        for para in doc.paragraphs:
            text = para.text
            if text.strip():
                elements_to_translate.append({"type": "paragraph", "element": para})
                original_texts.append(text)

        # Then every cell of every table, again skipping blank ones.
        for table in doc.tables:
            for row in table.rows:
                for cell in row.cells:
                    text = cell.text
                    if text.strip():
                        elements_to_translate.append({"type": "cell", "element": cell})
                        original_texts.append(text)

        return doc, elements_to_translate, original_texts

    def _compose_text(self, original_text: str, translated_text: str) -> str:
        """Combine source and translation according to ``self.insert_mode``.

        Callers must have validated ``insert_mode`` already; any value other
        than "append"/"prepend" is treated as "replace".
        """
        if self.insert_mode == "append":
            return original_text + self.separator + translated_text
        if self.insert_mode == "prepend":
            return translated_text + self.separator + original_text
        return translated_text

    def _after_translate(self, doc: DocumentObject, elements_to_translate: list[dict[str, Any]],
                         translated_texts: list[str], original_texts: list[str]) -> bytes:
        """Write the translated texts back into the docx object and serialize it.

        :param doc: the parsed ``docx`` document object.
        :param elements_to_translate: the records produced by ``_pre_translate``.
        :param translated_texts: translations, index-aligned with the records.
        :param original_texts: source texts, index-aligned with the records.
        :return: the bytes of the updated .docx file.
        :raises IndexError: if there are fewer translations than elements.
        """
        # Check alignment before touching the document so a short result list
        # cannot leave the file half-translated.
        if len(translated_texts) < len(elements_to_translate):
            raise IndexError(
                f"expected {len(elements_to_translate)} translated segments, got {len(translated_texts)}"
            )

        if self.insert_mode not in self._INSERT_MODES:
            # Validate before mutating: previously the paragraph was cleared
            # first, so an invalid mode erased its text. Now the document is
            # left untouched and the problem is only reported.
            self.logger.error("不正确的DocxTranslatorConfig参数")
        else:
            for element_info, original_text, translated_text in zip(
                    elements_to_translate, original_texts, translated_texts):
                element = element_info["element"]
                new_text = self._compose_text(original_text, translated_text)

                if isinstance(element, docx.text.paragraph.Paragraph):
                    # Remove the existing runs, then write the new text as a
                    # single run.
                    element.clear()
                    element.add_run(new_text)
                elif isinstance(element, _Cell):
                    element.text = new_text

        # Serialize the modified document into an in-memory buffer.
        doc_output_stream = BytesIO()
        doc.save(doc_output_stream)
        return doc_output_stream.getvalue()

    def translate(self, document: Document) -> Self:
        """Translate a .docx document synchronously, updating ``document.content`` in place.

        :param document: the document to translate.
        :return: this translator, to allow chaining.
        """
        doc, elements_to_translate, original_texts = self._pre_translate(document)
        if not elements_to_translate:
            # Nothing to translate: leave the document untouched.
            self.logger.info("文件中没有找到需要翻译的文本内容。")
            return self

        # Ask the agent for the translations.
        translated_texts = self.translate_agent.send_segments(original_texts, self.chunk_size)

        # Write the results back into the document.
        document.content = self._after_translate(doc, elements_to_translate, translated_texts, original_texts)
        return self

    async def translate_async(self, document: Document) -> Self:
        """Translate a .docx document asynchronously, updating ``document.content`` in place.

        Parsing and write-back are run through ``asyncio.to_thread``.

        :param document: the document to translate.
        :return: this translator, to allow chaining.
        """
        doc, elements_to_translate, original_texts = await asyncio.to_thread(self._pre_translate, document)
        if not elements_to_translate:
            # Nothing to translate: leave the document untouched.
            self.logger.info("文件中没有找到需要翻译的文本内容。")
            return self

        # Ask the agent for the translations.
        translated_texts = await self.translate_agent.send_segments_async(original_texts, self.chunk_size)

        # Write the results back into the document.
        document.content = await asyncio.to_thread(
            self._after_translate, doc, elements_to_translate, translated_texts, original_texts
        )
        return self
|
||||||
66
docutranslate/workflow/docx_workflow.py
Normal file
66
docutranslate/workflow/docx_workflow.py
Normal file
@@ -0,0 +1,66 @@
|
|||||||
|
from dataclasses import dataclass
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Self
|
||||||
|
|
||||||
|
from docutranslate.exporter.base import ExporterConfig
|
||||||
|
from docutranslate.exporter.docx.docx2docx_exporter import Docx2DocxExporter
|
||||||
|
from docutranslate.exporter.docx.docx2html_exporter import Docx2HTMLExporterConfig, Docx2HTMLExporter
|
||||||
|
|
||||||
|
from docutranslate.ir.document import Document
|
||||||
|
from docutranslate.translator.ai_translator.docx_translator import DocxTranslatorConfig, DocxTranslator
|
||||||
|
from docutranslate.workflow.base import Workflow, WorkflowConfig
|
||||||
|
from docutranslate.workflow.interfaces import HTMLExportable, DocxExportable
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(kw_only=True)
class DocxWorkflowConfig(WorkflowConfig):
    """Configuration for ``DocxWorkflow``; both fields are required keyword arguments."""

    # Settings handed to DocxTranslator when a translation is started.
    translator_config: DocxTranslatorConfig
    # Default settings for HTML export, used when a call supplies no config.
    html_exporter_config: Docx2HTMLExporterConfig
|
||||||
|
|
||||||
|
|
||||||
|
class DocxWorkflow(Workflow[DocxWorkflowConfig, Document, Document], HTMLExportable[Docx2HTMLExporterConfig],
                   DocxExportable[ExporterConfig]):
    """Workflow that translates a .docx document and exports it as .docx or HTML."""

    def __init__(self, config: DocxWorkflowConfig):
        """Create the workflow and pass the workflow logger on to the sub-configs.

        :param config: workflow configuration.
        """
        super().__init__(config=config)
        if config.logger:
            for sub_config in [self.config.translator_config]:
                if sub_config:
                    sub_config.logger = config.logger

    def _pre_translate(self, document_original: Document):
        """Return a working copy of the document and a translator built from the config.

        :param document_original: the untranslated source document.
        :return: ``(document_copy, translator)``.
        """
        document = document_original.copy()
        translate_config = self.config.translator_config
        translator = DocxTranslator(translate_config)
        return document, translator

    def translate(self) -> Self:
        """Translate ``self.document_original`` synchronously into ``self.document_translated``."""
        document, translator = self._pre_translate(self.document_original)
        translator.translate(document)
        self.document_translated = document
        return self

    async def translate_async(self) -> Self:
        """Translate ``self.document_original`` asynchronously into ``self.document_translated``."""
        document, translator = self._pre_translate(self.document_original)
        await translator.translate_async(document)
        self.document_translated = document
        return self

    def export_to_html(self, config: Docx2HTMLExporterConfig = None) -> str:
        """Export as HTML and return the markup as a string.

        :param config: exporter options; falls back to
            ``self.config.html_exporter_config`` when omitted.
        """
        config = config or self.config.html_exporter_config
        docu = self._export(Docx2HTMLExporter(config))
        return docu.content.decode()

    def export_to_docx(self, config: ExporterConfig | None = None) -> bytes:
        """Export as .docx and return the file bytes.

        This is the method required by ``DocxExportable``; without it the
        protocol's empty stub would be inherited and return ``None``.

        :param config: accepted for protocol compatibility; not used.
        """
        docu = self._export(Docx2DocxExporter())
        return docu.content

    def export_to_xlsx(self, _: ExporterConfig | None = None) -> bytes:
        """Misnamed legacy alias of :meth:`export_to_docx`, kept for backward compatibility."""
        return self.export_to_docx(_)

    def save_as_html(self, name: str = None, output_dir: Path | str = "./output",
                     config: Docx2HTMLExporterConfig | None = None) -> Self:
        """Export as HTML and save the result via ``self._save``.

        :param name: forwarded to ``self._save``.
        :param output_dir: forwarded to ``self._save``.
        :param config: exporter options; falls back to
            ``self.config.html_exporter_config`` when omitted.
        """
        config = config or self.config.html_exporter_config
        self._save(exporter=Docx2HTMLExporter(config), name=name, output_dir=output_dir)
        return self

    def save_as_docx(self, name: str = None, output_dir: Path | str = "./output",
                     config: ExporterConfig | None = None) -> Self:
        """Export as .docx and save the result via ``self._save``.

        This is the method required by ``DocxExportable``.

        :param name: forwarded to ``self._save``.
        :param output_dir: forwarded to ``self._save``.
        :param config: accepted for protocol compatibility; not used.
        """
        self._save(exporter=Docx2DocxExporter(), name=name, output_dir=output_dir)
        return self

    def save_as_xlsx(self, name: str = None, output_dir: Path | str = "./output",
                     _: ExporterConfig | None = None) -> Self:
        """Misnamed legacy alias of :meth:`save_as_docx`, kept for backward compatibility."""
        return self.save_as_docx(name=name, output_dir=output_dir, config=_)
|
||||||
@@ -63,3 +63,11 @@ class XlsxExportable(Protocol[T_ExporterConfig]):
|
|||||||
|
|
||||||
def save_as_xlsx(self, name: str, output_dir: Path | str, config: T_ExporterConfig | None = None) -> Self:
|
def save_as_xlsx(self, name: str, output_dir: Path | str, config: T_ExporterConfig | None = None) -> Self:
|
||||||
...
|
...
|
||||||
|
|
||||||
|
@runtime_checkable
class DocxExportable(Protocol[T_ExporterConfig]):
    """Structural interface for workflows that can produce .docx output."""

    def export_to_docx(
        self,
        config: T_ExporterConfig | None = None,
    ) -> bytes:
        """Return the exported .docx file as bytes."""

    def save_as_docx(
        self,
        name: str,
        output_dir: Path | str,
        config: T_ExporterConfig | None = None,
    ) -> Self:
        """Save the exported .docx under ``output_dir`` and return ``self``."""
|
||||||
@@ -11,6 +11,7 @@ dependencies = [
|
|||||||
"openpyxl>=3.1.5",
|
"openpyxl>=3.1.5",
|
||||||
"xlsx2html>=0.6.2",
|
"xlsx2html>=0.6.2",
|
||||||
"json-repair>=0.48.0",
|
"json-repair>=0.48.0",
|
||||||
|
"mammoth>=1.10.0",
|
||||||
]
|
]
|
||||||
dynamic = ["version"]
|
dynamic = ["version"]
|
||||||
|
|
||||||
|
|||||||
23
uv.lock
generated
23
uv.lock
generated
@@ -142,6 +142,15 @@ wheels = [
|
|||||||
{ url = "https://files.pythonhosted.org/packages/7e/d4/7ebdbd03970677812aac39c869717059dbb71a4cfc033ca6e5221787892c/click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2", size = 98188 },
|
{ url = "https://files.pythonhosted.org/packages/7e/d4/7ebdbd03970677812aac39c869717059dbb71a4cfc033ca6e5221787892c/click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2", size = 98188 },
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "cobble"
|
||||||
|
version = "0.1.4"
|
||||||
|
source = { registry = "https://pypi.org/simple" }
|
||||||
|
sdist = { url = "https://files.pythonhosted.org/packages/54/7a/a507c709be2c96e1bb6102eb7b7f4026c5e5e223ef7d745a17d239e9d844/cobble-0.1.4.tar.gz", hash = "sha256:de38be1539992c8a06e569630717c485a5f91be2192c461ea2b220607dfa78aa", size = 3805 }
|
||||||
|
wheels = [
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/d5/e1/3714a2f371985215c219c2a70953d38e3eed81ef165aed061d21de0e998b/cobble-0.1.4-py3-none-any.whl", hash = "sha256:36c91b1655e599fd428e2b95fdd5f0da1ca2e9f1abb0bc871dec21a0e78a2b44", size = 3984 },
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "colorama"
|
name = "colorama"
|
||||||
version = "0.4.6"
|
version = "0.4.6"
|
||||||
@@ -294,6 +303,7 @@ dependencies = [
|
|||||||
{ name = "httpx" },
|
{ name = "httpx" },
|
||||||
{ name = "json-repair" },
|
{ name = "json-repair" },
|
||||||
{ name = "jsonpath-ng" },
|
{ name = "jsonpath-ng" },
|
||||||
|
{ name = "mammoth" },
|
||||||
{ name = "markdown2" },
|
{ name = "markdown2" },
|
||||||
{ name = "openpyxl" },
|
{ name = "openpyxl" },
|
||||||
{ name = "xlsx2html" },
|
{ name = "xlsx2html" },
|
||||||
@@ -319,6 +329,7 @@ requires-dist = [
|
|||||||
{ name = "httpx", specifier = "==0.27.2" },
|
{ name = "httpx", specifier = "==0.27.2" },
|
||||||
{ name = "json-repair", specifier = ">=0.48.0" },
|
{ name = "json-repair", specifier = ">=0.48.0" },
|
||||||
{ name = "jsonpath-ng", specifier = ">=1.7.0" },
|
{ name = "jsonpath-ng", specifier = ">=1.7.0" },
|
||||||
|
{ name = "mammoth", specifier = ">=1.10.0" },
|
||||||
{ name = "markdown2", specifier = ">=2.5.3" },
|
{ name = "markdown2", specifier = ">=2.5.3" },
|
||||||
{ name = "opencv-python", marker = "extra == 'docling'", specifier = ">=4.11.0.86" },
|
{ name = "opencv-python", marker = "extra == 'docling'", specifier = ">=4.11.0.86" },
|
||||||
{ name = "openpyxl", specifier = ">=3.1.5" },
|
{ name = "openpyxl", specifier = ">=3.1.5" },
|
||||||
@@ -727,6 +738,18 @@ wheels = [
|
|||||||
{ url = "https://files.pythonhosted.org/packages/d1/5d/c059c180c84f7962db0aeae7c3b9303ed1d73d76f2bfbc32bc231c8be314/macholib-1.16.3-py2.py3-none-any.whl", hash = "sha256:0e315d7583d38b8c77e815b1ecbdbf504a8258d8b3e17b61165c6feb60d18f2c", size = 38094 },
|
{ url = "https://files.pythonhosted.org/packages/d1/5d/c059c180c84f7962db0aeae7c3b9303ed1d73d76f2bfbc32bc231c8be314/macholib-1.16.3-py2.py3-none-any.whl", hash = "sha256:0e315d7583d38b8c77e815b1ecbdbf504a8258d8b3e17b61165c6feb60d18f2c", size = 38094 },
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "mammoth"
|
||||||
|
version = "1.10.0"
|
||||||
|
source = { registry = "https://pypi.org/simple" }
|
||||||
|
dependencies = [
|
||||||
|
{ name = "cobble" },
|
||||||
|
]
|
||||||
|
sdist = { url = "https://files.pythonhosted.org/packages/89/0d/2ab86f37021b4c50fe72354acd226b1e31a10497e51f6cbd7e3d1eca1181/mammoth-1.10.0.tar.gz", hash = "sha256:cb6fbba41ccf8b5502859c457177d87a833fef0e0b1d4e6fd23ec372fe892c30", size = 52285 }
|
||||||
|
wheels = [
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/a6/67/36eeb3a8726df3b282ba99ec126323871cffdbcf3b7a1db64ca9bbe4abc1/mammoth-1.10.0-py2.py3-none-any.whl", hash = "sha256:a1c87d5b98ca30230394267f98614b58b14b50f8031dc33ac9a535c6ab04eb99", size = 53823 },
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "markdown-it-py"
|
name = "markdown-it-py"
|
||||||
version = "3.0.0"
|
version = "3.0.0"
|
||||||
|
|||||||
Reference in New Issue
Block a user