优化docxtranslator

This commit is contained in:
xunbu
2025-08-05 14:22:32 +08:00
parent 5dc18df585
commit 233be8ec13
4 changed files with 174 additions and 83 deletions

View File

@@ -31,13 +31,16 @@ from docutranslate.exporter.md.types import ConvertEngineType
# --- 核心代码 Imports ---
from docutranslate.global_values.conditional_import import DOCLING_EXIST
from docutranslate.workflow.base import Workflow
from docutranslate.workflow.interfaces import HTMLExportable, MDFormatsExportable, TXTExportable, JsonExportable
from docutranslate.workflow.interfaces import HTMLExportable, MDFormatsExportable, TXTExportable, JsonExportable, \
XlsxExportable
# --- [NEW] DOCX 工作流 Imports ---
from docutranslate.workflow.interfaces import DocxExportable
from docutranslate.workflow.md_based_workflow import MarkdownBasedWorkflow, MarkdownBasedWorkflowConfig
from docutranslate.workflow.txt_workflow import TXTWorkflow, TXTWorkflowConfig
from docutranslate.workflow.json_workflow import JsonWorkflow, JsonWorkflowConfig
# --- [NEW] XLSX 工作流 Imports ---
from docutranslate.workflow.xlsx_workflow import XlsxWorkflow, XlsxWorkflowConfig
from docutranslate.workflow.interfaces import XlsxExportable
# --- [NEW] DOCX 工作流 Imports ---
from docutranslate.workflow.docx_workflow import DocxWorkflow, DocxWorkflowConfig
if DOCLING_EXIST or TYPE_CHECKING:
from docutranslate.converter.x2md.converter_docling import ConverterDoclingConfig
@@ -48,9 +51,11 @@ from docutranslate.translator.ai_translator.md_translator import MDTranslatorCon
from docutranslate.translator.ai_translator.txt_translator import TXTTranslatorConfig
from docutranslate.translator.ai_translator.json_translator import JsonTranslatorConfig
from docutranslate.exporter.js.json2html_exporter import Json2HTMLExporterConfig
# --- [NEW] XLSX 工作流相关配置 Imports ---
from docutranslate.translator.ai_translator.xlsx_translator import XlsxTranslatorConfig
from docutranslate.exporter.xlsx.xlsx2html_exporter import Xlsx2HTMLExporterConfig
# --- [NEW] DOCX 工作流相关配置 Imports ---
from docutranslate.translator.ai_translator.docx_translator import DocxTranslatorConfig
from docutranslate.exporter.docx.docx2html_exporter import Docx2HTMLExporterConfig
# ------------------------------------
from docutranslate.logger import global_logger
@@ -69,7 +74,8 @@ WORKFLOW_DICT: Dict[str, Type[Workflow]] = {
"markdown_based": MarkdownBasedWorkflow,
"txt": TXTWorkflow,
"json": JsonWorkflow,
"xlsx": XlsxWorkflow, # <--- 新增 XLSX 工作流
"xlsx": XlsxWorkflow,
"docx": DocxWorkflow, # <--- 新增 DOCX 工作流
}
@@ -228,7 +234,6 @@ class JsonWorkflowParams(BaseWorkflowParams):
)
# --- [NEW] XLSX 工作流参数模型 ---
class XlsxWorkflowParams(BaseWorkflowParams):
workflow_type: Literal['xlsx'] = Field(..., description="指定使用XLSX的翻译工作流。")
insert_mode: Literal["replace", "append", "prepend"] = Field(
@@ -241,9 +246,22 @@ class XlsxWorkflowParams(BaseWorkflowParams):
)
# --- [NEW] DOCX 工作流参数模型 ---
class DocxWorkflowParams(BaseWorkflowParams):
    # Request parameters for the DOCX translation workflow; `workflow_type`
    # is the discriminator value used to select this model.
    workflow_type: Literal["docx"] = Field(
        ...,
        description="指定使用DOCX的翻译工作流。",
    )
    # How translated text is written back relative to the original text.
    insert_mode: Literal["replace", "append", "prepend"] = Field(
        default="replace",
        description="翻译文本的插入模式。'replace':替换原文,'append':附加到原文后,'prepend':附加到原文前。",
    )
    # Separator placed between original and translated text.
    separator: str = Field(
        default="\n",
        description="当 insert_mode 为 'append''prepend' 时,用于分隔原文和译文的分隔符。",
    )
# 3. [MODIFIED] 使用可辨识联合类型Discriminated Union将它们组合起来
TranslatePayload = Annotated[
Union[MarkdownWorkflowParams, TextWorkflowParams, JsonWorkflowParams, XlsxWorkflowParams], # <-- 新增 XlsxWorkflowParams
Union[MarkdownWorkflowParams, TextWorkflowParams, JsonWorkflowParams, XlsxWorkflowParams, DocxWorkflowParams], # <-- 新增 DocxWorkflowParams
Field(discriminator='workflow_type')
]
@@ -288,7 +306,6 @@ class TranslateServiceRequest(BaseModel):
}
}
},
# --- [NEW] XLSX 工作流示例 ---
{
"summary": "XLSX 工作流示例",
"value": {
@@ -306,6 +323,22 @@ class TranslateServiceRequest(BaseModel):
"concurrent": 5
}
}
},
# --- [NEW] DOCX 工作流示例 ---
{
"summary": "DOCX 工作流示例",
"value": {
"file_name": "contract.docx",
"file_content": "UEsDBBQAAAAIA... (base64-encoded docx)",
"payload": {
"workflow_type": "docx",
"base_url": "https://api.openai.com/v1",
"api_key": "sk-your-api-key-here",
"model_id": "gpt-4o",
"to_lang": "English",
"insert_mode": "replace",
}
}
}
]
}
@@ -395,14 +428,13 @@ async def _perform_translation(
)
workflow = JsonWorkflow(config=workflow_config)
# --- [NEW] XLSX 工作流处理逻辑 ---
elif isinstance(payload, XlsxWorkflowParams):
task_logger.info("构建 XlsxWorkflow 配置。")
translator_config = XlsxTranslatorConfig(
**payload.model_dump(include={
'base_url', 'api_key', 'model_id', 'to_lang', 'custom_prompt',
'temperature', 'thinking', 'chunk_size', 'concurrent',
'insert_mode', 'separator' # 包含XLSX特定参数
'insert_mode', 'separator'
}, exclude_none=True)
)
html_exporter_config = Xlsx2HTMLExporterConfig(cdn=True)
@@ -413,6 +445,24 @@ async def _perform_translation(
)
workflow = XlsxWorkflow(config=workflow_config)
# --- [NEW] DOCX 工作流处理逻辑 ---
elif isinstance(payload, DocxWorkflowParams):
task_logger.info("构建 DocxWorkflow 配置。")
translator_config = DocxTranslatorConfig(
**payload.model_dump(include={
'base_url', 'api_key', 'model_id', 'to_lang', 'custom_prompt',
'temperature', 'thinking', 'chunk_size', 'concurrent',
'insert_mode', 'separator' # 包含DOCX特定参数
}, exclude_none=True)
)
html_exporter_config = Docx2HTMLExporterConfig(cdn=True)
workflow_config = DocxWorkflowConfig(
translator_config=translator_config,
html_exporter_config=html_exporter_config,
logger=task_logger
)
workflow = DocxWorkflow(config=workflow_config)
else:
raise TypeError(f"工作流类型 '{payload.workflow_type}' 的处理逻辑未实现。")
@@ -539,7 +589,7 @@ def _cancel_translation_logic(task_id: str):
description="""
接收一个包含文件内容Base64编码和工作流参数的JSON请求启动一个后台翻译任务。
- **工作流选择**: 请求体中的 `payload.workflow_type` 字段决定了本次任务的类型(如 `markdown_based`, `txt`, `json`, `xlsx`)。
- **工作流选择**: 请求体中的 `payload.workflow_type` 字段决定了本次任务的类型(如 `markdown_based`, `txt`, `json`, `xlsx`, `docx`)。
- **动态参数**: 根据所选工作流API需要不同的参数集。请参考下面的Schema或示例。
- **异步处理**: 此端点会立即返回任务ID客户端需轮询状态接口获取进度。
@@ -606,21 +656,21 @@ async def service_release_task(task_id: str):
"application/json": {
"examples": {
# ... 其他示例保持不变 ...
"completed_xlsx": {
"summary": "已完成 (XLSX)",
"completed_docx": {
"summary": "已完成 (DOCX)",
"value": {
"task_id": "e5b98cc6",
"task_id": "f8a9c1b2",
"is_processing": False,
"status_message": "翻译成功!用时 18.99 秒。",
"status_message": "翻译成功!用时 25.10 秒。",
"error_flag": False,
"download_ready": True,
"original_filename_stem": "product_list",
"original_filename": "product_list.xlsx",
"task_start_time": 1678889400.123,
"task_end_time": 1678889419.113,
"original_filename_stem": "contract",
"original_filename": "contract.docx",
"task_start_time": 1678889500.123,
"task_end_time": 1678889525.223,
"downloads": {
"xlsx": "/service/download/e5b98cc6/xlsx",
"html": "/service/download/e5b98cc6/html"
"docx": "/service/download/f8a9c1b2/docx",
"html": "/service/download/f8a9c1b2/html"
}
}
},
@@ -650,9 +700,11 @@ async def service_get_status(
downloads["txt"] = f"/service/download/{task_id}/txt"
if isinstance(workflow, JsonExportable):
downloads["json"] = f"/service/download/{task_id}/json"
# --- [NEW] 新增对 XLSX 导出的支持 ---
if isinstance(workflow, XlsxExportable):
downloads["xlsx"] = f"/service/download/{task_id}/xlsx"
# --- [NEW] 新增对 DOCX 导出的支持 ---
if isinstance(workflow, DocxExportable):
downloads["docx"] = f"/service/download/{task_id}/docx"
return JSONResponse(content={
"task_id": task_id,
@@ -683,8 +735,8 @@ async def service_get_logs(task_id: str):
return JSONResponse(content={"logs": new_logs})
# [MODIFIED] 扩展 FileType 以包含 'xlsx'
FileType = Literal["markdown", "markdown_zip", "html", "txt", "json", "xlsx"]
# [MODIFIED] 扩展 FileType 以包含 'docx'
FileType = Literal["markdown", "markdown_zip", "html", "txt", "json", "xlsx", "docx"]
async def _get_content_from_workflow(task_id: str, file_type: FileType) -> tuple[bytes, str, str]:
@@ -698,7 +750,9 @@ async def _get_content_from_workflow(task_id: str, file_type: FileType) -> tuple
filename_stem = task_state['original_filename_stem']
try:
content_bytes: bytes; media_type: str; filename: str
content_bytes: bytes
media_type: str
filename: str
html_config = None
if file_type == 'html':
@@ -712,8 +766,9 @@ async def _get_content_from_workflow(task_id: str, file_type: FileType) -> tuple
if isinstance(workflow, MarkdownBasedWorkflow): html_config = MD2HTMLExporterConfig(cdn=is_cdn_available)
elif isinstance(workflow, TXTWorkflow): html_config = TXT2HTMLExporterConfig(cdn=is_cdn_available)
elif isinstance(workflow, JsonWorkflow): html_config = Json2HTMLExporterConfig(cdn=is_cdn_available)
# --- [NEW] 新增对 XLSX->HTML 的支持 ---
elif isinstance(workflow, XlsxWorkflow): html_config = Xlsx2HTMLExporterConfig(cdn=is_cdn_available)
# --- [NEW] 新增对 DOCX->HTML 的支持 ---
elif isinstance(workflow, DocxWorkflow): html_config = Docx2HTMLExporterConfig(cdn=is_cdn_available)
if file_type == 'html' and isinstance(workflow, HTMLExportable):
content_str = workflow.export_to_html(html_config)
@@ -729,10 +784,13 @@ async def _get_content_from_workflow(task_id: str, file_type: FileType) -> tuple
elif file_type == 'json' and isinstance(workflow, JsonExportable):
json_content = workflow.export_to_json()
content_bytes, media_type, filename = json_content.encode('utf-8'), "application/json; charset=utf-8", f"{filename_stem}_translated.json"
# --- [NEW] XLSX 导出逻辑 ---
elif file_type == 'xlsx' and isinstance(workflow, XlsxExportable):
content_bytes = workflow.export_to_xlsx()
media_type, filename = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", f"{filename_stem}_translated.xlsx"
# --- [NEW] DOCX 导出逻辑 ---
elif file_type == 'docx' and isinstance(workflow, DocxExportable):
content_bytes = workflow.export_to_docx()
media_type, filename = "application/vnd.openxmlformats-officedocument.wordprocessingml.document", f"{filename_stem}_translated.docx"
else:
raise HTTPException(status_code=404, detail=f"此任务不支持导出 '{file_type}' 类型的文件。")
@@ -751,7 +809,8 @@ async def _get_content_from_workflow(task_id: str, file_type: FileType) -> tuple
"content": {
# ... 其他类型保持不变 ...
"application/json": {"schema": {"type": "string", "format": "binary"}},
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": {"schema": {"type": "string", "format": "binary"}}, # <-- [NEW]
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": {"schema": {"type": "string", "format": "binary"}},
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": {"schema": {"type": "string", "format": "binary"}}, # <-- [NEW]
}
},
# ... 其他 response 保持不变 ...
@@ -759,7 +818,7 @@ async def _get_content_from_workflow(task_id: str, file_type: FileType) -> tuple
)
async def service_download_file(
task_id: str = FastApiPath(..., description="已完成任务的ID", examples=["b2865b93"]),
file_type: FileType = FastApiPath(..., description="要下载的文件类型。", examples=["html", "json", "xlsx"])
file_type: FileType = FastApiPath(..., description="要下载的文件类型。", examples=["html", "json", "docx"])
):
content, media_type, filename = await _get_content_from_workflow(task_id, file_type)
headers = {"Content-Disposition": f"attachment; filename*=UTF-8''{quote(filename, safe='', encoding='utf-8')}"}
@@ -773,7 +832,7 @@ async def service_download_file(
...
- **内容编码**:
- 对于 `html`, `markdown`, `txt`, `json` 类型, `content` 字段包含原始的文本内容。
- 对于 `markdown_zip`, `xlsx` 类型, `content` 字段包含Base64编码后的字符串。
- 对于 `markdown_zip`, `xlsx`, `docx` 类型, `content` 字段包含Base64编码后的字符串。
...
""",
responses={
@@ -781,11 +840,11 @@ async def service_download_file(
"description": "成功返回文件内容。",
"content": { "application/json": { "examples": {
# ... 其他示例 ...
"xlsx_base64": {
"summary": "XLSX 内容 (Base64)",
"docx_base64": {
"summary": "DOCX 内容 (Base64)",
"value": {
"file_type": "xlsx",
"filename": "my_sheet_translated.xlsx",
"file_type": "docx",
"filename": "my_doc_translated.docx",
"content": "UEsDBBQAAAAIA... (base64-encoded string)"
}
}
@@ -796,13 +855,13 @@ async def service_download_file(
)
async def service_content(
task_id: str = FastApiPath(..., description="已完成任务的ID", examples=["b2865b93"]),
file_type: FileType = FastApiPath(..., description="要获取内容的文件类型。", examples=["html", "json", "xlsx"])
file_type: FileType = FastApiPath(..., description="要获取内容的文件类型。", examples=["html", "json", "docx"])
):
"""[MODIFIED] 根据任务ID和文件类型以JSON格式返回内容。zip/xlsx文件会进行Base64编码。"""
"""[MODIFIED] 根据任务ID和文件类型以JSON格式返回内容。zip/xlsx/docx文件会进行Base64编码。"""
content, _, filename = await _get_content_from_workflow(task_id, file_type)
final_content: str
if file_type in ['markdown_zip', 'xlsx']: # 二进制文件进行Base64编码
if file_type in ['markdown_zip', 'xlsx', 'docx']: # 二进制文件进行Base64编码
final_content = base64.b64encode(content).decode('utf-8')
else: # 文本文件直接解码
final_content = content.decode('utf-8')

View File

@@ -20,6 +20,6 @@ class Docx2HTMLExporter(XlsxExporter):
self.cdn = config.cdn
def export(self, document: Document) -> Document:
html_content = mammoth.convert_to_html(BytesIO(document.content))
html_content = mammoth.convert_to_html(BytesIO(document.content)).value
return Document.from_bytes(content=html_content.encode("utf-8"), suffix=".html", stem=document.stem)

File diff suppressed because one or more lines are too long

View File

@@ -1,11 +1,15 @@
import asyncio
from dataclasses import dataclass
from dataclasses import dataclass, field
from io import BytesIO
from typing import Self, Literal, List, Dict, Any, Tuple
import docx
from docx.document import Document as DocumentObject
from docx.table import _Cell
from docx.oxml.ns import nsdecls
from docx.oxml import OxmlElement
from docx.table import _Cell, Table
from docx.text.paragraph import Paragraph
from docx.text.run import Run
from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent
from docutranslate.ir.document import Document
@@ -13,6 +17,12 @@ from docutranslate.translator.ai_translator.base import AiTranslatorConfig
from docutranslate.translator.base import Translator
def is_image_run(run: Run) -> bool:
    """Return True when the run's serialized XML contains an image element.

    The check is a plain substring search for the opening of a ``w:drawing``
    tag (embedded picture) or a ``w:pict`` tag (VML picture).
    """
    run_xml = run.element.xml
    for marker in ('<w:drawing', '<w:pict'):
        if marker in run_xml:
            return True
    return False
@dataclass
class DocxTranslatorConfig(AiTranslatorConfig):
"""
@@ -25,6 +35,7 @@ class DocxTranslatorConfig(AiTranslatorConfig):
class DocxTranslator(Translator):
"""
用于翻译 .docx 文件的翻译器。
此版本经过优化,可以处理图文混排的段落而不会丢失图片。
"""
def __init__(self, config: DocxTranslatorConfig):
@@ -49,74 +60,90 @@ class DocxTranslator(Translator):
def _pre_translate(self, document: Document) -> Tuple[DocumentObject, List[Dict[str, Any]], List[str]]:
"""
预处理 .docx 文件,提取所有需要翻译的文本
[已重构] 预处理 .docx 文件,在 Run 级别上提取文本,以避免破坏图片
:param document: 包含 .docx 文件内容的 Document 对象。
:return: 一个元组,包含:
- docx.Document 对象
- 一个包含文本元素信息的列表 (e.g., paragraph, cell)
- 一个包含文本信息的列表 (每个元素代表一组连续的文本 run)
- 一个包含所有待翻译原文的列表
"""
doc = docx.Document(BytesIO(document.content))
elements_to_translate = []
original_texts = []
def process_paragraph(para: Paragraph):
"""Split one paragraph into translatable groups of consecutive text runs.

Non-image runs are accumulated until an image run (per is_image_run) is met
or the paragraph ends. A group whose joined text is not whitespace-only is
recorded in elements_to_translate, and that text is appended to
original_texts at the same position.
"""
# The two lists are only mutated via append() below, never rebound.
nonlocal elements_to_translate, original_texts
current_text_segment = ""
current_runs = []
for run in para.runs:
if is_image_run(run):
# Image encountered: emit the text accumulated so far as one translation unit.
if current_text_segment.strip():
elements_to_translate.append({"type": "text_runs", "runs": current_runs})
original_texts.append(current_text_segment)
# Reset the accumulators.
current_text_segment = ""
current_runs = []
else:
# Accumulate a text run.
current_runs.append(run)
current_text_segment += run.text
# Handle the last text group at the end of the paragraph.
if current_text_segment.strip():
elements_to_translate.append({"type": "text_runs", "runs": current_runs})
original_texts.append(current_text_segment)
# 遍历所有段落
for para in doc.paragraphs:
if para.text.strip(): # 确保段落有实际内容
elements_to_translate.append({"type": "paragraph", "element": para})
original_texts.append(para.text)
process_paragraph(para)
# 遍历所有表格
for table in doc.tables:
for row in table.rows:
for cell in row.cells:
if cell.text.strip(): # 确保单元格有实际内容
elements_to_translate.append({"type": "cell", "element": cell})
original_texts.append(cell.text)
for para in cell.paragraphs:
process_paragraph(para)
return doc, elements_to_translate, original_texts
def _after_translate(self, doc: DocumentObject, elements_to_translate: List[Dict[str, Any]],
translated_texts: List[str], original_texts: List[str]) -> bytes:
"""
将翻译后的文本写回到 .docx 对象中
:param doc: docx.Document 对象。
:param elements_to_translate: 包含文本元素信息的列表。
:param translated_texts: 翻译后的文本列表。
:param original_texts: 原始文本列表。
:return: 更新后的 .docx 文件内容的字节流。
[已重构] 将翻译后的文本写回到对应的 text runs 中,保留图片和样式
"""
translation_map = dict(zip(original_texts, translated_texts))
for i, element_info in enumerate(elements_to_translate):
element = element_info["element"]
runs = element_info["runs"]
original_text = original_texts[i]
translated_text = translated_texts[i]
# 清空原有内容并写入新内容
if isinstance(element, docx.text.paragraph.Paragraph):
# 清空段落内容
element.clear()
# 根据插入模式添加文本
if self.insert_mode == "replace":
element.add_run(translated_text)
elif self.insert_mode == "append":
element.add_run(original_text + self.separator + translated_text)
elif self.insert_mode == "prepend":
element.add_run(translated_text + self.separator + original_text)
else:
self.logger.error("不正确的DocxTranslatorConfig参数")
# 根据插入模式确定最终文本
if self.insert_mode == "replace":
final_text = translated_text
elif self.insert_mode == "append":
final_text = original_text + self.separator + translated_text
elif self.insert_mode == "prepend":
final_text = translated_text + self.separator + original_text
else:
self.logger.error("不正确的DocxTranslatorConfig参数")
final_text = translated_text
elif isinstance(element, _Cell):
# 根据插入模式设置单元格文本
if self.insert_mode == "replace":
element.text = translated_text
elif self.insert_mode == "append":
element.text = original_text + self.separator + translated_text
elif self.insert_mode == "prepend":
element.text = translated_text + self.separator + original_text
else:
self.logger.error("不正确的DocxTranslatorConfig参数")
if not runs:
continue
# --- 这是修改的核心部分 ---
# 1. 将完整的翻译文本写入第一个 run
first_run = runs[0]
first_run.text = final_text
# 2. 清空该文本块中其余 run 的内容,但保留 run 本身及其格式
# 这可以防止重复文本,同时保留文档结构
for run in runs[1:]:
run.text = ""
# --- 修改结束 ---
# 将修改后的文档保存到 BytesIO 流
doc_output_stream = BytesIO()
@@ -128,8 +155,9 @@ class DocxTranslator(Translator):
同步翻译 .docx 文件。
"""
doc, elements_to_translate, original_texts = self._pre_translate(document)
if not elements_to_translate:
if not original_texts:
print("\n文件中没有找到需要翻译的文本内容。")
# NOTE(review): python-docx's Document.save() appears to return None, in which
# case chaining .getvalue() on its result raises AttributeError. The async path
# in this file saves into a named BytesIO and reads it afterwards - confirm and
# align this call with that pattern.
document.content = doc.save(BytesIO()).getvalue() # return the original file
return self
# 调用翻译 agent
@@ -144,8 +172,12 @@ class DocxTranslator(Translator):
异步翻译 .docx 文件。
"""
doc, elements_to_translate, original_texts = await asyncio.to_thread(self._pre_translate, document)
if not elements_to_translate:
if not original_texts:
print("\n文件中没有找到需要翻译的文本内容。")
# 在异步环境中正确保存和返回
output_stream = BytesIO()
doc.save(output_stream)
document.content = output_stream.getvalue()
return self
# 异步调用翻译 agent