diff --git a/docutranslate/app.py b/docutranslate/app.py index a7ac7f8..03f06f4 100644 --- a/docutranslate/app.py +++ b/docutranslate/app.py @@ -31,13 +31,16 @@ from docutranslate.exporter.md.types import ConvertEngineType # --- 核心代码 Imports --- from docutranslate.global_values.conditional_import import DOCLING_EXIST from docutranslate.workflow.base import Workflow -from docutranslate.workflow.interfaces import HTMLExportable, MDFormatsExportable, TXTExportable, JsonExportable +from docutranslate.workflow.interfaces import HTMLExportable, MDFormatsExportable, TXTExportable, JsonExportable, \ + XlsxExportable +# --- [NEW] DOCX 工作流 Imports --- +from docutranslate.workflow.interfaces import DocxExportable from docutranslate.workflow.md_based_workflow import MarkdownBasedWorkflow, MarkdownBasedWorkflowConfig from docutranslate.workflow.txt_workflow import TXTWorkflow, TXTWorkflowConfig from docutranslate.workflow.json_workflow import JsonWorkflow, JsonWorkflowConfig -# --- [NEW] XLSX 工作流 Imports --- from docutranslate.workflow.xlsx_workflow import XlsxWorkflow, XlsxWorkflowConfig -from docutranslate.workflow.interfaces import XlsxExportable +# --- [NEW] DOCX 工作流 Imports --- +from docutranslate.workflow.docx_workflow import DocxWorkflow, DocxWorkflowConfig if DOCLING_EXIST or TYPE_CHECKING: from docutranslate.converter.x2md.converter_docling import ConverterDoclingConfig @@ -48,9 +51,11 @@ from docutranslate.translator.ai_translator.md_translator import MDTranslatorCon from docutranslate.translator.ai_translator.txt_translator import TXTTranslatorConfig from docutranslate.translator.ai_translator.json_translator import JsonTranslatorConfig from docutranslate.exporter.js.json2html_exporter import Json2HTMLExporterConfig -# --- [NEW] XLSX 工作流相关配置 Imports --- from docutranslate.translator.ai_translator.xlsx_translator import XlsxTranslatorConfig from docutranslate.exporter.xlsx.xlsx2html_exporter import Xlsx2HTMLExporterConfig +# --- [NEW] DOCX 工作流相关配置 Imports --- +from docutranslate.translator.ai_translator.docx_translator import DocxTranslatorConfig +from docutranslate.exporter.docx.docx2html_exporter import Docx2HTMLExporterConfig # ------------------------------------ from docutranslate.logger import global_logger @@ -69,7 +74,8 @@ WORKFLOW_DICT: Dict[str, Type[Workflow]] = { "markdown_based": MarkdownBasedWorkflow, "txt": TXTWorkflow, "json": JsonWorkflow, - "xlsx": XlsxWorkflow, # <--- 新增 XLSX 工作流 + "xlsx": XlsxWorkflow, + "docx": DocxWorkflow, # <--- 新增 DOCX 工作流 } @@ -228,7 +234,6 @@ class JsonWorkflowParams(BaseWorkflowParams): ) -# --- [NEW] XLSX 工作流参数模型 --- class XlsxWorkflowParams(BaseWorkflowParams): workflow_type: Literal['xlsx'] = Field(..., description="指定使用XLSX的翻译工作流。") insert_mode: Literal["replace", "append", "prepend"] = Field( @@ -241,9 +246,22 @@ class XlsxWorkflowParams(BaseWorkflowParams): ) +# --- [NEW] DOCX 工作流参数模型 --- +class DocxWorkflowParams(BaseWorkflowParams): + workflow_type: Literal['docx'] = Field(..., description="指定使用DOCX的翻译工作流。") + insert_mode: Literal["replace", "append", "prepend"] = Field( + "replace", + description="翻译文本的插入模式。'replace':替换原文,'append':附加到原文后,'prepend':附加到原文前。" + ) + separator: str = Field( + "\n", + description="当 insert_mode 为 'append' 或 'prepend' 时,用于分隔原文和译文的分隔符。" + ) + + # 3. [MODIFIED] 使用可辨识联合类型(Discriminated Union)将它们组合起来 TranslatePayload = Annotated[ - Union[MarkdownWorkflowParams, TextWorkflowParams, JsonWorkflowParams, XlsxWorkflowParams], # <-- 新增 XlsxWorkflowParams + Union[MarkdownWorkflowParams, TextWorkflowParams, JsonWorkflowParams, XlsxWorkflowParams, DocxWorkflowParams], # <-- 新增 DocxWorkflowParams Field(discriminator='workflow_type') ] @@ -288,7 +306,6 @@ class TranslateServiceRequest(BaseModel): } } }, - # --- [NEW] XLSX 工作流示例 --- { "summary": "XLSX 工作流示例", "value": { @@ -306,6 +323,22 @@ class TranslateServiceRequest(BaseModel): "concurrent": 5 } } + }, + # --- [NEW] DOCX 工作流示例 --- + { + "summary": "DOCX 工作流示例", + "value": { + "file_name": "contract.docx", + "file_content": "UEsDBBQAAAAIA... (base64-encoded docx)", + "payload": { + "workflow_type": "docx", + "base_url": "https://api.openai.com/v1", + "api_key": "sk-your-api-key-here", + "model_id": "gpt-4o", + "to_lang": "English", + "insert_mode": "replace", + } + } } ] } @@ -395,14 +428,13 @@ async def _perform_translation( ) workflow = JsonWorkflow(config=workflow_config) - # --- [NEW] XLSX 工作流处理逻辑 --- elif isinstance(payload, XlsxWorkflowParams): task_logger.info("构建 XlsxWorkflow 配置。") translator_config = XlsxTranslatorConfig( **payload.model_dump(include={ 'base_url', 'api_key', 'model_id', 'to_lang', 'custom_prompt', 'temperature', 'thinking', 'chunk_size', 'concurrent', - 'insert_mode', 'separator' # 包含XLSX特定参数 + 'insert_mode', 'separator' }, exclude_none=True) ) html_exporter_config = Xlsx2HTMLExporterConfig(cdn=True) @@ -413,6 +445,24 @@ async def _perform_translation( ) workflow = XlsxWorkflow(config=workflow_config) + # --- [NEW] DOCX 工作流处理逻辑 --- + elif isinstance(payload, DocxWorkflowParams): + task_logger.info("构建 DocxWorkflow 配置。") + translator_config = DocxTranslatorConfig( + **payload.model_dump(include={ + 'base_url', 'api_key', 'model_id', 'to_lang', 'custom_prompt', + 'temperature', 'thinking', 'chunk_size', 'concurrent', + 'insert_mode', 'separator' # 包含DOCX特定参数 + }, exclude_none=True) + ) + html_exporter_config = Docx2HTMLExporterConfig(cdn=True) + workflow_config = DocxWorkflowConfig( + translator_config=translator_config, + html_exporter_config=html_exporter_config, + logger=task_logger + ) + workflow = DocxWorkflow(config=workflow_config) + else: raise TypeError(f"工作流类型 '{payload.workflow_type}' 的处理逻辑未实现。") @@ -539,7 +589,7 @@ def _cancel_translation_logic(task_id: str): description=""" 接收一个包含文件内容(Base64编码)和工作流参数的JSON请求,启动一个后台翻译任务。 -- **工作流选择**: 请求体中的 `payload.workflow_type` 字段决定了本次任务的类型(如 `markdown_based`, `txt`, `json`, `xlsx`)。 +- **工作流选择**: 请求体中的 `payload.workflow_type` 字段决定了本次任务的类型(如 `markdown_based`, `txt`, `json`, `xlsx`, `docx`)。 - **动态参数**: 根据所选工作流,API需要不同的参数集。请参考下面的Schema或示例。 - **异步处理**: 此端点会立即返回任务ID,客户端需轮询状态接口获取进度。 @@ -606,21 +656,21 @@ async def service_release_task(task_id: str): "application/json": { "examples": { # ... 其他示例保持不变 ... - "completed_xlsx": { - "summary": "已完成 (XLSX)", + "completed_docx": { + "summary": "已完成 (DOCX)", "value": { - "task_id": "e5b98cc6", + "task_id": "f8a9c1b2", "is_processing": False, - "status_message": "翻译成功!用时 18.99 秒。", + "status_message": "翻译成功!用时 25.10 秒。", "error_flag": False, "download_ready": True, - "original_filename_stem": "product_list", - "original_filename": "product_list.xlsx", - "task_start_time": 1678889400.123, - "task_end_time": 1678889419.113, + "original_filename_stem": "contract", + "original_filename": "contract.docx", + "task_start_time": 1678889500.123, + "task_end_time": 1678889525.223, "downloads": { - "xlsx": "/service/download/e5b98cc6/xlsx", - "html": "/service/download/e5b98cc6/html" + "docx": "/service/download/f8a9c1b2/docx", + "html": "/service/download/f8a9c1b2/html" } } }, @@ -650,9 +700,11 @@ async def service_get_status( downloads["txt"] = f"/service/download/{task_id}/txt" if isinstance(workflow, JsonExportable): downloads["json"] = f"/service/download/{task_id}/json" - # --- [NEW] 新增对 XLSX 导出的支持 --- if isinstance(workflow, XlsxExportable): downloads["xlsx"] = f"/service/download/{task_id}/xlsx" + # --- [NEW] 新增对 DOCX 导出的支持 --- + if isinstance(workflow, DocxExportable): + downloads["docx"] = f"/service/download/{task_id}/docx" return JSONResponse(content={ "task_id": task_id, @@ -683,8 +735,8 @@ async def service_get_logs(task_id: str): return JSONResponse(content={"logs": new_logs}) -# [MODIFIED] 扩展 FileType 以包含 'xlsx' -FileType = Literal["markdown", "markdown_zip", "html", "txt", "json", "xlsx"] +# [MODIFIED] 扩展 FileType 以包含 'docx' +FileType = Literal["markdown", "markdown_zip", "html", "txt", "json", "xlsx", "docx"] async def _get_content_from_workflow(task_id: str, file_type: FileType) -> tuple[bytes, str, str]: @@ -698,7 +750,9 @@ async def _get_content_from_workflow(task_id: str, file_type: FileType) -> tuple filename_stem = task_state['original_filename_stem'] try: - content_bytes: bytes; media_type: str; filename: str + content_bytes: bytes + media_type: str + filename: str html_config = None if file_type == 'html': @@ -712,8 +766,9 @@ async def _get_content_from_workflow(task_id: str, file_type: FileType) -> tuple if isinstance(workflow, MarkdownBasedWorkflow): html_config = MD2HTMLExporterConfig(cdn=is_cdn_available) elif isinstance(workflow, TXTWorkflow): html_config = TXT2HTMLExporterConfig(cdn=is_cdn_available) elif isinstance(workflow, JsonWorkflow): html_config = Json2HTMLExporterConfig(cdn=is_cdn_available) - # --- [NEW] 新增对 XLSX->HTML 的支持 --- elif isinstance(workflow, XlsxWorkflow): html_config = Xlsx2HTMLExporterConfig(cdn=is_cdn_available) + # --- [NEW] 新增对 DOCX->HTML 的支持 --- + elif isinstance(workflow, DocxWorkflow): html_config = Docx2HTMLExporterConfig(cdn=is_cdn_available) if file_type == 'html' and isinstance(workflow, HTMLExportable): content_str = workflow.export_to_html(html_config) @@ -729,10 +784,13 @@ async def _get_content_from_workflow(task_id: str, file_type: FileType) -> tuple elif file_type == 'json' and isinstance(workflow, JsonExportable): json_content = workflow.export_to_json() content_bytes, media_type, filename = json_content.encode('utf-8'), "application/json; charset=utf-8", f"{filename_stem}_translated.json" - # --- [NEW] XLSX 导出逻辑 --- elif file_type == 'xlsx' and isinstance(workflow, XlsxExportable): content_bytes = workflow.export_to_xlsx() media_type, filename = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", f"{filename_stem}_translated.xlsx" + # --- [NEW] DOCX 导出逻辑 --- + elif file_type == 'docx' and isinstance(workflow, DocxExportable): + content_bytes = workflow.export_to_docx() + media_type, filename = "application/vnd.openxmlformats-officedocument.wordprocessingml.document", f"{filename_stem}_translated.docx" else: raise HTTPException(status_code=404, detail=f"此任务不支持导出 '{file_type}' 类型的文件。") @@ -751,7 +809,8 @@ async def _get_content_from_workflow(task_id: str, file_type: FileType) -> tuple "content": { # ... 其他类型保持不变 ... "application/json": {"schema": {"type": "string", "format": "binary"}}, - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": {"schema": {"type": "string", "format": "binary"}}, # <-- [NEW] + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": {"schema": {"type": "string", "format": "binary"}}, + "application/vnd.openxmlformats-officedocument.wordprocessingml.document": {"schema": {"type": "string", "format": "binary"}}, # <-- [NEW] } }, # ... 其他 response 保持不变 ... @@ -759,7 +818,7 @@ async def _get_content_from_workflow(task_id: str, file_type: FileType) -> tuple ) async def service_download_file( task_id: str = FastApiPath(..., description="已完成任务的ID", examples=["b2865b93"]), - file_type: FileType = FastApiPath(..., description="要下载的文件类型。", examples=["html", "json", "xlsx"]) + file_type: FileType = FastApiPath(..., description="要下载的文件类型。", examples=["html", "json", "docx"]) ): content, media_type, filename = await _get_content_from_workflow(task_id, file_type) headers = {"Content-Disposition": f"attachment; filename*=UTF-8''{quote(filename, safe='', encoding='utf-8')}"} @@ -773,7 +832,7 @@ async def service_download_file( ... - **内容编码**: - 对于 `html`, `markdown`, `txt`, `json` 类型, `content` 字段包含原始的文本内容。 - - 对于 `markdown_zip`, `xlsx` 类型, `content` 字段包含Base64编码后的字符串。 + - 对于 `markdown_zip`, `xlsx`, `docx` 类型, `content` 字段包含Base64编码后的字符串。 ... """, responses={ @@ -781,11 +840,11 @@ async def service_download_file( "description": "成功返回文件内容。", "content": { "application/json": { "examples": { # ... 其他示例 ... - "xlsx_base64": { - "summary": "XLSX 内容 (Base64)", + "docx_base64": { + "summary": "DOCX 内容 (Base64)", "value": { - "file_type": "xlsx", - "filename": "my_sheet_translated.xlsx", + "file_type": "docx", + "filename": "my_doc_translated.docx", "content": "UEsDBBQAAAAIA... (base64-encoded string)" } } @@ -796,13 +855,13 @@ async def service_download_file( ) async def service_content( task_id: str = FastApiPath(..., description="已完成任务的ID", examples=["b2865b93"]), - file_type: FileType = FastApiPath(..., description="要获取内容的文件类型。", examples=["html", "json", "xlsx"]) + file_type: FileType = FastApiPath(..., description="要获取内容的文件类型。", examples=["html", "json", "docx"]) ): - """[MODIFIED] 根据任务ID和文件类型,以JSON格式返回内容。zip/xlsx文件会进行Base64编码。""" + """[MODIFIED] 根据任务ID和文件类型,以JSON格式返回内容。zip/xlsx/docx文件会进行Base64编码。""" content, _, filename = await _get_content_from_workflow(task_id, file_type) final_content: str - if file_type in ['markdown_zip', 'xlsx']: # 二进制文件进行Base64编码 + if file_type in ['markdown_zip', 'xlsx', 'docx']: # 二进制文件进行Base64编码 final_content = base64.b64encode(content).decode('utf-8') else: # 文本文件直接解码 final_content = content.decode('utf-8') diff --git a/docutranslate/exporter/docx/docx2html_exporter.py b/docutranslate/exporter/docx/docx2html_exporter.py index e3ef2c3..89cfcd0 100644 --- a/docutranslate/exporter/docx/docx2html_exporter.py +++ b/docutranslate/exporter/docx/docx2html_exporter.py @@ -20,6 +20,6 @@ class Docx2HTMLExporter(XlsxExporter): self.cdn = config.cdn def export(self, document: Document) -> Document: - html_content = mammoth.convert_to_html(BytesIO(document.content)) + html_content = mammoth.convert_to_html(BytesIO(document.content)).value return Document.from_bytes(content=html_content.encode("utf-8"), suffix=".html", stem=document.stem) diff --git a/docutranslate/static/index.html b/docutranslate/static/index.html index c6e3a59..011125c 100644 --- a/docutranslate/static/index.html +++ b/docutranslate/static/index.html @@ -1 +1 @@ - DocuTranslate - 交互式文档翻译

DocuTranslate

如果上传的文件本身是.md格式,此项可不选。

GitHub主页(欢迎star❤):
https://github.com/xunbu/docutranslate

交流QQ群: 1047781902

任务列表

当前没有任务,点击“新建任务”开始吧!

预览
原文
译文
\ No newline at end of file + DocuTranslate - 交互式文档翻译

DocuTranslate

如果上传的文件本身是.md格式,此项可不选。

GitHub主页(欢迎star❤):
https://github.com/xunbu/docutranslate

交流QQ群: 1047781902

任务列表

当前没有任务,点击“新建任务”开始吧!

预览
原文
译文
\ No newline at end of file diff --git a/docutranslate/translator/ai_translator/docx_translator.py b/docutranslate/translator/ai_translator/docx_translator.py index 4f21d71..f9f3317 100644 --- a/docutranslate/translator/ai_translator/docx_translator.py +++ b/docutranslate/translator/ai_translator/docx_translator.py @@ -1,11 +1,15 @@ import asyncio -from dataclasses import dataclass +from dataclasses import dataclass, field from io import BytesIO from typing import Self, Literal, List, Dict, Any, Tuple import docx from docx.document import Document as DocumentObject -from docx.table import _Cell +from docx.oxml.ns import nsdecls +from docx.oxml import OxmlElement +from docx.table import _Cell, Table +from docx.text.paragraph import Paragraph +from docx.text.run import Run from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent from docutranslate.ir.document import Document @@ -13,6 +17,12 @@ from docutranslate.translator.ai_translator.base import AiTranslatorConfig from docutranslate.translator.base import Translator +def is_image_run(run: Run) -> bool: + """检查一个 run 是否包含图片。""" + # w:drawing 是嵌入式图片的标志, w:pict 是 VML 图片的标志 + return ' Tuple[DocumentObject, List[Dict[str, Any]], List[str]]: """ - 预处理 .docx 文件,提取所有需要翻译的文本。 - + [已重构] 预处理 .docx 文件,在 Run 级别上提取文本,以避免破坏图片。 :param document: 包含 .docx 文件内容的 Document 对象。 :return: 一个元组,包含: - docx.Document 对象 - - 一个包含文本元素信息的列表 (e.g., paragraph, cell) + - 一个包含文本块信息的列表 (每个元素代表一组连续的文本 run) - 一个包含所有待翻译原文的列表 """ doc = docx.Document(BytesIO(document.content)) elements_to_translate = [] original_texts = [] + def process_paragraph(para: Paragraph): + nonlocal elements_to_translate, original_texts + current_text_segment = "" + current_runs = [] + + for run in para.runs: + if is_image_run(run): + # 遇到图片,将之前累积的文本作为一个翻译单元 + if current_text_segment.strip(): + elements_to_translate.append({"type": "text_runs", "runs": current_runs}) + original_texts.append(current_text_segment) + # 重置累加器 + current_text_segment = "" + current_runs = [] + else: + # 累积文本 run + current_runs.append(run) + current_text_segment += run.text + + # 处理段落末尾的最后一个文本块 + if current_text_segment.strip(): + elements_to_translate.append({"type": "text_runs", "runs": current_runs}) + original_texts.append(current_text_segment) + # 遍历所有段落 for para in doc.paragraphs: - if para.text.strip(): # 确保段落有实际内容 - elements_to_translate.append({"type": "paragraph", "element": para}) - original_texts.append(para.text) + process_paragraph(para) # 遍历所有表格 for table in doc.tables: for row in table.rows: for cell in row.cells: - if cell.text.strip(): # 确保单元格有实际内容 - elements_to_translate.append({"type": "cell", "element": cell}) - original_texts.append(cell.text) + for para in cell.paragraphs: + process_paragraph(para) return doc, elements_to_translate, original_texts def _after_translate(self, doc: DocumentObject, elements_to_translate: List[Dict[str, Any]], translated_texts: List[str], original_texts: List[str]) -> bytes: """ - 将翻译后的文本写回到 .docx 对象中。 - - :param doc: docx.Document 对象。 - :param elements_to_translate: 包含文本元素信息的列表。 - :param translated_texts: 翻译后的文本列表。 - :param original_texts: 原始文本列表。 - :return: 更新后的 .docx 文件内容的字节流。 + [已重构] 将翻译后的文本写回到对应的 text runs 中,保留图片和样式。 """ + translation_map = dict(zip(original_texts, translated_texts)) + for i, element_info in enumerate(elements_to_translate): - element = element_info["element"] + runs = element_info["runs"] original_text = original_texts[i] translated_text = translated_texts[i] - # 清空原有内容并写入新内容 - if isinstance(element, docx.text.paragraph.Paragraph): - # 清空段落内容 - element.clear() - # 根据插入模式添加文本 - if self.insert_mode == "replace": - element.add_run(translated_text) - elif self.insert_mode == "append": - element.add_run(original_text + self.separator + translated_text) - elif self.insert_mode == "prepend": - element.add_run(translated_text + self.separator + original_text) - else: - self.logger.error("不正确的DocxTranslatorConfig参数") + # 根据插入模式确定最终文本 + if self.insert_mode == "replace": + final_text = translated_text + elif self.insert_mode == "append": + final_text = original_text + self.separator + translated_text + elif self.insert_mode == "prepend": + final_text = translated_text + self.separator + original_text + else: + self.logger.error("不正确的DocxTranslatorConfig参数") + final_text = translated_text - elif isinstance(element, _Cell): - # 根据插入模式设置单元格文本 - if self.insert_mode == "replace": - element.text = translated_text - elif self.insert_mode == "append": - element.text = original_text + self.separator + translated_text - elif self.insert_mode == "prepend": - element.text = translated_text + self.separator + original_text - else: - self.logger.error("不正确的DocxTranslatorConfig参数") + if not runs: + continue + + # --- 这是修改的核心部分 --- + # 1. 将完整的翻译文本写入第一个 run + first_run = runs[0] + first_run.text = final_text + + # 2. 清空该文本块中其余 run 的内容,但保留 run 本身及其格式 + # 这可以防止重复文本,同时保留文档结构 + for run in runs[1:]: + run.text = "" + # --- 修改结束 --- # 将修改后的文档保存到 BytesIO 流 doc_output_stream = BytesIO() @@ -128,8 +155,9 @@ class DocxTranslator(Translator): 同步翻译 .docx 文件。 """ doc, elements_to_translate, original_texts = self._pre_translate(document) - if not elements_to_translate: + if not original_texts: print("\n文件中没有找到需要翻译的文本内容。") + document.content = doc.save(BytesIO()).getvalue() # 返回原文件 return self # 调用翻译 agent @@ -144,8 +172,12 @@ class DocxTranslator(Translator): 异步翻译 .docx 文件。 """ doc, elements_to_translate, original_texts = await asyncio.to_thread(self._pre_translate, document) - if not elements_to_translate: + if not original_texts: print("\n文件中没有找到需要翻译的文本内容。") + # 在异步环境中正确保存和返回 + output_stream = BytesIO() + doc.save(output_stream) + document.content = output_stream.getvalue() return self # 异步调用翻译 agent