优化docxtranslator
This commit is contained in:
@@ -31,13 +31,16 @@ from docutranslate.exporter.md.types import ConvertEngineType
|
||||
# --- 核心代码 Imports ---
|
||||
from docutranslate.global_values.conditional_import import DOCLING_EXIST
|
||||
from docutranslate.workflow.base import Workflow
|
||||
from docutranslate.workflow.interfaces import HTMLExportable, MDFormatsExportable, TXTExportable, JsonExportable
|
||||
from docutranslate.workflow.interfaces import HTMLExportable, MDFormatsExportable, TXTExportable, JsonExportable, \
|
||||
XlsxExportable
|
||||
# --- [NEW] DOCX 工作流 Imports ---
|
||||
from docutranslate.workflow.interfaces import DocxExportable
|
||||
from docutranslate.workflow.md_based_workflow import MarkdownBasedWorkflow, MarkdownBasedWorkflowConfig
|
||||
from docutranslate.workflow.txt_workflow import TXTWorkflow, TXTWorkflowConfig
|
||||
from docutranslate.workflow.json_workflow import JsonWorkflow, JsonWorkflowConfig
|
||||
# --- [NEW] XLSX 工作流 Imports ---
|
||||
from docutranslate.workflow.xlsx_workflow import XlsxWorkflow, XlsxWorkflowConfig
|
||||
from docutranslate.workflow.interfaces import XlsxExportable
|
||||
# --- [NEW] DOCX 工作流 Imports ---
|
||||
from docutranslate.workflow.docx_workflow import DocxWorkflow, DocxWorkflowConfig
|
||||
|
||||
if DOCLING_EXIST or TYPE_CHECKING:
|
||||
from docutranslate.converter.x2md.converter_docling import ConverterDoclingConfig
|
||||
@@ -48,9 +51,11 @@ from docutranslate.translator.ai_translator.md_translator import MDTranslatorCon
|
||||
from docutranslate.translator.ai_translator.txt_translator import TXTTranslatorConfig
|
||||
from docutranslate.translator.ai_translator.json_translator import JsonTranslatorConfig
|
||||
from docutranslate.exporter.js.json2html_exporter import Json2HTMLExporterConfig
|
||||
# --- [NEW] XLSX 工作流相关配置 Imports ---
|
||||
from docutranslate.translator.ai_translator.xlsx_translator import XlsxTranslatorConfig
|
||||
from docutranslate.exporter.xlsx.xlsx2html_exporter import Xlsx2HTMLExporterConfig
|
||||
# --- [NEW] DOCX 工作流相关配置 Imports ---
|
||||
from docutranslate.translator.ai_translator.docx_translator import DocxTranslatorConfig
|
||||
from docutranslate.exporter.docx.docx2html_exporter import Docx2HTMLExporterConfig
|
||||
# ------------------------------------
|
||||
|
||||
from docutranslate.logger import global_logger
|
||||
@@ -69,7 +74,8 @@ WORKFLOW_DICT: Dict[str, Type[Workflow]] = {
|
||||
"markdown_based": MarkdownBasedWorkflow,
|
||||
"txt": TXTWorkflow,
|
||||
"json": JsonWorkflow,
|
||||
"xlsx": XlsxWorkflow, # <--- 新增 XLSX 工作流
|
||||
"xlsx": XlsxWorkflow,
|
||||
"docx": DocxWorkflow, # <--- 新增 DOCX 工作流
|
||||
}
|
||||
|
||||
|
||||
@@ -228,7 +234,6 @@ class JsonWorkflowParams(BaseWorkflowParams):
|
||||
)
|
||||
|
||||
|
||||
# --- [NEW] XLSX 工作流参数模型 ---
|
||||
class XlsxWorkflowParams(BaseWorkflowParams):
|
||||
workflow_type: Literal['xlsx'] = Field(..., description="指定使用XLSX的翻译工作流。")
|
||||
insert_mode: Literal["replace", "append", "prepend"] = Field(
|
||||
@@ -241,9 +246,22 @@ class XlsxWorkflowParams(BaseWorkflowParams):
|
||||
)
|
||||
|
||||
|
||||
# --- [NEW] DOCX 工作流参数模型 ---
|
||||
class DocxWorkflowParams(BaseWorkflowParams):
    # Request parameters for the DOCX translation workflow.
    # NOTE: documented with comments rather than a class docstring so the
    # generated schema description is left exactly as it was.

    # Fixed literal tag identifying this parameter model as the DOCX workflow.
    workflow_type: Literal['docx'] = Field(..., description="指定使用DOCX的翻译工作流。")

    # Where the translated text is placed relative to the source text.
    insert_mode: Literal["replace", "append", "prepend"] = Field(
        default="replace",
        description="翻译文本的插入模式。'replace':替换原文,'append':附加到原文后,'prepend':附加到原文前。",
    )

    # Text inserted between source and translation (relevant for the
    # 'append' and 'prepend' modes, per the field description).
    separator: str = Field(
        default="\n",
        description="当 insert_mode 为 'append' 或 'prepend' 时,用于分隔原文和译文的分隔符。",
    )
|
||||
|
||||
|
||||
# 3. [MODIFIED] 使用可辨识联合类型(Discriminated Union)将它们组合起来
|
||||
# Discriminated union of every workflow parameter model. The member is selected
# by the value of the ``workflow_type`` field. Exactly one ``Union`` may appear
# here: anything after the first argument of ``Annotated`` is treated as
# metadata, so a second ``Union`` would silently drop its members.
TranslatePayload = Annotated[
    Union[
        MarkdownWorkflowParams,
        TextWorkflowParams,
        JsonWorkflowParams,
        XlsxWorkflowParams,
        DocxWorkflowParams,
    ],
    Field(discriminator='workflow_type'),
]
|
||||
|
||||
@@ -288,7 +306,6 @@ class TranslateServiceRequest(BaseModel):
|
||||
}
|
||||
}
|
||||
},
|
||||
# --- [NEW] XLSX 工作流示例 ---
|
||||
{
|
||||
"summary": "XLSX 工作流示例",
|
||||
"value": {
|
||||
@@ -306,6 +323,22 @@ class TranslateServiceRequest(BaseModel):
|
||||
"concurrent": 5
|
||||
}
|
||||
}
|
||||
},
|
||||
# --- [NEW] DOCX 工作流示例 ---
|
||||
{
|
||||
"summary": "DOCX 工作流示例",
|
||||
"value": {
|
||||
"file_name": "contract.docx",
|
||||
"file_content": "UEsDBBQAAAAIA... (base64-encoded docx)",
|
||||
"payload": {
|
||||
"workflow_type": "docx",
|
||||
"base_url": "https://api.openai.com/v1",
|
||||
"api_key": "sk-your-api-key-here",
|
||||
"model_id": "gpt-4o",
|
||||
"to_lang": "English",
|
||||
"insert_mode": "replace",
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -395,14 +428,13 @@ async def _perform_translation(
|
||||
)
|
||||
workflow = JsonWorkflow(config=workflow_config)
|
||||
|
||||
# --- [NEW] XLSX 工作流处理逻辑 ---
|
||||
elif isinstance(payload, XlsxWorkflowParams):
|
||||
task_logger.info("构建 XlsxWorkflow 配置。")
|
||||
translator_config = XlsxTranslatorConfig(
|
||||
**payload.model_dump(include={
|
||||
'base_url', 'api_key', 'model_id', 'to_lang', 'custom_prompt',
|
||||
'temperature', 'thinking', 'chunk_size', 'concurrent',
|
||||
'insert_mode', 'separator' # 包含XLSX特定参数
|
||||
'insert_mode', 'separator'
|
||||
}, exclude_none=True)
|
||||
)
|
||||
html_exporter_config = Xlsx2HTMLExporterConfig(cdn=True)
|
||||
@@ -413,6 +445,24 @@ async def _perform_translation(
|
||||
)
|
||||
workflow = XlsxWorkflow(config=workflow_config)
|
||||
|
||||
# --- [NEW] DOCX 工作流处理逻辑 ---
|
||||
elif isinstance(payload, DocxWorkflowParams):
|
||||
task_logger.info("构建 DocxWorkflow 配置。")
|
||||
translator_config = DocxTranslatorConfig(
|
||||
**payload.model_dump(include={
|
||||
'base_url', 'api_key', 'model_id', 'to_lang', 'custom_prompt',
|
||||
'temperature', 'thinking', 'chunk_size', 'concurrent',
|
||||
'insert_mode', 'separator' # 包含DOCX特定参数
|
||||
}, exclude_none=True)
|
||||
)
|
||||
html_exporter_config = Docx2HTMLExporterConfig(cdn=True)
|
||||
workflow_config = DocxWorkflowConfig(
|
||||
translator_config=translator_config,
|
||||
html_exporter_config=html_exporter_config,
|
||||
logger=task_logger
|
||||
)
|
||||
workflow = DocxWorkflow(config=workflow_config)
|
||||
|
||||
else:
|
||||
raise TypeError(f"工作流类型 '{payload.workflow_type}' 的处理逻辑未实现。")
|
||||
|
||||
@@ -539,7 +589,7 @@ def _cancel_translation_logic(task_id: str):
|
||||
description="""
|
||||
接收一个包含文件内容(Base64编码)和工作流参数的JSON请求,启动一个后台翻译任务。
|
||||
|
||||
- **工作流选择**: 请求体中的 `payload.workflow_type` 字段决定了本次任务的类型(如 `markdown_based`, `txt`, `json`, `xlsx`)。
|
||||
- **工作流选择**: 请求体中的 `payload.workflow_type` 字段决定了本次任务的类型(如 `markdown_based`, `txt`, `json`, `xlsx`, `docx`)。
|
||||
- **动态参数**: 根据所选工作流,API需要不同的参数集。请参考下面的Schema或示例。
|
||||
- **异步处理**: 此端点会立即返回任务ID,客户端需轮询状态接口获取进度。
|
||||
|
||||
@@ -606,21 +656,21 @@ async def service_release_task(task_id: str):
|
||||
"application/json": {
|
||||
"examples": {
|
||||
# ... 其他示例保持不变 ...
|
||||
"completed_xlsx": {
|
||||
"summary": "已完成 (XLSX)",
|
||||
"completed_docx": {
|
||||
"summary": "已完成 (DOCX)",
|
||||
"value": {
|
||||
"task_id": "e5b98cc6",
|
||||
"task_id": "f8a9c1b2",
|
||||
"is_processing": False,
|
||||
"status_message": "翻译成功!用时 18.99 秒。",
|
||||
"status_message": "翻译成功!用时 25.10 秒。",
|
||||
"error_flag": False,
|
||||
"download_ready": True,
|
||||
"original_filename_stem": "product_list",
|
||||
"original_filename": "product_list.xlsx",
|
||||
"task_start_time": 1678889400.123,
|
||||
"task_end_time": 1678889419.113,
|
||||
"original_filename_stem": "contract",
|
||||
"original_filename": "contract.docx",
|
||||
"task_start_time": 1678889500.123,
|
||||
"task_end_time": 1678889525.223,
|
||||
"downloads": {
|
||||
"xlsx": "/service/download/e5b98cc6/xlsx",
|
||||
"html": "/service/download/e5b98cc6/html"
|
||||
"docx": "/service/download/f8a9c1b2/docx",
|
||||
"html": "/service/download/f8a9c1b2/html"
|
||||
}
|
||||
}
|
||||
},
|
||||
@@ -650,9 +700,11 @@ async def service_get_status(
|
||||
downloads["txt"] = f"/service/download/{task_id}/txt"
|
||||
if isinstance(workflow, JsonExportable):
|
||||
downloads["json"] = f"/service/download/{task_id}/json"
|
||||
# --- [NEW] 新增对 XLSX 导出的支持 ---
|
||||
if isinstance(workflow, XlsxExportable):
|
||||
downloads["xlsx"] = f"/service/download/{task_id}/xlsx"
|
||||
# --- [NEW] 新增对 DOCX 导出的支持 ---
|
||||
if isinstance(workflow, DocxExportable):
|
||||
downloads["docx"] = f"/service/download/{task_id}/docx"
|
||||
|
||||
return JSONResponse(content={
|
||||
"task_id": task_id,
|
||||
@@ -683,8 +735,8 @@ async def service_get_logs(task_id: str):
|
||||
return JSONResponse(content={"logs": new_logs})
|
||||
|
||||
|
||||
# File kinds accepted as the ``file_type`` path parameter of the download and
# content endpoints. Defined once; includes both 'xlsx' and 'docx'.
FileType = Literal["markdown", "markdown_zip", "html", "txt", "json", "xlsx", "docx"]
|
||||
|
||||
|
||||
async def _get_content_from_workflow(task_id: str, file_type: FileType) -> tuple[bytes, str, str]:
|
||||
@@ -698,7 +750,9 @@ async def _get_content_from_workflow(task_id: str, file_type: FileType) -> tuple
|
||||
filename_stem = task_state['original_filename_stem']
|
||||
|
||||
try:
|
||||
content_bytes: bytes; media_type: str; filename: str
|
||||
content_bytes: bytes
|
||||
media_type: str
|
||||
filename: str
|
||||
|
||||
html_config = None
|
||||
if file_type == 'html':
|
||||
@@ -712,8 +766,9 @@ async def _get_content_from_workflow(task_id: str, file_type: FileType) -> tuple
|
||||
if isinstance(workflow, MarkdownBasedWorkflow): html_config = MD2HTMLExporterConfig(cdn=is_cdn_available)
|
||||
elif isinstance(workflow, TXTWorkflow): html_config = TXT2HTMLExporterConfig(cdn=is_cdn_available)
|
||||
elif isinstance(workflow, JsonWorkflow): html_config = Json2HTMLExporterConfig(cdn=is_cdn_available)
|
||||
# --- [NEW] 新增对 XLSX->HTML 的支持 ---
|
||||
elif isinstance(workflow, XlsxWorkflow): html_config = Xlsx2HTMLExporterConfig(cdn=is_cdn_available)
|
||||
# --- [NEW] 新增对 DOCX->HTML 的支持 ---
|
||||
elif isinstance(workflow, DocxWorkflow): html_config = Docx2HTMLExporterConfig(cdn=is_cdn_available)
|
||||
|
||||
if file_type == 'html' and isinstance(workflow, HTMLExportable):
|
||||
content_str = workflow.export_to_html(html_config)
|
||||
@@ -729,10 +784,13 @@ async def _get_content_from_workflow(task_id: str, file_type: FileType) -> tuple
|
||||
elif file_type == 'json' and isinstance(workflow, JsonExportable):
|
||||
json_content = workflow.export_to_json()
|
||||
content_bytes, media_type, filename = json_content.encode('utf-8'), "application/json; charset=utf-8", f"{filename_stem}_translated.json"
|
||||
# --- [NEW] XLSX 导出逻辑 ---
|
||||
elif file_type == 'xlsx' and isinstance(workflow, XlsxExportable):
|
||||
content_bytes = workflow.export_to_xlsx()
|
||||
media_type, filename = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", f"{filename_stem}_translated.xlsx"
|
||||
# --- [NEW] DOCX 导出逻辑 ---
|
||||
elif file_type == 'docx' and isinstance(workflow, DocxExportable):
|
||||
content_bytes = workflow.export_to_docx()
|
||||
media_type, filename = "application/vnd.openxmlformats-officedocument.wordprocessingml.document", f"{filename_stem}_translated.docx"
|
||||
else:
|
||||
raise HTTPException(status_code=404, detail=f"此任务不支持导出 '{file_type}' 类型的文件。")
|
||||
|
||||
@@ -751,7 +809,8 @@ async def _get_content_from_workflow(task_id: str, file_type: FileType) -> tuple
|
||||
"content": {
|
||||
# ... 其他类型保持不变 ...
|
||||
"application/json": {"schema": {"type": "string", "format": "binary"}},
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": {"schema": {"type": "string", "format": "binary"}}, # <-- [NEW]
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": {"schema": {"type": "string", "format": "binary"}},
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": {"schema": {"type": "string", "format": "binary"}}, # <-- [NEW]
|
||||
}
|
||||
},
|
||||
# ... 其他 response 保持不变 ...
|
||||
@@ -759,7 +818,7 @@ async def _get_content_from_workflow(task_id: str, file_type: FileType) -> tuple
|
||||
)
|
||||
async def service_download_file(
|
||||
task_id: str = FastApiPath(..., description="已完成任务的ID", examples=["b2865b93"]),
|
||||
file_type: FileType = FastApiPath(..., description="要下载的文件类型。", examples=["html", "json", "xlsx"])
|
||||
file_type: FileType = FastApiPath(..., description="要下载的文件类型。", examples=["html", "json", "docx"])
|
||||
):
|
||||
content, media_type, filename = await _get_content_from_workflow(task_id, file_type)
|
||||
headers = {"Content-Disposition": f"attachment; filename*=UTF-8''{quote(filename, safe='', encoding='utf-8')}"}
|
||||
@@ -773,7 +832,7 @@ async def service_download_file(
|
||||
...
|
||||
- **内容编码**:
|
||||
- 对于 `html`, `markdown`, `txt`, `json` 类型, `content` 字段包含原始的文本内容。
|
||||
- 对于 `markdown_zip`, `xlsx` 类型, `content` 字段包含Base64编码后的字符串。
|
||||
- 对于 `markdown_zip`, `xlsx`, `docx` 类型, `content` 字段包含Base64编码后的字符串。
|
||||
...
|
||||
""",
|
||||
responses={
|
||||
@@ -781,11 +840,11 @@ async def service_download_file(
|
||||
"description": "成功返回文件内容。",
|
||||
"content": { "application/json": { "examples": {
|
||||
# ... 其他示例 ...
|
||||
"xlsx_base64": {
|
||||
"summary": "XLSX 内容 (Base64)",
|
||||
"docx_base64": {
|
||||
"summary": "DOCX 内容 (Base64)",
|
||||
"value": {
|
||||
"file_type": "xlsx",
|
||||
"filename": "my_sheet_translated.xlsx",
|
||||
"file_type": "docx",
|
||||
"filename": "my_doc_translated.docx",
|
||||
"content": "UEsDBBQAAAAIA... (base64-encoded string)"
|
||||
}
|
||||
}
|
||||
@@ -796,13 +855,13 @@ async def service_download_file(
|
||||
)
|
||||
async def service_content(
|
||||
task_id: str = FastApiPath(..., description="已完成任务的ID", examples=["b2865b93"]),
|
||||
file_type: FileType = FastApiPath(..., description="要获取内容的文件类型。", examples=["html", "json", "xlsx"])
|
||||
file_type: FileType = FastApiPath(..., description="要获取内容的文件类型。", examples=["html", "json", "docx"])
|
||||
):
|
||||
"""[MODIFIED] 根据任务ID和文件类型,以JSON格式返回内容。zip/xlsx文件会进行Base64编码。"""
|
||||
"""[MODIFIED] 根据任务ID和文件类型,以JSON格式返回内容。zip/xlsx/docx文件会进行Base64编码。"""
|
||||
content, _, filename = await _get_content_from_workflow(task_id, file_type)
|
||||
|
||||
final_content: str
|
||||
if file_type in ['markdown_zip', 'xlsx']: # 二进制文件进行Base64编码
|
||||
if file_type in ['markdown_zip', 'xlsx', 'docx']: # 二进制文件进行Base64编码
|
||||
final_content = base64.b64encode(content).decode('utf-8')
|
||||
else: # 文本文件直接解码
|
||||
final_content = content.decode('utf-8')
|
||||
|
||||
@@ -20,6 +20,6 @@ class Docx2HTMLExporter(XlsxExporter):
|
||||
self.cdn = config.cdn
|
||||
|
||||
def export(self, document: Document) -> Document:
    """
    Convert a .docx document to an HTML document.

    :param document: Document whose ``content`` holds the raw .docx bytes.
    :return: A new Document containing the generated HTML encoded as UTF-8,
             with a ``.html`` suffix and the stem of the input document.
    """
    # mammoth returns a result object, not a string; the generated markup is
    # exposed on its ``value`` attribute. Convert once and read it from there.
    conversion = mammoth.convert_to_html(BytesIO(document.content))
    html_content = conversion.value

    return Document.from_bytes(content=html_content.encode("utf-8"), suffix=".html", stem=document.stem)
|
||||
|
||||
File diff suppressed because one or more lines are too long
@@ -1,11 +1,15 @@
|
||||
import asyncio
|
||||
from dataclasses import dataclass
|
||||
from dataclasses import dataclass, field
|
||||
from io import BytesIO
|
||||
from typing import Self, Literal, List, Dict, Any, Tuple
|
||||
|
||||
import docx
|
||||
from docx.document import Document as DocumentObject
|
||||
from docx.table import _Cell
|
||||
from docx.oxml.ns import nsdecls
|
||||
from docx.oxml import OxmlElement
|
||||
from docx.table import _Cell, Table
|
||||
from docx.text.paragraph import Paragraph
|
||||
from docx.text.run import Run
|
||||
|
||||
from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent
|
||||
from docutranslate.ir.document import Document
|
||||
@@ -13,6 +17,12 @@ from docutranslate.translator.ai_translator.base import AiTranslatorConfig
|
||||
from docutranslate.translator.base import Translator
|
||||
|
||||
|
||||
def is_image_run(run: Run) -> bool:
    """
    Report whether a run contains a picture.

    ``w:drawing`` marks an embedded picture and ``w:pict`` marks a VML picture.

    :param run: The run to inspect; its ``element.xml`` string is searched.
    :return: True if either picture tag occurs in the run's XML, else False.
    """
    # Read the XML string once instead of evaluating ``.xml`` for each check.
    run_xml = run.element.xml
    return '<w:drawing' in run_xml or '<w:pict' in run_xml
|
||||
|
||||
|
||||
@dataclass
|
||||
class DocxTranslatorConfig(AiTranslatorConfig):
|
||||
"""
|
||||
@@ -25,6 +35,7 @@ class DocxTranslatorConfig(AiTranslatorConfig):
|
||||
class DocxTranslator(Translator):
|
||||
"""
|
||||
用于翻译 .docx 文件的翻译器。
|
||||
此版本经过优化,可以处理图文混排的段落而不会丢失图片。
|
||||
"""
|
||||
|
||||
def __init__(self, config: DocxTranslatorConfig):
|
||||
@@ -49,74 +60,90 @@ class DocxTranslator(Translator):
|
||||
|
||||
def _pre_translate(self, document: Document) -> Tuple[DocumentObject, List[Dict[str, Any]], List[str]]:
    """
    Parse a .docx payload and collect its translatable text at run level.

    Within each paragraph, consecutive non-image runs are grouped into one
    translation unit. A run that contains a picture closes the current group
    and is never added to a group itself, so later write-back does not touch it.

    :param document: Document whose ``content`` holds the raw .docx bytes.
    :return: A tuple containing:
             - the parsed docx document object,
             - a list of ``{"type": "text_runs", "runs": [...]}`` entries, one per group,
             - the source text of each group, index-aligned with that list.
    """
    doc = docx.Document(BytesIO(document.content))
    elements_to_translate: List[Dict[str, Any]] = []
    original_texts: List[str] = []

    def flush(group_runs: list, group_text: str) -> None:
        # Groups made only of whitespace carry nothing to translate.
        if group_text.strip():
            elements_to_translate.append({"type": "text_runs", "runs": group_runs})
            original_texts.append(group_text)

    def process_paragraph(para: Paragraph) -> None:
        group_runs: list = []
        group_parts: List[str] = []
        for run in para.runs:
            if is_image_run(run):
                # A picture ends the current text group; start a fresh one after it.
                flush(group_runs, "".join(group_parts))
                group_runs, group_parts = [], []
            else:
                group_runs.append(run)
                group_parts.append(run.text)
        # Trailing text after the last picture (or the whole paragraph if none).
        flush(group_runs, "".join(group_parts))

    # Body paragraphs.
    for para in doc.paragraphs:
        process_paragraph(para)

    # Table cells. python-docx reports a merged cell once per grid position it
    # spans, so remember the underlying w:tc elements to avoid collecting (and
    # translating) the same runs more than once. Holding the elements in the
    # set also keeps them alive, so membership checks stay reliable.
    seen_cells = set()
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                tc = cell._tc
                if tc in seen_cells:
                    continue
                seen_cells.add(tc)
                for para in cell.paragraphs:
                    process_paragraph(para)

    return doc, elements_to_translate, original_texts
|
||||
|
||||
def _after_translate(self, doc: DocumentObject, elements_to_translate: List[Dict[str, Any]],
|
||||
translated_texts: List[str], original_texts: List[str]) -> bytes:
|
||||
"""
|
||||
将翻译后的文本写回到 .docx 对象中。
|
||||
|
||||
:param doc: docx.Document 对象。
|
||||
:param elements_to_translate: 包含文本元素信息的列表。
|
||||
:param translated_texts: 翻译后的文本列表。
|
||||
:param original_texts: 原始文本列表。
|
||||
:return: 更新后的 .docx 文件内容的字节流。
|
||||
[已重构] 将翻译后的文本写回到对应的 text runs 中,保留图片和样式。
|
||||
"""
|
||||
translation_map = dict(zip(original_texts, translated_texts))
|
||||
|
||||
for i, element_info in enumerate(elements_to_translate):
|
||||
element = element_info["element"]
|
||||
runs = element_info["runs"]
|
||||
original_text = original_texts[i]
|
||||
translated_text = translated_texts[i]
|
||||
|
||||
# 清空原有内容并写入新内容
|
||||
if isinstance(element, docx.text.paragraph.Paragraph):
|
||||
# 清空段落内容
|
||||
element.clear()
|
||||
# 根据插入模式添加文本
|
||||
if self.insert_mode == "replace":
|
||||
element.add_run(translated_text)
|
||||
elif self.insert_mode == "append":
|
||||
element.add_run(original_text + self.separator + translated_text)
|
||||
elif self.insert_mode == "prepend":
|
||||
element.add_run(translated_text + self.separator + original_text)
|
||||
else:
|
||||
self.logger.error("不正确的DocxTranslatorConfig参数")
|
||||
# 根据插入模式确定最终文本
|
||||
if self.insert_mode == "replace":
|
||||
final_text = translated_text
|
||||
elif self.insert_mode == "append":
|
||||
final_text = original_text + self.separator + translated_text
|
||||
elif self.insert_mode == "prepend":
|
||||
final_text = translated_text + self.separator + original_text
|
||||
else:
|
||||
self.logger.error("不正确的DocxTranslatorConfig参数")
|
||||
final_text = translated_text
|
||||
|
||||
elif isinstance(element, _Cell):
|
||||
# 根据插入模式设置单元格文本
|
||||
if self.insert_mode == "replace":
|
||||
element.text = translated_text
|
||||
elif self.insert_mode == "append":
|
||||
element.text = original_text + self.separator + translated_text
|
||||
elif self.insert_mode == "prepend":
|
||||
element.text = translated_text + self.separator + original_text
|
||||
else:
|
||||
self.logger.error("不正确的DocxTranslatorConfig参数")
|
||||
if not runs:
|
||||
continue
|
||||
|
||||
# --- 这是修改的核心部分 ---
|
||||
# 1. 将完整的翻译文本写入第一个 run
|
||||
first_run = runs[0]
|
||||
first_run.text = final_text
|
||||
|
||||
# 2. 清空该文本块中其余 run 的内容,但保留 run 本身及其格式
|
||||
# 这可以防止重复文本,同时保留文档结构
|
||||
for run in runs[1:]:
|
||||
run.text = ""
|
||||
# --- 修改结束 ---
|
||||
|
||||
# 将修改后的文档保存到 BytesIO 流
|
||||
doc_output_stream = BytesIO()
|
||||
@@ -128,8 +155,9 @@ class DocxTranslator(Translator):
|
||||
同步翻译 .docx 文件。
|
||||
"""
|
||||
doc, elements_to_translate, original_texts = self._pre_translate(document)
|
||||
if not elements_to_translate:
|
||||
if not original_texts:
|
||||
print("\n文件中没有找到需要翻译的文本内容。")
|
||||
document.content = doc.save(BytesIO()).getvalue() # 返回原文件
|
||||
return self
|
||||
|
||||
# 调用翻译 agent
|
||||
@@ -144,8 +172,12 @@ class DocxTranslator(Translator):
|
||||
异步翻译 .docx 文件。
|
||||
"""
|
||||
doc, elements_to_translate, original_texts = await asyncio.to_thread(self._pre_translate, document)
|
||||
if not elements_to_translate:
|
||||
if not original_texts:
|
||||
print("\n文件中没有找到需要翻译的文本内容。")
|
||||
# 在异步环境中正确保存和返回
|
||||
output_stream = BytesIO()
|
||||
doc.save(output_stream)
|
||||
document.content = output_stream.getvalue()
|
||||
return self
|
||||
|
||||
# 异步调用翻译 agent
|
||||
|
||||
Reference in New Issue
Block a user