优化docxtranslator
This commit is contained in:
@@ -31,13 +31,16 @@ from docutranslate.exporter.md.types import ConvertEngineType
|
|||||||
# --- 核心代码 Imports ---
|
# --- 核心代码 Imports ---
|
||||||
from docutranslate.global_values.conditional_import import DOCLING_EXIST
|
from docutranslate.global_values.conditional_import import DOCLING_EXIST
|
||||||
from docutranslate.workflow.base import Workflow
|
from docutranslate.workflow.base import Workflow
|
||||||
from docutranslate.workflow.interfaces import HTMLExportable, MDFormatsExportable, TXTExportable, JsonExportable
|
from docutranslate.workflow.interfaces import HTMLExportable, MDFormatsExportable, TXTExportable, JsonExportable, \
|
||||||
|
XlsxExportable
|
||||||
|
# --- [NEW] DOCX 工作流 Imports ---
|
||||||
|
from docutranslate.workflow.interfaces import DocxExportable
|
||||||
from docutranslate.workflow.md_based_workflow import MarkdownBasedWorkflow, MarkdownBasedWorkflowConfig
|
from docutranslate.workflow.md_based_workflow import MarkdownBasedWorkflow, MarkdownBasedWorkflowConfig
|
||||||
from docutranslate.workflow.txt_workflow import TXTWorkflow, TXTWorkflowConfig
|
from docutranslate.workflow.txt_workflow import TXTWorkflow, TXTWorkflowConfig
|
||||||
from docutranslate.workflow.json_workflow import JsonWorkflow, JsonWorkflowConfig
|
from docutranslate.workflow.json_workflow import JsonWorkflow, JsonWorkflowConfig
|
||||||
# --- [NEW] XLSX 工作流 Imports ---
|
|
||||||
from docutranslate.workflow.xlsx_workflow import XlsxWorkflow, XlsxWorkflowConfig
|
from docutranslate.workflow.xlsx_workflow import XlsxWorkflow, XlsxWorkflowConfig
|
||||||
from docutranslate.workflow.interfaces import XlsxExportable
|
# --- [NEW] DOCX 工作流 Imports ---
|
||||||
|
from docutranslate.workflow.docx_workflow import DocxWorkflow, DocxWorkflowConfig
|
||||||
|
|
||||||
if DOCLING_EXIST or TYPE_CHECKING:
|
if DOCLING_EXIST or TYPE_CHECKING:
|
||||||
from docutranslate.converter.x2md.converter_docling import ConverterDoclingConfig
|
from docutranslate.converter.x2md.converter_docling import ConverterDoclingConfig
|
||||||
@@ -48,9 +51,11 @@ from docutranslate.translator.ai_translator.md_translator import MDTranslatorCon
|
|||||||
from docutranslate.translator.ai_translator.txt_translator import TXTTranslatorConfig
|
from docutranslate.translator.ai_translator.txt_translator import TXTTranslatorConfig
|
||||||
from docutranslate.translator.ai_translator.json_translator import JsonTranslatorConfig
|
from docutranslate.translator.ai_translator.json_translator import JsonTranslatorConfig
|
||||||
from docutranslate.exporter.js.json2html_exporter import Json2HTMLExporterConfig
|
from docutranslate.exporter.js.json2html_exporter import Json2HTMLExporterConfig
|
||||||
# --- [NEW] XLSX 工作流相关配置 Imports ---
|
|
||||||
from docutranslate.translator.ai_translator.xlsx_translator import XlsxTranslatorConfig
|
from docutranslate.translator.ai_translator.xlsx_translator import XlsxTranslatorConfig
|
||||||
from docutranslate.exporter.xlsx.xlsx2html_exporter import Xlsx2HTMLExporterConfig
|
from docutranslate.exporter.xlsx.xlsx2html_exporter import Xlsx2HTMLExporterConfig
|
||||||
|
# --- [NEW] DOCX 工作流相关配置 Imports ---
|
||||||
|
from docutranslate.translator.ai_translator.docx_translator import DocxTranslatorConfig
|
||||||
|
from docutranslate.exporter.docx.docx2html_exporter import Docx2HTMLExporterConfig
|
||||||
# ------------------------------------
|
# ------------------------------------
|
||||||
|
|
||||||
from docutranslate.logger import global_logger
|
from docutranslate.logger import global_logger
|
||||||
@@ -69,7 +74,8 @@ WORKFLOW_DICT: Dict[str, Type[Workflow]] = {
|
|||||||
"markdown_based": MarkdownBasedWorkflow,
|
"markdown_based": MarkdownBasedWorkflow,
|
||||||
"txt": TXTWorkflow,
|
"txt": TXTWorkflow,
|
||||||
"json": JsonWorkflow,
|
"json": JsonWorkflow,
|
||||||
"xlsx": XlsxWorkflow, # <--- 新增 XLSX 工作流
|
"xlsx": XlsxWorkflow,
|
||||||
|
"docx": DocxWorkflow, # <--- 新增 DOCX 工作流
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -228,7 +234,6 @@ class JsonWorkflowParams(BaseWorkflowParams):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
# --- [NEW] XLSX 工作流参数模型 ---
|
|
||||||
class XlsxWorkflowParams(BaseWorkflowParams):
|
class XlsxWorkflowParams(BaseWorkflowParams):
|
||||||
workflow_type: Literal['xlsx'] = Field(..., description="指定使用XLSX的翻译工作流。")
|
workflow_type: Literal['xlsx'] = Field(..., description="指定使用XLSX的翻译工作流。")
|
||||||
insert_mode: Literal["replace", "append", "prepend"] = Field(
|
insert_mode: Literal["replace", "append", "prepend"] = Field(
|
||||||
@@ -241,9 +246,22 @@ class XlsxWorkflowParams(BaseWorkflowParams):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# --- [NEW] DOCX 工作流参数模型 ---
|
||||||
|
class DocxWorkflowParams(BaseWorkflowParams):
|
||||||
|
workflow_type: Literal['docx'] = Field(..., description="指定使用DOCX的翻译工作流。")
|
||||||
|
insert_mode: Literal["replace", "append", "prepend"] = Field(
|
||||||
|
"replace",
|
||||||
|
description="翻译文本的插入模式。'replace':替换原文,'append':附加到原文后,'prepend':附加到原文前。"
|
||||||
|
)
|
||||||
|
separator: str = Field(
|
||||||
|
"\n",
|
||||||
|
description="当 insert_mode 为 'append' 或 'prepend' 时,用于分隔原文和译文的分隔符。"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# 3. [MODIFIED] 使用可辨识联合类型(Discriminated Union)将它们组合起来
|
# 3. [MODIFIED] 使用可辨识联合类型(Discriminated Union)将它们组合起来
|
||||||
TranslatePayload = Annotated[
|
TranslatePayload = Annotated[
|
||||||
Union[MarkdownWorkflowParams, TextWorkflowParams, JsonWorkflowParams, XlsxWorkflowParams], # <-- 新增 XlsxWorkflowParams
|
Union[MarkdownWorkflowParams, TextWorkflowParams, JsonWorkflowParams, XlsxWorkflowParams, DocxWorkflowParams], # <-- 新增 DocxWorkflowParams
|
||||||
Field(discriminator='workflow_type')
|
Field(discriminator='workflow_type')
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -288,7 +306,6 @@ class TranslateServiceRequest(BaseModel):
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
# --- [NEW] XLSX 工作流示例 ---
|
|
||||||
{
|
{
|
||||||
"summary": "XLSX 工作流示例",
|
"summary": "XLSX 工作流示例",
|
||||||
"value": {
|
"value": {
|
||||||
@@ -306,6 +323,22 @@ class TranslateServiceRequest(BaseModel):
|
|||||||
"concurrent": 5
|
"concurrent": 5
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
},
|
||||||
|
# --- [NEW] DOCX 工作流示例 ---
|
||||||
|
{
|
||||||
|
"summary": "DOCX 工作流示例",
|
||||||
|
"value": {
|
||||||
|
"file_name": "contract.docx",
|
||||||
|
"file_content": "UEsDBBQAAAAIA... (base64-encoded docx)",
|
||||||
|
"payload": {
|
||||||
|
"workflow_type": "docx",
|
||||||
|
"base_url": "https://api.openai.com/v1",
|
||||||
|
"api_key": "sk-your-api-key-here",
|
||||||
|
"model_id": "gpt-4o",
|
||||||
|
"to_lang": "English",
|
||||||
|
"insert_mode": "replace",
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
@@ -395,14 +428,13 @@ async def _perform_translation(
|
|||||||
)
|
)
|
||||||
workflow = JsonWorkflow(config=workflow_config)
|
workflow = JsonWorkflow(config=workflow_config)
|
||||||
|
|
||||||
# --- [NEW] XLSX 工作流处理逻辑 ---
|
|
||||||
elif isinstance(payload, XlsxWorkflowParams):
|
elif isinstance(payload, XlsxWorkflowParams):
|
||||||
task_logger.info("构建 XlsxWorkflow 配置。")
|
task_logger.info("构建 XlsxWorkflow 配置。")
|
||||||
translator_config = XlsxTranslatorConfig(
|
translator_config = XlsxTranslatorConfig(
|
||||||
**payload.model_dump(include={
|
**payload.model_dump(include={
|
||||||
'base_url', 'api_key', 'model_id', 'to_lang', 'custom_prompt',
|
'base_url', 'api_key', 'model_id', 'to_lang', 'custom_prompt',
|
||||||
'temperature', 'thinking', 'chunk_size', 'concurrent',
|
'temperature', 'thinking', 'chunk_size', 'concurrent',
|
||||||
'insert_mode', 'separator' # 包含XLSX特定参数
|
'insert_mode', 'separator'
|
||||||
}, exclude_none=True)
|
}, exclude_none=True)
|
||||||
)
|
)
|
||||||
html_exporter_config = Xlsx2HTMLExporterConfig(cdn=True)
|
html_exporter_config = Xlsx2HTMLExporterConfig(cdn=True)
|
||||||
@@ -413,6 +445,24 @@ async def _perform_translation(
|
|||||||
)
|
)
|
||||||
workflow = XlsxWorkflow(config=workflow_config)
|
workflow = XlsxWorkflow(config=workflow_config)
|
||||||
|
|
||||||
|
# --- [NEW] DOCX 工作流处理逻辑 ---
|
||||||
|
elif isinstance(payload, DocxWorkflowParams):
|
||||||
|
task_logger.info("构建 DocxWorkflow 配置。")
|
||||||
|
translator_config = DocxTranslatorConfig(
|
||||||
|
**payload.model_dump(include={
|
||||||
|
'base_url', 'api_key', 'model_id', 'to_lang', 'custom_prompt',
|
||||||
|
'temperature', 'thinking', 'chunk_size', 'concurrent',
|
||||||
|
'insert_mode', 'separator' # 包含DOCX特定参数
|
||||||
|
}, exclude_none=True)
|
||||||
|
)
|
||||||
|
html_exporter_config = Docx2HTMLExporterConfig(cdn=True)
|
||||||
|
workflow_config = DocxWorkflowConfig(
|
||||||
|
translator_config=translator_config,
|
||||||
|
html_exporter_config=html_exporter_config,
|
||||||
|
logger=task_logger
|
||||||
|
)
|
||||||
|
workflow = DocxWorkflow(config=workflow_config)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
raise TypeError(f"工作流类型 '{payload.workflow_type}' 的处理逻辑未实现。")
|
raise TypeError(f"工作流类型 '{payload.workflow_type}' 的处理逻辑未实现。")
|
||||||
|
|
||||||
@@ -539,7 +589,7 @@ def _cancel_translation_logic(task_id: str):
|
|||||||
description="""
|
description="""
|
||||||
接收一个包含文件内容(Base64编码)和工作流参数的JSON请求,启动一个后台翻译任务。
|
接收一个包含文件内容(Base64编码)和工作流参数的JSON请求,启动一个后台翻译任务。
|
||||||
|
|
||||||
- **工作流选择**: 请求体中的 `payload.workflow_type` 字段决定了本次任务的类型(如 `markdown_based`, `txt`, `json`, `xlsx`)。
|
- **工作流选择**: 请求体中的 `payload.workflow_type` 字段决定了本次任务的类型(如 `markdown_based`, `txt`, `json`, `xlsx`, `docx`)。
|
||||||
- **动态参数**: 根据所选工作流,API需要不同的参数集。请参考下面的Schema或示例。
|
- **动态参数**: 根据所选工作流,API需要不同的参数集。请参考下面的Schema或示例。
|
||||||
- **异步处理**: 此端点会立即返回任务ID,客户端需轮询状态接口获取进度。
|
- **异步处理**: 此端点会立即返回任务ID,客户端需轮询状态接口获取进度。
|
||||||
|
|
||||||
@@ -606,21 +656,21 @@ async def service_release_task(task_id: str):
|
|||||||
"application/json": {
|
"application/json": {
|
||||||
"examples": {
|
"examples": {
|
||||||
# ... 其他示例保持不变 ...
|
# ... 其他示例保持不变 ...
|
||||||
"completed_xlsx": {
|
"completed_docx": {
|
||||||
"summary": "已完成 (XLSX)",
|
"summary": "已完成 (DOCX)",
|
||||||
"value": {
|
"value": {
|
||||||
"task_id": "e5b98cc6",
|
"task_id": "f8a9c1b2",
|
||||||
"is_processing": False,
|
"is_processing": False,
|
||||||
"status_message": "翻译成功!用时 18.99 秒。",
|
"status_message": "翻译成功!用时 25.10 秒。",
|
||||||
"error_flag": False,
|
"error_flag": False,
|
||||||
"download_ready": True,
|
"download_ready": True,
|
||||||
"original_filename_stem": "product_list",
|
"original_filename_stem": "contract",
|
||||||
"original_filename": "product_list.xlsx",
|
"original_filename": "contract.docx",
|
||||||
"task_start_time": 1678889400.123,
|
"task_start_time": 1678889500.123,
|
||||||
"task_end_time": 1678889419.113,
|
"task_end_time": 1678889525.223,
|
||||||
"downloads": {
|
"downloads": {
|
||||||
"xlsx": "/service/download/e5b98cc6/xlsx",
|
"docx": "/service/download/f8a9c1b2/docx",
|
||||||
"html": "/service/download/e5b98cc6/html"
|
"html": "/service/download/f8a9c1b2/html"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
@@ -650,9 +700,11 @@ async def service_get_status(
|
|||||||
downloads["txt"] = f"/service/download/{task_id}/txt"
|
downloads["txt"] = f"/service/download/{task_id}/txt"
|
||||||
if isinstance(workflow, JsonExportable):
|
if isinstance(workflow, JsonExportable):
|
||||||
downloads["json"] = f"/service/download/{task_id}/json"
|
downloads["json"] = f"/service/download/{task_id}/json"
|
||||||
# --- [NEW] 新增对 XLSX 导出的支持 ---
|
|
||||||
if isinstance(workflow, XlsxExportable):
|
if isinstance(workflow, XlsxExportable):
|
||||||
downloads["xlsx"] = f"/service/download/{task_id}/xlsx"
|
downloads["xlsx"] = f"/service/download/{task_id}/xlsx"
|
||||||
|
# --- [NEW] 新增对 DOCX 导出的支持 ---
|
||||||
|
if isinstance(workflow, DocxExportable):
|
||||||
|
downloads["docx"] = f"/service/download/{task_id}/docx"
|
||||||
|
|
||||||
return JSONResponse(content={
|
return JSONResponse(content={
|
||||||
"task_id": task_id,
|
"task_id": task_id,
|
||||||
@@ -683,8 +735,8 @@ async def service_get_logs(task_id: str):
|
|||||||
return JSONResponse(content={"logs": new_logs})
|
return JSONResponse(content={"logs": new_logs})
|
||||||
|
|
||||||
|
|
||||||
# [MODIFIED] 扩展 FileType 以包含 'xlsx'
|
# [MODIFIED] 扩展 FileType 以包含 'docx'
|
||||||
FileType = Literal["markdown", "markdown_zip", "html", "txt", "json", "xlsx"]
|
FileType = Literal["markdown", "markdown_zip", "html", "txt", "json", "xlsx", "docx"]
|
||||||
|
|
||||||
|
|
||||||
async def _get_content_from_workflow(task_id: str, file_type: FileType) -> tuple[bytes, str, str]:
|
async def _get_content_from_workflow(task_id: str, file_type: FileType) -> tuple[bytes, str, str]:
|
||||||
@@ -698,7 +750,9 @@ async def _get_content_from_workflow(task_id: str, file_type: FileType) -> tuple
|
|||||||
filename_stem = task_state['original_filename_stem']
|
filename_stem = task_state['original_filename_stem']
|
||||||
|
|
||||||
try:
|
try:
|
||||||
content_bytes: bytes; media_type: str; filename: str
|
content_bytes: bytes
|
||||||
|
media_type: str
|
||||||
|
filename: str
|
||||||
|
|
||||||
html_config = None
|
html_config = None
|
||||||
if file_type == 'html':
|
if file_type == 'html':
|
||||||
@@ -712,8 +766,9 @@ async def _get_content_from_workflow(task_id: str, file_type: FileType) -> tuple
|
|||||||
if isinstance(workflow, MarkdownBasedWorkflow): html_config = MD2HTMLExporterConfig(cdn=is_cdn_available)
|
if isinstance(workflow, MarkdownBasedWorkflow): html_config = MD2HTMLExporterConfig(cdn=is_cdn_available)
|
||||||
elif isinstance(workflow, TXTWorkflow): html_config = TXT2HTMLExporterConfig(cdn=is_cdn_available)
|
elif isinstance(workflow, TXTWorkflow): html_config = TXT2HTMLExporterConfig(cdn=is_cdn_available)
|
||||||
elif isinstance(workflow, JsonWorkflow): html_config = Json2HTMLExporterConfig(cdn=is_cdn_available)
|
elif isinstance(workflow, JsonWorkflow): html_config = Json2HTMLExporterConfig(cdn=is_cdn_available)
|
||||||
# --- [NEW] 新增对 XLSX->HTML 的支持 ---
|
|
||||||
elif isinstance(workflow, XlsxWorkflow): html_config = Xlsx2HTMLExporterConfig(cdn=is_cdn_available)
|
elif isinstance(workflow, XlsxWorkflow): html_config = Xlsx2HTMLExporterConfig(cdn=is_cdn_available)
|
||||||
|
# --- [NEW] 新增对 DOCX->HTML 的支持 ---
|
||||||
|
elif isinstance(workflow, DocxWorkflow): html_config = Docx2HTMLExporterConfig(cdn=is_cdn_available)
|
||||||
|
|
||||||
if file_type == 'html' and isinstance(workflow, HTMLExportable):
|
if file_type == 'html' and isinstance(workflow, HTMLExportable):
|
||||||
content_str = workflow.export_to_html(html_config)
|
content_str = workflow.export_to_html(html_config)
|
||||||
@@ -729,10 +784,13 @@ async def _get_content_from_workflow(task_id: str, file_type: FileType) -> tuple
|
|||||||
elif file_type == 'json' and isinstance(workflow, JsonExportable):
|
elif file_type == 'json' and isinstance(workflow, JsonExportable):
|
||||||
json_content = workflow.export_to_json()
|
json_content = workflow.export_to_json()
|
||||||
content_bytes, media_type, filename = json_content.encode('utf-8'), "application/json; charset=utf-8", f"{filename_stem}_translated.json"
|
content_bytes, media_type, filename = json_content.encode('utf-8'), "application/json; charset=utf-8", f"{filename_stem}_translated.json"
|
||||||
# --- [NEW] XLSX 导出逻辑 ---
|
|
||||||
elif file_type == 'xlsx' and isinstance(workflow, XlsxExportable):
|
elif file_type == 'xlsx' and isinstance(workflow, XlsxExportable):
|
||||||
content_bytes = workflow.export_to_xlsx()
|
content_bytes = workflow.export_to_xlsx()
|
||||||
media_type, filename = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", f"{filename_stem}_translated.xlsx"
|
media_type, filename = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", f"{filename_stem}_translated.xlsx"
|
||||||
|
# --- [NEW] DOCX 导出逻辑 ---
|
||||||
|
elif file_type == 'docx' and isinstance(workflow, DocxExportable):
|
||||||
|
content_bytes = workflow.export_to_docx()
|
||||||
|
media_type, filename = "application/vnd.openxmlformats-officedocument.wordprocessingml.document", f"{filename_stem}_translated.docx"
|
||||||
else:
|
else:
|
||||||
raise HTTPException(status_code=404, detail=f"此任务不支持导出 '{file_type}' 类型的文件。")
|
raise HTTPException(status_code=404, detail=f"此任务不支持导出 '{file_type}' 类型的文件。")
|
||||||
|
|
||||||
@@ -751,7 +809,8 @@ async def _get_content_from_workflow(task_id: str, file_type: FileType) -> tuple
|
|||||||
"content": {
|
"content": {
|
||||||
# ... 其他类型保持不变 ...
|
# ... 其他类型保持不变 ...
|
||||||
"application/json": {"schema": {"type": "string", "format": "binary"}},
|
"application/json": {"schema": {"type": "string", "format": "binary"}},
|
||||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": {"schema": {"type": "string", "format": "binary"}}, # <-- [NEW]
|
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": {"schema": {"type": "string", "format": "binary"}},
|
||||||
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": {"schema": {"type": "string", "format": "binary"}}, # <-- [NEW]
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
# ... 其他 response 保持不变 ...
|
# ... 其他 response 保持不变 ...
|
||||||
@@ -759,7 +818,7 @@ async def _get_content_from_workflow(task_id: str, file_type: FileType) -> tuple
|
|||||||
)
|
)
|
||||||
async def service_download_file(
|
async def service_download_file(
|
||||||
task_id: str = FastApiPath(..., description="已完成任务的ID", examples=["b2865b93"]),
|
task_id: str = FastApiPath(..., description="已完成任务的ID", examples=["b2865b93"]),
|
||||||
file_type: FileType = FastApiPath(..., description="要下载的文件类型。", examples=["html", "json", "xlsx"])
|
file_type: FileType = FastApiPath(..., description="要下载的文件类型。", examples=["html", "json", "docx"])
|
||||||
):
|
):
|
||||||
content, media_type, filename = await _get_content_from_workflow(task_id, file_type)
|
content, media_type, filename = await _get_content_from_workflow(task_id, file_type)
|
||||||
headers = {"Content-Disposition": f"attachment; filename*=UTF-8''{quote(filename, safe='', encoding='utf-8')}"}
|
headers = {"Content-Disposition": f"attachment; filename*=UTF-8''{quote(filename, safe='', encoding='utf-8')}"}
|
||||||
@@ -773,7 +832,7 @@ async def service_download_file(
|
|||||||
...
|
...
|
||||||
- **内容编码**:
|
- **内容编码**:
|
||||||
- 对于 `html`, `markdown`, `txt`, `json` 类型, `content` 字段包含原始的文本内容。
|
- 对于 `html`, `markdown`, `txt`, `json` 类型, `content` 字段包含原始的文本内容。
|
||||||
- 对于 `markdown_zip`, `xlsx` 类型, `content` 字段包含Base64编码后的字符串。
|
- 对于 `markdown_zip`, `xlsx`, `docx` 类型, `content` 字段包含Base64编码后的字符串。
|
||||||
...
|
...
|
||||||
""",
|
""",
|
||||||
responses={
|
responses={
|
||||||
@@ -781,11 +840,11 @@ async def service_download_file(
|
|||||||
"description": "成功返回文件内容。",
|
"description": "成功返回文件内容。",
|
||||||
"content": { "application/json": { "examples": {
|
"content": { "application/json": { "examples": {
|
||||||
# ... 其他示例 ...
|
# ... 其他示例 ...
|
||||||
"xlsx_base64": {
|
"docx_base64": {
|
||||||
"summary": "XLSX 内容 (Base64)",
|
"summary": "DOCX 内容 (Base64)",
|
||||||
"value": {
|
"value": {
|
||||||
"file_type": "xlsx",
|
"file_type": "docx",
|
||||||
"filename": "my_sheet_translated.xlsx",
|
"filename": "my_doc_translated.docx",
|
||||||
"content": "UEsDBBQAAAAIA... (base64-encoded string)"
|
"content": "UEsDBBQAAAAIA... (base64-encoded string)"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -796,13 +855,13 @@ async def service_download_file(
|
|||||||
)
|
)
|
||||||
async def service_content(
|
async def service_content(
|
||||||
task_id: str = FastApiPath(..., description="已完成任务的ID", examples=["b2865b93"]),
|
task_id: str = FastApiPath(..., description="已完成任务的ID", examples=["b2865b93"]),
|
||||||
file_type: FileType = FastApiPath(..., description="要获取内容的文件类型。", examples=["html", "json", "xlsx"])
|
file_type: FileType = FastApiPath(..., description="要获取内容的文件类型。", examples=["html", "json", "docx"])
|
||||||
):
|
):
|
||||||
"""[MODIFIED] 根据任务ID和文件类型,以JSON格式返回内容。zip/xlsx文件会进行Base64编码。"""
|
"""[MODIFIED] 根据任务ID和文件类型,以JSON格式返回内容。zip/xlsx/docx文件会进行Base64编码。"""
|
||||||
content, _, filename = await _get_content_from_workflow(task_id, file_type)
|
content, _, filename = await _get_content_from_workflow(task_id, file_type)
|
||||||
|
|
||||||
final_content: str
|
final_content: str
|
||||||
if file_type in ['markdown_zip', 'xlsx']: # 二进制文件进行Base64编码
|
if file_type in ['markdown_zip', 'xlsx', 'docx']: # 二进制文件进行Base64编码
|
||||||
final_content = base64.b64encode(content).decode('utf-8')
|
final_content = base64.b64encode(content).decode('utf-8')
|
||||||
else: # 文本文件直接解码
|
else: # 文本文件直接解码
|
||||||
final_content = content.decode('utf-8')
|
final_content = content.decode('utf-8')
|
||||||
|
|||||||
@@ -20,6 +20,6 @@ class Docx2HTMLExporter(XlsxExporter):
|
|||||||
self.cdn = config.cdn
|
self.cdn = config.cdn
|
||||||
|
|
||||||
def export(self, document: Document) -> Document:
|
def export(self, document: Document) -> Document:
|
||||||
html_content = mammoth.convert_to_html(BytesIO(document.content))
|
html_content = mammoth.convert_to_html(BytesIO(document.content)).value
|
||||||
|
|
||||||
return Document.from_bytes(content=html_content.encode("utf-8"), suffix=".html", stem=document.stem)
|
return Document.from_bytes(content=html_content.encode("utf-8"), suffix=".html", stem=document.stem)
|
||||||
|
|||||||
File diff suppressed because one or more lines are too long
@@ -1,11 +1,15 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass, field
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from typing import Self, Literal, List, Dict, Any, Tuple
|
from typing import Self, Literal, List, Dict, Any, Tuple
|
||||||
|
|
||||||
import docx
|
import docx
|
||||||
from docx.document import Document as DocumentObject
|
from docx.document import Document as DocumentObject
|
||||||
from docx.table import _Cell
|
from docx.oxml.ns import nsdecls
|
||||||
|
from docx.oxml import OxmlElement
|
||||||
|
from docx.table import _Cell, Table
|
||||||
|
from docx.text.paragraph import Paragraph
|
||||||
|
from docx.text.run import Run
|
||||||
|
|
||||||
from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent
|
from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent
|
||||||
from docutranslate.ir.document import Document
|
from docutranslate.ir.document import Document
|
||||||
@@ -13,6 +17,12 @@ from docutranslate.translator.ai_translator.base import AiTranslatorConfig
|
|||||||
from docutranslate.translator.base import Translator
|
from docutranslate.translator.base import Translator
|
||||||
|
|
||||||
|
|
||||||
|
def is_image_run(run: Run) -> bool:
|
||||||
|
"""检查一个 run 是否包含图片。"""
|
||||||
|
# w:drawing 是嵌入式图片的标志, w:pict 是 VML 图片的标志
|
||||||
|
return '<w:drawing' in run.element.xml or '<w:pict' in run.element.xml
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class DocxTranslatorConfig(AiTranslatorConfig):
|
class DocxTranslatorConfig(AiTranslatorConfig):
|
||||||
"""
|
"""
|
||||||
@@ -25,6 +35,7 @@ class DocxTranslatorConfig(AiTranslatorConfig):
|
|||||||
class DocxTranslator(Translator):
|
class DocxTranslator(Translator):
|
||||||
"""
|
"""
|
||||||
用于翻译 .docx 文件的翻译器。
|
用于翻译 .docx 文件的翻译器。
|
||||||
|
此版本经过优化,可以处理图文混排的段落而不会丢失图片。
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config: DocxTranslatorConfig):
|
def __init__(self, config: DocxTranslatorConfig):
|
||||||
@@ -49,74 +60,90 @@ class DocxTranslator(Translator):
|
|||||||
|
|
||||||
def _pre_translate(self, document: Document) -> Tuple[DocumentObject, List[Dict[str, Any]], List[str]]:
|
def _pre_translate(self, document: Document) -> Tuple[DocumentObject, List[Dict[str, Any]], List[str]]:
|
||||||
"""
|
"""
|
||||||
预处理 .docx 文件,提取所有需要翻译的文本。
|
[已重构] 预处理 .docx 文件,在 Run 级别上提取文本,以避免破坏图片。
|
||||||
|
|
||||||
:param document: 包含 .docx 文件内容的 Document 对象。
|
:param document: 包含 .docx 文件内容的 Document 对象。
|
||||||
:return: 一个元组,包含:
|
:return: 一个元组,包含:
|
||||||
- docx.Document 对象
|
- docx.Document 对象
|
||||||
- 一个包含文本元素信息的列表 (e.g., paragraph, cell)
|
- 一个包含文本块信息的列表 (每个元素代表一组连续的文本 run)
|
||||||
- 一个包含所有待翻译原文的列表
|
- 一个包含所有待翻译原文的列表
|
||||||
"""
|
"""
|
||||||
doc = docx.Document(BytesIO(document.content))
|
doc = docx.Document(BytesIO(document.content))
|
||||||
elements_to_translate = []
|
elements_to_translate = []
|
||||||
original_texts = []
|
original_texts = []
|
||||||
|
|
||||||
|
def process_paragraph(para: Paragraph):
|
||||||
|
nonlocal elements_to_translate, original_texts
|
||||||
|
current_text_segment = ""
|
||||||
|
current_runs = []
|
||||||
|
|
||||||
|
for run in para.runs:
|
||||||
|
if is_image_run(run):
|
||||||
|
# 遇到图片,将之前累积的文本作为一个翻译单元
|
||||||
|
if current_text_segment.strip():
|
||||||
|
elements_to_translate.append({"type": "text_runs", "runs": current_runs})
|
||||||
|
original_texts.append(current_text_segment)
|
||||||
|
# 重置累加器
|
||||||
|
current_text_segment = ""
|
||||||
|
current_runs = []
|
||||||
|
else:
|
||||||
|
# 累积文本 run
|
||||||
|
current_runs.append(run)
|
||||||
|
current_text_segment += run.text
|
||||||
|
|
||||||
|
# 处理段落末尾的最后一个文本块
|
||||||
|
if current_text_segment.strip():
|
||||||
|
elements_to_translate.append({"type": "text_runs", "runs": current_runs})
|
||||||
|
original_texts.append(current_text_segment)
|
||||||
|
|
||||||
# 遍历所有段落
|
# 遍历所有段落
|
||||||
for para in doc.paragraphs:
|
for para in doc.paragraphs:
|
||||||
if para.text.strip(): # 确保段落有实际内容
|
process_paragraph(para)
|
||||||
elements_to_translate.append({"type": "paragraph", "element": para})
|
|
||||||
original_texts.append(para.text)
|
|
||||||
|
|
||||||
# 遍历所有表格
|
# 遍历所有表格
|
||||||
for table in doc.tables:
|
for table in doc.tables:
|
||||||
for row in table.rows:
|
for row in table.rows:
|
||||||
for cell in row.cells:
|
for cell in row.cells:
|
||||||
if cell.text.strip(): # 确保单元格有实际内容
|
for para in cell.paragraphs:
|
||||||
elements_to_translate.append({"type": "cell", "element": cell})
|
process_paragraph(para)
|
||||||
original_texts.append(cell.text)
|
|
||||||
|
|
||||||
return doc, elements_to_translate, original_texts
|
return doc, elements_to_translate, original_texts
|
||||||
|
|
||||||
def _after_translate(self, doc: DocumentObject, elements_to_translate: List[Dict[str, Any]],
|
def _after_translate(self, doc: DocumentObject, elements_to_translate: List[Dict[str, Any]],
|
||||||
translated_texts: List[str], original_texts: List[str]) -> bytes:
|
translated_texts: List[str], original_texts: List[str]) -> bytes:
|
||||||
"""
|
"""
|
||||||
将翻译后的文本写回到 .docx 对象中。
|
[已重构] 将翻译后的文本写回到对应的 text runs 中,保留图片和样式。
|
||||||
|
|
||||||
:param doc: docx.Document 对象。
|
|
||||||
:param elements_to_translate: 包含文本元素信息的列表。
|
|
||||||
:param translated_texts: 翻译后的文本列表。
|
|
||||||
:param original_texts: 原始文本列表。
|
|
||||||
:return: 更新后的 .docx 文件内容的字节流。
|
|
||||||
"""
|
"""
|
||||||
|
translation_map = dict(zip(original_texts, translated_texts))
|
||||||
|
|
||||||
for i, element_info in enumerate(elements_to_translate):
|
for i, element_info in enumerate(elements_to_translate):
|
||||||
element = element_info["element"]
|
runs = element_info["runs"]
|
||||||
original_text = original_texts[i]
|
original_text = original_texts[i]
|
||||||
translated_text = translated_texts[i]
|
translated_text = translated_texts[i]
|
||||||
|
|
||||||
# 清空原有内容并写入新内容
|
# 根据插入模式确定最终文本
|
||||||
if isinstance(element, docx.text.paragraph.Paragraph):
|
|
||||||
# 清空段落内容
|
|
||||||
element.clear()
|
|
||||||
# 根据插入模式添加文本
|
|
||||||
if self.insert_mode == "replace":
|
if self.insert_mode == "replace":
|
||||||
element.add_run(translated_text)
|
final_text = translated_text
|
||||||
elif self.insert_mode == "append":
|
elif self.insert_mode == "append":
|
||||||
element.add_run(original_text + self.separator + translated_text)
|
final_text = original_text + self.separator + translated_text
|
||||||
elif self.insert_mode == "prepend":
|
elif self.insert_mode == "prepend":
|
||||||
element.add_run(translated_text + self.separator + original_text)
|
final_text = translated_text + self.separator + original_text
|
||||||
else:
|
else:
|
||||||
self.logger.error("不正确的DocxTranslatorConfig参数")
|
self.logger.error("不正确的DocxTranslatorConfig参数")
|
||||||
|
final_text = translated_text
|
||||||
|
|
||||||
elif isinstance(element, _Cell):
|
if not runs:
|
||||||
# 根据插入模式设置单元格文本
|
continue
|
||||||
if self.insert_mode == "replace":
|
|
||||||
element.text = translated_text
|
# --- 这是修改的核心部分 ---
|
||||||
elif self.insert_mode == "append":
|
# 1. 将完整的翻译文本写入第一个 run
|
||||||
element.text = original_text + self.separator + translated_text
|
first_run = runs[0]
|
||||||
elif self.insert_mode == "prepend":
|
first_run.text = final_text
|
||||||
element.text = translated_text + self.separator + original_text
|
|
||||||
else:
|
# 2. 清空该文本块中其余 run 的内容,但保留 run 本身及其格式
|
||||||
self.logger.error("不正确的DocxTranslatorConfig参数")
|
# 这可以防止重复文本,同时保留文档结构
|
||||||
|
for run in runs[1:]:
|
||||||
|
run.text = ""
|
||||||
|
# --- 修改结束 ---
|
||||||
|
|
||||||
# 将修改后的文档保存到 BytesIO 流
|
# 将修改后的文档保存到 BytesIO 流
|
||||||
doc_output_stream = BytesIO()
|
doc_output_stream = BytesIO()
|
||||||
@@ -128,8 +155,9 @@ class DocxTranslator(Translator):
|
|||||||
同步翻译 .docx 文件。
|
同步翻译 .docx 文件。
|
||||||
"""
|
"""
|
||||||
doc, elements_to_translate, original_texts = self._pre_translate(document)
|
doc, elements_to_translate, original_texts = self._pre_translate(document)
|
||||||
if not elements_to_translate:
|
if not original_texts:
|
||||||
print("\n文件中没有找到需要翻译的文本内容。")
|
print("\n文件中没有找到需要翻译的文本内容。")
|
||||||
|
document.content = doc.save(BytesIO()).getvalue() # 返回原文件
|
||||||
return self
|
return self
|
||||||
|
|
||||||
# 调用翻译 agent
|
# 调用翻译 agent
|
||||||
@@ -144,8 +172,12 @@ class DocxTranslator(Translator):
|
|||||||
异步翻译 .docx 文件。
|
异步翻译 .docx 文件。
|
||||||
"""
|
"""
|
||||||
doc, elements_to_translate, original_texts = await asyncio.to_thread(self._pre_translate, document)
|
doc, elements_to_translate, original_texts = await asyncio.to_thread(self._pre_translate, document)
|
||||||
if not elements_to_translate:
|
if not original_texts:
|
||||||
print("\n文件中没有找到需要翻译的文本内容。")
|
print("\n文件中没有找到需要翻译的文本内容。")
|
||||||
|
# 在异步环境中正确保存和返回
|
||||||
|
output_stream = BytesIO()
|
||||||
|
doc.save(output_stream)
|
||||||
|
document.content = output_stream.getvalue()
|
||||||
return self
|
return self
|
||||||
|
|
||||||
# 异步调用翻译 agent
|
# 异步调用翻译 agent
|
||||||
|
|||||||
Reference in New Issue
Block a user