This commit is contained in:
xunbu
2025-08-24 12:02:56 +08:00
parent 486ac3c90a
commit 5cd0156978
6 changed files with 51 additions and 33 deletions

View File

@@ -1,7 +1,6 @@
import asyncio import asyncio
import base64 import base64
import binascii import binascii
import io
import logging import logging
import os import os
import shutil import shutil
@@ -12,13 +11,12 @@ import uuid
from contextlib import asynccontextmanager, closing from contextlib import asynccontextmanager, closing
from pathlib import Path from pathlib import Path
from typing import List, Dict, Any, Optional, Literal, Union, Annotated, TYPE_CHECKING, Type from typing import List, Dict, Any, Optional, Literal, Union, Annotated, TYPE_CHECKING, Type
from urllib.parse import quote
import httpx import httpx
import uvicorn import uvicorn
from fastapi import FastAPI, HTTPException, APIRouter, Body, Path as FastApiPath from fastapi import FastAPI, HTTPException, APIRouter, Body, Path as FastApiPath
from fastapi.openapi.docs import get_swagger_ui_html, get_swagger_ui_oauth2_redirect_html, get_redoc_html from fastapi.openapi.docs import get_swagger_ui_html, get_swagger_ui_oauth2_redirect_html, get_redoc_html
from fastapi.responses import HTMLResponse, JSONResponse, StreamingResponse, FileResponse from fastapi.responses import HTMLResponse, JSONResponse, FileResponse
from fastapi.staticfiles import StaticFiles from fastapi.staticfiles import StaticFiles
from pydantic import BaseModel, Field, field_validator from pydantic import BaseModel, Field, field_validator
@@ -247,7 +245,8 @@ class MarkdownWorkflowParams(BaseWorkflowParams):
mineru_token: Optional[str] = Field(None, description="当 `convert_engine` 为 'mineru' 时必填的API令牌。") mineru_token: Optional[str] = Field(None, description="当 `convert_engine` 为 'mineru' 时必填的API令牌。")
formula_ocr: bool = Field(True, description="是否对公式进行OCR识别。对 `mineru` 和 `docling` 均有效。") formula_ocr: bool = Field(True, description="是否对公式进行OCR识别。对 `mineru` 和 `docling` 均有效。")
code_ocr: bool = Field(True, description="是否对代码块进行OCR识别。仅 `docling` 引擎有效。") code_ocr: bool = Field(True, description="是否对代码块进行OCR识别。仅 `docling` 引擎有效。")
model_version: Literal["pipeline", "vlm"] = Field("vlm", description="Mineru模型的版本'vlm'是更新的版本。仅 `mineru` 引擎有效。") model_version: Literal["pipeline", "vlm"] = Field("vlm",
description="Mineru模型的版本'vlm'是更新的版本。仅 `mineru` 引擎有效。")
@field_validator('mineru_token') @field_validator('mineru_token')
def check_mineru_token(cls, v, values): def check_mineru_token(cls, v, values):
@@ -348,7 +347,8 @@ TranslatePayload = Annotated[
# 4. 创建最终的请求体模型 # 4. 创建最终的请求体模型
class TranslateServiceRequest(BaseModel): class TranslateServiceRequest(BaseModel):
file_name: str = Field(..., description="上传的原始文件名,含扩展名。", file_name: str = Field(..., description="上传的原始文件名,含扩展名。",
examples=["my_paper.pdf", "chapter1.txt", "data.xlsx", "video.srt", "my_book.epub", "index.html"]) examples=["my_paper.pdf", "chapter1.txt", "data.xlsx", "video.srt", "my_book.epub",
"index.html"])
file_content: str = Field(..., description="Base64编码的文件内容。", examples=["JVBERi0xLjQK..."]) file_content: str = Field(..., description="Base64编码的文件内容。", examples=["JVBERi0xLjQK..."])
payload: TranslatePayload = Field(..., description="包含工作流类型和相应参数的载荷。") payload: TranslatePayload = Field(..., description="包含工作流类型和相应参数的载荷。")
@@ -654,7 +654,7 @@ async def _perform_translation(
await workflow.translate_async() await workflow.translate_async()
# 4. 任务成功,生成所有可下载文件并存储 # 4. 任务成功,生成所有可下载文件并存储
task_logger.info("翻译完成,正在生成结果文件...") task_logger.info("翻译完成,正在生成临时结果文件...")
temp_dir = tempfile.mkdtemp(prefix=f"docutranslate_{task_id}_") temp_dir = tempfile.mkdtemp(prefix=f"docutranslate_{task_id}_")
task_state["temp_dir"] = temp_dir task_state["temp_dir"] = temp_dir
downloadable_files = {} downloadable_files = {}
@@ -689,7 +689,8 @@ async def _perform_translation(
html_config = Srt2HTMLExporterConfig(cdn=is_cdn_available) html_config = Srt2HTMLExporterConfig(cdn=is_cdn_available)
elif isinstance(workflow, EpubWorkflow): elif isinstance(workflow, EpubWorkflow):
html_config = Epub2HTMLExporterConfig(cdn=is_cdn_available) html_config = Epub2HTMLExporterConfig(cdn=is_cdn_available)
export_map['html'] = (lambda: workflow.export_to_html(html_config), f"{filename_stem}_translated.html", True) export_map['html'] = (lambda: workflow.export_to_html(html_config), f"{filename_stem}_translated.html",
True)
if isinstance(workflow, MDFormatsExportable): if isinstance(workflow, MDFormatsExportable):
export_map['markdown'] = (workflow.export_to_markdown, f"{filename_stem}_translated.md", True) export_map['markdown'] = (workflow.export_to_markdown, f"{filename_stem}_translated.md", True)
export_map['markdown_zip'] = (workflow.export_to_markdown_zip, f"{filename_stem}_translated.zip", False) export_map['markdown_zip'] = (workflow.export_to_markdown_zip, f"{filename_stem}_translated.zip", False)
@@ -711,14 +712,13 @@ async def _perform_translation(
# 循环生成文件 # 循环生成文件
for file_type, (export_func, filename, is_string_output) in export_map.items(): for file_type, (export_func, filename, is_string_output) in export_map.items():
try: try:
task_logger.info(f"正在生成 {file_type} 文件: {filename}")
content = await asyncio.to_thread(export_func) content = await asyncio.to_thread(export_func)
content_bytes = content.encode('utf-8') if is_string_output else content content_bytes = content.encode('utf-8') if is_string_output else content
file_path = os.path.join(temp_dir, filename) file_path = os.path.join(temp_dir, filename)
with open(file_path, "wb") as f: with open(file_path, "wb") as f:
f.write(content_bytes) f.write(content_bytes)
downloadable_files[file_type] = {"path": file_path, "filename": filename} downloadable_files[file_type] = {"path": file_path, "filename": filename}
task_logger.info(f"成功生成 {file_type} 文件于: {file_path}") task_logger.info(f"成功生成 {file_type} 文件")
except Exception as export_error: except Exception as export_error:
task_logger.error(f"生成 {file_type} 文件时出错: {export_error}", exc_info=True) task_logger.error(f"生成 {file_type} 文件时出错: {export_error}", exc_info=True)
@@ -760,7 +760,7 @@ async def _perform_translation(
if task_state["error_flag"] and temp_dir and os.path.isdir(temp_dir): if task_state["error_flag"] and temp_dir and os.path.isdir(temp_dir):
shutil.rmtree(temp_dir) shutil.rmtree(temp_dir)
task_logger.info(f"因任务失败,已清理临时目录: {temp_dir}") task_logger.info(f"因任务失败,已清理临时目录")
task_state["temp_dir"] = None task_state["temp_dir"] = None
task_logger.info(f"后台翻译任务 '{original_filename}' 处理结束。") task_logger.info(f"后台翻译任务 '{original_filename}' 处理结束。")
@@ -1199,7 +1199,8 @@ async def service_content(
file_info = task_state.get("downloadable_files", {}).get(file_type) file_info = task_state.get("downloadable_files", {}).get(file_type)
if not file_info or not os.path.exists(file_info.get("path")): if not file_info or not os.path.exists(file_info.get("path")):
raise HTTPException(status_code=404, detail=f"任务 '{task_id}' 不支持获取 '{file_type}' 类型的内容,或文件已丢失。") raise HTTPException(status_code=404,
detail=f"任务 '{task_id}' 不支持获取 '{file_type}' 类型的内容,或文件已丢失。")
file_path = file_info["path"] file_path = file_info["path"]
filename = file_info["filename"] filename = file_info["filename"]
@@ -1299,7 +1300,8 @@ async def temp_translate(
decoded_content = file_content.encode('utf-8') decoded_content = file_content.encode('utf-8')
try: try:
workflow_config = MarkdownBasedWorkflowConfig( workflow_config = MarkdownBasedWorkflowConfig(
convert_engine="mineru", converter_config=ConverterMineruConfig(mineru_token=mineru_token, model_version=model_version), convert_engine="mineru",
converter_config=ConverterMineruConfig(mineru_token=mineru_token, model_version=model_version),
translator_config=MDTranslatorConfig(base_url=base_url, api_key=api_key, model_id=model_id, translator_config=MDTranslatorConfig(base_url=base_url, api_key=api_key, model_id=model_id,
to_lang=to_lang, custom_prompt=custom_prompt, temperature=temperature, to_lang=to_lang, custom_prompt=custom_prompt, temperature=temperature,
thinking=thinking, chunk_size=chunk_size, concurrent=concurrent), thinking=thinking, chunk_size=chunk_size, concurrent=concurrent),

View File

@@ -1,4 +1,7 @@
from docutranslate.converter.base import Converter from dataclasses import dataclass
from typing import Hashable
from docutranslate.converter.base import Converter, ConverterConfig
from docutranslate.ir.document import Document from docutranslate.ir.document import Document

View File

@@ -1,17 +1,24 @@
import asyncio import asyncio
import csv import csv
import logging from dataclasses import dataclass
from io import BytesIO, StringIO from io import BytesIO, StringIO
from typing import Hashable
# 引入 chardet 用于编码检测 # 引入 chardet 用于编码检测
import chardet import chardet
import openpyxl import openpyxl
from docutranslate.converter.x2xlsx.base import X2XlsxConverter
from docutranslate.converter.x2xlsx.base import X2XlsxConverter, X2XlsxConverterConfig
from docutranslate.ir.document import Document from docutranslate.ir.document import Document
# 配置一个基本的日志记录器(如果您的项目尚未配置) # 配置一个基本的日志记录器(如果您的项目尚未配置)
# logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') # logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@dataclass(kw_only=True)
class ConverterCsv2XlsxConfig(X2XlsxConverterConfig):
def gethash(self) -> Hashable:
return "1"
class ConverterCsv2Xlsx(X2XlsxConverter): class ConverterCsv2Xlsx(X2XlsxConverter):
@@ -25,6 +32,8 @@ class ConverterCsv2Xlsx(X2XlsxConverter):
- 完善的错误处理和日志记录。 - 完善的错误处理和日志记录。
""" """
def __init__(self, config: ConverterCsv2XlsxConfig):
super().__init__(config=config)
def convert(self, document: Document) -> Document: def convert(self, document: Document) -> Document:
""" """

View File

@@ -1,5 +1,5 @@
import asyncio import asyncio
from dataclasses import dataclass, field from dataclasses import dataclass
from io import BytesIO from io import BytesIO
from typing import Self, Literal, List, Optional from typing import Self, Literal, List, Optional

View File

@@ -1,11 +1,12 @@
import asyncio import asyncio
from dataclasses import dataclass from dataclasses import dataclass
from pathlib import Path from pathlib import Path
from typing import Self, Type from typing import Self
from docutranslate.converter.base import ConverterConfig
from docutranslate.converter.converter_identity import ConverterIdentity from docutranslate.converter.converter_identity import ConverterIdentity
from docutranslate.converter.x2xlsx.base import X2XlsxConverter from docutranslate.converter.x2xlsx.base import X2XlsxConverter
from docutranslate.converter.x2xlsx.converter_csv2xlsx import ConverterCsv2Xlsx from docutranslate.converter.x2xlsx.converter_csv2xlsx import ConverterCsv2Xlsx, ConverterCsv2XlsxConfig
from docutranslate.exporter.base import ExporterConfig from docutranslate.exporter.base import ExporterConfig
from docutranslate.exporter.xlsx.xlsx2csv_exporter import Xlsx2CsvExporter from docutranslate.exporter.xlsx.xlsx2csv_exporter import Xlsx2CsvExporter
from docutranslate.exporter.xlsx.xlsx2html_exporter import Xlsx2HTMLExporterConfig, Xlsx2HTMLExporter from docutranslate.exporter.xlsx.xlsx2html_exporter import Xlsx2HTMLExporterConfig, Xlsx2HTMLExporter
@@ -23,12 +24,7 @@ class XlsxWorkflowConfig(WorkflowConfig):
class XlsxWorkflow(Workflow[XlsxWorkflowConfig, Document, Document], HTMLExportable[Xlsx2HTMLExporterConfig], class XlsxWorkflow(Workflow[XlsxWorkflowConfig, Document, Document], HTMLExportable[Xlsx2HTMLExporterConfig],
XlsxExportable[ExporterConfig],CsvExportable[ExporterConfig]): XlsxExportable[ExporterConfig], CsvExportable[ExporterConfig]):
_converter_factory: dict[
str, Type[X2XlsxConverter | ConverterIdentity]] = {
".csv": ConverterCsv2Xlsx,
".xlsx": ConverterIdentity
}
def __init__(self, config: XlsxWorkflowConfig): def __init__(self, config: XlsxWorkflowConfig):
super().__init__(config=config) super().__init__(config=config)
@@ -36,17 +32,25 @@ class XlsxWorkflow(Workflow[XlsxWorkflowConfig, Document, Document], HTMLExporta
for sub_config in [self.config.translator_config]: for sub_config in [self.config.translator_config]:
if sub_config: if sub_config:
sub_config.logger = config.logger sub_config.logger = config.logger
self._converter_factory: dict[
str, tuple[
type[X2XlsxConverter | ConverterIdentity], ConverterConfig|None]] = {
".csv": (ConverterCsv2Xlsx, ConverterCsv2XlsxConfig(logger=self.logger)),
".xlsx": (ConverterIdentity,None)
}
def _get_document_xlsx(self, document: Document) -> Document: def _get_document_xlsx(self, document: Document) -> Document:
suffix = document.suffix suffix = document.suffix
converter_type = self._converter_factory.get(suffix) converter_types = self._converter_factory.get(suffix)
if converter_type is None: if converter_types is None:
raise ValueError(f"Xlsx工作流不支持{suffix}格式文件") raise ValueError(f"Xlsx工作流不支持{suffix}格式文件")
converter = converter_type() converter_type, converter_config = converter_types
converter = converter_type(converter_config)
return converter.convert(document) return converter.convert(document)
def _pre_translate(self, document_pre_transalte: Document): def _pre_translate(self, document_pre_translate: Document):
document = document_pre_transalte.copy() document = document_pre_translate.copy()
translate_config = self.config.translator_config translate_config = self.config.translator_config
translator = XlsxTranslator(translate_config) translator = XlsxTranslator(translate_config)
return document, translator return document, translator