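fix: pass converter config objects in XlsxWorkflow (add ConverterCsv2XlsxConfig); remove unused imports, stop logging temp file paths, rewrap long lines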

2025-08-24 12:02:56 +08:00
parent 486ac3c90a
commit 5cd0156978
6 changed files with 51 additions and 33 deletions
--- a/docutranslate/app.py
+++ b/docutranslate/app.py
@@ -1,7 +1,6 @@
 import asyncio
 import base64
 import binascii
-import io
 import logging
 import os
 import shutil
@@ -12,13 +11,12 @@ import uuid
 from contextlib import asynccontextmanager, closing
 from pathlib import Path
 from typing import List, Dict, Any, Optional, Literal, Union, Annotated, TYPE_CHECKING, Type
-from urllib.parse import quote

 import httpx
 import uvicorn
 from fastapi import FastAPI, HTTPException, APIRouter, Body, Path as FastApiPath
 from fastapi.openapi.docs import get_swagger_ui_html, get_swagger_ui_oauth2_redirect_html, get_redoc_html
-from fastapi.responses import HTMLResponse, JSONResponse, StreamingResponse, FileResponse
+from fastapi.responses import HTMLResponse, JSONResponse, FileResponse
 from fastapi.staticfiles import StaticFiles
 from pydantic import BaseModel, Field, field_validator

@@ -247,7 +245,8 @@ class MarkdownWorkflowParams(BaseWorkflowParams):
    mineru_token: Optional[str] = Field(None, description="当 `convert_engine` 为 'mineru' 时必填的API令牌。")
    formula_ocr: bool = Field(True, description="是否对公式进行OCR识别。对 `mineru` 和 `docling` 均有效。")
    code_ocr: bool = Field(True, description="是否对代码块进行OCR识别。仅 `docling` 引擎有效。")
-    model_version: Literal["pipeline", "vlm"] = Field("vlm", description="Mineru模型的版本，'vlm'是更新的版本。仅 `mineru` 引擎有效。")
+    model_version: Literal["pipeline", "vlm"] = Field("vlm",
+                                                      description="Mineru模型的版本，'vlm'是更新的版本。仅 `mineru` 引擎有效。")

    @field_validator('mineru_token')
    def check_mineru_token(cls, v, values):
@@ -348,7 +347,8 @@ TranslatePayload = Annotated[
 # 4. 创建最终的请求体模型
 class TranslateServiceRequest(BaseModel):
    file_name: str = Field(..., description="上传的原始文件名，含扩展名。",
-                           examples=["my_paper.pdf", "chapter1.txt", "data.xlsx", "video.srt", "my_book.epub", "index.html"])
+                           examples=["my_paper.pdf", "chapter1.txt", "data.xlsx", "video.srt", "my_book.epub",
+                                     "index.html"])
    file_content: str = Field(..., description="Base64编码的文件内容。", examples=["JVBERi0xLjQK..."])
    payload: TranslatePayload = Field(..., description="包含工作流类型和相应参数的载荷。")

@@ -654,7 +654,7 @@ async def _perform_translation(
        await workflow.translate_async()

        # 4. 任务成功，生成所有可下载文件并存储
-        task_logger.info("翻译完成，正在生成结果文件...")
+        task_logger.info("翻译完成，正在生成临时结果文件...")
        temp_dir = tempfile.mkdtemp(prefix=f"docutranslate_{task_id}_")
        task_state["temp_dir"] = temp_dir
        downloadable_files = {}
@@ -689,7 +689,8 @@ async def _perform_translation(
                html_config = Srt2HTMLExporterConfig(cdn=is_cdn_available)
            elif isinstance(workflow, EpubWorkflow):
                html_config = Epub2HTMLExporterConfig(cdn=is_cdn_available)
-            export_map['html'] = (lambda: workflow.export_to_html(html_config), f"{filename_stem}_translated.html", True)
+            export_map['html'] = (lambda: workflow.export_to_html(html_config), f"{filename_stem}_translated.html",
+                                  True)
        if isinstance(workflow, MDFormatsExportable):
            export_map['markdown'] = (workflow.export_to_markdown, f"{filename_stem}_translated.md", True)
            export_map['markdown_zip'] = (workflow.export_to_markdown_zip, f"{filename_stem}_translated.zip", False)
@@ -711,14 +712,13 @@ async def _perform_translation(
        # 循环生成文件
        for file_type, (export_func, filename, is_string_output) in export_map.items():
            try:
-                task_logger.info(f"正在生成 {file_type} 文件: {filename}")
                content = await asyncio.to_thread(export_func)
                content_bytes = content.encode('utf-8') if is_string_output else content
                file_path = os.path.join(temp_dir, filename)
                with open(file_path, "wb") as f:
                    f.write(content_bytes)
                downloadable_files[file_type] = {"path": file_path, "filename": filename}
-                task_logger.info(f"成功生成 {file_type} 文件于: {file_path}")
+                task_logger.info(f"成功生成 {file_type} 文件")
            except Exception as export_error:
                task_logger.error(f"生成 {file_type} 文件时出错: {export_error}", exc_info=True)

@@ -760,7 +760,7 @@ async def _perform_translation(

        if task_state["error_flag"] and temp_dir and os.path.isdir(temp_dir):
            shutil.rmtree(temp_dir)
-            task_logger.info(f"因任务失败，已清理临时目录: {temp_dir}")
+            task_logger.info(f"因任务失败，已清理临时目录")
            task_state["temp_dir"] = None

        task_logger.info(f"后台翻译任务 '{original_filename}' 处理结束。")
@@ -1199,7 +1199,8 @@ async def service_content(

    file_info = task_state.get("downloadable_files", {}).get(file_type)
    if not file_info or not os.path.exists(file_info.get("path")):
-        raise HTTPException(status_code=404, detail=f"任务 '{task_id}' 不支持获取 '{file_type}' 类型的内容，或文件已丢失。")
+        raise HTTPException(status_code=404,
+                            detail=f"任务 '{task_id}' 不支持获取 '{file_type}' 类型的内容，或文件已丢失。")

    file_path = file_info["path"]
    filename = file_info["filename"]
@@ -1299,7 +1300,8 @@ async def temp_translate(
        decoded_content = file_content.encode('utf-8')
    try:
        workflow_config = MarkdownBasedWorkflowConfig(
-            convert_engine="mineru", converter_config=ConverterMineruConfig(mineru_token=mineru_token, model_version=model_version),
+            convert_engine="mineru",
+            converter_config=ConverterMineruConfig(mineru_token=mineru_token, model_version=model_version),
            translator_config=MDTranslatorConfig(base_url=base_url, api_key=api_key, model_id=model_id,
                                                 to_lang=to_lang, custom_prompt=custom_prompt, temperature=temperature,
                                                 thinking=thinking, chunk_size=chunk_size, concurrent=concurrent),
--- a/docutranslate/converter/converter_identity.py
+++ b/docutranslate/converter/converter_identity.py
@@ -1,4 +1,7 @@
-from docutranslate.converter.base import Converter
+from dataclasses import dataclass
+from typing import Hashable
+
+from docutranslate.converter.base import Converter, ConverterConfig
 from docutranslate.ir.document import Document


--- a/docutranslate/converter/x2xlsx/converter_csv2xlsx.py
+++ b/docutranslate/converter/x2xlsx/converter_csv2xlsx.py
@@ -1,17 +1,24 @@
 import asyncio
 import csv
-import logging
+from dataclasses import dataclass
 from io import BytesIO, StringIO
+from typing import Hashable

 # 引入 chardet 用于编码检测
 import chardet
 import openpyxl
-from docutranslate.converter.x2xlsx.base import X2XlsxConverter
+
+from docutranslate.converter.x2xlsx.base import X2XlsxConverter, X2XlsxConverterConfig
 from docutranslate.ir.document import Document


 # 配置一个基本的日志记录器（如果您的项目尚未配置）
 # logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+@dataclass(kw_only=True)
+class ConverterCsv2XlsxConfig(X2XlsxConverterConfig):
+
+    def gethash(self) -> Hashable:
+        return "1"


 class ConverterCsv2Xlsx(X2XlsxConverter):
@@ -25,6 +32,8 @@ class ConverterCsv2Xlsx(X2XlsxConverter):
    - 完善的错误处理和日志记录。
    """

+    def __init__(self, config: ConverterCsv2XlsxConfig):
+        super().__init__(config=config)

    def convert(self, document: Document) -> Document:
        """
--- a/docutranslate/translator/ai_translator/xlsx_translator.py
+++ b/docutranslate/translator/ai_translator/xlsx_translator.py
@@ -1,5 +1,5 @@
 import asyncio
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 from io import BytesIO
 from typing import Self, Literal, List, Optional

--- a/docutranslate/workflow/xlsx_workflow.py
+++ b/docutranslate/workflow/xlsx_workflow.py
@@ -1,11 +1,12 @@
 import asyncio
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Self, Type
+from typing import Self

+from docutranslate.converter.base import ConverterConfig
 from docutranslate.converter.converter_identity import ConverterIdentity
 from docutranslate.converter.x2xlsx.base import X2XlsxConverter
-from docutranslate.converter.x2xlsx.converter_csv2xlsx import ConverterCsv2Xlsx
+from docutranslate.converter.x2xlsx.converter_csv2xlsx import ConverterCsv2Xlsx, ConverterCsv2XlsxConfig
 from docutranslate.exporter.base import ExporterConfig
 from docutranslate.exporter.xlsx.xlsx2csv_exporter import Xlsx2CsvExporter
 from docutranslate.exporter.xlsx.xlsx2html_exporter import Xlsx2HTMLExporterConfig, Xlsx2HTMLExporter
@@ -24,11 +25,6 @@ class XlsxWorkflowConfig(WorkflowConfig):

 class XlsxWorkflow(Workflow[XlsxWorkflowConfig, Document, Document], HTMLExportable[Xlsx2HTMLExporterConfig],
                   XlsxExportable[ExporterConfig], CsvExportable[ExporterConfig]):
-    _converter_factory: dict[
-        str, Type[X2XlsxConverter | ConverterIdentity]] = {
-        ".csv": ConverterCsv2Xlsx,
-        ".xlsx": ConverterIdentity
-    }

    def __init__(self, config: XlsxWorkflowConfig):
        super().__init__(config=config)
@@ -36,17 +32,25 @@ class XlsxWorkflow(Workflow[XlsxWorkflowConfig, Document, Document], HTMLExporta
            for sub_config in [self.config.translator_config]:
                if sub_config:
                    sub_config.logger = config.logger
+        self._converter_factory: dict[
+            str, tuple[
+                type[X2XlsxConverter | ConverterIdentity], ConverterConfig|None]] = {
+            ".csv": (ConverterCsv2Xlsx, ConverterCsv2XlsxConfig(logger=self.logger)),
+            ".xlsx": (ConverterIdentity,None)
+        }

    def _get_document_xlsx(self, document: Document) -> Document:
        suffix = document.suffix
-        converter_type = self._converter_factory.get(suffix)
-        if converter_type is None:
+        converter_types = self._converter_factory.get(suffix)
+        if converter_types is None:
            raise ValueError(f"Xlsx工作流不支持{suffix}格式文件")
-        converter = converter_type()
+        converter_type, converter_config = converter_types
+        converter = converter_type(converter_config)
+
        return converter.convert(document)

-    def _pre_translate(self, document_pre_transalte: Document):
-        document = document_pre_transalte.copy()
+    def _pre_translate(self, document_pre_translate: Document):
+        document = document_pre_translate.copy()
        translate_config = self.config.translator_config
        translator = XlsxTranslator(translate_config)
        return document, translator