diff --git a/docutranslate/app.py b/docutranslate/app.py index 4fa7275..a238dc5 100644 --- a/docutranslate/app.py +++ b/docutranslate/app.py @@ -1,7 +1,6 @@ import asyncio import base64 import binascii -import io import logging import os import shutil @@ -12,13 +11,12 @@ import uuid from contextlib import asynccontextmanager, closing from pathlib import Path from typing import List, Dict, Any, Optional, Literal, Union, Annotated, TYPE_CHECKING, Type -from urllib.parse import quote import httpx import uvicorn from fastapi import FastAPI, HTTPException, APIRouter, Body, Path as FastApiPath from fastapi.openapi.docs import get_swagger_ui_html, get_swagger_ui_oauth2_redirect_html, get_redoc_html -from fastapi.responses import HTMLResponse, JSONResponse, StreamingResponse, FileResponse +from fastapi.responses import HTMLResponse, JSONResponse, FileResponse from fastapi.staticfiles import StaticFiles from pydantic import BaseModel, Field, field_validator @@ -247,7 +245,8 @@ class MarkdownWorkflowParams(BaseWorkflowParams): mineru_token: Optional[str] = Field(None, description="当 `convert_engine` 为 'mineru' 时必填的API令牌。") formula_ocr: bool = Field(True, description="是否对公式进行OCR识别。对 `mineru` 和 `docling` 均有效。") code_ocr: bool = Field(True, description="是否对代码块进行OCR识别。仅 `docling` 引擎有效。") - model_version: Literal["pipeline", "vlm"] = Field("vlm", description="Mineru模型的版本,'vlm'是更新的版本。仅 `mineru` 引擎有效。") + model_version: Literal["pipeline", "vlm"] = Field("vlm", + description="Mineru模型的版本,'vlm'是更新的版本。仅 `mineru` 引擎有效。") @field_validator('mineru_token') def check_mineru_token(cls, v, values): @@ -348,7 +347,8 @@ TranslatePayload = Annotated[ # 4. 创建最终的请求体模型 class TranslateServiceRequest(BaseModel): file_name: str = Field(..., description="上传的原始文件名,含扩展名。", - examples=["my_paper.pdf", "chapter1.txt", "data.xlsx", "video.srt", "my_book.epub", "index.html"]) + examples=["my_paper.pdf", "chapter1.txt", "data.xlsx", "video.srt", "my_book.epub", + "index.html"]) file_content: str = Field(..., description="Base64编码的文件内容。", examples=["JVBERi0xLjQK..."]) payload: TranslatePayload = Field(..., description="包含工作流类型和相应参数的载荷。") @@ -654,7 +654,7 @@ async def _perform_translation( await workflow.translate_async() # 4. 任务成功,生成所有可下载文件并存储 - task_logger.info("翻译完成,正在生成结果文件...") + task_logger.info("翻译完成,正在生成临时结果文件...") temp_dir = tempfile.mkdtemp(prefix=f"docutranslate_{task_id}_") task_state["temp_dir"] = temp_dir downloadable_files = {} @@ -689,7 +689,8 @@ async def _perform_translation( html_config = Srt2HTMLExporterConfig(cdn=is_cdn_available) elif isinstance(workflow, EpubWorkflow): html_config = Epub2HTMLExporterConfig(cdn=is_cdn_available) - export_map['html'] = (lambda: workflow.export_to_html(html_config), f"{filename_stem}_translated.html", True) + export_map['html'] = (lambda: workflow.export_to_html(html_config), f"{filename_stem}_translated.html", + True) if isinstance(workflow, MDFormatsExportable): export_map['markdown'] = (workflow.export_to_markdown, f"{filename_stem}_translated.md", True) export_map['markdown_zip'] = (workflow.export_to_markdown_zip, f"{filename_stem}_translated.zip", False) @@ -711,14 +712,13 @@ async def _perform_translation( # 循环生成文件 for file_type, (export_func, filename, is_string_output) in export_map.items(): try: - task_logger.info(f"正在生成 {file_type} 文件: {filename}") content = await asyncio.to_thread(export_func) content_bytes = content.encode('utf-8') if is_string_output else content file_path = os.path.join(temp_dir, filename) with open(file_path, "wb") as f: f.write(content_bytes) downloadable_files[file_type] = {"path": file_path, "filename": filename} - task_logger.info(f"成功生成 {file_type} 文件于: {file_path}") + task_logger.info(f"成功生成 {file_type} 文件") except Exception as export_error: task_logger.error(f"生成 {file_type} 文件时出错: {export_error}", exc_info=True) @@ -760,7 +760,7 @@ async def _perform_translation( if task_state["error_flag"] and temp_dir and os.path.isdir(temp_dir): shutil.rmtree(temp_dir) - task_logger.info(f"因任务失败,已清理临时目录: {temp_dir}") + task_logger.info(f"因任务失败,已清理临时目录") task_state["temp_dir"] = None task_logger.info(f"后台翻译任务 '{original_filename}' 处理结束。") @@ -1199,7 +1199,8 @@ async def service_content( file_info = task_state.get("downloadable_files", {}).get(file_type) if not file_info or not os.path.exists(file_info.get("path")): - raise HTTPException(status_code=404, detail=f"任务 '{task_id}' 不支持获取 '{file_type}' 类型的内容,或文件已丢失。") + raise HTTPException(status_code=404, + detail=f"任务 '{task_id}' 不支持获取 '{file_type}' 类型的内容,或文件已丢失。") file_path = file_info["path"] filename = file_info["filename"] @@ -1299,7 +1300,8 @@ async def temp_translate( decoded_content = file_content.encode('utf-8') try: workflow_config = MarkdownBasedWorkflowConfig( - convert_engine="mineru", converter_config=ConverterMineruConfig(mineru_token=mineru_token, model_version=model_version), + convert_engine="mineru", + converter_config=ConverterMineruConfig(mineru_token=mineru_token, model_version=model_version), translator_config=MDTranslatorConfig(base_url=base_url, api_key=api_key, model_id=model_id, to_lang=to_lang, custom_prompt=custom_prompt, temperature=temperature, thinking=thinking, chunk_size=chunk_size, concurrent=concurrent), @@ -1339,4 +1341,4 @@ def run_app(port: int | None = None): if __name__ == "__main__": - run_app() \ No newline at end of file + run_app() diff --git a/docutranslate/converter/base.py b/docutranslate/converter/base.py index 80e1117..5256649 100644 --- a/docutranslate/converter/base.py +++ b/docutranslate/converter/base.py @@ -29,4 +29,4 @@ class Converter(ABC): ... async def convert_async(self, document: Document) -> Document: - ... + ... \ No newline at end of file diff --git a/docutranslate/converter/converter_identity.py b/docutranslate/converter/converter_identity.py index 70d3ffb..1fb614f 100644 --- a/docutranslate/converter/converter_identity.py +++ b/docutranslate/converter/converter_identity.py @@ -1,4 +1,7 @@ -from docutranslate.converter.base import Converter +from dataclasses import dataclass +from typing import Hashable + +from docutranslate.converter.base import Converter, ConverterConfig from docutranslate.ir.document import Document diff --git a/docutranslate/converter/x2xlsx/converter_csv2xlsx.py b/docutranslate/converter/x2xlsx/converter_csv2xlsx.py index 7329044..43926fb 100644 --- a/docutranslate/converter/x2xlsx/converter_csv2xlsx.py +++ b/docutranslate/converter/x2xlsx/converter_csv2xlsx.py @@ -1,17 +1,24 @@ import asyncio import csv -import logging +from dataclasses import dataclass from io import BytesIO, StringIO +from typing import Hashable # 引入 chardet 用于编码检测 import chardet import openpyxl -from docutranslate.converter.x2xlsx.base import X2XlsxConverter + +from docutranslate.converter.x2xlsx.base import X2XlsxConverter, X2XlsxConverterConfig from docutranslate.ir.document import Document # 配置一个基本的日志记录器(如果您的项目尚未配置) # logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +@dataclass(kw_only=True) +class ConverterCsv2XlsxConfig(X2XlsxConverterConfig): + + def gethash(self) -> Hashable: + return "1" class ConverterCsv2Xlsx(X2XlsxConverter): @@ -25,6 +32,8 @@ class ConverterCsv2Xlsx(X2XlsxConverter): - 完善的错误处理和日志记录。 """ + def __init__(self, config: ConverterCsv2XlsxConfig): + super().__init__(config=config) def convert(self, document: Document) -> Document: """ @@ -107,4 +116,4 @@ class ConverterCsv2Xlsx(X2XlsxConverter): """ 声明此转换器支持的源文件格式。 """ - return [".csv"] \ No newline at end of file + return [".csv"] diff --git a/docutranslate/translator/ai_translator/xlsx_translator.py b/docutranslate/translator/ai_translator/xlsx_translator.py index 5a4f161..e7dfb07 100644 --- a/docutranslate/translator/ai_translator/xlsx_translator.py +++ b/docutranslate/translator/ai_translator/xlsx_translator.py @@ -1,5 +1,5 @@ import asyncio -from dataclasses import dataclass, field +from dataclasses import dataclass from io import BytesIO from typing import Self, Literal, List, Optional @@ -173,4 +173,4 @@ class XlsxTranslator(Translator): document.content = await asyncio.to_thread(self._after_translate, workbook, cells_to_translate, translated_texts, original_texts) - return self \ No newline at end of file + return self diff --git a/docutranslate/workflow/xlsx_workflow.py b/docutranslate/workflow/xlsx_workflow.py index fb10240..e13b441 100644 --- a/docutranslate/workflow/xlsx_workflow.py +++ b/docutranslate/workflow/xlsx_workflow.py @@ -1,11 +1,12 @@ import asyncio from dataclasses import dataclass from pathlib import Path -from typing import Self, Type +from typing import Self +from docutranslate.converter.base import ConverterConfig from docutranslate.converter.converter_identity import ConverterIdentity from docutranslate.converter.x2xlsx.base import X2XlsxConverter -from docutranslate.converter.x2xlsx.converter_csv2xlsx import ConverterCsv2Xlsx +from docutranslate.converter.x2xlsx.converter_csv2xlsx import ConverterCsv2Xlsx, ConverterCsv2XlsxConfig from docutranslate.exporter.base import ExporterConfig from docutranslate.exporter.xlsx.xlsx2csv_exporter import Xlsx2CsvExporter from docutranslate.exporter.xlsx.xlsx2html_exporter import Xlsx2HTMLExporterConfig, Xlsx2HTMLExporter @@ -23,12 +24,7 @@ class XlsxWorkflowConfig(WorkflowConfig): class XlsxWorkflow(Workflow[XlsxWorkflowConfig, Document, Document], HTMLExportable[Xlsx2HTMLExporterConfig], - XlsxExportable[ExporterConfig],CsvExportable[ExporterConfig]): - _converter_factory: dict[ - str, Type[X2XlsxConverter | ConverterIdentity]] = { - ".csv": ConverterCsv2Xlsx, - ".xlsx": ConverterIdentity - } + XlsxExportable[ExporterConfig], CsvExportable[ExporterConfig]): def __init__(self, config: XlsxWorkflowConfig): super().__init__(config=config) @@ -36,17 +32,25 @@ class XlsxWorkflow(Workflow[XlsxWorkflowConfig, Document, Document], HTMLExporta for sub_config in [self.config.translator_config]: if sub_config: sub_config.logger = config.logger + self._converter_factory: dict[ + str, tuple[ + type[X2XlsxConverter | ConverterIdentity], ConverterConfig|None]] = { + ".csv": (ConverterCsv2Xlsx, ConverterCsv2XlsxConfig(logger=self.logger)), + ".xlsx": (ConverterIdentity,None) + } def _get_document_xlsx(self, document: Document) -> Document: suffix = document.suffix - converter_type = self._converter_factory.get(suffix) - if converter_type is None: + converter_types = self._converter_factory.get(suffix) + if converter_types is None: raise ValueError(f"Xlsx工作流不支持{suffix}格式文件") - converter = converter_type() + converter_type, converter_config = converter_types + converter = converter_type(converter_config) + return converter.convert(document) - def _pre_translate(self, document_pre_transalte: Document): - document = document_pre_transalte.copy() + def _pre_translate(self, document_pre_translate: Document): + document = document_pre_translate.copy() translate_config = self.config.translator_config translator = XlsxTranslator(translate_config) return document, translator