fix
This commit is contained in:
@@ -1,7 +1,6 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
import base64
|
import base64
|
||||||
import binascii
|
import binascii
|
||||||
import io
|
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import shutil
|
import shutil
|
||||||
@@ -12,13 +11,12 @@ import uuid
|
|||||||
from contextlib import asynccontextmanager, closing
|
from contextlib import asynccontextmanager, closing
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List, Dict, Any, Optional, Literal, Union, Annotated, TYPE_CHECKING, Type
|
from typing import List, Dict, Any, Optional, Literal, Union, Annotated, TYPE_CHECKING, Type
|
||||||
from urllib.parse import quote
|
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
import uvicorn
|
import uvicorn
|
||||||
from fastapi import FastAPI, HTTPException, APIRouter, Body, Path as FastApiPath
|
from fastapi import FastAPI, HTTPException, APIRouter, Body, Path as FastApiPath
|
||||||
from fastapi.openapi.docs import get_swagger_ui_html, get_swagger_ui_oauth2_redirect_html, get_redoc_html
|
from fastapi.openapi.docs import get_swagger_ui_html, get_swagger_ui_oauth2_redirect_html, get_redoc_html
|
||||||
from fastapi.responses import HTMLResponse, JSONResponse, StreamingResponse, FileResponse
|
from fastapi.responses import HTMLResponse, JSONResponse, FileResponse
|
||||||
from fastapi.staticfiles import StaticFiles
|
from fastapi.staticfiles import StaticFiles
|
||||||
from pydantic import BaseModel, Field, field_validator
|
from pydantic import BaseModel, Field, field_validator
|
||||||
|
|
||||||
@@ -247,7 +245,8 @@ class MarkdownWorkflowParams(BaseWorkflowParams):
|
|||||||
mineru_token: Optional[str] = Field(None, description="当 `convert_engine` 为 'mineru' 时必填的API令牌。")
|
mineru_token: Optional[str] = Field(None, description="当 `convert_engine` 为 'mineru' 时必填的API令牌。")
|
||||||
formula_ocr: bool = Field(True, description="是否对公式进行OCR识别。对 `mineru` 和 `docling` 均有效。")
|
formula_ocr: bool = Field(True, description="是否对公式进行OCR识别。对 `mineru` 和 `docling` 均有效。")
|
||||||
code_ocr: bool = Field(True, description="是否对代码块进行OCR识别。仅 `docling` 引擎有效。")
|
code_ocr: bool = Field(True, description="是否对代码块进行OCR识别。仅 `docling` 引擎有效。")
|
||||||
model_version: Literal["pipeline", "vlm"] = Field("vlm", description="Mineru模型的版本,'vlm'是更新的版本。仅 `mineru` 引擎有效。")
|
model_version: Literal["pipeline", "vlm"] = Field("vlm",
|
||||||
|
description="Mineru模型的版本,'vlm'是更新的版本。仅 `mineru` 引擎有效。")
|
||||||
|
|
||||||
@field_validator('mineru_token')
|
@field_validator('mineru_token')
|
||||||
def check_mineru_token(cls, v, values):
|
def check_mineru_token(cls, v, values):
|
||||||
@@ -348,7 +347,8 @@ TranslatePayload = Annotated[
|
|||||||
# 4. 创建最终的请求体模型
|
# 4. 创建最终的请求体模型
|
||||||
class TranslateServiceRequest(BaseModel):
|
class TranslateServiceRequest(BaseModel):
|
||||||
file_name: str = Field(..., description="上传的原始文件名,含扩展名。",
|
file_name: str = Field(..., description="上传的原始文件名,含扩展名。",
|
||||||
examples=["my_paper.pdf", "chapter1.txt", "data.xlsx", "video.srt", "my_book.epub", "index.html"])
|
examples=["my_paper.pdf", "chapter1.txt", "data.xlsx", "video.srt", "my_book.epub",
|
||||||
|
"index.html"])
|
||||||
file_content: str = Field(..., description="Base64编码的文件内容。", examples=["JVBERi0xLjQK..."])
|
file_content: str = Field(..., description="Base64编码的文件内容。", examples=["JVBERi0xLjQK..."])
|
||||||
payload: TranslatePayload = Field(..., description="包含工作流类型和相应参数的载荷。")
|
payload: TranslatePayload = Field(..., description="包含工作流类型和相应参数的载荷。")
|
||||||
|
|
||||||
@@ -654,7 +654,7 @@ async def _perform_translation(
|
|||||||
await workflow.translate_async()
|
await workflow.translate_async()
|
||||||
|
|
||||||
# 4. 任务成功,生成所有可下载文件并存储
|
# 4. 任务成功,生成所有可下载文件并存储
|
||||||
task_logger.info("翻译完成,正在生成结果文件...")
|
task_logger.info("翻译完成,正在生成临时结果文件...")
|
||||||
temp_dir = tempfile.mkdtemp(prefix=f"docutranslate_{task_id}_")
|
temp_dir = tempfile.mkdtemp(prefix=f"docutranslate_{task_id}_")
|
||||||
task_state["temp_dir"] = temp_dir
|
task_state["temp_dir"] = temp_dir
|
||||||
downloadable_files = {}
|
downloadable_files = {}
|
||||||
@@ -689,7 +689,8 @@ async def _perform_translation(
|
|||||||
html_config = Srt2HTMLExporterConfig(cdn=is_cdn_available)
|
html_config = Srt2HTMLExporterConfig(cdn=is_cdn_available)
|
||||||
elif isinstance(workflow, EpubWorkflow):
|
elif isinstance(workflow, EpubWorkflow):
|
||||||
html_config = Epub2HTMLExporterConfig(cdn=is_cdn_available)
|
html_config = Epub2HTMLExporterConfig(cdn=is_cdn_available)
|
||||||
export_map['html'] = (lambda: workflow.export_to_html(html_config), f"{filename_stem}_translated.html", True)
|
export_map['html'] = (lambda: workflow.export_to_html(html_config), f"{filename_stem}_translated.html",
|
||||||
|
True)
|
||||||
if isinstance(workflow, MDFormatsExportable):
|
if isinstance(workflow, MDFormatsExportable):
|
||||||
export_map['markdown'] = (workflow.export_to_markdown, f"{filename_stem}_translated.md", True)
|
export_map['markdown'] = (workflow.export_to_markdown, f"{filename_stem}_translated.md", True)
|
||||||
export_map['markdown_zip'] = (workflow.export_to_markdown_zip, f"{filename_stem}_translated.zip", False)
|
export_map['markdown_zip'] = (workflow.export_to_markdown_zip, f"{filename_stem}_translated.zip", False)
|
||||||
@@ -711,14 +712,13 @@ async def _perform_translation(
|
|||||||
# 循环生成文件
|
# 循环生成文件
|
||||||
for file_type, (export_func, filename, is_string_output) in export_map.items():
|
for file_type, (export_func, filename, is_string_output) in export_map.items():
|
||||||
try:
|
try:
|
||||||
task_logger.info(f"正在生成 {file_type} 文件: {filename}")
|
|
||||||
content = await asyncio.to_thread(export_func)
|
content = await asyncio.to_thread(export_func)
|
||||||
content_bytes = content.encode('utf-8') if is_string_output else content
|
content_bytes = content.encode('utf-8') if is_string_output else content
|
||||||
file_path = os.path.join(temp_dir, filename)
|
file_path = os.path.join(temp_dir, filename)
|
||||||
with open(file_path, "wb") as f:
|
with open(file_path, "wb") as f:
|
||||||
f.write(content_bytes)
|
f.write(content_bytes)
|
||||||
downloadable_files[file_type] = {"path": file_path, "filename": filename}
|
downloadable_files[file_type] = {"path": file_path, "filename": filename}
|
||||||
task_logger.info(f"成功生成 {file_type} 文件于: {file_path}")
|
task_logger.info(f"成功生成 {file_type} 文件")
|
||||||
except Exception as export_error:
|
except Exception as export_error:
|
||||||
task_logger.error(f"生成 {file_type} 文件时出错: {export_error}", exc_info=True)
|
task_logger.error(f"生成 {file_type} 文件时出错: {export_error}", exc_info=True)
|
||||||
|
|
||||||
@@ -760,7 +760,7 @@ async def _perform_translation(
|
|||||||
|
|
||||||
if task_state["error_flag"] and temp_dir and os.path.isdir(temp_dir):
|
if task_state["error_flag"] and temp_dir and os.path.isdir(temp_dir):
|
||||||
shutil.rmtree(temp_dir)
|
shutil.rmtree(temp_dir)
|
||||||
task_logger.info(f"因任务失败,已清理临时目录: {temp_dir}")
|
task_logger.info(f"因任务失败,已清理临时目录")
|
||||||
task_state["temp_dir"] = None
|
task_state["temp_dir"] = None
|
||||||
|
|
||||||
task_logger.info(f"后台翻译任务 '{original_filename}' 处理结束。")
|
task_logger.info(f"后台翻译任务 '{original_filename}' 处理结束。")
|
||||||
@@ -1199,7 +1199,8 @@ async def service_content(
|
|||||||
|
|
||||||
file_info = task_state.get("downloadable_files", {}).get(file_type)
|
file_info = task_state.get("downloadable_files", {}).get(file_type)
|
||||||
if not file_info or not os.path.exists(file_info.get("path")):
|
if not file_info or not os.path.exists(file_info.get("path")):
|
||||||
raise HTTPException(status_code=404, detail=f"任务 '{task_id}' 不支持获取 '{file_type}' 类型的内容,或文件已丢失。")
|
raise HTTPException(status_code=404,
|
||||||
|
detail=f"任务 '{task_id}' 不支持获取 '{file_type}' 类型的内容,或文件已丢失。")
|
||||||
|
|
||||||
file_path = file_info["path"]
|
file_path = file_info["path"]
|
||||||
filename = file_info["filename"]
|
filename = file_info["filename"]
|
||||||
@@ -1299,7 +1300,8 @@ async def temp_translate(
|
|||||||
decoded_content = file_content.encode('utf-8')
|
decoded_content = file_content.encode('utf-8')
|
||||||
try:
|
try:
|
||||||
workflow_config = MarkdownBasedWorkflowConfig(
|
workflow_config = MarkdownBasedWorkflowConfig(
|
||||||
convert_engine="mineru", converter_config=ConverterMineruConfig(mineru_token=mineru_token, model_version=model_version),
|
convert_engine="mineru",
|
||||||
|
converter_config=ConverterMineruConfig(mineru_token=mineru_token, model_version=model_version),
|
||||||
translator_config=MDTranslatorConfig(base_url=base_url, api_key=api_key, model_id=model_id,
|
translator_config=MDTranslatorConfig(base_url=base_url, api_key=api_key, model_id=model_id,
|
||||||
to_lang=to_lang, custom_prompt=custom_prompt, temperature=temperature,
|
to_lang=to_lang, custom_prompt=custom_prompt, temperature=temperature,
|
||||||
thinking=thinking, chunk_size=chunk_size, concurrent=concurrent),
|
thinking=thinking, chunk_size=chunk_size, concurrent=concurrent),
|
||||||
@@ -1339,4 +1341,4 @@ def run_app(port: int | None = None):
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
run_app()
|
run_app()
|
||||||
|
|||||||
@@ -29,4 +29,4 @@ class Converter(ABC):
|
|||||||
...
|
...
|
||||||
|
|
||||||
async def convert_async(self, document: Document) -> Document:
|
async def convert_async(self, document: Document) -> Document:
|
||||||
...
|
...
|
||||||
@@ -1,4 +1,7 @@
|
|||||||
from docutranslate.converter.base import Converter
|
from dataclasses import dataclass
|
||||||
|
from typing import Hashable
|
||||||
|
|
||||||
|
from docutranslate.converter.base import Converter, ConverterConfig
|
||||||
from docutranslate.ir.document import Document
|
from docutranslate.ir.document import Document
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,17 +1,24 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
import csv
|
import csv
|
||||||
import logging
|
from dataclasses import dataclass
|
||||||
from io import BytesIO, StringIO
|
from io import BytesIO, StringIO
|
||||||
|
from typing import Hashable
|
||||||
|
|
||||||
# 引入 chardet 用于编码检测
|
# 引入 chardet 用于编码检测
|
||||||
import chardet
|
import chardet
|
||||||
import openpyxl
|
import openpyxl
|
||||||
from docutranslate.converter.x2xlsx.base import X2XlsxConverter
|
|
||||||
|
from docutranslate.converter.x2xlsx.base import X2XlsxConverter, X2XlsxConverterConfig
|
||||||
from docutranslate.ir.document import Document
|
from docutranslate.ir.document import Document
|
||||||
|
|
||||||
|
|
||||||
# 配置一个基本的日志记录器(如果您的项目尚未配置)
|
# 配置一个基本的日志记录器(如果您的项目尚未配置)
|
||||||
# logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
# logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
||||||
|
@dataclass(kw_only=True)
|
||||||
|
class ConverterCsv2XlsxConfig(X2XlsxConverterConfig):
|
||||||
|
|
||||||
|
def gethash(self) -> Hashable:
|
||||||
|
return "1"
|
||||||
|
|
||||||
|
|
||||||
class ConverterCsv2Xlsx(X2XlsxConverter):
|
class ConverterCsv2Xlsx(X2XlsxConverter):
|
||||||
@@ -25,6 +32,8 @@ class ConverterCsv2Xlsx(X2XlsxConverter):
|
|||||||
- 完善的错误处理和日志记录。
|
- 完善的错误处理和日志记录。
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
def __init__(self, config: ConverterCsv2XlsxConfig):
|
||||||
|
super().__init__(config=config)
|
||||||
|
|
||||||
def convert(self, document: Document) -> Document:
|
def convert(self, document: Document) -> Document:
|
||||||
"""
|
"""
|
||||||
@@ -107,4 +116,4 @@ class ConverterCsv2Xlsx(X2XlsxConverter):
|
|||||||
"""
|
"""
|
||||||
声明此转换器支持的源文件格式。
|
声明此转换器支持的源文件格式。
|
||||||
"""
|
"""
|
||||||
return [".csv"]
|
return [".csv"]
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from typing import Self, Literal, List, Optional
|
from typing import Self, Literal, List, Optional
|
||||||
|
|
||||||
@@ -173,4 +173,4 @@ class XlsxTranslator(Translator):
|
|||||||
|
|
||||||
document.content = await asyncio.to_thread(self._after_translate, workbook, cells_to_translate,
|
document.content = await asyncio.to_thread(self._after_translate, workbook, cells_to_translate,
|
||||||
translated_texts, original_texts)
|
translated_texts, original_texts)
|
||||||
return self
|
return self
|
||||||
|
|||||||
@@ -1,11 +1,12 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Self, Type
|
from typing import Self
|
||||||
|
|
||||||
|
from docutranslate.converter.base import ConverterConfig
|
||||||
from docutranslate.converter.converter_identity import ConverterIdentity
|
from docutranslate.converter.converter_identity import ConverterIdentity
|
||||||
from docutranslate.converter.x2xlsx.base import X2XlsxConverter
|
from docutranslate.converter.x2xlsx.base import X2XlsxConverter
|
||||||
from docutranslate.converter.x2xlsx.converter_csv2xlsx import ConverterCsv2Xlsx
|
from docutranslate.converter.x2xlsx.converter_csv2xlsx import ConverterCsv2Xlsx, ConverterCsv2XlsxConfig
|
||||||
from docutranslate.exporter.base import ExporterConfig
|
from docutranslate.exporter.base import ExporterConfig
|
||||||
from docutranslate.exporter.xlsx.xlsx2csv_exporter import Xlsx2CsvExporter
|
from docutranslate.exporter.xlsx.xlsx2csv_exporter import Xlsx2CsvExporter
|
||||||
from docutranslate.exporter.xlsx.xlsx2html_exporter import Xlsx2HTMLExporterConfig, Xlsx2HTMLExporter
|
from docutranslate.exporter.xlsx.xlsx2html_exporter import Xlsx2HTMLExporterConfig, Xlsx2HTMLExporter
|
||||||
@@ -23,12 +24,7 @@ class XlsxWorkflowConfig(WorkflowConfig):
|
|||||||
|
|
||||||
|
|
||||||
class XlsxWorkflow(Workflow[XlsxWorkflowConfig, Document, Document], HTMLExportable[Xlsx2HTMLExporterConfig],
|
class XlsxWorkflow(Workflow[XlsxWorkflowConfig, Document, Document], HTMLExportable[Xlsx2HTMLExporterConfig],
|
||||||
XlsxExportable[ExporterConfig],CsvExportable[ExporterConfig]):
|
XlsxExportable[ExporterConfig], CsvExportable[ExporterConfig]):
|
||||||
_converter_factory: dict[
|
|
||||||
str, Type[X2XlsxConverter | ConverterIdentity]] = {
|
|
||||||
".csv": ConverterCsv2Xlsx,
|
|
||||||
".xlsx": ConverterIdentity
|
|
||||||
}
|
|
||||||
|
|
||||||
def __init__(self, config: XlsxWorkflowConfig):
|
def __init__(self, config: XlsxWorkflowConfig):
|
||||||
super().__init__(config=config)
|
super().__init__(config=config)
|
||||||
@@ -36,17 +32,25 @@ class XlsxWorkflow(Workflow[XlsxWorkflowConfig, Document, Document], HTMLExporta
|
|||||||
for sub_config in [self.config.translator_config]:
|
for sub_config in [self.config.translator_config]:
|
||||||
if sub_config:
|
if sub_config:
|
||||||
sub_config.logger = config.logger
|
sub_config.logger = config.logger
|
||||||
|
self._converter_factory: dict[
|
||||||
|
str, tuple[
|
||||||
|
type[X2XlsxConverter | ConverterIdentity], ConverterConfig|None]] = {
|
||||||
|
".csv": (ConverterCsv2Xlsx, ConverterCsv2XlsxConfig(logger=self.logger)),
|
||||||
|
".xlsx": (ConverterIdentity,None)
|
||||||
|
}
|
||||||
|
|
||||||
def _get_document_xlsx(self, document: Document) -> Document:
|
def _get_document_xlsx(self, document: Document) -> Document:
|
||||||
suffix = document.suffix
|
suffix = document.suffix
|
||||||
converter_type = self._converter_factory.get(suffix)
|
converter_types = self._converter_factory.get(suffix)
|
||||||
if converter_type is None:
|
if converter_types is None:
|
||||||
raise ValueError(f"Xlsx工作流不支持{suffix}格式文件")
|
raise ValueError(f"Xlsx工作流不支持{suffix}格式文件")
|
||||||
converter = converter_type()
|
converter_type, converter_config = converter_types
|
||||||
|
converter = converter_type(converter_config)
|
||||||
|
|
||||||
return converter.convert(document)
|
return converter.convert(document)
|
||||||
|
|
||||||
def _pre_translate(self, document_pre_transalte: Document):
|
def _pre_translate(self, document_pre_translate: Document):
|
||||||
document = document_pre_transalte.copy()
|
document = document_pre_translate.copy()
|
||||||
translate_config = self.config.translator_config
|
translate_config = self.config.translator_config
|
||||||
translator = XlsxTranslator(translate_config)
|
translator = XlsxTranslator(translate_config)
|
||||||
return document, translator
|
return document, translator
|
||||||
|
|||||||
Reference in New Issue
Block a user