第一个能用的前后端

This commit is contained in:
xunbu
2025-07-29 15:00:49 +08:00
parent 556b6d84ba
commit a4568afafb
11 changed files with 334 additions and 266 deletions

View File

@@ -23,13 +23,27 @@ from fastapi.responses import HTMLResponse, JSONResponse, StreamingResponse, Fil
from fastapi.staticfiles import StaticFiles from fastapi.staticfiles import StaticFiles
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from docutranslate import FileTranslater, __version__ # --- 核心代码重构后的新 Imports ---
from docutranslate.manager.base_manager import BaseManager
from docutranslate.manager.md_based_manager import MarkdownBasedManager
from docutranslate.manager.txt_manager import TXTManager
from docutranslate.manager.interfaces import HTMLExportable, MDFormatsExportable, TXTExportable
from docutranslate.converter.x2md.converter_docling import ConverterDoclingConfig
from docutranslate.converter.x2md.converter_mineru import ConverterMineruConfig
from docutranslate.exporter.md2x.md2html_exporter import MD2HTMLExportConfig
from docutranslate.exporter.txt2x.txt2html_exporter import TXT2HTMLExportConfig
from docutranslate.translater.base import AiTranslateConfig
from docutranslate.translater.md_translator import MDTranslateConfig
from docutranslate.translater.txt_translator import TXTTranslateConfig
# ------------------------------------
from docutranslate import __version__
from docutranslate.global_values import available_packages from docutranslate.global_values import available_packages
from docutranslate.logger import global_logger from docutranslate.logger import global_logger
from docutranslate.translater import default_params from docutranslate.translater import default_params
from docutranslate.utils.resource_utils import resource_path from docutranslate.utils.resource_utils import resource_path
# --- 全局配置 --- # --- 全局配置 (MODIFIED) ---
tasks_state: Dict[str, Dict[str, Any]] = {} tasks_state: Dict[str, Dict[str, Any]] = {}
tasks_log_queues: Dict[str, asyncio.Queue] = {} tasks_log_queues: Dict[str, asyncio.Queue] = {}
tasks_log_histories: Dict[str, List[str]] = {} tasks_log_histories: Dict[str, List[str]] = {}
@@ -37,18 +51,33 @@ MAX_LOG_HISTORY = 200
httpx_client: httpx.AsyncClient httpx_client: httpx.AsyncClient
# --- 辅助函数 --- # --- 辅助函数 (MODIFIED) ---
def _create_default_task_state() -> Dict[str, Any]: def _create_default_task_state() -> Dict[str, Any]:
"""创建新的默认任务状态,存储 manager 实例而不是具体内容"""
return { return {
"is_processing": False, "status_message": "空闲", "error_flag": False, "is_processing": False, "status_message": "空闲", "error_flag": False,
"download_ready": False, "markdown_content": None, "markdown_zip_content": None, "download_ready": False,
"html_content": None, "original_filename_stem": None, "task_start_time": 0, "manager_instance": None, # <--- 核心改动:存储翻译后的 Manager 实例
"original_filename_stem": None, "task_start_time": 0,
"task_end_time": 0, "current_task_ref": None, "task_end_time": 0, "current_task_ref": None,
"original_filename": None, "original_filename": None,
} }
# --- 日志处理器 (修改接收task_id用于控制台打印) --- # --- Manager 工厂函数 (NEW) ---
def _get_manager_for_file(filename: str, logger: logging.Logger) -> BaseManager:
    """Build the manager that should process ``filename``.

    The choice is driven purely by the lower-cased file extension:
    ``.txt`` is handled by ``TXTManager``; every other extension goes
    through ``MarkdownBasedManager`` (intended for .pdf, .docx, .md, etc.).
    This is the place to register additional manager types.

    Args:
        filename: Name of the uploaded file, including its extension.
        logger: Logger that receives the selection message and is handed
            to the created manager.

    Returns:
        A newly constructed manager instance.
    """
    extension = Path(filename).suffix.lower()

    # Anything that is not plain text takes the Markdown-based pipeline.
    if extension != '.txt':
        logger.info(f"检测到 {extension} 文件,使用 MarkdownBasedManager。")
        return MarkdownBasedManager(logger=logger)

    logger.info("检测到 .txt 文件,使用 TXTManager。")
    return TXTManager(logger=logger)
# --- 日志处理器 (保持不变) ---
class QueueAndHistoryHandler(logging.Handler): class QueueAndHistoryHandler(logging.Handler):
def __init__(self, queue_ref: asyncio.Queue, history_list_ref: List[str], max_history_items: int, task_id: str): def __init__(self, queue_ref: asyncio.Queue, history_list_ref: List[str], max_history_items: int, task_id: str):
super().__init__() super().__init__()
@@ -59,18 +88,12 @@ class QueueAndHistoryHandler(logging.Handler):
def emit(self, record: logging.LogRecord): def emit(self, record: logging.LogRecord):
log_entry = self.format(record) log_entry = self.format(record)
# 打印到控制台并带上任务ID前缀
print(f"[{self.task_id}] {log_entry}") print(f"[{self.task_id}] {log_entry}")
# 添加到历史记录
self.history_list.append(log_entry) self.history_list.append(log_entry)
if len(self.history_list) > self.max_history: if len(self.history_list) > self.max_history:
del self.history_list[:len(self.history_list) - self.max_history] del self.history_list[:len(self.history_list) - self.max_history]
# 放入异步队列供API拉取
if self.queue is not None: if self.queue is not None:
try: try:
# 使用事件循环来安全地从线程logging可能在不同线程放入队列
main_loop = getattr(app.state, "main_event_loop", None) main_loop = getattr(app.state, "main_event_loop", None)
if main_loop and main_loop.is_running(): if main_loop and main_loop.is_running():
main_loop.call_soon_threadsafe(self.queue.put_nowait, log_entry) main_loop.call_soon_threadsafe(self.queue.put_nowait, log_entry)
@@ -82,7 +105,7 @@ class QueueAndHistoryHandler(logging.Handler):
print(f"[{self.task_id}] Error putting log to queue: {e}. Log: {log_entry}") print(f"[{self.task_id}] Error putting log to queue: {e}. Log: {log_entry}")
# --- 应用生命周期事件 --- # --- 应用生命周期事件 (保持不变) ---
@asynccontextmanager @asynccontextmanager
async def lifespan(app: FastAPI): async def lifespan(app: FastAPI):
global httpx_client global httpx_client
@@ -91,75 +114,97 @@ async def lifespan(app: FastAPI):
tasks_state.clear() tasks_state.clear()
tasks_log_queues.clear() tasks_log_queues.clear()
tasks_log_histories.clear() tasks_log_histories.clear()
# 全局日志器配置(如果需要)
global_logger.propagate = False global_logger.propagate = False
global_logger.setLevel(logging.INFO) global_logger.setLevel(logging.INFO)
print("应用启动完成,多任务状态已初始化。") print("应用启动完成,多任务状态已初始化。")
yield yield
await httpx_client.aclose() await httpx_client.aclose()
print("应用关闭,资源已清理。") print("应用关闭,资源已清理。")
# --- Background Task Logic (核心业务逻辑, 已修改) --- # --- Background Task Logic (核心业务逻辑, 已重构) ---
async def _perform_translation(task_id: str, params: Dict[str, Any], file_contents: bytes, original_filename: str): async def _perform_translation(task_id: str, params: Dict[str, Any], file_contents: bytes, original_filename: str):
task_state = tasks_state[task_id] task_state = tasks_state[task_id]
log_queue = tasks_log_queues[task_id] log_queue = tasks_log_queues[task_id]
log_history = tasks_log_histories[task_id] log_history = tasks_log_histories[task_id]
# 1. 为此任务创建一个独立的 logger
task_logger = logging.getLogger(f"task.{task_id}") task_logger = logging.getLogger(f"task.{task_id}")
task_logger.setLevel(logging.INFO) task_logger.setLevel(logging.INFO)
task_logger.propagate = False # 关键:防止日志冒泡到 root logger避免重复输出 task_logger.propagate = False
# 如果 logger 已有 handlers (例如任务重试), 先清空
if task_logger.hasHandlers(): if task_logger.hasHandlers():
task_logger.handlers.clear() task_logger.handlers.clear()
# 2. 创建一个 handler它会处理此任务的日志打印到控制台 & 放入队列)
task_handler = QueueAndHistoryHandler(log_queue, log_history, MAX_LOG_HISTORY, task_id=task_id) task_handler = QueueAndHistoryHandler(log_queue, log_history, MAX_LOG_HISTORY, task_id=task_id)
task_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')) task_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
# 3. 将 handler 添加到独立的 task_logger
task_logger.addHandler(task_handler) task_logger.addHandler(task_handler)
task_logger.info(f"后台翻译任务开始: 文件 '{original_filename}'") task_logger.info(f"后台翻译任务开始: 文件 '{original_filename}'")
task_state["status_message"] = f"正在处理 '{original_filename}'..." task_state["status_message"] = f"正在处理 '{original_filename}'..."
try: try:
task_logger.info(f"使用 Base URL: {params['base_url']}, Model: {params['model_id']}") # 1. 选择合适的 Manager
manager = _get_manager_for_file(original_filename, task_logger)
# 4. 将独立的 task_logger 传递给 FileTranslater # 2. 从扁平化的 params 构建结构化的 Config 对象
ft = FileTranslater( ai_config = AiTranslateConfig(
base_url=params['base_url'], key=params['apikey'], model_id=params['model_id'], base_url=params['base_url'],
chunk_size=params['chunk_size'], concurrent=params['concurrent'], api_key=params['apikey'],
temperature=params['temperature'], convert_engin=params['convert_engin'], model_id=params['model_id'],
mineru_token=params['mineru_token'], to_lang=params['to_lang'],
logger=task_logger # <--- 核心修改 custom_prompt=params['custom_prompt_translate'],
temperature=params['temperature'],
timeout=2000, # 保持默认或从params获取
chunk_size=params['chunk_size'],
concurrent=params['concurrent'],
logger=task_logger
) )
await ft.translate_bytes_async( # 3. 读取文件内容
name=original_filename, file=file_contents, to_lang=params['to_lang'], file_stem = Path(original_filename).stem
formula=params['formula_ocr'], code=params['code_ocr'], file_suffix = Path(original_filename).suffix
custom_prompt_translate=params['custom_prompt_translate'], manager.read_bytes(content=file_contents, stem=file_stem, suffix=file_suffix)
refine=params['refine_markdown'], save=False
)
md_content = ft.export_to_markdown()
md_zip_content = await ft.export_to_unembed_markdown_async()
try:
await httpx_client.head("https://s4.zstatic.net/ajax/libs/KaTeX/0.16.9/contrib/auto-render.min.js",
timeout=3)
html_content = await ft.export_to_html_async(title=task_state["original_filename_stem"], cdn=True)
except (httpx.TimeoutException, httpx.RequestError):
task_logger.info("CDN连接失败使用本地JS进行渲染。")
html_content = await ft.export_to_html_async(title=task_state["original_filename_stem"], cdn=False)
# 4. 根据 Manager 类型执行不同的翻译流程
if isinstance(manager, MarkdownBasedManager):
task_logger.info("使用 Markdown 翻译流程。")
translate_config = MDTranslateConfig(**ai_config.__dict__)
convert_engin = params['convert_engin']
convert_config = None
if convert_engin == 'mineru':
if not params.get('mineru_token'):
raise ValueError("使用 'mineru' 引擎需要提供 'mineru_token'")
convert_config = ConverterMineruConfig(
mineru_token=params['mineru_token'],
formula=params['formula_ocr']
)
elif convert_engin == 'docling':
convert_config = ConverterDoclingConfig(
code=params['code_ocr'],
formula=params['formula_ocr']
)
await manager.translate_async(
convert_engin=convert_engin,
convert_config=convert_config,
translate_config=translate_config
)
elif isinstance(manager, TXTManager):
task_logger.info("使用 TXT 翻译流程。")
translate_config = TXTTranslateConfig(**ai_config.__dict__)
await manager.translate_async(translate_config=translate_config)
else:
raise TypeError(f"不支持的 Manager 类型: {type(manager).__name__}")
# 5. 任务成功,存储 manager 实例并更新状态
end_time = time.time() end_time = time.time()
duration = end_time - task_state["task_start_time"] duration = end_time - task_state["task_start_time"]
task_state.update({ task_state.update({
"markdown_content": md_content, "markdown_zip_content": md_zip_content, "manager_instance": manager, # <--- 存储实例
"html_content": html_content, "status_message": f"翻译成功!用时 {duration:.2f} 秒。", "status_message": f"翻译成功!用时 {duration:.2f} 秒。",
"download_ready": True, "error_flag": False, "task_end_time": end_time, "download_ready": True,
"error_flag": False,
"task_end_time": end_time,
}) })
task_logger.info(f"翻译成功完成,用时 {duration:.2f} 秒。") task_logger.info(f"翻译成功完成,用时 {duration:.2f} 秒。")
@@ -168,9 +213,11 @@ async def _perform_translation(task_id: str, params: Dict[str, Any], file_conten
duration = end_time - task_state["task_start_time"] duration = end_time - task_state["task_start_time"]
task_logger.info(f"翻译任务 '{original_filename}' 已被取消 (用时 {duration:.2f} 秒).") task_logger.info(f"翻译任务 '{original_filename}' 已被取消 (用时 {duration:.2f} 秒).")
task_state.update({ task_state.update({
"status_message": f"翻译任务已取消 (用时 {duration:.2f} 秒).", "error_flag": False, "status_message": f"翻译任务已取消 (用时 {duration:.2f} 秒).",
"download_ready": False, "markdown_content": None, "md_zip_content": None, "error_flag": False,
"html_content": None, "task_end_time": end_time, "download_ready": False,
"manager_instance": None,
"task_end_time": end_time,
}) })
except Exception as e: except Exception as e:
@@ -180,19 +227,20 @@ async def _perform_translation(task_id: str, params: Dict[str, Any], file_conten
task_logger.error(error_message, exc_info=True) task_logger.error(error_message, exc_info=True)
task_state.update({ task_state.update({
"status_message": f"翻译过程中发生错误 (用时 {duration:.2f} 秒): {e}", "status_message": f"翻译过程中发生错误 (用时 {duration:.2f} 秒): {e}",
"error_flag": True, "download_ready": False, "markdown_content": None, "error_flag": True,
"md_zip_content": None, "html_content": None, "task_end_time": end_time, "download_ready": False,
"manager_instance": None,
"task_end_time": end_time,
}) })
finally: finally:
task_state["is_processing"] = False task_state["is_processing"] = False
task_state["current_task_ref"] = None task_state["current_task_ref"] = None
task_logger.info(f"后台翻译任务 '{original_filename}' 处理结束。") task_logger.info(f"后台翻译任务 '{original_filename}' 处理结束。")
# 清理 handler释放资源
task_logger.removeHandler(task_handler) task_logger.removeHandler(task_handler)
# --- 核心任务启动与取消逻辑 (无修改) --- # --- 核心任务启动与取消逻辑 (保持不变) ---
async def _start_translation_task( async def _start_translation_task(
task_id: str, task_id: str,
params: Dict[str, Any], params: Dict[str, Any],
@@ -211,7 +259,7 @@ async def _start_translation_task(
task_state["is_processing"] = True task_state["is_processing"] = True
task_state.update({ task_state.update({
"status_message": "任务初始化中...", "error_flag": False, "download_ready": False, "status_message": "任务初始化中...", "error_flag": False, "download_ready": False,
"markdown_content": None, "md_zip_content": None, "html_content": None, "manager_instance": None, # 重置
"original_filename_stem": Path(original_filename).stem, "original_filename_stem": Path(original_filename).stem,
"original_filename": original_filename, "original_filename": original_filename,
"task_start_time": time.time(), "task_end_time": 0, "current_task_ref": None, "task_start_time": time.time(), "task_end_time": 0, "current_task_ref": None,
@@ -227,7 +275,7 @@ async def _start_translation_task(
break break
initial_log_msg = f"收到新的翻译请求: {original_filename}" initial_log_msg = f"收到新的翻译请求: {original_filename}"
print(f"[{task_id}] {initial_log_msg}") # 初始消息直接打印 print(f"[{task_id}] {initial_log_msg}")
log_history.append(initial_log_msg) log_history.append(initial_log_msg)
await log_queue.put(initial_log_msg) await log_queue.put(initial_log_msg)
@@ -261,7 +309,7 @@ def _cancel_translation_logic(task_id: str):
return {"cancelled": True, "message": "取消请求已发送。请等待状态更新。"} return {"cancelled": True, "message": "取消请求已发送。请等待状态更新。"}
# --- FastAPI 应用和路由设置 --- # --- FastAPI 应用和路由设置 (保持不变) ---
tags_metadata = [ tags_metadata = [
{ {
"name": "Service API", "name": "Service API",
@@ -304,7 +352,6 @@ DocuTranslate 后端服务 API提供文档翻译、状态查询、结果下
) )
service_router = APIRouter(prefix="/service", tags=["Service API"]) service_router = APIRouter(prefix="/service", tags=["Service API"])
STATIC_DIR = resource_path("static") STATIC_DIR = resource_path("static")
app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static") app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")
@@ -313,78 +360,32 @@ app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")
# --- Pydantic Models for Service API (MODIFIED) --- # --- Pydantic Models for Service API (MODIFIED) ---
# =================================================================== # ===================================================================
class TranslateServiceRequest(BaseModel): class TranslateServiceRequest(BaseModel):
base_url: str = Field( base_url: str = Field(..., description="LLM API的基础URL。", examples=["https://api.openai.com/v1"])
..., apikey: str = Field(..., description="LLM API的密钥。", examples=["sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxx"])
description="LLM API的基础URL例如 OpenAI, deepseek, 或任何兼容OpenAI的接口。", model_id: str = Field(..., description="要使用的LLM模型ID。", examples=["gpt-4o"])
examples=["https://api.openai.com/v1"] to_lang: str = Field(default="中文", description="目标翻译语言。", examples=["简体中文", "English"])
)
apikey: str = Field( # --- Converter Params ---
..., convert_engin: Literal["mineru", "docling", "auto"] = Field(
description="LLM API的密钥。", "auto",
examples=["sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxx"] description="文档解析引擎。`mineru`在线服务, `docling`本地引擎, `auto`自动选择(优先mineru)。",
) examples=["mineru", "docling", "auto"]
model_id: str = Field(
...,
description="要使用的LLM模型ID。",
examples=["gpt-4o", "gpt-4-turbo", "llama3-70b-8192"]
)
to_lang: str = Field(
default="中文",
description="目标翻译语言。",
examples=["简体中文", "English", "英语"]
)
formula_ocr: bool = Field(
default=True,
description="是否对文档中的公式进行OCR识别和渲染。"
)
code_ocr: bool = Field(
default=True,
description="是否对文档中的代码块进行OCR识别。仅在使用 `docling` 引擎时有效。"
)
refine_markdown: bool = Field(
default=False,
description="是否在翻译前使用AI对原始解析出的Markdown进行一次优化目前不推荐常规使用。"
)
convert_engin: str = Field(
"mineru",
description="文档解析和转换引擎。`mineru` 是默认的在线服务,`docling` 是可选的本地引擎(如果已安装)。",
examples=["mineru", "docling"]
)
mineru_token: Optional[str] = Field(
default=None,
description="当 `convert_engin` 设置为 'mineru'此项为必填的API令牌。",
examples=["your-secret-mineru-token"]
)
chunk_size: int = Field(
default_params["chunk_size"],
description="将文本分割的块大小(以字符为单位)。",
examples=[3000]
)
concurrent: int = Field(
default_params["concurrent"],
description="同时向LLM API发送的并发请求数量。增加此值可以加快翻译速度但需注意不要超过API的速率限制。",
examples=[10]
)
temperature: float = Field(
default_params["temperature"],
description="LLM的温度参数介于0和2之间。较高的值如0.8会使输出更随机而较低的值如0.2)会使其更具确定性。对于翻译任务,建议使用较低的值。",
examples=[0.1]
)
custom_prompt_translate: Optional[str] = Field(
default=None,
description="用户自定义的翻译Prompt。可以提供额外的指令例如要求保留特定术语、指定翻译风格等。它将被附加到默认的系统Prompt之后。",
examples=["请将“DocuTranslate”保持原文不要翻译。"]
)
file_name: str = Field(
...,
description="上传的原始文件名,包含扩展名。用于确定文件类型和生成输出文件名。",
examples=["my_research_paper.pdf"]
)
file_content: str = Field(
...,
description="Base64编码的文件内容。",
examples=["JVBERi0xLjQKJeLjz9MKMSAwIG9iago8PAovVHlwZXMvUGFnZXM..."]
) )
mineru_token: Optional[str] = Field(None, description="当 `convert_engin` 为 'mineru' 时必填的API令牌。")
formula_ocr: bool = Field(True, description="是否对公式进行OCR识别。对 `mineru` 和 `docling` 均有效。")
code_ocr: bool = Field(True, description="是否对代码块进行OCR识别。仅 `docling` 引擎有效。")
# --- Translator Params ---
chunk_size: int = Field(default_params["chunk_size"], description="文本分割的块大小(字符)。")
concurrent: int = Field(default_params["concurrent"], description="并发请求数。")
temperature: float = Field(default_params["temperature"], description="LLM温度参数。")
custom_prompt_translate: Optional[str] = Field(None, description="用户自定义的翻译Prompt。")
# --- File Info ---
file_name: str = Field(..., description="上传的原始文件名,含扩展名。", examples=["my_paper.pdf"])
file_content: str = Field(..., description="Base64编码的文件内容。", examples=["JVBERi0xLjQK..."])
# refine_markdown: bool = Field(False, description="[已废弃] 此功能在新版中已移除。")
class Config: class Config:
json_schema_extra = { json_schema_extra = {
@@ -393,11 +394,10 @@ class TranslateServiceRequest(BaseModel):
"apikey": "sk-your-api-key-here", "apikey": "sk-your-api-key-here",
"model_id": "gpt-4o", "model_id": "gpt-4o",
"to_lang": "简体中文", "to_lang": "简体中文",
"formula_ocr": True,
"code_ocr": True,
"refine_markdown": False,
"convert_engin": "mineru", "convert_engin": "mineru",
"mineru_token": "your-mineru-token-if-any", "mineru_token": "your-mineru-token-if-any",
"formula_ocr": True,
"code_ocr": True,
"chunk_size": 3000, "chunk_size": 3000,
"concurrent": 10, "concurrent": 10,
"temperature": 0.1, "temperature": 0.1,
@@ -409,7 +409,7 @@ class TranslateServiceRequest(BaseModel):
# =================================================================== # ===================================================================
# --- Service Endpoints (/service) (MODIFIED) --- # --- Service Endpoints (/service) (部分已重构) ---
# =================================================================== # ===================================================================
@service_router.post( @service_router.post(
@@ -453,6 +453,12 @@ async def service_translate(request: TranslateServiceRequest = Body(..., descrip
raise HTTPException(status_code=400, detail=f"无效的Base64文件内容: {e}") raise HTTPException(status_code=400, detail=f"无效的Base64文件内容: {e}")
params = request.model_dump(exclude={'file_name', 'file_content'}) params = request.model_dump(exclude={'file_name', 'file_content'})
# 自动选择引擎逻辑
if params['convert_engin'] == 'auto':
params['convert_engin'] = 'mineru' if params.get('mineru_token') else 'docling'
print(f"[{task_id}] 自动选择解析引擎: {params['convert_engin']}")
try: try:
response_data = await _start_translation_task( response_data = await _start_translation_task(
task_id=task_id, task_id=task_id,
@@ -462,7 +468,6 @@ async def service_translate(request: TranslateServiceRequest = Body(..., descrip
) )
return JSONResponse(content=response_data) return JSONResponse(content=response_data)
except HTTPException as e: except HTTPException as e:
# 重新包装为JSONResponse以匹配文档中的响应模型
if e.status_code == 429: if e.status_code == 429:
return JSONResponse(status_code=e.status_code, content={"task_started": False, "message": e.detail}) return JSONResponse(status_code=e.status_code, content={"task_started": False, "message": e.detail})
if e.status_code == 500: if e.status_code == 500:
@@ -538,18 +543,15 @@ async def service_release_task(
task_state = tasks_state.get(task_id) task_state = tasks_state.get(task_id)
message_parts = [] message_parts = []
# 如果任务正在运行,先取消它
if task_state and task_state.get("is_processing") and task_state.get("current_task_ref"): if task_state and task_state.get("is_processing") and task_state.get("current_task_ref"):
try: try:
print(f"[{task_id}] 任务正在进行中,将在释放前尝试取消。") print(f"[{task_id}] 任务正在进行中,将在释放前尝试取消。")
_cancel_translation_logic(task_id) _cancel_translation_logic(task_id)
message_parts.append("任务已被取消。") message_parts.append("任务已被取消。")
except HTTPException as e: except HTTPException as e:
# 忽略取消失败的异常(例如任务已完成),因为我们的最终目标是释放资源
print(f"[{task_id}] 取消任务时出现预期中的情况(可能已完成): {e.detail}") print(f"[{task_id}] 取消任务时出现预期中的情况(可能已完成): {e.detail}")
message_parts.append(f"任务取消步骤已跳过(可能已完成或取消)。") message_parts.append(f"任务取消步骤已跳过(可能已完成或取消)。")
# 释放所有相关资源
tasks_state.pop(task_id, None) tasks_state.pop(task_id, None)
tasks_log_queues.pop(task_id, None) tasks_log_queues.pop(task_id, None)
tasks_log_histories.pop(task_id, None) tasks_log_histories.pop(task_id, None)
@@ -588,11 +590,7 @@ async def service_release_task(
"original_filename": "annual_report_2023.pdf", "original_filename": "annual_report_2023.pdf",
"task_start_time": 1678886400.123, "task_start_time": 1678886400.123,
"task_end_time": 0, "task_end_time": 0,
"downloads": { "downloads": {}
"markdown": None,
"markdown_zip": None,
"html": None
}
} }
}, },
"completed": { "completed": {
@@ -626,11 +624,7 @@ async def service_release_task(
"original_filename": "annual_report_2023.pdf", "original_filename": "annual_report_2023.pdf",
"task_start_time": 1678886400.123, "task_start_time": 1678886400.123,
"task_end_time": 1678886445.793, "task_end_time": 1678886445.793,
"downloads": { "downloads": {}
"markdown": None,
"markdown_zip": None,
"html": None
}
} }
} }
} }
@@ -650,8 +644,17 @@ async def service_get_status(
if not task_state: if not task_state:
raise HTTPException(status_code=404, detail=f"找不到任务ID '{task_id}'") raise HTTPException(status_code=404, detail=f"找不到任务ID '{task_id}'")
def generate_service_url(file_type): # (MODIFIED) 动态生成可用的下载链接
return f"/service/download/{task_id}/{file_type}" if task_state["download_ready"] else None downloads = {}
if task_state.get("download_ready") and task_state.get("manager_instance"):
manager = task_state["manager_instance"]
if isinstance(manager, HTMLExportable):
downloads["html"] = f"/service/download/{task_id}/html"
if isinstance(manager, MDFormatsExportable):
downloads["markdown"] = f"/service/download/{task_id}/markdown"
downloads["markdown_zip"] = f"/service/download/{task_id}/markdown_zip"
if isinstance(manager, TXTExportable):
downloads["txt"] = f"/service/download/{task_id}/txt"
return JSONResponse(content={ return JSONResponse(content={
"task_id": task_id, "task_id": task_id,
@@ -663,11 +666,7 @@ async def service_get_status(
"original_filename": task_state.get("original_filename"), "original_filename": task_state.get("original_filename"),
"task_start_time": task_state["task_start_time"], "task_start_time": task_state["task_start_time"],
"task_end_time": task_state["task_end_time"], "task_end_time": task_state["task_end_time"],
"downloads": { "downloads": downloads
"markdown": generate_service_url("markdown"),
"markdown_zip": generate_service_url("markdown_zip"),
"html": generate_service_url("html"),
}
}) })
@@ -711,7 +710,50 @@ async def service_get_logs(
return JSONResponse(content={"logs": new_logs}) return JSONResponse(content={"logs": new_logs})
FileType = Literal["markdown", "markdown_zip", "html"] FileType = Literal["markdown", "markdown_zip", "html", "txt"]
async def _get_content_from_manager(task_id: str, file_type: FileType) -> tuple[bytes | str, str, str]:
    """Export one result format from the manager stored for a task.

    Args:
        task_id: Key of the task in ``tasks_state``.
        file_type: Requested export format.

    Returns:
        A ``(content, media_type, filename)`` tuple. Text formats are
        returned UTF-8 encoded; the markdown zip is returned exactly as
        the manager produced it.

    Raises:
        HTTPException: 404 when the task is unknown, when its result is not
            ready, or when the stored manager cannot export ``file_type``;
            500 when the export itself raises.
    """
    state = tasks_state.get(task_id)
    if not state:
        raise HTTPException(status_code=404, detail=f"找不到任务ID '{task_id}'")

    has_result = state.get("download_ready") and state.get("manager_instance")
    if not has_result:
        raise HTTPException(status_code=404, detail="内容尚未准备好。")

    manager: BaseManager = state["manager_instance"]
    stem = state['original_filename_stem']
    cdn_probe_url = "https://s4.zstatic.net/ajax/libs/KaTeX/0.16.9/contrib/auto-render.min.js"

    try:
        if file_type == 'html' and isinstance(manager, HTMLExportable):
            # The HTML export config class depends on the manager family.
            if isinstance(manager, MarkdownBasedManager):
                export_config = MD2HTMLExportConfig(cdn=True)
            else:
                export_config = TXT2HTMLExportConfig(cdn=True)

            # Probe the CDN first; if it cannot be reached, switch the
            # export config away from CDN assets.
            try:
                await httpx_client.head(cdn_probe_url, timeout=3)
            except (httpx.TimeoutException, httpx.RequestError):
                manager.logger.info("CDN连接失败使用本地JS进行渲染。")
                if hasattr(export_config, 'cdn'):
                    export_config.cdn = False

            html_text = manager.export_to_html(export_config)
            html_name = f"{stem}_translated.html"
            return html_text.encode('utf-8'), "text/html; charset=utf-8", html_name

        if file_type == 'markdown' and isinstance(manager, MDFormatsExportable):
            markdown_name = f"{stem}_translated.md"
            markdown_bytes = manager.export_to_markdown().encode('utf-8')
            return markdown_bytes, "text/markdown; charset=utf-8", markdown_name

        if file_type == 'markdown_zip' and isinstance(manager, MDFormatsExportable):
            archive_name = f"{stem}_translated.zip"
            return manager.export_to_markdown_zip(), "application/zip", archive_name

        if file_type == 'txt' and isinstance(manager, TXTExportable):
            text_name = f"{stem}_translated.txt"
            text_bytes = manager.export_to_txt().encode('utf-8')
            return text_bytes, "text/plain; charset=utf-8", text_name

    except Exception as exc:
        # Any failure raised while exporting is logged and surfaced as a 500.
        manager.logger.error(f"导出 {file_type} 时出错: {exc}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"导出 {file_type} 时发生内部错误: {exc}")

    # No branch matched: this manager does not offer the requested format.
    raise HTTPException(status_code=404, detail=f"此任务不支持导出 '{file_type}' 类型的文件。")
@service_router.get( @service_router.get(
@@ -724,7 +766,8 @@ FileType = Literal["markdown", "markdown_zip", "html"]
"content": { "content": {
"text/markdown": {"schema": {"type": "string", "format": "binary"}}, "text/markdown": {"schema": {"type": "string", "format": "binary"}},
"application/zip": {"schema": {"type": "string", "format": "binary"}}, "application/zip": {"schema": {"type": "string", "format": "binary"}},
"text/html": {"schema": {"type": "string", "format": "binary"}} "text/html": {"schema": {"type": "string", "format": "binary"}},
"text/plain": {"schema": {"type": "string", "format": "binary"}},
} }
}, },
404: { 404: {
@@ -738,24 +781,9 @@ async def service_download_file(
file_type: FileType = FastApiPath(..., description="要下载的文件类型。", examples=["html"]) file_type: FileType = FastApiPath(..., description="要下载的文件类型。", examples=["html"])
): ):
"""根据任务ID和文件类型下载翻译结果。""" """根据任务ID和文件类型下载翻译结果。"""
task_state = tasks_state.get(task_id) content, media_type, filename = await _get_content_from_manager(task_id, file_type)
if not task_state: raise HTTPException(status_code=404, detail=f"找不到任务ID '{task_id}'")
if not task_state["download_ready"]: raise HTTPException(status_code=404, detail="内容尚未准备好。")
content_map = {
"markdown": (task_state["markdown_content"], "text/markdown",
f"{task_state['original_filename_stem']}_translated.md"),
"markdown_zip": (task_state["markdown_zip_content"], "application/zip",
f"{task_state['original_filename_stem']}_translated.zip"),
"html": (task_state["html_content"], "text/html", f"{task_state['original_filename_stem']}_translated.html"),
}
if file_type not in content_map: raise HTTPException(status_code=404, detail="无效的文件类型。")
content, media_type, filename = content_map[file_type]
if content is None: raise HTTPException(status_code=404, detail=f"{file_type.capitalize()} 内容不可用。")
headers = {"Content-Disposition": f"attachment; filename*=UTF-8''{quote(filename, safe='', encoding='utf-8')}"} headers = {"Content-Disposition": f"attachment; filename*=UTF-8''{quote(filename, safe='', encoding='utf-8')}"}
if isinstance(content, str): return StreamingResponse(io.StringIO(content), media_type=media_type, headers=headers)
return StreamingResponse(io.BytesIO(content), media_type=media_type, headers=headers) return StreamingResponse(io.BytesIO(content), media_type=media_type, headers=headers)
@@ -767,7 +795,7 @@ async def service_download_file(
- **返回结构**: JSON对象包含 `file_type`, `filename`, 和 `content` 三个字段。 - **返回结构**: JSON对象包含 `file_type`, `filename`, 和 `content` 三个字段。
- **内容编码**: - **内容编码**:
- 对于 `html` `markdown` 类型, `content` 字段包含原始的文本内容。 - 对于 `html`, `markdown`, `txt` 类型, `content` 字段包含原始的文本内容。
- 对于 `markdown_zip` 类型, `content` 字段包含Base64编码后的字符串。 - 对于 `markdown_zip` 类型, `content` 字段包含Base64编码后的字符串。
- **使用场景**: 适用于需要以编程方式处理文件内容及其元数据(如建议的文件名)的客户端。 - **使用场景**: 适用于需要以编程方式处理文件内容及其元数据(如建议的文件名)的客户端。
- **下载就绪**: 调用前请通过状态接口确认 `download_ready` 为 `true`。 - **下载就绪**: 调用前请通过状态接口确认 `download_ready` 为 `true`。
@@ -778,19 +806,11 @@ async def service_download_file(
"content": { "content": {
"application/json": { "application/json": {
"examples": { "examples": {
"markdown": {
"summary": "Markdown 内容",
"value": {
"file_type": "markdown",
"original_filename": "my_doc.pdf",
"content": "# 标题\n\n这是翻译后的Markdown内容..."
}
},
"html": { "html": {
"summary": "HTML 内容", "summary": "HTML 内容",
"value": { "value": {
"file_type": "html", "file_type": "html",
"original_filename": "my_doc.pdf", "original_filename": "my_doc_translated.html",
"content": "<h1>标题</h1><p>这是翻译后的HTML内容...</p>" "content": "<h1>标题</h1><p>这是翻译后的HTML内容...</p>"
} }
}, },
@@ -798,7 +818,7 @@ async def service_download_file(
"summary": "ZIP 内容 (Base64)", "summary": "ZIP 内容 (Base64)",
"value": { "value": {
"file_type": "markdown_zip", "file_type": "markdown_zip",
"filename": "my_doc.pdf", "filename": "my_doc_translated.zip",
"content": "UEsDBBQAAAAIA... (base64-encoded string)" "content": "UEsDBBQAAAAIA... (base64-encoded string)"
} }
} }
@@ -807,7 +827,7 @@ async def service_download_file(
} }
}, },
404: { 404: {
"description": "资源未找到。可能的原因包括任务ID不存在、任务结果尚未就绪、或请求了无效的文件类型。", "description": "资源未找到。",
"content": {"application/json": {"example": {"detail": "内容尚未准备好。"}}} "content": {"application/json": {"example": {"detail": "内容尚未准备好。"}}}
}, },
} }
@@ -817,30 +837,22 @@ async def service_content(
file_type: FileType = FastApiPath(..., description="要获取内容的文件类型。", examples=["html"]) file_type: FileType = FastApiPath(..., description="要获取内容的文件类型。", examples=["html"])
): ):
"""根据任务ID和文件类型以JSON格式返回内容。zip文件会进行Base64编码。""" """根据任务ID和文件类型以JSON格式返回内容。zip文件会进行Base64编码。"""
task_state = tasks_state.get(task_id) content, _, filename = await _get_content_from_manager(task_id, file_type)
if not task_state:
raise HTTPException(status_code=404, detail=f"找不到任务ID '{task_id}'")
if not task_state["download_ready"]: if isinstance(content, bytes):
raise HTTPException(status_code=404, detail="内容尚未准备好。") try:
# For text-based formats, decode to string
final_content = content.decode('utf-8')
except UnicodeDecodeError:
# For binary formats (like zip), encode to Base64
final_content = base64.b64encode(content).decode('utf-8')
else: # Should not happen with current _get_content_from_manager, but for safety
final_content = content
content_map = {
"markdown": (task_state.get("markdown_content"), task_state['original_filename']),
"markdown_zip": (task_state.get("markdown_zip_content"), task_state['original_filename']),
"html": (task_state.get("html_content"), task_state['original_filename']),
}
raw_content, filename = content_map.get(file_type, (None, None))
if raw_content is None:
raise HTTPException(status_code=404, detail=f"'{file_type}' 类型的内容不可用或生成失败。")
# 如果内容是字节串 (zip)则进行Base64编码否则直接使用字符串。
final_content = base64.b64encode(raw_content).decode('utf-8') if isinstance(raw_content, bytes) else raw_content
return JSONResponse(content={ return JSONResponse(content={
"file_type": file_type, "file_type": file_type,
"original_filename": filename, "filename": filename,
"content": final_content "content": final_content
}) })
@@ -854,14 +866,15 @@ async def service_content(
responses={ responses={
200: { 200: {
"description": "成功返回可用引擎列表。", "description": "成功返回可用引擎列表。",
"content": {"application/json": {"example": ["mineru", "docling"]}} "content": {"application/json": {"example": ["auto", "mineru", "docling"]}}
} }
} }
) )
async def service_get_engin_list(): async def service_get_engin_list():
"""返回可用的文档解析引擎列表。""" """返回可用的文档解析引擎列表。"""
engin_list = ["mineru"] engin_list = ["auto", "mineru"]
if available_packages.get("docling"): engin_list.append("docling") if available_packages.get("docling"):
engin_list.append("docling")
return JSONResponse(content=engin_list) return JSONResponse(content=engin_list)
@@ -952,10 +965,12 @@ async def custom_swagger_ui_html():
swagger_css_url="/static/swagger/swagger.css", swagger_css_url="/static/swagger/swagger.css",
) )
@app.get(app.swagger_ui_oauth2_redirect_url, include_in_schema=False) @app.get(app.swagger_ui_oauth2_redirect_url, include_in_schema=False)
async def swagger_ui_redirect(): async def swagger_ui_redirect():
return get_swagger_ui_oauth2_redirect_html() return get_swagger_ui_oauth2_redirect_html()
@app.get("/redoc", include_in_schema=False) @app.get("/redoc", include_in_schema=False)
async def redoc_html(): async def redoc_html():
return get_redoc_html( return get_redoc_html(
@@ -963,10 +978,12 @@ async def redoc_html():
title=app.title + " - ReDoc", title=app.title + " - ReDoc",
redoc_js_url="/static/redoc/redoc.js", redoc_js_url="/static/redoc/redoc.js",
) )
### ###
@app.post("/temp/translate", @app.post("/temp/translate",
summary="[临时]同步翻译接口", summary="[临时]同步翻译接口 (已重构)",
description="一个简单的、同步的翻译接口,用于快速测试。不涉及后台任务、状态管理或多格式输出。**不建议在生产环境中使用。**", description="一个简单的、同步的翻译接口,用于快速测试。不涉及后台任务、状态管理或多格式输出。**不建议在生产环境中使用。**",
tags=["Temp"], tags=["Temp"],
responses={ responses={
@@ -990,43 +1007,49 @@ async def temp_translate(
base_url: str = Body(..., description="LLM API的基础URL。", examples=["https://api.openai.com/v1"]), base_url: str = Body(..., description="LLM API的基础URL。", examples=["https://api.openai.com/v1"]),
api_key: str = Body(..., description="LLM API的密钥。", examples=["sk-xxxxxxxxxx"]), api_key: str = Body(..., description="LLM API的密钥。", examples=["sk-xxxxxxxxxx"]),
model_id: str = Body(..., description="使用的模型ID。", examples=["gpt-4-turbo"]), model_id: str = Body(..., description="使用的模型ID。", examples=["gpt-4-turbo"]),
mineru_token: str = Body(..., description="Mineru引擎的Token。"), mineru_token: Optional[str] = Body(None, description="Mineru引擎的Token。"),
file_name: str = Body(..., file_name: str = Body(...,
description="文件名用以判断文件类型。当后缀为txt时该接口返回普通文本为其他后缀时返回翻译后的markdown文本", description="文件名用以判断文件类型。当后缀为txt时该接口返回普通文本为其他后缀时返回翻译后的markdown文本",
examples=["test.txt", "test.md", "test.pdf"]), examples=["test.txt", "test.md", "test.pdf"]),
file_content: str = Body(..., description="文件内容可以是纯文本或Base64编码的字符串。"), file_content: str = Body(..., description="文件内容可以是纯文本或Base64编码的字符串。"),
to_lang: str = Body("中文", description="目标语言。", examples=["中文", "英文", "English"]), to_lang: str = Body("中文", description="目标语言。", examples=["中文", "英文", "English"]),
concurrent: int = Body(default_params["concurrent"], description="ai翻译请求并发数"), concurrent: int = Body(default_params["concurrent"], description="ai翻译请求并发数"),
temperature: float | None = Body(default_params["temperature"], description="ai翻译请求温度"), temperature: float = Body(default_params["temperature"], description="ai翻译请求温度"),
chunk_size: int = Body(default_params["chunk_size"], description="文本分块大小bytes"), chunk_size: int = Body(default_params["chunk_size"], description="文本分块大小bytes"),
custom_prompt_translate: str | None = Body(None, description="翻译自定义提示词", custom_prompt_translate: Optional[str] = Body(None, description="翻译自定义提示词",
examples=["人名保持原文不翻译"]), examples=["人名保持原文不翻译"]),
): ):
"""一个用于快速测试的同步翻译接口。""" """一个用于快速测试的同步翻译接口。"""
try:
def is_base64(s): decoded_content = base64.b64decode(file_content)
try: except (ValueError, binascii.Error):
base64.b64decode(s, validate=True) decoded_content = file_content.encode('utf-8')
return True
except (ValueError, binascii.Error):
return False
ft = FileTranslater(base_url=base_url,
key=api_key,
model_id=model_id,
mineru_token=mineru_token,
concurrent=concurrent,
temperature=temperature,
chunk_size=chunk_size,
)
try: try:
decoded_content = base64.b64decode(file_content) if is_base64(file_content) else file_content.encode('utf-8') manager = _get_manager_for_file(file_name, global_logger)
await ft.translate_bytes_async(name=file_name, file=decoded_content, to_lang=to_lang, save=False,
custom_prompt_translate=custom_prompt_translate) ai_config = AiTranslateConfig(
return {"success": True, "content": ft.export_to_markdown()} base_url=base_url, api_key=api_key, model_id=model_id, to_lang=to_lang,
custom_prompt=custom_prompt_translate, temperature=temperature,
chunk_size=chunk_size, concurrent=concurrent, logger=global_logger, timeout=2000
)
manager.read_bytes(decoded_content, Path(file_name).stem, Path(file_name).suffix)
if isinstance(manager, MarkdownBasedManager):
translate_config = MDTranslateConfig(**ai_config.__dict__)
convert_config = ConverterMineruConfig(mineru_token=mineru_token) if mineru_token else None
convert_engin = 'mineru' if mineru_token else None
await manager.translate_async(convert_engin, convert_config, translate_config)
return {"success": True, "content": manager.document_translated.get_text()}
elif isinstance(manager, TXTManager):
translate_config = TXTTranslateConfig(**ai_config.__dict__)
await manager.translate_async(translate_config)
return {"success": True, "content": manager.export_to_txt()}
except Exception as e: except Exception as e:
print(f"翻译出现错误:{e.__repr__()}") print(f"临时翻译接口出现错误:{e.__repr__()}")
return {"success": False, "reason": e.__repr__()} return {"success": False, "reason": e.__repr__()}
@@ -1050,10 +1073,10 @@ def run_app(port: int | None = None):
print(f"正在启动 DocuTranslate WebUI 版本号:{__version__}\n") print(f"正在启动 DocuTranslate WebUI 版本号:{__version__}\n")
print(f"服务接口文档: http://127.0.0.1:{port_to_use}/docs\n") print(f"服务接口文档: http://127.0.0.1:{port_to_use}/docs\n")
print(f"请用浏览器访问 http://127.0.0.1:{port_to_use}\n") print(f"请用浏览器访问 http://127.0.0.1:{port_to_use}\n")
uvicorn.run(app, host=None, port=port_to_use, workers=1) uvicorn.run(app, host="0.0.0.0", port=port_to_use, workers=1)
except Exception as e: except Exception as e:
print(f"启动失败: {e}") print(f"启动失败: {e}")
if __name__ == "__main__": if __name__ == "__main__":
run_app() run_app()

View File

@@ -1,6 +1,3 @@
from typing import Self
from docutranslate.exporter.export_config import ExportConfig
from docutranslate.exporter.interfaces import Exporter from docutranslate.exporter.interfaces import Exporter
from docutranslate.ir.document import Document from docutranslate.ir.document import Document
from docutranslate.ir.markdown_document import MarkdownDocument from docutranslate.ir.markdown_document import MarkdownDocument

View File

@@ -9,18 +9,12 @@ from docutranslate.utils.markdown_utils import unembed_base64_images_to_zip
@dataclass @dataclass
class MD2MDExportConfig(ExportConfig): class MD2MDExportConfig(ExportConfig):
embed_images: bool = True pass
class MD2MDExporter(MDExporter): class MD2MDExporter(MDExporter):
def __init__(self, export_config: MD2MDExportConfig | None=None): def __init__(self, export_config: MD2MDExportConfig | None=None):
export_config=export_config or MD2MDExportConfig() pass
self.embed_images=export_config.embed_images
def export(self,document:MarkdownDocument)->Document: def export(self,document:MarkdownDocument)->Document:
if self.embed_images: return Document.from_bytes(suffix=".md",content=document.content,stem=document.stem)
return Document.from_bytes(suffix=".md",content=document.content,stem=document.stem)
else:
return Document.from_bytes(suffix=".zip",content=unembed_base64_images_to_zip(document.content.decode(), markdown_name=document.name),stem=document.stem)

View File

@@ -0,0 +1,21 @@
from dataclasses import dataclass
from docutranslate.exporter.export_config import ExportConfig
from docutranslate.exporter.md2x.interfaces import MDExporter
from docutranslate.ir.markdown_document import MarkdownDocument,Document
from docutranslate.utils.markdown_utils import unembed_base64_images_to_zip
@dataclass
class MD2MDZIPExportConfig(ExportConfig):
    """Export configuration for ``MD2MDZipExporter``.

    Declares no fields of its own; anything configurable comes from
    ``ExportConfig``.
    """
    pass
class MD2MDZipExporter(MDExporter):
    """Exporter that turns a Markdown document into a ``.zip`` document.

    The archive bytes are produced by ``unembed_base64_images_to_zip``.
    NOTE(review): judging by its name, that helper extracts base64-embedded
    images from the Markdown into the archive -- confirm in ``markdown_utils``.
    """

    def __init__(self, export_config: MD2MDZIPExportConfig | None = None):
        # The config argument is accepted but neither read nor stored here.
        pass

    def export(self, document: MarkdownDocument) -> Document:
        """Build a ``.zip`` Document from *document*.

        The document's content bytes are decoded to text and handed, together
        with the document's name, to ``unembed_base64_images_to_zip``; the
        result becomes the content of the returned Document, which keeps the
        original stem.
        """
        markdown_text = document.content.decode()
        archive_bytes = unembed_base64_images_to_zip(
            markdown_text,
            markdown_name=document.name,
        )
        return Document.from_bytes(
            suffix=".zip",
            content=archive_bytes,
            stem=document.stem,
        )

View File

@@ -25,8 +25,10 @@ class TXT2HTMLExporter(TXTExporter):
# language=html # language=html
pico = f'<style>{resource_path("static/pico.css").read_text(encoding="utf-8")}</style>' if not cdn else r'<link rel="stylesheet" href="https://s4.zstatic.net/ajax/libs/picocss/2.1.1/pico.min.css" integrity="sha512-+4kjFgVD0n6H3xt19Ox84B56MoS7srFn60tgdWFuO4hemtjhySKyW4LnftYZn46k3THUEiTTsbVjrHai+0MOFw==" crossorigin="anonymous" referrerpolicy="no-referrer" />' pico = f'<style>{resource_path("static/pico.css").read_text(encoding="utf-8")}</style>' if not cdn else r'<link rel="stylesheet" href="https://s4.zstatic.net/ajax/libs/picocss/2.1.1/pico.min.css" integrity="sha512-+4kjFgVD0n6H3xt19Ox84B56MoS7srFn60tgdWFuO4hemtjhySKyW4LnftYZn46k3THUEiTTsbVjrHai+0MOFw==" crossorigin="anonymous" referrerpolicy="no-referrer" />'
body='\n'.join([r'<p>'+para+'</p>' for para in document.content.decode().split("\n")])
render = jinja2.Template(html_template).render( render = jinja2.Template(html_template).render(
title=document.stem, title=document.stem,
pico=pico, pico=pico,
body=body,
) )
return Document.from_bytes(content=render.encode("utf-8"), suffix=".html", stem=document.stem) return Document.from_bytes(content=render.encode("utf-8"), suffix=".html", stem=document.stem)

View File

@@ -6,27 +6,42 @@ from docutranslate.exporter.export_config import ExportConfig
T = TypeVar("T", bound=ExportConfig) T = TypeVar("T", bound=ExportConfig)
@runtime_checkable @runtime_checkable
class HTMLExportable(Protocol): class HTMLExportable(Protocol[T]):
def export_to_html(self, export_config: T) -> str: def export_to_html(self, export_config: T | None = None) -> str:
... ...
def save_as_html(self, name: str, output_dir: Path | str, export_config: T) -> Self: def save_as_html(self, name: str, output_dir: Path | str, export_config: T | None = None) -> Self:
... ...
@runtime_checkable @runtime_checkable
class MDExportable(Protocol): class MDExportable(Protocol[T]):
def export_to_markdown(self, export_config: T) -> str:
def export_to_markdown(self, export_config: T | None = None) -> str:
... ...
def save_as_markdown(self, name: str, output_dir: Path | str, export_config: T) -> Self: def save_as_markdown(self, name: str, output_dir: Path | str, export_config: T | None = None) -> Self:
... ...
@runtime_checkable @runtime_checkable
class TXTExportable(Protocol): class MDZIPExportable(Protocol[T]):
def export_to_markdown_zip(self, export_config: T | None = None) -> bytes:
...
def save_as_markdown_zip(self, name: str, output_dir: Path | str, export_config: T | None = None) -> Self:
...
@runtime_checkable
class MDFormatsExportable(MDZIPExportable[T], MDExportable[T], Protocol):
...
@runtime_checkable
class TXTExportable(Protocol[T]):
def export_to_txt(self) -> str: def export_to_txt(self) -> str:
... ...

View File

@@ -9,13 +9,14 @@ from docutranslate.converter.x2md.converter_mineru import ConverterMineruConfig,
from docutranslate.converter.x2md.interfaces import X2MarkdownConverter from docutranslate.converter.x2md.interfaces import X2MarkdownConverter
from docutranslate.exporter.md2x.md2html_exporter import MD2HTMLExportConfig, MD2HTMLExporter from docutranslate.exporter.md2x.md2html_exporter import MD2HTMLExportConfig, MD2HTMLExporter
from docutranslate.exporter.md2x.md2md_exporter import MD2MDExportConfig, MD2MDExporter from docutranslate.exporter.md2x.md2md_exporter import MD2MDExportConfig, MD2MDExporter
from docutranslate.exporter.md2x.md2mdzip_exporter import MD2MDZIPExportConfig, MD2MDZipExporter
from docutranslate.exporter.md2x.types import x2md_convert_config_type, convert_engin_type from docutranslate.exporter.md2x.types import x2md_convert_config_type, convert_engin_type
from docutranslate.manager.base_manager import BaseManager from docutranslate.manager.base_manager import BaseManager
from docutranslate.manager.interfaces import HTMLExportable, MDExportable from docutranslate.manager.interfaces import MDFormatsExportable, HTMLExportable
from docutranslate.translater.md_translator import MDTranslateConfig, MDTranslator from docutranslate.translater.md_translator import MDTranslateConfig, MDTranslator
class MarkdownBasedManager(BaseManager, HTMLExportable, MDExportable): class MarkdownBasedManager(BaseManager, HTMLExportable, MDFormatsExportable):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
self._converter_factory: dict[str:tuple[X2MarkdownConverter, x2md_convert_config_type]] = { self._converter_factory: dict[str:tuple[X2MarkdownConverter, x2md_convert_config_type]] = {
@@ -39,7 +40,8 @@ class MarkdownBasedManager(BaseManager, HTMLExportable, MDExportable):
elif convert_engin in self._converter_factory: elif convert_engin in self._converter_factory:
converter_class, config_class = self._converter_factory[convert_engin] converter_class, config_class = self._converter_factory[convert_engin]
if not isinstance(convert_config, config_class): if not isinstance(convert_config, config_class):
raise TypeError(f"未传入正确的convert_config应传入{config_class.__name__}类型") raise TypeError(
f"未传入正确的convert_config应传入{config_class.__name__}类型,现为{type(convert_config).__name__}类型")
converter = converter_class(convert_config, logger=self.logger) converter = converter_class(convert_config, logger=self.logger)
else: else:
raise ValueError(f"不存在{convert_engin}解析引擎") raise ValueError(f"不存在{convert_engin}解析引擎")
@@ -89,16 +91,26 @@ class MarkdownBasedManager(BaseManager, HTMLExportable, MDExportable):
return docu.content.decode() return docu.content.decode()
def export_to_markdown(self, export_config: MD2MDExportConfig | None = None) -> str: def export_to_markdown(self, export_config: MD2MDExportConfig | None = None) -> str:
docu = self._export(MD2MDExporter(export_config)) docu = self._export(MD2MDExporter())
return docu.content.decode() return docu.content.decode()
def export_to_markdown_zip(self, export_config: MD2MDZIPExportConfig | None = None) -> bytes:
docu = self._export(MD2MDZipExporter())
return docu.content
def save_as_html(self, name: str = None, output_dir: Path | str = "./output", def save_as_html(self, name: str = None, output_dir: Path | str = "./output",
export_config: MD2HTMLExportConfig | None = None) -> Self: export_config: MD2HTMLExportConfig | None = None) -> Self:
self._save(exporter=MD2HTMLExporter(export_config), name=name, output_dir=output_dir) self._save(exporter=MD2HTMLExporter(), name=name, output_dir=output_dir)
return self return self
def save_as_markdown(self, name: str = None, output_dir: Path | str = "./output", def save_as_markdown(self, name: str = None, output_dir: Path | str = "./output",
export_config: MD2MDExportConfig | None = None) -> Self: export_config: MD2MDExportConfig | None = None) -> Self:
self._save(exporter=MD2MDExporter(export_config), name=name, output_dir=output_dir) self._save(exporter=MD2MDExporter(), name=name, output_dir=output_dir)
return self
def save_as_markdown_zip(self, name: str = None, output_dir: Path | str = "./output",
export_config: MD2MDZIPExportConfig | None = None) -> Self:
self._save(exporter=MD2MDZipExporter(), name=name, output_dir=output_dir)
return self return self

View File

@@ -27,7 +27,7 @@ class TXTManager(BaseManager, HTMLExportable,TXTExportable):
self.document_translated = document self.document_translated = document
return self return self
def export_to_html(self, export_config: TXT2HTMLExportConfig) -> str: def export_to_html(self, export_config: TXT2HTMLExportConfig=None) -> str:
docu = self._export(TXT2HTMLExporter(export_config)) docu = self._export(TXT2HTMLExporter(export_config))
return docu.content.decode() return docu.content.decode()

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,5 @@
# Default values for the AI translation request parameters. The web API uses
# them as the defaults of its `chunk_size`, `concurrent` and `temperature`
# request fields.
default_params: dict[str, int | float] = {
    "chunk_size": 3000,  # text chunk size (the API describes the unit as bytes)
    "concurrent": 30,  # number of concurrent AI translation requests
    "temperature": 0.7,  # temperature used for AI translation requests
}

View File

@@ -1,6 +1,5 @@
import asyncio import asyncio
from dataclasses import dataclass from dataclasses import dataclass
from logging import Logger
from typing import Self from typing import Self
from docutranslate.agents import MDTranslateAgent from docutranslate.agents import MDTranslateAgent