From 6896d018c8d44273cc6113f9b915215e1044565b Mon Sep 17 00:00:00 2001 From: xunbu Date: Sun, 11 May 2025 22:40:29 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BC=98=E5=8C=96ui?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .idea/workspace.xml | 88 ++--- README.md | 20 +- docutranslate/app.py | 740 ++++++++++++++++++------------------------- pyproject.toml | 2 +- 4 files changed, 377 insertions(+), 473 deletions(-) diff --git a/.idea/workspace.xml b/.idea/workspace.xml index e4ade9c..67e437e 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -6,7 +6,9 @@ + + - { + "keyToString": { + "DefaultHtmlFileTemplate": "HTML File", + "JavaScript 调试.output.html (1).executor": "Run", + "JavaScript 调试.output.html.executor": "Run", + "JavaScript 调试.regex_中文.html.executor": "Run", + "JavaScript 调试.test2.html.executor": "Run", + "JavaScript 调试.test2_英文.html.executor": "Run", + "JavaScript 调试.test4-1_中文.html.executor": "Run", + "JavaScript 调试.互联网认证授权机制.html.executor": "Run", + "JavaScript 调试.互联网认证授权机制_英文.html.executor": "Run", + "JavaScript 调试.毕业论文_英文.html.executor": "Run", + "ModuleVcsDetector.initialDetectionPerformed": "true", + "Python 测试.Python 测试 (markdown_mask.py 内).executor": "Run", + "Python 测试.markdown_mask.Test.test_basic_link_masking 的 Python 测试.executor": "Run", + "Python.PDFtranslater (1).executor": "Run", + "Python.PDFtranslater (2).executor": "Run", + "Python.agent.executor": "Debug", + "Python.agent_utils.executor": "Run", + "Python.app.executor": "Run", + "Python.convert.executor": "Run", + "Python.markdown_splitter.executor": "Debug", + "Python.markdown_utils.executor": "Run", + "Python.test.executor": "Run", + "Python.test1.executor": "Run", + "Python.test2.executor": "Run", + "Python.test3.executor": "Run", + "Python.test4.executor": "Run", + "Python.translater.executor": "Run", + "RunOnceActivity.ShowReadmeOnStart": "true", + "RunOnceActivity.TerminalTabsStorage.copyFrom.TerminalArrangementManager": "true", + "RunOnceActivity.git.unshallow": "true", + "git-widget-placeholder": "main", + "last_opened_file_path": "C:/Users/jxgm/Desktop/FileTranslate/tests", + "node.js.detected.package.eslint": "true", + "node.js.detected.package.tslint": "true", + "node.js.selected.package.eslint": "(autodetect)", + "node.js.selected.package.tslint": "(autodetect)", + "nodejs_package_manager_path": "npm", + "settings.editor.selected.configurable": "preferences.pluginManager", + "vue.rearranger.settings.migration": "true" } -}]]> +} @@ -645,7 +647,7 @@ - + diff --git a/README.md b/README.md index d7f649f..e33d54d 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,7 @@ # 简介 -## DocuTranslate +## DocuTranslate + [![image](https://img.shields.io/badge/github-DocuTranslate-blue)](https://github.com/xunbu/docutranslate) 文件翻译工具,借助[docling](https://github.com/docling-project/docling)与大语言模型实现多种格式文件的翻译 @@ -26,6 +27,7 @@ # 前置条件 ## huggingface换源 + > 不能科学上网的友友注意了 无法访问的huggingface的电脑在以下操作时请换源[点击测试](https://huggingface.co) @@ -61,11 +63,21 @@ os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com' ## 注意事项(第一次使用必看) -以下操作会自动从[huggingface](https://huggingface.co)下载模型,windows需要使用**管理员模式**打开IDE运行脚本,并按需换源[换源指南](#huggingface换源) +以下操作会自动从[huggingface](https://huggingface.co)下载模型,windows需要使用**管理员模式** +打开IDE运行脚本,并按需换源[换源指南](#huggingface换源) - 第一次使用该库读取、翻译非markdown文本 - 第一次使用该库的公式识别或代码识别功能 +## 使用ui界面 + +```python +from docutranslate import app +import uvicorn + +uvicorn.run(app, host="127.0.0.1", port=8010) +``` + ## 翻译文件 ```python @@ -81,7 +93,7 @@ translater.translate_file("<文件路径>", to_lang="中文") translater.translate_file("<文件路径>", to_lang="中文", formula=True, code=True) # 在先修复文本再翻译(适用于翻译pdf,但更耗时耗费) -translater.translate_file("<文件路径>", to_lang="中文",refine=True) +translater.translate_file("<文件路径>", to_lang="中文", refine=True) ``` > 下载模型时请用管理员模式打开终端运行文件(windows),并按需换源 @@ -129,7 +141,7 @@ translater = FileTranslater(base_url="", # 默认的模型baseurl chunksize=3500, # markdown分块长度(单位byte),分块越大效果越好,不建议超过8000 max_concurrent=10, # 并发数,受到ai平台并发量限制,如果文章很长建议适当加大到20以上 docling_artifact=None, # 使用提前下载好的docling模型 - timeout=2000,# 调用api的超时时间 + timeout=2000, # 调用api的超时时间 tips=True # 开场提示 ) diff --git a/docutranslate/app.py b/docutranslate/app.py index 4f8564c..b5bd949 100644 --- a/docutranslate/app.py +++ b/docutranslate/app.py @@ -1,7 +1,6 @@ import asyncio import io import logging -import os from pathlib import Path from typing import AsyncGenerator @@ -10,18 +9,26 @@ from fastapi import FastAPI, File, Form, UploadFile, Request, HTTPException from fastapi.responses import HTMLResponse, StreamingResponse, JSONResponse from fastapi.templating import Jinja2Templates -# 假设这些导入能够正确找到你的库代码 -from docutranslate import FileTranslater # Your existing FileTranslater -from docutranslate.logger import translater_logger # Your existing logger +# 导入文档翻译相关模块 +from docutranslate import FileTranslater +from docutranslate.logger import translater_logger -os.environ["FASTAPI_RUNNING"] = "true" +# 设置FastAPI运行标识 app = FastAPI() -# --- 异步队列和自定义日志处理器设置 --- -log_queue = asyncio.Queue() -SHUTDOWN_SENTINEL = object() # 使用一个唯一的对象作为哨兵 +# --- 全局配置 --- +SHUTDOWN_SENTINEL = object() # 哨兵对象,用于标识关闭 +log_queue = asyncio.Queue() # 日志队列 +current_state = { + "markdown_content": None, + "html_content": None, + "original_filename_stem": None, + "is_processing": False +} +templates = Jinja2Templates(directory=".") +# --- 日志处理器 --- class AsyncQueueHandler(logging.Handler): def __init__(self, queue: asyncio.Queue): super().__init__() @@ -29,66 +36,43 @@ class AsyncQueueHandler(logging.Handler): def emit(self, record: logging.LogRecord): log_entry = self.format(record) - # 在 FastAPI 应用上下文中运行时,尝试使用 app.state.main_event_loop - main_loop = getattr(app.state, "main_event_loop", None) - if main_loop and main_loop.is_running(): - main_loop.call_soon_threadsafe(self.queue.put_nowait, log_entry) - else: - # 如果主循环不可用或未运行(例如,在测试中或非常早期的启动/非常晚的关闭阶段) - # 这是一个备用方案,但不如 call_soon_threadsafe 安全 - try: - # 如果在主事件循环上下文之外,或者事件循环已停止, - # put_nowait 可能仍然有效,因为它不依赖于正在运行的特定循环来放置项目 - # 但理想情况下,日志记录应在主循环活跃时发生。 + try: + # 尝试使用主事件循环安全地添加日志到队列 + main_loop = getattr(app.state, "main_event_loop", None) + if main_loop and main_loop.is_running(): + main_loop.call_soon_threadsafe(self.queue.put_nowait, log_entry) + else: + # 备用方案 self.queue.put_nowait(log_entry) - except RuntimeError: # 例如,如果队列本身与已关闭的循环关联 - print(f"Error putting log to queue (loop likely closed): {log_entry[:100]}...") # 记录部分日志以避免过长输出 - self.handleError(record) # 调用基类的错误处理 - except Exception as e: - print(f"Error putting log to queue (no main loop/not running): {e}") - self.handleError(record) + except Exception as e: + print(f"Error putting log to queue: {e}") + self.handleError(record) +# --- 应用生命周期事件 --- @app.on_event("startup") async def startup_event(): app.state.main_event_loop = asyncio.get_running_loop() + # 配置日志处理器 queue_handler = AsyncQueueHandler(log_queue) queue_handler.setLevel(logging.INFO) - ui_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') - queue_handler.setFormatter(ui_formatter) - # 检查 translater_logger 是否已经有这个类型的 handler,避免重复添加 + queue_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')) + + # 避免重复添加handler if not any(isinstance(h, AsyncQueueHandler) for h in translater_logger.handlers): translater_logger.addHandler(queue_handler) - translater_logger.info("Application startup complete. Log queue handler configured.") + translater_logger.info("应用启动完成,日志队列处理器已配置。") @app.on_event("shutdown") async def shutdown_event(): - translater_logger.info("Application shutting down. Signaling log streamer to stop.") - # 向队列发送哨兵值以停止日志流生成器 + translater_logger.info("应用正在关闭,通知日志流停止。") await log_queue.put(SHUTDOWN_SENTINEL) - # (可选) 短暂等待,以允许生成器处理哨兵并退出 - await asyncio.sleep(0.1) - translater_logger.info("Log streamer signaled.") - # (可选) 清空队列中剩余的日志,如果不想在关闭时处理它们 - # while not log_queue.empty(): - # try: - # log_queue.get_nowait() - # log_queue.task_done() - # except asyncio.QueueEmpty: - # break - # translater_logger.info("Log queue cleared during shutdown.") + await asyncio.sleep(0.1) # 给处理器留出时间处理哨兵 -# --- 全局状态 --- -current_translation_state = { - "markdown_content": None, "html_content": None, "original_filename_stem": None, - "error": None, "is_processing": False -} -templates = Jinja2Templates(directory=".") # 假设模板在当前目录或使用字符串模板 - -# --- HTML 模板字符串 --- -HTML_TEMPLATE_STR = """ +# --- HTML模板 --- +HTML_TEMPLATE = """ @@ -97,134 +81,31 @@ HTML_TEMPLATE_STR = """ DocuTranslate @@ -236,7 +117,7 @@ HTML_TEMPLATE_STR = """ DocuTranslate - +
API 配置 @@ -255,12 +136,12 @@ HTML_TEMPLATE_STR = """
- +
- +
@@ -282,22 +163,13 @@ HTML_TEMPLATE_STR = """
- - - + + +
- + @@ -313,130 +185,146 @@ HTML_TEMPLATE_STR = """
运行日志
- + @@ -444,187 +332,189 @@ HTML_TEMPLATE_STR = """ """ -# --- FastAPI Endpoints --- +# --- 日志流处理 --- +async def log_stream_generator() -> AsyncGenerator[str, None]: + last_heartbeat = asyncio.get_event_loop().time() + heartbeat_interval = 15 # 15秒发送一次心跳 + + try: + while True: + try: + # 等待日志消息,带超时 + log_message = await asyncio.wait_for(log_queue.get(), timeout=1.0) + + # 检查关闭哨兵 + if log_message is SHUTDOWN_SENTINEL: + translater_logger.info("日志流收到关闭信号,正在退出。") + log_queue.task_done() + break + + # 正常处理日志 + escaped_message = log_message.replace('&', '&').replace('<', '<').replace('>', '>') + yield f"data: {escaped_message}
\n\n" + log_queue.task_done() + last_heartbeat = asyncio.get_event_loop().time() + + except asyncio.TimeoutError: + # 超时,检查是否需要发送心跳 + current_time = asyncio.get_event_loop().time() + if current_time - last_heartbeat >= heartbeat_interval: + yield "data: :heartbeat\n\n" + last_heartbeat = current_time + + except asyncio.CancelledError: + translater_logger.info("日志流被取消。") + raise + + except asyncio.CancelledError: + translater_logger.info("日志流任务被外部取消。") + raise + finally: + translater_logger.info("日志流生成器结束。") + + +# --- API端点 --- @app.get("/", response_class=HTMLResponse) -async def main_page_get_endpoint(request: Request): - # Clear log queue only if not processing, to avoid clearing logs of an ongoing task - # when page is reloaded. However, SSE should keep logs flowing. - # This logic might be redundant if SSE handles logs independently of page reloads. - if not current_translation_state["is_processing"]: +async def main_page(): + # 如果没有处理中的任务,清空日志队列 + if not current_state["is_processing"]: while not log_queue.empty(): try: - log_queue.get_nowait(); + item = log_queue.get_nowait() + if item is SHUTDOWN_SENTINEL: + await log_queue.put(SHUTDOWN_SENTINEL) log_queue.task_done() except asyncio.QueueEmpty: break - context = {"request": request, "config": {}, "message": None, "error": False, "download_ready": False} - # If you are using Jinja2Templates with a file: - # return templates.TemplateResponse("your_template_name.html", context) - # If using the string template: - jinja_env = templates.env # Or initialize a Jinja2 Environment if not using FastAPI's templates - template_obj = jinja_env.from_string(HTML_TEMPLATE_STR) - return HTMLResponse(content=template_obj.render(context)) - - -async def log_stream_generator() -> AsyncGenerator[str, None]: - last_heartbeat_time = asyncio.get_event_loop().time() - heartbeat_interval = 15 # Send heartbeat every 15 seconds - is_shutting_down = False - - try: - while not is_shutting_down: # Loop until sentinel or cancellation - log_message = None - try: - # Wait for a log message with a timeout, so we can send heartbeats - # and check for shutdown sentinel periodically. - log_message = await asyncio.wait_for(log_queue.get(), timeout=1.0) - except asyncio.TimeoutError: - # No log message in this interval, proceed to check heartbeat - pass - except asyncio.CancelledError: # Handle cancellation if client disconnects - translater_logger.info("Log stream generator cancelled by client disconnect.") - raise # Re-raise to ensure task cleanup - - if log_message is SHUTDOWN_SENTINEL: - translater_logger.info("Log stream generator received shutdown sentinel. Exiting.") - log_queue.task_done() # Mark sentinel as processed - is_shutting_down = True - break # Exit the loop - - if log_message: # Process actual log message - # Basic HTML escaping for log messages to prevent XSS if logs contain HTML/JS - escaped_message = log_message.replace('&', '&').replace('<', '<').replace('>', '>') - yield f"data: {escaped_message}
\n\n" - log_queue.task_done() - last_heartbeat_time = asyncio.get_event_loop().time() # Reset heartbeat timer on actual data - - current_time = asyncio.get_event_loop().time() - if current_time - last_heartbeat_time >= heartbeat_interval: - yield "data: :heartbeat\n\n" - last_heartbeat_time = current_time - - except asyncio.CancelledError: # Catch again if cancellation happens outside the get() - translater_logger.info("Log stream generator task was cancelled externally.") - # Ensure any pending item in queue due to this generator is marked done IF it was fetched - # However, at this point, it's safer to just re-raise. - raise - finally: - translater_logger.info("Log stream generator finished.") - # Ensure the queue is not blocked if join() is ever used elsewhere for this queue. - # If a log_message was retrieved but not task_done'd before cancellation/sentinel, - # this could be an issue. The current logic should cover it. + # 返回HTML模板 + return HTMLResponse(content=HTML_TEMPLATE) @app.get("/stream-logs") -async def stream_logs_endpoint(request: Request): +async def stream_logs(): return StreamingResponse(log_stream_generator(), media_type="text/event-stream") -@app.post("/translate", response_class=JSONResponse) -async def handle_translate_endpoint( - request: Request, # Keep request if needed for other things, like client IP - base_url: str = Form(...), apikey: str = Form(...), model_id: str = Form(...), - to_lang: str = Form("中文"), formula_ocr: bool = Form(False), - code_ocr: bool = Form(False), refine_markdown: bool = Form(False), +@app.post("/translate") +async def handle_translate( + base_url: str = Form(...), + apikey: str = Form(...), + model_id: str = Form(...), + to_lang: str = Form("中文"), + formula_ocr: bool = Form(False), + code_ocr: bool = Form(False), + refine_markdown: bool = Form(False), file: UploadFile = File(...) ): - if current_translation_state["is_processing"]: - return JSONResponse(status_code=429, content={"error": True, "message": "另一个翻译任务正在进行中,请稍后再试。"}) + # 检查是否有正在进行的任务 + if current_state["is_processing"]: + return JSONResponse( + status_code=429, + content={"error": True, "message": "另一个翻译任务正在进行中,请稍后再试。"} + ) - current_translation_state["is_processing"] = True - # It's good practice to clear the log queue for a new task if appropriate, - # or ensure old logs don't interfere. The AsyncQueueHandler means logs are - # continuously added, so clearing here makes sense for a "fresh" log view per task. - # However, the main page GET also clears it, so be mindful of desired behavior. - # For now, let's assume logs for a new task should start fresh. + # 设置处理状态 + current_state["is_processing"] = True + + # 清空日志队列 while not log_queue.empty(): try: item = log_queue.get_nowait() - if item is SHUTDOWN_SENTINEL: # Put sentinel back if accidentally removed - log_queue.put_nowait(SHUTDOWN_SENTINEL) + if item is SHUTDOWN_SENTINEL: + await log_queue.put(SHUTDOWN_SENTINEL) log_queue.task_done() except asyncio.QueueEmpty: break translater_logger.info("收到翻译请求。") - response_data = {"error": False, "message": "", "download_ready": False, "markdown_url": None, "html_url": None, - "original_filename_stem": None} + response_data = { + "error": False, + "message": "", + "download_ready": False, + "markdown_url": None, + "html_url": None, + "original_filename_stem": None + } - file_contents = None # Initialize to ensure it's defined for finally block try: - file_contents = await file.read() # Read file contents - original_filename = file.filename if file.filename else "uploaded_file" - current_translation_state["original_filename_stem"] = Path(original_filename).stem - response_data["original_filename_stem"] = current_translation_state["original_filename_stem"] + # 读取文件内容 + file_contents = await file.read() + original_filename = file.filename or "uploaded_file" + file_stem = Path(original_filename).stem + + current_state["original_filename_stem"] = file_stem + response_data["original_filename_stem"] = file_stem translater_logger.info(f"文件 '{original_filename}' 已上传, 大小: {len(file_contents)} 字节。") + # 创建翻译器并翻译 ft = FileTranslater(base_url=base_url, key=apikey, model_id=model_id, tips=False) - # Run the blocking translation task in a separate thread + # 在单独的线程中运行翻译任务 await asyncio.to_thread( - ft.translate_bytes, name=original_filename, file=file_contents, to_lang=to_lang, - formula=formula_ocr, code=code_ocr, refine=refine_markdown, save=False - # save=False if handling content in memory + ft.translate_bytes, + name=original_filename, + file=file_contents, + to_lang=to_lang, + formula=formula_ocr, + code=code_ocr, + refine=refine_markdown, + save=False ) - # Assuming FileTranslater populates its internal state with translated content - current_translation_state["markdown_content"] = ft.export_to_markdown() - current_translation_state["html_content"] = ft.export_to_html( - title=current_translation_state["original_filename_stem"]) # Pass title if your method supports it + # 保存翻译结果 + current_state["markdown_content"] = ft.export_to_markdown() + current_state["html_content"] = ft.export_to_html(title=file_stem) + # 设置响应数据 response_data["message"] = "翻译成功!下载链接已生成。" response_data["download_ready"] = True - response_data["markdown_url"] = f"/download/markdown/{response_data['original_filename_stem']}_translated.md" - response_data["html_url"] = f"/download/html/{response_data['original_filename_stem']}_translated.html" + response_data["markdown_url"] = f"/download/markdown/{file_stem}_translated.md" + response_data["html_url"] = f"/download/html/{file_stem}_translated.html" + translater_logger.info("翻译流程处理完毕。") except Exception as e: - translater_logger.error(f"翻译失败: {e}", exc_info=True) # exc_info=True for traceback + translater_logger.error(f"翻译失败: {e}", exc_info=True) response_data["error"] = True response_data["message"] = f"翻译过程中发生错误: {str(e)}" finally: - current_translation_state["is_processing"] = False - if file: # Ensure file object exists - await file.close() # Close the UploadFile object - # Do not clear file_contents here as it's used by translate_bytes - # The content is in memory; if it were a temp file, you'd delete it here. + current_state["is_processing"] = False + await file.close() return JSONResponse(content=response_data) # --- 下载接口 --- @app.get("/download/markdown/{filename_with_ext}") -async def download_markdown_endpoint(filename_with_ext: str): # filename_with_ext from URL - # Use original_filename_stem from state to construct the expected filename for security/consistency - if current_translation_state["markdown_content"] and current_translation_state["original_filename_stem"]: - # Compare requested filename stem with stored stem if necessary, or just use stored stem - actual_filename = f"{current_translation_state['original_filename_stem']}_translated.md" - # if Path(filename_with_ext).stem != Path(actual_filename).stem: - # raise HTTPException(status_code=404, detail="文件名不匹配或内容不可用。") +async def download_markdown(filename_with_ext: str): + if not current_state["markdown_content"] or not current_state["original_filename_stem"]: + raise HTTPException(status_code=404, detail="无 Markdown 翻译内容可用。") - return StreamingResponse(io.StringIO(current_translation_state["markdown_content"]), media_type="text/markdown", - headers={"Content-Disposition": f"attachment; filename=\"{actual_filename}\""}) - raise HTTPException(status_code=404, detail="无 Markdown 翻译内容可用。") + actual_filename = f"{current_state['original_filename_stem']}_translated.md" + return StreamingResponse( + io.StringIO(current_state["markdown_content"]), + media_type="text/markdown", + headers={"Content-Disposition": f"attachment; filename=\"{actual_filename}\""} + ) @app.get("/download/html/{filename_with_ext}") -async def download_html_endpoint(filename_with_ext: str): - if current_translation_state["html_content"] and current_translation_state["original_filename_stem"]: - actual_filename = f"{current_translation_state['original_filename_stem']}_translated.html" - # if Path(filename_with_ext).stem != Path(actual_filename).stem: - # raise HTTPException(status_code=404, detail="文件名不匹配或内容不可用。") +async def download_html(filename_with_ext: str): + if not current_state["html_content"] or not current_state["original_filename_stem"]: + raise HTTPException(status_code=404, detail="无 HTML 翻译内容可用。") - return HTMLResponse(content=current_translation_state["html_content"], media_type="text/html", - headers={"Content-Disposition": f"attachment; filename=\"{actual_filename}\""}) - raise HTTPException(status_code=404, detail="无 HTML 翻译内容可用。") + actual_filename = f"{current_state['original_filename_stem']}_translated.html" + return HTMLResponse( + content=current_state["html_content"], + media_type="text/html", + headers={"Content-Disposition": f"attachment; filename=\"{actual_filename}\""} + ) -# --- Uvicorn 启动 --- +# --- 启动服务 --- if __name__ == "__main__": - print("正在启动 FastAPI 文档翻译服务 (使用 asyncio.Queue 和 SSE)...") # Updated message + print("正在启动 FastAPI 文档翻译服务...") print("请访问 http://127.0.0.1:8010") - # Consider adding reload_dirs if you have other modules like docutranslate in development - uvicorn.run(app, host="127.0.0.1", port=8010) # Removed reload=True for this specific test - # Add it back if you are actively developing \ No newline at end of file + uvicorn.run(app, host="127.0.0.1", port=8010) \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 742ec7d..1ba17ab 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "docutranslate" -version = "0.1.8" +version = "0.1.9" description = "文件翻译工具" readme = "README.md" requires-python = ">=3.10"