From 85573561e44ead160f5238f51989644bfe200c22 Mon Sep 17 00:00:00 2001 From: xunbu Date: Tue, 20 May 2025 18:16:58 +0800 Subject: [PATCH] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E5=AF=B9mineru=E7=9A=84?= =?UTF-8?q?=E6=94=AF=E6=8C=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .idea/workspace.xml | 253 +++++++++++-------- README.md | 43 +++- docutranslate/app.py | 93 ++++--- docutranslate/converter/__init__.py | 3 + docutranslate/converter/converter.py | 25 ++ docutranslate/converter/converter_docling.py | 80 ++++++ docutranslate/converter/converter_mineru.py | 220 ++++++++++++++++ docutranslate/static/index.html | 181 ++++++++----- docutranslate/translater.py | 177 +++++++++---- pyproject.toml | 2 +- 10 files changed, 814 insertions(+), 263 deletions(-) create mode 100644 docutranslate/converter/__init__.py create mode 100644 docutranslate/converter/converter.py create mode 100644 docutranslate/converter/converter_docling.py create mode 100644 docutranslate/converter/converter_mineru.py diff --git a/.idea/workspace.xml b/.idea/workspace.xml index 5aac348..ac454cc 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -5,7 +5,16 @@ - + + + + + + + + + + - { - "keyToString": { - "DefaultHtmlFileTemplate": "HTML File", - "JavaScript 调试.output.html (1).executor": "Run", - "JavaScript 调试.output.html.executor": "Run", - "JavaScript 调试.regex.md_中文.html.executor": "Run", - "JavaScript 调试.regex_中文.html.executor": "Run", - "JavaScript 调试.test.html.executor": "Run", - "JavaScript 调试.test2.html.executor": "Run", - "JavaScript 调试.test2_英文.html.executor": "Run", - "JavaScript 调试.test4-1_中文.html.executor": "Run", - "JavaScript 调试.互联网认证授权机制.html.executor": "Run", - "JavaScript 调试.互联网认证授权机制_英文.html.executor": "Run", - "JavaScript 调试.毕业论文_英文.html.executor": "Run", - "ModuleVcsDetector.initialDetectionPerformed": "true", - "Python 测试.Python 测试 (markdown_mask.py 内).executor": "Run", - "Python 测试.markdown_mask.Test.test_basic_link_masking 的 Python 测试.executor": "Run", - "Python 测试.pytest (test_html.py 内).executor": "Run", - "Python.2test2 (1).executor": "Run", - "Python.PDFtranslater (1).executor": "Run", - "Python.PDFtranslater (2).executor": "Run", - "Python.agent.executor": "Debug", - "Python.agent_utils.executor": "Run", - "Python.app (1).executor": "Run", - "Python.app.executor": "Run", - "Python.app2.executor": "Run", - "Python.app_test (1).executor": "Run", - "Python.convert.executor": "Run", - "Python.markdown_splitter.executor": "Debug", - "Python.markdown_utils.executor": "Run", - "Python.test.executor": "Run", - "Python.test1.executor": "Run", - "Python.test2.executor": "Run", - "Python.test3.executor": "Run", - "Python.test4.executor": "Run", - "Python.testhtml.executor": "Run", - "Python.translater.executor": "Run", - "Python.切分测试.executor": "Run", - "RunOnceActivity.ShowReadmeOnStart": "true", - "RunOnceActivity.TerminalTabsStorage.copyFrom.TerminalArrangementManager": "true", - "RunOnceActivity.git.unshallow": "true", - "git-widget-placeholder": "main", - "last_opened_file_path": "C:/Users/jxgm/Desktop/FileTranslate/dist/DocuTranslate", - "list.type.of.created.stylesheet": "CSS", - "node.js.detected.package.eslint": "true", - "node.js.detected.package.tslint": "true", - "node.js.selected.package.eslint": "(autodetect)", - "node.js.selected.package.tslint": "(autodetect)", - "nodejs_package_manager_path": "npm", - "settings.editor.selected.configurable": "preferences.pluginManager", - "vue.rearranger.settings.migration": "true" + +}]]> @@ -267,7 +279,27 @@ @@ -568,14 +605,26 @@ + + + + - + @@ -583,14 +632,17 @@ + + + - + @@ -601,6 +653,7 @@ + \ No newline at end of file diff --git a/README.md b/README.md index fe0d246..9fc08fe 100644 --- a/README.md +++ b/README.md @@ -4,12 +4,15 @@ [![image](https://img.shields.io/badge/github-DocuTranslate-blue)](https://github.com/xunbu/docutranslate) -文件翻译工具,借助[docling](https://github.com/docling-project/docling)与大语言模型实现多种格式文件的翻译 +文件翻译工具,借助[docling](https://github.com/docling-project/docling)、[minerU](https://mineru.net/)与大语言模型实现多种格式文件的翻译 + +> QQ交流群:1047781902 # 整合包 对于只使用基本翻译功能的用户,可以在[github releases](https://github.com/xunbu/docutranslate/releases) 上下载最新的整合包,该整合包点击即用,您所需的只是获取某个ai平台的api-key。 +以及可以在mineru申请token进行pdf识别【可选】 # 安装 @@ -34,7 +37,16 @@ # 前置条件 -## huggingface换源 +本翻译工具的翻译流程总体如下: + +1. 使用文本转换引擎将文档转换成markdown(有docling(本地)、minerU(联网)两种引擎) +2. 使用大语言模型翻译markdown文本(需要申请api-key或本地部署) + +## 使用docling引擎注意事项 + +使用docling将文档转换为markdown时,需要下载模型到本地(也可以提前下载,见FAQ),因此可能会遇到一些网络问题 + +### huggingface换源 > 不能科学上网的友友注意了 @@ -43,12 +55,12 @@ - 第一次读取非markdown文本 - 第一次使用公式识别或代码识别功能 -### 方法1 +#### 方法1 设置电脑的环境变量(记得设置后重启IDE) `HF_ENDPOINT=https://hf-mirror.com` -### 方法2 +#### 方法2 在代码开头设置环境变量 @@ -60,6 +72,13 @@ os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com' ###其余代码写在下方 ``` +## 使用minerU引擎注意事项 + +使用minerU将文档转换为markdown时,需要在minerU平台申请token + +1. 打开[minerU官网](https://mineru.net/apiManage/docs)申请token +2. 申请成功后,在[API Token管理界面](https://mineru.net/apiManage/token)创建API Token + ## 获取大模型平台的baseurl、key、model-id 由于需要使用大语言模型进行markdown调整与翻译,所以需要预先获取模型的baseurl、key、model-id @@ -90,7 +109,12 @@ from docutranslate.translater import FileTranslater translater = FileTranslater(base_url="", key="", - model_id="") + model_id="", + convert_engin="docling" # 默认使用docling + # convert_engin="mineru",# 使用mineru + # mineru_token="<申请的mineru_token>"#使用mineru时必填 + ) + # 不开启公式、代码识别(默认输出为markdown文件) translater.translate_file("<文件路径>", to_lang="中文") @@ -141,12 +165,14 @@ translater.read_file("<文件路径>").save_as_markdown() from docutranslate import FileTranslater translater = FileTranslater(base_url="", # 默认的模型baseurl - key="", # 默认的模型api-key + key="", # 默认的大语言模型平台api-key model_id="", # 默认的模型id chunksize=2000, # markdown分块长度(单位byte),分块越大效果越好(也越慢),不建议超过8000 max_concurrent=20, # 并发数,受到ai平台并发量限制,如果文章很长建议适当加大到20以上 - docling_artifact=None, # 使用提前下载好的docling模型 timeout=2000, # 调用api的超时时间 + docling_artifact=None, # 使用提前下载好的docling模型 + convert_engin="mineru", # 可选docling或minerU + mineru_token="", # minerU的token,使用minerU时必填 tips=True # 开场提示 ) @@ -206,7 +232,8 @@ from docutranslate.utils.docling_utils import get_docling_artifacts print(get_docling_artifacts()) # 会显示模型下载文件夹,通常在`C:\Users\\.cache\docling\models` ``` -> 创建FileTranslater时携带模型文件夹即可 +> 将模型文件夹命名为docling_artifact放置在项目下 +> 或创建FileTranslater时docling_artifact参数设置为文件夹位置 ```python from docutranslate import FileTranslater diff --git a/docutranslate/app.py b/docutranslate/app.py index 3495a9a..9bc7c6e 100644 --- a/docutranslate/app.py +++ b/docutranslate/app.py @@ -11,7 +11,7 @@ from fastapi import FastAPI, File, Form, UploadFile, Request, HTTPException from fastapi.responses import HTMLResponse, JSONResponse, StreamingResponse,FileResponse from fastapi.templating import Jinja2Templates from fastapi.staticfiles import StaticFiles -from docutranslate import FileTranslater +from docutranslate import FileTranslater # Assuming FileTranslater is in docutranslate module from docutranslate.logger import translater_logger from docutranslate.utils.resource_utils import resource_path @@ -19,11 +19,10 @@ app = FastAPI() STATIC_DIR=resource_path("static") -# print(f"__file__:{Path(__file__).resolve()}") app.mount("/static",StaticFiles(directory=STATIC_DIR), name="static") # --- 全局配置 --- -log_queue: Optional[asyncio.Queue] = None # Will be initialized in startup_event +log_queue: Optional[asyncio.Queue] = None current_state: Dict[str, Any] = { "is_processing": False, "status_message": "空闲", @@ -36,9 +35,9 @@ current_state: Dict[str, Any] = { "task_end_time": 0, "current_task_ref": None, } -templates = Jinja2Templates(directory=".") -MAX_LOG_HISTORY = 200 # Max items for the persistent log_history list -log_history: List[str] = [] # Keeps a longer history, not directly for "unread" +templates = Jinja2Templates(directory=".") # Not strictly used if index.html is served as FileResponse +MAX_LOG_HISTORY = 200 +log_history: List[str] = [] # --- 日志处理器 --- @@ -51,27 +50,22 @@ class QueueAndHistoryHandler(logging.Handler): def emit(self, record: logging.LogRecord): log_entry = self.format(record) - - # Add to the persistent history (capped) + print(log_entry) # Keep console log for server visibility self.history_list.append(log_entry) if len(self.history_list) > self.max_history: del self.history_list[:len(self.history_list) - self.max_history] - # Add to the "unread" queue for frontend consumption - try: - # Ensure self.queue is not None (it's initialized at startup) - if self.queue is not None: + if self.queue is not None: + try: main_loop = getattr(app.state, "main_event_loop", None) if main_loop and main_loop.is_running(): main_loop.call_soon_threadsafe(self.queue.put_nowait, log_entry) else: - self.queue.put_nowait(log_entry) # Fallback - else: - print(f"CRITICAL: Log queue not initialized. Log: {log_entry}") - except asyncio.QueueFull: - print(f"Log queue is full. Log dropped: {log_entry}") # Or handle differently - except Exception as e: - print(f"Error putting log to queue: {e}. Log: {log_entry}") + self.queue.put_nowait(log_entry) + except asyncio.QueueFull: + print(f"Log queue is full. Log dropped: {log_entry}") + except Exception as e: + print(f"Error putting log to queue: {e}. Log: {log_entry}") # --- 应用生命周期事件 --- @@ -79,7 +73,7 @@ class QueueAndHistoryHandler(logging.Handler): async def startup_event(): global log_queue app.state.main_event_loop = asyncio.get_running_loop() - log_queue = asyncio.Queue() # Initialize the global log_queue + log_queue = asyncio.Queue() for handler in translater_logger.handlers[:]: translater_logger.removeHandler(handler) @@ -93,7 +87,7 @@ async def startup_event(): translater_logger.setLevel(logging.INFO) log_history.clear() - while not log_queue.empty(): # Clear queue just in case + while not log_queue.empty(): try: log_queue.get_nowait() except asyncio.QueueEmpty: @@ -112,6 +106,7 @@ async def _perform_translation(params: Dict[str, Any], file_contents: bytes, ori try: translater_logger.info(f"使用 Base URL: {params['base_url']}, Model: {params['model_id']}") translater_logger.info(f"文件大小: {len(file_contents)} 字节。目标语言: {params['to_lang']}") + translater_logger.info(f"使用转换引擎: {params['convert_engin']}") translater_logger.info( f"选项 - 公式: {params['formula_ocr']}, 代码: {params['code_ocr']}, 修正: {params['refine_markdown']}") @@ -119,7 +114,9 @@ async def _perform_translation(params: Dict[str, Any], file_contents: bytes, ori base_url=params['base_url'], key=params['apikey'], model_id=params['model_id'], - tips=False + convert_engin=params['convert_engin'], + mineru_token=params['mineru_token'], + tips=False # Assuming tips are not needed for server-side processing ) await ft.translate_bytes_async( name=original_filename, @@ -152,7 +149,7 @@ async def _perform_translation(params: Dict[str, Any], file_contents: bytes, ori translater_logger.info(f"翻译任务 '{original_filename}' 已被取消 (用时 {duration:.2f} 秒).") current_state.update({ "status_message": f"翻译任务已取消(若有转换任务仍会后台进行) (用时 {duration:.2f} 秒).", - "error_flag": False, # Cancellation is not an error in this context + "error_flag": False, "download_ready": False, "markdown_content": None, "html_content": None, @@ -180,11 +177,25 @@ async def _perform_translation(params: Dict[str, Any], file_contents: bytes, ori # --- API Endpoints --- @app.get("/", response_class=HTMLResponse) async def main_page(request: Request): - return FileResponse(STATIC_DIR/"index.html") + # Serve index.html from the static directory or root project directory + # Assuming index.html is at the same level as app.py or in STATIC_DIR + # For simplicity, if index.html is at root: + # return FileResponse(Path(__file__).parent / "index.html") + # If using Jinja2Templates and index.html is in "templates" folder: + # return templates.TemplateResponse("index.html", {"request": request}) + # Using FileResponse for index.html directly: + index_path = Path("index.html") # Adjust if index.html is elsewhere + if not index_path.exists(): + # Fallback to static dir if not in root + index_path = STATIC_DIR / "index.html" + if not index_path.exists(): + raise HTTPException(status_code=404, detail="index.html not found") + return FileResponse(index_path) @app.post("/translate") async def handle_translate( + request: Request, # Added request for potential future use, not strictly needed now base_url: str = Form(...), apikey: str = Form(...), model_id: str = Form(...), @@ -192,6 +203,8 @@ async def handle_translate( formula_ocr: bool = Form(False), code_ocr: bool = Form(False), refine_markdown: bool = Form(False), + convert_engin: str = Form(...), # New parameter + mineru_token: Optional[str] = Form(None), # New parameter file: UploadFile = File(...) ): global current_state, log_queue, log_history @@ -209,6 +222,12 @@ async def handle_translate( content={"task_started": False, "message": "没有选择文件或文件无效。"} ) + if convert_engin == "mineru" and (not mineru_token or not mineru_token.strip()): + return JSONResponse( + status_code=400, + content={"task_started": False, "message": "使用 Mineru 引擎时必须提供有效的 Mineru Token。"} + ) + current_state["is_processing"] = True original_filename_for_init = file.filename or "uploaded_file" @@ -224,26 +243,22 @@ async def handle_translate( "current_task_ref": None, }) - # Clear logs for the new task log_history.clear() - if log_queue: # Ensure log_queue is initialized + if log_queue: while not log_queue.empty(): try: log_queue.get_nowait() except asyncio.QueueEmpty: break - # Add initial log entry for the new task - # We create a LogRecord manually to ensure it goes through the formatter and handler initial_log_msg = f"收到新的翻译请求: {original_filename_for_init}" if translater_logger.handlers and isinstance(translater_logger.handlers[0], QueueAndHistoryHandler): - # Use the existing handler to format and queue/store the log record = logging.LogRecord( name=translater_logger.name, level=logging.INFO, pathname="", lineno=0, msg=initial_log_msg, args=(), exc_info=None, func="" ) - translater_logger.handlers[0].emit(record) # This will add to both queue and history - else: # Fallback if handler setup is unusual + translater_logger.handlers[0].emit(record) + else: translater_logger.info(initial_log_msg) try: @@ -255,6 +270,8 @@ async def handle_translate( "base_url": base_url, "apikey": apikey, "model_id": model_id, "to_lang": to_lang, "formula_ocr": formula_ocr, "code_ocr": code_ocr, "refine_markdown": refine_markdown, + "convert_engin": convert_engin, # Pass to task + "mineru_token": mineru_token, # Pass to task } loop = asyncio.get_running_loop() @@ -332,18 +349,17 @@ async def get_status(): @app.get("/get-logs") -async def get_logs_from_queue(): # Renamed for clarity, though path is the same +async def get_logs_from_queue(): global log_queue new_logs = [] - if log_queue: # Ensure log_queue is initialized + if log_queue: while not log_queue.empty(): try: - log_entry = log_queue.get_nowait() # Consume from queue + log_entry = log_queue.get_nowait() new_logs.append(log_entry) - log_queue.task_done() # Important for queue management if using join() elsewhere + log_queue.task_done() except asyncio.QueueEmpty: break - # No total_count, as the frontend just appends what it receives return JSONResponse(content={"logs": new_logs}) @@ -384,10 +400,11 @@ async def download_html(filename_with_ext: str): def run_app(): - print("正在启动 DocuTranslate") + print("正在启动 DocuTranslate WebUI") print("请访问 http://127.0.0.1:8010") uvicorn.run(app, host="127.0.0.1", port=8010, workers=1) if __name__ == "__main__": - run_app() + + run_app() \ No newline at end of file diff --git a/docutranslate/converter/__init__.py b/docutranslate/converter/__init__.py new file mode 100644 index 0000000..058266d --- /dev/null +++ b/docutranslate/converter/__init__.py @@ -0,0 +1,3 @@ +from .converter import Document,Converter +from .converter_mineru import ConverterMineru +from .converter_docling import ConverterDocling \ No newline at end of file diff --git a/docutranslate/converter/converter.py b/docutranslate/converter/converter.py new file mode 100644 index 0000000..c2e008b --- /dev/null +++ b/docutranslate/converter/converter.py @@ -0,0 +1,25 @@ +from typing import Protocol +from pathlib import Path + + +class Document: + def __init__(self,path:Path|str=None,filename:str=None,filebytes:bytes=None): + if path is None and (filename is None or filebytes is None): + raise Exception("Document的路径或filename、filebytes不能同时为空") + self.filebytes = filebytes + self.filename = filename + self.path = path + if path: + if isinstance(path,str): + path=Path(path) + self.path=path + self.filename=path.name + self.filebytes=path.read_bytes() + +class Converter(Protocol): + #转换为markdown + def convert(self,document:Document)->str: + ... + + async def convert_async(self,document:Document)->str: + ... \ No newline at end of file diff --git a/docutranslate/converter/converter_docling.py b/docutranslate/converter/converter_docling.py new file mode 100644 index 0000000..c2b35ad --- /dev/null +++ b/docutranslate/converter/converter_docling.py @@ -0,0 +1,80 @@ +import os +import time +from io import BytesIO +from pathlib import Path + +from docling.datamodel.base_models import InputFormat +from docling.datamodel.document import DocumentStream +from docling.datamodel.pipeline_options import PdfPipelineOptions +from docling.datamodel.settings import settings +from docling.document_converter import DocumentConverter, PdfFormatOption +from docling_core.types.doc import ImageRefMode +from huggingface_hub.errors import LocalEntryNotFoundError + +from docutranslate.logger import translater_logger + +from docutranslate.converter import Converter, Document + +import asyncio + +IMAGE_RESOLUTION_SCALE = 4 + + +def file2markdown_embed_images(file_path: Path | str | DocumentStream, formula=False, code=False, + artifacts_path: Path | str | None = None) -> str: + pipeline_options = PdfPipelineOptions(artifacts_path=artifacts_path) + pipeline_options.do_ocr = False + pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE + pipeline_options.generate_picture_images = True + # pipeline_options.table_structure_options.mode = TableFormerMode.FAST + pipeline_options.table_structure_options.do_cell_matching = False + if formula: + pipeline_options.do_formula_enrichment = True + if code: + pipeline_options.do_code_enrichment = True + # pipeline_options.accelerator_options= AcceleratorOptions( + # num_threads=4, device=AcceleratorDevice.AUTO + # ) + # 打印时间 + settings.debug.profile_pipeline_timings = True + converter = DocumentConverter(format_options={ + InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options) + + }) + try: + conversion_result = converter.convert(file_path) + result = conversion_result.document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED) + except LocalEntryNotFoundError: + translater_logger.info(f"无法连接huggingface,正在尝试换源") + os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com' + conversion_result = converter.convert(file_path) + result = conversion_result.document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED) + # translater_logger.info(f"docling转换耗时: {conversion_result.timings["pipeline_total"].times}") + return result + + +class ConverterDocling(Converter): + def __init__(self, code=True, formula=True, artifact=None): + self.code = code + self.formula = formula + self.artifact = artifact + + def convert(self, document): + assert isinstance(document.filename, str) + translater_logger.info(f"正在将文档转换为markdown") + time1 = time.time() + document_stream = DocumentStream(name=document.filename, stream=BytesIO(document.filebytes)) + result = file2markdown_embed_images(document_stream, formula=self.formula, code=self.code, + artifacts_path=self.artifact) + translater_logger.info(f"已转换为markdown,耗时{time.time() - time1}秒") + return result + + async def convert_async(self, document: Document) -> str: + return await asyncio.to_thread( + self.convert, + document + ) + + +if __name__ == '__main__': + pass diff --git a/docutranslate/converter/converter_mineru.py b/docutranslate/converter/converter_mineru.py new file mode 100644 index 0000000..593dfc8 --- /dev/null +++ b/docutranslate/converter/converter_mineru.py @@ -0,0 +1,220 @@ +import asyncio +import base64 +import io +import mimetypes +import os +import re +import time +import zipfile +import httpx +from docutranslate.converter import Converter, Document +from docutranslate.logger import translater_logger + +URL = 'https://mineru.net/api/v4/file-urls/batch' + +client=httpx.Client(trust_env=False) + +#TODO: 提供更详细的logger +class ConverterMineru(Converter): + def __init__(self, token: str, formula=True): + self.mineru_token = token.strip() + self.client_async = httpx.AsyncClient() + self.formula = formula + + def _get_header(self): + return { + 'Content-Type': 'application/json', + "Authorization": f"Bearer {self.mineru_token}" + } + + def _get_upload_data(self, document: Document): + return { + "enable_formula": self.formula, + "language": "auto", + "enable_table": True, + "files": [ + {"name": f"{document.filename}", "is_ocr": True} + ] + } + + def upload(self, document: Document): + # 获取上传链接 + response = client.post(URL, headers=self._get_header(), json=self._get_upload_data(document)) + response.raise_for_status() + result = response.json() + # print('response success. result:{}'.format(result)) + if result["code"] == 0: + batch_id = result["data"]["batch_id"] + urls = result["data"]["file_urls"] + # print('batch_id:{},urls:{}'.format(batch_id, urls)) + # 获取 + res_upload = client.put(urls[0], content=document.filebytes) + res_upload.raise_for_status() + # print(f"{urls[0]} upload success") + return batch_id + else: + raise Exception('apply upload url failed,reason:{}'.format(result.msg)) + + def get_file_url(self, batch_id: str) -> str: + while True: + url = f'https://mineru.net/api/v4/extract-results/batch/{batch_id}' + header = self._get_header() + res = client.get(url, headers=header) + res.raise_for_status() + fileinfo = res.json()["data"]["extract_result"][0] + if fileinfo["state"] == "done": + fileurl = fileinfo["full_zip_url"] + return fileurl + else: + time.sleep(3) + + def convert(self, document: Document) -> str: + translater_logger.info(f"正在将文档转换为markdown") + time1=time.time() + batch_id = self.upload(document) + file_url = self.get_file_url(batch_id) + result=get_md_from_zip_url_with_inline_images(zip_url=file_url) + translater_logger.info(f"已转换为markdown,耗时{time.time()-time1}秒") + return result + + # TODO: 实现细粒度更高的协程 + async def convert_async(self, document: Document) -> str: + # 待优化 + return await asyncio.to_thread( + self.convert, + document + ) + + +def get_md_from_zip_url_with_inline_images( + zip_url: str, + filename_in_zip: str = "full.md", + encoding: str = "utf-8" +) -> str | None: + """ + 从给定的ZIP文件URL中下载并提取指定文件的内容, + 并将Markdown文件中的相对路径图片转换为内联Base64图片。 + + Args: + zip_url (str): ZIP文件的下载链接。 + filename_in_zip (str): ZIP压缩包内目标Markdown文件的名称(包括路径)。 + 默认为 "full.md"。 + encoding (str): 目标文件的预期编码。默认为 "utf-8"。 + + Returns: + str | None: 如果成功,返回处理后的Markdown文本内容;否则返回 None。 + """ + try: + print(f"正在从 {zip_url} 下载ZIP文件 (使用 httpx.get)...") + response = client.get(zip_url, timeout=60.0) # 增加超时 + response.raise_for_status() + print("ZIP文件下载完成。") + + zip_file_bytes = io.BytesIO(response.content) + + print(f"正在尝试打开内存中的ZIP存档...") + with zipfile.ZipFile(zip_file_bytes, 'r') as archive: + print(f"ZIP存档已打开。正在查找文件 '{filename_in_zip}'...") + + if filename_in_zip not in archive.namelist(): + print(f"错误: 文件 '{filename_in_zip}' 在ZIP压缩包中未找到。") + print(f"压缩包中的可用文件列表: {archive.namelist()}") + return None + + md_content_bytes = archive.read(filename_in_zip) + print(f"文件 '{filename_in_zip}' 已找到并读取。") + md_content_text = md_content_bytes.decode(encoding) + print(f"文件内容已使用 '{encoding}' 编码成功解码。") + + # --- 新增:处理图片 --- + print("开始处理Markdown中的图片...") + # 获取Markdown文件在ZIP包内的基本目录,用于解析相对图片路径 + # 例如,如果 filename_in_zip 是 "docs/guide/full.md", base_md_path_in_zip 是 "docs/guide" + # 如果 filename_in_zip 是 "full.md", base_md_path_in_zip 是 "" + base_md_path_in_zip = os.path.dirname(filename_in_zip) + + def replace_image_with_base64(match): + alt_text = match.group(1) + original_image_path = match.group(2) + + # 检查是否是外部链接或已经是data URI + if original_image_path.startswith(('http://', 'https://', 'data:')): + print(f" 跳过外部或已内联图片: {original_image_path}") + return match.group(0) # 返回原始匹配 + + # 构建图片在ZIP文件中的绝对路径 + # os.path.join 会正确处理 base_md_path_in_zip 为空字符串的情况 + image_path_in_zip = os.path.join(base_md_path_in_zip, original_image_path) + # zipfile 使用正斜杠,并且路径是相对于zip根目录的,os.path.normpath确保路径格式正确 + image_path_in_zip = os.path.normpath(image_path_in_zip).replace(os.sep, '/') + + # 确保路径不是以 './' 开头,如果filename_in_zip在根目录且图片路径也是相对的 + if image_path_in_zip.startswith('./'): + image_path_in_zip = image_path_in_zip[2:] + + # print(f" 尝试内联图片: '{original_image_path}' (解析为ZIP内路径: '{image_path_in_zip}')") + + try: + image_bytes = archive.read(image_path_in_zip) + + # 猜测MIME类型 + mime_type, _ = mimetypes.guess_type(image_path_in_zip) + if not mime_type: + # 备用:根据扩展名手动判断一些常见类型 + ext = os.path.splitext(image_path_in_zip)[1].lower() + if ext == '.png': + mime_type = 'image/png' + elif ext in ['.jpg', '.jpeg']: + mime_type = 'image/jpeg' + elif ext == '.gif': + mime_type = 'image/gif' + elif ext == '.svg': + mime_type = 'image/svg+xml' + elif ext == '.webp': + mime_type = 'image/webp' + else: + print(f" 警告: 无法确定图片 '{image_path_in_zip}' 的MIME类型。跳过内联。") + return match.group(0) # 返回原始匹配 + + base64_encoded_data = base64.b64encode(image_bytes).decode('utf-8') + new_image_tag = f"![{alt_text}](data:{mime_type};base64,{base64_encoded_data})" + # print(f" 成功内联图片: {original_image_path} -> data:{mime_type[:20]}...") + return new_image_tag + except KeyError: + print(f" 警告: 图片 '{image_path_in_zip}' 在ZIP压缩包中未找到。原始链接将被保留。") + return match.group(0) # 图片不在zip中,返回原始匹配 + except Exception as e_img: + print(f" 错误: 处理图片 '{image_path_in_zip}' 时发生错误: {e_img}。原始链接将被保留。") + return match.group(0) + + # 正则表达式查找Markdown图片: ![alt text](path/to/image.ext) + # 修改了正则表达式,使其不贪婪地匹配alt文本和路径 + image_regex = r"!\[(.*?)\]\((.*?)\)" + modified_md_content = re.sub(image_regex, replace_image_with_base64, md_content_text) + + print("图片处理完成。") + return modified_md_content + + except httpx.HTTPStatusError as e: + print(f"HTTP 错误 (httpx): {e.response.status_code} - {e.request.url}") + print(f"响应内容: {e.response.text[:200]}...") + return None + except httpx.RequestError as e: + print(f"下载ZIP文件时发生错误 (httpx): {e}") + return None + except zipfile.BadZipFile: + print("错误: 下载的文件不是一个有效的ZIP压缩文件或已损坏。") + return None + except UnicodeDecodeError: + print(f"错误: 无法使用 '{encoding}' 编码解码文件 '{filename_in_zip}' 的内容。") + print("请尝试其他编码,如 'gbk', 'latin1' 等,或确认文件本身的编码。") + return None + except Exception as e: + print(f"发生未知错误: {e}") + import traceback + traceback.print_exc() # 打印完整的堆栈跟踪,便于调试 + return None + + +if __name__ == '__main__': + pass diff --git a/docutranslate/static/index.html b/docutranslate/static/index.html index 104af4b..cf52ca3 100644 --- a/docutranslate/static/index.html +++ b/docutranslate/static/index.html @@ -28,11 +28,11 @@ } .error-message { - color: #d32f2f; + color: #d32f2f; /* Pico invalid color */ } .success-message { - color: #2e7d32; + color: #2e7d32; /* Pico valid color */ } .form-group { @@ -65,9 +65,15 @@ .checkbox-group { display: flex; flex-wrap: wrap; + gap: 1rem; /* Added gap for better spacing */ margin-bottom: 1rem; } + .checkbox-group label { /* Ensure checkboxes are aligned */ + margin-right: 10px; + } + + #resultArea { margin-top: 1.5rem; padding-top: 1rem; @@ -116,7 +122,6 @@ display: none; } - /* Styles for drag and drop area */ #fileDropArea { border: 2px dashed #ccc; padding: 20px; @@ -126,16 +131,16 @@ } #fileDropArea.drag-over { - border-color: #1095c1; /* Pico primary color (定量替换 var(--pico-primary-focus)) */ - background-color: #e7f5fa; /* Pico primary background (定量替换 var(--pico-primary-background)) */ + border-color: #1095c1; + background-color: #e7f5fa; } #fileDropArea.file-selected { - border-color: #2e7d32; /* Pico success color (定量替换 var(--pico-form-element-valid-border-color, #2e7d32)) */ - background-color: #e8f5e9; /* Light green (定量替换 var(--pico-form-element-valid-background-color, #e8f5e9)) */ + border-color: #2e7d32; + background-color: #e8f5e9; } - #fileDropArea p { /* General style for

inside drop area */ + #fileDropArea p { margin: 0.5rem 0; color: #555; } @@ -149,19 +154,18 @@ #fileNameDisplay.has-file { font-style: normal; font-weight: bold; - color: #1a531d; /* Darker green or success color (定量替换 var(--pico-form-element-valid-border-color, #1a531d)) */ + color: #1a531d; } - #fileDropArea.input-error { - border-color: #d32f2f !important; /* (定量替换 var(--pico-form-element-invalid-border-color, #d32f2f)) */ + #fileDropArea.input-error, input.input-error, select.input-error { /* Extended to input/select */ + border-color: #d32f2f !important; } #fileNameDisplay.input-error-text { - color: #d32f2f !important; /* (定量替换 var(--pico-form-element-invalid-border-color, #d32f2f)) */ + color: #d32f2f !important; font-weight: bold; } - @media (max-width: 768px) { .form-grid { grid-template-columns: 1fr; @@ -176,7 +180,6 @@

-
@@ -206,17 +209,34 @@
- +
- - - + + +
- API 配置 + 文档转换引擎配置 +
+ + +
+ +
+ + +
+ 翻译API配置
@@ -225,8 +245,7 @@ - + @@ -235,14 +254,12 @@
- +
@@ -267,7 +284,7 @@