From 35ea4eff9b10d100df3dfb1493ae12472f372a4a Mon Sep 17 00:00:00 2001 From: xunbu Date: Tue, 16 Dec 2025 09:05:40 +0800 Subject: [PATCH] update --- docutranslate/__init__.py | 2 +- .../converter/x2md/converter_mineru.py | 205 ++++++++++++++---- pyproject.toml | 2 +- 更新日志.txt | 5 + 4 files changed, 166 insertions(+), 48 deletions(-) diff --git a/docutranslate/__init__.py b/docutranslate/__init__.py index d7690dc..0237d1b 100644 --- a/docutranslate/__init__.py +++ b/docutranslate/__init__.py @@ -1,3 +1,3 @@ # SPDX-FileCopyrightText: 2025 QinHan # SPDX-License-Identifier: MPL-2.0 -__version__="1.5.5" \ No newline at end of file +__version__="1.5.6" \ No newline at end of file diff --git a/docutranslate/converter/x2md/converter_mineru.py b/docutranslate/converter/x2md/converter_mineru.py index 78c2728..f9cb63f 100644 --- a/docutranslate/converter/x2md/converter_mineru.py +++ b/docutranslate/converter/x2md/converter_mineru.py @@ -4,11 +4,20 @@ import asyncio import time import zipfile +import io from dataclasses import dataclass -from typing import Hashable, Literal +from typing import Hashable, Literal, List, Tuple import httpx +# 尝试导入 pypdf,用于处理 PDF 拆分 +try: + from pypdf import PdfReader, PdfWriter + + HAS_PYPDF = True +except ImportError: + HAS_PYPDF = False + from docutranslate.converter.x2md.base import X2MarkdownConverter, X2MarkdownConverterConfig from docutranslate.ir.attachment_manager import AttachMent from docutranslate.ir.document import Document @@ -34,12 +43,6 @@ timeout = httpx.Timeout( write=200.0, # 写入超时 (发送数据的最长时间) pool=1.0 # 从连接池获取连接的超时时间 ) -# if USE_PROXY: -# client = httpx.Client(proxies=get_httpx_proxies(), timeout=timeout, verify=False) -# client_async = httpx.AsyncClient(proxies=get_httpx_proxies(), timeout=timeout, verify=False) -# else: -# client = httpx.Client(trust_env=False, timeout=timeout, proxy=None, verify=False) -# client_async = httpx.AsyncClient(trust_env=False, timeout=timeout, proxy=None, verify=False) limits = httpx.Limits(max_connections=500, max_keepalive_connections=20) client = httpx.Client(limits=limits, trust_env=False, timeout=timeout, proxy=None, verify=False) @@ -53,6 +56,7 @@ class ConverterMineru(X2MarkdownConverter): self.formula = config.formula_ocr self.model_version = config.model_version self.attachments: list[AttachMent] = [] + self.max_pages = 600 # Mineru 的限制 def _get_header(self): return { @@ -71,20 +75,55 @@ class ConverterMineru(X2MarkdownConverter): ] } + def _split_pdf(self, content: bytes) -> List[bytes]: + """ + 检查 PDF 页数,如果超过限制则进行拆分。 + 返回拆分后的 bytes 列表。如果不超限,返回包含原内容的单元素列表。 + """ + if not HAS_PYPDF: + self.logger.warning("未安装 pypdf,无法进行 PDF 页数检查和拆分。如果文件超过 600 页可能会失败。") + return [content] + + try: + reader = PdfReader(io.BytesIO(content)) + total_pages = len(reader.pages) + + if total_pages <= self.max_pages: + return [content] + + self.logger.info(f"PDF 页数 ({total_pages}) 超过限制 ({self.max_pages}),正在进行拆分...") + chunks = [] + + for i in range(0, total_pages, self.max_pages): + writer = PdfWriter() + end_page = min(i + self.max_pages, total_pages) + + for page_num in range(i, end_page): + writer.add_page(reader.pages[page_num]) + + with io.BytesIO() as output_stream: + writer.write(output_stream) + chunks.append(output_stream.getvalue()) + + self.logger.info(f"PDF 已拆分为 {len(chunks)} 个部分。") + return chunks + + except Exception as e: + self.logger.error(f"PDF 拆分失败: {e}") + # 如果拆分出错,尝试按原文件上传(兜底) + return [content] + def upload(self, document: Document): # 获取上传链接 response = client.post(URL, headers=self._get_header(), json=self._get_upload_data(document)) response.raise_for_status() result = response.json() - # print('response success. result:{}'.format(result)) if result["code"] == 0: batch_id = result["data"]["batch_id"] urls = result["data"]["file_urls"] - # print('batch_id:{},urls:{}'.format(batch_id, urls)) # 获取 res_upload = client.put(urls[0], content=document.content) res_upload.raise_for_status() - # print(f"{urls[0]} upload success") return batch_id else: raise Exception('apply upload url failed,reason:{}'.format(result)) @@ -94,15 +133,12 @@ class ConverterMineru(X2MarkdownConverter): response = await client_async.post(URL, headers=self._get_header(), json=self._get_upload_data(document)) response.raise_for_status() result = response.json() - # print('response success. result:{}'.format(result)) if result["code"] == 0: batch_id = result["data"]["batch_id"] urls = result["data"]["file_urls"] - # print('batch_id:{},urls:{}'.format(batch_id, urls)) # 获取 res_upload = await client_async.put(urls[0], content=document.content) res_upload.raise_for_status() - # print(f"{urls[0]} upload success") return batch_id else: raise Exception('apply upload url failed,reason:{}'.format(result)) @@ -117,6 +153,8 @@ class ConverterMineru(X2MarkdownConverter): if fileinfo["state"] == "done": file_url = fileinfo["full_zip_url"] return file_url + elif fileinfo["state"] == "failed": + raise Exception(f"Mineru 处理失败: {fileinfo.get('message', 'Unknown error')}") else: time.sleep(3) @@ -130,31 +168,121 @@ class ConverterMineru(X2MarkdownConverter): if fileinfo["state"] == "done": file_url = fileinfo["full_zip_url"] return file_url + elif fileinfo["state"] == "failed": + raise Exception(f"Mineru 处理失败: {fileinfo.get('message', 'Unknown error')}") else: await asyncio.sleep(3) + def _process_single_chunk(self, content: bytes, original_doc: Document, index: int = 0) -> Tuple[str, bytes]: + """ + 处理单个分片:构造Document -> 上传 -> 等待 -> 下载 -> 提取 Markdown + """ + # 根据 Document 类的定义,name 是属性,由 stem+suffix 组成 + # 所以我们需要构造正确的 stem 来改变文件名 + new_stem = original_doc.stem + if index > 0: + new_stem = f"{original_doc.stem}_part{index}" + + chunk_doc = Document.from_bytes(content=content, suffix=original_doc.suffix, stem=new_stem) + + batch_id = self.upload(chunk_doc) + file_url = self.get_file_url(batch_id) + md_content, mineru_parsed = get_md_from_zip_url_with_inline_images(zip_url=file_url) + return md_content, mineru_parsed + + async def _process_single_chunk_async(self, content: bytes, original_doc: Document, index: int = 0) -> Tuple[ + str, bytes]: + """ + 异步处理单个分片 + """ + new_stem = original_doc.stem + if index > 0: + new_stem = f"{original_doc.stem}_part{index}" + + chunk_doc = Document.from_bytes(content=content, suffix=original_doc.suffix, stem=new_stem) + + batch_id = await self.upload_async(chunk_doc) + file_url = await self.get_file_url_async(batch_id) + md_content, mineru_parsed = await get_md_from_zip_url_with_inline_images_async(zip_url=file_url) + return md_content, mineru_parsed + def convert(self, document: Document) -> MarkdownDocument: self.logger.info(f"正在将文档转换为markdown,model_version:{self.model_version}") time1 = time.time() - batch_id = self.upload(document) - file_url = self.get_file_url(batch_id) - content, mineru_parsed = get_md_from_zip_url_with_inline_images(zip_url=file_url) - if mineru_parsed: - self.attachments.append(AttachMent("mineru",Document.from_bytes(content=mineru_parsed, suffix=".zip", stem="mineru"))) + + # 1. 检查是否需要拆分 (仅针对 PDF) + chunks = [document.content] + is_split = False + if document.suffix.lower() == '.pdf': + chunks = self._split_pdf(document.content) + if len(chunks) > 1: + is_split = True + + combined_md = [] + + # 2. 依次处理每个分片 + for i, chunk_content in enumerate(chunks): + if is_split: + self.logger.info(f"正在处理分片 {i + 1}/{len(chunks)}...") + + md_content, mineru_parsed = self._process_single_chunk(chunk_content, document, i) + combined_md.append(md_content) + + # 保存对应的原始解析包 + suffix_name = "" if not is_split else f"_part{i + 1}" + if mineru_parsed: + self.attachments.append( + AttachMent(f"mineru{suffix_name}", + Document.from_bytes(content=mineru_parsed, suffix=".zip", stem=f"mineru{suffix_name}")) + ) + + # 3. 合并 Markdown + final_content = "\n\n".join(combined_md) + self.logger.info(f"已转换为markdown,耗时{time.time() - time1}秒") - md_document = MarkdownDocument.from_bytes(content=content.encode("utf-8"), suffix=".md", stem=document.stem) + md_document = MarkdownDocument.from_bytes(content=final_content.encode("utf-8"), suffix=".md", + stem=document.stem) return md_document async def convert_async(self, document: Document) -> MarkdownDocument: - self.logger.info(f"正在将文档转换为markdown,model_version:{self.model_version}") + self.logger.info(f"正在将文档转换为markdown (Async), model_version:{self.model_version}") time1 = time.time() - batch_id = await self.upload_async(document) - file_url = await self.get_file_url_async(batch_id) - content, mineru_parsed = await get_md_from_zip_url_with_inline_images_async(zip_url=file_url) - if mineru_parsed: - self.attachments.append(AttachMent("mineru",Document.from_bytes(content=mineru_parsed, suffix=".zip", stem="mineru"))) + + # 1. 检查是否需要拆分 + chunks = [document.content] + is_split = False + if document.suffix.lower() == '.pdf': + # 这里的拆分操作是 CPU 密集型,如果是超大 PDF,建议放到 thread pool 中运行 + # chunks = await asyncio.to_thread(self._split_pdf, document.content) + chunks = self._split_pdf(document.content) + if len(chunks) > 1: + is_split = True + + # 2. 并发处理所有分片 + tasks = [] + for i, chunk_content in enumerate(chunks): + tasks.append(self._process_single_chunk_async(chunk_content, document, i)) + + # 等待所有分片处理完成 + results = await asyncio.gather(*tasks) + + combined_md = [] + for i, (md_content, mineru_parsed) in enumerate(results): + combined_md.append(md_content) + + suffix_name = "" if not is_split else f"_part{i + 1}" + if mineru_parsed: + self.attachments.append( + AttachMent(f"mineru{suffix_name}", + Document.from_bytes(content=mineru_parsed, suffix=".zip", stem=f"mineru{suffix_name}")) + ) + + # 3. 合并 Markdown + final_content = "\n\n".join(combined_md) + self.logger.info(f"已转换为markdown,耗时{time.time() - time1}秒") - md_document = MarkdownDocument.from_bytes(content=content.encode("utf-8"), suffix=".md", stem=document.stem) + md_document = MarkdownDocument.from_bytes(content=final_content.encode("utf-8"), suffix=".md", + stem=document.stem) return md_document def support_format(self) -> list[str]: @@ -169,18 +297,12 @@ def get_md_from_zip_url_with_inline_images( """ 从给定的ZIP文件URL中下载并提取指定文件的内容, 并将Markdown文件中的相对路径图片转换为内联Base64图片。 - - Args: - zip_url (str): ZIP文件的下载链接。 - filename_in_zip (str): ZIP压缩包内目标Markdown文件的名称(包括路径)。 - 默认为 "full.md"。 - encoding (str): 目标文件的预期编码。默认为 "utf-8"。 """ try: - print(f"正在从 {zip_url} 下载ZIP文件 (使用 httpx.get)...") + # print(f"正在从 {zip_url} 下载ZIP文件 (使用 httpx.get)...") response = client.get(zip_url) # 增加超时 response.raise_for_status() - print("ZIP文件下载完成。") + # print("ZIP文件下载完成。") return embed_inline_image_from_zip(response.content, filename_in_zip=filename_in_zip, encoding=encoding), response.content @@ -208,21 +330,12 @@ async def get_md_from_zip_url_with_inline_images_async( """ 从给定的ZIP文件URL中下载并提取指定文件的内容, 并将Markdown文件中的相对路径图片转换为内联Base64图片。 - - Args: - zip_url (str): ZIP文件的下载链接。 - filename_in_zip (str): ZIP压缩包内目标Markdown文件的名称(包括路径)。 - 默认为 "full.md"。 - encoding (str): 目标文件的预期编码。默认为 "utf-8"。 - - Returns: - str : 如果成功,返回处理后的Markdown文本内容。 """ try: - print(f"正在从 {zip_url} 下载ZIP文件 (使用 httpx.get)...") + # print(f"正在从 {zip_url} 下载ZIP文件 (使用 httpx.get)...") response = await client_async.get(zip_url) # 增加超时 response.raise_for_status() - print("ZIP文件下载完成。") + # print("ZIP文件下载完成。") return await asyncio.to_thread(embed_inline_image_from_zip, response.content, filename_in_zip=filename_in_zip, encoding=encoding), response.content @@ -243,4 +356,4 @@ async def get_md_from_zip_url_with_inline_images_async( if __name__ == '__main__': - pass + pass \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 9c5fc51..f767689 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ dependencies = [ "pysubs2>=1.8.0", "httpx>=0.28.1", "python-pptx>=1.0.2", - + "pypdf>=6.4.2", ] dynamic = ["version"] diff --git a/更新日志.txt b/更新日志.txt index 4faafc2..acbbe54 100644 --- a/更新日志.txt +++ b/更新日志.txt @@ -1,4 +1,9 @@ 更新日志 +v1.5.6版 2025.12.16 +优化 +- mineru单文件可以超过600页 +修复 +- 修复markdown有长代码块时翻译失败的问题 ---------------- v1.5.5版 2025.12.14 优化