This commit is contained in:
xunbu
2025-12-16 09:05:40 +08:00
parent a0984ef1a8
commit 35ea4eff9b
4 changed files with 166 additions and 48 deletions

View File

@@ -1,3 +1,3 @@
# SPDX-FileCopyrightText: 2025 QinHan # SPDX-FileCopyrightText: 2025 QinHan
# SPDX-License-Identifier: MPL-2.0 # SPDX-License-Identifier: MPL-2.0
__version__="1.5.5" __version__="1.5.6"

View File

@@ -4,11 +4,20 @@
import asyncio import asyncio
import time import time
import zipfile import zipfile
import io
from dataclasses import dataclass from dataclasses import dataclass
from typing import Hashable, Literal from typing import Hashable, Literal, List, Tuple
import httpx import httpx
# Try to import pypdf, which is used to split PDFs that exceed the page limit.
try:
from pypdf import PdfReader, PdfWriter
HAS_PYPDF = True
except ImportError:
HAS_PYPDF = False
from docutranslate.converter.x2md.base import X2MarkdownConverter, X2MarkdownConverterConfig from docutranslate.converter.x2md.base import X2MarkdownConverter, X2MarkdownConverterConfig
from docutranslate.ir.attachment_manager import AttachMent from docutranslate.ir.attachment_manager import AttachMent
from docutranslate.ir.document import Document from docutranslate.ir.document import Document
@@ -34,12 +43,6 @@ timeout = httpx.Timeout(
write=200.0, # 写入超时 (发送数据的最长时间) write=200.0, # 写入超时 (发送数据的最长时间)
pool=1.0 # 从连接池获取连接的超时时间 pool=1.0 # 从连接池获取连接的超时时间
) )
# if USE_PROXY:
# client = httpx.Client(proxies=get_httpx_proxies(), timeout=timeout, verify=False)
# client_async = httpx.AsyncClient(proxies=get_httpx_proxies(), timeout=timeout, verify=False)
# else:
# client = httpx.Client(trust_env=False, timeout=timeout, proxy=None, verify=False)
# client_async = httpx.AsyncClient(trust_env=False, timeout=timeout, proxy=None, verify=False)
limits = httpx.Limits(max_connections=500, max_keepalive_connections=20) limits = httpx.Limits(max_connections=500, max_keepalive_connections=20)
client = httpx.Client(limits=limits, trust_env=False, timeout=timeout, proxy=None, verify=False) client = httpx.Client(limits=limits, trust_env=False, timeout=timeout, proxy=None, verify=False)
@@ -53,6 +56,7 @@ class ConverterMineru(X2MarkdownConverter):
self.formula = config.formula_ocr self.formula = config.formula_ocr
self.model_version = config.model_version self.model_version = config.model_version
self.attachments: list[AttachMent] = [] self.attachments: list[AttachMent] = []
self.max_pages = 600 # Mineru 的限制
def _get_header(self): def _get_header(self):
return { return {
@@ -71,20 +75,55 @@ class ConverterMineru(X2MarkdownConverter):
] ]
} }
def _split_pdf(self, content: bytes) -> List[bytes]:
    """
    Split a PDF into page-limited parts when it has more pages than ``self.max_pages``.

    Args:
        content: Raw bytes of the PDF file.

    Returns:
        A list of PDF byte strings. When pypdf is not installed, when the
        page count is within the limit, or when anything goes wrong while
        reading/splitting, the list holds just the original ``content``.
    """
    if not HAS_PYPDF:
        # Without pypdf the page count cannot be inspected; pass the file through.
        self.logger.warning("未安装 pypdf无法进行 PDF 页数检查和拆分。如果文件超过 600 页可能会失败。")
        return [content]

    try:
        pdf = PdfReader(io.BytesIO(content))
        page_count = len(pdf.pages)
        if page_count <= self.max_pages:
            return [content]

        self.logger.info(f"PDF 页数 ({page_count}) 超过限制 ({self.max_pages}),正在进行拆分...")

        parts = []
        for first in range(0, page_count, self.max_pages):
            # The final part may be shorter than max_pages.
            last = min(first + self.max_pages, page_count)
            part_writer = PdfWriter()
            for page_index in range(first, last):
                part_writer.add_page(pdf.pages[page_index])

            # Serialise this part into memory and keep only the bytes.
            with io.BytesIO() as buffer:
                part_writer.write(buffer)
                parts.append(buffer.getvalue())

        self.logger.info(f"PDF 已拆分为 {len(parts)} 个部分。")
        return parts

    except Exception as e:
        self.logger.error(f"PDF 拆分失败: {e}")
        # Best-effort fallback: if splitting fails, upload the original file as-is.
        return [content]
def upload(self, document: Document):
    """
    Request an upload slot from the API and PUT the document bytes to it.

    Args:
        document: The document whose ``content`` is uploaded.

    Returns:
        The ``batch_id`` reported by the API for this upload.

    Raises:
        Exception: If the API response carries a non-zero ``code``.
        httpx.HTTPStatusError: If either HTTP request returns an error status.
    """
    # Step 1: ask for the upload link(s).
    reply = client.post(URL, headers=self._get_header(), json=self._get_upload_data(document))
    reply.raise_for_status()
    payload = reply.json()

    if payload["code"] != 0:
        raise Exception('apply upload url failed,reason:{}'.format(payload))

    batch_id = payload["data"]["batch_id"]
    upload_urls = payload["data"]["file_urls"]

    # Step 2: send the file body to the first returned URL.
    put_reply = client.put(upload_urls[0], content=document.content)
    put_reply.raise_for_status()
    return batch_id
@@ -94,15 +133,12 @@ class ConverterMineru(X2MarkdownConverter):
response = await client_async.post(URL, headers=self._get_header(), json=self._get_upload_data(document)) response = await client_async.post(URL, headers=self._get_header(), json=self._get_upload_data(document))
response.raise_for_status() response.raise_for_status()
result = response.json() result = response.json()
# print('response success. result:{}'.format(result))
if result["code"] == 0: if result["code"] == 0:
batch_id = result["data"]["batch_id"] batch_id = result["data"]["batch_id"]
urls = result["data"]["file_urls"] urls = result["data"]["file_urls"]
# print('batch_id:{},urls:{}'.format(batch_id, urls))
# 获取 # 获取
res_upload = await client_async.put(urls[0], content=document.content) res_upload = await client_async.put(urls[0], content=document.content)
res_upload.raise_for_status() res_upload.raise_for_status()
# print(f"{urls[0]} upload success")
return batch_id return batch_id
else: else:
raise Exception('apply upload url failed,reason:{}'.format(result)) raise Exception('apply upload url failed,reason:{}'.format(result))
@@ -117,6 +153,8 @@ class ConverterMineru(X2MarkdownConverter):
if fileinfo["state"] == "done": if fileinfo["state"] == "done":
file_url = fileinfo["full_zip_url"] file_url = fileinfo["full_zip_url"]
return file_url return file_url
elif fileinfo["state"] == "failed":
raise Exception(f"Mineru 处理失败: {fileinfo.get('message', 'Unknown error')}")
else: else:
time.sleep(3) time.sleep(3)
@@ -130,31 +168,121 @@ class ConverterMineru(X2MarkdownConverter):
if fileinfo["state"] == "done": if fileinfo["state"] == "done":
file_url = fileinfo["full_zip_url"] file_url = fileinfo["full_zip_url"]
return file_url return file_url
elif fileinfo["state"] == "failed":
raise Exception(f"Mineru 处理失败: {fileinfo.get('message', 'Unknown error')}")
else: else:
await asyncio.sleep(3) await asyncio.sleep(3)
def _process_single_chunk(self, content: bytes, original_doc: Document, index: int = 0) -> Tuple[str, bytes]:
    """
    Run one chunk through the full pipeline: build a Document, upload it,
    wait for the result URL, download the result and extract the Markdown.

    Args:
        content: Bytes of this chunk (a whole file or one split part).
        original_doc: The source document; its ``stem`` and ``suffix`` are reused.
        index: Zero-based chunk index. Index 0 keeps the original stem; any
            later chunk is uploaded as ``<stem>_part<index>``.

    Returns:
        A ``(markdown_text, zip_bytes)`` pair as produced by
        ``get_md_from_zip_url_with_inline_images``.
    """
    # The uploaded file name is changed through the stem
    # (presumably Document.name is derived from stem + suffix — confirm in Document).
    part_stem = f"{original_doc.stem}_part{index}" if index > 0 else original_doc.stem
    part_doc = Document.from_bytes(content=content, suffix=original_doc.suffix, stem=part_stem)

    zip_url = self.get_file_url(self.upload(part_doc))
    markdown_text, raw_zip = get_md_from_zip_url_with_inline_images(zip_url=zip_url)
    return markdown_text, raw_zip
async def _process_single_chunk_async(self, content: bytes, original_doc: Document, index: int = 0) -> Tuple[
    str, bytes]:
    """
    Asynchronous counterpart of the single-chunk pipeline: upload the chunk,
    await the result URL, then download and extract the Markdown.

    Args:
        content: Bytes of this chunk (a whole file or one split part).
        original_doc: The source document; its ``stem`` and ``suffix`` are reused.
        index: Zero-based chunk index. Index 0 keeps the original stem; any
            later chunk is uploaded as ``<stem>_part<index>``.

    Returns:
        A ``(markdown_text, zip_bytes)`` pair as produced by
        ``get_md_from_zip_url_with_inline_images_async``.
    """
    part_stem = f"{original_doc.stem}_part{index}" if index > 0 else original_doc.stem
    part_doc = Document.from_bytes(content=content, suffix=original_doc.suffix, stem=part_stem)

    uploaded_batch = await self.upload_async(part_doc)
    zip_url = await self.get_file_url_async(uploaded_batch)
    markdown_text, raw_zip = await get_md_from_zip_url_with_inline_images_async(zip_url=zip_url)
    return markdown_text, raw_zip
def convert(self, document: Document) -> MarkdownDocument:
    """
    Convert a document to Markdown, splitting oversized PDFs into parts first.

    Each part is processed one after another; the Markdown of all parts is
    joined with a blank line. Every non-empty result ZIP is stored in
    ``self.attachments`` as ``mineru`` (unsplit) or ``mineru_part<N>``
    (split, N starting at 1).

    Args:
        document: The input document.

    Returns:
        A MarkdownDocument holding the combined Markdown, with the input's stem.
    """
    self.logger.info(f"正在将文档转换为markdown,model_version:{self.model_version}")
    started_at = time.time()

    # 1. Only PDFs are candidates for splitting; everything else is one piece.
    if document.suffix.lower() == '.pdf':
        pieces = self._split_pdf(document.content)
    else:
        pieces = [document.content]
    was_split = len(pieces) > 1
    total = len(pieces)

    # 2. Process the pieces sequentially, in order.
    markdown_parts = []
    for number, piece in enumerate(pieces, start=1):
        if was_split:
            self.logger.info(f"正在处理分片 {number}/{total}...")

        piece_md, piece_zip = self._process_single_chunk(piece, document, number - 1)
        markdown_parts.append(piece_md)

        # Keep the raw result package that belongs to this piece.
        if piece_zip:
            label = f"mineru_part{number}" if was_split else "mineru"
            packaged = Document.from_bytes(content=piece_zip, suffix=".zip", stem=label)
            self.attachments.append(AttachMent(label, packaged))

    # 3. Merge the Markdown of all pieces.
    merged = "\n\n".join(markdown_parts)

    self.logger.info(f"已转换为markdown耗时{time.time() - started_at}")
    return MarkdownDocument.from_bytes(content=merged.encode("utf-8"), suffix=".md",
                                       stem=document.stem)
async def convert_async(self, document: Document) -> MarkdownDocument:
    """
    Asynchronously convert a document to Markdown, splitting oversized PDFs first.

    All parts are processed concurrently. Their Markdown is joined in the
    original part order with a blank line between parts. Every non-empty
    result ZIP is stored in ``self.attachments`` as ``mineru`` (unsplit) or
    ``mineru_part<N>`` (split, N starting at 1).

    Args:
        document: The input document.

    Returns:
        A MarkdownDocument holding the combined Markdown, with the input's stem.

    Raises:
        Exception: Whatever the first failing part raises (propagated by
            ``asyncio.gather``).
    """
    self.logger.info(f"正在将文档转换为markdown (Async), model_version:{self.model_version}")
    time1 = time.time()

    # 1. Decide whether the input has to be split (PDF only).
    chunks = [document.content]
    is_split = False
    if document.suffix.lower() == '.pdf':
        # Parsing and rewriting a large PDF is synchronous, CPU-bound work.
        # Run it in a worker thread so the event loop stays responsive.
        chunks = await asyncio.to_thread(self._split_pdf, document.content)
        is_split = len(chunks) > 1

    # 2. Process all parts concurrently; gather() returns results in input order.
    results = await asyncio.gather(
        *(self._process_single_chunk_async(chunk_content, document, i)
          for i, chunk_content in enumerate(chunks))
    )

    combined_md = []
    for i, (md_content, mineru_parsed) in enumerate(results):
        combined_md.append(md_content)

        # Keep the raw result package that belongs to this part.
        suffix_name = f"_part{i + 1}" if is_split else ""
        if mineru_parsed:
            self.attachments.append(
                AttachMent(f"mineru{suffix_name}",
                           Document.from_bytes(content=mineru_parsed, suffix=".zip", stem=f"mineru{suffix_name}"))
            )

    # 3. Merge the Markdown of all parts.
    final_content = "\n\n".join(combined_md)

    self.logger.info(f"已转换为markdown耗时{time.time() - time1}")
    md_document = MarkdownDocument.from_bytes(content=final_content.encode("utf-8"), suffix=".md",
                                              stem=document.stem)
    return md_document
def support_format(self) -> list[str]: def support_format(self) -> list[str]:
@@ -169,18 +297,12 @@ def get_md_from_zip_url_with_inline_images(
""" """
从给定的ZIP文件URL中下载并提取指定文件的内容 从给定的ZIP文件URL中下载并提取指定文件的内容
并将Markdown文件中的相对路径图片转换为内联Base64图片。 并将Markdown文件中的相对路径图片转换为内联Base64图片。
Args:
zip_url (str): ZIP文件的下载链接。
filename_in_zip (str): ZIP压缩包内目标Markdown文件的名称包括路径
默认为 "full.md"
encoding (str): 目标文件的预期编码。默认为 "utf-8"
""" """
try: try:
print(f"正在从 {zip_url} 下载ZIP文件 (使用 httpx.get)...") # print(f"正在从 {zip_url} 下载ZIP文件 (使用 httpx.get)...")
response = client.get(zip_url) # 增加超时 response = client.get(zip_url) # 增加超时
response.raise_for_status() response.raise_for_status()
print("ZIP文件下载完成。") # print("ZIP文件下载完成。")
return embed_inline_image_from_zip(response.content, filename_in_zip=filename_in_zip, return embed_inline_image_from_zip(response.content, filename_in_zip=filename_in_zip,
encoding=encoding), response.content encoding=encoding), response.content
@@ -208,21 +330,12 @@ async def get_md_from_zip_url_with_inline_images_async(
""" """
从给定的ZIP文件URL中下载并提取指定文件的内容 从给定的ZIP文件URL中下载并提取指定文件的内容
并将Markdown文件中的相对路径图片转换为内联Base64图片。 并将Markdown文件中的相对路径图片转换为内联Base64图片。
Args:
zip_url (str): ZIP文件的下载链接。
filename_in_zip (str): ZIP压缩包内目标Markdown文件的名称包括路径
默认为 "full.md"
encoding (str): 目标文件的预期编码。默认为 "utf-8"
Returns:
str : 如果成功返回处理后的Markdown文本内容。
""" """
try: try:
print(f"正在从 {zip_url} 下载ZIP文件 (使用 httpx.get)...") # print(f"正在从 {zip_url} 下载ZIP文件 (使用 httpx.get)...")
response = await client_async.get(zip_url) # 增加超时 response = await client_async.get(zip_url) # 增加超时
response.raise_for_status() response.raise_for_status()
print("ZIP文件下载完成。") # print("ZIP文件下载完成。")
return await asyncio.to_thread(embed_inline_image_from_zip, response.content, filename_in_zip=filename_in_zip, return await asyncio.to_thread(embed_inline_image_from_zip, response.content, filename_in_zip=filename_in_zip,
encoding=encoding), response.content encoding=encoding), response.content
@@ -243,4 +356,4 @@ async def get_md_from_zip_url_with_inline_images_async(
if __name__ == '__main__': if __name__ == '__main__':
pass pass

View File

@@ -20,7 +20,7 @@ dependencies = [
"pysubs2>=1.8.0", "pysubs2>=1.8.0",
"httpx>=0.28.1", "httpx>=0.28.1",
"python-pptx>=1.0.2", "python-pptx>=1.0.2",
"pypdf>=6.4.2",
] ]
dynamic = ["version"] dynamic = ["version"]

View File

@@ -1,4 +1,9 @@
更新日志 更新日志
v1.5.6版 2025.12.16
优化
- mineru单文件可以超过600页
修复
- 修复markdown有长代码块时翻译失败的问题
---------------- ----------------
v1.5.5版 2025.12.14 v1.5.5版 2025.12.14
优化 优化