update
This commit is contained in:
@@ -1,3 +1,3 @@
|
|||||||
# SPDX-FileCopyrightText: 2025 QinHan
|
# SPDX-FileCopyrightText: 2025 QinHan
|
||||||
# SPDX-License-Identifier: MPL-2.0
|
# SPDX-License-Identifier: MPL-2.0
|
||||||
__version__="1.5.5"
|
__version__="1.5.6"
|
||||||
@@ -4,11 +4,20 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
import time
|
import time
|
||||||
import zipfile
|
import zipfile
|
||||||
|
import io
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import Hashable, Literal
|
from typing import Hashable, Literal, List, Tuple
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
|
|
||||||
|
# 尝试导入 pypdf,用于处理 PDF 拆分
|
||||||
|
try:
|
||||||
|
from pypdf import PdfReader, PdfWriter
|
||||||
|
|
||||||
|
HAS_PYPDF = True
|
||||||
|
except ImportError:
|
||||||
|
HAS_PYPDF = False
|
||||||
|
|
||||||
from docutranslate.converter.x2md.base import X2MarkdownConverter, X2MarkdownConverterConfig
|
from docutranslate.converter.x2md.base import X2MarkdownConverter, X2MarkdownConverterConfig
|
||||||
from docutranslate.ir.attachment_manager import AttachMent
|
from docutranslate.ir.attachment_manager import AttachMent
|
||||||
from docutranslate.ir.document import Document
|
from docutranslate.ir.document import Document
|
||||||
@@ -34,12 +43,6 @@ timeout = httpx.Timeout(
|
|||||||
write=200.0, # 写入超时 (发送数据的最长时间)
|
write=200.0, # 写入超时 (发送数据的最长时间)
|
||||||
pool=1.0 # 从连接池获取连接的超时时间
|
pool=1.0 # 从连接池获取连接的超时时间
|
||||||
)
|
)
|
||||||
# if USE_PROXY:
|
|
||||||
# client = httpx.Client(proxies=get_httpx_proxies(), timeout=timeout, verify=False)
|
|
||||||
# client_async = httpx.AsyncClient(proxies=get_httpx_proxies(), timeout=timeout, verify=False)
|
|
||||||
# else:
|
|
||||||
# client = httpx.Client(trust_env=False, timeout=timeout, proxy=None, verify=False)
|
|
||||||
# client_async = httpx.AsyncClient(trust_env=False, timeout=timeout, proxy=None, verify=False)
|
|
||||||
|
|
||||||
limits = httpx.Limits(max_connections=500, max_keepalive_connections=20)
|
limits = httpx.Limits(max_connections=500, max_keepalive_connections=20)
|
||||||
client = httpx.Client(limits=limits, trust_env=False, timeout=timeout, proxy=None, verify=False)
|
client = httpx.Client(limits=limits, trust_env=False, timeout=timeout, proxy=None, verify=False)
|
||||||
@@ -53,6 +56,7 @@ class ConverterMineru(X2MarkdownConverter):
|
|||||||
self.formula = config.formula_ocr
|
self.formula = config.formula_ocr
|
||||||
self.model_version = config.model_version
|
self.model_version = config.model_version
|
||||||
self.attachments: list[AttachMent] = []
|
self.attachments: list[AttachMent] = []
|
||||||
|
self.max_pages = 600 # Mineru 的限制
|
||||||
|
|
||||||
def _get_header(self):
|
def _get_header(self):
|
||||||
return {
|
return {
|
||||||
@@ -71,20 +75,55 @@ class ConverterMineru(X2MarkdownConverter):
|
|||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def _split_pdf(self, content: bytes) -> List[bytes]:
|
||||||
|
"""
|
||||||
|
检查 PDF 页数,如果超过限制则进行拆分。
|
||||||
|
返回拆分后的 bytes 列表。如果不超限,返回包含原内容的单元素列表。
|
||||||
|
"""
|
||||||
|
if not HAS_PYPDF:
|
||||||
|
self.logger.warning("未安装 pypdf,无法进行 PDF 页数检查和拆分。如果文件超过 600 页可能会失败。")
|
||||||
|
return [content]
|
||||||
|
|
||||||
|
try:
|
||||||
|
reader = PdfReader(io.BytesIO(content))
|
||||||
|
total_pages = len(reader.pages)
|
||||||
|
|
||||||
|
if total_pages <= self.max_pages:
|
||||||
|
return [content]
|
||||||
|
|
||||||
|
self.logger.info(f"PDF 页数 ({total_pages}) 超过限制 ({self.max_pages}),正在进行拆分...")
|
||||||
|
chunks = []
|
||||||
|
|
||||||
|
for i in range(0, total_pages, self.max_pages):
|
||||||
|
writer = PdfWriter()
|
||||||
|
end_page = min(i + self.max_pages, total_pages)
|
||||||
|
|
||||||
|
for page_num in range(i, end_page):
|
||||||
|
writer.add_page(reader.pages[page_num])
|
||||||
|
|
||||||
|
with io.BytesIO() as output_stream:
|
||||||
|
writer.write(output_stream)
|
||||||
|
chunks.append(output_stream.getvalue())
|
||||||
|
|
||||||
|
self.logger.info(f"PDF 已拆分为 {len(chunks)} 个部分。")
|
||||||
|
return chunks
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.error(f"PDF 拆分失败: {e}")
|
||||||
|
# 如果拆分出错,尝试按原文件上传(兜底)
|
||||||
|
return [content]
|
||||||
|
|
||||||
def upload(self, document: Document):
|
def upload(self, document: Document):
|
||||||
# 获取上传链接
|
# 获取上传链接
|
||||||
response = client.post(URL, headers=self._get_header(), json=self._get_upload_data(document))
|
response = client.post(URL, headers=self._get_header(), json=self._get_upload_data(document))
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
result = response.json()
|
result = response.json()
|
||||||
# print('response success. result:{}'.format(result))
|
|
||||||
if result["code"] == 0:
|
if result["code"] == 0:
|
||||||
batch_id = result["data"]["batch_id"]
|
batch_id = result["data"]["batch_id"]
|
||||||
urls = result["data"]["file_urls"]
|
urls = result["data"]["file_urls"]
|
||||||
# print('batch_id:{},urls:{}'.format(batch_id, urls))
|
|
||||||
# 获取
|
# 获取
|
||||||
res_upload = client.put(urls[0], content=document.content)
|
res_upload = client.put(urls[0], content=document.content)
|
||||||
res_upload.raise_for_status()
|
res_upload.raise_for_status()
|
||||||
# print(f"{urls[0]} upload success")
|
|
||||||
return batch_id
|
return batch_id
|
||||||
else:
|
else:
|
||||||
raise Exception('apply upload url failed,reason:{}'.format(result))
|
raise Exception('apply upload url failed,reason:{}'.format(result))
|
||||||
@@ -94,15 +133,12 @@ class ConverterMineru(X2MarkdownConverter):
|
|||||||
response = await client_async.post(URL, headers=self._get_header(), json=self._get_upload_data(document))
|
response = await client_async.post(URL, headers=self._get_header(), json=self._get_upload_data(document))
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
result = response.json()
|
result = response.json()
|
||||||
# print('response success. result:{}'.format(result))
|
|
||||||
if result["code"] == 0:
|
if result["code"] == 0:
|
||||||
batch_id = result["data"]["batch_id"]
|
batch_id = result["data"]["batch_id"]
|
||||||
urls = result["data"]["file_urls"]
|
urls = result["data"]["file_urls"]
|
||||||
# print('batch_id:{},urls:{}'.format(batch_id, urls))
|
|
||||||
# 获取
|
# 获取
|
||||||
res_upload = await client_async.put(urls[0], content=document.content)
|
res_upload = await client_async.put(urls[0], content=document.content)
|
||||||
res_upload.raise_for_status()
|
res_upload.raise_for_status()
|
||||||
# print(f"{urls[0]} upload success")
|
|
||||||
return batch_id
|
return batch_id
|
||||||
else:
|
else:
|
||||||
raise Exception('apply upload url failed,reason:{}'.format(result))
|
raise Exception('apply upload url failed,reason:{}'.format(result))
|
||||||
@@ -117,6 +153,8 @@ class ConverterMineru(X2MarkdownConverter):
|
|||||||
if fileinfo["state"] == "done":
|
if fileinfo["state"] == "done":
|
||||||
file_url = fileinfo["full_zip_url"]
|
file_url = fileinfo["full_zip_url"]
|
||||||
return file_url
|
return file_url
|
||||||
|
elif fileinfo["state"] == "failed":
|
||||||
|
raise Exception(f"Mineru 处理失败: {fileinfo.get('message', 'Unknown error')}")
|
||||||
else:
|
else:
|
||||||
time.sleep(3)
|
time.sleep(3)
|
||||||
|
|
||||||
@@ -130,31 +168,121 @@ class ConverterMineru(X2MarkdownConverter):
|
|||||||
if fileinfo["state"] == "done":
|
if fileinfo["state"] == "done":
|
||||||
file_url = fileinfo["full_zip_url"]
|
file_url = fileinfo["full_zip_url"]
|
||||||
return file_url
|
return file_url
|
||||||
|
elif fileinfo["state"] == "failed":
|
||||||
|
raise Exception(f"Mineru 处理失败: {fileinfo.get('message', 'Unknown error')}")
|
||||||
else:
|
else:
|
||||||
await asyncio.sleep(3)
|
await asyncio.sleep(3)
|
||||||
|
|
||||||
|
def _process_single_chunk(self, content: bytes, original_doc: Document, index: int = 0) -> Tuple[str, bytes]:
|
||||||
|
"""
|
||||||
|
处理单个分片:构造Document -> 上传 -> 等待 -> 下载 -> 提取 Markdown
|
||||||
|
"""
|
||||||
|
# 根据 Document 类的定义,name 是属性,由 stem+suffix 组成
|
||||||
|
# 所以我们需要构造正确的 stem 来改变文件名
|
||||||
|
new_stem = original_doc.stem
|
||||||
|
if index > 0:
|
||||||
|
new_stem = f"{original_doc.stem}_part{index}"
|
||||||
|
|
||||||
|
chunk_doc = Document.from_bytes(content=content, suffix=original_doc.suffix, stem=new_stem)
|
||||||
|
|
||||||
|
batch_id = self.upload(chunk_doc)
|
||||||
|
file_url = self.get_file_url(batch_id)
|
||||||
|
md_content, mineru_parsed = get_md_from_zip_url_with_inline_images(zip_url=file_url)
|
||||||
|
return md_content, mineru_parsed
|
||||||
|
|
||||||
|
async def _process_single_chunk_async(self, content: bytes, original_doc: Document, index: int = 0) -> Tuple[
|
||||||
|
str, bytes]:
|
||||||
|
"""
|
||||||
|
异步处理单个分片
|
||||||
|
"""
|
||||||
|
new_stem = original_doc.stem
|
||||||
|
if index > 0:
|
||||||
|
new_stem = f"{original_doc.stem}_part{index}"
|
||||||
|
|
||||||
|
chunk_doc = Document.from_bytes(content=content, suffix=original_doc.suffix, stem=new_stem)
|
||||||
|
|
||||||
|
batch_id = await self.upload_async(chunk_doc)
|
||||||
|
file_url = await self.get_file_url_async(batch_id)
|
||||||
|
md_content, mineru_parsed = await get_md_from_zip_url_with_inline_images_async(zip_url=file_url)
|
||||||
|
return md_content, mineru_parsed
|
||||||
|
|
||||||
def convert(self, document: Document) -> MarkdownDocument:
|
def convert(self, document: Document) -> MarkdownDocument:
|
||||||
self.logger.info(f"正在将文档转换为markdown,model_version:{self.model_version}")
|
self.logger.info(f"正在将文档转换为markdown,model_version:{self.model_version}")
|
||||||
time1 = time.time()
|
time1 = time.time()
|
||||||
batch_id = self.upload(document)
|
|
||||||
file_url = self.get_file_url(batch_id)
|
# 1. 检查是否需要拆分 (仅针对 PDF)
|
||||||
content, mineru_parsed = get_md_from_zip_url_with_inline_images(zip_url=file_url)
|
chunks = [document.content]
|
||||||
if mineru_parsed:
|
is_split = False
|
||||||
self.attachments.append(AttachMent("mineru",Document.from_bytes(content=mineru_parsed, suffix=".zip", stem="mineru")))
|
if document.suffix.lower() == '.pdf':
|
||||||
|
chunks = self._split_pdf(document.content)
|
||||||
|
if len(chunks) > 1:
|
||||||
|
is_split = True
|
||||||
|
|
||||||
|
combined_md = []
|
||||||
|
|
||||||
|
# 2. 依次处理每个分片
|
||||||
|
for i, chunk_content in enumerate(chunks):
|
||||||
|
if is_split:
|
||||||
|
self.logger.info(f"正在处理分片 {i + 1}/{len(chunks)}...")
|
||||||
|
|
||||||
|
md_content, mineru_parsed = self._process_single_chunk(chunk_content, document, i)
|
||||||
|
combined_md.append(md_content)
|
||||||
|
|
||||||
|
# 保存对应的原始解析包
|
||||||
|
suffix_name = "" if not is_split else f"_part{i + 1}"
|
||||||
|
if mineru_parsed:
|
||||||
|
self.attachments.append(
|
||||||
|
AttachMent(f"mineru{suffix_name}",
|
||||||
|
Document.from_bytes(content=mineru_parsed, suffix=".zip", stem=f"mineru{suffix_name}"))
|
||||||
|
)
|
||||||
|
|
||||||
|
# 3. 合并 Markdown
|
||||||
|
final_content = "\n\n".join(combined_md)
|
||||||
|
|
||||||
self.logger.info(f"已转换为markdown,耗时{time.time() - time1}秒")
|
self.logger.info(f"已转换为markdown,耗时{time.time() - time1}秒")
|
||||||
md_document = MarkdownDocument.from_bytes(content=content.encode("utf-8"), suffix=".md", stem=document.stem)
|
md_document = MarkdownDocument.from_bytes(content=final_content.encode("utf-8"), suffix=".md",
|
||||||
|
stem=document.stem)
|
||||||
return md_document
|
return md_document
|
||||||
|
|
||||||
async def convert_async(self, document: Document) -> MarkdownDocument:
|
async def convert_async(self, document: Document) -> MarkdownDocument:
|
||||||
self.logger.info(f"正在将文档转换为markdown,model_version:{self.model_version}")
|
self.logger.info(f"正在将文档转换为markdown (Async), model_version:{self.model_version}")
|
||||||
time1 = time.time()
|
time1 = time.time()
|
||||||
batch_id = await self.upload_async(document)
|
|
||||||
file_url = await self.get_file_url_async(batch_id)
|
# 1. 检查是否需要拆分
|
||||||
content, mineru_parsed = await get_md_from_zip_url_with_inline_images_async(zip_url=file_url)
|
chunks = [document.content]
|
||||||
if mineru_parsed:
|
is_split = False
|
||||||
self.attachments.append(AttachMent("mineru",Document.from_bytes(content=mineru_parsed, suffix=".zip", stem="mineru")))
|
if document.suffix.lower() == '.pdf':
|
||||||
|
# 这里的拆分操作是 CPU 密集型,如果是超大 PDF,建议放到 thread pool 中运行
|
||||||
|
# chunks = await asyncio.to_thread(self._split_pdf, document.content)
|
||||||
|
chunks = self._split_pdf(document.content)
|
||||||
|
if len(chunks) > 1:
|
||||||
|
is_split = True
|
||||||
|
|
||||||
|
# 2. 并发处理所有分片
|
||||||
|
tasks = []
|
||||||
|
for i, chunk_content in enumerate(chunks):
|
||||||
|
tasks.append(self._process_single_chunk_async(chunk_content, document, i))
|
||||||
|
|
||||||
|
# 等待所有分片处理完成
|
||||||
|
results = await asyncio.gather(*tasks)
|
||||||
|
|
||||||
|
combined_md = []
|
||||||
|
for i, (md_content, mineru_parsed) in enumerate(results):
|
||||||
|
combined_md.append(md_content)
|
||||||
|
|
||||||
|
suffix_name = "" if not is_split else f"_part{i + 1}"
|
||||||
|
if mineru_parsed:
|
||||||
|
self.attachments.append(
|
||||||
|
AttachMent(f"mineru{suffix_name}",
|
||||||
|
Document.from_bytes(content=mineru_parsed, suffix=".zip", stem=f"mineru{suffix_name}"))
|
||||||
|
)
|
||||||
|
|
||||||
|
# 3. 合并 Markdown
|
||||||
|
final_content = "\n\n".join(combined_md)
|
||||||
|
|
||||||
self.logger.info(f"已转换为markdown,耗时{time.time() - time1}秒")
|
self.logger.info(f"已转换为markdown,耗时{time.time() - time1}秒")
|
||||||
md_document = MarkdownDocument.from_bytes(content=content.encode("utf-8"), suffix=".md", stem=document.stem)
|
md_document = MarkdownDocument.from_bytes(content=final_content.encode("utf-8"), suffix=".md",
|
||||||
|
stem=document.stem)
|
||||||
return md_document
|
return md_document
|
||||||
|
|
||||||
def support_format(self) -> list[str]:
|
def support_format(self) -> list[str]:
|
||||||
@@ -169,18 +297,12 @@ def get_md_from_zip_url_with_inline_images(
|
|||||||
"""
|
"""
|
||||||
从给定的ZIP文件URL中下载并提取指定文件的内容,
|
从给定的ZIP文件URL中下载并提取指定文件的内容,
|
||||||
并将Markdown文件中的相对路径图片转换为内联Base64图片。
|
并将Markdown文件中的相对路径图片转换为内联Base64图片。
|
||||||
|
|
||||||
Args:
|
|
||||||
zip_url (str): ZIP文件的下载链接。
|
|
||||||
filename_in_zip (str): ZIP压缩包内目标Markdown文件的名称(包括路径)。
|
|
||||||
默认为 "full.md"。
|
|
||||||
encoding (str): 目标文件的预期编码。默认为 "utf-8"。
|
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
print(f"正在从 {zip_url} 下载ZIP文件 (使用 httpx.get)...")
|
# print(f"正在从 {zip_url} 下载ZIP文件 (使用 httpx.get)...")
|
||||||
response = client.get(zip_url) # 增加超时
|
response = client.get(zip_url) # 增加超时
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
print("ZIP文件下载完成。")
|
# print("ZIP文件下载完成。")
|
||||||
return embed_inline_image_from_zip(response.content, filename_in_zip=filename_in_zip,
|
return embed_inline_image_from_zip(response.content, filename_in_zip=filename_in_zip,
|
||||||
encoding=encoding), response.content
|
encoding=encoding), response.content
|
||||||
|
|
||||||
@@ -208,21 +330,12 @@ async def get_md_from_zip_url_with_inline_images_async(
|
|||||||
"""
|
"""
|
||||||
从给定的ZIP文件URL中下载并提取指定文件的内容,
|
从给定的ZIP文件URL中下载并提取指定文件的内容,
|
||||||
并将Markdown文件中的相对路径图片转换为内联Base64图片。
|
并将Markdown文件中的相对路径图片转换为内联Base64图片。
|
||||||
|
|
||||||
Args:
|
|
||||||
zip_url (str): ZIP文件的下载链接。
|
|
||||||
filename_in_zip (str): ZIP压缩包内目标Markdown文件的名称(包括路径)。
|
|
||||||
默认为 "full.md"。
|
|
||||||
encoding (str): 目标文件的预期编码。默认为 "utf-8"。
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
str : 如果成功,返回处理后的Markdown文本内容。
|
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
print(f"正在从 {zip_url} 下载ZIP文件 (使用 httpx.get)...")
|
# print(f"正在从 {zip_url} 下载ZIP文件 (使用 httpx.get)...")
|
||||||
response = await client_async.get(zip_url) # 增加超时
|
response = await client_async.get(zip_url) # 增加超时
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
print("ZIP文件下载完成。")
|
# print("ZIP文件下载完成。")
|
||||||
return await asyncio.to_thread(embed_inline_image_from_zip, response.content, filename_in_zip=filename_in_zip,
|
return await asyncio.to_thread(embed_inline_image_from_zip, response.content, filename_in_zip=filename_in_zip,
|
||||||
encoding=encoding), response.content
|
encoding=encoding), response.content
|
||||||
|
|
||||||
@@ -243,4 +356,4 @@ async def get_md_from_zip_url_with_inline_images_async(
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
pass
|
pass
|
||||||
@@ -20,7 +20,7 @@ dependencies = [
|
|||||||
"pysubs2>=1.8.0",
|
"pysubs2>=1.8.0",
|
||||||
"httpx>=0.28.1",
|
"httpx>=0.28.1",
|
||||||
"python-pptx>=1.0.2",
|
"python-pptx>=1.0.2",
|
||||||
|
"pypdf>=6.4.2",
|
||||||
]
|
]
|
||||||
dynamic = ["version"]
|
dynamic = ["version"]
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user