mineru实现更高粒度的协程
This commit is contained in:
@@ -11,20 +11,19 @@ URL = 'https://mineru.net/api/v4/file-urls/batch'
|
|||||||
|
|
||||||
timeout = httpx.Timeout(
|
timeout = httpx.Timeout(
|
||||||
connect=5.0, # 连接超时 (建立连接的最长时间)
|
connect=5.0, # 连接超时 (建立连接的最长时间)
|
||||||
read=120.0, # 读取超时 (等待服务器响应的最长时间)
|
read=200.0, # 读取超时 (等待服务器响应的最长时间)
|
||||||
write=120.0, # 写入超时 (发送数据的最长时间)
|
write=200.0, # 写入超时 (发送数据的最长时间)
|
||||||
pool=1.0 # 从连接池获取连接的超时时间
|
pool=1.0 # 从连接池获取连接的超时时间
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
client = httpx.Client(trust_env=False,timeout=timeout,proxy=None,verify=False)
|
client = httpx.Client(trust_env=False,timeout=timeout,proxy=None,verify=False)
|
||||||
|
client_async=httpx.AsyncClient(trust_env=False,timeout=timeout,proxy=None,verify=False)
|
||||||
|
|
||||||
# TODO: 提供更详细的logger
|
# TODO: 提供更详细的logger
|
||||||
class ConverterMineru(Converter):
|
class ConverterMineru(Converter):
|
||||||
def __init__(self, token: str, formula=True,logger:logging.Logger|None=None):
|
def __init__(self, token: str, formula=True,logger:logging.Logger|None=None):
|
||||||
self.mineru_token = token.strip()
|
self.mineru_token = token.strip()
|
||||||
self.client_async = httpx.AsyncClient(timeout=timeout)
|
|
||||||
self.formula = formula
|
self.formula = formula
|
||||||
self.logger=logger if logger else global_logger
|
self.logger=logger if logger else global_logger
|
||||||
|
|
||||||
@@ -62,6 +61,24 @@ class ConverterMineru(Converter):
|
|||||||
else:
|
else:
|
||||||
raise Exception('apply upload url failed,reason:{}'.format(result))
|
raise Exception('apply upload url failed,reason:{}'.format(result))
|
||||||
|
|
||||||
|
async def upload_async(self, document: Document) -> str:
    """Upload ``document`` to MinerU without blocking the event loop.

    Two requests are made through the shared module-level ``client_async``:
    a POST to ``URL`` that asks for a pre-signed upload URL, followed by a
    PUT of ``document.filebytes`` to the first URL returned.

    Args:
        document: The document to upload; its ``filebytes`` attribute is
            sent as the request body of the PUT.

    Returns:
        The ``batch_id`` reported by the API (presumably a string, since
        ``get_file_url_async`` accepts it as ``str`` -- confirm).

    Raises:
        httpx.HTTPStatusError: If either request returns an error status.
        Exception: If the API answers with a non-zero ``code`` or does not
            return any upload URL.
    """
    # Step 1: ask the API for an upload URL.
    response = await client_async.post(
        URL,
        headers=self._get_header(),
        json=self._get_upload_data(document),
    )
    response.raise_for_status()
    result = response.json()

    if result["code"] != 0:
        raise Exception('apply upload url failed,reason:{}'.format(result))

    data = result["data"]
    batch_id = data["batch_id"]
    urls = data["file_urls"]
    if not urls:
        # A success code without any URL would otherwise surface as a bare
        # IndexError below; report it with the full API response instead.
        raise Exception('apply upload url failed,reason:{}'.format(result))

    # Step 2: upload the raw file bytes to the URL we were given.
    res_upload = await client_async.put(urls[0], content=document.filebytes)
    res_upload.raise_for_status()
    return batch_id
|
||||||
|
|
||||||
def get_file_url(self, batch_id: str) -> str:
|
def get_file_url(self, batch_id: str) -> str:
|
||||||
while True:
|
while True:
|
||||||
url = f'https://mineru.net/api/v4/extract-results/batch/{batch_id}'
|
url = f'https://mineru.net/api/v4/extract-results/batch/{batch_id}'
|
||||||
@@ -75,6 +92,19 @@ class ConverterMineru(Converter):
|
|||||||
else:
|
else:
|
||||||
time.sleep(3)
|
time.sleep(3)
|
||||||
|
|
||||||
|
async def get_file_url_async(self, batch_id: str) -> str:
    """Poll MinerU until the batch has been extracted and return its zip URL.

    The batch result endpoint is queried every 3 seconds through the shared
    module-level ``client_async``; only the first entry of
    ``extract_result`` is inspected.

    Args:
        batch_id: Identifier of the batch, as returned by the upload step.

    Returns:
        The ``full_zip_url`` of the first extraction result once its state
        is ``"done"``.

    Raises:
        httpx.HTTPStatusError: If a polling request returns an error status.
        Exception: If the task reports the ``"failed"`` state.
    """
    # The URL does not change between polls, so build it once.
    url = f'https://mineru.net/api/v4/extract-results/batch/{batch_id}'
    while True:
        res = await client_async.get(url, headers=self._get_header())
        res.raise_for_status()
        fileinfo = res.json()["data"]["extract_result"][0]
        state = fileinfo["state"]
        if state == "done":
            return fileinfo["full_zip_url"]
        if state == "failed":
            # Per the MinerU API docs "failed" is a terminal state with the
            # reason in "err_msg"; without this check the loop never ends.
            raise Exception(
                'extract failed,reason:{}'.format(fileinfo.get("err_msg") or fileinfo)
            )
        # Still queued or running: wait without blocking the event loop.
        await asyncio.sleep(3)
|
||||||
|
|
||||||
def convert(self, document: Document) -> str:
|
def convert(self, document: Document) -> str:
|
||||||
self.logger.info(f"正在将文档转换为markdown")
|
self.logger.info(f"正在将文档转换为markdown")
|
||||||
time1 = time.time()
|
time1 = time.time()
|
||||||
@@ -84,13 +114,15 @@ class ConverterMineru(Converter):
|
|||||||
self.logger.info(f"已转换为markdown,耗时{time.time() - time1}秒")
|
self.logger.info(f"已转换为markdown,耗时{time.time() - time1}秒")
|
||||||
return result
|
return result
|
||||||
|
|
||||||
# TODO: 实现细粒度更高的协程
|
|
||||||
async def convert_async(self, document: Document) -> str:
|
async def convert_async(self, document: Document) -> str:
|
||||||
# 待优化
|
# 待优化
|
||||||
return await asyncio.to_thread(
|
self.logger.info(f"正在将文档转换为markdown")
|
||||||
self.convert,
|
time1 = time.time()
|
||||||
document
|
batch_id = await self.upload_async(document)
|
||||||
)
|
file_url = await self.get_file_url_async(batch_id)
|
||||||
|
result = await asyncio.to_thread(get_md_from_zip_url_with_inline_images,file_url)
|
||||||
|
self.logger.info(f"已转换为markdown,耗时{time.time() - time1}秒")
|
||||||
|
return result
|
||||||
|
|
||||||
def set_config(self, cofig: dict):
|
def set_config(self, cofig: dict):
|
||||||
pass
|
pass
|
||||||
|
|||||||
Reference in New Issue
Block a user