重构代码,新增了MarkdownBasedManager和TXTManager实现
This commit is contained in:
@@ -1,9 +1,3 @@
|
||||
from .converter import Document,Converter
|
||||
from .converter_mineru import ConverterMineru
|
||||
|
||||
from docutranslate.global_values import conditional_import
|
||||
if conditional_import("docling"):
|
||||
from .converter_docling import ConverterDocling
|
||||
|
||||
# 打包docling时取消下面一行注释
|
||||
# from .converter_docling import ConverterDocling
|
||||
"""
|
||||
这个包用来处理document之间的格式转换
|
||||
"""
|
||||
|
||||
@@ -1,27 +0,0 @@
|
||||
from typing import Protocol
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
class Document:
|
||||
def __init__(self,path:Path|str=None,filename:str=None,filebytes:bytes=None):
|
||||
if path is None and (filename is None or filebytes is None):
|
||||
raise Exception("Document的路径或filename、filebytes不能同时为空")
|
||||
self.filebytes = filebytes
|
||||
self.filename = filename
|
||||
self.path = path
|
||||
if path:
|
||||
if isinstance(path,str):
|
||||
path=Path(path)
|
||||
self.path=path
|
||||
self.filename=path.name
|
||||
self.filebytes=path.read_bytes()
|
||||
self.suffix=Path(self.filename).suffix
|
||||
self.stem=Path(self.filename).stem
|
||||
|
||||
class Converter(Protocol):
    """Structural interface for objects that convert a document to markdown."""

    def convert(self, document: Document) -> str:
        """Convert ``document`` and return the result as a string."""

    async def convert_async(self, document: Document) -> str:
        """Awaitable counterpart of ``convert``."""
|
||||
12
docutranslate/converter/interfaces.py
Normal file
12
docutranslate/converter/interfaces.py
Normal file
@@ -0,0 +1,12 @@
|
||||
from typing import Protocol, runtime_checkable
|
||||
|
||||
from docutranslate.ir.document import Document
|
||||
|
||||
|
||||
@runtime_checkable
class Converter(Protocol):
    """Structural interface for document-to-document converters.

    The protocol is runtime-checkable, so ``isinstance(obj, Converter)`` can
    be used; that check only verifies that the methods exist, not their
    signatures.
    """

    def convert(self, document: Document) -> Document:
        """Convert ``document`` and return the resulting document."""

    async def convert_async(self, document: Document) -> Document:
        """Awaitable counterpart of ``convert``."""
|
||||
0
docutranslate/converter/x2md/__init__.py
Normal file
0
docutranslate/converter/x2md/__init__.py
Normal file
@@ -1,8 +1,9 @@
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from io import BytesIO
|
||||
from logging import Logger
|
||||
from pathlib import Path
|
||||
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
@@ -13,34 +14,49 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docling_core.types.doc import ImageRefMode
|
||||
from huggingface_hub.errors import LocalEntryNotFoundError
|
||||
|
||||
from docutranslate.converter import Converter, Document
|
||||
from docutranslate.converter.x2md.interfaces import X2MarkdownConverter
|
||||
from docutranslate.ir.document import Document
|
||||
from docutranslate.ir.markdown_document import MarkdownDocument
|
||||
from docutranslate.logger import global_logger
|
||||
|
||||
IMAGE_RESOLUTION_SCALE = 4
|
||||
|
||||
|
||||
class ConverterDocling(Converter):
|
||||
def __init__(self, code=True, formula=True, artifact=None, logger: logging.Logger | None = None):
|
||||
self.code = code
|
||||
self.formula = formula
|
||||
self.artifact = artifact
|
||||
self.logger = logger if logger else global_logger
|
||||
@dataclass(frozen=True)
|
||||
class ConverterDoclingConfig:
|
||||
code: bool = True
|
||||
formula: bool = True
|
||||
artifact: Path | None = None
|
||||
|
||||
def convert(self, document):
|
||||
assert isinstance(document.filename, str)
|
||||
|
||||
class ConverterDocling(X2MarkdownConverter):
|
||||
def __init__(self, config: ConverterDoclingConfig, logger: Logger = global_logger):
    """Store the configuration and logger used by this converter.

    Args:
        config: Settings object; its fields are also mirrored onto the
            instance as ``code``, ``formula`` and ``artifact``.
        logger: Logger used for progress messages; defaults to the
            package-wide ``global_logger``.
    """
    self.logger = logger
    self.config = config
    # Mirror the individual settings so the rest of the class can read them
    # directly (e.g. ``self.artifact``).
    self.code, self.formula, self.artifact = config.code, config.formula, config.artifact
|
||||
|
||||
def convert(self, document) -> MarkdownDocument:
|
||||
assert isinstance(document.name, str)
|
||||
self.logger.info(f"正在将文档转换为markdown")
|
||||
time1 = time.time()
|
||||
document_stream = DocumentStream(name=document.filename, stream=BytesIO(document.filebytes))
|
||||
result = self.file2markdown_embed_images(document_stream)
|
||||
document_stream = DocumentStream(name=document.name, stream=BytesIO(document.content))
|
||||
content = self.file2markdown_embed_images(document_stream)
|
||||
self.logger.info(f"已转换为markdown,耗时{time.time() - time1}秒")
|
||||
return result
|
||||
md_document = MarkdownDocument.from_bytes(content=content.encode("utf-8"), suffix=".md", stem=document.stem)
|
||||
return md_document
|
||||
|
||||
async def convert_async(self, document: Document) -> str:
|
||||
async def convert_async(self, document: Document) -> MarkdownDocument:
|
||||
return await asyncio.to_thread(
|
||||
self.convert,
|
||||
document
|
||||
)
|
||||
|
||||
def support_format(self) -> list[str]:
    """Return the file suffixes this converter accepts.

    Every entry includes its leading dot so it can be compared directly with
    a ``Path.suffix``-style value. The HTML, XHTML and CSV entries previously
    lacked the dot and therefore could never match such a suffix.

    Returns:
        A new list of lower-case suffixes, each starting with ``"."``.
    """
    return [
        ".pdf", ".docx", ".pptx", ".xlsx", ".md", ".html", ".xhtml", ".csv",
        ".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".webp",
    ]
|
||||
|
||||
def file2markdown_embed_images(self, file_path: Path | str | DocumentStream) -> str:
|
||||
pipeline_options = PdfPipelineOptions(artifacts_path=self.artifact)
|
||||
pipeline_options.do_ocr = False
|
||||
15
docutranslate/converter/x2md/converter_identity.py
Normal file
15
docutranslate/converter/x2md/converter_identity.py
Normal file
@@ -0,0 +1,15 @@
|
||||
from docutranslate.converter.x2md.interfaces import X2MarkdownConverter
|
||||
from docutranslate.ir.document import Document
|
||||
from docutranslate.ir.markdown_document import MarkdownDocument
|
||||
|
||||
|
||||
class ConverterIdentity(X2MarkdownConverter):
    """Pass-through converter for input that is already markdown.

    The document's bytes are not transformed; they are only re-wrapped as a
    ``MarkdownDocument`` with a ``.md`` suffix and the original stem.
    """

    @staticmethod
    def _rewrap(document: Document) -> MarkdownDocument:
        """Build the markdown document shared by both conversion entry points."""
        return MarkdownDocument.from_bytes(
            content=document.content,
            suffix=".md",
            stem=document.stem,
        )

    def convert(self, document: Document) -> MarkdownDocument:
        """Return ``document``'s content wrapped as a markdown document."""
        return self._rewrap(document)

    async def convert_async(self, document: Document) -> MarkdownDocument:
        """Awaitable counterpart of ``convert``; performs the same re-wrapping."""
        return self._rewrap(document)

    def support_format(self) -> list[str]:
        """Return the accepted suffixes: markdown only."""
        return [".md"]
|
||||
@@ -1,31 +1,43 @@
|
||||
import asyncio
|
||||
import logging
|
||||
import time
|
||||
import zipfile
|
||||
from dataclasses import dataclass
|
||||
from logging import Logger
|
||||
|
||||
import httpx
|
||||
from docutranslate.converter import Converter, Document
|
||||
|
||||
from docutranslate.converter.x2md.interfaces import X2MarkdownConverter
|
||||
from docutranslate.ir.document import Document
|
||||
from docutranslate.ir.markdown_document import MarkdownDocument
|
||||
from docutranslate.logger import global_logger
|
||||
from docutranslate.utils.markdown_utils import embed_inline_image_from_zip
|
||||
|
||||
URL = 'https://mineru.net/api/v4/file-urls/batch'
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ConverterMineruConfig:
    """Immutable settings consumed by ``ConverterMineru``."""

    # API token for the MinerU service; the converter strips surrounding
    # whitespace before using it.
    mineru_token: str
    # NOTE(review): presumably enables formula recognition in the MinerU
    # request - the request payload is not fully visible here; confirm.
    formula: bool = True
|
||||
|
||||
|
||||
timeout = httpx.Timeout(
|
||||
connect=5.0, # 连接超时 (建立连接的最长时间)
|
||||
read=200.0, # 读取超时 (等待服务器响应的最长时间)
|
||||
write=200.0, # 写入超时 (发送数据的最长时间)
|
||||
pool=1.0 # 从连接池获取连接的超时时间
|
||||
connect=5.0, # 连接超时 (建立连接的最长时间)
|
||||
read=200.0, # 读取超时 (等待服务器响应的最长时间)
|
||||
write=200.0, # 写入超时 (发送数据的最长时间)
|
||||
pool=1.0 # 从连接池获取连接的超时时间
|
||||
)
|
||||
|
||||
client = httpx.Client(trust_env=False, timeout=timeout, proxy=None, verify=False)
|
||||
client_async = httpx.AsyncClient(trust_env=False, timeout=timeout, proxy=None, verify=False)
|
||||
|
||||
client = httpx.Client(trust_env=False,timeout=timeout,proxy=None,verify=False)
|
||||
client_async=httpx.AsyncClient(trust_env=False,timeout=timeout,proxy=None,verify=False)
|
||||
|
||||
# TODO: 提供更详细的logger
|
||||
class ConverterMineru(Converter):
|
||||
def __init__(self, token: str, formula=True,logger:logging.Logger|None=None):
|
||||
self.mineru_token = token.strip()
|
||||
self.formula = formula
|
||||
self.logger=logger if logger else global_logger
|
||||
class ConverterMineru(X2MarkdownConverter):
|
||||
def __init__(self, config: ConverterMineruConfig, logger: Logger = global_logger):
    """Store the configuration and logger used by this converter.

    Args:
        config: Settings object; its token (with surrounding whitespace
            stripped) and ``formula`` flag are also mirrored onto the
            instance.
        logger: Logger used for progress messages; defaults to the
            package-wide ``global_logger``.
    """
    self.logger = logger
    self.config = config
    token = config.mineru_token
    # Strip so that a token pasted with stray whitespace still works.
    self.mineru_token = token.strip()
    self.formula = config.formula
|
||||
|
||||
def _get_header(self):
|
||||
return {
|
||||
@@ -39,7 +51,7 @@ class ConverterMineru(Converter):
|
||||
"language": "auto",
|
||||
"enable_table": True,
|
||||
"files": [
|
||||
{"name": f"{document.filename}", "is_ocr": True}
|
||||
{"name": f"{document.name}", "is_ocr": True}
|
||||
]
|
||||
}
|
||||
|
||||
@@ -54,7 +66,7 @@ class ConverterMineru(Converter):
|
||||
urls = result["data"]["file_urls"]
|
||||
# print('batch_id:{},urls:{}'.format(batch_id, urls))
|
||||
# 获取
|
||||
res_upload = client.put(urls[0], content=document.filebytes)
|
||||
res_upload = client.put(urls[0], content=document.content)
|
||||
res_upload.raise_for_status()
|
||||
# print(f"{urls[0]} upload success")
|
||||
return batch_id
|
||||
@@ -72,7 +84,7 @@ class ConverterMineru(Converter):
|
||||
urls = result["data"]["file_urls"]
|
||||
# print('batch_id:{},urls:{}'.format(batch_id, urls))
|
||||
# 获取
|
||||
res_upload = await client_async.put(urls[0], content=document.filebytes)
|
||||
res_upload = await client_async.put(urls[0], content=document.content)
|
||||
res_upload.raise_for_status()
|
||||
# print(f"{urls[0]} upload success")
|
||||
return batch_id
|
||||
@@ -87,8 +99,8 @@ class ConverterMineru(Converter):
|
||||
res.raise_for_status()
|
||||
fileinfo = res.json()["data"]["extract_result"][0]
|
||||
if fileinfo["state"] == "done":
|
||||
fileurl = fileinfo["full_zip_url"]
|
||||
return fileurl
|
||||
file_url = fileinfo["full_zip_url"]
|
||||
return file_url
|
||||
else:
|
||||
time.sleep(3)
|
||||
|
||||
@@ -100,36 +112,40 @@ class ConverterMineru(Converter):
|
||||
res.raise_for_status()
|
||||
fileinfo = res.json()["data"]["extract_result"][0]
|
||||
if fileinfo["state"] == "done":
|
||||
fileurl = fileinfo["full_zip_url"]
|
||||
return fileurl
|
||||
file_url = fileinfo["full_zip_url"]
|
||||
return file_url
|
||||
else:
|
||||
await asyncio.sleep(3)
|
||||
|
||||
def convert(self, document: Document) -> str:
|
||||
def convert(self, document: Document) -> MarkdownDocument:
|
||||
self.logger.info(f"正在将文档转换为markdown")
|
||||
time1 = time.time()
|
||||
batch_id = self.upload(document)
|
||||
file_url = self.get_file_url(batch_id)
|
||||
result = get_md_from_zip_url_with_inline_images(zip_url=file_url)
|
||||
content = get_md_from_zip_url_with_inline_images(zip_url=file_url)
|
||||
self.logger.info(f"已转换为markdown,耗时{time.time() - time1}秒")
|
||||
return result
|
||||
md_document = MarkdownDocument.from_bytes(content=content.encode("utf-8"), suffix=".md", stem=document.stem)
|
||||
return md_document
|
||||
|
||||
async def convert_async(self, document: Document) -> str:
|
||||
# 待优化
|
||||
async def convert_async(self, document: Document) -> MarkdownDocument:
|
||||
self.logger.info(f"正在将文档转换为markdown")
|
||||
time1 = time.time()
|
||||
batch_id = await self.upload_async(document)
|
||||
file_url = await self.get_file_url_async(batch_id)
|
||||
result = await get_md_from_zip_url_with_inline_images_async(zip_url=file_url)
|
||||
content = await get_md_from_zip_url_with_inline_images_async(zip_url=file_url)
|
||||
self.logger.info(f"已转换为markdown,耗时{time.time() - time1}秒")
|
||||
return result
|
||||
md_document = MarkdownDocument.from_bytes(content=content.encode("utf-8"), suffix=".md", stem=document.stem)
|
||||
return md_document
|
||||
|
||||
def support_format(self) -> list[str]:
    """Return the file suffixes (with leading dot) this converter accepts."""
    office = (".doc", ".docx", ".ppt", ".pptx")
    images = (".png", ".jpg", ".jpeg")
    # A fresh list is built on every call, so callers may mutate the result.
    return [".pdf", *office, *images]
|
||||
|
||||
|
||||
def get_md_from_zip_url_with_inline_images(
|
||||
zip_url: str,
|
||||
filename_in_zip: str = "full.md",
|
||||
encoding: str = "utf-8"
|
||||
) -> str | None:
|
||||
) -> str:
|
||||
"""
|
||||
从给定的ZIP文件URL中下载并提取指定文件的内容,
|
||||
并将Markdown文件中的相对路径图片转换为内联Base64图片。
|
||||
@@ -152,7 +168,8 @@ def get_md_from_zip_url_with_inline_images(
|
||||
|
||||
|
||||
except httpx.HTTPStatusError as e:
|
||||
raise Exception(f"HTTP 错误 (httpx): {e.response.status_code} - {e.request.url}\n响应内容: {e.response.text[:200]}...")
|
||||
raise Exception(
|
||||
f"HTTP 错误 (httpx): {e.response.status_code} - {e.request.url}\n响应内容: {e.response.text[:200]}...")
|
||||
except httpx.RequestError as e:
|
||||
raise Exception(f"下载ZIP文件时发生错误 (httpx): {e}")
|
||||
except zipfile.BadZipFile:
|
||||
@@ -169,7 +186,7 @@ async def get_md_from_zip_url_with_inline_images_async(
|
||||
zip_url: str,
|
||||
filename_in_zip: str = "full.md",
|
||||
encoding: str = "utf-8"
|
||||
) -> str | None:
|
||||
) -> str:
|
||||
"""
|
||||
从给定的ZIP文件URL中下载并提取指定文件的内容,
|
||||
并将Markdown文件中的相对路径图片转换为内联Base64图片。
|
||||
@@ -181,18 +198,20 @@ async def get_md_from_zip_url_with_inline_images_async(
|
||||
encoding (str): 目标文件的预期编码。默认为 "utf-8"。
|
||||
|
||||
Returns:
|
||||
str | None: 如果成功,返回处理后的Markdown文本内容;否则返回 None。
|
||||
str : 如果成功,返回处理后的Markdown文本内容。
|
||||
"""
|
||||
try:
|
||||
print(f"正在从 {zip_url} 下载ZIP文件 (使用 httpx.get)...")
|
||||
response = await client_async.get(zip_url) # 增加超时
|
||||
response.raise_for_status()
|
||||
print("ZIP文件下载完成。")
|
||||
return await asyncio.to_thread(embed_inline_image_from_zip,response.content, filename_in_zip=filename_in_zip, encoding=encoding)
|
||||
return await asyncio.to_thread(embed_inline_image_from_zip, response.content, filename_in_zip=filename_in_zip,
|
||||
encoding=encoding)
|
||||
|
||||
|
||||
except httpx.HTTPStatusError as e:
|
||||
raise Exception(f"HTTP 错误 (httpx): {e.response.status_code} - {e.request.url}\n响应内容: {e.response.text[:200]}...")
|
||||
raise Exception(
|
||||
f"HTTP 错误 (httpx): {e.response.status_code} - {e.request.url}\n响应内容: {e.response.text[:200]}...")
|
||||
except httpx.RequestError as e:
|
||||
raise Exception(f"下载ZIP文件时发生错误 (httpx): {e}")
|
||||
except zipfile.BadZipFile:
|
||||
@@ -204,5 +223,6 @@ async def get_md_from_zip_url_with_inline_images_async(
|
||||
traceback.print_exc() # 打印完整的堆栈跟踪,便于调试
|
||||
raise Exception(f"发生未知错误: {e}")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
pass
|
||||
22
docutranslate/converter/x2md/interfaces.py
Normal file
22
docutranslate/converter/x2md/interfaces.py
Normal file
@@ -0,0 +1,22 @@
|
||||
from typing import runtime_checkable
|
||||
|
||||
from typing import Protocol
|
||||
from docutranslate.converter.interfaces import Converter
|
||||
from docutranslate.ir.document import Document
|
||||
from docutranslate.ir.markdown_document import MarkdownDocument
|
||||
|
||||
|
||||
|
||||
@runtime_checkable
class X2MarkdownConverter(Converter, Protocol):
    """Protocol for converters that turn files of other formats into markdown.

    It narrows the return type of the base ``Converter`` methods to
    ``MarkdownDocument`` and adds ``support_format``.
    """

    def convert(self, document: Document) -> MarkdownDocument:
        """Convert ``document`` and return it as a markdown document."""

    async def convert_async(self, document: Document) -> MarkdownDocument:
        """Awaitable counterpart of ``convert``."""

    def support_format(self) -> list[str]:
        """Return the file suffixes the converter accepts."""
|
||||
Reference in New Issue
Block a user