重构代码,新增了MarkdownBasedManager和TXTManager实现

This commit is contained in:
xunbu
2025-07-28 23:41:35 +08:00
parent 6ab3278481
commit 80634fe749
45 changed files with 885 additions and 139 deletions

View File

@@ -1,9 +1,3 @@
from .converter import Document,Converter
from .converter_mineru import ConverterMineru
from docutranslate.global_values import conditional_import
if conditional_import("docling"):
from .converter_docling import ConverterDocling
# 打包docling时取消下面一行注释
# from .converter_docling import ConverterDocling
"""
这个包用来处理document之间的格式转换
"""

View File

@@ -1,27 +0,0 @@
from typing import Protocol
from pathlib import Path
class Document:
def __init__(self,path:Path|str=None,filename:str=None,filebytes:bytes=None):
if path is None and (filename is None or filebytes is None):
raise Exception("Document的路径或filename、filebytes不能同时为空")
self.filebytes = filebytes
self.filename = filename
self.path = path
if path:
if isinstance(path,str):
path=Path(path)
self.path=path
self.filename=path.name
self.filebytes=path.read_bytes()
self.suffix=Path(self.filename).suffix
self.stem=Path(self.filename).stem
class Converter(Protocol):
    """Structural interface for objects that convert a Document to markdown."""

    def convert(self, document: Document) -> str:
        """Convert ``document`` to markdown and return it as text."""

    async def convert_async(self, document: Document) -> str:
        """Awaitable counterpart of ``convert``."""

View File

@@ -0,0 +1,12 @@
from typing import Protocol, runtime_checkable
from docutranslate.ir.document import Document
@runtime_checkable
class Converter(Protocol):
    """Structural interface for document-to-document converters.

    Decorated with ``runtime_checkable`` so ``isinstance(obj, Converter)``
    can be used; that check only verifies the methods exist, not their
    signatures.
    """

    def convert(self, document: Document) -> Document:
        """Convert ``document`` and return the resulting Document."""

    async def convert_async(self, document: Document) -> Document:
        """Awaitable counterpart of ``convert``."""

View File

View File

@@ -1,8 +1,9 @@
import asyncio
import logging
import os
import time
from dataclasses import dataclass
from io import BytesIO
from logging import Logger
from pathlib import Path
from docling.datamodel.base_models import InputFormat
@@ -13,34 +14,49 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
from docling_core.types.doc import ImageRefMode
from huggingface_hub.errors import LocalEntryNotFoundError
from docutranslate.converter import Converter, Document
from docutranslate.converter.x2md.interfaces import X2MarkdownConverter
from docutranslate.ir.document import Document
from docutranslate.ir.markdown_document import MarkdownDocument
from docutranslate.logger import global_logger
IMAGE_RESOLUTION_SCALE = 4
class ConverterDocling(Converter):
def __init__(self, code=True, formula=True, artifact=None, logger: logging.Logger | None = None):
self.code = code
self.formula = formula
self.artifact = artifact
self.logger = logger if logger else global_logger
@dataclass(frozen=True)
class ConverterDoclingConfig:
code: bool = True
formula: bool = True
artifact: Path | None = None
def convert(self, document):
assert isinstance(document.filename, str)
class ConverterDocling(X2MarkdownConverter):
def __init__(self, config: ConverterDoclingConfig, logger: Logger = global_logger):
    """Store the configuration and logger used by this converter.

    Args:
        config: Settings object; it is kept as ``self.config`` and its fields
            are also copied onto the instance for direct access.
        logger: Logger used for progress messages; defaults to the
            package-wide ``global_logger``.
    """
    self.logger = logger
    self.config = config
    # Mirror the individual settings so the rest of the class can read them
    # without going through ``self.config``.
    self.code, self.formula, self.artifact = config.code, config.formula, config.artifact
def convert(self, document) -> MarkdownDocument:
assert isinstance(document.name, str)
self.logger.info(f"正在将文档转换为markdown")
time1 = time.time()
document_stream = DocumentStream(name=document.filename, stream=BytesIO(document.filebytes))
result = self.file2markdown_embed_images(document_stream)
document_stream = DocumentStream(name=document.name, stream=BytesIO(document.content))
content = self.file2markdown_embed_images(document_stream)
self.logger.info(f"已转换为markdown耗时{time.time() - time1}")
return result
md_document = MarkdownDocument.from_bytes(content=content.encode("utf-8"), suffix=".md", stem=document.stem)
return md_document
async def convert_async(self, document: Document) -> str:
async def convert_async(self, document: Document) -> MarkdownDocument:
return await asyncio.to_thread(
self.convert,
document
)
def support_format(self) -> list[str]:
    """Return the file suffixes this converter accepts.

    Every entry includes the leading dot, matching the form produced by
    ``pathlib.Path.suffix``. The html, xhtml and csv entries were previously
    listed without the dot, so a suffix comparison could never match them.
    """
    return [".pdf", ".docx", ".pptx", ".xlsx", ".md", ".html", ".xhtml", ".csv", ".png", ".jpg", ".jpeg", ".tiff",
            ".bmp", ".webp"]
def file2markdown_embed_images(self, file_path: Path | str | DocumentStream) -> str:
pipeline_options = PdfPipelineOptions(artifacts_path=self.artifact)
pipeline_options.do_ocr = False

View File

@@ -0,0 +1,15 @@
from docutranslate.converter.x2md.interfaces import X2MarkdownConverter
from docutranslate.ir.document import Document
from docutranslate.ir.markdown_document import MarkdownDocument
class ConverterIdentity(X2MarkdownConverter):
    """Pass-through converter for input that is already markdown.

    The document's bytes are not transformed; they are only re-wrapped in a
    MarkdownDocument with suffix ``.md`` and the original stem.
    """

    def convert(self, document: Document) -> MarkdownDocument:
        """Wrap ``document.content`` unchanged in a MarkdownDocument."""
        markdown = MarkdownDocument.from_bytes(
            content=document.content,
            suffix=".md",
            stem=document.stem,
        )
        return markdown

    async def convert_async(self, document: Document) -> MarkdownDocument:
        """Awaitable variant of ``convert``; it performs the same wrapping inline."""
        markdown = MarkdownDocument.from_bytes(
            content=document.content,
            suffix=".md",
            stem=document.stem,
        )
        return markdown

    def support_format(self) -> list[str]:
        """Return the accepted suffixes: markdown only."""
        accepted = [".md"]
        return accepted

View File

@@ -1,31 +1,43 @@
import asyncio
import logging
import time
import zipfile
from dataclasses import dataclass
from logging import Logger
import httpx
from docutranslate.converter import Converter, Document
from docutranslate.converter.x2md.interfaces import X2MarkdownConverter
from docutranslate.ir.document import Document
from docutranslate.ir.markdown_document import MarkdownDocument
from docutranslate.logger import global_logger
from docutranslate.utils.markdown_utils import embed_inline_image_from_zip
URL = 'https://mineru.net/api/v4/file-urls/batch'
@dataclass(frozen=True)
class ConverterMineruConfig:
    """Immutable settings handed to ConverterMineru."""

    # API token for the MinerU service. Required; it is stored as given and
    # whitespace is stripped by the converter, not here.
    mineru_token: str
    # NOTE(review): presumably enables formula recognition in the MinerU
    # request — confirm against the request payload built by the converter.
    formula: bool = True
timeout = httpx.Timeout(
connect=5.0, # 连接超时 (建立连接的最长时间)
read=200.0, # 读取超时 (等待服务器响应的最长时间)
write=200.0, # 写入超时 (发送数据的最长时间)
pool=1.0 # 从连接池获取连接的超时时间
connect=5.0, # 连接超时 (建立连接的最长时间)
read=200.0, # 读取超时 (等待服务器响应的最长时间)
write=200.0, # 写入超时 (发送数据的最长时间)
pool=1.0 # 从连接池获取连接的超时时间
)
client = httpx.Client(trust_env=False, timeout=timeout, proxy=None, verify=False)
client_async = httpx.AsyncClient(trust_env=False, timeout=timeout, proxy=None, verify=False)
client = httpx.Client(trust_env=False,timeout=timeout,proxy=None,verify=False)
client_async=httpx.AsyncClient(trust_env=False,timeout=timeout,proxy=None,verify=False)
# TODO: 提供更详细的logger
class ConverterMineru(Converter):
def __init__(self, token: str, formula=True,logger:logging.Logger|None=None):
self.mineru_token = token.strip()
self.formula = formula
self.logger=logger if logger else global_logger
class ConverterMineru(X2MarkdownConverter):
def __init__(self, config: ConverterMineruConfig, logger: Logger = global_logger):
    """Store the configuration and logger used by this converter.

    Args:
        config: Settings object; it is kept as ``self.config`` and its fields
            are also copied onto the instance.
        logger: Logger used for progress messages; defaults to the
            package-wide ``global_logger``.
    """
    self.logger = logger
    self.config = config
    self.formula = config.formula
    # Strip surrounding whitespace so a token pasted with stray spaces or a
    # trailing newline is stored clean.
    self.mineru_token = config.mineru_token.strip()
def _get_header(self):
return {
@@ -39,7 +51,7 @@ class ConverterMineru(Converter):
"language": "auto",
"enable_table": True,
"files": [
{"name": f"{document.filename}", "is_ocr": True}
{"name": f"{document.name}", "is_ocr": True}
]
}
@@ -54,7 +66,7 @@ class ConverterMineru(Converter):
urls = result["data"]["file_urls"]
# print('batch_id:{},urls:{}'.format(batch_id, urls))
# 获取
res_upload = client.put(urls[0], content=document.filebytes)
res_upload = client.put(urls[0], content=document.content)
res_upload.raise_for_status()
# print(f"{urls[0]} upload success")
return batch_id
@@ -72,7 +84,7 @@ class ConverterMineru(Converter):
urls = result["data"]["file_urls"]
# print('batch_id:{},urls:{}'.format(batch_id, urls))
# 获取
res_upload = await client_async.put(urls[0], content=document.filebytes)
res_upload = await client_async.put(urls[0], content=document.content)
res_upload.raise_for_status()
# print(f"{urls[0]} upload success")
return batch_id
@@ -87,8 +99,8 @@ class ConverterMineru(Converter):
res.raise_for_status()
fileinfo = res.json()["data"]["extract_result"][0]
if fileinfo["state"] == "done":
fileurl = fileinfo["full_zip_url"]
return fileurl
file_url = fileinfo["full_zip_url"]
return file_url
else:
time.sleep(3)
@@ -100,36 +112,40 @@ class ConverterMineru(Converter):
res.raise_for_status()
fileinfo = res.json()["data"]["extract_result"][0]
if fileinfo["state"] == "done":
fileurl = fileinfo["full_zip_url"]
return fileurl
file_url = fileinfo["full_zip_url"]
return file_url
else:
await asyncio.sleep(3)
def convert(self, document: Document) -> str:
def convert(self, document: Document) -> MarkdownDocument:
self.logger.info(f"正在将文档转换为markdown")
time1 = time.time()
batch_id = self.upload(document)
file_url = self.get_file_url(batch_id)
result = get_md_from_zip_url_with_inline_images(zip_url=file_url)
content = get_md_from_zip_url_with_inline_images(zip_url=file_url)
self.logger.info(f"已转换为markdown耗时{time.time() - time1}")
return result
md_document = MarkdownDocument.from_bytes(content=content.encode("utf-8"), suffix=".md", stem=document.stem)
return md_document
async def convert_async(self, document: Document) -> str:
# 待优化
async def convert_async(self, document: Document) -> MarkdownDocument:
self.logger.info(f"正在将文档转换为markdown")
time1 = time.time()
batch_id = await self.upload_async(document)
file_url = await self.get_file_url_async(batch_id)
result = await get_md_from_zip_url_with_inline_images_async(zip_url=file_url)
content = await get_md_from_zip_url_with_inline_images_async(zip_url=file_url)
self.logger.info(f"已转换为markdown耗时{time.time() - time1}")
return result
md_document = MarkdownDocument.from_bytes(content=content.encode("utf-8"), suffix=".md", stem=document.stem)
return md_document
def support_format(self) -> list[str]:
    """Return the file suffixes (with leading dot) this converter accepts."""
    office_and_pdf = [".pdf", ".doc", ".docx", ".ppt", ".pptx"]
    images = [".png", ".jpg", ".jpeg"]
    return office_and_pdf + images
def get_md_from_zip_url_with_inline_images(
zip_url: str,
filename_in_zip: str = "full.md",
encoding: str = "utf-8"
) -> str | None:
) -> str:
"""
从给定的ZIP文件URL中下载并提取指定文件的内容
并将Markdown文件中的相对路径图片转换为内联Base64图片
@@ -152,7 +168,8 @@ def get_md_from_zip_url_with_inline_images(
except httpx.HTTPStatusError as e:
raise Exception(f"HTTP 错误 (httpx): {e.response.status_code} - {e.request.url}\n响应内容: {e.response.text[:200]}...")
raise Exception(
f"HTTP 错误 (httpx): {e.response.status_code} - {e.request.url}\n响应内容: {e.response.text[:200]}...")
except httpx.RequestError as e:
raise Exception(f"下载ZIP文件时发生错误 (httpx): {e}")
except zipfile.BadZipFile:
@@ -169,7 +186,7 @@ async def get_md_from_zip_url_with_inline_images_async(
zip_url: str,
filename_in_zip: str = "full.md",
encoding: str = "utf-8"
) -> str | None:
) -> str:
"""
从给定的ZIP文件URL中下载并提取指定文件的内容
并将Markdown文件中的相对路径图片转换为内联Base64图片
@@ -181,18 +198,20 @@ async def get_md_from_zip_url_with_inline_images_async(
encoding (str): 目标文件的预期编码默认为 "utf-8"
Returns:
str | None: 如果成功返回处理后的Markdown文本内容否则返回 None
str : 如果成功返回处理后的Markdown文本内容
"""
try:
print(f"正在从 {zip_url} 下载ZIP文件 (使用 httpx.get)...")
response = await client_async.get(zip_url) # 增加超时
response.raise_for_status()
print("ZIP文件下载完成。")
return await asyncio.to_thread(embed_inline_image_from_zip,response.content, filename_in_zip=filename_in_zip, encoding=encoding)
return await asyncio.to_thread(embed_inline_image_from_zip, response.content, filename_in_zip=filename_in_zip,
encoding=encoding)
except httpx.HTTPStatusError as e:
raise Exception(f"HTTP 错误 (httpx): {e.response.status_code} - {e.request.url}\n响应内容: {e.response.text[:200]}...")
raise Exception(
f"HTTP 错误 (httpx): {e.response.status_code} - {e.request.url}\n响应内容: {e.response.text[:200]}...")
except httpx.RequestError as e:
raise Exception(f"下载ZIP文件时发生错误 (httpx): {e}")
except zipfile.BadZipFile:
@@ -204,5 +223,6 @@ async def get_md_from_zip_url_with_inline_images_async(
traceback.print_exc() # 打印完整的堆栈跟踪,便于调试
raise Exception(f"发生未知错误: {e}")
if __name__ == '__main__':
pass

View File

@@ -0,0 +1,22 @@
from typing import runtime_checkable
from typing import Protocol
from docutranslate.converter.interfaces import Converter
from docutranslate.ir.document import Document
from docutranslate.ir.markdown_document import MarkdownDocument
@runtime_checkable
class X2MarkdownConverter(Converter, Protocol):
    """Structural interface for converters that turn other file formats into markdown.

    Narrows the base ``Converter`` protocol: both conversion methods return a
    MarkdownDocument, and implementers also declare which input suffixes they
    accept.
    """

    def convert(self, document: Document) -> MarkdownDocument:
        """Convert ``document`` and return the result as a MarkdownDocument."""

    async def convert_async(self, document: Document) -> MarkdownDocument:
        """Awaitable counterpart of ``convert``."""

    def support_format(self) -> list[str]:
        """Return the file suffixes this converter accepts."""