重构代码,新增了MarkdownBasedManager和TXTManager实现

This commit is contained in:
xunbu
2025-07-28 23:41:35 +08:00
parent 6ab3278481
commit 80634fe749
45 changed files with 885 additions and 139 deletions

View File

View File

@@ -0,0 +1,93 @@
import asyncio
import os
import time
from dataclasses import dataclass
from io import BytesIO
from logging import Logger
from pathlib import Path
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import DocumentStream
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling_core.types.doc import ImageRefMode
from huggingface_hub.errors import LocalEntryNotFoundError
from docutranslate.converter.x2md.interfaces import X2MarkdownConverter
from docutranslate.ir.document import Document
from docutranslate.ir.markdown_document import MarkdownDocument
from docutranslate.logger import global_logger
IMAGE_RESOLUTION_SCALE = 4
@dataclass(frozen=True)
class ConverterDoclingConfig:
code: bool = True
formula: bool = True
artifact: Path | None = None
class ConverterDocling(X2MarkdownConverter):
def __init__(self, config: ConverterDoclingConfig, logger: Logger = global_logger):
self.config = config
self.code = config.code
self.formula = config.formula
self.artifact = config.artifact
self.logger = logger
def convert(self, document) -> MarkdownDocument:
assert isinstance(document.name, str)
self.logger.info(f"正在将文档转换为markdown")
time1 = time.time()
document_stream = DocumentStream(name=document.name, stream=BytesIO(document.content))
content = self.file2markdown_embed_images(document_stream)
self.logger.info(f"已转换为markdown耗时{time.time() - time1}")
md_document = MarkdownDocument.from_bytes(content=content.encode("utf-8"), suffix=".md", stem=document.stem)
return md_document
async def convert_async(self, document: Document) -> MarkdownDocument:
return await asyncio.to_thread(
self.convert,
document
)
def support_format(self) -> list[str]:
return [".pdf", ".docx", ".pptx", ".xlsx", ".md", "html", "xhtml", "csv", ".png", ".jpg", ".jpeg", ".tiff",
".bmp", ".webp"]
def file2markdown_embed_images(self, file_path: Path | str | DocumentStream) -> str:
pipeline_options = PdfPipelineOptions(artifacts_path=self.artifact)
pipeline_options.do_ocr = False
pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
pipeline_options.generate_picture_images = True
# pipeline_options.table_structure_options.mode = TableFormerMode.FAST
pipeline_options.table_structure_options.do_cell_matching = False
if self.formula:
pipeline_options.do_formula_enrichment = True
if self.code:
pipeline_options.do_code_enrichment = True
# pipeline_options.accelerator_options= AcceleratorOptions(
# num_threads=4, device=AcceleratorDevice.AUTO
# )
# 打印时间
settings.debug.profile_pipeline_timings = True
converter = DocumentConverter(format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
})
try:
conversion_result = converter.convert(file_path)
result = conversion_result.document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED)
except LocalEntryNotFoundError:
self.logger.info(f"无法连接huggingface正在尝试换源")
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
conversion_result = converter.convert(file_path)
result = conversion_result.document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED)
# translater_logger.info(f"docling转换耗时: {conversion_result.timings["pipeline_total"].times}")
return result
if __name__ == '__main__':
pass

View File

@@ -0,0 +1,15 @@
from docutranslate.converter.x2md.interfaces import X2MarkdownConverter
from docutranslate.ir.document import Document
from docutranslate.ir.markdown_document import MarkdownDocument
class ConverterIdentity(X2MarkdownConverter):
def convert(self, document: Document) -> MarkdownDocument:
return MarkdownDocument.from_bytes(content=document.content, suffix=".md", stem=document.stem)
async def convert_async(self, document: Document) -> MarkdownDocument:
return MarkdownDocument.from_bytes(content=document.content, suffix=".md", stem=document.stem)
def support_format(self) -> list[str]:
return [".md"]

View File

@@ -0,0 +1,228 @@
import asyncio
import time
import zipfile
from dataclasses import dataclass
from logging import Logger
import httpx
from docutranslate.converter.x2md.interfaces import X2MarkdownConverter
from docutranslate.ir.document import Document
from docutranslate.ir.markdown_document import MarkdownDocument
from docutranslate.logger import global_logger
from docutranslate.utils.markdown_utils import embed_inline_image_from_zip
URL = 'https://mineru.net/api/v4/file-urls/batch'
@dataclass(frozen=True)
class ConverterMineruConfig:
mineru_token: str
formula: bool = True
timeout = httpx.Timeout(
connect=5.0, # 连接超时 (建立连接的最长时间)
read=200.0, # 读取超时 (等待服务器响应的最长时间)
write=200.0, # 写入超时 (发送数据的最长时间)
pool=1.0 # 从连接池获取连接的超时时间
)
client = httpx.Client(trust_env=False, timeout=timeout, proxy=None, verify=False)
client_async = httpx.AsyncClient(trust_env=False, timeout=timeout, proxy=None, verify=False)
class ConverterMineru(X2MarkdownConverter):
def __init__(self, config: ConverterMineruConfig, logger: Logger = global_logger):
self.config = config
self.mineru_token = config.mineru_token.strip()
self.formula = config.formula
self.logger = logger
def _get_header(self):
return {
'Content-Type': 'application/json',
"Authorization": f"Bearer {self.mineru_token}"
}
def _get_upload_data(self, document: Document):
return {
"enable_formula": self.formula,
"language": "auto",
"enable_table": True,
"files": [
{"name": f"{document.name}", "is_ocr": True}
]
}
def upload(self, document: Document):
# 获取上传链接
response = client.post(URL, headers=self._get_header(), json=self._get_upload_data(document))
response.raise_for_status()
result = response.json()
# print('response success. result:{}'.format(result))
if result["code"] == 0:
batch_id = result["data"]["batch_id"]
urls = result["data"]["file_urls"]
# print('batch_id:{},urls:{}'.format(batch_id, urls))
# 获取
res_upload = client.put(urls[0], content=document.content)
res_upload.raise_for_status()
# print(f"{urls[0]} upload success")
return batch_id
else:
raise Exception('apply upload url failed,reason:{}'.format(result))
async def upload_async(self, document: Document):
# 获取上传链接
response = await client_async.post(URL, headers=self._get_header(), json=self._get_upload_data(document))
response.raise_for_status()
result = response.json()
# print('response success. result:{}'.format(result))
if result["code"] == 0:
batch_id = result["data"]["batch_id"]
urls = result["data"]["file_urls"]
# print('batch_id:{},urls:{}'.format(batch_id, urls))
# 获取
res_upload = await client_async.put(urls[0], content=document.content)
res_upload.raise_for_status()
# print(f"{urls[0]} upload success")
return batch_id
else:
raise Exception('apply upload url failed,reason:{}'.format(result))
def get_file_url(self, batch_id: str) -> str:
while True:
url = f'https://mineru.net/api/v4/extract-results/batch/{batch_id}'
header = self._get_header()
res = client.get(url, headers=header)
res.raise_for_status()
fileinfo = res.json()["data"]["extract_result"][0]
if fileinfo["state"] == "done":
file_url = fileinfo["full_zip_url"]
return file_url
else:
time.sleep(3)
async def get_file_url_async(self, batch_id: str) -> str:
while True:
url = f'https://mineru.net/api/v4/extract-results/batch/{batch_id}'
header = self._get_header()
res = await client_async.get(url, headers=header)
res.raise_for_status()
fileinfo = res.json()["data"]["extract_result"][0]
if fileinfo["state"] == "done":
file_url = fileinfo["full_zip_url"]
return file_url
else:
await asyncio.sleep(3)
def convert(self, document: Document) -> MarkdownDocument:
self.logger.info(f"正在将文档转换为markdown")
time1 = time.time()
batch_id = self.upload(document)
file_url = self.get_file_url(batch_id)
content = get_md_from_zip_url_with_inline_images(zip_url=file_url)
self.logger.info(f"已转换为markdown耗时{time.time() - time1}")
md_document = MarkdownDocument.from_bytes(content=content.encode("utf-8"), suffix=".md", stem=document.stem)
return md_document
async def convert_async(self, document: Document) -> MarkdownDocument:
self.logger.info(f"正在将文档转换为markdown")
time1 = time.time()
batch_id = await self.upload_async(document)
file_url = await self.get_file_url_async(batch_id)
content = await get_md_from_zip_url_with_inline_images_async(zip_url=file_url)
self.logger.info(f"已转换为markdown耗时{time.time() - time1}")
md_document = MarkdownDocument.from_bytes(content=content.encode("utf-8"), suffix=".md", stem=document.stem)
return md_document
def support_format(self) -> list[str]:
return [".pdf", ".doc", ".docx", ".ppt", ".pptx", ".png", ".jpg", ".jpeg"]
def get_md_from_zip_url_with_inline_images(
zip_url: str,
filename_in_zip: str = "full.md",
encoding: str = "utf-8"
) -> str:
"""
从给定的ZIP文件URL中下载并提取指定文件的内容
并将Markdown文件中的相对路径图片转换为内联Base64图片。
Args:
zip_url (str): ZIP文件的下载链接。
filename_in_zip (str): ZIP压缩包内目标Markdown文件的名称包括路径
默认为 "full.md"
encoding (str): 目标文件的预期编码。默认为 "utf-8"
Returns:
str | None: 如果成功返回处理后的Markdown文本内容否则返回 None。
"""
try:
print(f"正在从 {zip_url} 下载ZIP文件 (使用 httpx.get)...")
response = client.get(zip_url) # 增加超时
response.raise_for_status()
print("ZIP文件下载完成。")
return embed_inline_image_from_zip(response.content, filename_in_zip=filename_in_zip, encoding=encoding)
except httpx.HTTPStatusError as e:
raise Exception(
f"HTTP 错误 (httpx): {e.response.status_code} - {e.request.url}\n响应内容: {e.response.text[:200]}...")
except httpx.RequestError as e:
raise Exception(f"下载ZIP文件时发生错误 (httpx): {e}")
except zipfile.BadZipFile:
raise Exception("错误: 下载的文件不是一个有效的ZIP压缩文件或已损坏。")
except UnicodeDecodeError:
raise Exception(f"错误: 无法使用 '{encoding}' 编码解码文件 '{filename_in_zip}' 的内容。")
except Exception as e:
import traceback
traceback.print_exc() # 打印完整的堆栈跟踪,便于调试
raise Exception(f"发生未知错误: {e}")
async def get_md_from_zip_url_with_inline_images_async(
zip_url: str,
filename_in_zip: str = "full.md",
encoding: str = "utf-8"
) -> str:
"""
从给定的ZIP文件URL中下载并提取指定文件的内容
并将Markdown文件中的相对路径图片转换为内联Base64图片。
Args:
zip_url (str): ZIP文件的下载链接。
filename_in_zip (str): ZIP压缩包内目标Markdown文件的名称包括路径
默认为 "full.md"
encoding (str): 目标文件的预期编码。默认为 "utf-8"
Returns:
str : 如果成功返回处理后的Markdown文本内容。
"""
try:
print(f"正在从 {zip_url} 下载ZIP文件 (使用 httpx.get)...")
response = await client_async.get(zip_url) # 增加超时
response.raise_for_status()
print("ZIP文件下载完成。")
return await asyncio.to_thread(embed_inline_image_from_zip, response.content, filename_in_zip=filename_in_zip,
encoding=encoding)
except httpx.HTTPStatusError as e:
raise Exception(
f"HTTP 错误 (httpx): {e.response.status_code} - {e.request.url}\n响应内容: {e.response.text[:200]}...")
except httpx.RequestError as e:
raise Exception(f"下载ZIP文件时发生错误 (httpx): {e}")
except zipfile.BadZipFile:
raise Exception("错误: 下载的文件不是一个有效的ZIP压缩文件或已损坏。")
except UnicodeDecodeError:
raise Exception(f"错误: 无法使用 '{encoding}' 编码解码文件 '{filename_in_zip}' 的内容。")
except Exception as e:
import traceback
traceback.print_exc() # 打印完整的堆栈跟踪,便于调试
raise Exception(f"发生未知错误: {e}")
if __name__ == '__main__':
pass

View File

@@ -0,0 +1,22 @@
from typing import runtime_checkable
from typing import Protocol
from docutranslate.converter.interfaces import Converter
from docutranslate.ir.document import Document
from docutranslate.ir.markdown_document import MarkdownDocument
@runtime_checkable
class X2MarkdownConverter(Converter,Protocol):
"""
负责将其它格式的文件转换为markdown
"""
def convert(self, document: Document) -> MarkdownDocument:
...
async def convert_async(self, document: Document) -> MarkdownDocument:
...
def support_format(self)->list[str]:
...