translater现在可以拥有独立logger
This commit is contained in:
@@ -1,3 +1,5 @@
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from io import BytesIO
|
||||
@@ -11,62 +13,26 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docling_core.types.doc import ImageRefMode
|
||||
from huggingface_hub.errors import LocalEntryNotFoundError
|
||||
|
||||
from docutranslate.logger import translater_logger
|
||||
|
||||
from docutranslate.converter import Converter, Document
|
||||
|
||||
import asyncio
|
||||
from docutranslate.logger import global_logger
|
||||
|
||||
# Value assigned to `pipeline_options.images_scale` when the PDF pipeline is configured.
IMAGE_RESOLUTION_SCALE = 4
|
||||
|
||||
|
||||
def file2markdown_embed_images(file_path: Path | str | DocumentStream, formula=False, code=False,
                               artifacts_path: Path | str | None = None) -> str:
    """Convert a document to Markdown with its images embedded.

    Args:
        file_path: Input handed unchanged to ``DocumentConverter.convert``.
        formula: When truthy, ``do_formula_enrichment`` is switched on.
        code: When truthy, ``do_code_enrichment`` is switched on.
        artifacts_path: Forwarded to ``PdfPipelineOptions(artifacts_path=...)``.

    Returns:
        The Markdown produced by ``export_to_markdown`` with
        ``ImageRefMode.EMBEDDED``.

    Raises:
        LocalEntryNotFoundError: If the retry after switching ``HF_ENDPOINT``
            fails with the same error.
    """
    options = PdfPipelineOptions(artifacts_path=artifacts_path)
    options.do_ocr = False
    options.images_scale = IMAGE_RESOLUTION_SCALE
    options.generate_picture_images = True
    options.table_structure_options.do_cell_matching = False
    if formula:
        options.do_formula_enrichment = True
    if code:
        options.do_code_enrichment = True

    # Turn on docling's pipeline timing profiling.
    settings.debug.profile_pipeline_timings = True

    doc_converter = DocumentConverter(
        format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=options)},
    )

    def _export_markdown() -> str:
        # One full convert + export pass; used for the first attempt and the retry.
        converted = doc_converter.convert(file_path)
        return converted.document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED)

    try:
        markdown = _export_markdown()
    except LocalEntryNotFoundError:
        # Point HF_ENDPOINT at the mirror and try exactly once more.
        # NOTE(review): HF_ENDPOINT is changed after huggingface_hub has been
        # imported -- confirm the library picks the new value up at this point.
        translater_logger.info(f"无法连接huggingface,正在尝试换源")
        os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
        markdown = _export_markdown()
    return markdown
|
||||
|
||||
|
||||
class ConverterDocling(Converter):
|
||||
def __init__(self, code=True, formula=True, artifact=None, logger: logging.Logger | None = None):
    """Store the conversion options and the logger used by this converter.

    Args:
        code: Stored as ``self.code``.
        formula: Stored as ``self.formula``.
        artifact: Stored as ``self.artifact``.
        logger: Logger for this instance. When omitted (or falsy) the
            shared ``global_logger`` is used instead.
    """
    self.code = code
    self.formula = formula
    self.artifact = artifact
    self.logger = logger if logger else global_logger
|
||||
|
||||
def convert(self, document):
    """Convert ``document`` to Markdown and log how long it took.

    The document bytes are wrapped in a ``DocumentStream`` and converted
    exactly once through ``self.file2markdown_embed_images``; progress is
    reported on ``self.logger`` only.

    Args:
        document: Object exposing ``filename`` (must be a ``str``) and
            ``filebytes``.

    Returns:
        The value returned by ``self.file2markdown_embed_images``.
    """
    assert isinstance(document.filename, str)
    self.logger.info(f"正在将文档转换为markdown")
    time1 = time.time()
    document_stream = DocumentStream(name=document.filename, stream=BytesIO(document.filebytes))
    result = self.file2markdown_embed_images(document_stream)
    self.logger.info(f"已转换为markdown,耗时{time.time() - time1}秒")
    return result
|
||||
|
||||
async def convert_async(self, document: Document) -> str:
|
||||
@@ -75,11 +41,43 @@ class ConverterDocling(Converter):
|
||||
document
|
||||
)
|
||||
|
||||
def set_config(self,cofig:dict):
|
||||
def file2markdown_embed_images(self, file_path: Path | str | DocumentStream) -> str:
    """Convert a document to Markdown with its images embedded.

    Reads ``self.artifact``, ``self.formula`` and ``self.code`` to configure
    the PDF pipeline, and reports the endpoint switch on ``self.logger``.

    Args:
        file_path: Input handed unchanged to ``DocumentConverter.convert``.

    Returns:
        The Markdown produced by ``export_to_markdown`` with
        ``ImageRefMode.EMBEDDED``.

    Raises:
        LocalEntryNotFoundError: If the retry after switching ``HF_ENDPOINT``
            fails with the same error.
    """
    options = PdfPipelineOptions(artifacts_path=self.artifact)
    options.do_ocr = False
    options.images_scale = IMAGE_RESOLUTION_SCALE
    options.generate_picture_images = True
    options.table_structure_options.do_cell_matching = False
    if self.formula:
        options.do_formula_enrichment = True
    if self.code:
        options.do_code_enrichment = True

    # Turn on docling's pipeline timing profiling.
    settings.debug.profile_pipeline_timings = True

    doc_converter = DocumentConverter(
        format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=options)},
    )

    def _export_markdown() -> str:
        # One full convert + export pass; used for the first attempt and the retry.
        converted = doc_converter.convert(file_path)
        return converted.document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED)

    try:
        markdown = _export_markdown()
    except LocalEntryNotFoundError:
        # Point HF_ENDPOINT at the mirror and try exactly once more.
        self.logger.info(f"无法连接huggingface,正在尝试换源")
        os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
        markdown = _export_markdown()
    return markdown
|
||||
|
||||
def set_config(self, cofig: dict):
    """Placeholder: accepts ``cofig`` but does nothing with it and returns ``None``."""
    return None
|
||||
|
||||
def get_config_list(self) -> list[str] | None:
    """Placeholder: has no implementation yet and always returns ``None``."""
    return None
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
pass
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
import asyncio
|
||||
import logging
|
||||
import time
|
||||
import zipfile
|
||||
import httpx
|
||||
from docutranslate.converter import Converter, Document
|
||||
from docutranslate.logger import translater_logger
|
||||
from docutranslate.logger import global_logger
|
||||
from docutranslate.utils.markdown_utils import embed_inline_image_from_zip
|
||||
|
||||
URL = 'https://mineru.net/api/v4/file-urls/batch'
|
||||
@@ -21,10 +22,11 @@ client = httpx.Client(trust_env=False,timeout=timeout,proxy=None,verify=False)
|
||||
|
||||
# TODO: 提供更详细的logger
|
||||
class ConverterMineru(Converter):
|
||||
def __init__(self, token: str, formula=True, logger: logging.Logger | None = None):
    """Store the API token, create the async HTTP client and pick a logger.

    Args:
        token: Token string; surrounding whitespace is stripped before it is
            stored as ``self.mineru_token``.
        formula: Stored as ``self.formula``.
        logger: Logger for this instance. When omitted (or falsy) the
            shared ``global_logger`` is used instead.
    """
    self.mineru_token = token.strip()
    # Uses the module-level `timeout` setting for the async client.
    self.client_async = httpx.AsyncClient(timeout=timeout)
    self.formula = formula
    self.logger = logger if logger else global_logger
|
||||
|
||||
def _get_header(self):
|
||||
return {
|
||||
@@ -74,12 +76,12 @@ class ConverterMineru(Converter):
|
||||
time.sleep(3)
|
||||
|
||||
def convert(self, document: Document) -> str:
    """Convert ``document`` to Markdown and log how long it took.

    Steps: upload the document, resolve the result URL for the returned
    batch id, then build the Markdown from the zip at that URL. Progress is
    reported once per step boundary on ``self.logger``.

    Args:
        document: Document passed to ``self.upload``.

    Returns:
        The value returned by ``get_md_from_zip_url_with_inline_images``.
    """
    self.logger.info(f"正在将文档转换为markdown")
    time1 = time.time()
    batch_id = self.upload(document)
    file_url = self.get_file_url(batch_id)
    result = get_md_from_zip_url_with_inline_images(zip_url=file_url)
    self.logger.info(f"已转换为markdown,耗时{time.time() - time1}秒")
    return result
|
||||
|
||||
# TODO: 实现细粒度更高的协程
|
||||
|
||||
Reference in New Issue
Block a user