重构workflow
This commit is contained in:
@@ -1,19 +1,28 @@
|
||||
from typing import Protocol
|
||||
from docutranslate.converter.interfaces import Converter
|
||||
from abc import abstractmethod
|
||||
from dataclasses import dataclass
|
||||
|
||||
from docutranslate.converter.base import Converter, ConverterConfig
|
||||
from docutranslate.ir.document import Document
|
||||
from docutranslate.ir.markdown_document import MarkdownDocument
|
||||
|
||||
@dataclass(kw_only=True)
class X2MarkdownConverterConfig(ConverterConfig):
    """Base configuration for converters that produce markdown.

    Declares no fields of its own; concrete converter configs subclass it
    and add their options. ``kw_only=True`` makes every field declared by
    this dataclass and its dataclass subclasses that repeat the flag a
    keyword-only ``__init__`` argument.
    """

    ...
|
||||
|
||||
|
||||
class X2MarkdownConverter(Converter):
    """Base class for converters that turn files of other formats into markdown.

    Subclasses are expected to implement every method below.

    NOTE(review): ``@abstractmethod`` only blocks instantiation when the
    class's metaclass derives from ``ABCMeta``; the definition of
    ``Converter`` is not visible here -- confirm it is an ABC.
    """

    @abstractmethod
    def convert(self, document: Document) -> MarkdownDocument:
        """Convert ``document`` to a ``MarkdownDocument`` synchronously."""
        ...

    @abstractmethod
    async def convert_async(self, document: Document) -> MarkdownDocument:
        """Coroutine counterpart of ``convert``."""
        ...

    @abstractmethod
    def support_format(self) -> list[str]:
        """Return the formats this converter supports, as a list of strings.

        NOTE(review): the exact form of each entry (suffix with or without a
        leading dot, format name, ...) is not visible here -- confirm against
        the implementations.
        """
        ...
|
||||
@@ -3,7 +3,6 @@ import os
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from io import BytesIO
|
||||
from logging import Logger
|
||||
from pathlib import Path
|
||||
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
@@ -14,34 +13,34 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docling_core.types.doc import ImageRefMode
|
||||
from huggingface_hub.errors import LocalEntryNotFoundError
|
||||
|
||||
from docutranslate.converter.x2md.interfaces import X2MarkdownConverter
|
||||
from docutranslate.converter.x2md.base import X2MarkdownConverter, X2MarkdownConverterConfig
|
||||
from docutranslate.ir.document import Document
|
||||
from docutranslate.ir.markdown_document import MarkdownDocument
|
||||
from docutranslate.logger import global_logger
|
||||
|
||||
# Image resolution scale factor.
# NOTE(review): presumably passed to docling's image/pipeline options; the use
# site is not visible in this excerpt -- confirm.
IMAGE_RESOLUTION_SCALE = 4
|
||||
|
||||
|
||||
@dataclass(kw_only=True)
class ConverterDoclingConfig(X2MarkdownConverterConfig):
    """Options for ``ConverterDocling``.

    All fields are keyword-only (``kw_only=True``).
    """

    # Toggle copied onto the converter as ``self.code``.
    # NOTE(review): presumably enables docling's code enrichment -- confirm.
    code: bool = True
    # Toggle copied onto the converter as ``self.formula``.
    # NOTE(review): presumably enables docling's formula enrichment -- confirm.
    formula: bool = True
    # Optional path used as the model artifact location when the local
    # ``./docling_artifact`` directory does not exist.
    artifact: Path | None = None

    def gethash(self) -> tuple[bool, bool]:
        """Return a hashable key made of ``(code, formula)``.

        ``artifact`` is deliberately not part of the key as written.
        NOTE(review): presumably used to key cached conversion results --
        confirm against the callers.
        """
        return self.code, self.formula
|
||||
|
||||
|
||||
class ConverterDocling(X2MarkdownConverter):
|
||||
def __init__(self, config: ConverterDoclingConfig):
    """Initialise the converter from ``config``.

    Copies the ``code`` and ``formula`` toggles from the config and picks
    the artifact location: a ``./docling_artifact`` directory relative to
    the current working directory wins when it exists, otherwise
    ``config.artifact`` (which may be ``None``) is used.

    NOTE(review): ``self.logger`` is not assigned here; it is expected to be
    provided by the base-class initializer, which is not visible in this
    excerpt -- confirm.
    """
    super().__init__(config=config)
    self.code = config.code
    self.formula = config.formula
    artifact = Path("./docling_artifact")
    if artifact.is_dir():
        # A local model directory takes priority over the configured path.
        self.logger.info("使用./docling_artifact的本地模型")
        self.artifact = artifact
    else:
        self.artifact = config.artifact
|
||||
|
||||
def convert(self, document) -> MarkdownDocument:
|
||||
assert isinstance(document.name, str)
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
from docutranslate.converter.x2md.interfaces import X2MarkdownConverter
|
||||
from docutranslate.converter.x2md.base import X2MarkdownConverter
|
||||
from docutranslate.ir.document import Document
|
||||
from docutranslate.ir.markdown_document import MarkdownDocument
|
||||
|
||||
|
||||
class ConverterIdentity(X2MarkdownConverter):
    """Pass-through converter for input that is already markdown.

    The document bytes are wrapped unchanged in a ``MarkdownDocument``.
    """

    # TODO: support markdown_zip format input
    def convert(self, document: Document) -> MarkdownDocument:
        """Wrap ``document.content`` as markdown, keeping ``document.stem``.

        The result is always given the ``.md`` suffix.
        """
        return MarkdownDocument.from_bytes(content=document.content, suffix=".md", stem=document.stem)
|
||||
|
||||
|
||||
@@ -3,10 +3,11 @@ import time
|
||||
import zipfile
|
||||
from dataclasses import dataclass
|
||||
from logging import Logger
|
||||
from typing import Hashable
|
||||
|
||||
import httpx
|
||||
|
||||
from docutranslate.converter.x2md.interfaces import X2MarkdownConverter
|
||||
from docutranslate.converter.x2md.base import X2MarkdownConverter, X2MarkdownConverterConfig
|
||||
from docutranslate.ir.document import Document
|
||||
from docutranslate.ir.markdown_document import MarkdownDocument
|
||||
from docutranslate.logger import global_logger
|
||||
@@ -15,11 +16,14 @@ from docutranslate.utils.markdown_utils import embed_inline_image_from_zip
|
||||
# MinerU v4 API endpoint (path ``file-urls/batch``).
# NOTE(review): presumably used to request batch upload URLs for files; the
# call site is not visible in this excerpt -- confirm.
URL = 'https://mineru.net/api/v4/file-urls/batch'
|
||||
|
||||
|
||||
@dataclass(kw_only=True)
class ConverterMineruConfig(X2MarkdownConverterConfig):
    """Options for ``ConverterMineru``.

    All fields are keyword-only (``kw_only=True``); ``mineru_token`` has no
    default and must be supplied.
    """

    # API token; the converter strips surrounding whitespace before use.
    mineru_token: str
    # Toggle copied onto the converter as ``self.formula``.
    # NOTE(review): presumably enables formula recognition in the MinerU
    # request -- confirm.
    formula: bool = True

    def gethash(self) -> Hashable:
        """Return a hashable key for this config: the ``formula`` flag only.

        ``mineru_token`` is not part of the key as written.
        NOTE(review): presumably used to key cached conversion results --
        confirm against the callers.
        """
        return self.formula
|
||||
|
||||
|
||||
timeout = httpx.Timeout(
|
||||
connect=5.0, # 连接超时 (建立连接的最长时间)
|
||||
@@ -34,7 +38,7 @@ client_async = httpx.AsyncClient(trust_env=False, timeout=timeout, proxy=None, v
|
||||
|
||||
class ConverterMineru(X2MarkdownConverter):
|
||||
def __init__(self, config: ConverterMineruConfig, logger: Logger = global_logger):
    """Initialise the converter from ``config``.

    Args:
        config: MinerU options; the token is stored with surrounding
            whitespace stripped.
        logger: Logger stored on the instance; defaults to the
            module-level ``global_logger``.
    """
    super().__init__(config=config)
    self.mineru_token = config.mineru_token.strip()
    self.formula = config.formula
    # Assigned after the base initializer, so this value is the one that
    # remains on the instance.
    self.logger = logger
|
||||
|
||||
Reference in New Issue
Block a user