重构workflow

This commit is contained in:
xunbu
2025-07-30 20:48:11 +08:00
parent 8987e4ef60
commit d25f634e73
38 changed files with 351 additions and 286 deletions

View File

@@ -0,0 +1,29 @@
from abc import ABC, abstractmethod
from dataclasses import dataclass
from logging import Logger
from typing import Hashable
from docutranslate.ir.document import Document
from docutranslate.logger import global_logger
@dataclass(kw_only=True)
class ConverterConfig(ABC):
logger: Logger | None = None
@abstractmethod
def gethash(self)->Hashable:
...
class Converter(ABC):
    """Abstract base class for document converters.

    Subclasses must implement :meth:`convert`. :meth:`convert_async` is not
    abstract, so overriding it is optional.
    """

    def __init__(self, config: ConverterConfig | None = None):
        """Store the configuration and pick the logger to use.

        Args:
            config: Optional converter configuration. When it is ``None`` or
                its ``logger`` is unset, ``global_logger`` is used instead.
        """
        self.config = config
        # ``config`` defaults to None, so its ``logger`` attribute must not be
        # read unconditionally (that raised AttributeError for ``Converter()``).
        configured_logger = config.logger if config is not None else None
        self.logger = configured_logger or global_logger

    @abstractmethod
    def convert(self, document: Document) -> Document:
        """Convert ``document`` and return the resulting document."""
        ...

    async def convert_async(self, document: Document) -> Document:
        """Asynchronous counterpart of :meth:`convert`.

        The base implementation has an empty body, so it returns ``None``
        unless a subclass overrides it.
        """
        ...

View File

@@ -1,11 +0,0 @@
from typing import Protocol
from docutranslate.ir.document import Document
class Converter(Protocol):
    """Structural interface for objects that convert documents."""

    def convert(self, document: Document) -> Document:
        """Convert ``document`` and return a document."""

    async def convert_async(self, document: Document) -> Document:
        """Asynchronous counterpart of :meth:`convert`."""

View File

@@ -1,19 +1,28 @@
from typing import Protocol
from docutranslate.converter.interfaces import Converter
from abc import abstractmethod
from dataclasses import dataclass
from docutranslate.converter.base import Converter, ConverterConfig
from docutranslate.ir.document import Document
from docutranslate.ir.markdown_document import MarkdownDocument
@dataclass(kw_only=True)
class X2MarkdownConverterConfig(ConverterConfig):
    """Configuration base for X-to-markdown converters.

    Declares no fields of its own on top of ``ConverterConfig``.
    """
class X2MarkdownConverter(Converter,Protocol):
class X2MarkdownConverter(Converter):
    """
    Responsible for converting files of other formats to markdown.
    """

    @abstractmethod
    def convert(self, document: Document) -> MarkdownDocument:
        """Convert ``document`` and return it as a ``MarkdownDocument``."""

    @abstractmethod
    async def convert_async(self, document: Document) -> MarkdownDocument:
        """Asynchronous variant of :meth:`convert`."""

    @abstractmethod
    def support_format(self) -> list[str]:
        """Return the formats this converter supports, as strings.

        NOTE(review): presumably file suffixes or format names; confirm
        against the concrete implementations.
        """

View File

@@ -3,7 +3,6 @@ import os
import time
from dataclasses import dataclass
from io import BytesIO
from logging import Logger
from pathlib import Path
from docling.datamodel.base_models import InputFormat
@@ -14,34 +13,34 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
from docling_core.types.doc import ImageRefMode
from huggingface_hub.errors import LocalEntryNotFoundError
from docutranslate.converter.x2md.interfaces import X2MarkdownConverter
from docutranslate.converter.x2md.base import X2MarkdownConverter, X2MarkdownConverterConfig
from docutranslate.ir.document import Document
from docutranslate.ir.markdown_document import MarkdownDocument
from docutranslate.logger import global_logger
IMAGE_RESOLUTION_SCALE = 4
@dataclass(frozen=True)
class ConverterDoclingConfig:
@dataclass(kw_only=True)
class ConverterDoclingConfig(X2MarkdownConverterConfig):
    """Keyword-only configuration for ``ConverterDocling``."""

    code: bool = True
    formula: bool = True
    # Optional path; ConverterDocling falls back to it when the local
    # ./docling_artifact directory does not exist.
    artifact: Path | None = None

    def gethash(self):
        """Return ``(code, formula)``; ``artifact`` is not part of the value."""
        hash_key = (self.code, self.formula)
        return hash_key
class ConverterDocling(X2MarkdownConverter):
def __init__(self, config: ConverterDoclingConfig, logger: Logger = global_logger):
self.logger = logger
self.config = config
def __init__(self, config: ConverterDoclingConfig):
super().__init__(config=config)
self.code = config.code
self.formula = config.formula
artifact=Path("./docling_artifact")
artifact = Path("./docling_artifact")
if artifact.is_dir():
self.logger.info("使用./docling_artifact的本地模型")
self.artifact=artifact
self.artifact = artifact
else:
self.artifact=config.artifact
self.artifact = config.artifact
def convert(self, document) -> MarkdownDocument:
assert isinstance(document.name, str)

View File

@@ -1,10 +1,10 @@
from docutranslate.converter.x2md.interfaces import X2MarkdownConverter
from docutranslate.converter.x2md.base import X2MarkdownConverter
from docutranslate.ir.document import Document
from docutranslate.ir.markdown_document import MarkdownDocument
class ConverterIdentity(X2MarkdownConverter):
#TODO:支持markdown_zip格式输入
def convert(self, document: Document) -> MarkdownDocument:
return MarkdownDocument.from_bytes(content=document.content, suffix=".md", stem=document.stem)

View File

@@ -3,10 +3,11 @@ import time
import zipfile
from dataclasses import dataclass
from logging import Logger
from typing import Hashable
import httpx
from docutranslate.converter.x2md.interfaces import X2MarkdownConverter
from docutranslate.converter.x2md.base import X2MarkdownConverter, X2MarkdownConverterConfig
from docutranslate.ir.document import Document
from docutranslate.ir.markdown_document import MarkdownDocument
from docutranslate.logger import global_logger
@@ -15,11 +16,14 @@ from docutranslate.utils.markdown_utils import embed_inline_image_from_zip
URL = 'https://mineru.net/api/v4/file-urls/batch'
@dataclass(frozen=True)
class ConverterMineruConfig:
@dataclass(kw_only=True)
class ConverterMineruConfig(X2MarkdownConverterConfig):
    """Keyword-only configuration for ``ConverterMineru``."""

    # Required field (no default); must be passed by keyword.
    mineru_token: str
    formula: bool = True

    def gethash(self) -> Hashable:
        """Return ``formula`` only; ``mineru_token`` is left out of the value."""
        hash_value = self.formula
        return hash_value
timeout = httpx.Timeout(
connect=5.0, # 连接超时 (建立连接的最长时间)
@@ -34,7 +38,7 @@ client_async = httpx.AsyncClient(trust_env=False, timeout=timeout, proxy=None, v
class ConverterMineru(X2MarkdownConverter):
def __init__(self, config: ConverterMineruConfig, logger: Logger = global_logger):
self.config = config
super().__init__(config=config)
self.mineru_token = config.mineru_token.strip()
self.formula = config.formula
self.logger = logger