修改code\formula为code_ocr\formula_ocr

This commit is contained in:
xunbu
2025-07-31 10:23:10 +08:00
parent cc8a340461
commit fe2c199ee7
5 changed files with 23 additions and 19 deletions

View File

@@ -307,12 +307,12 @@ async def _perform_translation(
raise ValueError("使用 'mineru' 引擎需要提供 'mineru_token'") raise ValueError("使用 'mineru' 引擎需要提供 'mineru_token'")
convert_config = ConverterMineruConfig( convert_config = ConverterMineruConfig(
mineru_token=payload.mineru_token, mineru_token=payload.mineru_token,
formula=payload.formula_ocr formula_ocr=payload.formula_ocr
) )
elif payload.convert_engin == 'docling': elif payload.convert_engin == 'docling':
convert_config = ConverterDoclingConfig( convert_config = ConverterDoclingConfig(
code=payload.code_ocr, code_ocr=payload.code_ocr,
formula=payload.formula_ocr formula_ocr=payload.formula_ocr
) )
await workflow.translate_async( await workflow.translate_async(

View File

@@ -1,5 +1,6 @@
from abc import abstractmethod from abc import abstractmethod
from dataclasses import dataclass from dataclasses import dataclass
from typing import Hashable
from docutranslate.converter.base import Converter, ConverterConfig from docutranslate.converter.base import Converter, ConverterConfig
from docutranslate.ir.document import Document from docutranslate.ir.document import Document
@@ -8,7 +9,9 @@ from docutranslate.ir.markdown_document import MarkdownDocument
@dataclass(kw_only=True) @dataclass(kw_only=True)
class X2MarkdownConverterConfig(ConverterConfig): class X2MarkdownConverterConfig(ConverterConfig):
... ...
@abstractmethod
def gethash(self) ->Hashable:
...
class X2MarkdownConverter(Converter): class X2MarkdownConverter(Converter):
""" """

View File

@@ -22,19 +22,19 @@ IMAGE_RESOLUTION_SCALE = 4
@dataclass(kw_only=True) @dataclass(kw_only=True)
class ConverterDoclingConfig(X2MarkdownConverterConfig): class ConverterDoclingConfig(X2MarkdownConverterConfig):
code: bool = True code_ocr: bool = True
formula: bool = True formula_ocr: bool = True
artifact: Path | None = None artifact: Path | None = None
def gethash(self): def gethash(self):
return self.code,self.formula return self.code_ocr,self.formula_ocr
class ConverterDocling(X2MarkdownConverter): class ConverterDocling(X2MarkdownConverter):
def __init__(self, config: ConverterDoclingConfig): def __init__(self, config: ConverterDoclingConfig):
super().__init__(config=config) super().__init__(config=config)
self.code = config.code self.code = config.code_ocr
self.formula = config.formula self.formula = config.formula_ocr
artifact = Path("./docling_artifact") artifact = Path("./docling_artifact")
if artifact.is_dir(): if artifact.is_dir():
self.logger.info("使用./docling_artifact的本地模型") self.logger.info("使用./docling_artifact的本地模型")

View File

@@ -19,10 +19,10 @@ URL = 'https://mineru.net/api/v4/file-urls/batch'
@dataclass(kw_only=True) @dataclass(kw_only=True)
class ConverterMineruConfig(X2MarkdownConverterConfig): class ConverterMineruConfig(X2MarkdownConverterConfig):
mineru_token: str mineru_token: str
formula: bool = True formula_ocr: bool = True
def gethash(self) ->Hashable: def gethash(self) ->Hashable:
return self.formula return self.formula_ocr
timeout = httpx.Timeout( timeout = httpx.Timeout(
@@ -40,7 +40,7 @@ class ConverterMineru(X2MarkdownConverter):
def __init__(self, config: ConverterMineruConfig, logger: Logger = global_logger): def __init__(self, config: ConverterMineruConfig, logger: Logger = global_logger):
super().__init__(config=config) super().__init__(config=config)
self.mineru_token = config.mineru_token.strip() self.mineru_token = config.mineru_token.strip()
self.formula = config.formula self.formula = config.formula_ocr
self.logger = logger self.logger = logger
def _get_header(self): def _get_header(self):

View File

@@ -26,20 +26,21 @@ class TXTWorkflow(Workflow[TXTWorkflowConfig, Document, Document], HTMLExportabl
if sub_config: if sub_config:
sub_config.logger = config.logger sub_config.logger = config.logger
def translate(self) -> Self: def _pre_translate(self,document_original:Document):
document = document_original.copy()
translate_config = self.config.translator_config translate_config = self.config.translator_config
document = self.document_original.copy()
# 翻译解析后文件
translator = TXTTranslator(translate_config) translator = TXTTranslator(translate_config)
return document,translator
def translate(self) -> Self:
document, translator=self._pre_translate(self.document_original)
translator.translate(document) translator.translate(document)
self.document_translated = document self.document_translated = document
return self return self
async def translate_async(self) -> Self: async def translate_async(self) -> Self:
translate_config = self.config.translator_config document, translator = self._pre_translate(self.document_original)
document = self.document_original.copy()
# 翻译解析后文件
translator = TXTTranslator(translate_config)
await translator.translate_async(document) await translator.translate_async(document)
self.document_translated = document self.document_translated = document
return self return self