完成mdbasedmanager和txtmanager
This commit is contained in:
@@ -1,4 +1,7 @@
|
|||||||
|
from typing import Literal
|
||||||
|
|
||||||
from docutranslate.converter.x2md.converter_docling import ConverterDoclingConfig
|
from docutranslate.converter.x2md.converter_docling import ConverterDoclingConfig
|
||||||
from docutranslate.converter.x2md.converter_mineru import ConverterMineruConfig
|
from docutranslate.converter.x2md.converter_mineru import ConverterMineruConfig
|
||||||
|
|
||||||
|
convert_engin_type = Literal["mineru", "docling"]
|
||||||
x2md_convert_config_type = ConverterDoclingConfig | ConverterMineruConfig
|
x2md_convert_config_type = ConverterDoclingConfig | ConverterMineruConfig
|
||||||
@@ -5,7 +5,6 @@ import jinja2
|
|||||||
from docutranslate.exporter.export_config import ExportConfig
|
from docutranslate.exporter.export_config import ExportConfig
|
||||||
from docutranslate.exporter.txt2x.interfaces import TXTExporter
|
from docutranslate.exporter.txt2x.interfaces import TXTExporter
|
||||||
from docutranslate.ir.document import Document
|
from docutranslate.ir.document import Document
|
||||||
from docutranslate.ir.markdown_document import MarkdownDocument
|
|
||||||
from docutranslate.utils.resource_utils import resource_path
|
from docutranslate.utils.resource_utils import resource_path
|
||||||
|
|
||||||
|
|
||||||
@@ -19,7 +18,7 @@ class TXT2HTMLExporter(TXTExporter):
|
|||||||
export_config = export_config or TXT2HTMLExportConfig()
|
export_config = export_config or TXT2HTMLExportConfig()
|
||||||
self.cdn = export_config.cdn
|
self.cdn = export_config.cdn
|
||||||
|
|
||||||
def export(self, document: MarkdownDocument) -> Document:
|
def export(self, document: Document) -> Document:
|
||||||
cdn = self.cdn
|
cdn = self.cdn
|
||||||
html_template = resource_path("template/txt.html").read_text(encoding="utf-8")
|
html_template = resource_path("template/txt.html").read_text(encoding="utf-8")
|
||||||
|
|
||||||
|
|||||||
10
docutranslate/exporter/txt2x/txt2txt_exporter.py
Normal file
10
docutranslate/exporter/txt2x/txt2txt_exporter.py
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
from docutranslate.exporter.txt2x.interfaces import TXTExporter
|
||||||
|
from docutranslate.ir.document import Document
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class TXT2TXTExporter(TXTExporter):
    """Exporter for text documents that are emitted without any conversion."""

    def export(self, document: Document) -> Document:
        """Return the result of ``document.copy()`` as the exported document."""
        exported = document.copy()
        return exported
|
||||||
@@ -15,3 +15,5 @@ def conditional_import(packagename,alias=None):
|
|||||||
# print(f"package:{packagename}不存在")
|
# print(f"package:{packagename}不存在")
|
||||||
available_packages[packagename]=False
|
available_packages[packagename]=False
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
conditional_import("docling")
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
import copy
|
||||||
import dataclasses
|
import dataclasses
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
@@ -22,3 +23,6 @@ class Document:
|
|||||||
@classmethod
|
@classmethod
|
||||||
def from_bytes(cls,content:bytes,suffix:str,stem:str|None):
|
def from_bytes(cls,content:bytes,suffix:str,stem:str|None):
|
||||||
return cls(content=content,suffix=suffix,stem=stem)
|
return cls(content=content,suffix=suffix,stem=stem)
|
||||||
|
|
||||||
|
def copy(self):
    """Return a shallow copy of this document, created with ``copy.copy``."""
    duplicate = copy.copy(self)
    return duplicate
|
||||||
|
|||||||
@@ -16,13 +16,15 @@ class BaseManager(ABC, Generic[T_Translated]):
|
|||||||
self.document_original: Document | None = None
|
self.document_original: Document | None = None
|
||||||
self.document_translated: T_Translated | None = None
|
self.document_translated: T_Translated | None = None
|
||||||
|
|
||||||
def read_path(self, path: Path | str):
|
def read_path(self, path: Path | str) -> Self:
|
||||||
document = Document.from_path(path)
|
document = Document.from_path(path)
|
||||||
self.document_original = document
|
self.document_original = document
|
||||||
|
return self
|
||||||
|
|
||||||
def read_bytes(self, content: bytes, stem: str, suffix: str):
|
def read_bytes(self, content: bytes, stem: str, suffix: str) -> Self:
|
||||||
document = Document.from_bytes(content=content, stem=stem, suffix=suffix)
|
document = Document.from_bytes(content=content, stem=stem, suffix=suffix)
|
||||||
self.document_original = document
|
self.document_original = document
|
||||||
|
return self
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def translate(self, *args, **kwargs) -> Self:
|
def translate(self, *args, **kwargs) -> Self:
|
||||||
@@ -38,14 +40,11 @@ class BaseManager(ABC, Generic[T_Translated]):
|
|||||||
docu = exporter.export(self.document_translated)
|
docu = exporter.export(self.document_translated)
|
||||||
return docu
|
return docu
|
||||||
|
|
||||||
def _save(self, exporter: Exporter, name: str = None, out_put_dir: Path | str = "./output"):
|
def _save(self, exporter: Exporter, name: str = None, output_dir: Path | str = "./output"):
|
||||||
docu = self._export(exporter)
|
docu = self._export(exporter)
|
||||||
name = name or docu.name
|
name = name or docu.name
|
||||||
output_path = Path(out_put_dir) / Path(name)
|
output_path = Path(output_dir) / Path(name)
|
||||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
output_path.write_bytes(docu.content)
|
output_path.write_bytes(docu.content)
|
||||||
self.logger.info(f"文件已保存到{output_path.resolve()}")
|
self.logger.info(f"文件已保存到{output_path.resolve()}")
|
||||||
return self
|
return self
|
||||||
@abstractmethod
|
|
||||||
def support_export_format(self)->list[str]:
|
|
||||||
...
|
|
||||||
@@ -12,7 +12,7 @@ class HTMLExportable(Protocol):
|
|||||||
def export_to_html(self, export_config: T) -> str:
|
def export_to_html(self, export_config: T) -> str:
|
||||||
...
|
...
|
||||||
|
|
||||||
def save_as_html(self, name: str, out_put_dir: Path | str, export_config: T) -> Self:
|
def save_as_html(self, name: str, output_dir: Path | str, export_config: T) -> Self:
|
||||||
...
|
...
|
||||||
|
|
||||||
|
|
||||||
@@ -21,14 +21,14 @@ class MDExportable(Protocol):
|
|||||||
def export_to_markdown(self, export_config: T) -> str:
|
def export_to_markdown(self, export_config: T) -> str:
|
||||||
...
|
...
|
||||||
|
|
||||||
def save_as_markdown(self, name: str, out_put_dir: Path | str, export_config: T) -> Self:
|
def save_as_markdown(self, name: str, output_dir: Path | str, export_config: T) -> Self:
|
||||||
...
|
...
|
||||||
|
|
||||||
|
|
||||||
@runtime_checkable
|
@runtime_checkable
|
||||||
class TXTExportable(Protocol):
|
class TXTExportable(Protocol):
|
||||||
def export_to_txt(self, export_config: T) -> str:
|
def export_to_txt(self) -> str:
|
||||||
...
|
...
|
||||||
|
|
||||||
def save_as_txt(self, name: str, out_put_dir: Path | str, export_config: T) -> Self:
|
def save_as_txt(self, name: str, output_dir: Path | str) -> Self:
|
||||||
...
|
...
|
||||||
|
|||||||
@@ -6,20 +6,25 @@ from docutranslate.cacher import md_based_convert_cacher
|
|||||||
from docutranslate.converter.x2md.converter_docling import ConverterDoclingConfig, ConverterDocling
|
from docutranslate.converter.x2md.converter_docling import ConverterDoclingConfig, ConverterDocling
|
||||||
from docutranslate.converter.x2md.converter_identity import ConverterIdentity
|
from docutranslate.converter.x2md.converter_identity import ConverterIdentity
|
||||||
from docutranslate.converter.x2md.converter_mineru import ConverterMineruConfig, ConverterMineru
|
from docutranslate.converter.x2md.converter_mineru import ConverterMineruConfig, ConverterMineru
|
||||||
|
from docutranslate.converter.x2md.interfaces import X2MarkdownConverter
|
||||||
from docutranslate.exporter.md2x.md2html_exporter import MD2HTMLExportConfig, MD2HTMLExporter
|
from docutranslate.exporter.md2x.md2html_exporter import MD2HTMLExportConfig, MD2HTMLExporter
|
||||||
from docutranslate.exporter.md2x.md2md_exporter import MD2MDExportConfig, MD2MDExporter
|
from docutranslate.exporter.md2x.md2md_exporter import MD2MDExportConfig, MD2MDExporter
|
||||||
from docutranslate.exporter.md2x.types import x2md_convert_config_type
|
from docutranslate.exporter.md2x.types import x2md_convert_config_type, convert_engin_type
|
||||||
from docutranslate.manager.base_manager import BaseManager
|
from docutranslate.manager.base_manager import BaseManager
|
||||||
from docutranslate.manager.interfaces import HTMLExportable, MDExportable
|
from docutranslate.manager.interfaces import HTMLExportable, MDExportable
|
||||||
from docutranslate.translater.md_translator import MDTranslateConfig, MDTranslator
|
from docutranslate.translater.md_translator import MDTranslateConfig, MDTranslator
|
||||||
|
|
||||||
|
|
||||||
class MarkdownBasedManager(BaseManager, HTMLExportable, MDExportable):
|
class MarkdownBasedManager(BaseManager, HTMLExportable, MDExportable):
|
||||||
|
def __init__(self, *args, **kwargs):
    """Initialise the manager and register the available conversion engines.

    All positional and keyword arguments are forwarded unchanged to the
    parent initialiser.

    ``_converter_factory`` maps an engine name to a pair of
    ``(converter class, config class)``. Both members are classes, not
    instances: the converter class is instantiated later with a config
    object, and the config class is used for an ``isinstance`` check.
    """
    super().__init__(*args, **kwargs)
    # The annotation uses ``dict[K, V]`` with a comma; the previous
    # ``dict[str:tuple[...]]`` form was a slice, not a key/value pair.
    self._converter_factory: dict[
        str, tuple[type[X2MarkdownConverter], type[x2md_convert_config_type]]
    ] = {
        "mineru": (ConverterMineru, ConverterMineruConfig),
        "docling": (ConverterDocling, ConverterDoclingConfig),
    }
|
||||||
|
|
||||||
def support_export_format(self) -> list[str]:
|
def _get_document_md(self, convert_engin: convert_engin_type | None,
|
||||||
return [".md",".html",".zip"]
|
convert_config: x2md_convert_config_type | None):
|
||||||
|
|
||||||
def _get_document_md(self, convert_engin, convert_config):
|
|
||||||
if self.document_original is None:
|
if self.document_original is None:
|
||||||
raise RuntimeError("file has not been read yet. Call read_path or read_bytes first.")
|
raise RuntimeError("file has not been read yet. Call read_path or read_bytes first.")
|
||||||
# 获取缓存的解析后文件
|
# 获取缓存的解析后文件
|
||||||
@@ -29,16 +34,13 @@ class MarkdownBasedManager(BaseManager, HTMLExportable, MDExportable):
|
|||||||
if document_cached:
|
if document_cached:
|
||||||
document_md = document_cached
|
document_md = document_cached
|
||||||
else:
|
else:
|
||||||
if convert_engin is None:
|
if convert_engin is None or self.document_original.suffix == ".md":
|
||||||
converter = ConverterIdentity()
|
converter = ConverterIdentity()
|
||||||
elif convert_engin == "mineru":
|
elif convert_engin in self._converter_factory:
|
||||||
if not isinstance(convert_config, ConverterMineruConfig):
|
converter_class, config_class = self._converter_factory[convert_engin]
|
||||||
raise RuntimeError(f"未传入正确的convert_config,应传入{ConverterMineruConfig}")
|
if not isinstance(convert_config, config_class):
|
||||||
converter = ConverterMineru(convert_config, logger=self.logger)
|
raise TypeError(f"未传入正确的convert_config,应传入{config_class.__name__}类型")
|
||||||
elif convert_engin == "docling":
|
converter = converter_class(convert_config, logger=self.logger)
|
||||||
if not isinstance(convert_config, ConverterDoclingConfig):
|
|
||||||
raise RuntimeError(f"未传入正确的convert_config,应传入{ConverterDoclingConfig}")
|
|
||||||
converter = ConverterDocling(convert_config, logger=self.logger)
|
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"不存在{convert_engin}解析引擎")
|
raise ValueError(f"不存在{convert_engin}解析引擎")
|
||||||
document_md = converter.convert(self.document_original)
|
document_md = converter.convert(self.document_original)
|
||||||
@@ -48,7 +50,7 @@ class MarkdownBasedManager(BaseManager, HTMLExportable, MDExportable):
|
|||||||
|
|
||||||
@overload
|
@overload
|
||||||
def translate(self, convert_engin: None,
|
def translate(self, convert_engin: None,
|
||||||
convert_config: None, translate_config: MDTranslateConfig) -> Self:
|
convert_config: x2md_convert_config_type | None, translate_config: MDTranslateConfig) -> Self:
|
||||||
...
|
...
|
||||||
|
|
||||||
@overload
|
@overload
|
||||||
@@ -61,7 +63,7 @@ class MarkdownBasedManager(BaseManager, HTMLExportable, MDExportable):
|
|||||||
convert_config: ConverterMineruConfig, translate_config: MDTranslateConfig) -> Self:
|
convert_config: ConverterMineruConfig, translate_config: MDTranslateConfig) -> Self:
|
||||||
...
|
...
|
||||||
|
|
||||||
def translate(self, convert_engin: Literal["mineru", "docling"] | None,
|
def translate(self, convert_engin: convert_engin_type | None,
|
||||||
convert_config: x2md_convert_config_type | None,
|
convert_config: x2md_convert_config_type | None,
|
||||||
translate_config: MDTranslateConfig) -> Self:
|
translate_config: MDTranslateConfig) -> Self:
|
||||||
document_md = self._get_document_md(convert_engin, convert_config)
|
document_md = self._get_document_md(convert_engin, convert_config)
|
||||||
@@ -90,13 +92,13 @@ class MarkdownBasedManager(BaseManager, HTMLExportable, MDExportable):
|
|||||||
docu = self._export(MD2MDExporter(export_config))
|
docu = self._export(MD2MDExporter(export_config))
|
||||||
return docu.content.decode()
|
return docu.content.decode()
|
||||||
|
|
||||||
def save_as_html(self, name: str = None, out_put_dir: Path | str = "./output",
|
def save_as_html(self, name: str = None, output_dir: Path | str = "./output",
|
||||||
export_config: MD2HTMLExportConfig | None = None) -> Self:
|
export_config: MD2HTMLExportConfig | None = None) -> Self:
|
||||||
self._save(exporter=MD2HTMLExporter(export_config), name=name, out_put_dir=out_put_dir)
|
self._save(exporter=MD2HTMLExporter(export_config), name=name, output_dir=output_dir)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def save_as_markdown(self, name: str = None, out_put_dir: Path | str = "./output",
|
def save_as_markdown(self, name: str = None, output_dir: Path | str = "./output",
|
||||||
export_config: MD2MDExportConfig | None = None) -> Self:
|
export_config: MD2MDExportConfig | None = None) -> Self:
|
||||||
|
|
||||||
self._save(exporter=MD2MDExporter(export_config), name=name, out_put_dir=out_put_dir)
|
self._save(exporter=MD2MDExporter(export_config), name=name, output_dir=output_dir)
|
||||||
return self
|
return self
|
||||||
|
|||||||
@@ -1,34 +1,18 @@
|
|||||||
from copy import copy
|
|
||||||
from dataclasses import dataclass
|
|
||||||
from logging import Logger
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Self
|
from typing import Self
|
||||||
|
|
||||||
from docutranslate.exporter.txt2x.txt2html_exporter import TXT2HTMLExportConfig, TXT2HTMLExporter
|
from docutranslate.exporter.txt2x.txt2html_exporter import TXT2HTMLExportConfig, TXT2HTMLExporter
|
||||||
|
from docutranslate.exporter.txt2x.txt2txt_exporter import TXT2TXTExporter
|
||||||
from docutranslate.manager.base_manager import BaseManager
|
from docutranslate.manager.base_manager import BaseManager
|
||||||
from docutranslate.manager.interfaces import HTMLExportable
|
from docutranslate.manager.interfaces import HTMLExportable, TXTExportable
|
||||||
from docutranslate.translater.txt_translator import TXTTranslateConfig, TXTTranslator
|
from docutranslate.translater.txt_translator import TXTTranslateConfig, TXTTranslator
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class TXTManagerConfig:
|
|
||||||
chunk_size: int = 3000
|
|
||||||
base_url: str | None = None
|
|
||||||
api_key = None,
|
|
||||||
model_id: str | None = None
|
|
||||||
temperature = 0.7
|
|
||||||
concurrent: int = 30
|
|
||||||
timeout = 2000
|
|
||||||
cache = True
|
|
||||||
logger: Logger | None = None
|
|
||||||
|
|
||||||
|
class TXTManager(BaseManager, HTMLExportable,TXTExportable):
|
||||||
class TXTManager(BaseManager, HTMLExportable):
|
|
||||||
def support_export_format(self) -> list[str]:
|
|
||||||
return [".txt", ".html"]
|
|
||||||
|
|
||||||
def translate(self, translate_config: TXTTranslateConfig) -> Self:
|
def translate(self, translate_config: TXTTranslateConfig) -> Self:
|
||||||
document = copy(self.document_original)
|
document = self.document_original.copy()
|
||||||
# 翻译解析后文件
|
# 翻译解析后文件
|
||||||
translator = TXTTranslator(translate_config)
|
translator = TXTTranslator(translate_config)
|
||||||
translator.translate(document)
|
translator.translate(document)
|
||||||
@@ -36,7 +20,7 @@ class TXTManager(BaseManager, HTMLExportable):
|
|||||||
return self
|
return self
|
||||||
|
|
||||||
async def translate_async(self, translate_config: TXTTranslateConfig) -> Self:
|
async def translate_async(self, translate_config: TXTTranslateConfig) -> Self:
|
||||||
document = copy(self.document_original)
|
document = self.document_original.copy()
|
||||||
# 翻译解析后文件
|
# 翻译解析后文件
|
||||||
translator = TXTTranslator(translate_config)
|
translator = TXTTranslator(translate_config)
|
||||||
await translator.translate_async(document)
|
await translator.translate_async(document)
|
||||||
@@ -48,19 +32,14 @@ class TXTManager(BaseManager, HTMLExportable):
|
|||||||
return docu.content.decode()
|
return docu.content.decode()
|
||||||
|
|
||||||
def export_to_txt(self) -> str:
|
def export_to_txt(self) -> str:
|
||||||
if self.document_translated is None:
|
docu = self._export(TXT2TXTExporter())
|
||||||
raise RuntimeError("Document has not been translated yet. Call translate() first.")
|
return docu.content.decode()
|
||||||
return self.document_translated.content.decode()
|
|
||||||
|
|
||||||
def save_as_html(self, name: str = None, out_put_dir: Path | str = "./output",
|
def save_as_html(self, name: str = None, output_dir: Path | str = "./output",
|
||||||
export_config: TXT2HTMLExportConfig | None = None) -> Self:
|
export_config: TXT2HTMLExportConfig | None = None) -> Self:
|
||||||
self._save(exporter=TXT2HTMLExporter(export_config), name=name, out_put_dir=out_put_dir)
|
self._save(exporter=TXT2HTMLExporter(export_config), name=name, output_dir=output_dir)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def save_as_txt(self, name: str = None, out_put_dir: Path | str = "./output", ) -> Self:
|
def save_as_txt(self, name: str = None, output_dir: Path | str = "./output", ) -> Self:
|
||||||
name = name or self.document_translated.name
|
self._save(exporter=TXT2TXTExporter(), name=name, output_dir=output_dir)
|
||||||
output_path = Path(out_put_dir) / Path(name)
|
|
||||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
||||||
output_path.write_bytes(self.document_translated.content)
|
|
||||||
self.logger.info(f"文件已保存到{output_path.resolve()}")
|
|
||||||
return self
|
return self
|
||||||
|
|||||||
Reference in New Issue
Block a user