diff --git a/docutranslate/exporter/md2x/types.py b/docutranslate/exporter/md2x/types.py index e58d790..623e903 100644 --- a/docutranslate/exporter/md2x/types.py +++ b/docutranslate/exporter/md2x/types.py @@ -1,4 +1,7 @@ +from typing import Literal + from docutranslate.converter.x2md.converter_docling import ConverterDoclingConfig from docutranslate.converter.x2md.converter_mineru import ConverterMineruConfig -x2md_convert_config_type=ConverterDoclingConfig | ConverterMineruConfig \ No newline at end of file +convert_engin_type = Literal["mineru", "docling"] +x2md_convert_config_type = ConverterDoclingConfig | ConverterMineruConfig diff --git a/docutranslate/exporter/txt2x/txt2html_exporter.py b/docutranslate/exporter/txt2x/txt2html_exporter.py index 4100846..c7f92ac 100644 --- a/docutranslate/exporter/txt2x/txt2html_exporter.py +++ b/docutranslate/exporter/txt2x/txt2html_exporter.py @@ -5,7 +5,6 @@ import jinja2 from docutranslate.exporter.export_config import ExportConfig from docutranslate.exporter.txt2x.interfaces import TXTExporter from docutranslate.ir.document import Document -from docutranslate.ir.markdown_document import MarkdownDocument from docutranslate.utils.resource_utils import resource_path @@ -19,7 +18,7 @@ class TXT2HTMLExporter(TXTExporter): export_config = export_config or TXT2HTMLExportConfig() self.cdn = export_config.cdn - def export(self, document: MarkdownDocument) -> Document: + def export(self, document: Document) -> Document: cdn = self.cdn html_template = resource_path("template/txt.html").read_text(encoding="utf-8") diff --git a/docutranslate/exporter/txt2x/txt2txt_exporter.py b/docutranslate/exporter/txt2x/txt2txt_exporter.py new file mode 100644 index 0000000..9d46e29 --- /dev/null +++ b/docutranslate/exporter/txt2x/txt2txt_exporter.py @@ -0,0 +1,10 @@ +from docutranslate.exporter.txt2x.interfaces import TXTExporter +from docutranslate.ir.document import Document + + + + + +class TXT2TXTExporter(TXTExporter): + def export(self, document: Document) -> Document: + return document.copy() diff --git a/docutranslate/global_values/conditional_import.py b/docutranslate/global_values/conditional_import.py index 19da421..02a4f2a 100644 --- a/docutranslate/global_values/conditional_import.py +++ b/docutranslate/global_values/conditional_import.py @@ -15,3 +15,5 @@ def conditional_import(packagename,alias=None): # print(f"package:{packagename}不存在") available_packages[packagename]=False return False + +conditional_import("docling") diff --git a/docutranslate/ir/document.py b/docutranslate/ir/document.py index 327a101..0b1ea5e 100644 --- a/docutranslate/ir/document.py +++ b/docutranslate/ir/document.py @@ -1,3 +1,4 @@ +import copy import dataclasses from pathlib import Path @@ -22,3 +23,6 @@ class Document: @classmethod def from_bytes(cls,content:bytes,suffix:str,stem:str|None): return cls(content=content,suffix=suffix,stem=stem) + + def copy(self): + return copy.copy(self) diff --git a/docutranslate/manager/base_manager.py b/docutranslate/manager/base_manager.py index becf3ea..fc1eb5d 100644 --- a/docutranslate/manager/base_manager.py +++ b/docutranslate/manager/base_manager.py @@ -16,13 +16,15 @@ class BaseManager(ABC, Generic[T_Translated]): self.document_original: Document | None = None self.document_translated: T_Translated | None = None - def read_path(self, path: Path | str): + def read_path(self, path: Path | str) -> Self: document = Document.from_path(path) self.document_original = document + return self - def read_bytes(self, content: bytes, stem: str, suffix: str): + def read_bytes(self, content: bytes, stem: str, suffix: str) -> Self: document = Document.from_bytes(content=content, stem=stem, suffix=suffix) self.document_original = document + return self @abstractmethod def translate(self, *args, **kwargs) -> Self: @@ -38,14 +40,11 @@ class BaseManager(ABC, Generic[T_Translated]): docu = exporter.export(self.document_translated) return docu - def _save(self, exporter: Exporter, name: str = None, out_put_dir: Path | str = "./output"): + def _save(self, exporter: Exporter, name: str = None, output_dir: Path | str = "./output"): docu = self._export(exporter) name = name or docu.name - output_path = Path(out_put_dir) / Path(name) + output_path = Path(output_dir) / Path(name) output_path.parent.mkdir(parents=True, exist_ok=True) output_path.write_bytes(docu.content) self.logger.info(f"文件已保存到{output_path.resolve()}") return self - @abstractmethod - def support_export_format(self)->list[str]: - ... \ No newline at end of file diff --git a/docutranslate/manager/interfaces.py b/docutranslate/manager/interfaces.py index 3a58e03..496a06f 100644 --- a/docutranslate/manager/interfaces.py +++ b/docutranslate/manager/interfaces.py @@ -12,7 +12,7 @@ class HTMLExportable(Protocol): def export_to_html(self, export_config: T) -> str: ... - def save_as_html(self, name: str, out_put_dir: Path | str, export_config: T) -> Self: + def save_as_html(self, name: str, output_dir: Path | str, export_config: T) -> Self: ... @@ -21,14 +21,14 @@ class MDExportable(Protocol): def export_to_markdown(self, export_config: T) -> str: ... - def save_as_markdown(self, name: str, out_put_dir: Path | str, export_config: T) -> Self: + def save_as_markdown(self, name: str, output_dir: Path | str, export_config: T) -> Self: ... @runtime_checkable class TXTExportable(Protocol): - def export_to_txt(self, export_config: T) -> str: + def export_to_txt(self) -> str: ... - def save_as_txt(self, name: str, out_put_dir: Path | str, export_config: T) -> Self: + def save_as_txt(self, name: str, output_dir: Path | str) -> Self: ... diff --git a/docutranslate/manager/md_based_manager.py b/docutranslate/manager/md_based_manager.py index d77f2ba..623e35a 100644 --- a/docutranslate/manager/md_based_manager.py +++ b/docutranslate/manager/md_based_manager.py @@ -6,20 +6,25 @@ from docutranslate.cacher import md_based_convert_cacher from docutranslate.converter.x2md.converter_docling import ConverterDoclingConfig, ConverterDocling from docutranslate.converter.x2md.converter_identity import ConverterIdentity from docutranslate.converter.x2md.converter_mineru import ConverterMineruConfig, ConverterMineru +from docutranslate.converter.x2md.interfaces import X2MarkdownConverter from docutranslate.exporter.md2x.md2html_exporter import MD2HTMLExportConfig, MD2HTMLExporter from docutranslate.exporter.md2x.md2md_exporter import MD2MDExportConfig, MD2MDExporter -from docutranslate.exporter.md2x.types import x2md_convert_config_type +from docutranslate.exporter.md2x.types import x2md_convert_config_type, convert_engin_type from docutranslate.manager.base_manager import BaseManager from docutranslate.manager.interfaces import HTMLExportable, MDExportable from docutranslate.translater.md_translator import MDTranslateConfig, MDTranslator class MarkdownBasedManager(BaseManager, HTMLExportable, MDExportable): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._converter_factory: dict[str:tuple[X2MarkdownConverter, x2md_convert_config_type]] = { + "mineru": (ConverterMineru, ConverterMineruConfig), + "docling": (ConverterDocling, ConverterDoclingConfig), + } - def support_export_format(self) -> list[str]: - return [".md",".html",".zip"] - - def _get_document_md(self, convert_engin, convert_config): + def _get_document_md(self, convert_engin: convert_engin_type | None, + convert_config: x2md_convert_config_type | None): if self.document_original is None: raise RuntimeError("file has not been read yet. Call read_path or read_bytes first.") # 获取缓存的解析后文件 @@ -29,16 +34,13 @@ class MarkdownBasedManager(BaseManager, HTMLExportable, MDExportable): if document_cached: document_md = document_cached else: - if convert_engin is None: + if convert_engin is None or self.document_original.suffix == ".md": converter = ConverterIdentity() - elif convert_engin == "mineru": - if not isinstance(convert_config, ConverterMineruConfig): - raise RuntimeError(f"未传入正确的convert_config,应传入{ConverterMineruConfig}") - converter = ConverterMineru(convert_config, logger=self.logger) - elif convert_engin == "docling": - if not isinstance(convert_config, ConverterDoclingConfig): - raise RuntimeError(f"未传入正确的convert_config,应传入{ConverterDoclingConfig}") - converter = ConverterDocling(convert_config, logger=self.logger) + elif convert_engin in self._converter_factory: + converter_class, config_class = self._converter_factory[convert_engin] + if not isinstance(convert_config, config_class): + raise TypeError(f"未传入正确的convert_config,应传入{config_class.__name__}类型") + converter = converter_class(convert_config, logger=self.logger) else: raise ValueError(f"不存在{convert_engin}解析引擎") document_md = converter.convert(self.document_original) @@ -48,7 +50,7 @@ class MarkdownBasedManager(BaseManager, HTMLExportable, MDExportable): @overload def translate(self, convert_engin: None, - convert_config: None, translate_config: MDTranslateConfig) -> Self: + convert_config: x2md_convert_config_type | None, translate_config: MDTranslateConfig) -> Self: ... @overload @@ -61,7 +63,7 @@ class MarkdownBasedManager(BaseManager, HTMLExportable, MDExportable): convert_config: ConverterMineruConfig, translate_config: MDTranslateConfig) -> Self: ... - def translate(self, convert_engin: Literal["mineru", "docling"] | None, + def translate(self, convert_engin: convert_engin_type | None, convert_config: x2md_convert_config_type | None, translate_config: MDTranslateConfig) -> Self: document_md = self._get_document_md(convert_engin, convert_config) @@ -90,13 +92,13 @@ class MarkdownBasedManager(BaseManager, HTMLExportable, MDExportable): docu = self._export(MD2MDExporter(export_config)) return docu.content.decode() - def save_as_html(self, name: str = None, out_put_dir: Path | str = "./output", + def save_as_html(self, name: str = None, output_dir: Path | str = "./output", export_config: MD2HTMLExportConfig | None = None) -> Self: - self._save(exporter=MD2HTMLExporter(export_config), name=name, out_put_dir=out_put_dir) + self._save(exporter=MD2HTMLExporter(export_config), name=name, output_dir=output_dir) return self - def save_as_markdown(self, name: str = None, out_put_dir: Path | str = "./output", + def save_as_markdown(self, name: str = None, output_dir: Path | str = "./output", export_config: MD2MDExportConfig | None = None) -> Self: - self._save(exporter=MD2MDExporter(export_config), name=name, out_put_dir=out_put_dir) + self._save(exporter=MD2MDExporter(export_config), name=name, output_dir=output_dir) return self diff --git a/docutranslate/manager/txt_manager.py b/docutranslate/manager/txt_manager.py index 6121dc0..e8d01f8 100644 --- a/docutranslate/manager/txt_manager.py +++ b/docutranslate/manager/txt_manager.py @@ -1,34 +1,18 @@ -from copy import copy -from dataclasses import dataclass -from logging import Logger from pathlib import Path from typing import Self from docutranslate.exporter.txt2x.txt2html_exporter import TXT2HTMLExportConfig, TXT2HTMLExporter +from docutranslate.exporter.txt2x.txt2txt_exporter import TXT2TXTExporter from docutranslate.manager.base_manager import BaseManager -from docutranslate.manager.interfaces import HTMLExportable +from docutranslate.manager.interfaces import HTMLExportable, TXTExportable from docutranslate.translater.txt_translator import TXTTranslateConfig, TXTTranslator -@dataclass -class TXTManagerConfig: - chunk_size: int = 3000 - base_url: str | None = None - api_key = None, - model_id: str | None = None - temperature = 0.7 - concurrent: int = 30 - timeout = 2000 - cache = True - logger: Logger | None = None - -class TXTManager(BaseManager, HTMLExportable): - def support_export_format(self) -> list[str]: - return [".txt", ".html"] +class TXTManager(BaseManager, HTMLExportable,TXTExportable): def translate(self, translate_config: TXTTranslateConfig) -> Self: - document = copy(self.document_original) + document = self.document_original.copy() # 翻译解析后文件 translator = TXTTranslator(translate_config) translator.translate(document) @@ -36,7 +20,7 @@ class TXTManager(BaseManager, HTMLExportable): return self async def translate_async(self, translate_config: TXTTranslateConfig) -> Self: - document = copy(self.document_original) + document = self.document_original.copy() # 翻译解析后文件 translator = TXTTranslator(translate_config) await translator.translate_async(document) @@ -48,19 +32,14 @@ class TXTManager(BaseManager, HTMLExportable): return docu.content.decode() def export_to_txt(self) -> str: - if self.document_translated is None: - raise RuntimeError("Document has not been translated yet. Call translate() first.") - return self.document_translated.content.decode() + docu = self._export(TXT2TXTExporter()) + return docu.content.decode() - def save_as_html(self, name: str = None, out_put_dir: Path | str = "./output", + def save_as_html(self, name: str = None, output_dir: Path | str = "./output", export_config: TXT2HTMLExportConfig | None = None) -> Self: - self._save(exporter=TXT2HTMLExporter(export_config), name=name, out_put_dir=out_put_dir) + self._save(exporter=TXT2HTMLExporter(export_config), name=name, output_dir=output_dir) return self - def save_as_txt(self, name: str = None, out_put_dir: Path | str = "./output", ) -> Self: - name = name or self.document_translated.name - output_path = Path(out_put_dir) / Path(name) - output_path.parent.mkdir(parents=True, exist_ok=True) - output_path.write_bytes(self.document_translated.content) - self.logger.info(f"文件已保存到{output_path.resolve()}") + def save_as_txt(self, name: str = None, output_dir: Path | str = "./output", ) -> Self: + self._save(exporter=TXT2TXTExporter(), name=name, output_dir=output_dir) return self