diff --git a/README.md b/README.md index c0ace2b..2494eaf 100644 --- a/README.md +++ b/README.md @@ -173,8 +173,9 @@ translater = FileTranslater(convert_engin="mineru", # 使用mineru解析文档 translater.read_file("<文件路径>").save_as_html()#保存 translater.read_file("<文件路径>").export_to_html()#输出字符串 # 文件转markdown -translater.read_file("<文件路径>").save_as_markdown()#保存 -translater.read_file("<文件路径>").export_to_markdown()#输出字符串 +translater.read_file("<文件路径>").save_as_markdown()#保存内嵌base64图片的markdown +translater.read_file("<文件路径>").save_as_markdown(embeded=False)#保存不内嵌图片的markdown(文件夹形式) +translater.read_file("<文件路径>").export_to_markdown()#输出内嵌图片的markdown字符串 ``` ## 参数说明 @@ -262,6 +263,7 @@ from docutranslate import FileTranslater translater = FileTranslater(base_url="", key="", model_id="", # 使用的模型id + convert_engin="docling", # 使用docling docling_artifact=r"C:\Users\\.cache\docling\models" ) ``` diff --git a/docutranslate/translater.py b/docutranslate/translater.py index 952b64b..351cd6d 100644 --- a/docutranslate/translater.py +++ b/docutranslate/translater.py @@ -1,5 +1,7 @@ import asyncio import html +import io +import zipfile from pathlib import Path from typing import Literal import markdown2 @@ -9,7 +11,8 @@ from docutranslate.agents import MDRefineAgent, MDTranslateAgent from docutranslate.cacher import document_cacher_global from docutranslate.converter import Document, ConverterMineru from docutranslate.utils.markdown_splitter import split_markdown_text, join_markdown_texts -from docutranslate.utils.markdown_utils import uris2placeholder, placeholder2_uris, MaskDict, clean_markdown_math_block +from docutranslate.utils.markdown_utils import uris2placeholder, placeholder2_uris, MaskDict, clean_markdown_math_block, \ + unembed_base64_images_to_zip from docutranslate.logger import translater_logger from docutranslate.global_values import available_packages from docutranslate.utils.resource_utils import resource_path @@ -18,16 +21,18 @@ DOCLING_FLAG = True if 
available_packages.get("docling") else False if DOCLING_FLAG: from docutranslate.converter import ConverterDocling -default_params={ - "chunk_size":3000, - "concurrent":30, - "temperature":0.7, +default_params = { + "chunk_size": 3000, + "concurrent": 30, + "temperature": 0.7, } + class FileTranslater: def __init__(self, file_path: Path | str | None = None, chunk_size: int = default_params["chunk_size"], - base_url:str|None=None, key=None, model_id:str|None=None, temperature=default_params["temperature"], - concurrent:int=default_params["concurrent"], timeout=2000, + base_url: str | None = None, key=None, model_id: str | None = None, + temperature=default_params["temperature"], + concurrent: int = default_params["concurrent"], timeout=2000, convert_engin: Literal["docling", "mineru"] = "mineru", docling_artifact: Path | str | None = None, mineru_token: str = None, cache=True): @@ -37,7 +42,7 @@ class FileTranslater: self.markdown: str = "" self.chunk_size = chunk_size self.concurrent = concurrent - self.base_url= base_url + self.base_url = base_url self.key = key if key is not None else "xx" self.model_id = model_id self.temperature = temperature @@ -145,7 +150,7 @@ class FileTranslater: def read_document(self, document: Document, formula: bool, code: bool, save: bool, save_format: Literal["markdown", "html"], refine: bool, refine_agent: Agent | None): - self.document=document + self.document = document self.markdown = self._convert2markdown(document, formula=formula, code=code, artifact=self.docling_artifact) if refine: self.refine_markdown_by_agent(refine_agent) @@ -193,7 +198,7 @@ class FileTranslater: if file_path: document = Document(path=file_path) else: - document=self.document + document = self.document if document is None: raise Exception("未读取文件") translater_logger.info(f"读取文件:{document.filename}") @@ -207,7 +212,7 @@ class FileTranslater: if file_path: document = Document(path=file_path) else: - document=self.document + document = self.document if 
document is None: raise Exception("未读取文件") translater_logger.info(f"读取文件:{document.filename}") @@ -277,28 +282,38 @@ class FileTranslater: translater_logger.info("翻译完成") return self.markdown - def save_as_markdown(self, filename: str | Path | None = None, output_dir: str | Path = "./output"): + def save_as_markdown(self, filename: str | Path | None = None, output_dir: str | Path = "./output", embeded=True): if isinstance(filename, str): filename = Path(filename) if isinstance(output_dir, str): output_dir = Path(output_dir) if filename is None: - filename=f"{self.document.stem}.md" + filename = Path(f"{self.document.stem}.md") # 确保输出目录存在 output_dir.mkdir(parents=True, exist_ok=True) - full_name = output_dir / filename - # 输出前格式化markdown - self._markdown_format() - with open(full_name, "w") as file: - file.write(self.markdown) - translater_logger.info(f"文件已写入{full_name.resolve()}") + if embeded: + full_name = output_dir / filename + with open(full_name, "w") as file: + file.write(self.export_to_markdown()) + translater_logger.info(f"文件已写入{full_name.resolve()}") + else: + with zipfile.ZipFile(self.export_to_unembed_markdown()) as zip_ref: + zip_ref.extractall(output_dir) return self - def export_to_markdown(self): + def export_to_markdown(self) -> str: # 输出前格式化markdown self._markdown_format() return self.markdown + def export_to_unembed_markdown(self, filename: str | Path | None = None) -> io.BytesIO: + if isinstance(filename, str): + filename = Path(filename) + if filename is None: + filename = Path(f"{self.document.stem}.md") + self._markdown_format() + return unembed_base64_images_to_zip(self.markdown, folder_name=str(filename.stem), markdown_name=str(filename)) + def save_as_html(self, filename: str | Path | None = None, output_dir: str | Path = "./output"): if isinstance(filename, str): filename = Path(filename) @@ -326,33 +341,34 @@ class FileTranslater: auto_render = f'' if not cdn else r"""""" # language=javascript render_math_in_element = r""" - """ if cdn 
def unembed_base64_images_to_zip(markdown: str, folder_name: str, markdown_name: str,
                                 image_folder_name="images") -> io.BytesIO:
    """Move inline base64 images out of ``markdown`` and pack the result as a zip.

    Every ``![alt](data:<mime>;base64,<payload>)`` image is decoded, stored as
    ``<folder_name>/<image_folder_name>/<id><ext>`` in the archive and replaced
    in the text by a relative link ``./<image_folder_name>/<id><ext>``. The
    rewritten markdown is stored as ``<folder_name>/<markdown_name>`` (UTF-8).

    Args:
        markdown: Markdown text that may contain base64 data-URI images.
        folder_name: Top-level folder inside the archive.
        markdown_name: File name of the markdown document inside that folder.
        image_folder_name: Sub-folder that receives the extracted images.

    Returns:
        An in-memory zip archive, rewound to position 0 so it can be read or
        passed to ``zipfile.ZipFile`` directly.
    """
    # The alt text may not contain "]" and the payload may only contain base64
    # characters, so a match can never start at an earlier (non-data) image or
    # run past its own closing ")" into neighbouring text or images.
    pattern = re.compile(
        r"!\[([^\]]*)\]\(data:([^;,)\s]+)(?:;[^;,)\s]*)*;base64,([A-Za-z0-9+/=]+)\)"
    )
    extracted: list[tuple[str, bytes]] = []

    def unembed_base64_images(match: re.Match) -> str:
        alt_text, mime_type, b64data = match.groups()
        try:
            image_bytes = base64.b64decode(b64data)
        except ValueError:
            # Corrupt payload (binascii.Error is a ValueError): keep the image
            # embedded rather than aborting the whole export.
            return match.group(0)
        # guess_extension returns None for unknown MIME types; fall back to a
        # neutral extension instead of producing a name ending in "None".
        extension = mimetypes.guess_extension(mime_type) or ".bin"
        image_name = f"{uuid.uuid1().hex[:8]}{extension}"
        extracted.append((f"{folder_name}/{image_folder_name}/{image_name}", image_bytes))
        return f"![{alt_text}](./{image_folder_name}/{image_name})"

    modified_md_content = pattern.sub(unembed_base64_images, markdown)

    zip_buffer = io.BytesIO()
    with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zipf:
        # writestr encodes str data as UTF-8, independent of the platform's
        # default text encoding.
        zipf.writestr(f"{folder_name}/{markdown_name}", modified_md_content)
        for archive_name, image_bytes in extracted:
            zipf.writestr(archive_name, image_bytes)
    # Rewind so that callers can read the archive bytes straight away.
    zip_buffer.seek(0)
    return zip_buffer