From b53a1d6005b30ae1c17ace635207d0749e3bd7b2 Mon Sep 17 00:00:00 2001 From: xunbu Date: Thu, 3 Jul 2025 17:30:31 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E4=B8=BA=E5=AF=BC=E5=87=BA?= =?UTF-8?q?=E7=9A=84markdownu=E5=8E=8B=E7=BC=A9=E5=8C=85=E4=B8=8D=E5=90=AB?= =?UTF-8?q?=E9=A1=B6=E5=B1=82=E7=9B=AE=E5=BD=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docutranslate/translater.py | 16 +++++++++++----- docutranslate/utils/markdown_utils.py | 17 +++++++---------- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/docutranslate/translater.py b/docutranslate/translater.py index bb7c46e..4751d63 100644 --- a/docutranslate/translater.py +++ b/docutranslate/translater.py @@ -109,6 +109,10 @@ class FileTranslater: return cached_result if document.suffix in [".md", ".txt"]: return document.filebytes.decode("utf-8") + if document.suffix in ['zip']: + #寻找zip内的filename + filename=find_markdown_in_zip(document.filebytes) + return embed_inline_image_from_zip(document.filebytes,filename) translater_logger.info("正在转化为markdown") if self.convert_engin == "docling": if artifact is None: @@ -289,18 +293,20 @@ class FileTranslater: def save_as_markdown(self, filename: str | Path | None = None, output_dir: str | Path = "./output", embeded=True): if isinstance(filename, str): filename = Path(filename) - if isinstance(output_dir, str): - output_dir = Path(output_dir) if filename is None: filename = Path(f"{self.document.stem}.md") # 确保输出目录存在 - output_dir.mkdir(parents=True, exist_ok=True) + if isinstance(output_dir, str): + output_dir = Path(output_dir) if embeded: - full_name = output_dir / filename + output_dir.mkdir(parents=True, exist_ok=True) + full_name = output_dir / filename.name with open(full_name, "w") as file: file.write(self.export_to_markdown()) translater_logger.info(f"文件已写入{full_name.resolve()}") else: + output_dir=output_dir/filename.stem + output_dir.mkdir(parents=True, exist_ok=True) with zipfile.ZipFile(io.BytesIO(self.export_to_unembed_markdown())) as zip_ref: zip_ref.extractall(output_dir) return self @@ -316,7 +322,7 @@ class FileTranslater: if filename is None: filename = Path(f"{self.document.stem}.md") self._markdown_format() - return unembed_base64_images_to_zip(self.markdown, folder_name=str(filename.stem), markdown_name=str(filename)) + return unembed_base64_images_to_zip(self.markdown, markdown_name=filename.name) def save_as_html(self, filename: str | Path | None = None, output_dir: str | Path = "./output"): if isinstance(filename, str): diff --git a/docutranslate/utils/markdown_utils.py b/docutranslate/utils/markdown_utils.py index d1547c8..c85ef6d 100644 --- a/docutranslate/utils/markdown_utils.py +++ b/docutranslate/utils/markdown_utils.py @@ -179,11 +179,9 @@ def embed_inline_image_from_zip(zip_bytes: bytes, filename_in_zip: str, encoding return modified_md_content -def unembed_base64_images_to_zip(markdown:str,folder_name:str,markdown_name:str,image_folder_name="images")->bytes: +def unembed_base64_images_to_zip(markdown:str,markdown_name:str,image_folder_name="images")->bytes: with tempfile.TemporaryDirectory() as temp_dir: - subfolder = os.path.join(temp_dir, folder_name)#所有的操作都在这个subfolder里进行 - os.makedirs(subfolder, exist_ok=True) - image_folder=os.path.join(subfolder,image_folder_name) + image_folder=os.path.join(temp_dir,image_folder_name) os.makedirs(image_folder,exist_ok=True) pattern=r"!\[(.*?)\]\(data:(.*?);.*base64,(.*)\)" def unembed_base64_images(match:re.Match)->str: @@ -197,16 +195,15 @@ def unembed_base64_images_to_zip(markdown:str,folder_name:str,markdown_name:str, f.write(base64.b64decode(b64data)) return f"![{match.group(1)}]({url})" modified_md_content = re.sub(pattern, unembed_base64_images,markdown) - with open(os.path.join(subfolder,f"{markdown_name}"),"w") as f: + with open(os.path.join(temp_dir,f"{markdown_name}"),"w") as f: f.write(modified_md_content) zip_buffer=io.BytesIO() - folder_path=Path(subfolder) + folder_path=Path(temp_dir) with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zipf: for file in folder_path.rglob('*'): if file.is_file(): - zipf.write(file, file.relative_to(folder_path.parent)) - - return zip_buffer.read() + zipf.write(file, file.relative_to(folder_path)) + return zip_buffer.getvalue() def clean_markdown_math_block(markdown): """清除公式块的多余空格字符""" @@ -222,5 +219,5 @@ def clean_markdown_math_block(markdown): if __name__ == '__main__': with open(r"C:\Users\jxgm\Desktop\translate\docutranslate\tests\files\test7.md",'r') as f: markdown=f.read() - print(unembed_base64_images_to_zip(markdown)) + print(unembed_base64_images_to_zip(markdown,"markdown.md"))