From 8cef4451b25d3cea9c5139685590b45096d27bea Mon Sep 17 00:00:00 2001 From: xunbu Date: Thu, 3 Jul 2025 17:09:53 +0800 Subject: [PATCH] =?UTF-8?q?=E5=85=81=E8=AE=B8=E8=BE=93=E5=85=A5=E4=B8=80?= =?UTF-8?q?=E4=B8=AA=E9=93=BE=E6=8E=A5=E5=9B=BE=E7=89=87=E7=9A=84markdown?= =?UTF-8?q?=E7=9A=84zip?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docutranslate/translater.py | 10 +++++++--- docutranslate/utils/markdown_utils.py | 20 ++++++++++++++++++-- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/docutranslate/translater.py b/docutranslate/translater.py index 351cd6d..bb7c46e 100644 --- a/docutranslate/translater.py +++ b/docutranslate/translater.py @@ -12,7 +12,7 @@ from docutranslate.cacher import document_cacher_global from docutranslate.converter import Document, ConverterMineru from docutranslate.utils.markdown_splitter import split_markdown_text, join_markdown_texts from docutranslate.utils.markdown_utils import uris2placeholder, placeholder2_uris, MaskDict, clean_markdown_math_block, \ - unembed_base64_images_to_zip + unembed_base64_images_to_zip, embed_inline_image_from_zip, find_markdown_in_zip from docutranslate.logger import translater_logger from docutranslate.global_values import available_packages from docutranslate.utils.resource_utils import resource_path @@ -132,6 +132,10 @@ class FileTranslater: return cached_result if document.suffix in [".md", ".txt"]: return document.filebytes.decode("utf-8") + if document.suffix in ['.zip']: + #寻找zip内的filename + filename=find_markdown_in_zip(document.filebytes) + return embed_inline_image_from_zip(document.filebytes,filename) translater_logger.info("正在转化为markdown") if self.convert_engin == "docling": if artifact is None: @@ -297,7 +301,7 @@ class FileTranslater: file.write(self.export_to_markdown()) translater_logger.info(f"文件已写入{full_name.resolve()}") else: - with zipfile.ZipFile(self.export_to_unembed_markdown()) as zip_ref: + with 
zipfile.ZipFile(io.BytesIO(self.export_to_unembed_markdown())) as zip_ref: zip_ref.extractall(output_dir) return self @@ -306,7 +310,7 @@ class FileTranslater: self._markdown_format() return self.markdown - def export_to_unembed_markdown(self, filename: str | Path | None = None) -> io.BytesIO: + def export_to_unembed_markdown(self, filename: str | Path | None = None) -> bytes: if isinstance(filename, str): filename = Path(filename) if filename is None: diff --git a/docutranslate/utils/markdown_utils.py b/docutranslate/utils/markdown_utils.py index 96e5a4b..d1547c8 100644 --- a/docutranslate/utils/markdown_utils.py +++ b/docutranslate/utils/markdown_utils.py @@ -76,6 +76,22 @@ def placeholder2_uris(markdown: str, mask_dict: MaskDict): return markdown +def find_markdown_in_zip(zip_bytes: bytes): + zip_file_bytes = io.BytesIO(zip_bytes) + with zipfile.ZipFile(zip_file_bytes, 'r') as zip_ref: + # 获取 ZIP 中所有文件名 + all_files = zip_ref.namelist() + # 筛选出 .md 文件 + md_files = [f for f in all_files if f.lower().endswith('.md')] + + if len(md_files) == 1: + return md_files[0] + elif len(md_files) > 1: + raise ValueError("ZIP 中包含多个 Markdown 文件") + else: + raise ValueError("ZIP 中没有 Markdown 文件") + + def embed_inline_image_from_zip(zip_bytes: bytes, filename_in_zip: str, encoding="utf-8"): zip_file_bytes = io.BytesIO(zip_bytes) @@ -163,7 +179,7 @@ def embed_inline_image_from_zip(zip_bytes: bytes, filename_in_zip: str, encoding return modified_md_content -def unembed_base64_images_to_zip(markdown:str,folder_name:str,markdown_name:str,image_folder_name="images")->io.BytesIO: +def unembed_base64_images_to_zip(markdown:str,folder_name:str,markdown_name:str,image_folder_name="images")->bytes: with tempfile.TemporaryDirectory() as temp_dir: subfolder = os.path.join(temp_dir, folder_name)#所有的操作都在这个subfolder里进行 os.makedirs(subfolder, exist_ok=True) @@ -190,7 +206,7 @@ def unembed_base64_images_to_zip(markdown:str,folder_name:str,markdown_name:str, if file.is_file(): zipf.write(file, 
file.relative_to(folder_path.parent)) - return zip_buffer + return zip_buffer.getvalue() def clean_markdown_math_block(markdown): """清除公式块的多余空格字符"""