修改为导出的markdown压缩包不含顶层目录
This commit is contained in:
@@ -109,6 +109,10 @@ class FileTranslater:
|
|||||||
return cached_result
|
return cached_result
|
||||||
if document.suffix in [".md", ".txt"]:
|
if document.suffix in [".md", ".txt"]:
|
||||||
return document.filebytes.decode("utf-8")
|
return document.filebytes.decode("utf-8")
|
||||||
|
if document.suffix in ['zip']:
|
||||||
|
#寻找zip内的filename
|
||||||
|
filename=find_markdown_in_zip(document.filebytes)
|
||||||
|
return embed_inline_image_from_zip(document.filebytes,filename)
|
||||||
translater_logger.info("正在转化为markdown")
|
translater_logger.info("正在转化为markdown")
|
||||||
if self.convert_engin == "docling":
|
if self.convert_engin == "docling":
|
||||||
if artifact is None:
|
if artifact is None:
|
||||||
@@ -289,18 +293,20 @@ class FileTranslater:
|
|||||||
def save_as_markdown(self, filename: str | Path | None = None, output_dir: str | Path = "./output", embeded=True):
|
def save_as_markdown(self, filename: str | Path | None = None, output_dir: str | Path = "./output", embeded=True):
|
||||||
if isinstance(filename, str):
|
if isinstance(filename, str):
|
||||||
filename = Path(filename)
|
filename = Path(filename)
|
||||||
if isinstance(output_dir, str):
|
|
||||||
output_dir = Path(output_dir)
|
|
||||||
if filename is None:
|
if filename is None:
|
||||||
filename = Path(f"{self.document.stem}.md")
|
filename = Path(f"{self.document.stem}.md")
|
||||||
# 确保输出目录存在
|
# 确保输出目录存在
|
||||||
output_dir.mkdir(parents=True, exist_ok=True)
|
if isinstance(output_dir, str):
|
||||||
|
output_dir = Path(output_dir)
|
||||||
if embeded:
|
if embeded:
|
||||||
full_name = output_dir / filename
|
output_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
full_name = output_dir / filename.name
|
||||||
with open(full_name, "w") as file:
|
with open(full_name, "w") as file:
|
||||||
file.write(self.export_to_markdown())
|
file.write(self.export_to_markdown())
|
||||||
translater_logger.info(f"文件已写入{full_name.resolve()}")
|
translater_logger.info(f"文件已写入{full_name.resolve()}")
|
||||||
else:
|
else:
|
||||||
|
output_dir=output_dir/filename.stem
|
||||||
|
output_dir.mkdir(parents=True, exist_ok=True)
|
||||||
with zipfile.ZipFile(io.BytesIO(self.export_to_unembed_markdown())) as zip_ref:
|
with zipfile.ZipFile(io.BytesIO(self.export_to_unembed_markdown())) as zip_ref:
|
||||||
zip_ref.extractall(output_dir)
|
zip_ref.extractall(output_dir)
|
||||||
return self
|
return self
|
||||||
@@ -316,7 +322,7 @@ class FileTranslater:
|
|||||||
if filename is None:
|
if filename is None:
|
||||||
filename = Path(f"{self.document.stem}.md")
|
filename = Path(f"{self.document.stem}.md")
|
||||||
self._markdown_format()
|
self._markdown_format()
|
||||||
return unembed_base64_images_to_zip(self.markdown, folder_name=str(filename.stem), markdown_name=str(filename))
|
return unembed_base64_images_to_zip(self.markdown, markdown_name=filename.name)
|
||||||
|
|
||||||
def save_as_html(self, filename: str | Path | None = None, output_dir: str | Path = "./output"):
|
def save_as_html(self, filename: str | Path | None = None, output_dir: str | Path = "./output"):
|
||||||
if isinstance(filename, str):
|
if isinstance(filename, str):
|
||||||
|
|||||||
@@ -179,11 +179,9 @@ def embed_inline_image_from_zip(zip_bytes: bytes, filename_in_zip: str, encoding
|
|||||||
return modified_md_content
|
return modified_md_content
|
||||||
|
|
||||||
|
|
||||||
def unembed_base64_images_to_zip(markdown:str,folder_name:str,markdown_name:str,image_folder_name="images")->bytes:
|
def unembed_base64_images_to_zip(markdown:str,markdown_name:str,image_folder_name="images")->bytes:
|
||||||
with tempfile.TemporaryDirectory() as temp_dir:
|
with tempfile.TemporaryDirectory() as temp_dir:
|
||||||
subfolder = os.path.join(temp_dir, folder_name)#所有的操作都在这个subfolder里进行
|
image_folder=os.path.join(temp_dir,image_folder_name)
|
||||||
os.makedirs(subfolder, exist_ok=True)
|
|
||||||
image_folder=os.path.join(subfolder,image_folder_name)
|
|
||||||
os.makedirs(image_folder,exist_ok=True)
|
os.makedirs(image_folder,exist_ok=True)
|
||||||
pattern=r"!\[(.*?)\]\(data:(.*?);.*base64,(.*)\)"
|
pattern=r"!\[(.*?)\]\(data:(.*?);.*base64,(.*)\)"
|
||||||
def unembed_base64_images(match:re.Match)->str:
|
def unembed_base64_images(match:re.Match)->str:
|
||||||
@@ -197,16 +195,15 @@ def unembed_base64_images_to_zip(markdown:str,folder_name:str,markdown_name:str,
|
|||||||
f.write(base64.b64decode(b64data))
|
f.write(base64.b64decode(b64data))
|
||||||
return f""
|
return f""
|
||||||
modified_md_content = re.sub(pattern, unembed_base64_images,markdown)
|
modified_md_content = re.sub(pattern, unembed_base64_images,markdown)
|
||||||
with open(os.path.join(subfolder,f"{markdown_name}"),"w") as f:
|
with open(os.path.join(temp_dir,f"{markdown_name}"),"w") as f:
|
||||||
f.write(modified_md_content)
|
f.write(modified_md_content)
|
||||||
zip_buffer=io.BytesIO()
|
zip_buffer=io.BytesIO()
|
||||||
folder_path=Path(subfolder)
|
folder_path=Path(temp_dir)
|
||||||
with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zipf:
|
with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zipf:
|
||||||
for file in folder_path.rglob('*'):
|
for file in folder_path.rglob('*'):
|
||||||
if file.is_file():
|
if file.is_file():
|
||||||
zipf.write(file, file.relative_to(folder_path.parent))
|
zipf.write(file, file.relative_to(folder_path))
|
||||||
|
return zip_buffer.getvalue()
|
||||||
return zip_buffer.read()
|
|
||||||
|
|
||||||
def clean_markdown_math_block(markdown):
|
def clean_markdown_math_block(markdown):
|
||||||
"""清除公式块的多余空格字符"""
|
"""清除公式块的多余空格字符"""
|
||||||
@@ -222,5 +219,5 @@ def clean_markdown_math_block(markdown):
|
|||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
with open(r"C:\Users\jxgm\Desktop\translate\docutranslate\tests\files\test7.md",'r') as f:
|
with open(r"C:\Users\jxgm\Desktop\translate\docutranslate\tests\files\test7.md",'r') as f:
|
||||||
markdown=f.read()
|
markdown=f.read()
|
||||||
print(unembed_base64_images_to_zip(markdown))
|
print(unembed_base64_images_to_zip(markdown,"markdown.md"))
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user