允许输入一个链接图片的markdown的zip
This commit is contained in:
@@ -12,7 +12,7 @@ from docutranslate.cacher import document_cacher_global
|
|||||||
from docutranslate.converter import Document, ConverterMineru
|
from docutranslate.converter import Document, ConverterMineru
|
||||||
from docutranslate.utils.markdown_splitter import split_markdown_text, join_markdown_texts
|
from docutranslate.utils.markdown_splitter import split_markdown_text, join_markdown_texts
|
||||||
from docutranslate.utils.markdown_utils import uris2placeholder, placeholder2_uris, MaskDict, clean_markdown_math_block, \
|
from docutranslate.utils.markdown_utils import uris2placeholder, placeholder2_uris, MaskDict, clean_markdown_math_block, \
|
||||||
unembed_base64_images_to_zip
|
unembed_base64_images_to_zip, embed_inline_image_from_zip, find_markdown_in_zip
|
||||||
from docutranslate.logger import translater_logger
|
from docutranslate.logger import translater_logger
|
||||||
from docutranslate.global_values import available_packages
|
from docutranslate.global_values import available_packages
|
||||||
from docutranslate.utils.resource_utils import resource_path
|
from docutranslate.utils.resource_utils import resource_path
|
||||||
@@ -132,6 +132,10 @@ class FileTranslater:
|
|||||||
return cached_result
|
return cached_result
|
||||||
if document.suffix in [".md", ".txt"]:
|
if document.suffix in [".md", ".txt"]:
|
||||||
return document.filebytes.decode("utf-8")
|
return document.filebytes.decode("utf-8")
|
||||||
|
if document.suffix in ['zip']:
|
||||||
|
#寻找zip内的filename
|
||||||
|
filename=find_markdown_in_zip(document.filebytes)
|
||||||
|
return embed_inline_image_from_zip(document.filebytes,filename)
|
||||||
translater_logger.info("正在转化为markdown")
|
translater_logger.info("正在转化为markdown")
|
||||||
if self.convert_engin == "docling":
|
if self.convert_engin == "docling":
|
||||||
if artifact is None:
|
if artifact is None:
|
||||||
@@ -297,7 +301,7 @@ class FileTranslater:
|
|||||||
file.write(self.export_to_markdown())
|
file.write(self.export_to_markdown())
|
||||||
translater_logger.info(f"文件已写入{full_name.resolve()}")
|
translater_logger.info(f"文件已写入{full_name.resolve()}")
|
||||||
else:
|
else:
|
||||||
with zipfile.ZipFile(self.export_to_unembed_markdown()) as zip_ref:
|
with zipfile.ZipFile(io.BytesIO(self.export_to_unembed_markdown())) as zip_ref:
|
||||||
zip_ref.extractall(output_dir)
|
zip_ref.extractall(output_dir)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
@@ -306,7 +310,7 @@ class FileTranslater:
|
|||||||
self._markdown_format()
|
self._markdown_format()
|
||||||
return self.markdown
|
return self.markdown
|
||||||
|
|
||||||
def export_to_unembed_markdown(self, filename: str | Path | None = None) -> io.BytesIO:
|
def export_to_unembed_markdown(self, filename: str | Path | None = None) -> bytes:
|
||||||
if isinstance(filename, str):
|
if isinstance(filename, str):
|
||||||
filename = Path(filename)
|
filename = Path(filename)
|
||||||
if filename is None:
|
if filename is None:
|
||||||
|
|||||||
@@ -76,6 +76,22 @@ def placeholder2_uris(markdown: str, mask_dict: MaskDict):
|
|||||||
return markdown
|
return markdown
|
||||||
|
|
||||||
|
|
||||||
|
def find_markdown_in_zip(zip_bytes: bytes) -> str:
    """Return the archive path of the single Markdown file inside a ZIP.

    Entries that are archiver metadata rather than user content are ignored:
    anything under a ``__MACOSX`` directory and AppleDouble files whose base
    name starts with ``._``. Without this, a ZIP produced by the macOS
    archiver for a single ``doc.md`` would also list ``__MACOSX/._doc.md``
    and be rejected as containing several Markdown files.

    Args:
        zip_bytes: Raw content of the ZIP archive.

    Returns:
        The member name, exactly as stored in the archive, of the only
        ``.md`` file (the extension is matched case-insensitively).

    Raises:
        ValueError: If the archive contains no Markdown file, or more
            than one.
        zipfile.BadZipFile: If ``zip_bytes`` is not a valid ZIP archive.
    """
    with zipfile.ZipFile(io.BytesIO(zip_bytes), 'r') as zip_ref:
        # Names of every member stored in the archive.
        all_files = zip_ref.namelist()

    md_files = []
    for name in all_files:
        # Keep only .md members.
        if not name.lower().endswith('.md'):
            continue
        # ZIP member names always use "/" as the separator.
        parts = name.split('/')
        # Skip macOS resource-fork metadata, which mirrors real file names.
        if '__MACOSX' in parts or parts[-1].startswith('._'):
            continue
        md_files.append(name)

    if len(md_files) == 1:
        return md_files[0]
    if md_files:
        raise ValueError("ZIP 中包含多个 Markdown 文件")
    raise ValueError("ZIP 中没有 Markdown 文件")
|
||||||
|
|
||||||
|
|
||||||
def embed_inline_image_from_zip(zip_bytes: bytes, filename_in_zip: str, encoding="utf-8"):
|
def embed_inline_image_from_zip(zip_bytes: bytes, filename_in_zip: str, encoding="utf-8"):
|
||||||
zip_file_bytes = io.BytesIO(zip_bytes)
|
zip_file_bytes = io.BytesIO(zip_bytes)
|
||||||
|
|
||||||
@@ -163,7 +179,7 @@ def embed_inline_image_from_zip(zip_bytes: bytes, filename_in_zip: str, encoding
|
|||||||
return modified_md_content
|
return modified_md_content
|
||||||
|
|
||||||
|
|
||||||
def unembed_base64_images_to_zip(markdown:str,folder_name:str,markdown_name:str,image_folder_name="images")->io.BytesIO:
|
def unembed_base64_images_to_zip(markdown:str,folder_name:str,markdown_name:str,image_folder_name="images")->bytes:
|
||||||
with tempfile.TemporaryDirectory() as temp_dir:
|
with tempfile.TemporaryDirectory() as temp_dir:
|
||||||
subfolder = os.path.join(temp_dir, folder_name)#所有的操作都在这个subfolder里进行
|
subfolder = os.path.join(temp_dir, folder_name)#所有的操作都在这个subfolder里进行
|
||||||
os.makedirs(subfolder, exist_ok=True)
|
os.makedirs(subfolder, exist_ok=True)
|
||||||
@@ -190,7 +206,7 @@ def unembed_base64_images_to_zip(markdown:str,folder_name:str,markdown_name:str,
|
|||||||
if file.is_file():
|
if file.is_file():
|
||||||
zipf.write(file, file.relative_to(folder_path.parent))
|
zipf.write(file, file.relative_to(folder_path.parent))
|
||||||
|
|
||||||
return zip_buffer
|
return zip_buffer.read()
|
||||||
|
|
||||||
def clean_markdown_math_block(markdown):
|
def clean_markdown_math_block(markdown):
|
||||||
"""清除公式块的多余空格字符"""
|
"""清除公式块的多余空格字符"""
|
||||||
|
|||||||
Reference in New Issue
Block a user