From 6282c254f7cb6f97f9c94d0132bdbf1f2dc76aed Mon Sep 17 00:00:00 2001 From: xunbu Date: Wed, 26 Nov 2025 11:14:50 +0800 Subject: [PATCH] fix --- docutranslate/utils/markdown_utils.py | 55 +++++++++++++++++++++------ 1 file changed, 44 insertions(+), 11 deletions(-) diff --git a/docutranslate/utils/markdown_utils.py b/docutranslate/utils/markdown_utils.py index 0dbfb87..2bfd3fa 100644 --- a/docutranslate/utils/markdown_utils.py +++ b/docutranslate/utils/markdown_utils.py @@ -207,22 +207,56 @@ def unembed_base64_images_to_zip(markdown: str, markdown_name: str, image_folder with tempfile.TemporaryDirectory() as temp_dir: image_folder = os.path.join(temp_dir, image_folder_name) os.makedirs(image_folder, exist_ok=True) - pattern = r"!\[(.*?)\]\(data:(.*?);.*base64,(.*)\)" + + pattern = r"!\[(.*?)\]\(data:(.*?);.*base64,(.*?)\)" def unembed_base64_images(match: re.Match) -> str: - b64data = match.group(3) - extension = mimetypes.guess_extension(match.group(2)) - image_id = hashlib.md5(b64data.encode()).hexdigest()[:8] - image_name = f"{image_id}{extension}" - url = f"./{image_folder_name}/{image_name}" - # 创建对应的image文件 - with open(os.path.join(image_folder, image_name), "wb") as f: - f.write(base64.b64decode(b64data)) - return f"![{match.group(1)}]({url})" + alt_text = match.group(1) + mime_type = match.group(2) + b64data_raw = match.group(3) + + # 【修改点2】强制清洗数据:移除所有非 Base64 合法字符(如中文、空格、换行符等) + # Base64 字符集只包含 A-Z, a-z, 0-9, +, /, = + b64data_clean = re.sub(r'[^A-Za-z0-9+/=]', '', b64data_raw) + + # 简单的扩展名推断 + extension = mimetypes.guess_extension(mime_type) + if not extension: + if 'png' in mime_type: + extension = '.png' + elif 'jpeg' in mime_type or 'jpg' in mime_type: + extension = '.jpg' + elif 'gif' in mime_type: + extension = '.gif' + elif 'svg' in mime_type: + extension = '.svg' + elif 'webp' in mime_type: + extension = '.webp' + else: + extension = '.bin' + + try: + # 【修改点3】添加异常捕获 + image_bytes = base64.b64decode(b64data_clean) + image_id = hashlib.md5(image_bytes).hexdigest()[:8] + image_name = f"{image_id}{extension}" + url = f"./{image_folder_name}/{image_name}" + + with open(os.path.join(image_folder, image_name), "wb") as f: + f.write(image_bytes) + + # 返回替换后的 Markdown 图片链接 + return f"![{alt_text}]({url})" + except Exception as e: + print(f"Warning: Failed to decode base64 image in markdown. Error: {e}") + # 如果解码失败,返回原始匹配文本(不做替换),保证文档不丢失内容 + return match.group(0) modified_md_content = re.sub(pattern, unembed_base64_images, markdown) + with open(os.path.join(temp_dir, f"{markdown_name}"), "w", encoding="utf-8") as f: f.write(modified_md_content) + zip_buffer = io.BytesIO() folder_path = Path(temp_dir) with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zipf: @@ -231,6 +265,5 @@ def unembed_base64_images_to_zip(markdown: str, markdown_name: str, image_folder zipf.write(file, file.relative_to(folder_path)) return zip_buffer.getvalue() - if __name__ == '__main__': pass