翻译markdown可以传入包含图片的zip压缩包

This commit is contained in:
xunbu
2025-10-13 12:49:57 +08:00
parent 4abd9589c0
commit 947de6de67
3 changed files with 95 additions and 65 deletions

View File

@@ -6,11 +6,11 @@ import io
import mimetypes import mimetypes
import os import os
import re import re
import tempfile
import threading import threading
import uuid import uuid
import zipfile import zipfile
from pathlib import Path from pathlib import Path
import tempfile
class MaskDict: class MaskDict:
@@ -100,86 +100,100 @@ def find_markdown_in_zip(zip_bytes: bytes):
raise ValueError("ZIP 中没有 Markdown 文件") raise ValueError("ZIP 中没有 Markdown 文件")
def embed_inline_image_from_zip(zip_bytes: bytes, filename_in_zip: str, encoding="utf-8"): def embed_inline_image_from_zip(zip_bytes: bytes, filename_in_zip: str | None = None, encoding="utf-8"):
"""
从ZIP文件的字节流中读取一个Markdown文件并将其中的相对路径图片内联为Base64编码的data URI。
Args:
zip_bytes (bytes): ZIP文件的字节内容。
filename_in_zip (str | None, optional):
要处理的Markdown文件名。如果为 None则自动查找并使用ZIP包中的第一个.md或.markdown文件。
默认为 None。
encoding (str, optional): Markdown文件的编码格式。默认为 "utf-8"
Returns:
str | None: 包含内联图片的Markdown文本内容如果发生错误则返回None。
"""
zip_file_bytes = io.BytesIO(zip_bytes) zip_file_bytes = io.BytesIO(zip_bytes)
print("正在尝试打开内存中的ZIP存档...")
print(f"正在尝试打开内存中的ZIP存档...")
with zipfile.ZipFile(zip_file_bytes, 'r') as archive: with zipfile.ZipFile(zip_file_bytes, 'r') as archive:
print(f"ZIP存档已打开。正在查找文件 '{filename_in_zip}'...") print("ZIP存档已打开。")
if filename_in_zip not in archive.namelist(): # --- 新增和修改的逻辑 ---
print(f"错误: 文件 '{filename_in_zip}' 在ZIP压缩包中未找到。") target_md_filename = filename_in_zip
# 如果未指定文件名则自动查找第一个Markdown文件
if target_md_filename is None:
print("`正在自动查找第一个Markdown文件...")
found_md = None
for name in archive.namelist():
# 确保它是一个文件(不是目录),并检查扩展名
if not name.endswith('/') and name.lower().endswith(('.md', '.markdown')):
found_md = name
break # 找到第一个就停止
if found_md:
target_md_filename = found_md
print(f"已自动选择Markdown文件: '{target_md_filename}'")
else:
print("错误: ZIP压缩包中未找到任何Markdown文件 (.md 或 .markdown)。")
print(f"压缩包中的可用文件列表: {archive.namelist()}") print(f"压缩包中的可用文件列表: {archive.namelist()}")
return None return None
md_content_bytes = archive.read(filename_in_zip) # 统一检查最终确定的文件是否存在于压缩包中
print(f"文件 '{filename_in_zip}' 已找到并读取。") if target_md_filename not in archive.namelist():
print(f"错误: 文件 '{target_md_filename}' 在ZIP压缩包中未找到。")
print(f"压缩包中的可用文件列表: {archive.namelist()}")
return None
# --- 后续代码使用 target_md_filename ---
print(f"正在读取文件 '{target_md_filename}'...")
md_content_bytes = archive.read(target_md_filename)
print(f"文件 '{target_md_filename}' 已读取。")
md_content_text = md_content_bytes.decode(encoding) md_content_text = md_content_bytes.decode(encoding)
print(f"文件内容已使用 '{encoding}' 编码成功解码。") print(f"文件内容已使用 '{encoding}' 编码成功解码。")
# --- 新增:处理图片 ---
print("开始处理Markdown中的图片...") print("开始处理Markdown中的图片...")
# 获取Markdown文件在ZIP包内的基本目录,用于解析相对图片路径 # 获取Markdown文件在ZIP包内的基本目录
# 例如,如果 filename_in_zip 是 "docs/guide/full.md", base_md_path_in_zip 是 "docs/guide" base_md_path_in_zip = os.path.dirname(target_md_filename)
# 如果 filename_in_zip 是 "full.md", base_md_path_in_zip 是 ""
base_md_path_in_zip = os.path.dirname(filename_in_zip)
def replace_image_with_base64(match): def replace_image_with_base64(match):
alt_text = match.group(1) alt_text = match.group(1)
original_image_path = match.group(2) original_image_path = match.group(2)
# 检查是否是外部链接或已经是data URI
if original_image_path.startswith(('http://', 'https://', 'data:')): if original_image_path.startswith(('http://', 'https://', 'data:')):
print(f" 跳过外部或已内联图片: {original_image_path}") # print(f" 跳过外部或已内联图片: {original_image_path}")
return match.group(0) # 返回原始匹配 return match.group(0)
# 构建图片在ZIP文件中的绝对路径
# os.path.join 会正确处理 base_md_path_in_zip 为空字符串的情况
image_path_in_zip = os.path.join(base_md_path_in_zip, original_image_path) image_path_in_zip = os.path.join(base_md_path_in_zip, original_image_path)
# zipfile 使用正斜杠并且路径是相对于zip根目录的os.path.normpath确保路径格式正确
image_path_in_zip = os.path.normpath(image_path_in_zip).replace(os.sep, '/') image_path_in_zip = os.path.normpath(image_path_in_zip).replace(os.sep, '/')
# 确保路径不是以 './' 开头如果filename_in_zip在根目录且图片路径也是相对的
if image_path_in_zip.startswith('./'): if image_path_in_zip.startswith('./'):
image_path_in_zip = image_path_in_zip[2:] image_path_in_zip = image_path_in_zip[2:]
# print(f" 尝试内联图片: '{original_image_path}' (解析为ZIP内路径: '{image_path_in_zip}')")
try: try:
image_bytes = archive.read(image_path_in_zip) image_bytes = archive.read(image_path_in_zip)
# 猜测MIME类型
mime_type, _ = mimetypes.guess_type(image_path_in_zip) mime_type, _ = mimetypes.guess_type(image_path_in_zip)
if not mime_type: if not mime_type:
# 备用:根据扩展名手动判断一些常见类型
ext = os.path.splitext(image_path_in_zip)[1].lower() ext = os.path.splitext(image_path_in_zip)[1].lower()
if ext == '.png': mime_map = {'.png': 'image/png', '.jpg': 'image/jpeg', '.jpeg': 'image/jpeg',
mime_type = 'image/png' '.gif': 'image/gif', '.svg': 'image/svg+xml', '.webp': 'image/webp'}
elif ext in ['.jpg', '.jpeg']: mime_type = mime_map.get(ext)
mime_type = 'image/jpeg'
elif ext == '.gif': if not mime_type:
mime_type = 'image/gif'
elif ext == '.svg':
mime_type = 'image/svg+xml'
elif ext == '.webp':
mime_type = 'image/webp'
else:
print(f" 警告: 无法确定图片 '{image_path_in_zip}' 的MIME类型。跳过内联。") print(f" 警告: 无法确定图片 '{image_path_in_zip}' 的MIME类型。跳过内联。")
return match.group(0) # 返回原始匹配 return match.group(0)
base64_encoded_data = base64.b64encode(image_bytes).decode('utf-8') base64_encoded_data = base64.b64encode(image_bytes).decode('utf-8')
new_image_tag = f"![{alt_text}](data:{mime_type};base64,{base64_encoded_data})" new_image_tag = f"![{alt_text}](data:{mime_type};base64,{base64_encoded_data})"
# print(f" 成功内联图片: {original_image_path} -> data:{mime_type[:20]}...")
return new_image_tag return new_image_tag
except KeyError: except KeyError:
print(f" 警告: 图片 '{image_path_in_zip}' 在ZIP压缩包中未找到。原始链接将被保留。") print(f" 警告: 图片 '{image_path_in_zip}' 在ZIP压缩包中未找到。原始链接将被保留。")
return match.group(0) # 图片不在zip中返回原始匹配 return match.group(0)
except Exception as e_img: except Exception as e_img:
print(f" 错误: 处理图片 '{image_path_in_zip}' 时发生错误: {e_img}。原始链接将被保留。") print(f" 错误: 处理图片 '{image_path_in_zip}' 时发生错误: {e_img}。原始链接将被保留。")
return match.group(0) return match.group(0)
# 正则表达式查找Markdown图片: ![alt text](path/to/image.ext)
# 修改了正则表达式使其不贪婪地匹配alt文本和路径
image_regex = r"!\[(.*?)\]\((.*?)\)" image_regex = r"!\[(.*?)\]\((.*?)\)"
modified_md_content = re.sub(image_regex, replace_image_with_base64, md_content_text) modified_md_content = re.sub(image_regex, replace_image_with_base64, md_content_text)
@@ -192,6 +206,7 @@ def unembed_base64_images_to_zip(markdown:str,markdown_name:str,image_folder_nam
image_folder = os.path.join(temp_dir, image_folder_name) image_folder = os.path.join(temp_dir, image_folder_name)
os.makedirs(image_folder, exist_ok=True) os.makedirs(image_folder, exist_ok=True)
pattern = r"!\[(.*?)\]\(data:(.*?);.*base64,(.*)\)" pattern = r"!\[(.*?)\]\(data:(.*?);.*base64,(.*)\)"
def unembed_base64_images(match: re.Match) -> str: def unembed_base64_images(match: re.Match) -> str:
b64data = match.group(3) b64data = match.group(3)
extension = mimetypes.guess_extension(match.group(2)) extension = mimetypes.guess_extension(match.group(2))
@@ -202,6 +217,7 @@ def unembed_base64_images_to_zip(markdown:str,markdown_name:str,image_folder_nam
with open(os.path.join(image_folder, image_name), "wb") as f: with open(os.path.join(image_folder, image_name), "wb") as f:
f.write(base64.b64decode(b64data)) f.write(base64.b64decode(b64data))
return f"![{match.group(1)}]({url})" return f"![{match.group(1)}]({url})"
modified_md_content = re.sub(pattern, unembed_base64_images, markdown) modified_md_content = re.sub(pattern, unembed_base64_images, markdown)
with open(os.path.join(temp_dir, f"{markdown_name}"), "w", encoding="utf-8") as f: with open(os.path.join(temp_dir, f"{markdown_name}"), "w", encoding="utf-8") as f:
f.write(modified_md_content) f.write(modified_md_content)
@@ -216,4 +232,3 @@ def unembed_base64_images_to_zip(markdown:str,markdown_name:str,image_folder_nam
if __name__ == '__main__': if __name__ == '__main__':
pass pass

View File

@@ -11,6 +11,7 @@ from docutranslate.global_values.conditional_import import DOCLING_EXIST
from docutranslate.glossary.glossary import Glossary from docutranslate.glossary.glossary import Glossary
from docutranslate.ir.document import Document from docutranslate.ir.document import Document
from docutranslate.ir.markdown_document import MarkdownDocument from docutranslate.ir.markdown_document import MarkdownDocument
from docutranslate.utils.markdown_utils import embed_inline_image_from_zip
if DOCLING_EXIST: if DOCLING_EXIST:
from docutranslate.converter.x2md.converter_docling import ConverterDoclingConfig, ConverterDocling from docutranslate.converter.x2md.converter_docling import ConverterDoclingConfig, ConverterDocling
@@ -38,7 +39,8 @@ class MarkdownBasedWorkflow(Workflow[MarkdownBasedWorkflowConfig, Document, Mark
HTMLExportable[MD2HTMLExporterConfig], HTMLExportable[MD2HTMLExporterConfig],
MDFormatsExportable[ExporterConfig]): MDFormatsExportable[ExporterConfig]):
_converter_factory: dict[ _converter_factory: dict[
ConvertEngineType, Tuple[Type[X2MarkdownConverter|ConverterIdentity], Type[X2MarkdownConverterConfig]] | None] = { ConvertEngineType, Tuple[Type[X2MarkdownConverter | ConverterIdentity], Type[
X2MarkdownConverterConfig]] | None] = {
"mineru": (ConverterMineru, ConverterMineruConfig), "mineru": (ConverterMineru, ConverterMineruConfig),
"identity": (ConverterIdentity, None) "identity": (ConverterIdentity, None)
} }
@@ -58,6 +60,8 @@ class MarkdownBasedWorkflow(Workflow[MarkdownBasedWorkflowConfig, Document, Mark
if self.document_original is None: if self.document_original is None:
raise RuntimeError("File has not been read yet. Call read_path or read_bytes first.") raise RuntimeError("File has not been read yet. Call read_path or read_bytes first.")
if self.document_original.suffix.lower() == ".zip":
self.document_original = self._get_md_from_zip(self.document_original)
# 获取缓存的解析后文件 # 获取缓存的解析后文件
document_cached = md_based_convert_cacher.get_cached_result(self.document_original, convert_engin, document_cached = md_based_convert_cacher.get_cached_result(self.document_original, convert_engin,
convert_config) convert_config)
@@ -83,8 +87,15 @@ class MarkdownBasedWorkflow(Workflow[MarkdownBasedWorkflowConfig, Document, Mark
return document_md return document_md
def _get_md_from_zip(self, document: Document) -> Document:
    """Turn a ZIP document into a single Markdown document.

    The archive bytes are passed to ``embed_inline_image_from_zip`` without a
    file name, so that helper selects the Markdown file itself and returns its
    text with images inlined as Base64 data URIs.

    Args:
        document: The input document; its suffix must be ``.zip``
            (compared case-insensitively).

    Returns:
        A document created through ``document.from_bytes`` with suffix
        ``.md`` and the same stem as the input.

    Raises:
        ValueError: If the helper returns ``None``, which it does when no
            usable Markdown file is found in the archive.
    """
    # Internal invariant: the visible caller only dispatches here after
    # checking the suffix itself.
    assert document.suffix.lower() == ".zip"
    self.logger.info("传入zip文件正在自动组合markdown文本与图片")
    markdown_text = embed_inline_image_from_zip(document.content)
    if markdown_text is None:
        # Without this guard the failure surfaces as an opaque
        # "'NoneType' object has no attribute 'encode'" AttributeError.
        raise ValueError(
            f"Could not extract Markdown content from ZIP document '{document.stem}': "
            "no usable Markdown file (.md or .markdown) was found in the archive."
        )
    content_byte = markdown_text.encode("utf-8")
    return document.from_bytes(content_byte, suffix=".md", stem=document.stem)
def _pre_translate(self, document: Document): def _pre_translate(self, document: Document):
convert_engine: ConvertEngineType = "identity" if document.suffix == ".md" else self.convert_engine convert_engine: ConvertEngineType = "identity" if document.suffix.lower() in [".md", ".markdown",
".zip"] else self.convert_engine
convert_config = self.config.converter_config convert_config = self.config.converter_config
translator_config = self.config.translator_config translator_config = self.config.translator_config
translator = MDTranslator(translator_config) translator = MDTranslator(translator_config)

View File

@@ -1,5 +1,9 @@
更新日志 更新日志
---------------- ----------------
v1.4.10a2版 2025.10.13
优化
- 优化docx翻译效果与稳定性
----------------
v1.4.9版 2025.10.10 v1.4.9版 2025.10.10
特性 特性
- docx翻译增加页眉页脚支持 - docx翻译增加页眉页脚支持