允许保存不内嵌图片的markdown(文件夹形式)

This commit is contained in:
xunbu
2025-07-03 16:37:53 +08:00
parent 1e04b24b78
commit 8e2449dafa
3 changed files with 103 additions and 56 deletions

View File

@@ -173,8 +173,9 @@ translater = FileTranslater(convert_engin="mineru", # 使用mineru解析文档
translater.read_file("<文件路径>").save_as_html()#保存
translater.read_file("<文件路径>").export_to_html()#输出字符串
# 文件转markdown
translater.read_file("<文件路径>").save_as_markdown()#保存
translater.read_file("<文件路径>").export_to_markdown()#输出字符串
translater.read_file("<文件路径>").save_as_markdown()#保存内嵌base64图片的markdown
translater.read_file("<文件路径>").save_as_markdown(embeded=False)#保存不内嵌图片的markdown(文件夹形式)
translater.read_file("<文件路径>").export_to_markdown()#输出内嵌图片的markdown字符串
```
## 参数说明
@@ -262,6 +263,7 @@ from docutranslate import FileTranslater
translater = FileTranslater(base_url="<baseurl>",
key="<key>",
model_id="<model-id>", # 使用的模型id
convert_engin="docling", # 使用docling
docling_artifact=r"C:\Users\<user>\.cache\docling\models"
)
```

View File

@@ -1,5 +1,7 @@
import asyncio
import html
import io
import zipfile
from pathlib import Path
from typing import Literal
import markdown2
@@ -9,7 +11,8 @@ from docutranslate.agents import MDRefineAgent, MDTranslateAgent
from docutranslate.cacher import document_cacher_global
from docutranslate.converter import Document, ConverterMineru
from docutranslate.utils.markdown_splitter import split_markdown_text, join_markdown_texts
from docutranslate.utils.markdown_utils import uris2placeholder, placeholder2_uris, MaskDict, clean_markdown_math_block
from docutranslate.utils.markdown_utils import uris2placeholder, placeholder2_uris, MaskDict, clean_markdown_math_block, \
unembed_base64_images_to_zip
from docutranslate.logger import translater_logger
from docutranslate.global_values import available_packages
from docutranslate.utils.resource_utils import resource_path
@@ -24,9 +27,11 @@ default_params={
"temperature": 0.7,
}
class FileTranslater:
def __init__(self, file_path: Path | str | None = None, chunk_size: int = default_params["chunk_size"],
base_url:str|None=None, key=None, model_id:str|None=None, temperature=default_params["temperature"],
base_url: str | None = None, key=None, model_id: str | None = None,
temperature=default_params["temperature"],
concurrent: int = default_params["concurrent"], timeout=2000,
convert_engin: Literal["docling", "mineru"] = "mineru",
docling_artifact: Path | str | None = None,
@@ -277,28 +282,38 @@ class FileTranslater:
translater_logger.info("翻译完成")
return self.markdown
def save_as_markdown(self, filename: str | Path | None = None, output_dir: str | Path = "./output"):
def save_as_markdown(self, filename: str | Path | None = None, output_dir: str | Path = "./output", embeded=True):
if isinstance(filename, str):
filename = Path(filename)
if isinstance(output_dir, str):
output_dir = Path(output_dir)
if filename is None:
filename=f"{self.document.stem}.md"
filename = Path(f"{self.document.stem}.md")
# 确保输出目录存在
output_dir.mkdir(parents=True, exist_ok=True)
if embeded:
full_name = output_dir / filename
# 输出前格式化markdown
self._markdown_format()
with open(full_name, "w") as file:
file.write(self.markdown)
file.write(self.export_to_markdown())
translater_logger.info(f"文件已写入{full_name.resolve()}")
else:
with zipfile.ZipFile(self.export_to_unembed_markdown()) as zip_ref:
zip_ref.extractall(output_dir)
return self
def export_to_markdown(self) -> str:
    """Return the document's Markdown text.

    ``_markdown_format`` is invoked first so that the text handed back is the
    formatted version.
    """
    self._markdown_format()
    formatted_markdown = self.markdown
    return formatted_markdown
def export_to_unembed_markdown(self, filename: str | Path | None = None) -> io.BytesIO:
    """Build a zip archive of the Markdown with its images stored as files.

    Args:
        filename: Name given to the Markdown file inside the archive.
            Defaults to ``"<document stem>.md"``. Only the final path
            component is used, so a directory prefix or an absolute path
            cannot leak into the archive layout.

    Returns:
        The in-memory zip produced by ``unembed_base64_images_to_zip``; the
        archive's top-level folder is named after the filename's stem.
    """
    filename = Path(f"{self.document.stem}.md") if filename is None else Path(filename)
    # Format the Markdown before exporting it.
    self._markdown_format()
    return unembed_base64_images_to_zip(
        self.markdown,
        folder_name=filename.stem,
        markdown_name=filename.name,
    )
def save_as_html(self, filename: str | Path | None = None, output_dir: str | Path = "./output"):
if isinstance(filename, str):
filename = Path(filename)
@@ -340,7 +355,8 @@ class FileTranslater:
});
</script>""" if cdn else r"""
<script>
document.addEventListener("DOMContentLoaded", function () {
document.addEventListener("DOMContentLoaded", function
() {
renderMathInElement(document.body, {
delimiters: [
{left: '$$', right: '$$', display: true},

View File

@@ -6,6 +6,8 @@ import re
import threading
import uuid
import zipfile
from pathlib import Path
import tempfile
class MaskDict:
@@ -161,6 +163,35 @@ def embed_inline_image_from_zip(zip_bytes: bytes, filename_in_zip: str, encoding
return modified_md_content
def unembed_base64_images_to_zip(markdown: str, folder_name: str, markdown_name: str,
                                 image_folder_name="images") -> io.BytesIO:
    """Move base64-embedded images out of a Markdown text and zip the result.

    Every ``![alt](data:<mime>;base64,<payload>)`` image is decoded, stored as
    its own file and replaced in the text by a relative link
    ``./<image_folder_name>/<generated name>``.

    Archive layout::

        <folder_name>/<markdown_name>
        <folder_name>/<image_folder_name>/<generated name><extension>

    Args:
        markdown: Markdown text that may contain base64 data-URI images.
        folder_name: Top-level folder inside the archive.
        markdown_name: File name of the rewritten Markdown inside that folder.
        image_folder_name: Sub-folder that receives the extracted images.

    Returns:
        An ``io.BytesIO`` holding the zip archive, positioned at offset 0.
    """
    # The alt text stops at the first "]" and the payload at the first ")",
    # so several images on one line are matched one by one instead of being
    # swallowed into a single greedy match.
    pattern = re.compile(r"!\[([^\]]*)\]\(data:([^;,)]+);(?:[^,)]*;)?base64,([^)]+)\)")
    images: dict[str, bytes] = {}

    def unembed_base64_images(match: re.Match) -> str:
        try:
            payload = base64.b64decode(match.group(3))
        except ValueError:
            # binascii.Error is a ValueError: leave an undecodable image
            # embedded rather than aborting the whole export.
            return match.group(0)
        mime_type = match.group(2).strip()
        # guess_extension returns None for unknown types; fall back to the
        # non-standard table and finally to a generic suffix so the file
        # name never ends in the literal text "None".
        extension = (mimetypes.guess_extension(mime_type)
                     or mimetypes.guess_extension(mime_type, strict=False)
                     or ".bin")
        image_name = f"{uuid.uuid1().hex[:8]}{extension}"
        while image_name in images:  # only 8 hex digits are kept; never overwrite
            image_name = f"{uuid.uuid1().hex[:8]}{extension}"
        images[image_name] = payload
        return f"![{match.group(1)}](./{image_folder_name}/{image_name})"

    modified_md_content = pattern.sub(unembed_base64_images, markdown)

    zip_buffer = io.BytesIO()
    with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zipf:
        # Entries are written straight into the archive; the Markdown is
        # encoded as UTF-8 explicitly so the result is locale-independent.
        zipf.writestr(f"{folder_name}/{markdown_name}", modified_md_content.encode("utf-8"))
        for image_name, payload in images.items():
            zipf.writestr(f"{folder_name}/{image_folder_name}/{image_name}", payload)
    # Rewind so callers can read() the archive directly.
    zip_buffer.seek(0)
    return zip_buffer
def clean_markdown_math_block(markdown):
"""清除公式块的多余空格字符"""
@@ -173,9 +204,7 @@ def clean_markdown_math_block(markdown):
if __name__ == '__main__':
    # Manual smoke test 1: clean up the spacing of a math block.
    markdown = r"""
$$
R T _ { k } ^ { i } ( t ) = \frac { \sum _ { t ^ { \prime } \in [ t - W , t ] } R R _ { k } ^ { i } ( t ^ { \prime } ) \times D R _ { k } ^ { i } ( t ^ { \prime } ) } { \sum _ { t ^ { \prime } \in [ t - W , t ] } D R _ { k } ^ { i } ( t ^ { \prime } ) }
$$
"""
    print(clean_markdown_math_block(markdown))
    # Manual smoke test 2: split embedded images out of a local Markdown file.
    # NOTE(review): developer-specific absolute path; adjust before running.
    with open(r"C:\Users\jxgm\Desktop\translate\docutranslate\tests\files\test7.md", 'r', encoding="utf-8") as f:
        markdown = f.read()
    # folder_name and markdown_name are required arguments; omitting them
    # raises TypeError.
    print(unembed_base64_images_to_zip(markdown, folder_name="test7", markdown_name="test7.md"))