允许保存不内嵌图片的markdown(文件夹形式)

This commit is contained in:
xunbu
2025-07-03 16:37:53 +08:00
parent 1e04b24b78
commit 8e2449dafa
3 changed files with 103 additions and 56 deletions

View File

@@ -173,8 +173,9 @@ translater = FileTranslater(convert_engin="mineru", # 使用mineru解析文档
translater.read_file("<文件路径>").save_as_html()#保存 translater.read_file("<文件路径>").save_as_html()#保存
translater.read_file("<文件路径>").export_to_html()#输出字符串 translater.read_file("<文件路径>").export_to_html()#输出字符串
# 文件转markdown # 文件转markdown
translater.read_file("<文件路径>").save_as_markdown()#保存 translater.read_file("<文件路径>").save_as_markdown()#保存内嵌base64图片的markdown
translater.read_file("<文件路径>").export_to_markdown()#输出字符串 translater.read_file("<文件路径>").save_as_markdown(embeded=False)#保存不内嵌图片的markdown文件夹形式
translater.read_file("<文件路径>").export_to_markdown()#输出内嵌图片的markdown字符串
``` ```
## 参数说明 ## 参数说明
@@ -262,6 +263,7 @@ from docutranslate import FileTranslater
translater = FileTranslater(base_url="<baseurl>", translater = FileTranslater(base_url="<baseurl>",
key="<key>", key="<key>",
model_id="<model-id>", # 使用的模型id model_id="<model-id>", # 使用的模型id
convert_engin="docling", # 使用docling
docling_artifact=r"C:\Users\<user>\.cache\docling\models" docling_artifact=r"C:\Users\<user>\.cache\docling\models"
) )
``` ```

View File

@@ -1,5 +1,7 @@
import asyncio import asyncio
import html import html
import io
import zipfile
from pathlib import Path from pathlib import Path
from typing import Literal from typing import Literal
import markdown2 import markdown2
@@ -9,7 +11,8 @@ from docutranslate.agents import MDRefineAgent, MDTranslateAgent
from docutranslate.cacher import document_cacher_global from docutranslate.cacher import document_cacher_global
from docutranslate.converter import Document, ConverterMineru from docutranslate.converter import Document, ConverterMineru
from docutranslate.utils.markdown_splitter import split_markdown_text, join_markdown_texts from docutranslate.utils.markdown_splitter import split_markdown_text, join_markdown_texts
from docutranslate.utils.markdown_utils import uris2placeholder, placeholder2_uris, MaskDict, clean_markdown_math_block from docutranslate.utils.markdown_utils import uris2placeholder, placeholder2_uris, MaskDict, clean_markdown_math_block, \
unembed_base64_images_to_zip
from docutranslate.logger import translater_logger from docutranslate.logger import translater_logger
from docutranslate.global_values import available_packages from docutranslate.global_values import available_packages
from docutranslate.utils.resource_utils import resource_path from docutranslate.utils.resource_utils import resource_path
@@ -18,16 +21,18 @@ DOCLING_FLAG = True if available_packages.get("docling") else False
if DOCLING_FLAG: if DOCLING_FLAG:
from docutranslate.converter import ConverterDocling from docutranslate.converter import ConverterDocling
default_params={ default_params = {
"chunk_size":3000, "chunk_size": 3000,
"concurrent":30, "concurrent": 30,
"temperature":0.7, "temperature": 0.7,
} }
class FileTranslater: class FileTranslater:
def __init__(self, file_path: Path | str | None = None, chunk_size: int = default_params["chunk_size"], def __init__(self, file_path: Path | str | None = None, chunk_size: int = default_params["chunk_size"],
base_url:str|None=None, key=None, model_id:str|None=None, temperature=default_params["temperature"], base_url: str | None = None, key=None, model_id: str | None = None,
concurrent:int=default_params["concurrent"], timeout=2000, temperature=default_params["temperature"],
concurrent: int = default_params["concurrent"], timeout=2000,
convert_engin: Literal["docling", "mineru"] = "mineru", convert_engin: Literal["docling", "mineru"] = "mineru",
docling_artifact: Path | str | None = None, docling_artifact: Path | str | None = None,
mineru_token: str = None, cache=True): mineru_token: str = None, cache=True):
@@ -37,7 +42,7 @@ class FileTranslater:
self.markdown: str = "" self.markdown: str = ""
self.chunk_size = chunk_size self.chunk_size = chunk_size
self.concurrent = concurrent self.concurrent = concurrent
self.base_url= base_url self.base_url = base_url
self.key = key if key is not None else "xx" self.key = key if key is not None else "xx"
self.model_id = model_id self.model_id = model_id
self.temperature = temperature self.temperature = temperature
@@ -145,7 +150,7 @@ class FileTranslater:
def read_document(self, document: Document, formula: bool, code: bool, save: bool, def read_document(self, document: Document, formula: bool, code: bool, save: bool,
save_format: Literal["markdown", "html"], refine: bool, save_format: Literal["markdown", "html"], refine: bool,
refine_agent: Agent | None): refine_agent: Agent | None):
self.document=document self.document = document
self.markdown = self._convert2markdown(document, formula=formula, code=code, artifact=self.docling_artifact) self.markdown = self._convert2markdown(document, formula=formula, code=code, artifact=self.docling_artifact)
if refine: if refine:
self.refine_markdown_by_agent(refine_agent) self.refine_markdown_by_agent(refine_agent)
@@ -193,7 +198,7 @@ class FileTranslater:
if file_path: if file_path:
document = Document(path=file_path) document = Document(path=file_path)
else: else:
document=self.document document = self.document
if document is None: if document is None:
raise Exception("未读取文件") raise Exception("未读取文件")
translater_logger.info(f"读取文件:{document.filename}") translater_logger.info(f"读取文件:{document.filename}")
@@ -207,7 +212,7 @@ class FileTranslater:
if file_path: if file_path:
document = Document(path=file_path) document = Document(path=file_path)
else: else:
document=self.document document = self.document
if document is None: if document is None:
raise Exception("未读取文件") raise Exception("未读取文件")
translater_logger.info(f"读取文件:{document.filename}") translater_logger.info(f"读取文件:{document.filename}")
@@ -277,28 +282,38 @@ class FileTranslater:
translater_logger.info("翻译完成") translater_logger.info("翻译完成")
return self.markdown return self.markdown
def save_as_markdown(self, filename: str | Path | None = None, output_dir: str | Path = "./output"): def save_as_markdown(self, filename: str | Path | None = None, output_dir: str | Path = "./output", embeded=True):
if isinstance(filename, str): if isinstance(filename, str):
filename = Path(filename) filename = Path(filename)
if isinstance(output_dir, str): if isinstance(output_dir, str):
output_dir = Path(output_dir) output_dir = Path(output_dir)
if filename is None: if filename is None:
filename=f"{self.document.stem}.md" filename = Path(f"{self.document.stem}.md")
# 确保输出目录存在 # 确保输出目录存在
output_dir.mkdir(parents=True, exist_ok=True) output_dir.mkdir(parents=True, exist_ok=True)
full_name = output_dir / filename if embeded:
# 输出前格式化markdown full_name = output_dir / filename
self._markdown_format() with open(full_name, "w") as file:
with open(full_name, "w") as file: file.write(self.export_to_markdown())
file.write(self.markdown) translater_logger.info(f"文件已写入{full_name.resolve()}")
translater_logger.info(f"文件已写入{full_name.resolve()}") else:
with zipfile.ZipFile(self.export_to_unembed_markdown()) as zip_ref:
zip_ref.extractall(output_dir)
return self return self
def export_to_markdown(self): def export_to_markdown(self) -> str:
# 输出前格式化markdown # 输出前格式化markdown
self._markdown_format() self._markdown_format()
return self.markdown return self.markdown
def export_to_unembed_markdown(self, filename: str | Path | None = None) -> io.BytesIO:
    """Return the current markdown as an in-memory archive without inline images.

    Args:
        filename: Name of the markdown file inside the archive. When omitted,
            ``<document stem>.md`` is used.

    Returns:
        The ``io.BytesIO`` produced by ``unembed_base64_images_to_zip``; the
        archive folder is named after the stem of ``filename``.
    """
    # Normalise the name to a Path, falling back to the source document's stem.
    target = Path(f"{self.document.stem}.md") if filename is None else Path(filename)
    # Format the markdown before exporting it.
    self._markdown_format()
    return unembed_base64_images_to_zip(
        self.markdown,
        folder_name=str(target.stem),
        markdown_name=str(target),
    )
def save_as_html(self, filename: str | Path | None = None, output_dir: str | Path = "./output"): def save_as_html(self, filename: str | Path | None = None, output_dir: str | Path = "./output"):
if isinstance(filename, str): if isinstance(filename, str):
filename = Path(filename) filename = Path(filename)
@@ -326,33 +341,34 @@ class FileTranslater:
auto_render = f'<script>{resource_path("static/autoRender.js").read_text(encoding='utf-8')}</script>' if not cdn else r"""<script defer src="https://s4.zstatic.net/ajax/libs/KaTeX/0.16.9/contrib/auto-render.min.js" integrity="sha512-iWiuBS5nt6r60fCz26Nd0Zqe0nbk1ZTIQbl3Kv7kYsX+yKMUFHzjaH2+AnM6vp2Xs+gNmaBAVWJjSmuPw76Efg==" crossorigin="anonymous" referrerpolicy="no-referrer"></script>""" auto_render = f'<script>{resource_path("static/autoRender.js").read_text(encoding='utf-8')}</script>' if not cdn else r"""<script defer src="https://s4.zstatic.net/ajax/libs/KaTeX/0.16.9/contrib/auto-render.min.js" integrity="sha512-iWiuBS5nt6r60fCz26Nd0Zqe0nbk1ZTIQbl3Kv7kYsX+yKMUFHzjaH2+AnM6vp2Xs+gNmaBAVWJjSmuPw76Efg==" crossorigin="anonymous" referrerpolicy="no-referrer"></script>"""
# language=javascript # language=javascript
render_math_in_element = r""" render_math_in_element = r"""
<script> <script>
document.addEventListener("DOMContentLoaded", function () { document.addEventListener("DOMContentLoaded", function () {
renderMathInElement(document.body, { renderMathInElement(document.body, {
delimiters: [ delimiters: [
{left: '$$', right: '$$', display: true}, {left: '$$', right: '$$', display: true},
{left: '\\[', right: '\\]', display: true}, {left: '\\[', right: '\\]', display: true},
{left: '$', right: '$', display: false}, {left: '$', right: '$', display: false},
{left: '\\(', right: '\\)', display: false} {left: '\\(', right: '\\)', display: false}
], ],
throwOnError: false throwOnError: false
}) })
}); });
</script>""" if cdn else r""" </script>""" if cdn else r"""
<script> <script>
document.addEventListener("DOMContentLoaded", function () { document.addEventListener("DOMContentLoaded", function
renderMathInElement(document.body, { () {
delimiters: [ renderMathInElement(document.body, {
{left: '$$', right: '$$', display: true}, delimiters: [
{left: '\\[', right: '\\]', display: true}, {left: '$$', right: '$$', display: true},
{left: '$', right: '$', display: false}, {left: '\\[', right: '\\]', display: true},
{left: '\\(', right: '\\)', display: false} {left: '$', right: '$', display: false},
], {left: '\\(', right: '\\)', display: false}
fonts: false, ],
throwOnError: false fonts: false,
}) throwOnError: false
}); })
</script>""" });
</script>"""
mermaid = f'<script>{resource_path("static/mermaid.js").read_text(encoding='utf-8')}</script>' mermaid = f'<script>{resource_path("static/mermaid.js").read_text(encoding='utf-8')}</script>'
if self.document.suffix == ".txt": if self.document.suffix == ".txt":
@@ -385,7 +401,7 @@ class FileTranslater:
if save: if save:
if output_format == "markdown": if output_format == "markdown":
self.save_as_markdown(f"{self.document.stem}_{to_lang}.md",output_dir=output_dir) self.save_as_markdown(f"{self.document.stem}_{to_lang}.md", output_dir=output_dir)
elif output_format == "html": elif output_format == "html":
self.save_as_html(f"{self.document.stem}_{to_lang}.html", output_dir=output_dir) self.save_as_html(f"{self.document.stem}_{to_lang}.html", output_dir=output_dir)
return self return self

View File

@@ -6,6 +6,8 @@ import re
import threading import threading
import uuid import uuid
import zipfile import zipfile
from pathlib import Path
import tempfile
class MaskDict: class MaskDict:
@@ -161,6 +163,35 @@ def embed_inline_image_from_zip(zip_bytes: bytes, filename_in_zip: str, encoding
return modified_md_content return modified_md_content
def unembed_base64_images_to_zip(markdown: str, folder_name: str, markdown_name: str,
                                 image_folder_name="images") -> io.BytesIO:
    """Move inline base64 images out of a markdown string and zip the result.

    Every ``![alt](data:<mime>;base64,<data>)`` image is decoded into its own
    file and the markdown is rewritten to reference it by relative path.

    Archive layout::

        <folder_name>/<markdown_name>
        <folder_name>/<image_folder_name>/<id><ext>

    Args:
        markdown: Markdown text that may contain inline base64 images.
        folder_name: Top-level folder inside the archive.
        markdown_name: File name of the rewritten markdown inside that folder.
        image_folder_name: Sub-folder that receives the extracted images.

    Returns:
        An ``io.BytesIO`` holding the ZIP archive, positioned at offset 0.
        The markdown entry is always encoded as UTF-8.
    """
    # Negated character classes keep each match inside a single image link, so
    # several images on one line (or a preceding non-data image) are never
    # merged into one match. Optional parameters such as ";charset=..." between
    # the MIME type and ";base64," are skipped.
    pattern = re.compile(
        r"!\[([^\]]*)\]"
        r"\(data:([^;,)\s]+)"
        r"(?:;[^;,)]*)*?"
        r";base64,([^)\s]+)\)"
    )
    images: dict[str, bytes] = {}

    def unembed_base64_image(match: re.Match) -> str:
        try:
            payload = base64.b64decode(match.group(3))
        except ValueError:
            # Undecodable payload (binascii.Error is a ValueError): leave the
            # image embedded instead of aborting the whole export.
            return match.group(0)
        mime_type = match.group(2)
        extension = mimetypes.guess_extension(mime_type)
        if extension is None:
            # Unknown MIME type (e.g. the non-standard "image/jpg"): derive the
            # extension from the subtype rather than emitting "...None".
            subtype = mime_type.rpartition("/")[2].partition("+")[0]
            extension = f".{subtype}" if subtype.isalnum() else ".bin"
        image_name = f"{uuid.uuid1().hex[:8]}{extension}"
        while image_name in images:
            # Truncated ids can collide; never overwrite an earlier image.
            image_name = f"{uuid.uuid4().hex[:8]}{extension}"
        images[image_name] = payload
        return f"![{match.group(1)}](./{image_folder_name}/{image_name})"

    modified_md_content = pattern.sub(unembed_base64_image, markdown)

    # Build the archive directly in memory; no temporary directory is needed.
    zip_buffer = io.BytesIO()
    with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zipf:
        zipf.writestr(f"{folder_name}/{markdown_name}", modified_md_content.encode("utf-8"))
        for image_name, payload in images.items():
            zipf.writestr(f"{folder_name}/{image_folder_name}/{image_name}", payload)
    # Rewind so callers can read() the buffer or hand it to ZipFile directly.
    zip_buffer.seek(0)
    return zip_buffer
def clean_markdown_math_block(markdown): def clean_markdown_math_block(markdown):
"""清除公式块的多余空格字符""" """清除公式块的多余空格字符"""
@@ -173,9 +204,7 @@ def clean_markdown_math_block(markdown):
if __name__ == '__main__': if __name__ == '__main__':
markdown = r""" with open(r"C:\Users\jxgm\Desktop\translate\docutranslate\tests\files\test7.md",'r') as f:
$$ markdown=f.read()
R T _ { k } ^ { i } ( t ) = \frac { \sum _ { t ^ { \prime } \in [ t - W , t ] } R R _ { k } ^ { i } ( t ^ { \prime } ) \times D R _ { k } ^ { i } ( t ^ { \prime } ) } { \sum _ { t ^ { \prime } \in [ t - W , t ] } D R _ { k } ^ { i } ( t ^ { \prime } ) } print(unembed_base64_images_to_zip(markdown))
$$
"""
print(clean_markdown_math_block(markdown))