允许保存不内嵌图片的markdown(文件夹形式)
This commit is contained in:
@@ -173,8 +173,9 @@ translater = FileTranslater(convert_engin="mineru", # 使用mineru解析文档
|
|||||||
translater.read_file("<文件路径>").save_as_html()#保存
|
translater.read_file("<文件路径>").save_as_html()#保存
|
||||||
translater.read_file("<文件路径>").export_to_html()#输出字符串
|
translater.read_file("<文件路径>").export_to_html()#输出字符串
|
||||||
# 文件转markdown
|
# 文件转markdown
|
||||||
translater.read_file("<文件路径>").save_as_markdown()#保存
|
translater.read_file("<文件路径>").save_as_markdown()#保存内嵌base64图片的markdown
|
||||||
translater.read_file("<文件路径>").export_to_markdown()#输出字符串
|
translater.read_file("<文件路径>").save_as_markdown(embeded=False)#保存不内嵌图片的markdown(文件夹形式)
|
||||||
|
translater.read_file("<文件路径>").export_to_markdown()#输出内嵌图片的markdown字符串
|
||||||
```
|
```
|
||||||
|
|
||||||
## 参数说明
|
## 参数说明
|
||||||
@@ -262,6 +263,7 @@ from docutranslate import FileTranslater
|
|||||||
translater = FileTranslater(base_url="<baseurl>",
|
translater = FileTranslater(base_url="<baseurl>",
|
||||||
key="<key>",
|
key="<key>",
|
||||||
model_id="<model-id>", # 使用的模型id
|
model_id="<model-id>", # 使用的模型id
|
||||||
|
convert_engin="docling", # 使用docling
|
||||||
docling_artifact=r"C:\Users\<user>\.cache\docling\models"
|
docling_artifact=r"C:\Users\<user>\.cache\docling\models"
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|||||||
@@ -1,5 +1,7 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
import html
|
import html
|
||||||
|
import io
|
||||||
|
import zipfile
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Literal
|
from typing import Literal
|
||||||
import markdown2
|
import markdown2
|
||||||
@@ -9,7 +11,8 @@ from docutranslate.agents import MDRefineAgent, MDTranslateAgent
|
|||||||
from docutranslate.cacher import document_cacher_global
|
from docutranslate.cacher import document_cacher_global
|
||||||
from docutranslate.converter import Document, ConverterMineru
|
from docutranslate.converter import Document, ConverterMineru
|
||||||
from docutranslate.utils.markdown_splitter import split_markdown_text, join_markdown_texts
|
from docutranslate.utils.markdown_splitter import split_markdown_text, join_markdown_texts
|
||||||
from docutranslate.utils.markdown_utils import uris2placeholder, placeholder2_uris, MaskDict, clean_markdown_math_block
|
from docutranslate.utils.markdown_utils import uris2placeholder, placeholder2_uris, MaskDict, clean_markdown_math_block, \
|
||||||
|
unembed_base64_images_to_zip
|
||||||
from docutranslate.logger import translater_logger
|
from docutranslate.logger import translater_logger
|
||||||
from docutranslate.global_values import available_packages
|
from docutranslate.global_values import available_packages
|
||||||
from docutranslate.utils.resource_utils import resource_path
|
from docutranslate.utils.resource_utils import resource_path
|
||||||
@@ -18,16 +21,18 @@ DOCLING_FLAG = True if available_packages.get("docling") else False
|
|||||||
if DOCLING_FLAG:
|
if DOCLING_FLAG:
|
||||||
from docutranslate.converter import ConverterDocling
|
from docutranslate.converter import ConverterDocling
|
||||||
|
|
||||||
default_params={
|
default_params = {
|
||||||
"chunk_size":3000,
|
"chunk_size": 3000,
|
||||||
"concurrent":30,
|
"concurrent": 30,
|
||||||
"temperature":0.7,
|
"temperature": 0.7,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
class FileTranslater:
|
class FileTranslater:
|
||||||
def __init__(self, file_path: Path | str | None = None, chunk_size: int = default_params["chunk_size"],
|
def __init__(self, file_path: Path | str | None = None, chunk_size: int = default_params["chunk_size"],
|
||||||
base_url:str|None=None, key=None, model_id:str|None=None, temperature=default_params["temperature"],
|
base_url: str | None = None, key=None, model_id: str | None = None,
|
||||||
concurrent:int=default_params["concurrent"], timeout=2000,
|
temperature=default_params["temperature"],
|
||||||
|
concurrent: int = default_params["concurrent"], timeout=2000,
|
||||||
convert_engin: Literal["docling", "mineru"] = "mineru",
|
convert_engin: Literal["docling", "mineru"] = "mineru",
|
||||||
docling_artifact: Path | str | None = None,
|
docling_artifact: Path | str | None = None,
|
||||||
mineru_token: str = None, cache=True):
|
mineru_token: str = None, cache=True):
|
||||||
@@ -37,7 +42,7 @@ class FileTranslater:
|
|||||||
self.markdown: str = ""
|
self.markdown: str = ""
|
||||||
self.chunk_size = chunk_size
|
self.chunk_size = chunk_size
|
||||||
self.concurrent = concurrent
|
self.concurrent = concurrent
|
||||||
self.base_url= base_url
|
self.base_url = base_url
|
||||||
self.key = key if key is not None else "xx"
|
self.key = key if key is not None else "xx"
|
||||||
self.model_id = model_id
|
self.model_id = model_id
|
||||||
self.temperature = temperature
|
self.temperature = temperature
|
||||||
@@ -145,7 +150,7 @@ class FileTranslater:
|
|||||||
def read_document(self, document: Document, formula: bool, code: bool, save: bool,
|
def read_document(self, document: Document, formula: bool, code: bool, save: bool,
|
||||||
save_format: Literal["markdown", "html"], refine: bool,
|
save_format: Literal["markdown", "html"], refine: bool,
|
||||||
refine_agent: Agent | None):
|
refine_agent: Agent | None):
|
||||||
self.document=document
|
self.document = document
|
||||||
self.markdown = self._convert2markdown(document, formula=formula, code=code, artifact=self.docling_artifact)
|
self.markdown = self._convert2markdown(document, formula=formula, code=code, artifact=self.docling_artifact)
|
||||||
if refine:
|
if refine:
|
||||||
self.refine_markdown_by_agent(refine_agent)
|
self.refine_markdown_by_agent(refine_agent)
|
||||||
@@ -193,7 +198,7 @@ class FileTranslater:
|
|||||||
if file_path:
|
if file_path:
|
||||||
document = Document(path=file_path)
|
document = Document(path=file_path)
|
||||||
else:
|
else:
|
||||||
document=self.document
|
document = self.document
|
||||||
if document is None:
|
if document is None:
|
||||||
raise Exception("未读取文件")
|
raise Exception("未读取文件")
|
||||||
translater_logger.info(f"读取文件:{document.filename}")
|
translater_logger.info(f"读取文件:{document.filename}")
|
||||||
@@ -207,7 +212,7 @@ class FileTranslater:
|
|||||||
if file_path:
|
if file_path:
|
||||||
document = Document(path=file_path)
|
document = Document(path=file_path)
|
||||||
else:
|
else:
|
||||||
document=self.document
|
document = self.document
|
||||||
if document is None:
|
if document is None:
|
||||||
raise Exception("未读取文件")
|
raise Exception("未读取文件")
|
||||||
translater_logger.info(f"读取文件:{document.filename}")
|
translater_logger.info(f"读取文件:{document.filename}")
|
||||||
@@ -277,28 +282,38 @@ class FileTranslater:
|
|||||||
translater_logger.info("翻译完成")
|
translater_logger.info("翻译完成")
|
||||||
return self.markdown
|
return self.markdown
|
||||||
|
|
||||||
def save_as_markdown(self, filename: str | Path | None = None, output_dir: str | Path = "./output", embeded=True):
    """Write the current markdown to disk.

    Args:
        filename: Name of the markdown file. Defaults to
            "<document stem>.md" when omitted.
        output_dir: Directory that receives the output; it is created
            (including parents) when missing.
        embeded: When true, a single markdown file is written using the text
            returned by ``export_to_markdown()``. When false, the archive
            returned by ``export_to_unembed_markdown()`` is unpacked into
            ``output_dir`` instead.

    Returns:
        ``self``, so calls can be chained.
    """
    filename = Path(f"{self.document.stem}.md") if filename is None else Path(filename)
    output_dir = Path(output_dir)
    # Make sure the output directory exists.
    output_dir.mkdir(parents=True, exist_ok=True)
    if embeded:
        full_name = output_dir / filename
        # Explicit UTF-8: the platform default encoding may be unable to
        # represent the characters of a translated document.
        with open(full_name, "w", encoding="utf-8") as file:
            file.write(self.export_to_markdown())
        translater_logger.info(f"文件已写入{full_name.resolve()}")
    else:
        # Forward the requested name so the extracted folder and markdown
        # file honour `filename` instead of always using the document stem.
        with zipfile.ZipFile(self.export_to_unembed_markdown(filename.name)) as zip_ref:
            zip_ref.extractall(output_dir)
        translater_logger.info(f"文件已写入{(output_dir / filename.stem).resolve()}")
    return self
|
||||||
|
|
||||||
def export_to_markdown(self) -> str:
    """Return the markdown text after running the pre-output formatting pass."""
    # Format before handing the text out; the formatter updates self.markdown.
    self._markdown_format()
    formatted_markdown = self.markdown
    return formatted_markdown
|
||||||
|
|
||||||
|
def export_to_unembed_markdown(self, filename: str | Path | None = None) -> io.BytesIO:
    """Return the formatted markdown packed by ``unembed_base64_images_to_zip``.

    Args:
        filename: Name given to the markdown file inside the archive; its stem
            is used as the folder name. Defaults to "<document stem>.md".

    Returns:
        The ``io.BytesIO`` produced by ``unembed_base64_images_to_zip``.
    """
    if filename is None:
        md_path = Path(f"{self.document.stem}.md")
    elif isinstance(filename, str):
        md_path = Path(filename)
    else:
        md_path = filename
    # Format before export so the archive holds the cleaned-up text.
    self._markdown_format()
    return unembed_base64_images_to_zip(
        self.markdown,
        folder_name=str(md_path.stem),
        markdown_name=str(md_path),
    )
|
||||||
|
|
||||||
def save_as_html(self, filename: str | Path | None = None, output_dir: str | Path = "./output"):
|
def save_as_html(self, filename: str | Path | None = None, output_dir: str | Path = "./output"):
|
||||||
if isinstance(filename, str):
|
if isinstance(filename, str):
|
||||||
filename = Path(filename)
|
filename = Path(filename)
|
||||||
@@ -340,7 +355,8 @@ class FileTranslater:
|
|||||||
});
|
});
|
||||||
</script>""" if cdn else r"""
|
</script>""" if cdn else r"""
|
||||||
<script>
|
<script>
|
||||||
document.addEventListener("DOMContentLoaded", function () {
|
document.addEventListener("DOMContentLoaded", function
|
||||||
|
() {
|
||||||
renderMathInElement(document.body, {
|
renderMathInElement(document.body, {
|
||||||
delimiters: [
|
delimiters: [
|
||||||
{left: '$$', right: '$$', display: true},
|
{left: '$$', right: '$$', display: true},
|
||||||
@@ -385,7 +401,7 @@ class FileTranslater:
|
|||||||
|
|
||||||
if save:
|
if save:
|
||||||
if output_format == "markdown":
|
if output_format == "markdown":
|
||||||
self.save_as_markdown(f"{self.document.stem}_{to_lang}.md",output_dir=output_dir)
|
self.save_as_markdown(f"{self.document.stem}_{to_lang}.md", output_dir=output_dir)
|
||||||
elif output_format == "html":
|
elif output_format == "html":
|
||||||
self.save_as_html(f"{self.document.stem}_{to_lang}.html", output_dir=output_dir)
|
self.save_as_html(f"{self.document.stem}_{to_lang}.html", output_dir=output_dir)
|
||||||
return self
|
return self
|
||||||
|
|||||||
@@ -6,6 +6,8 @@ import re
|
|||||||
import threading
|
import threading
|
||||||
import uuid
|
import uuid
|
||||||
import zipfile
|
import zipfile
|
||||||
|
from pathlib import Path
|
||||||
|
import tempfile
|
||||||
|
|
||||||
|
|
||||||
class MaskDict:
|
class MaskDict:
|
||||||
@@ -161,6 +163,35 @@ def embed_inline_image_from_zip(zip_bytes: bytes, filename_in_zip: str, encoding
|
|||||||
return modified_md_content
|
return modified_md_content
|
||||||
|
|
||||||
|
|
||||||
|
def unembed_base64_images_to_zip(markdown: str, folder_name: str, markdown_name: str,
                                 image_folder_name="images") -> io.BytesIO:
    """Move inline base64 images out of a markdown text and zip the result.

    Every ``![alt](data:<mime>;...base64,<payload>)`` image is decoded into
    ``<folder_name>/<image_folder_name>/<id><ext>`` inside the archive and its
    link is rewritten to ``./<image_folder_name>/<id><ext>``. The rewritten
    markdown is stored as ``<folder_name>/<markdown_name>`` (UTF-8).

    Args:
        markdown: Markdown text that may contain data-URI images.
        folder_name: Top-level folder inside the archive.
        markdown_name: File name of the markdown inside ``folder_name``.
        image_folder_name: Sub-folder that receives the extracted images.

    Returns:
        An ``io.BytesIO`` holding the zip archive, rewound to position 0.

    Raises:
        binascii.Error: If an image payload is not valid base64.
    """
    # Alt text may not cross a "]" and the payload is limited to the base64
    # alphabet, so each match covers exactly one image even when several
    # images (or ordinary links) share a line.
    pattern = re.compile(
        r"!\[([^\]\n]*)\]\(data:([^;,)\s]*);[^,)\n]*?base64,([A-Za-z0-9+/=]*)\)"
    )
    used_names = set()
    zip_buffer = io.BytesIO()
    # Entries are written straight into the archive; no temporary files needed.
    with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zipf:

        def _extract_image(match: re.Match) -> str:
            alt_text, mime_type, b64data = match.groups()
            # guess_extension returns None for unknown types; fall back to a
            # neutral suffix instead of a file name ending in "None".
            extension = mimetypes.guess_extension(mime_type, strict=False) or ".bin"
            image_name = f"{uuid.uuid1().hex[:8]}{extension}"
            while image_name in used_names:  # guard against truncated-id clashes
                image_name = f"{uuid.uuid4().hex[:8]}{extension}"
            used_names.add(image_name)
            zipf.writestr(
                f"{folder_name}/{image_folder_name}/{image_name}",
                base64.b64decode(b64data),
            )
            url = f"./{image_folder_name}/{image_name}"
            return f"![{alt_text}]({url})"

        modified_md_content = pattern.sub(_extract_image, markdown)
        # Explicit UTF-8 keeps the output independent of the platform locale.
        zipf.writestr(f"{folder_name}/{markdown_name}", modified_md_content.encode("utf-8"))
    # Rewind so callers can read() the buffer as well as pass it to ZipFile.
    zip_buffer.seek(0)
    return zip_buffer
|
||||||
|
|
||||||
def clean_markdown_math_block(markdown):
|
def clean_markdown_math_block(markdown):
|
||||||
"""清除公式块的多余空格字符"""
|
"""清除公式块的多余空格字符"""
|
||||||
|
|
||||||
@@ -173,9 +204,7 @@ def clean_markdown_math_block(markdown):
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # Manual smoke test: unpack the inline images of a local sample file.
    sample_path = Path(r"C:\Users\jxgm\Desktop\translate\docutranslate\tests\files\test7.md")
    with open(sample_path, 'r', encoding="utf-8") as f:
        markdown = f.read()
    # folder_name and markdown_name are required arguments; derive both from
    # the sample file's own name.
    print(unembed_base64_images_to_zip(markdown, folder_name=sample_path.stem,
                                       markdown_name=sample_path.name))
|
|
||||||
|
|||||||
Reference in New Issue
Block a user