修复markdown公式显示错误的问题
This commit is contained in:
@@ -46,7 +46,7 @@ class MD2HTMLExporter(MDExporter):
|
|||||||
},
|
},
|
||||||
trust: true,
|
trust: true,
|
||||||
strict: false
|
strict: false
|
||||||
});
|
})
|
||||||
});
|
});
|
||||||
</script>"""
|
</script>"""
|
||||||
|
|
||||||
@@ -84,14 +84,6 @@ class MD2HTMLExporter(MDExporter):
|
|||||||
|
|
||||||
content = document.content.decode()
|
content = document.content.decode()
|
||||||
|
|
||||||
# =================================================================
|
|
||||||
# 步骤 2: 预处理 markdown 内容,确保数学公式块周围有正确的空行
|
|
||||||
# 正则表达式 r'(\$\$[\s\S]*?\$\$)' 匹配一个完整的 $$...$$ 块。
|
|
||||||
# [\s\S]*? 匹配包括换行符在内的任何字符,并且是非贪婪模式。
|
|
||||||
# re.sub 将找到的每个匹配项替换为 `\n\n<匹配项>\n\n`,从而强制添加空行。
|
|
||||||
content = re.sub(r'(\$\$[\s\S]*?\$\$)', r'\n\n\1\n\n', content)
|
|
||||||
# =================================================================
|
|
||||||
|
|
||||||
html_content = markdown.markdown(
|
html_content = markdown.markdown(
|
||||||
content,
|
content,
|
||||||
extensions=extensions,
|
extensions=extensions,
|
||||||
@@ -109,12 +101,13 @@ class MD2HTMLExporter(MDExporter):
|
|||||||
mermaid=mermaid,
|
mermaid=mermaid,
|
||||||
)
|
)
|
||||||
return Document.from_bytes(content=render.encode("utf-8"), suffix=".html", stem=document.stem)
|
return Document.from_bytes(content=render.encode("utf-8"), suffix=".html", stem=document.stem)
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
# d = Document.from_path(r"C:\Users\jxgm\Desktop\mcp文件夹\学习笔记\互联网认证授权机制\互联网认证授权机制.md")
|
# d = Document.from_path(r"C:\Users\jxgm\Desktop\mcp文件夹\学习笔记\互联网认证授权机制\互联网认证授权机制.md")
|
||||||
d = Document.from_path(r"C:\Users\jxgm\Desktop\matrixcalc_translated.md")
|
# d = Document.from_path(r"C:\Users\jxgm\Desktop\matrixcalc_translated.md")
|
||||||
# d = Document.from_path(r"C:\Users\jxgm\Downloads\3a8d8999-3e9d-4f32-a32c-5b0830bb4320\full.md")
|
d = Document.from_path(r"C:\Users\jxgm\Desktop\full_translated.md")
|
||||||
exporter = MD2HTMLExporter()
|
exporter = MD2HTMLExporter()
|
||||||
d1 = exporter.export(d)
|
d1 = exporter.export(d)
|
||||||
path = Path(r"C:\Users\jxgm\Desktop\a.html")
|
path = Path(r"C:\Users\jxgm\Desktop\a.html")
|
||||||
|
|||||||
@@ -9,7 +9,6 @@ from docutranslate.ir.markdown_document import MarkdownDocument
|
|||||||
from docutranslate.translator.ai_translator.base import AiTranslatorConfig
|
from docutranslate.translator.ai_translator.base import AiTranslatorConfig
|
||||||
from docutranslate.translator.base import Translator
|
from docutranslate.translator.base import Translator
|
||||||
from docutranslate.utils.markdown_splitter import split_markdown_text, join_markdown_texts
|
from docutranslate.utils.markdown_splitter import split_markdown_text, join_markdown_texts
|
||||||
from docutranslate.utils.markdown_utils import clean_markdown_math_block
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -44,7 +43,6 @@ class MDTranslator(Translator):
|
|||||||
# 做一些加强鲁棒性的操作
|
# 做一些加强鲁棒性的操作
|
||||||
content = content.replace(r'\(', r'\(')
|
content = content.replace(r'\(', r'\(')
|
||||||
content = content.replace(r'\)', r'\)')
|
content = content.replace(r'\)', r'\)')
|
||||||
content = clean_markdown_math_block(content)
|
|
||||||
|
|
||||||
document.content = content.encode()
|
document.content = content.encode()
|
||||||
self.logger.info("翻译完成")
|
self.logger.info("翻译完成")
|
||||||
@@ -62,7 +60,6 @@ class MDTranslator(Translator):
|
|||||||
# 做一些加强鲁棒性的操作
|
# 做一些加强鲁棒性的操作
|
||||||
content = content.replace(r'\(', r'\(')
|
content = content.replace(r'\(', r'\(')
|
||||||
content = content.replace(r'\)', r'\)')
|
content = content.replace(r'\)', r'\)')
|
||||||
content = clean_markdown_math_block(content)
|
|
||||||
document.content = content.encode()
|
document.content = content.encode()
|
||||||
|
|
||||||
await asyncio.to_thread(run)
|
await asyncio.to_thread(run)
|
||||||
|
|||||||
@@ -2,6 +2,8 @@ import re
|
|||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class MarkdownBlockSplitter:
|
class MarkdownBlockSplitter:
|
||||||
def __init__(self, max_block_size: int = 5000):
|
def __init__(self, max_block_size: int = 5000):
|
||||||
"""
|
"""
|
||||||
@@ -202,4 +204,13 @@ def join_markdown_texts(markdown_texts: List[str]) -> str:
|
|||||||
|
|
||||||
joined_text += separator + current_chunk
|
joined_text += separator + current_chunk
|
||||||
|
|
||||||
return joined_text
|
return joined_text
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
from pathlib import Path
|
||||||
|
from docutranslate.utils.markdown_utils import clean_markdown_math_block
|
||||||
|
content=Path(r"C:\Users\jxgm\Desktop\3a8d8999-3e9d-4f32-a32c-5b0830bb4320\full.md").read_text()
|
||||||
|
content=split_markdown_text(content)
|
||||||
|
content=join_markdown_texts(content)
|
||||||
|
|
||||||
|
|||||||
@@ -211,16 +211,6 @@ def unembed_base64_images_to_zip(markdown:str,markdown_name:str,image_folder_nam
|
|||||||
zipf.write(file, file.relative_to(folder_path))
|
zipf.write(file, file.relative_to(folder_path))
|
||||||
return zip_buffer.getvalue()
|
return zip_buffer.getvalue()
|
||||||
|
|
||||||
def clean_markdown_math_block(markdown):
|
|
||||||
"""清除公式块的多余空格字符"""
|
|
||||||
|
|
||||||
def replace_block(match: re.Match):
|
|
||||||
return f"{match.group(1).strip()}\n{match.group(2).strip()}\n{match.group(3).lstrip()}"
|
|
||||||
|
|
||||||
pattern = re.compile(r"(^\s*\$\$\s*)\n([\s\S]+?)\n(^\s*\$\$\s*$)", re.MULTILINE)
|
|
||||||
cleaned_text = pattern.sub(replace_block, markdown)
|
|
||||||
return cleaned_text
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
pass
|
pass
|
||||||
|
|||||||
Reference in New Issue
Block a user