From 78da02ec9c52b4d6c54b43df5000a0312b42531b Mon Sep 17 00:00:00 2001 From: xunbu Date: Sat, 23 Aug 2025 21:31:07 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E5=A4=8Dmarkdown=E5=85=AC=E5=BC=8F?= =?UTF-8?q?=E6=98=BE=E7=A4=BA=E9=94=99=E8=AF=AF=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docutranslate/exporter/md/md2html_exporter.py | 15 ++++----------- .../translator/ai_translator/md_translator.py | 3 --- docutranslate/utils/markdown_splitter.py | 13 ++++++++++++- docutranslate/utils/markdown_utils.py | 10 ---------- 4 files changed, 16 insertions(+), 25 deletions(-) diff --git a/docutranslate/exporter/md/md2html_exporter.py b/docutranslate/exporter/md/md2html_exporter.py index ee06ce2..e0e5515 100644 --- a/docutranslate/exporter/md/md2html_exporter.py +++ b/docutranslate/exporter/md/md2html_exporter.py @@ -46,7 +46,7 @@ class MD2HTMLExporter(MDExporter): }, trust: true, strict: false - }); + }) }); """ @@ -84,14 +84,6 @@ class MD2HTMLExporter(MDExporter): content = document.content.decode() - # ================================================================= - # 步骤 2: 预处理 markdown 内容,确保数学公式块周围有正确的空行 - # 正则表达式 r'(\$\$[\s\S]*?\$\$)' 匹配一个完整的 $$...$$ 块。 - # [\s\S]*? 匹配包括换行符在内的任何字符,并且是非贪婪模式。 - # re.sub 将找到的每个匹配项替换为 `\n\n<匹配项>\n\n`,从而强制添加空行。 - content = re.sub(r'(\$\$[\s\S]*?\$\$)', r'\n\n\1\n\n', content) - # ================================================================= - html_content = markdown.markdown( content, extensions=extensions, @@ -109,12 +101,13 @@ class MD2HTMLExporter(MDExporter): mermaid=mermaid, ) return Document.from_bytes(content=render.encode("utf-8"), suffix=".html", stem=document.stem) + if __name__ == '__main__': from pathlib import Path # d = Document.from_path(r"C:\Users\jxgm\Desktop\mcp文件夹\学习笔记\互联网认证授权机制\互联网认证授权机制.md") - d = Document.from_path(r"C:\Users\jxgm\Desktop\matrixcalc_translated.md") - # d = Document.from_path(r"C:\Users\jxgm\Downloads\3a8d8999-3e9d-4f32-a32c-5b0830bb4320\full.md") + # d = Document.from_path(r"C:\Users\jxgm\Desktop\matrixcalc_translated.md") + d = Document.from_path(r"C:\Users\jxgm\Desktop\full_translated.md") exporter = MD2HTMLExporter() d1 = exporter.export(d) path = Path(r"C:\Users\jxgm\Desktop\a.html") diff --git a/docutranslate/translator/ai_translator/md_translator.py b/docutranslate/translator/ai_translator/md_translator.py index f938573..d69efb7 100644 --- a/docutranslate/translator/ai_translator/md_translator.py +++ b/docutranslate/translator/ai_translator/md_translator.py @@ -9,7 +9,6 @@ from docutranslate.ir.markdown_document import MarkdownDocument from docutranslate.translator.ai_translator.base import AiTranslatorConfig from docutranslate.translator.base import Translator from docutranslate.utils.markdown_splitter import split_markdown_text, join_markdown_texts -from docutranslate.utils.markdown_utils import clean_markdown_math_block @dataclass @@ -44,7 +43,6 @@ class MDTranslator(Translator): # 做一些加强鲁棒性的操作 content = content.replace(r'\(', r'\(') content = content.replace(r'\)', r'\)') - content = clean_markdown_math_block(content) document.content = content.encode() self.logger.info("翻译完成") @@ -62,7 +60,6 @@ class MDTranslator(Translator): # 做一些加强鲁棒性的操作 content = content.replace(r'\(', r'\(') content = content.replace(r'\)', r'\)') - content = clean_markdown_math_block(content) document.content = content.encode() await asyncio.to_thread(run) diff --git a/docutranslate/utils/markdown_splitter.py b/docutranslate/utils/markdown_splitter.py index e9b0ca6..7b3049e 100644 --- a/docutranslate/utils/markdown_splitter.py +++ b/docutranslate/utils/markdown_splitter.py @@ -2,6 +2,8 @@ import re from typing import List + + class MarkdownBlockSplitter: def __init__(self, max_block_size: int = 5000): """ @@ -202,4 +204,13 @@ def join_markdown_texts(markdown_texts: List[str]) -> str: joined_text += separator + current_chunk - return joined_text \ No newline at end of file + return joined_text + + +if __name__ == '__main__': + from pathlib import Path + from docutranslate.utils.markdown_utils import clean_markdown_math_block + content=Path(r"C:\Users\jxgm\Desktop\3a8d8999-3e9d-4f32-a32c-5b0830bb4320\full.md").read_text() + content=split_markdown_text(content) + content=join_markdown_texts(content) + diff --git a/docutranslate/utils/markdown_utils.py b/docutranslate/utils/markdown_utils.py index 4361d99..bf1e263 100644 --- a/docutranslate/utils/markdown_utils.py +++ b/docutranslate/utils/markdown_utils.py @@ -211,16 +211,6 @@ def unembed_base64_images_to_zip(markdown:str,markdown_name:str,image_folder_nam zipf.write(file, file.relative_to(folder_path)) return zip_buffer.getvalue() -def clean_markdown_math_block(markdown): - """清除公式块的多余空格字符""" - - def replace_block(match: re.Match): - return f"{match.group(1).strip()}\n{match.group(2).strip()}\n{match.group(3).lstrip()}" - - pattern = re.compile(r"(^\s*\$\$\s*)\n([\s\S]+?)\n(^\s*\$\$\s*$)", re.MULTILINE) - cleaned_text = pattern.sub(replace_block, markdown) - return cleaned_text - if __name__ == '__main__': pass