修复md分隔符问题

2025-11-25 13:32:13 +08:00
parent 795594f5d9
commit f6d707fc73
2 changed files with 42 additions and 87 deletions
--- a/docutranslate/translator/ai_translator/md_translator.py
+++ b/docutranslate/translator/ai_translator/md_translator.py
@@ -1,7 +1,6 @@
 # SPDX-FileCopyrightText: 2025 QinHan
 # SPDX-License-Identifier: MPL-2.0
 import asyncio
 import re
 from dataclasses import dataclass
 from typing import Self, List
@@ -10,7 +9,8 @@ from docutranslate.agents.markdown_agent import MDTranslateAgentConfig
 from docutranslate.context.md_mask_context import MDMaskUrisContext
 from docutranslate.ir.markdown_document import MarkdownDocument
 from docutranslate.translator.ai_translator.base import AiTranslatorConfig, AiTranslator
-from docutranslate.utils.markdown_splitter import split_markdown_text, join_markdown_texts
+# 引入新的 is_placeholder 函数
 from docutranslate.utils.markdown_splitter import split_markdown_text, join_markdown_texts, is_placeholder
@dataclass
@@ -39,32 +39,24 @@ class MDTranslator(AiTranslator):
                                                  system_proxy_enable=config.system_proxy_enable)
            self.translate_agent = MDTranslateAgent(agent_config)
    @staticmethod
    def _is_placeholder(text: str) -> bool:
        """检查文本块是否仅包含图片占位符"""
        # 匹配 <ph-xxxxxx> 格式，允许前后有空白
        return bool(re.match(r'^\s*<ph-[a-zA-Z0-9]+>\s*$', text))
    def translate(self, document: MarkdownDocument) -> Self:
        self.logger.info("正在翻译markdown")
        with MDMaskUrisContext(document):
            chunks: list[str] = split_markdown_text(document.content.decode(), self.chunk_size)
            # 预处理：分离出需要翻译的文本块和不需要翻译的占位符块
            translate_indices: List[int] = []
            translate_chunks: List[str] = []
-            final_result: List[str] = list(chunks)  # 浅拷贝，预填充原始值
+            final_result: List[str] = list(chunks)
            for i, chunk in enumerate(chunks):
-                if self._is_placeholder(chunk):
+                # 直接使用 splitter 中定义的函数
-                    # 如果是占位符，不需要处理，final_result中该位置保持原样
+                if is_placeholder(chunk):
                    continue
                else:
                    translate_indices.append(i)
                    translate_chunks.append(chunk)
            if self.glossary_agent and translate_chunks:
                # 仅对需要翻译的文本提取术语
                self.glossary_dict_gen = self.glossary_agent.send_segments(translate_chunks, self.chunk_size)
                if self.translate_agent:
                    self.translate_agent.update_glossary_dict(self.glossary_dict_gen)
@@ -73,13 +65,9 @@ class MDTranslator(AiTranslator):
            if self.translate_agent and translate_chunks:
                translated_sub_results: list[str] = self.translate_agent.send_chunks(translate_chunks)
                # 将翻译结果回填到对应位置
                for idx, translated_text in zip(translate_indices, translated_sub_results):
                    final_result[idx] = translated_text
            # 如果没有翻译代理或者没有需要翻译的块，final_result 已经包含了正确的内容（原始chunks）
            content = join_markdown_texts(final_result)
            # 做一些加强鲁棒性的操作
            content = content.replace(r'\（', r'\(')
@@ -94,13 +82,12 @@ class MDTranslator(AiTranslator):
        with MDMaskUrisContext(document):
            chunks: list[str] = split_markdown_text(document.content.decode(), self.chunk_size)
            # 预处理：分离出需要翻译的文本块和不需要翻译的占位符块
            translate_indices: List[int] = []
            translate_chunks: List[str] = []
            final_result: List[str] = list(chunks)
            for i, chunk in enumerate(chunks):
-                if self._is_placeholder(chunk):
+                if is_placeholder(chunk):
                    continue
                else:
                    translate_indices.append(i)
@@ -116,14 +103,11 @@ class MDTranslator(AiTranslator):
            if self.translate_agent and translate_chunks:
                translated_sub_results: list[str] = await self.translate_agent.send_chunks_async(translate_chunks)
                # 将翻译结果回填到对应位置
                for idx, translated_text in zip(translate_indices, translated_sub_results):
                    final_result[idx] = translated_text
            def run():
                content = join_markdown_texts(final_result)
                # 做一些加强鲁棒性的操作
                content = content.replace(r'\（', r'\(')
                content = content.replace(r'\）', r'\)')
                document.content = content.encode()
--- a/docutranslate/utils/markdown_splitter.py
+++ b/docutranslate/utils/markdown_splitter.py
@@ -4,36 +4,34 @@ import re
 from typing import List
 def is_placeholder(text: str) -> bool:
    """
    Tell whether a text chunk consists solely of one image placeholder tag.

    Accepted form: ``<ph-abc123>`` (``ph-`` followed by one or more ASCII
    letters or digits), optionally surrounded by whitespace.
    """
    hit = re.match(r'^\s*<ph-[a-zA-Z0-9]+>\s*$', text)
    return hit is not None
 class MarkdownBlockSplitter:
    def __init__(self, max_block_size: int = 5000):
        """
        初始化Markdown分块器
        参数:
            max_block_size: 每个块的最大字节数
        """
        self.max_block_size = max_block_size
        # 匹配占位符的正则，例如 <ph-abc123>
        self.placeholder_pattern = r'(<ph-[a-zA-Z0-9]+>)'
    @staticmethod
    def _get_bytes(text: str) -> int:
        return len(text.encode('utf-8'))
    def _is_placeholder(self, text: str) -> bool:
        """判断文本是否纯粹是一个占位符"""
        return bool(re.match(r'^' + self.placeholder_pattern + r'$', text.strip()))
    def split_markdown(self, markdown_text: str) -> List[str]:
        """
        将Markdown文本分割成指定大小的块
        确保可以通过简单拼接重建原始文本（分割的代码块除外）
        尽量保持标题与其对应内容在同一个块中
        """
        # 1. 将文本分割成逻辑块
        logical_blocks = self._split_into_logical_blocks(markdown_text)
        # 2. 合并逻辑块，使其不超过 max_block_size
        chunks = []
        current_chunk_parts = []
        current_size = 0
@@ -41,27 +39,21 @@ class MarkdownBlockSplitter:
        for block in logical_blocks:
            block_size = self._get_bytes(block)
-            # 检查是否是占位符块（需要单独成块）
+            # 如果是占位符，必须单独成块，且强制切断当前累积的内容
-            if self._is_placeholder(block):
+            if is_placeholder(block):
                # 如果当前有积累的块，先输出
                if current_chunk_parts:
                    chunks.append("".join(current_chunk_parts))
                    current_chunk_parts = []
                    current_size = 0
                # 占位符单独作为一个chunk
                chunks.append(block)
                continue
            # 情况1：块本身就过大
            if block_size > self.max_block_size:
                # 先将当前积累的块输出
                if current_chunk_parts:
                    chunks.append("".join(current_chunk_parts))
                    current_chunk_parts = []
                    current_size = 0
                # 分割这个超大块并直接添加到结果中
                chunks.extend(self._split_large_block(block))
                continue
@@ -69,7 +61,6 @@ class MarkdownBlockSplitter:
            if current_size + block_size > self.max_block_size:
                if current_chunk_parts:
                    chunks.append("".join(current_chunk_parts))
                current_chunk_parts = [block]
                current_size = block_size
            # 情况3：正常添加
@@ -77,20 +68,14 @@ class MarkdownBlockSplitter:
                current_chunk_parts.append(block)
                current_size += block_size
        # 添加最后一个剩余的chunk
        if current_chunk_parts:
            chunks.append("".join(current_chunk_parts))
        return chunks
    def _split_into_logical_blocks(self, markdown_text: str) -> List[str]:
        """
        将Markdown文本分割成逻辑块（标题、段落、代码块、空行分隔符、图片占位符等）
        """
        # 标准化换行符
        text = markdown_text.replace('\r\n', '\n')
-
+        # 分割代码块
        # 分割代码块和其他内容
        code_block_pattern = r'(```[\s\S]*?```|~~~[\s\S]*?~~~)'
        parts = re.split(code_block_pattern, text)
@@ -99,39 +84,31 @@ class MarkdownBlockSplitter:
            if not part:
                continue
-            if i % 2 == 1:  # 这是一个代码块
+            # 代码块直接添加
            if i % 2 == 1:
                blocks.append(part)
-            else:  # 这是普通Markdown内容
+            else:
-                # 1. 先按占位符分割，确保占位符独立
+                # 普通文本：先切分出占位符
                ph_parts = re.split(self.placeholder_pattern, part)
                for ph_part in ph_parts:
                    if not ph_part:
                        continue
-                    if self._is_placeholder(ph_part):
+                    if is_placeholder(ph_part):
                        blocks.append(ph_part)
                    else:
-                        # 2. 对非占位符文本，按一个或多个空行分割，并保留分隔符
+                        # 再按空行切分段落
                        # 这能有效分离段落、列表、标题等，并保留它们之间的空行
                        sub_parts = re.split(r'(\n{2,})', ph_part)
                        # 过滤掉 re.split 可能产生的空字符串
                        blocks.extend([p for p in sub_parts if p])
        return blocks
    def _split_large_block(self, block: str) -> List[str]:
-        """
+        # 代码块处理
        分割单个超过 max_block_size 的大块
        """
        # 优先处理代码块
        if block.startswith(('```', '~~~')):
            fence = '```' if block.startswith('```') else '~~~'
            lines = block.split('\n')
            header = lines[0]
            footer = lines[-1]
            content_lines = lines[1:-1]
            chunks = []
            current_chunk_lines = [header]
            current_size = self._get_bytes(header) + 1
@@ -152,7 +129,7 @@ class MarkdownBlockSplitter:
                chunks.append('\n'.join(current_chunk_lines))
            return chunks
-        # 对普通大文本按行分割
+        # 普通文本处理
        lines = block.split('\n')
        chunks = []
        current_chunk = []
@@ -162,42 +139,28 @@ class MarkdownBlockSplitter:
            if current_size + line_size > self.max_block_size and current_chunk:
                chunks.append('\n'.join(current_chunk))
                current_chunk = [line]
-                current_size = line_size - 1  # -1 for the first line does not have a leading '\n'
+                current_size = line_size - 1
            else:
                current_chunk.append(line)
                current_size += line_size
        if current_chunk:
            chunks.append('\n'.join(current_chunk))
        return chunks
 def split_markdown_text(markdown_text: str, max_block_size=5000) -> List[str]:
    """
    将Markdown字符串分割成不超过max_block_size的块
    """
    splitter = MarkdownBlockSplitter(max_block_size=max_block_size)
    chunks = splitter.split_markdown(markdown_text)
-    # 过滤掉仅由空白字符组成的块，但保留占位符块
+    # 过滤空块，但保留占位符
-    return [chunk for chunk in chunks if chunk.strip() or splitter._is_placeholder(chunk)]
+    return [chunk for chunk in chunks if chunk.strip() or is_placeholder(chunk)]
 def _needs_single_newline_join(prev_chunk: str, next_chunk: str) -> bool:
-    """
+    """判断常规文本是否需要单换行连接"""
    判断两个块是否应该用单个换行符连接
    这通常发生在列表、表格、引用块的连续行之间
    """
    if not prev_chunk.strip() or not next_chunk.strip():
        return False
    # 如果其中一个是占位符，通常建议使用双换行以确保它是独立的块，
    # 除非原格式非常紧凑，但在翻译场景下，分隔开更安全。
    # 这里不额外处理占位符，走默认逻辑（最后会返回False，从而使用\n\n）
    if re.match(r'^\s*<ph-[a-zA-Z0-9]+>\s*$', prev_chunk) or \
            re.match(r'^\s*<ph-[a-zA-Z0-9]+>\s*$', next_chunk):
        return False
    last_line_prev = prev_chunk.rstrip().split('\n')[-1].lstrip()
    first_line_next = next_chunk.lstrip().split('\n')[0].lstrip()
@@ -206,7 +169,7 @@ def _needs_single_newline_join(prev_chunk: str, next_chunk: str) -> bool:
            first_line_next.startswith('|') and first_line_next.endswith('|'):
        return True
-    # 列表 (无序和有序)
+    # 列表
    list_markers = r'^\s*([-*+]|\d+\.)\s+'
    if re.match(list_markers, last_line_prev) and re.match(list_markers, first_line_next):
        return True
@@ -230,11 +193,19 @@ def join_markdown_texts(markdown_texts: List[str]) -> str:
        prev_chunk = markdown_texts[i - 1]
        current_chunk = markdown_texts[i]
-        # 判断是否应该用单换行还是双换行
+        # === 核心修复逻辑 ===
-        if _needs_single_newline_join(prev_chunk, current_chunk):
+        # 如果前一块或后一块是占位符，强制使用单换行 '\n'
        # 这样可以保证：
        # 1. 连续的徽章/图片 [img1]\n[img2] 会紧凑排列（视为行内元素）
        # 2. HTML结构 <p>\n<img>\n</p> 不会被打断
        # 3. 标题后的图片 # Title\n<img> 也能正常渲染
        if is_placeholder(prev_chunk) or is_placeholder(current_chunk):
            separator = "\n"
        elif _needs_single_newline_join(prev_chunk, current_chunk):
            separator = "\n"
        else:
-            # 默认使用双换行来分隔不同的块
+            # 只有两个纯文本段落之间才用双换行
            separator = "\n\n"
        joined_text += separator + current_chunk