diff --git a/docutranslate/translator/ai_translator/md_translator.py b/docutranslate/translator/ai_translator/md_translator.py index c811114..d14d44a 100644 --- a/docutranslate/translator/ai_translator/md_translator.py +++ b/docutranslate/translator/ai_translator/md_translator.py @@ -1,7 +1,6 @@ # SPDX-FileCopyrightText: 2025 QinHan # SPDX-License-Identifier: MPL-2.0 import asyncio -import re from dataclasses import dataclass from typing import Self, List @@ -10,7 +9,8 @@ from docutranslate.agents.markdown_agent import MDTranslateAgentConfig from docutranslate.context.md_mask_context import MDMaskUrisContext from docutranslate.ir.markdown_document import MarkdownDocument from docutranslate.translator.ai_translator.base import AiTranslatorConfig, AiTranslator -from docutranslate.utils.markdown_splitter import split_markdown_text, join_markdown_texts +# 引入新的 is_placeholder 函数 +from docutranslate.utils.markdown_splitter import split_markdown_text, join_markdown_texts, is_placeholder @dataclass @@ -39,32 +39,24 @@ class MDTranslator(AiTranslator): system_proxy_enable=config.system_proxy_enable) self.translate_agent = MDTranslateAgent(agent_config) - @staticmethod - def _is_placeholder(text: str) -> bool: - """检查文本块是否仅包含图片占位符""" - # 匹配 格式,允许前后有空白 - return bool(re.match(r'^\s*\s*$', text)) - def translate(self, document: MarkdownDocument) -> Self: self.logger.info("正在翻译markdown") with MDMaskUrisContext(document): chunks: list[str] = split_markdown_text(document.content.decode(), self.chunk_size) - # 预处理:分离出需要翻译的文本块和不需要翻译的占位符块 translate_indices: List[int] = [] translate_chunks: List[str] = [] - final_result: List[str] = list(chunks) # 浅拷贝,预填充原始值 + final_result: List[str] = list(chunks) for i, chunk in enumerate(chunks): - if self._is_placeholder(chunk): - # 如果是占位符,不需要处理,final_result中该位置保持原样 + # 直接使用 splitter 中定义的函数 + if is_placeholder(chunk): continue else: translate_indices.append(i) translate_chunks.append(chunk) if self.glossary_agent and translate_chunks: - # 仅对需要翻译的文本提取术语 self.glossary_dict_gen = self.glossary_agent.send_segments(translate_chunks, self.chunk_size) if self.translate_agent: self.translate_agent.update_glossary_dict(self.glossary_dict_gen) @@ -73,13 +65,9 @@ class MDTranslator(AiTranslator): if self.translate_agent and translate_chunks: translated_sub_results: list[str] = self.translate_agent.send_chunks(translate_chunks) - - # 将翻译结果回填到对应位置 for idx, translated_text in zip(translate_indices, translated_sub_results): final_result[idx] = translated_text - # 如果没有翻译代理或者没有需要翻译的块,final_result 已经包含了正确的内容(原始chunks) - content = join_markdown_texts(final_result) # 做一些加强鲁棒性的操作 content = content.replace(r'\(', r'\(') @@ -94,13 +82,12 @@ class MDTranslator(AiTranslator): with MDMaskUrisContext(document): chunks: list[str] = split_markdown_text(document.content.decode(), self.chunk_size) - # 预处理:分离出需要翻译的文本块和不需要翻译的占位符块 translate_indices: List[int] = [] translate_chunks: List[str] = [] final_result: List[str] = list(chunks) for i, chunk in enumerate(chunks): - if self._is_placeholder(chunk): + if is_placeholder(chunk): continue else: translate_indices.append(i) @@ -116,14 +103,11 @@ class MDTranslator(AiTranslator): if self.translate_agent and translate_chunks: translated_sub_results: list[str] = await self.translate_agent.send_chunks_async(translate_chunks) - - # 将翻译结果回填到对应位置 for idx, translated_text in zip(translate_indices, translated_sub_results): final_result[idx] = translated_text def run(): content = join_markdown_texts(final_result) - # 做一些加强鲁棒性的操作 content = content.replace(r'\(', r'\(') content = content.replace(r'\)', r'\)') document.content = content.encode() diff --git a/docutranslate/utils/markdown_splitter.py b/docutranslate/utils/markdown_splitter.py index dc3e194..f53dc56 100644 --- a/docutranslate/utils/markdown_splitter.py +++ b/docutranslate/utils/markdown_splitter.py @@ -4,36 +4,34 @@ import re from typing import List +def is_placeholder(text: str) -> bool: + """ + 判断文本块是否仅包含图片占位符 + 匹配格式: (允许前后空白) + """ + return bool(re.match(r'^\s*\s*$', text)) + + class MarkdownBlockSplitter: def __init__(self, max_block_size: int = 5000): """ 初始化Markdown分块器 - 参数: max_block_size: 每个块的最大字节数 """ self.max_block_size = max_block_size - # 匹配占位符的正则,例如 self.placeholder_pattern = r'()' @staticmethod def _get_bytes(text: str) -> int: return len(text.encode('utf-8')) - def _is_placeholder(self, text: str) -> bool: - """判断文本是否纯粹是一个占位符""" - return bool(re.match(r'^' + self.placeholder_pattern + r'$', text.strip())) - def split_markdown(self, markdown_text: str) -> List[str]: """ 将Markdown文本分割成指定大小的块 - 确保可以通过简单拼接重建原始文本(分割的代码块除外) - 尽量保持标题与其对应内容在同一个块中 """ - # 1. 将文本分割成逻辑块 logical_blocks = self._split_into_logical_blocks(markdown_text) - # 2. 合并逻辑块,使其不超过 max_block_size chunks = [] current_chunk_parts = [] current_size = 0 @@ -41,27 +39,21 @@ class MarkdownBlockSplitter: for block in logical_blocks: block_size = self._get_bytes(block) - # 检查是否是占位符块(需要单独成块) - if self._is_placeholder(block): - # 如果当前有积累的块,先输出 + # 如果是占位符,必须单独成块,且强制切断当前累积的内容 + if is_placeholder(block): if current_chunk_parts: chunks.append("".join(current_chunk_parts)) current_chunk_parts = [] current_size = 0 - - # 占位符单独作为一个chunk chunks.append(block) continue # 情况1:块本身就过大 if block_size > self.max_block_size: - # 先将当前积累的块输出 if current_chunk_parts: chunks.append("".join(current_chunk_parts)) current_chunk_parts = [] current_size = 0 - - # 分割这个超大块并直接添加到结果中 chunks.extend(self._split_large_block(block)) continue @@ -69,7 +61,6 @@ class MarkdownBlockSplitter: if current_size + block_size > self.max_block_size: if current_chunk_parts: chunks.append("".join(current_chunk_parts)) - current_chunk_parts = [block] current_size = block_size # 情况3:正常添加 @@ -77,20 +68,14 @@ class MarkdownBlockSplitter: current_chunk_parts.append(block) current_size += block_size - # 添加最后一个剩余的chunk if current_chunk_parts: chunks.append("".join(current_chunk_parts)) return chunks def _split_into_logical_blocks(self, markdown_text: str) -> List[str]: - """ - 将Markdown文本分割成逻辑块(标题、段落、代码块、空行分隔符、图片占位符等) - """ - # 标准化换行符 text = markdown_text.replace('\r\n', '\n') - - # 分割代码块和其他内容 + # 分割代码块 code_block_pattern = r'(```[\s\S]*?```|~~~[\s\S]*?~~~)' parts = re.split(code_block_pattern, text) @@ -99,39 +84,31 @@ class MarkdownBlockSplitter: if not part: continue - if i % 2 == 1: # 这是一个代码块 + # 代码块直接添加 + if i % 2 == 1: blocks.append(part) - else: # 这是普通Markdown内容 - # 1. 先按占位符分割,确保占位符独立 + else: + # 普通文本:先切分出占位符 ph_parts = re.split(self.placeholder_pattern, part) - for ph_part in ph_parts: if not ph_part: continue - if self._is_placeholder(ph_part): + if is_placeholder(ph_part): blocks.append(ph_part) else: - # 2. 对非占位符文本,按一个或多个空行分割,并保留分隔符 - # 这能有效分离段落、列表、标题等,并保留它们之间的空行 + # 再按空行切分段落 sub_parts = re.split(r'(\n{2,})', ph_part) - # 过滤掉 re.split 可能产生的空字符串 blocks.extend([p for p in sub_parts if p]) - return blocks def _split_large_block(self, block: str) -> List[str]: - """ - 分割单个超过 max_block_size 的大块 - """ - # 优先处理代码块 + # 代码块处理 if block.startswith(('```', '~~~')): - fence = '```' if block.startswith('```') else '~~~' lines = block.split('\n') header = lines[0] footer = lines[-1] content_lines = lines[1:-1] - chunks = [] current_chunk_lines = [header] current_size = self._get_bytes(header) + 1 @@ -152,7 +129,7 @@ class MarkdownBlockSplitter: chunks.append('\n'.join(current_chunk_lines)) return chunks - # 对普通大文本按行分割 + # 普通文本处理 lines = block.split('\n') chunks = [] current_chunk = [] @@ -162,42 +139,28 @@ class MarkdownBlockSplitter: if current_size + line_size > self.max_block_size and current_chunk: chunks.append('\n'.join(current_chunk)) current_chunk = [line] - current_size = line_size - 1 # -1 for the first line does not have a leading '\n' + current_size = line_size - 1 else: current_chunk.append(line) current_size += line_size if current_chunk: chunks.append('\n'.join(current_chunk)) - return chunks def split_markdown_text(markdown_text: str, max_block_size=5000) -> List[str]: - """ - 将Markdown字符串分割成不超过max_block_size的块 - """ splitter = MarkdownBlockSplitter(max_block_size=max_block_size) chunks = splitter.split_markdown(markdown_text) - # 过滤掉仅由空白字符组成的块,但保留占位符块 - return [chunk for chunk in chunks if chunk.strip() or splitter._is_placeholder(chunk)] + # 过滤空块,但保留占位符 + return [chunk for chunk in chunks if chunk.strip() or is_placeholder(chunk)] def _needs_single_newline_join(prev_chunk: str, next_chunk: str) -> bool: - """ - 判断两个块是否应该用单个换行符连接 - 这通常发生在列表、表格、引用块的连续行之间 - """ + """判断常规文本是否需要单换行连接""" if not prev_chunk.strip() or not next_chunk.strip(): return False - # 如果其中一个是占位符,通常建议使用双换行以确保它是独立的块, - # 除非原格式非常紧凑,但在翻译场景下,分隔开更安全。 - # 这里不额外处理占位符,走默认逻辑(最后会返回False,从而使用\n\n) - if re.match(r'^\s*\s*$', prev_chunk) or \ - re.match(r'^\s*\s*$', next_chunk): - return False - last_line_prev = prev_chunk.rstrip().split('\n')[-1].lstrip() first_line_next = next_chunk.lstrip().split('\n')[0].lstrip() @@ -206,7 +169,7 @@ def _needs_single_newline_join(prev_chunk: str, next_chunk: str) -> bool: first_line_next.startswith('|') and first_line_next.endswith('|'): return True - # 列表 (无序和有序) + # 列表 list_markers = r'^\s*([-*+]|\d+\.)\s+' if re.match(list_markers, last_line_prev) and re.match(list_markers, first_line_next): return True @@ -230,13 +193,21 @@ def join_markdown_texts(markdown_texts: List[str]) -> str: prev_chunk = markdown_texts[i - 1] current_chunk = markdown_texts[i] - # 判断是否应该用单换行还是双换行 - if _needs_single_newline_join(prev_chunk, current_chunk): + # === 核心修复逻辑 === + # 如果前一块或后一块是占位符,强制使用单换行 '\n' + # 这样可以保证: + # 1. 连续的徽章/图片 [img1]\n[img2] 会紧凑排列(视为行内元素) + # 2. HTML结构

\n\n

不会被打断 + # 3. 标题后的图片 # Title\n 也能正常渲染 + if is_placeholder(prev_chunk) or is_placeholder(current_chunk): + separator = "\n" + + elif _needs_single_newline_join(prev_chunk, current_chunk): separator = "\n" else: - # 默认使用双换行来分隔不同的块 + # 只有两个纯文本段落之间才用双换行 separator = "\n\n" joined_text += separator + current_chunk - return joined_text + return joined_text \ No newline at end of file