修复md分隔符问题
This commit is contained in:
@@ -1,7 +1,6 @@
|
|||||||
# SPDX-FileCopyrightText: 2025 QinHan
|
# SPDX-FileCopyrightText: 2025 QinHan
|
||||||
# SPDX-License-Identifier: MPL-2.0
|
# SPDX-License-Identifier: MPL-2.0
|
||||||
import asyncio
|
import asyncio
|
||||||
import re
|
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import Self, List
|
from typing import Self, List
|
||||||
|
|
||||||
@@ -10,7 +9,8 @@ from docutranslate.agents.markdown_agent import MDTranslateAgentConfig
|
|||||||
from docutranslate.context.md_mask_context import MDMaskUrisContext
|
from docutranslate.context.md_mask_context import MDMaskUrisContext
|
||||||
from docutranslate.ir.markdown_document import MarkdownDocument
|
from docutranslate.ir.markdown_document import MarkdownDocument
|
||||||
from docutranslate.translator.ai_translator.base import AiTranslatorConfig, AiTranslator
|
from docutranslate.translator.ai_translator.base import AiTranslatorConfig, AiTranslator
|
||||||
from docutranslate.utils.markdown_splitter import split_markdown_text, join_markdown_texts
|
# 引入新的 is_placeholder 函数
|
||||||
|
from docutranslate.utils.markdown_splitter import split_markdown_text, join_markdown_texts, is_placeholder
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -39,32 +39,24 @@ class MDTranslator(AiTranslator):
|
|||||||
system_proxy_enable=config.system_proxy_enable)
|
system_proxy_enable=config.system_proxy_enable)
|
||||||
self.translate_agent = MDTranslateAgent(agent_config)
|
self.translate_agent = MDTranslateAgent(agent_config)
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _is_placeholder(text: str) -> bool:
|
|
||||||
"""检查文本块是否仅包含图片占位符"""
|
|
||||||
# 匹配 <ph-xxxxxx> 格式,允许前后有空白
|
|
||||||
return bool(re.match(r'^\s*<ph-[a-zA-Z0-9]+>\s*$', text))
|
|
||||||
|
|
||||||
def translate(self, document: MarkdownDocument) -> Self:
|
def translate(self, document: MarkdownDocument) -> Self:
|
||||||
self.logger.info("正在翻译markdown")
|
self.logger.info("正在翻译markdown")
|
||||||
with MDMaskUrisContext(document):
|
with MDMaskUrisContext(document):
|
||||||
chunks: list[str] = split_markdown_text(document.content.decode(), self.chunk_size)
|
chunks: list[str] = split_markdown_text(document.content.decode(), self.chunk_size)
|
||||||
|
|
||||||
# 预处理:分离出需要翻译的文本块和不需要翻译的占位符块
|
|
||||||
translate_indices: List[int] = []
|
translate_indices: List[int] = []
|
||||||
translate_chunks: List[str] = []
|
translate_chunks: List[str] = []
|
||||||
final_result: List[str] = list(chunks) # 浅拷贝,预填充原始值
|
final_result: List[str] = list(chunks)
|
||||||
|
|
||||||
for i, chunk in enumerate(chunks):
|
for i, chunk in enumerate(chunks):
|
||||||
if self._is_placeholder(chunk):
|
# 直接使用 splitter 中定义的函数
|
||||||
# 如果是占位符,不需要处理,final_result中该位置保持原样
|
if is_placeholder(chunk):
|
||||||
continue
|
continue
|
||||||
else:
|
else:
|
||||||
translate_indices.append(i)
|
translate_indices.append(i)
|
||||||
translate_chunks.append(chunk)
|
translate_chunks.append(chunk)
|
||||||
|
|
||||||
if self.glossary_agent and translate_chunks:
|
if self.glossary_agent and translate_chunks:
|
||||||
# 仅对需要翻译的文本提取术语
|
|
||||||
self.glossary_dict_gen = self.glossary_agent.send_segments(translate_chunks, self.chunk_size)
|
self.glossary_dict_gen = self.glossary_agent.send_segments(translate_chunks, self.chunk_size)
|
||||||
if self.translate_agent:
|
if self.translate_agent:
|
||||||
self.translate_agent.update_glossary_dict(self.glossary_dict_gen)
|
self.translate_agent.update_glossary_dict(self.glossary_dict_gen)
|
||||||
@@ -73,13 +65,9 @@ class MDTranslator(AiTranslator):
|
|||||||
|
|
||||||
if self.translate_agent and translate_chunks:
|
if self.translate_agent and translate_chunks:
|
||||||
translated_sub_results: list[str] = self.translate_agent.send_chunks(translate_chunks)
|
translated_sub_results: list[str] = self.translate_agent.send_chunks(translate_chunks)
|
||||||
|
|
||||||
# 将翻译结果回填到对应位置
|
|
||||||
for idx, translated_text in zip(translate_indices, translated_sub_results):
|
for idx, translated_text in zip(translate_indices, translated_sub_results):
|
||||||
final_result[idx] = translated_text
|
final_result[idx] = translated_text
|
||||||
|
|
||||||
# 如果没有翻译代理或者没有需要翻译的块,final_result 已经包含了正确的内容(原始chunks)
|
|
||||||
|
|
||||||
content = join_markdown_texts(final_result)
|
content = join_markdown_texts(final_result)
|
||||||
# 做一些加强鲁棒性的操作
|
# 做一些加强鲁棒性的操作
|
||||||
content = content.replace(r'\(', r'\(')
|
content = content.replace(r'\(', r'\(')
|
||||||
@@ -94,13 +82,12 @@ class MDTranslator(AiTranslator):
|
|||||||
with MDMaskUrisContext(document):
|
with MDMaskUrisContext(document):
|
||||||
chunks: list[str] = split_markdown_text(document.content.decode(), self.chunk_size)
|
chunks: list[str] = split_markdown_text(document.content.decode(), self.chunk_size)
|
||||||
|
|
||||||
# 预处理:分离出需要翻译的文本块和不需要翻译的占位符块
|
|
||||||
translate_indices: List[int] = []
|
translate_indices: List[int] = []
|
||||||
translate_chunks: List[str] = []
|
translate_chunks: List[str] = []
|
||||||
final_result: List[str] = list(chunks)
|
final_result: List[str] = list(chunks)
|
||||||
|
|
||||||
for i, chunk in enumerate(chunks):
|
for i, chunk in enumerate(chunks):
|
||||||
if self._is_placeholder(chunk):
|
if is_placeholder(chunk):
|
||||||
continue
|
continue
|
||||||
else:
|
else:
|
||||||
translate_indices.append(i)
|
translate_indices.append(i)
|
||||||
@@ -116,14 +103,11 @@ class MDTranslator(AiTranslator):
|
|||||||
|
|
||||||
if self.translate_agent and translate_chunks:
|
if self.translate_agent and translate_chunks:
|
||||||
translated_sub_results: list[str] = await self.translate_agent.send_chunks_async(translate_chunks)
|
translated_sub_results: list[str] = await self.translate_agent.send_chunks_async(translate_chunks)
|
||||||
|
|
||||||
# 将翻译结果回填到对应位置
|
|
||||||
for idx, translated_text in zip(translate_indices, translated_sub_results):
|
for idx, translated_text in zip(translate_indices, translated_sub_results):
|
||||||
final_result[idx] = translated_text
|
final_result[idx] = translated_text
|
||||||
|
|
||||||
def run():
|
def run():
|
||||||
content = join_markdown_texts(final_result)
|
content = join_markdown_texts(final_result)
|
||||||
# 做一些加强鲁棒性的操作
|
|
||||||
content = content.replace(r'\(', r'\(')
|
content = content.replace(r'\(', r'\(')
|
||||||
content = content.replace(r'\)', r'\)')
|
content = content.replace(r'\)', r'\)')
|
||||||
document.content = content.encode()
|
document.content = content.encode()
|
||||||
|
|||||||
@@ -4,36 +4,34 @@ import re
|
|||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
|
|
||||||
|
def is_placeholder(text: str) -> bool:
|
||||||
|
"""
|
||||||
|
判断文本块是否仅包含图片占位符
|
||||||
|
匹配格式: <ph-abc123> (允许前后空白)
|
||||||
|
"""
|
||||||
|
return bool(re.match(r'^\s*<ph-[a-zA-Z0-9]+>\s*$', text))
|
||||||
|
|
||||||
|
|
||||||
class MarkdownBlockSplitter:
|
class MarkdownBlockSplitter:
|
||||||
def __init__(self, max_block_size: int = 5000):
|
def __init__(self, max_block_size: int = 5000):
|
||||||
"""
|
"""
|
||||||
初始化Markdown分块器
|
初始化Markdown分块器
|
||||||
|
|
||||||
参数:
|
参数:
|
||||||
max_block_size: 每个块的最大字节数
|
max_block_size: 每个块的最大字节数
|
||||||
"""
|
"""
|
||||||
self.max_block_size = max_block_size
|
self.max_block_size = max_block_size
|
||||||
# 匹配占位符的正则,例如 <ph-abc123>
|
|
||||||
self.placeholder_pattern = r'(<ph-[a-zA-Z0-9]+>)'
|
self.placeholder_pattern = r'(<ph-[a-zA-Z0-9]+>)'
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _get_bytes(text: str) -> int:
|
def _get_bytes(text: str) -> int:
|
||||||
return len(text.encode('utf-8'))
|
return len(text.encode('utf-8'))
|
||||||
|
|
||||||
def _is_placeholder(self, text: str) -> bool:
|
|
||||||
"""判断文本是否纯粹是一个占位符"""
|
|
||||||
return bool(re.match(r'^' + self.placeholder_pattern + r'$', text.strip()))
|
|
||||||
|
|
||||||
def split_markdown(self, markdown_text: str) -> List[str]:
|
def split_markdown(self, markdown_text: str) -> List[str]:
|
||||||
"""
|
"""
|
||||||
将Markdown文本分割成指定大小的块
|
将Markdown文本分割成指定大小的块
|
||||||
确保可以通过简单拼接重建原始文本(分割的代码块除外)
|
|
||||||
尽量保持标题与其对应内容在同一个块中
|
|
||||||
"""
|
"""
|
||||||
# 1. 将文本分割成逻辑块
|
|
||||||
logical_blocks = self._split_into_logical_blocks(markdown_text)
|
logical_blocks = self._split_into_logical_blocks(markdown_text)
|
||||||
|
|
||||||
# 2. 合并逻辑块,使其不超过 max_block_size
|
|
||||||
chunks = []
|
chunks = []
|
||||||
current_chunk_parts = []
|
current_chunk_parts = []
|
||||||
current_size = 0
|
current_size = 0
|
||||||
@@ -41,27 +39,21 @@ class MarkdownBlockSplitter:
|
|||||||
for block in logical_blocks:
|
for block in logical_blocks:
|
||||||
block_size = self._get_bytes(block)
|
block_size = self._get_bytes(block)
|
||||||
|
|
||||||
# 检查是否是占位符块(需要单独成块)
|
# 如果是占位符,必须单独成块,且强制切断当前累积的内容
|
||||||
if self._is_placeholder(block):
|
if is_placeholder(block):
|
||||||
# 如果当前有积累的块,先输出
|
|
||||||
if current_chunk_parts:
|
if current_chunk_parts:
|
||||||
chunks.append("".join(current_chunk_parts))
|
chunks.append("".join(current_chunk_parts))
|
||||||
current_chunk_parts = []
|
current_chunk_parts = []
|
||||||
current_size = 0
|
current_size = 0
|
||||||
|
|
||||||
# 占位符单独作为一个chunk
|
|
||||||
chunks.append(block)
|
chunks.append(block)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# 情况1:块本身就过大
|
# 情况1:块本身就过大
|
||||||
if block_size > self.max_block_size:
|
if block_size > self.max_block_size:
|
||||||
# 先将当前积累的块输出
|
|
||||||
if current_chunk_parts:
|
if current_chunk_parts:
|
||||||
chunks.append("".join(current_chunk_parts))
|
chunks.append("".join(current_chunk_parts))
|
||||||
current_chunk_parts = []
|
current_chunk_parts = []
|
||||||
current_size = 0
|
current_size = 0
|
||||||
|
|
||||||
# 分割这个超大块并直接添加到结果中
|
|
||||||
chunks.extend(self._split_large_block(block))
|
chunks.extend(self._split_large_block(block))
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@@ -69,7 +61,6 @@ class MarkdownBlockSplitter:
|
|||||||
if current_size + block_size > self.max_block_size:
|
if current_size + block_size > self.max_block_size:
|
||||||
if current_chunk_parts:
|
if current_chunk_parts:
|
||||||
chunks.append("".join(current_chunk_parts))
|
chunks.append("".join(current_chunk_parts))
|
||||||
|
|
||||||
current_chunk_parts = [block]
|
current_chunk_parts = [block]
|
||||||
current_size = block_size
|
current_size = block_size
|
||||||
# 情况3:正常添加
|
# 情况3:正常添加
|
||||||
@@ -77,20 +68,14 @@ class MarkdownBlockSplitter:
|
|||||||
current_chunk_parts.append(block)
|
current_chunk_parts.append(block)
|
||||||
current_size += block_size
|
current_size += block_size
|
||||||
|
|
||||||
# 添加最后一个剩余的chunk
|
|
||||||
if current_chunk_parts:
|
if current_chunk_parts:
|
||||||
chunks.append("".join(current_chunk_parts))
|
chunks.append("".join(current_chunk_parts))
|
||||||
|
|
||||||
return chunks
|
return chunks
|
||||||
|
|
||||||
def _split_into_logical_blocks(self, markdown_text: str) -> List[str]:
|
def _split_into_logical_blocks(self, markdown_text: str) -> List[str]:
|
||||||
"""
|
|
||||||
将Markdown文本分割成逻辑块(标题、段落、代码块、空行分隔符、图片占位符等)
|
|
||||||
"""
|
|
||||||
# 标准化换行符
|
|
||||||
text = markdown_text.replace('\r\n', '\n')
|
text = markdown_text.replace('\r\n', '\n')
|
||||||
|
# 分割代码块
|
||||||
# 分割代码块和其他内容
|
|
||||||
code_block_pattern = r'(```[\s\S]*?```|~~~[\s\S]*?~~~)'
|
code_block_pattern = r'(```[\s\S]*?```|~~~[\s\S]*?~~~)'
|
||||||
parts = re.split(code_block_pattern, text)
|
parts = re.split(code_block_pattern, text)
|
||||||
|
|
||||||
@@ -99,39 +84,31 @@ class MarkdownBlockSplitter:
|
|||||||
if not part:
|
if not part:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if i % 2 == 1: # 这是一个代码块
|
# 代码块直接添加
|
||||||
|
if i % 2 == 1:
|
||||||
blocks.append(part)
|
blocks.append(part)
|
||||||
else: # 这是普通Markdown内容
|
else:
|
||||||
# 1. 先按占位符分割,确保占位符独立
|
# 普通文本:先切分出占位符
|
||||||
ph_parts = re.split(self.placeholder_pattern, part)
|
ph_parts = re.split(self.placeholder_pattern, part)
|
||||||
|
|
||||||
for ph_part in ph_parts:
|
for ph_part in ph_parts:
|
||||||
if not ph_part:
|
if not ph_part:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if self._is_placeholder(ph_part):
|
if is_placeholder(ph_part):
|
||||||
blocks.append(ph_part)
|
blocks.append(ph_part)
|
||||||
else:
|
else:
|
||||||
# 2. 对非占位符文本,按一个或多个空行分割,并保留分隔符
|
# 再按空行切分段落
|
||||||
# 这能有效分离段落、列表、标题等,并保留它们之间的空行
|
|
||||||
sub_parts = re.split(r'(\n{2,})', ph_part)
|
sub_parts = re.split(r'(\n{2,})', ph_part)
|
||||||
# 过滤掉 re.split 可能产生的空字符串
|
|
||||||
blocks.extend([p for p in sub_parts if p])
|
blocks.extend([p for p in sub_parts if p])
|
||||||
|
|
||||||
return blocks
|
return blocks
|
||||||
|
|
||||||
def _split_large_block(self, block: str) -> List[str]:
|
def _split_large_block(self, block: str) -> List[str]:
|
||||||
"""
|
# 代码块处理
|
||||||
分割单个超过 max_block_size 的大块
|
|
||||||
"""
|
|
||||||
# 优先处理代码块
|
|
||||||
if block.startswith(('```', '~~~')):
|
if block.startswith(('```', '~~~')):
|
||||||
fence = '```' if block.startswith('```') else '~~~'
|
|
||||||
lines = block.split('\n')
|
lines = block.split('\n')
|
||||||
header = lines[0]
|
header = lines[0]
|
||||||
footer = lines[-1]
|
footer = lines[-1]
|
||||||
content_lines = lines[1:-1]
|
content_lines = lines[1:-1]
|
||||||
|
|
||||||
chunks = []
|
chunks = []
|
||||||
current_chunk_lines = [header]
|
current_chunk_lines = [header]
|
||||||
current_size = self._get_bytes(header) + 1
|
current_size = self._get_bytes(header) + 1
|
||||||
@@ -152,7 +129,7 @@ class MarkdownBlockSplitter:
|
|||||||
chunks.append('\n'.join(current_chunk_lines))
|
chunks.append('\n'.join(current_chunk_lines))
|
||||||
return chunks
|
return chunks
|
||||||
|
|
||||||
# 对普通大文本按行分割
|
# 普通文本处理
|
||||||
lines = block.split('\n')
|
lines = block.split('\n')
|
||||||
chunks = []
|
chunks = []
|
||||||
current_chunk = []
|
current_chunk = []
|
||||||
@@ -162,42 +139,28 @@ class MarkdownBlockSplitter:
|
|||||||
if current_size + line_size > self.max_block_size and current_chunk:
|
if current_size + line_size > self.max_block_size and current_chunk:
|
||||||
chunks.append('\n'.join(current_chunk))
|
chunks.append('\n'.join(current_chunk))
|
||||||
current_chunk = [line]
|
current_chunk = [line]
|
||||||
current_size = line_size - 1 # -1 for the first line does not have a leading '\n'
|
current_size = line_size - 1
|
||||||
else:
|
else:
|
||||||
current_chunk.append(line)
|
current_chunk.append(line)
|
||||||
current_size += line_size
|
current_size += line_size
|
||||||
|
|
||||||
if current_chunk:
|
if current_chunk:
|
||||||
chunks.append('\n'.join(current_chunk))
|
chunks.append('\n'.join(current_chunk))
|
||||||
|
|
||||||
return chunks
|
return chunks
|
||||||
|
|
||||||
|
|
||||||
def split_markdown_text(markdown_text: str, max_block_size=5000) -> List[str]:
|
def split_markdown_text(markdown_text: str, max_block_size=5000) -> List[str]:
|
||||||
"""
|
|
||||||
将Markdown字符串分割成不超过max_block_size的块
|
|
||||||
"""
|
|
||||||
splitter = MarkdownBlockSplitter(max_block_size=max_block_size)
|
splitter = MarkdownBlockSplitter(max_block_size=max_block_size)
|
||||||
chunks = splitter.split_markdown(markdown_text)
|
chunks = splitter.split_markdown(markdown_text)
|
||||||
# 过滤掉仅由空白字符组成的块,但保留占位符块
|
# 过滤空块,但保留占位符
|
||||||
return [chunk for chunk in chunks if chunk.strip() or splitter._is_placeholder(chunk)]
|
return [chunk for chunk in chunks if chunk.strip() or is_placeholder(chunk)]
|
||||||
|
|
||||||
|
|
||||||
def _needs_single_newline_join(prev_chunk: str, next_chunk: str) -> bool:
|
def _needs_single_newline_join(prev_chunk: str, next_chunk: str) -> bool:
|
||||||
"""
|
"""判断常规文本是否需要单换行连接"""
|
||||||
判断两个块是否应该用单个换行符连接
|
|
||||||
这通常发生在列表、表格、引用块的连续行之间
|
|
||||||
"""
|
|
||||||
if not prev_chunk.strip() or not next_chunk.strip():
|
if not prev_chunk.strip() or not next_chunk.strip():
|
||||||
return False
|
return False
|
||||||
|
|
||||||
# 如果其中一个是占位符,通常建议使用双换行以确保它是独立的块,
|
|
||||||
# 除非原格式非常紧凑,但在翻译场景下,分隔开更安全。
|
|
||||||
# 这里不额外处理占位符,走默认逻辑(最后会返回False,从而使用\n\n)
|
|
||||||
if re.match(r'^\s*<ph-[a-zA-Z0-9]+>\s*$', prev_chunk) or \
|
|
||||||
re.match(r'^\s*<ph-[a-zA-Z0-9]+>\s*$', next_chunk):
|
|
||||||
return False
|
|
||||||
|
|
||||||
last_line_prev = prev_chunk.rstrip().split('\n')[-1].lstrip()
|
last_line_prev = prev_chunk.rstrip().split('\n')[-1].lstrip()
|
||||||
first_line_next = next_chunk.lstrip().split('\n')[0].lstrip()
|
first_line_next = next_chunk.lstrip().split('\n')[0].lstrip()
|
||||||
|
|
||||||
@@ -206,7 +169,7 @@ def _needs_single_newline_join(prev_chunk: str, next_chunk: str) -> bool:
|
|||||||
first_line_next.startswith('|') and first_line_next.endswith('|'):
|
first_line_next.startswith('|') and first_line_next.endswith('|'):
|
||||||
return True
|
return True
|
||||||
|
|
||||||
# 列表 (无序和有序)
|
# 列表
|
||||||
list_markers = r'^\s*([-*+]|\d+\.)\s+'
|
list_markers = r'^\s*([-*+]|\d+\.)\s+'
|
||||||
if re.match(list_markers, last_line_prev) and re.match(list_markers, first_line_next):
|
if re.match(list_markers, last_line_prev) and re.match(list_markers, first_line_next):
|
||||||
return True
|
return True
|
||||||
@@ -230,11 +193,19 @@ def join_markdown_texts(markdown_texts: List[str]) -> str:
|
|||||||
prev_chunk = markdown_texts[i - 1]
|
prev_chunk = markdown_texts[i - 1]
|
||||||
current_chunk = markdown_texts[i]
|
current_chunk = markdown_texts[i]
|
||||||
|
|
||||||
# 判断是否应该用单换行还是双换行
|
# === 核心修复逻辑 ===
|
||||||
if _needs_single_newline_join(prev_chunk, current_chunk):
|
# 如果前一块或后一块是占位符,强制使用单换行 '\n'
|
||||||
|
# 这样可以保证:
|
||||||
|
# 1. 连续的徽章/图片 [img1]\n[img2] 会紧凑排列(视为行内元素)
|
||||||
|
# 2. HTML结构 <p>\n<img>\n</p> 不会被打断
|
||||||
|
# 3. 标题后的图片 # Title\n<img> 也能正常渲染
|
||||||
|
if is_placeholder(prev_chunk) or is_placeholder(current_chunk):
|
||||||
|
separator = "\n"
|
||||||
|
|
||||||
|
elif _needs_single_newline_join(prev_chunk, current_chunk):
|
||||||
separator = "\n"
|
separator = "\n"
|
||||||
else:
|
else:
|
||||||
# 默认使用双换行来分隔不同的块
|
# 只有两个纯文本段落之间才用双换行
|
||||||
separator = "\n\n"
|
separator = "\n\n"
|
||||||
|
|
||||||
joined_text += separator + current_chunk
|
joined_text += separator + current_chunk
|
||||||
|
|||||||
Reference in New Issue
Block a user