使用\n拼接markdown块解决标题正文拼接问题
This commit is contained in:
@@ -15,6 +15,8 @@ class MarkdownBlockSplitter:
|
||||
def split_markdown(self, markdown_text: str) -> List[str]:
|
||||
"""
|
||||
将markdown文本拆分为指定最大大小的块。
|
||||
确保拆分后可以重新拼接回原文(除了被拆分的代码块)。
|
||||
尽量保持标题和其内容在同一块中。
|
||||
|
||||
参数:
|
||||
markdown_text: 输入的markdown文本。
|
||||
@@ -22,40 +24,72 @@ class MarkdownBlockSplitter:
|
||||
返回:
|
||||
列表形式的markdown块,每个都是一个字符串。
|
||||
"""
|
||||
# 使用更简单的方法:按Markdown块拆分
|
||||
# 这比使用AST解析更可靠
|
||||
|
||||
# 模式用于识别markdown块(标题、段落、代码块等)
|
||||
# 按Markdown块拆分
|
||||
blocks = self._split_into_logical_blocks(markdown_text)
|
||||
|
||||
# 现在合并块以遵守max_block_size
|
||||
# 现在合并块以遵守max_block_size,同时尽量保持标题和内容在一起
|
||||
result_blocks = []
|
||||
current_block = ""
|
||||
header_waiting = False # 标记是否有待处理的标题
|
||||
|
||||
for block in blocks:
|
||||
# 检查块是否为空行分隔符
|
||||
is_separator = block.strip() == "" and block.count("\n") > 0
|
||||
|
||||
# 检查是否是标题
|
||||
is_header = bool(re.match(r'^#{1,6}\s+.+', block.strip()))
|
||||
|
||||
# 如果单个块大于最大大小,则进一步拆分
|
||||
if len(block) > self.max_block_size:
|
||||
# 如果已有累积内容,先添加
|
||||
if current_block:
|
||||
result_blocks.append(current_block)
|
||||
current_block = ""
|
||||
header_waiting = False
|
||||
|
||||
# 拆分大块
|
||||
large_block_parts = self._split_large_block(block)
|
||||
result_blocks.extend(large_block_parts)
|
||||
continue
|
||||
|
||||
# 如果添加此块会超过限制,则开始新的结果块
|
||||
if len(current_block) + len(block) + 2 > self.max_block_size and current_block:
|
||||
# 确定适当的连接符
|
||||
connector = "" if is_separator or not current_block else "\n"
|
||||
|
||||
# 如果当前块是标题,且之前的块已经很大,先结束之前的块
|
||||
if is_header and len(current_block) + len(block) + len(
|
||||
connector) > self.max_block_size * 0.9 and current_block:
|
||||
result_blocks.append(current_block)
|
||||
current_block = block
|
||||
header_waiting = True
|
||||
continue
|
||||
|
||||
# 如果添加此块会超过限制,则开始新的结果块
|
||||
if len(current_block) + len(block) + len(connector) > self.max_block_size and current_block:
|
||||
# 如果当前块以标题开始,我们会尝试将整个块放入下一个块
|
||||
if header_waiting and not is_header:
|
||||
# 检查是否能添加到当前块而不超出太多
|
||||
if len(current_block) + len(block) + len(connector) <= self.max_block_size * 1.1:
|
||||
current_block += connector + block
|
||||
header_waiting = False
|
||||
continue
|
||||
|
||||
result_blocks.append(current_block)
|
||||
current_block = block
|
||||
header_waiting = is_header
|
||||
else:
|
||||
# 添加到当前块并适当换行
|
||||
# 添加到当前块并适当连接
|
||||
if current_block:
|
||||
current_block += "\n\n" + block
|
||||
current_block += connector + block
|
||||
else:
|
||||
current_block = block
|
||||
|
||||
# 如果刚添加了标题,标记等待内容
|
||||
if is_header:
|
||||
header_waiting = True
|
||||
elif header_waiting and not is_separator:
|
||||
# 如果添加了内容到标题后,不再是等待状态
|
||||
header_waiting = False
|
||||
|
||||
# 如果不为空则添加最后一个块
|
||||
if current_block:
|
||||
result_blocks.append(current_block)
|
||||
@@ -65,6 +99,7 @@ class MarkdownBlockSplitter:
|
||||
def _split_into_logical_blocks(self, markdown_text: str) -> List[str]:
|
||||
"""
|
||||
将markdown文本拆分为逻辑块(标题、段落、代码块等)
|
||||
保留原始的空行数量,确保能重新拼接回原文
|
||||
|
||||
参数:
|
||||
markdown_text: 输入markdown文本
|
||||
@@ -87,15 +122,48 @@ class MarkdownBlockSplitter:
|
||||
if i % 2 == 1:
|
||||
blocks.append(part)
|
||||
else:
|
||||
# 对于非代码块,按空行拆分
|
||||
part_blocks = re.split(r'\n\s*\n', part)
|
||||
blocks.extend([b.strip() for b in part_blocks if b.strip()])
|
||||
# 对于非代码块,保留原始文本结构
|
||||
if not part: # 跳过空字符串
|
||||
continue
|
||||
|
||||
# 识别并单独处理标题
|
||||
lines = part.split('\n')
|
||||
current_lines = []
|
||||
|
||||
for line in lines:
|
||||
# 检查是否是标题行
|
||||
if re.match(r'^#{1,6}\s+.+', line):
|
||||
# 如果有累积的内容,先添加到块
|
||||
if current_lines:
|
||||
blocks.append('\n'.join(current_lines))
|
||||
current_lines = []
|
||||
|
||||
# 将标题作为单独的块
|
||||
blocks.append(line)
|
||||
else:
|
||||
current_lines.append(line)
|
||||
|
||||
# 处理剩余的行
|
||||
if current_lines:
|
||||
# 按段落分隔符拆分剩余的内容
|
||||
remaining_content = '\n'.join(current_lines)
|
||||
parts_with_sep = re.split(r'(\n\s*\n)', remaining_content)
|
||||
|
||||
for j, p in enumerate(parts_with_sep):
|
||||
if j % 2 == 0: # 这是正文内容
|
||||
if p.strip(): # 只添加非空内容
|
||||
blocks.append(p)
|
||||
else: # 这是分隔符
|
||||
# 添加分隔符作为单独的块以保持原始格式
|
||||
if j > 0 and parts_with_sep[j - 1].strip(): # 确保前面有内容
|
||||
blocks.append(p)
|
||||
|
||||
return blocks
|
||||
|
||||
def _split_large_block(self, block: str) -> List[str]:
|
||||
"""
|
||||
拆分超过max_block_size的大块。
|
||||
只按行拆分,不按句子拆分。
|
||||
|
||||
参数:
|
||||
block: 一个大的markdown块
|
||||
@@ -144,43 +212,36 @@ class MarkdownBlockSplitter:
|
||||
result.append('\n'.join(current_part))
|
||||
|
||||
else:
|
||||
# 对于其他块,按句子或行拆分
|
||||
if '.' in block or '!' in block or '?' in block:
|
||||
# 按句子拆分
|
||||
sentences = re.split(r'(?<=[.!?])\s+', block)
|
||||
# 检查是否是标题
|
||||
is_header = bool(re.match(r'^#{1,6}\s+.+', block.strip()))
|
||||
|
||||
current_part = []
|
||||
current_size = 0
|
||||
# 对于所有非代码块,统一按行拆分
|
||||
lines = block.split('\n')
|
||||
|
||||
for sentence in sentences:
|
||||
if current_size + len(sentence) + 1 > self.max_block_size and current_part:
|
||||
result.append(' '.join(current_part))
|
||||
current_part = [sentence]
|
||||
current_size = len(sentence)
|
||||
else:
|
||||
current_part.append(sentence)
|
||||
current_size += len(sentence) + 1 # +1表示空格
|
||||
current_part = []
|
||||
current_size = 0
|
||||
|
||||
if current_part:
|
||||
result.append(' '.join(current_part))
|
||||
else:
|
||||
# 按行拆分
|
||||
lines = block.split('\n')
|
||||
for line in lines:
|
||||
line_size = len(line) + 1 # +1表示换行符
|
||||
|
||||
current_part = []
|
||||
current_size = 0
|
||||
|
||||
for line in lines:
|
||||
if current_size + len(line) + 1 > self.max_block_size and current_part:
|
||||
# 如果这是标题行,并且当前块已经很大,先结束当前块
|
||||
if re.match(r'^#{1,6}\s+.+', line) and current_size > 0:
|
||||
if current_part:
|
||||
result.append('\n'.join(current_part))
|
||||
current_part = [line]
|
||||
current_size = len(line)
|
||||
else:
|
||||
current_part.append(line)
|
||||
current_size += len(line) + 1 # +1表示换行符
|
||||
current_part = [line]
|
||||
current_size = line_size
|
||||
continue
|
||||
|
||||
if current_part:
|
||||
if current_size + line_size > self.max_block_size and current_part:
|
||||
result.append('\n'.join(current_part))
|
||||
current_part = [line]
|
||||
current_size = line_size
|
||||
else:
|
||||
current_part.append(line)
|
||||
current_size += line_size
|
||||
|
||||
if current_part:
|
||||
result.append('\n'.join(current_part))
|
||||
|
||||
return result
|
||||
|
||||
@@ -188,13 +249,22 @@ class MarkdownBlockSplitter:
|
||||
def split_markdown_text(markdown_text, max_block_size=4096):
|
||||
"""
|
||||
将markdown字符串拆分为不超过max_block_size的块。
|
||||
拆分后可以通过简单的字符串连接重新组合回原始文本(除了被拆分的代码块)。
|
||||
尽量保持标题和其内容在同一块中。
|
||||
|
||||
参数:
|
||||
markdown_text: 输入markdown文本
|
||||
max_block_size: 每个块的最大字符数
|
||||
|
||||
返回:
|
||||
markdown块列表
|
||||
markdown块列表,可以通过''.join(chunks)重新组合(如果没有代码块被拆分)
|
||||
"""
|
||||
splitter = MarkdownBlockSplitter(max_block_size=max_block_size)
|
||||
return splitter.split_markdown(markdown_text)
|
||||
return splitter.split_markdown(markdown_text)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
with open(r"C:\Users\jxgm\Desktop\FileTranslate\tests\resource\regex.md", "r") as f:
|
||||
md = f.read()
|
||||
a = split_markdown_text(md)
|
||||
pass
|
||||
Reference in New Issue
Block a user