修复图片布局不正确的问题
This commit is contained in:
@@ -19,7 +19,6 @@ def generate_prompt(markdown_text: str, to_lang: str):
|
|||||||
return f"""
|
return f"""
|
||||||
Treat the text input as markdown text and translate it into {to_lang},output translation ONLY.
|
Treat the text input as markdown text and translate it into {to_lang},output translation ONLY.
|
||||||
- NO explanations. NO notes.
|
- NO explanations. NO notes.
|
||||||
- (very important) Preserve all placeholders in the format <ph-abcdef> (example: <ph-1>, <ph-af12asd>).
|
|
||||||
- For special tags or other non-translatable elements (like codes, brand names, specific jargon), keep them in their original form.
|
- For special tags or other non-translatable elements (like codes, brand names, specific jargon), keep them in their original form.
|
||||||
- All formulas, regardless of length, must be represented as valid, parsable LaTeX. They must be correctly enclosed by `$`, `\\(\\)`, or `$$`. If a formula is not formatted correctly, you must fix it.
|
- All formulas, regardless of length, must be represented as valid, parsable LaTeX. They must be correctly enclosed by `$`, `\\(\\)`, or `$$`. If a formula is not formatted correctly, you must fix it.
|
||||||
- Remove or correct any obviously abnormal characters, but without altering the original meaning.
|
- Remove or correct any obviously abnormal characters, but without altering the original meaning.
|
||||||
|
|||||||
@@ -9,8 +9,12 @@ from docutranslate.agents.markdown_agent import MDTranslateAgentConfig
|
|||||||
from docutranslate.context.md_mask_context import MDMaskUrisContext
|
from docutranslate.context.md_mask_context import MDMaskUrisContext
|
||||||
from docutranslate.ir.markdown_document import MarkdownDocument
|
from docutranslate.ir.markdown_document import MarkdownDocument
|
||||||
from docutranslate.translator.ai_translator.base import AiTranslatorConfig, AiTranslator
|
from docutranslate.translator.ai_translator.base import AiTranslatorConfig, AiTranslator
|
||||||
# 引入新的 is_placeholder 函数
|
# 引入新的布局分割和拼接函数
|
||||||
from docutranslate.utils.markdown_splitter import split_markdown_text, join_markdown_texts, is_placeholder
|
from docutranslate.utils.markdown_splitter import (
|
||||||
|
split_markdown_with_layout,
|
||||||
|
join_markdown_with_layout,
|
||||||
|
is_placeholder
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -42,14 +46,15 @@ class MDTranslator(AiTranslator):
|
|||||||
def translate(self, document: MarkdownDocument) -> Self:
|
def translate(self, document: MarkdownDocument) -> Self:
|
||||||
self.logger.info("正在翻译markdown")
|
self.logger.info("正在翻译markdown")
|
||||||
with MDMaskUrisContext(document):
|
with MDMaskUrisContext(document):
|
||||||
chunks: list[str] = split_markdown_text(document.content.decode(), self.chunk_size)
|
# 使用新接口,获取 chunks 和对应的 separators
|
||||||
|
chunks, separators = split_markdown_with_layout(document.content.decode(), self.chunk_size)
|
||||||
|
|
||||||
translate_indices: List[int] = []
|
translate_indices: List[int] = []
|
||||||
translate_chunks: List[str] = []
|
translate_chunks: List[str] = []
|
||||||
final_result: List[str] = list(chunks)
|
final_result: List[str] = list(chunks) # 浅拷贝,用于回填翻译结果
|
||||||
|
|
||||||
for i, chunk in enumerate(chunks):
|
for i, chunk in enumerate(chunks):
|
||||||
# 直接使用 splitter 中定义的函数
|
# 占位符不翻译
|
||||||
if is_placeholder(chunk):
|
if is_placeholder(chunk):
|
||||||
continue
|
continue
|
||||||
else:
|
else:
|
||||||
@@ -68,8 +73,9 @@ class MDTranslator(AiTranslator):
|
|||||||
for idx, translated_text in zip(translate_indices, translated_sub_results):
|
for idx, translated_text in zip(translate_indices, translated_sub_results):
|
||||||
final_result[idx] = translated_text
|
final_result[idx] = translated_text
|
||||||
|
|
||||||
content = join_markdown_texts(final_result)
|
# 使用记录的 separators 进行还原,完美保留布局
|
||||||
# 做一些加强鲁棒性的操作
|
content = join_markdown_with_layout(final_result, separators)
|
||||||
|
|
||||||
content = content.replace(r'\(', r'\(')
|
content = content.replace(r'\(', r'\(')
|
||||||
content = content.replace(r'\)', r'\)')
|
content = content.replace(r'\)', r'\)')
|
||||||
|
|
||||||
@@ -80,7 +86,8 @@ class MDTranslator(AiTranslator):
|
|||||||
async def translate_async(self, document: MarkdownDocument) -> Self:
|
async def translate_async(self, document: MarkdownDocument) -> Self:
|
||||||
self.logger.info("正在翻译markdown")
|
self.logger.info("正在翻译markdown")
|
||||||
with MDMaskUrisContext(document):
|
with MDMaskUrisContext(document):
|
||||||
chunks: list[str] = split_markdown_text(document.content.decode(), self.chunk_size)
|
# 异步方法同样更新
|
||||||
|
chunks, separators = split_markdown_with_layout(document.content.decode(), self.chunk_size)
|
||||||
|
|
||||||
translate_indices: List[int] = []
|
translate_indices: List[int] = []
|
||||||
translate_chunks: List[str] = []
|
translate_chunks: List[str] = []
|
||||||
@@ -107,7 +114,7 @@ class MDTranslator(AiTranslator):
|
|||||||
final_result[idx] = translated_text
|
final_result[idx] = translated_text
|
||||||
|
|
||||||
def run():
|
def run():
|
||||||
content = join_markdown_texts(final_result)
|
content = join_markdown_with_layout(final_result, separators)
|
||||||
content = content.replace(r'\(', r'\(')
|
content = content.replace(r'\(', r'\(')
|
||||||
content = content.replace(r'\)', r'\)')
|
content = content.replace(r'\)', r'\)')
|
||||||
document.content = content.encode()
|
document.content = content.encode()
|
||||||
|
|||||||
@@ -1,213 +1,247 @@
|
|||||||
# SPDX-FileCopyrightText: 2025 QinHan
|
# SPDX-FileCopyrightText: 2025 QinHan
|
||||||
# SPDX-License-Identifier: MPL-2.0
|
# SPDX-License-Identifier: MPL-2.0
|
||||||
import re
|
import re
|
||||||
from typing import List
|
from typing import List, Tuple, Optional
|
||||||
|
|
||||||
|
|
||||||
def is_placeholder(text: str) -> bool:
|
def is_placeholder(text: str) -> bool:
|
||||||
"""
|
"""判断文本块是否是图片占位符"""
|
||||||
判断文本块是否仅包含图片占位符
|
|
||||||
匹配格式: <ph-abc123> (允许前后空白)
|
|
||||||
"""
|
|
||||||
return bool(re.match(r'^\s*<ph-[a-zA-Z0-9]+>\s*$', text))
|
return bool(re.match(r'^\s*<ph-[a-zA-Z0-9]+>\s*$', text))
|
||||||
|
|
||||||
|
|
||||||
class MarkdownBlockSplitter:
|
class MarkdownBlockSplitter:
|
||||||
def __init__(self, max_block_size: int = 5000):
|
def __init__(self, max_block_size: int = 5000):
|
||||||
"""
|
|
||||||
初始化Markdown分块器
|
|
||||||
参数:
|
|
||||||
max_block_size: 每个块的最大字节数
|
|
||||||
"""
|
|
||||||
self.max_block_size = max_block_size
|
self.max_block_size = max_block_size
|
||||||
self.placeholder_pattern = r'(<ph-[a-zA-Z0-9]+>)'
|
# 匹配 代码块 或 占位符
|
||||||
|
self.special_token_pattern = r'(```[\s\S]*?```|~~~[\s\S]*?~~~|<ph-[a-zA-Z0-9]+>)'
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _get_bytes(text: str) -> int:
|
def _get_bytes(text: str) -> int:
|
||||||
return len(text.encode('utf-8'))
|
return len(text.encode('utf-8'))
|
||||||
|
|
||||||
def split_markdown(self, markdown_text: str) -> List[str]:
|
def split_with_layout(self, markdown_text: str) -> Tuple[List[str], List[str]]:
|
||||||
"""
|
"""
|
||||||
将Markdown文本分割成指定大小的块
|
分割Markdown,并返回 (内容块列表, 分隔符列表)
|
||||||
|
separators[i] 是 chunks[i] 和 chunks[i+1] 之间的原始文本
|
||||||
"""
|
"""
|
||||||
logical_blocks = self._split_into_logical_blocks(markdown_text)
|
# 1. 细粒度切分:将文本切分为 [Block, Separator, Block, Separator...]
|
||||||
|
raw_blocks, raw_separators = self._tokenize(markdown_text)
|
||||||
|
|
||||||
|
# 2. 聚合:将小的 Block 合并为大的 Chunk,同时合并中间的 Separator
|
||||||
chunks = []
|
chunks = []
|
||||||
current_chunk_parts = []
|
final_separators = []
|
||||||
current_size = 0
|
|
||||||
|
|
||||||
for block in logical_blocks:
|
if not raw_blocks:
|
||||||
block_size = self._get_bytes(block)
|
return [], []
|
||||||
|
|
||||||
# 如果是占位符,必须单独成块,且强制切断当前累积的内容
|
current_chunk = raw_blocks[0]
|
||||||
if is_placeholder(block):
|
current_size = self._get_bytes(current_chunk)
|
||||||
if current_chunk_parts:
|
|
||||||
chunks.append("".join(current_chunk_parts))
|
|
||||||
current_chunk_parts = []
|
|
||||||
current_size = 0
|
|
||||||
chunks.append(block)
|
|
||||||
continue
|
|
||||||
|
|
||||||
# 情况1:块本身就过大
|
for i in range(len(raw_separators)):
|
||||||
if block_size > self.max_block_size:
|
next_block = raw_blocks[i + 1]
|
||||||
if current_chunk_parts:
|
separator = raw_separators[i]
|
||||||
chunks.append("".join(current_chunk_parts))
|
|
||||||
current_chunk_parts = []
|
|
||||||
current_size = 0
|
|
||||||
chunks.extend(self._split_large_block(block))
|
|
||||||
continue
|
|
||||||
|
|
||||||
# 情况2:将此块添加到当前chunk会超限
|
next_block_size = self._get_bytes(next_block)
|
||||||
if current_size + block_size > self.max_block_size:
|
separator_size = self._get_bytes(separator)
|
||||||
if current_chunk_parts:
|
|
||||||
chunks.append("".join(current_chunk_parts))
|
# 判断是否需要切分
|
||||||
current_chunk_parts = [block]
|
# 1. 遇到占位符,强制切分(为了保护图片不被混入翻译文本中)
|
||||||
current_size = block_size
|
# 2. 当前块 + 分隔符 + 下一块 超过最大限制
|
||||||
# 情况3:正常添加
|
if is_placeholder(current_chunk) or is_placeholder(next_block) or \
|
||||||
|
(current_size + separator_size + next_block_size > self.max_block_size):
|
||||||
|
|
||||||
|
# 结束当前块
|
||||||
|
chunks.append(current_chunk)
|
||||||
|
# 记录连接到下一块的分隔符
|
||||||
|
final_separators.append(separator)
|
||||||
|
|
||||||
|
# 开始新块
|
||||||
|
current_chunk = next_block
|
||||||
|
current_size = next_block_size
|
||||||
else:
|
else:
|
||||||
current_chunk_parts.append(block)
|
# 合并
|
||||||
current_size += block_size
|
# 新的当前块 = 旧当前块 + 分隔符 + 下一块
|
||||||
|
current_chunk += separator + next_block
|
||||||
|
current_size += separator_size + next_block_size
|
||||||
|
|
||||||
if current_chunk_parts:
|
# 添加最后一个块
|
||||||
chunks.append("".join(current_chunk_parts))
|
chunks.append(current_chunk)
|
||||||
|
|
||||||
return chunks
|
return chunks, final_separators
|
||||||
|
|
||||||
def _split_into_logical_blocks(self, markdown_text: str) -> List[str]:
|
def _tokenize(self, text: str) -> Tuple[List[str], List[str]]:
|
||||||
text = markdown_text.replace('\r\n', '\n')
|
"""
|
||||||
# 分割代码块
|
将文本初步标记化为逻辑单元。
|
||||||
code_block_pattern = r'(```[\s\S]*?```|~~~[\s\S]*?~~~)'
|
逻辑单元包括:代码块、占位符、普通段落。
|
||||||
parts = re.split(code_block_pattern, text)
|
单元之间的所有字符(通常是空白)都被视为分隔符。
|
||||||
|
"""
|
||||||
|
text = text.replace('\r\n', '\n')
|
||||||
|
|
||||||
blocks = []
|
# 1. 按 代码块 和 占位符 初步切分
|
||||||
|
# re.split 包含捕获组时,结果列表为: [Text, Token, Text, Token, Text]
|
||||||
|
parts = re.split(self.special_token_pattern, text)
|
||||||
|
|
||||||
|
blocks = [] # 存储逻辑内容块
|
||||||
|
separators = [] # 存储块之间的分隔符
|
||||||
|
|
||||||
|
# 临时缓冲区,用于处理 split 产生的纯文本部分
|
||||||
|
def process_text_part(text_part):
|
||||||
|
if not text_part:
|
||||||
|
return []
|
||||||
|
# 对普通文本,按段落(双换行)再次切分
|
||||||
|
# 我们需要保留切分符,所以用捕获组
|
||||||
|
sub_parts = re.split(r'(\n{2,})', text_part)
|
||||||
|
return sub_parts
|
||||||
|
|
||||||
|
# 初始化:处理第一个部分
|
||||||
|
# 整个流程是一个状态机,我们在寻找 "Content" -> "Separator" -> "Content" 的链条
|
||||||
|
|
||||||
|
# 为了简化逻辑,我们先把 parts 扁平化为一个 token 流
|
||||||
|
# 流中的元素要么是重要Token(Code/PH),要么是普通文本(Text)
|
||||||
|
flat_tokens = []
|
||||||
for i, part in enumerate(parts):
|
for i, part in enumerate(parts):
|
||||||
if not part:
|
if not part:
|
||||||
continue
|
continue
|
||||||
|
if re.match(self.special_token_pattern, part):
|
||||||
# 代码块直接添加
|
flat_tokens.append({'type': 'special', 'text': part})
|
||||||
if i % 2 == 1:
|
|
||||||
blocks.append(part)
|
|
||||||
else:
|
else:
|
||||||
# 普通文本:先切分出占位符
|
# 普通文本,继续细分段落
|
||||||
ph_parts = re.split(self.placeholder_pattern, part)
|
sub_parts = process_text_part(part)
|
||||||
for ph_part in ph_parts:
|
for sp in sub_parts:
|
||||||
if not ph_part:
|
if not sp: continue
|
||||||
|
# 只有双换行才被明确视为分隔符逻辑,单换行通常归于段落内
|
||||||
|
# 但为了精准还原,我们把所有 re.split 出来的项都视为独立单元
|
||||||
|
flat_tokens.append({'type': 'text', 'text': sp})
|
||||||
|
|
||||||
|
if not flat_tokens:
|
||||||
|
return [], []
|
||||||
|
|
||||||
|
# 接下来进行 "Whitespace Shifting" (空白归约)
|
||||||
|
# 我们希望 block 是纯净的内容,separator 是 block 之间的空白
|
||||||
|
# 例如: "Text \n <ph>" -> Block="Text", Sep=" \n ", Block="<ph>"
|
||||||
|
|
||||||
|
normalized_blocks = []
|
||||||
|
normalized_separators = []
|
||||||
|
|
||||||
|
current_block_text = ""
|
||||||
|
pending_separator = ""
|
||||||
|
|
||||||
|
for i, token in enumerate(flat_tokens):
|
||||||
|
content = token['text']
|
||||||
|
|
||||||
|
# 如果是特殊块(代码/占位符),它本身就是核心内容,前后不能有粘连
|
||||||
|
if token['type'] == 'special':
|
||||||
|
if current_block_text:
|
||||||
|
normalized_blocks.append(current_block_text)
|
||||||
|
normalized_separators.append(pending_separator)
|
||||||
|
current_block_text = ""
|
||||||
|
pending_separator = ""
|
||||||
|
|
||||||
|
normalized_blocks.append(content)
|
||||||
|
# 特殊块处理完,它的位置占住了,接下来的空白应该算作 separator
|
||||||
|
# 但我们需要看下一个 token 是啥。
|
||||||
|
# 简单处理:将特殊块直接加入,接下来的文本如果是空白,就是 separator
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if is_placeholder(ph_part):
|
# 如果是普通文本
|
||||||
blocks.append(ph_part)
|
# 检查是否全是空白(这是分隔符候选)
|
||||||
|
if not content.strip():
|
||||||
|
# 如果当前没有积累的 block,这可能是开头的空白,或者是两个 special 块之间的空白
|
||||||
|
if not normalized_blocks and not current_block_text:
|
||||||
|
# 忽略文件开头的空白,或者附加到下一个块?
|
||||||
|
# 为了对齐 list 长度,通常忽略开头,或者视为第一个块的一部分(如果不翻译)
|
||||||
|
pass
|
||||||
|
elif normalized_blocks and not current_block_text:
|
||||||
|
# 前面已经有一个完整块,现在还没开始新块,这个空白是 separator
|
||||||
|
# 如果之前已经有 pending_separator,则叠加
|
||||||
|
if len(normalized_separators) < len(normalized_blocks):
|
||||||
|
normalized_separators.append(content)
|
||||||
else:
|
else:
|
||||||
# 再按空行切分段落
|
# 这种情况应该少见,追加到上一个 separator
|
||||||
sub_parts = re.split(r'(\n{2,})', ph_part)
|
normalized_separators[-1] += content
|
||||||
blocks.extend([p for p in sub_parts if p])
|
|
||||||
return blocks
|
|
||||||
|
|
||||||
def _split_large_block(self, block: str) -> List[str]:
|
|
||||||
# 代码块处理
|
|
||||||
if block.startswith(('```', '~~~')):
|
|
||||||
lines = block.split('\n')
|
|
||||||
header = lines[0]
|
|
||||||
footer = lines[-1]
|
|
||||||
content_lines = lines[1:-1]
|
|
||||||
chunks = []
|
|
||||||
current_chunk_lines = [header]
|
|
||||||
current_size = self._get_bytes(header) + 1
|
|
||||||
|
|
||||||
for line in content_lines:
|
|
||||||
line_size = self._get_bytes(line) + 1
|
|
||||||
if current_size + line_size + self._get_bytes(footer) > self.max_block_size:
|
|
||||||
current_chunk_lines.append(footer)
|
|
||||||
chunks.append('\n'.join(current_chunk_lines))
|
|
||||||
current_chunk_lines = [header, line]
|
|
||||||
current_size = self._get_bytes(header) + 1 + line_size
|
|
||||||
else:
|
else:
|
||||||
current_chunk_lines.append(line)
|
# current_block_text 正在积累,遇到了空白
|
||||||
current_size += line_size
|
# 比如 "Hello \n\n World" 中的 \n\n
|
||||||
|
# 结束当前块
|
||||||
if len(current_chunk_lines) > 1:
|
normalized_blocks.append(current_block_text)
|
||||||
current_chunk_lines.append(footer)
|
current_block_text = ""
|
||||||
chunks.append('\n'.join(current_chunk_lines))
|
normalized_separators.append(content)
|
||||||
return chunks
|
|
||||||
|
|
||||||
# 普通文本处理
|
|
||||||
lines = block.split('\n')
|
|
||||||
chunks = []
|
|
||||||
current_chunk = []
|
|
||||||
current_size = 0
|
|
||||||
for line in lines:
|
|
||||||
line_size = self._get_bytes(line) + 1
|
|
||||||
if current_size + line_size > self.max_block_size and current_chunk:
|
|
||||||
chunks.append('\n'.join(current_chunk))
|
|
||||||
current_chunk = [line]
|
|
||||||
current_size = line_size - 1
|
|
||||||
else:
|
else:
|
||||||
current_chunk.append(line)
|
# 是有内容的文本
|
||||||
current_size += line_size
|
# 剥离前导空白(归入上一个分隔符)和尾随空白(归入下一个分隔符)?
|
||||||
|
# 简单起见,利用 rstrip 将尾部空白视为分隔符的一部分
|
||||||
|
|
||||||
if current_chunk:
|
# 更好的策略:
|
||||||
chunks.append('\n'.join(current_chunk))
|
# 文本 token 自身可能包含换行(段落内)。
|
||||||
return chunks
|
# 我们只在 tokenize 阶段切分了 \n{2,}。
|
||||||
|
# 所以 content 基本是一个完整的段落或代码块周围的文本。
|
||||||
|
|
||||||
|
# 如果上一个块已经结束 (normalized_blocks > normalized_separators),说明缺分隔符
|
||||||
|
if len(normalized_blocks) > len(normalized_separators):
|
||||||
|
# 这意味着两个非空文本紧挨着?理论上 tokenize 阶段应该切开了
|
||||||
|
normalized_separators.append("")
|
||||||
|
|
||||||
|
# 剥离尾部空白作为 potential separator
|
||||||
|
stripped = content.rstrip()
|
||||||
|
trailing_space = content[len(stripped):]
|
||||||
|
|
||||||
|
if current_block_text:
|
||||||
|
# 合并到当前正在构建的段落(极少发生,因为我们按split切分)
|
||||||
|
current_block_text += content
|
||||||
|
else:
|
||||||
|
# 新的文本块
|
||||||
|
# 但要注意,如果这个文本块前面有空白,那个空白已经在上面处理了
|
||||||
|
# 这里只需要处理自己
|
||||||
|
normalized_blocks.append(stripped)
|
||||||
|
if trailing_space:
|
||||||
|
# 这个尾部空白暂时存起来,看后面接什么
|
||||||
|
# 实际上在我们的循环模型里,直接视为 separator 比较安全
|
||||||
|
# 除非它是文件结尾
|
||||||
|
if i < len(flat_tokens) - 1:
|
||||||
|
normalized_separators.append(trailing_space)
|
||||||
|
else:
|
||||||
|
# 文件末尾的空白,可以忽略或加回 block
|
||||||
|
normalized_blocks[-1] += trailing_space
|
||||||
|
|
||||||
|
# 修正长度:separators 数量应该是 blocks - 1
|
||||||
|
while len(normalized_separators) < len(normalized_blocks) - 1:
|
||||||
|
normalized_separators.append("\n\n") # 默认 fallback
|
||||||
|
|
||||||
|
return normalized_blocks, normalized_separators
|
||||||
|
|
||||||
|
|
||||||
def split_markdown_text(markdown_text: str, max_block_size=5000) -> List[str]:
|
def split_markdown_with_layout(markdown_text: str, max_block_size=5000) -> Tuple[List[str], List[str]]:
|
||||||
|
"""
|
||||||
|
外部调用的主入口
|
||||||
|
返回: (chunks, separators)
|
||||||
|
"""
|
||||||
splitter = MarkdownBlockSplitter(max_block_size=max_block_size)
|
splitter = MarkdownBlockSplitter(max_block_size=max_block_size)
|
||||||
chunks = splitter.split_markdown(markdown_text)
|
return splitter.split_with_layout(markdown_text)
|
||||||
# 过滤空块,但保留占位符
|
|
||||||
return [chunk for chunk in chunks if chunk.strip() or is_placeholder(chunk)]
|
|
||||||
|
|
||||||
|
|
||||||
def _needs_single_newline_join(prev_chunk: str, next_chunk: str) -> bool:
|
def join_markdown_with_layout(chunks: List[str], separators: List[str]) -> str:
|
||||||
"""判断常规文本是否需要单换行连接"""
|
"""
|
||||||
if not prev_chunk.strip() or not next_chunk.strip():
|
使用保存的分隔符还原 Markdown
|
||||||
return False
|
"""
|
||||||
|
if not chunks:
|
||||||
|
return ""
|
||||||
|
|
||||||
last_line_prev = prev_chunk.rstrip().split('\n')[-1].lstrip()
|
result = chunks[0]
|
||||||
first_line_next = next_chunk.lstrip().split('\n')[0].lstrip()
|
for i in range(len(separators)):
|
||||||
|
# 安全检查,防止索引越界(虽然 split 保证了长度对应)
|
||||||
|
sep = separators[i] if i < len(separators) else "\n\n"
|
||||||
|
next_chunk = chunks[i + 1] if i + 1 < len(chunks) else ""
|
||||||
|
result += sep + next_chunk
|
||||||
|
|
||||||
# 表格
|
return result
|
||||||
if last_line_prev.startswith('|') and last_line_prev.endswith('|') and \
|
|
||||||
first_line_next.startswith('|') and first_line_next.endswith('|'):
|
|
||||||
return True
|
|
||||||
|
|
||||||
# 列表
|
|
||||||
list_markers = r'^\s*([-*+]|\d+\.)\s+'
|
|
||||||
if re.match(list_markers, last_line_prev) and re.match(list_markers, first_line_next):
|
|
||||||
return True
|
|
||||||
|
|
||||||
# 引用
|
# 兼容旧接口,防止其他地方报错
|
||||||
if last_line_prev.startswith('>') and first_line_next.startswith('>'):
|
def split_markdown_text(markdown_text: str, max_block_size=5000) -> List[str]:
|
||||||
return True
|
chunks, _ = split_markdown_with_layout(markdown_text, max_block_size)
|
||||||
|
return chunks
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def join_markdown_texts(markdown_texts: List[str]) -> str:
|
def join_markdown_texts(markdown_texts: List[str]) -> str:
|
||||||
"""
|
# 旧接口只能猜,建议尽量使用新接口
|
||||||
智能地拼接Markdown块列表
|
return "\n\n".join(markdown_texts)
|
||||||
"""
|
|
||||||
if not markdown_texts:
|
|
||||||
return ""
|
|
||||||
|
|
||||||
joined_text = markdown_texts[0]
|
|
||||||
for i in range(1, len(markdown_texts)):
|
|
||||||
prev_chunk = markdown_texts[i - 1]
|
|
||||||
current_chunk = markdown_texts[i]
|
|
||||||
|
|
||||||
# === 核心修复逻辑 ===
|
|
||||||
# 如果前一块或后一块是占位符,强制使用单换行 '\n'
|
|
||||||
# 这样可以保证:
|
|
||||||
# 1. 连续的徽章/图片 [img1]\n[img2] 会紧凑排列(视为行内元素)
|
|
||||||
# 2. HTML结构 <p>\n<img>\n</p> 不会被打断
|
|
||||||
# 3. 标题后的图片 # Title\n<img> 也能正常渲染
|
|
||||||
if is_placeholder(prev_chunk) or is_placeholder(current_chunk):
|
|
||||||
separator = "\n"
|
|
||||||
|
|
||||||
elif _needs_single_newline_join(prev_chunk, current_chunk):
|
|
||||||
separator = "\n"
|
|
||||||
else:
|
|
||||||
# 只有两个纯文本段落之间才用双换行
|
|
||||||
separator = "\n\n"
|
|
||||||
|
|
||||||
joined_text += separator + current_chunk
|
|
||||||
|
|
||||||
return joined_text
|
|
||||||
Reference in New Issue
Block a user