修复markdown图片缺失问题

This commit is contained in:
xunbu
2025-11-25 12:47:27 +08:00
parent bbf34762ab
commit 8cc903813b
5 changed files with 110 additions and 41 deletions

View File

@@ -19,7 +19,7 @@ def generate_prompt(markdown_text: str, to_lang: str):
return f"""
Treat the text input as markdown text and translate it into {to_lang},output translation ONLY.
- NO explanations. NO notes.
- Do not change placeholders in the format of `<ph-xxxxxx>`.
- (very important) Preserve all placeholders in the format <ph-abcdef> (example: <ph-1>, <ph-af12asd>).
- For special tags or other non-translatable elements (like codes, brand names, specific jargon), keep them in their original form.
- All formulas, regardless of length, must be represented as valid, parsable LaTeX. They must be correctly enclosed by `$`, `\\(\\)`, or `$$`. If a formula is not formatted correctly, you must fix it.
- Remove or correct any obviously abnormal characters, but without altering the original meaning.

View File

@@ -153,7 +153,7 @@
"contributorsGithub": "github 主页",
"contributorsPR": "提交 Pull Request",
"contributorsIssue": "报告 Issue",
"contributorsQQ": "或者通过QQ群联系作者<span>1047781902</span>",
"contributorsQQ": "或者通过QQ群联系作者1047781902",
"glossaryModalTitle": "当前术语表",
"glossaryTableSource": "原文 (src)",
"glossaryTableDestination": "译文 (dst)",
@@ -348,7 +348,7 @@
"contributorsGithub": "GitHub Page",
"contributorsPR": "Submit a Pull Request",
"contributorsIssue": "Report an Issue",
"contributorsQQ": "Or contact the author via the QQ group: <span>1047781902</span>",
"contributorsQQ": "Or contact the author via the QQ group: 1047781902",
"glossaryModalTitle": "Current Glossary",
"glossaryTableSource": "Source (src)",
"glossaryTableDestination": "Destination (dst)",

View File

@@ -1,8 +1,9 @@
# SPDX-FileCopyrightText: 2025 QinHan
# SPDX-License-Identifier: MPL-2.0
import asyncio
import re
from dataclasses import dataclass
from typing import Self
from typing import Self, List
from docutranslate.agents import MDTranslateAgent
from docutranslate.agents.markdown_agent import MDTranslateAgentConfig
@@ -38,20 +39,48 @@ class MDTranslator(AiTranslator):
system_proxy_enable=config.system_proxy_enable)
self.translate_agent = MDTranslateAgent(agent_config)
@staticmethod
def _is_placeholder(text: str) -> bool:
"""Return True if the text chunk contains nothing but a single ``<ph-...>`` placeholder."""
# Matches the <ph-xxxxxx> form (ASCII letters/digits only); leading and trailing whitespace is allowed
return bool(re.match(r'^\s*<ph-[a-zA-Z0-9]+>\s*$', text))
def translate(self, document: MarkdownDocument) -> Self:
self.logger.info("正在翻译markdown")
with MDMaskUrisContext(document):
chunks: list[str] = split_markdown_text(document.content.decode(), self.chunk_size)
if self.glossary_agent:
self.glossary_dict_gen = self.glossary_agent.send_segments(chunks, self.chunk_size)
# 预处理:分离出需要翻译的文本块和不需要翻译的占位符块
translate_indices: List[int] = []
translate_chunks: List[str] = []
final_result: List[str] = list(chunks) # 浅拷贝,预填充原始值
for i, chunk in enumerate(chunks):
if self._is_placeholder(chunk):
# 如果是占位符不需要处理final_result中该位置保持原样
continue
else:
translate_indices.append(i)
translate_chunks.append(chunk)
if self.glossary_agent and translate_chunks:
# 仅对需要翻译的文本提取术语
self.glossary_dict_gen = self.glossary_agent.send_segments(translate_chunks, self.chunk_size)
if self.translate_agent:
self.translate_agent.update_glossary_dict(self.glossary_dict_gen)
self.logger.info(f"markdown分为{len(chunks)}")
if self.translate_agent:
result: list[str] = self.translate_agent.send_chunks(chunks)
else:
result = chunks
content = join_markdown_texts(result)
self.logger.info(f"markdown分为{len(chunks)}块 (其中需翻译{len(translate_chunks)}块)")
if self.translate_agent and translate_chunks:
translated_sub_results: list[str] = self.translate_agent.send_chunks(translate_chunks)
# 将翻译结果回填到对应位置
for idx, translated_text in zip(translate_indices, translated_sub_results):
final_result[idx] = translated_text
# 如果没有翻译代理或者没有需要翻译的块final_result 已经包含了正确的内容原始chunks
content = join_markdown_texts(final_result)
# 做一些加强鲁棒性的操作
content = content.replace(r'\', r'\(')
content = content.replace(r'\', r'\)')
@@ -65,19 +94,35 @@ class MDTranslator(AiTranslator):
with MDMaskUrisContext(document):
chunks: list[str] = split_markdown_text(document.content.decode(), self.chunk_size)
if self.glossary_agent:
self.glossary_dict_gen = await self.glossary_agent.send_segments_async(chunks, self.chunk_size)
# 预处理:分离出需要翻译的文本块和不需要翻译的占位符块
translate_indices: List[int] = []
translate_chunks: List[str] = []
final_result: List[str] = list(chunks)
for i, chunk in enumerate(chunks):
if self._is_placeholder(chunk):
continue
else:
translate_indices.append(i)
translate_chunks.append(chunk)
if self.glossary_agent and translate_chunks:
self.glossary_dict_gen = await self.glossary_agent.send_segments_async(translate_chunks,
self.chunk_size)
if self.translate_agent:
self.translate_agent.update_glossary_dict(self.glossary_dict_gen)
self.logger.info(f"markdown分为{len(chunks)}")
if self.translate_agent:
result: list[str] = await self.translate_agent.send_chunks_async(chunks)
else:
result = chunks
self.logger.info(f"markdown分为{len(chunks)} (其中需翻译{len(translate_chunks)}块)")
if self.translate_agent and translate_chunks:
translated_sub_results: list[str] = await self.translate_agent.send_chunks_async(translate_chunks)
# 将翻译结果回填到对应位置
for idx, translated_text in zip(translate_indices, translated_sub_results):
final_result[idx] = translated_text
def run():
content = join_markdown_texts(result)
content = join_markdown_texts(final_result)
# 做一些加强鲁棒性的操作
content = content.replace(r'\', r'\(')
content = content.replace(r'\', r'\)')
@@ -85,4 +130,4 @@ class MDTranslator(AiTranslator):
await asyncio.to_thread(run)
self.logger.info("翻译完成")
return self
return self

View File

@@ -4,8 +4,6 @@ import re
from typing import List
class MarkdownBlockSplitter:
def __init__(self, max_block_size: int = 5000):
"""
@@ -15,11 +13,17 @@ class MarkdownBlockSplitter:
max_block_size: 每个块的最大字节数
"""
self.max_block_size = max_block_size
# 匹配占位符的正则,例如 <ph-abc123>
self.placeholder_pattern = r'(<ph-[a-zA-Z0-9]+>)'
@staticmethod
def _get_bytes(text: str) -> int:
"""Return the size of ``text`` in bytes when encoded as UTF-8."""
return len(text.encode('utf-8'))
def _is_placeholder(self, text: str) -> bool:
"""Return True if the text, ignoring surrounding whitespace, is exactly one placeholder."""
# Anchor self.placeholder_pattern at both ends so a placeholder embedded in longer text does not count
return bool(re.match(r'^' + self.placeholder_pattern + r'$', text.strip()))
def split_markdown(self, markdown_text: str) -> List[str]:
"""
将Markdown文本分割成指定大小的块
@@ -37,6 +41,18 @@ class MarkdownBlockSplitter:
for block in logical_blocks:
block_size = self._get_bytes(block)
# 检查是否是占位符块(需要单独成块)
if self._is_placeholder(block):
# 如果当前有积累的块,先输出
if current_chunk_parts:
chunks.append("".join(current_chunk_parts))
current_chunk_parts = []
current_size = 0
# 占位符单独作为一个chunk
chunks.append(block)
continue
# 情况1块本身就过大
if block_size > self.max_block_size:
# 先将当前积累的块输出
@@ -69,7 +85,7 @@ class MarkdownBlockSplitter:
def _split_into_logical_blocks(self, markdown_text: str) -> List[str]:
"""
将Markdown文本分割成逻辑块标题、段落、代码块、空行分隔符等
将Markdown文本分割成逻辑块标题、段落、代码块、空行分隔符、图片占位符等)
"""
# 标准化换行符
text = markdown_text.replace('\r\n', '\n')
@@ -86,11 +102,21 @@ class MarkdownBlockSplitter:
if i % 2 == 1: # 这是一个代码块
blocks.append(part)
else: # 这是普通Markdown内容
# 按一个或多个空行分割,并保留分隔符
# 这能有效分离段落、列表、标题等,并保留它们之间的空行
sub_parts = re.split(r'(\n{2,})', part)
# 过滤掉 re.split 可能产生的空字符串
blocks.extend([p for p in sub_parts if p])
# 1. 先按占位符分割,确保占位符独立
ph_parts = re.split(self.placeholder_pattern, part)
for ph_part in ph_parts:
if not ph_part:
continue
if self._is_placeholder(ph_part):
blocks.append(ph_part)
else:
# 2. 对非占位符文本,按一个或多个空行分割,并保留分隔符
# 这能有效分离段落、列表、标题等,并保留它们之间的空行
sub_parts = re.split(r'(\n{2,})', ph_part)
# 过滤掉 re.split 可能产生的空字符串
blocks.extend([p for p in sub_parts if p])
return blocks
@@ -153,8 +179,8 @@ def split_markdown_text(markdown_text: str, max_block_size=5000) -> List[str]:
"""
splitter = MarkdownBlockSplitter(max_block_size=max_block_size)
chunks = splitter.split_markdown(markdown_text)
# 过滤掉仅由空白字符组成的块
return [chunk for chunk in chunks if chunk.strip()]
# 过滤掉仅由空白字符组成的块,但保留占位符块
return [chunk for chunk in chunks if chunk.strip() or splitter._is_placeholder(chunk)]
def _needs_single_newline_join(prev_chunk: str, next_chunk: str) -> bool:
@@ -165,6 +191,13 @@ def _needs_single_newline_join(prev_chunk: str, next_chunk: str) -> bool:
if not prev_chunk.strip() or not next_chunk.strip():
return False
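# If either chunk is a bare placeholder, do not join with a single newline.
# The original layout may have been more compact, but in a translation workflow
# keeping the placeholder as its own block is safer, so return False explicitly
# here; the caller then uses the "\n\n" separator.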
if re.match(r'^\s*<ph-[a-zA-Z0-9]+>\s*$', prev_chunk) or \
re.match(r'^\s*<ph-[a-zA-Z0-9]+>\s*$', next_chunk):
return False
last_line_prev = prev_chunk.rstrip().split('\n')[-1].lstrip()
first_line_next = next_chunk.lstrip().split('\n')[0].lstrip()
@@ -207,12 +240,3 @@ def join_markdown_texts(markdown_texts: List[str]) -> str:
joined_text += separator + current_chunk
return joined_text
if __name__ == '__main__':
from pathlib import Path
from docutranslate.utils.markdown_utils import clean_markdown_math_block
content=Path(r"C:\Users\jxgm\Desktop\3a8d8999-3e9d-4f32-a32c-5b0830bb4320\full.md").read_text()
content=split_markdown_text(content)
content=join_markdown_texts(content)

View File

@@ -81,7 +81,7 @@ def placeholder2uris(markdown: str, mask_dict: MaskDict):
print(f"占位符<ph-{id}>已还原为图片")
return uri
ph_pattern = r"<ph-([a-zA-Z0-9]+)>"
ph_pattern = r"<\s*[pP][hH]\s*-\s*([a-zA-Z0-9]+)\s*>"
markdown = re.sub(ph_pattern, placeholder2uri, markdown)
return markdown