修复markdown图片缺失问题

This commit is contained in:
xunbu
2025-11-25 12:47:27 +08:00
parent bbf34762ab
commit 8cc903813b
5 changed files with 110 additions and 41 deletions

View File

@@ -19,7 +19,7 @@ def generate_prompt(markdown_text: str, to_lang: str):
return f""" return f"""
Treat the text input as markdown text and translate it into {to_lang},output translation ONLY. Treat the text input as markdown text and translate it into {to_lang},output translation ONLY.
- NO explanations. NO notes. - NO explanations. NO notes.
- Do not change placeholders in the format of `<ph-xxxxxx>`. - (very important) Preserve all placeholders in the format <ph-abcdef> (example: <ph-1>, <ph-af12asd>).
- For special tags or other non-translatable elements (like codes, brand names, specific jargon), keep them in their original form. - For special tags or other non-translatable elements (like codes, brand names, specific jargon), keep them in their original form.
- All formulas, regardless of length, must be represented as valid, parsable LaTeX. They must be correctly enclosed by `$`, `\\(\\)`, or `$$`. If a formula is not formatted correctly, you must fix it. - All formulas, regardless of length, must be represented as valid, parsable LaTeX. They must be correctly enclosed by `$`, `\\(\\)`, or `$$`. If a formula is not formatted correctly, you must fix it.
- Remove or correct any obviously abnormal characters, but without altering the original meaning. - Remove or correct any obviously abnormal characters, but without altering the original meaning.

View File

@@ -153,7 +153,7 @@
"contributorsGithub": "github 主页", "contributorsGithub": "github 主页",
"contributorsPR": "提交 Pull Request", "contributorsPR": "提交 Pull Request",
"contributorsIssue": "报告 Issue", "contributorsIssue": "报告 Issue",
"contributorsQQ": "或者通过QQ群联系作者<span>1047781902</span>", "contributorsQQ": "或者通过QQ群联系作者1047781902",
"glossaryModalTitle": "当前术语表", "glossaryModalTitle": "当前术语表",
"glossaryTableSource": "原文 (src)", "glossaryTableSource": "原文 (src)",
"glossaryTableDestination": "译文 (dst)", "glossaryTableDestination": "译文 (dst)",
@@ -348,7 +348,7 @@
"contributorsGithub": "GitHub Page", "contributorsGithub": "GitHub Page",
"contributorsPR": "Submit a Pull Request", "contributorsPR": "Submit a Pull Request",
"contributorsIssue": "Report an Issue", "contributorsIssue": "Report an Issue",
"contributorsQQ": "Or contact the author via the QQ group: <span>1047781902</span>", "contributorsQQ": "Or contact the author via the QQ group: 1047781902",
"glossaryModalTitle": "Current Glossary", "glossaryModalTitle": "Current Glossary",
"glossaryTableSource": "Source (src)", "glossaryTableSource": "Source (src)",
"glossaryTableDestination": "Destination (dst)", "glossaryTableDestination": "Destination (dst)",

View File

@@ -1,8 +1,9 @@
# SPDX-FileCopyrightText: 2025 QinHan # SPDX-FileCopyrightText: 2025 QinHan
# SPDX-License-Identifier: MPL-2.0 # SPDX-License-Identifier: MPL-2.0
import asyncio import asyncio
import re
from dataclasses import dataclass from dataclasses import dataclass
from typing import Self from typing import Self, List
from docutranslate.agents import MDTranslateAgent from docutranslate.agents import MDTranslateAgent
from docutranslate.agents.markdown_agent import MDTranslateAgentConfig from docutranslate.agents.markdown_agent import MDTranslateAgentConfig
@@ -38,20 +39,48 @@ class MDTranslator(AiTranslator):
system_proxy_enable=config.system_proxy_enable) system_proxy_enable=config.system_proxy_enable)
self.translate_agent = MDTranslateAgent(agent_config) self.translate_agent = MDTranslateAgent(agent_config)
@staticmethod
def _is_placeholder(text: str) -> bool:
"""检查文本块是否仅包含图片占位符"""
# 匹配 <ph-xxxxxx> 格式,允许前后有空白
return bool(re.match(r'^\s*<ph-[a-zA-Z0-9]+>\s*$', text))
def translate(self, document: MarkdownDocument) -> Self: def translate(self, document: MarkdownDocument) -> Self:
self.logger.info("正在翻译markdown") self.logger.info("正在翻译markdown")
with MDMaskUrisContext(document): with MDMaskUrisContext(document):
chunks: list[str] = split_markdown_text(document.content.decode(), self.chunk_size) chunks: list[str] = split_markdown_text(document.content.decode(), self.chunk_size)
if self.glossary_agent:
self.glossary_dict_gen = self.glossary_agent.send_segments(chunks, self.chunk_size) # 预处理:分离出需要翻译的文本块和不需要翻译的占位符块
translate_indices: List[int] = []
translate_chunks: List[str] = []
final_result: List[str] = list(chunks) # 浅拷贝,预填充原始值
for i, chunk in enumerate(chunks):
if self._is_placeholder(chunk):
# 如果是占位符,不需要处理,final_result中该位置保持原样
continue
else:
translate_indices.append(i)
translate_chunks.append(chunk)
if self.glossary_agent and translate_chunks:
# 仅对需要翻译的文本提取术语
self.glossary_dict_gen = self.glossary_agent.send_segments(translate_chunks, self.chunk_size)
if self.translate_agent: if self.translate_agent:
self.translate_agent.update_glossary_dict(self.glossary_dict_gen) self.translate_agent.update_glossary_dict(self.glossary_dict_gen)
self.logger.info(f"markdown分为{len(chunks)}")
if self.translate_agent: self.logger.info(f"markdown分为{len(chunks)}块 (其中需翻译{len(translate_chunks)}块)")
result: list[str] = self.translate_agent.send_chunks(chunks)
else: if self.translate_agent and translate_chunks:
result = chunks translated_sub_results: list[str] = self.translate_agent.send_chunks(translate_chunks)
content = join_markdown_texts(result)
# 将翻译结果回填到对应位置
for idx, translated_text in zip(translate_indices, translated_sub_results):
final_result[idx] = translated_text
# 如果没有翻译代理或者没有需要翻译的块,final_result 已经包含了正确的内容(原始chunks)
content = join_markdown_texts(final_result)
# 做一些加强鲁棒性的操作 # 做一些加强鲁棒性的操作
content = content.replace(r'\', r'\(') content = content.replace(r'\', r'\(')
content = content.replace(r'\', r'\)') content = content.replace(r'\', r'\)')
@@ -65,19 +94,35 @@ class MDTranslator(AiTranslator):
with MDMaskUrisContext(document): with MDMaskUrisContext(document):
chunks: list[str] = split_markdown_text(document.content.decode(), self.chunk_size) chunks: list[str] = split_markdown_text(document.content.decode(), self.chunk_size)
if self.glossary_agent: # 预处理:分离出需要翻译的文本块和不需要翻译的占位符块
self.glossary_dict_gen = await self.glossary_agent.send_segments_async(chunks, self.chunk_size) translate_indices: List[int] = []
translate_chunks: List[str] = []
final_result: List[str] = list(chunks)
for i, chunk in enumerate(chunks):
if self._is_placeholder(chunk):
continue
else:
translate_indices.append(i)
translate_chunks.append(chunk)
if self.glossary_agent and translate_chunks:
self.glossary_dict_gen = await self.glossary_agent.send_segments_async(translate_chunks,
self.chunk_size)
if self.translate_agent: if self.translate_agent:
self.translate_agent.update_glossary_dict(self.glossary_dict_gen) self.translate_agent.update_glossary_dict(self.glossary_dict_gen)
self.logger.info(f"markdown分为{len(chunks)}") self.logger.info(f"markdown分为{len(chunks)} (其中需翻译{len(translate_chunks)}块)")
if self.translate_agent:
result: list[str] = await self.translate_agent.send_chunks_async(chunks) if self.translate_agent and translate_chunks:
else: translated_sub_results: list[str] = await self.translate_agent.send_chunks_async(translate_chunks)
result = chunks
# 将翻译结果回填到对应位置
for idx, translated_text in zip(translate_indices, translated_sub_results):
final_result[idx] = translated_text
def run(): def run():
content = join_markdown_texts(result) content = join_markdown_texts(final_result)
# 做一些加强鲁棒性的操作 # 做一些加强鲁棒性的操作
content = content.replace(r'\', r'\(') content = content.replace(r'\', r'\(')
content = content.replace(r'\', r'\)') content = content.replace(r'\', r'\)')

View File

@@ -4,8 +4,6 @@ import re
from typing import List from typing import List
class MarkdownBlockSplitter: class MarkdownBlockSplitter:
def __init__(self, max_block_size: int = 5000): def __init__(self, max_block_size: int = 5000):
""" """
@@ -15,11 +13,17 @@ class MarkdownBlockSplitter:
max_block_size: 每个块的最大字节数 max_block_size: 每个块的最大字节数
""" """
self.max_block_size = max_block_size self.max_block_size = max_block_size
# 匹配占位符的正则,例如 <ph-abc123>
self.placeholder_pattern = r'(<ph-[a-zA-Z0-9]+>)'
@staticmethod @staticmethod
def _get_bytes(text: str) -> int: def _get_bytes(text: str) -> int:
return len(text.encode('utf-8')) return len(text.encode('utf-8'))
def _is_placeholder(self, text: str) -> bool:
"""判断文本是否纯粹是一个占位符"""
return bool(re.match(r'^' + self.placeholder_pattern + r'$', text.strip()))
def split_markdown(self, markdown_text: str) -> List[str]: def split_markdown(self, markdown_text: str) -> List[str]:
""" """
将Markdown文本分割成指定大小的块 将Markdown文本分割成指定大小的块
@@ -37,6 +41,18 @@ class MarkdownBlockSplitter:
for block in logical_blocks: for block in logical_blocks:
block_size = self._get_bytes(block) block_size = self._get_bytes(block)
# 检查是否是占位符块(需要单独成块)
if self._is_placeholder(block):
# 如果当前有积累的块,先输出
if current_chunk_parts:
chunks.append("".join(current_chunk_parts))
current_chunk_parts = []
current_size = 0
# 占位符单独作为一个chunk
chunks.append(block)
continue
# 情况1:块本身就过大 # 情况1:块本身就过大
if block_size > self.max_block_size: if block_size > self.max_block_size:
# 先将当前积累的块输出 # 先将当前积累的块输出
@@ -69,7 +85,7 @@ class MarkdownBlockSplitter:
def _split_into_logical_blocks(self, markdown_text: str) -> List[str]: def _split_into_logical_blocks(self, markdown_text: str) -> List[str]:
""" """
将Markdown文本分割成逻辑块(标题、段落、代码块、空行分隔符等) 将Markdown文本分割成逻辑块(标题、段落、代码块、空行分隔符、图片占位符等)
""" """
# 标准化换行符 # 标准化换行符
text = markdown_text.replace('\r\n', '\n') text = markdown_text.replace('\r\n', '\n')
@@ -86,11 +102,21 @@ class MarkdownBlockSplitter:
if i % 2 == 1: # 这是一个代码块 if i % 2 == 1: # 这是一个代码块
blocks.append(part) blocks.append(part)
else: # 这是普通Markdown内容 else: # 这是普通Markdown内容
# 按一个或多个空行分割,并保留分隔符 # 1. 先按占位符分割,确保占位符独立
# 这能有效分离段落、列表、标题等,并保留它们之间的空行 ph_parts = re.split(self.placeholder_pattern, part)
sub_parts = re.split(r'(\n{2,})', part)
# 过滤掉 re.split 可能产生的空字符串 for ph_part in ph_parts:
blocks.extend([p for p in sub_parts if p]) if not ph_part:
continue
if self._is_placeholder(ph_part):
blocks.append(ph_part)
else:
# 2. 对非占位符文本,按一个或多个空行分割,并保留分隔符
# 这能有效分离段落、列表、标题等,并保留它们之间的空行
sub_parts = re.split(r'(\n{2,})', ph_part)
# 过滤掉 re.split 可能产生的空字符串
blocks.extend([p for p in sub_parts if p])
return blocks return blocks
@@ -153,8 +179,8 @@ def split_markdown_text(markdown_text: str, max_block_size=5000) -> List[str]:
""" """
splitter = MarkdownBlockSplitter(max_block_size=max_block_size) splitter = MarkdownBlockSplitter(max_block_size=max_block_size)
chunks = splitter.split_markdown(markdown_text) chunks = splitter.split_markdown(markdown_text)
# 过滤掉仅由空白字符组成的块 # 过滤掉仅由空白字符组成的块,但保留占位符块
return [chunk for chunk in chunks if chunk.strip()] return [chunk for chunk in chunks if chunk.strip() or splitter._is_placeholder(chunk)]
def _needs_single_newline_join(prev_chunk: str, next_chunk: str) -> bool: def _needs_single_newline_join(prev_chunk: str, next_chunk: str) -> bool:
@@ -165,6 +191,13 @@ def _needs_single_newline_join(prev_chunk: str, next_chunk: str) -> bool:
if not prev_chunk.strip() or not next_chunk.strip(): if not prev_chunk.strip() or not next_chunk.strip():
return False return False
# 如果其中一个是占位符,通常建议使用双换行以确保它是独立的块,
# 除非原格式非常紧凑,但在翻译场景下,分隔开更安全。
# 这里不额外处理占位符走默认逻辑最后会返回False从而使用\n\n
if re.match(r'^\s*<ph-[a-zA-Z0-9]+>\s*$', prev_chunk) or \
re.match(r'^\s*<ph-[a-zA-Z0-9]+>\s*$', next_chunk):
return False
last_line_prev = prev_chunk.rstrip().split('\n')[-1].lstrip() last_line_prev = prev_chunk.rstrip().split('\n')[-1].lstrip()
first_line_next = next_chunk.lstrip().split('\n')[0].lstrip() first_line_next = next_chunk.lstrip().split('\n')[0].lstrip()
@@ -207,12 +240,3 @@ def join_markdown_texts(markdown_texts: List[str]) -> str:
joined_text += separator + current_chunk joined_text += separator + current_chunk
return joined_text return joined_text
if __name__ == '__main__':
from pathlib import Path
from docutranslate.utils.markdown_utils import clean_markdown_math_block
content=Path(r"C:\Users\jxgm\Desktop\3a8d8999-3e9d-4f32-a32c-5b0830bb4320\full.md").read_text()
content=split_markdown_text(content)
content=join_markdown_texts(content)

View File

@@ -81,7 +81,7 @@ def placeholder2uris(markdown: str, mask_dict: MaskDict):
print(f"占位符<ph-{id}>已还原为图片") print(f"占位符<ph-{id}>已还原为图片")
return uri return uri
ph_pattern = r"<ph-([a-zA-Z0-9]+)>" ph_pattern = r"<\s*[pP][hH]\s*-\s*([a-zA-Z0-9]+)\s*>"
markdown = re.sub(ph_pattern, placeholder2uri, markdown) markdown = re.sub(ph_pattern, placeholder2uri, markdown)
return markdown return markdown