修复markdown图片缺失问题
This commit is contained in:
@@ -19,7 +19,7 @@ def generate_prompt(markdown_text: str, to_lang: str):
|
|||||||
return f"""
|
return f"""
|
||||||
Treat the text input as markdown text and translate it into {to_lang},output translation ONLY.
|
Treat the text input as markdown text and translate it into {to_lang},output translation ONLY.
|
||||||
- NO explanations. NO notes.
|
- NO explanations. NO notes.
|
||||||
- Do not change placeholders in the format of `<ph-xxxxxx>`.
|
- (very important) Preserve all placeholders in the format <ph-abcdef> (example: <ph-1>, <ph-af12asd>).
|
||||||
- For special tags or other non-translatable elements (like codes, brand names, specific jargon), keep them in their original form.
|
- For special tags or other non-translatable elements (like codes, brand names, specific jargon), keep them in their original form.
|
||||||
- All formulas, regardless of length, must be represented as valid, parsable LaTeX. They must be correctly enclosed by `$`, `\\(\\)`, or `$$`. If a formula is not formatted correctly, you must fix it.
|
- All formulas, regardless of length, must be represented as valid, parsable LaTeX. They must be correctly enclosed by `$`, `\\(\\)`, or `$$`. If a formula is not formatted correctly, you must fix it.
|
||||||
- Remove or correct any obviously abnormal characters, but without altering the original meaning.
|
- Remove or correct any obviously abnormal characters, but without altering the original meaning.
|
||||||
|
|||||||
@@ -153,7 +153,7 @@
|
|||||||
"contributorsGithub": "github 主页",
|
"contributorsGithub": "github 主页",
|
||||||
"contributorsPR": "提交 Pull Request",
|
"contributorsPR": "提交 Pull Request",
|
||||||
"contributorsIssue": "报告 Issue",
|
"contributorsIssue": "报告 Issue",
|
||||||
"contributorsQQ": "或者通过QQ群联系作者:<span>1047781902</span>",
|
"contributorsQQ": "或者通过QQ群联系作者:1047781902",
|
||||||
"glossaryModalTitle": "当前术语表",
|
"glossaryModalTitle": "当前术语表",
|
||||||
"glossaryTableSource": "原文 (src)",
|
"glossaryTableSource": "原文 (src)",
|
||||||
"glossaryTableDestination": "译文 (dst)",
|
"glossaryTableDestination": "译文 (dst)",
|
||||||
@@ -348,7 +348,7 @@
|
|||||||
"contributorsGithub": "GitHub Page",
|
"contributorsGithub": "GitHub Page",
|
||||||
"contributorsPR": "Submit a Pull Request",
|
"contributorsPR": "Submit a Pull Request",
|
||||||
"contributorsIssue": "Report an Issue",
|
"contributorsIssue": "Report an Issue",
|
||||||
"contributorsQQ": "Or contact the author via the QQ group: <span>1047781902</span>",
|
"contributorsQQ": "Or contact the author via the QQ group: 1047781902",
|
||||||
"glossaryModalTitle": "Current Glossary",
|
"glossaryModalTitle": "Current Glossary",
|
||||||
"glossaryTableSource": "Source (src)",
|
"glossaryTableSource": "Source (src)",
|
||||||
"glossaryTableDestination": "Destination (dst)",
|
"glossaryTableDestination": "Destination (dst)",
|
||||||
|
|||||||
@@ -1,8 +1,9 @@
|
|||||||
# SPDX-FileCopyrightText: 2025 QinHan
|
# SPDX-FileCopyrightText: 2025 QinHan
|
||||||
# SPDX-License-Identifier: MPL-2.0
|
# SPDX-License-Identifier: MPL-2.0
|
||||||
import asyncio
|
import asyncio
|
||||||
|
import re
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import Self
|
from typing import Self, List
|
||||||
|
|
||||||
from docutranslate.agents import MDTranslateAgent
|
from docutranslate.agents import MDTranslateAgent
|
||||||
from docutranslate.agents.markdown_agent import MDTranslateAgentConfig
|
from docutranslate.agents.markdown_agent import MDTranslateAgentConfig
|
||||||
@@ -38,20 +39,48 @@ class MDTranslator(AiTranslator):
|
|||||||
system_proxy_enable=config.system_proxy_enable)
|
system_proxy_enable=config.system_proxy_enable)
|
||||||
self.translate_agent = MDTranslateAgent(agent_config)
|
self.translate_agent = MDTranslateAgent(agent_config)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _is_placeholder(text: str) -> bool:
|
||||||
|
"""检查文本块是否仅包含图片占位符"""
|
||||||
|
# 匹配 <ph-xxxxxx> 格式,允许前后有空白
|
||||||
|
return bool(re.match(r'^\s*<ph-[a-zA-Z0-9]+>\s*$', text))
|
||||||
|
|
||||||
def translate(self, document: MarkdownDocument) -> Self:
|
def translate(self, document: MarkdownDocument) -> Self:
|
||||||
self.logger.info("正在翻译markdown")
|
self.logger.info("正在翻译markdown")
|
||||||
with MDMaskUrisContext(document):
|
with MDMaskUrisContext(document):
|
||||||
chunks: list[str] = split_markdown_text(document.content.decode(), self.chunk_size)
|
chunks: list[str] = split_markdown_text(document.content.decode(), self.chunk_size)
|
||||||
if self.glossary_agent:
|
|
||||||
self.glossary_dict_gen = self.glossary_agent.send_segments(chunks, self.chunk_size)
|
# 预处理:分离出需要翻译的文本块和不需要翻译的占位符块
|
||||||
|
translate_indices: List[int] = []
|
||||||
|
translate_chunks: List[str] = []
|
||||||
|
final_result: List[str] = list(chunks) # 浅拷贝,预填充原始值
|
||||||
|
|
||||||
|
for i, chunk in enumerate(chunks):
|
||||||
|
if self._is_placeholder(chunk):
|
||||||
|
# 如果是占位符,不需要处理,final_result中该位置保持原样
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
translate_indices.append(i)
|
||||||
|
translate_chunks.append(chunk)
|
||||||
|
|
||||||
|
if self.glossary_agent and translate_chunks:
|
||||||
|
# 仅对需要翻译的文本提取术语
|
||||||
|
self.glossary_dict_gen = self.glossary_agent.send_segments(translate_chunks, self.chunk_size)
|
||||||
if self.translate_agent:
|
if self.translate_agent:
|
||||||
self.translate_agent.update_glossary_dict(self.glossary_dict_gen)
|
self.translate_agent.update_glossary_dict(self.glossary_dict_gen)
|
||||||
self.logger.info(f"markdown分为{len(chunks)}块")
|
|
||||||
if self.translate_agent:
|
self.logger.info(f"markdown分为{len(chunks)}块 (其中需翻译{len(translate_chunks)}块)")
|
||||||
result: list[str] = self.translate_agent.send_chunks(chunks)
|
|
||||||
else:
|
if self.translate_agent and translate_chunks:
|
||||||
result = chunks
|
translated_sub_results: list[str] = self.translate_agent.send_chunks(translate_chunks)
|
||||||
content = join_markdown_texts(result)
|
|
||||||
|
# 将翻译结果回填到对应位置
|
||||||
|
for idx, translated_text in zip(translate_indices, translated_sub_results):
|
||||||
|
final_result[idx] = translated_text
|
||||||
|
|
||||||
|
# 如果没有翻译代理或者没有需要翻译的块,final_result 已经包含了正确的内容(原始chunks)
|
||||||
|
|
||||||
|
content = join_markdown_texts(final_result)
|
||||||
# 做一些加强鲁棒性的操作
|
# 做一些加强鲁棒性的操作
|
||||||
content = content.replace(r'\(', r'\(')
|
content = content.replace(r'\(', r'\(')
|
||||||
content = content.replace(r'\)', r'\)')
|
content = content.replace(r'\)', r'\)')
|
||||||
@@ -65,19 +94,35 @@ class MDTranslator(AiTranslator):
|
|||||||
with MDMaskUrisContext(document):
|
with MDMaskUrisContext(document):
|
||||||
chunks: list[str] = split_markdown_text(document.content.decode(), self.chunk_size)
|
chunks: list[str] = split_markdown_text(document.content.decode(), self.chunk_size)
|
||||||
|
|
||||||
if self.glossary_agent:
|
# 预处理:分离出需要翻译的文本块和不需要翻译的占位符块
|
||||||
self.glossary_dict_gen = await self.glossary_agent.send_segments_async(chunks, self.chunk_size)
|
translate_indices: List[int] = []
|
||||||
|
translate_chunks: List[str] = []
|
||||||
|
final_result: List[str] = list(chunks)
|
||||||
|
|
||||||
|
for i, chunk in enumerate(chunks):
|
||||||
|
if self._is_placeholder(chunk):
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
translate_indices.append(i)
|
||||||
|
translate_chunks.append(chunk)
|
||||||
|
|
||||||
|
if self.glossary_agent and translate_chunks:
|
||||||
|
self.glossary_dict_gen = await self.glossary_agent.send_segments_async(translate_chunks,
|
||||||
|
self.chunk_size)
|
||||||
if self.translate_agent:
|
if self.translate_agent:
|
||||||
self.translate_agent.update_glossary_dict(self.glossary_dict_gen)
|
self.translate_agent.update_glossary_dict(self.glossary_dict_gen)
|
||||||
|
|
||||||
self.logger.info(f"markdown分为{len(chunks)}块")
|
self.logger.info(f"markdown分为{len(chunks)}块 (其中需翻译{len(translate_chunks)}块)")
|
||||||
if self.translate_agent:
|
|
||||||
result: list[str] = await self.translate_agent.send_chunks_async(chunks)
|
if self.translate_agent and translate_chunks:
|
||||||
else:
|
translated_sub_results: list[str] = await self.translate_agent.send_chunks_async(translate_chunks)
|
||||||
result = chunks
|
|
||||||
|
# 将翻译结果回填到对应位置
|
||||||
|
for idx, translated_text in zip(translate_indices, translated_sub_results):
|
||||||
|
final_result[idx] = translated_text
|
||||||
|
|
||||||
def run():
|
def run():
|
||||||
content = join_markdown_texts(result)
|
content = join_markdown_texts(final_result)
|
||||||
# 做一些加强鲁棒性的操作
|
# 做一些加强鲁棒性的操作
|
||||||
content = content.replace(r'\(', r'\(')
|
content = content.replace(r'\(', r'\(')
|
||||||
content = content.replace(r'\)', r'\)')
|
content = content.replace(r'\)', r'\)')
|
||||||
@@ -85,4 +130,4 @@ class MDTranslator(AiTranslator):
|
|||||||
|
|
||||||
await asyncio.to_thread(run)
|
await asyncio.to_thread(run)
|
||||||
self.logger.info("翻译完成")
|
self.logger.info("翻译完成")
|
||||||
return self
|
return self
|
||||||
@@ -4,8 +4,6 @@ import re
|
|||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class MarkdownBlockSplitter:
|
class MarkdownBlockSplitter:
|
||||||
def __init__(self, max_block_size: int = 5000):
|
def __init__(self, max_block_size: int = 5000):
|
||||||
"""
|
"""
|
||||||
@@ -15,11 +13,17 @@ class MarkdownBlockSplitter:
|
|||||||
max_block_size: 每个块的最大字节数
|
max_block_size: 每个块的最大字节数
|
||||||
"""
|
"""
|
||||||
self.max_block_size = max_block_size
|
self.max_block_size = max_block_size
|
||||||
|
# 匹配占位符的正则,例如 <ph-abc123>
|
||||||
|
self.placeholder_pattern = r'(<ph-[a-zA-Z0-9]+>)'
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _get_bytes(text: str) -> int:
|
def _get_bytes(text: str) -> int:
|
||||||
return len(text.encode('utf-8'))
|
return len(text.encode('utf-8'))
|
||||||
|
|
||||||
|
def _is_placeholder(self, text: str) -> bool:
|
||||||
|
"""判断文本是否纯粹是一个占位符"""
|
||||||
|
return bool(re.match(r'^' + self.placeholder_pattern + r'$', text.strip()))
|
||||||
|
|
||||||
def split_markdown(self, markdown_text: str) -> List[str]:
|
def split_markdown(self, markdown_text: str) -> List[str]:
|
||||||
"""
|
"""
|
||||||
将Markdown文本分割成指定大小的块
|
将Markdown文本分割成指定大小的块
|
||||||
@@ -37,6 +41,18 @@ class MarkdownBlockSplitter:
|
|||||||
for block in logical_blocks:
|
for block in logical_blocks:
|
||||||
block_size = self._get_bytes(block)
|
block_size = self._get_bytes(block)
|
||||||
|
|
||||||
|
# 检查是否是占位符块(需要单独成块)
|
||||||
|
if self._is_placeholder(block):
|
||||||
|
# 如果当前有积累的块,先输出
|
||||||
|
if current_chunk_parts:
|
||||||
|
chunks.append("".join(current_chunk_parts))
|
||||||
|
current_chunk_parts = []
|
||||||
|
current_size = 0
|
||||||
|
|
||||||
|
# 占位符单独作为一个chunk
|
||||||
|
chunks.append(block)
|
||||||
|
continue
|
||||||
|
|
||||||
# 情况1:块本身就过大
|
# 情况1:块本身就过大
|
||||||
if block_size > self.max_block_size:
|
if block_size > self.max_block_size:
|
||||||
# 先将当前积累的块输出
|
# 先将当前积累的块输出
|
||||||
@@ -69,7 +85,7 @@ class MarkdownBlockSplitter:
|
|||||||
|
|
||||||
def _split_into_logical_blocks(self, markdown_text: str) -> List[str]:
|
def _split_into_logical_blocks(self, markdown_text: str) -> List[str]:
|
||||||
"""
|
"""
|
||||||
将Markdown文本分割成逻辑块(标题、段落、代码块、空行分隔符等)
|
将Markdown文本分割成逻辑块(标题、段落、代码块、空行分隔符、图片占位符等)
|
||||||
"""
|
"""
|
||||||
# 标准化换行符
|
# 标准化换行符
|
||||||
text = markdown_text.replace('\r\n', '\n')
|
text = markdown_text.replace('\r\n', '\n')
|
||||||
@@ -86,11 +102,21 @@ class MarkdownBlockSplitter:
|
|||||||
if i % 2 == 1: # 这是一个代码块
|
if i % 2 == 1: # 这是一个代码块
|
||||||
blocks.append(part)
|
blocks.append(part)
|
||||||
else: # 这是普通Markdown内容
|
else: # 这是普通Markdown内容
|
||||||
# 按一个或多个空行分割,并保留分隔符
|
# 1. 先按占位符分割,确保占位符独立
|
||||||
# 这能有效分离段落、列表、标题等,并保留它们之间的空行
|
ph_parts = re.split(self.placeholder_pattern, part)
|
||||||
sub_parts = re.split(r'(\n{2,})', part)
|
|
||||||
# 过滤掉 re.split 可能产生的空字符串
|
for ph_part in ph_parts:
|
||||||
blocks.extend([p for p in sub_parts if p])
|
if not ph_part:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if self._is_placeholder(ph_part):
|
||||||
|
blocks.append(ph_part)
|
||||||
|
else:
|
||||||
|
# 2. 对非占位符文本,按一个或多个空行分割,并保留分隔符
|
||||||
|
# 这能有效分离段落、列表、标题等,并保留它们之间的空行
|
||||||
|
sub_parts = re.split(r'(\n{2,})', ph_part)
|
||||||
|
# 过滤掉 re.split 可能产生的空字符串
|
||||||
|
blocks.extend([p for p in sub_parts if p])
|
||||||
|
|
||||||
return blocks
|
return blocks
|
||||||
|
|
||||||
@@ -153,8 +179,8 @@ def split_markdown_text(markdown_text: str, max_block_size=5000) -> List[str]:
|
|||||||
"""
|
"""
|
||||||
splitter = MarkdownBlockSplitter(max_block_size=max_block_size)
|
splitter = MarkdownBlockSplitter(max_block_size=max_block_size)
|
||||||
chunks = splitter.split_markdown(markdown_text)
|
chunks = splitter.split_markdown(markdown_text)
|
||||||
# 过滤掉仅由空白字符组成的块
|
# 过滤掉仅由空白字符组成的块,但保留占位符块
|
||||||
return [chunk for chunk in chunks if chunk.strip()]
|
return [chunk for chunk in chunks if chunk.strip() or splitter._is_placeholder(chunk)]
|
||||||
|
|
||||||
|
|
||||||
def _needs_single_newline_join(prev_chunk: str, next_chunk: str) -> bool:
|
def _needs_single_newline_join(prev_chunk: str, next_chunk: str) -> bool:
|
||||||
@@ -165,6 +191,13 @@ def _needs_single_newline_join(prev_chunk: str, next_chunk: str) -> bool:
|
|||||||
if not prev_chunk.strip() or not next_chunk.strip():
|
if not prev_chunk.strip() or not next_chunk.strip():
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
# 如果其中一个是占位符,通常建议使用双换行以确保它是独立的块,
|
||||||
|
# 除非原格式非常紧凑,但在翻译场景下,分隔开更安全。
|
||||||
|
# 这里不额外处理占位符,走默认逻辑(最后会返回False,从而使用\n\n)
|
||||||
|
if re.match(r'^\s*<ph-[a-zA-Z0-9]+>\s*$', prev_chunk) or \
|
||||||
|
re.match(r'^\s*<ph-[a-zA-Z0-9]+>\s*$', next_chunk):
|
||||||
|
return False
|
||||||
|
|
||||||
last_line_prev = prev_chunk.rstrip().split('\n')[-1].lstrip()
|
last_line_prev = prev_chunk.rstrip().split('\n')[-1].lstrip()
|
||||||
first_line_next = next_chunk.lstrip().split('\n')[0].lstrip()
|
first_line_next = next_chunk.lstrip().split('\n')[0].lstrip()
|
||||||
|
|
||||||
@@ -207,12 +240,3 @@ def join_markdown_texts(markdown_texts: List[str]) -> str:
|
|||||||
joined_text += separator + current_chunk
|
joined_text += separator + current_chunk
|
||||||
|
|
||||||
return joined_text
|
return joined_text
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
from pathlib import Path
|
|
||||||
from docutranslate.utils.markdown_utils import clean_markdown_math_block
|
|
||||||
content=Path(r"C:\Users\jxgm\Desktop\3a8d8999-3e9d-4f32-a32c-5b0830bb4320\full.md").read_text()
|
|
||||||
content=split_markdown_text(content)
|
|
||||||
content=join_markdown_texts(content)
|
|
||||||
|
|
||||||
|
|||||||
@@ -81,7 +81,7 @@ def placeholder2uris(markdown: str, mask_dict: MaskDict):
|
|||||||
print(f"占位符<ph-{id}>已还原为图片")
|
print(f"占位符<ph-{id}>已还原为图片")
|
||||||
return uri
|
return uri
|
||||||
|
|
||||||
ph_pattern = r"<ph-([a-zA-Z0-9]+)>"
|
ph_pattern = r"<\s*[pP][hH]\s*-\s*([a-zA-Z0-9]+)\s*>"
|
||||||
markdown = re.sub(ph_pattern, placeholder2uri, markdown)
|
markdown = re.sub(ph_pattern, placeholder2uri, markdown)
|
||||||
return markdown
|
return markdown
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user