From 8cc903813bacff3a7fafc6983b71b0c59c8dab86 Mon Sep 17 00:00:00 2001 From: xunbu Date: Tue, 25 Nov 2025 12:47:27 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E5=A4=8Dmarkdown=E5=9B=BE=E7=89=87?= =?UTF-8?q?=E7=BC=BA=E5=A4=B1=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docutranslate/agents/markdown_agent.py | 2 +- docutranslate/static/i18nData.json | 4 +- .../translator/ai_translator/md_translator.py | 81 ++++++++++++++----- docutranslate/utils/markdown_splitter.py | 62 +++++++++----- docutranslate/utils/markdown_utils.py | 2 +- 5 files changed, 110 insertions(+), 41 deletions(-) diff --git a/docutranslate/agents/markdown_agent.py b/docutranslate/agents/markdown_agent.py index 6fb2dd9..43095c1 100644 --- a/docutranslate/agents/markdown_agent.py +++ b/docutranslate/agents/markdown_agent.py @@ -19,7 +19,7 @@ def generate_prompt(markdown_text: str, to_lang: str): return f""" Treat the text input as markdown text and translate it into {to_lang},output translation ONLY. - NO explanations. NO notes. -- Do not change placeholders in the format of ``. +- (very important) Preserve all placeholders in the format (example: , ). - For special tags or other non-translatable elements (like codes, brand names, specific jargon), keep them in their original form. - All formulas, regardless of length, must be represented as valid, parsable LaTeX. They must be correctly enclosed by `$`, `\\(\\)`, or `$$`. If a formula is not formatted correctly, you must fix it. - Remove or correct any obviously abnormal characters, but without altering the original meaning. diff --git a/docutranslate/static/i18nData.json b/docutranslate/static/i18nData.json index ca5c904..c045479 100644 --- a/docutranslate/static/i18nData.json +++ b/docutranslate/static/i18nData.json @@ -153,7 +153,7 @@ "contributorsGithub": "github 主页", "contributorsPR": "提交 Pull Request", "contributorsIssue": "报告 Issue", - "contributorsQQ": "或者通过QQ群联系作者:1047781902", + "contributorsQQ": "或者通过QQ群联系作者:1047781902", "glossaryModalTitle": "当前术语表", "glossaryTableSource": "原文 (src)", "glossaryTableDestination": "译文 (dst)", @@ -348,7 +348,7 @@ "contributorsGithub": "GitHub Page", "contributorsPR": "Submit a Pull Request", "contributorsIssue": "Report an Issue", - "contributorsQQ": "Or contact the author via the QQ group: 1047781902", + "contributorsQQ": "Or contact the author via the QQ group: 1047781902", "glossaryModalTitle": "Current Glossary", "glossaryTableSource": "Source (src)", "glossaryTableDestination": "Destination (dst)", diff --git a/docutranslate/translator/ai_translator/md_translator.py b/docutranslate/translator/ai_translator/md_translator.py index 25b5265..c811114 100644 --- a/docutranslate/translator/ai_translator/md_translator.py +++ b/docutranslate/translator/ai_translator/md_translator.py @@ -1,8 +1,9 @@ # SPDX-FileCopyrightText: 2025 QinHan # SPDX-License-Identifier: MPL-2.0 import asyncio +import re from dataclasses import dataclass -from typing import Self +from typing import Self, List from docutranslate.agents import MDTranslateAgent from docutranslate.agents.markdown_agent import MDTranslateAgentConfig @@ -38,20 +39,48 @@ class MDTranslator(AiTranslator): system_proxy_enable=config.system_proxy_enable) self.translate_agent = MDTranslateAgent(agent_config) + @staticmethod + def _is_placeholder(text: str) -> bool: + """检查文本块是否仅包含图片占位符""" + # 匹配 格式,允许前后有空白 + return bool(re.match(r'^\s*\s*$', text)) + def translate(self, document: MarkdownDocument) -> Self: self.logger.info("正在翻译markdown") with MDMaskUrisContext(document): chunks: list[str] = split_markdown_text(document.content.decode(), self.chunk_size) - if self.glossary_agent: - self.glossary_dict_gen = self.glossary_agent.send_segments(chunks, self.chunk_size) + + # 预处理:分离出需要翻译的文本块和不需要翻译的占位符块 + translate_indices: List[int] = [] + translate_chunks: List[str] = [] + final_result: List[str] = list(chunks) # 浅拷贝,预填充原始值 + + for i, chunk in enumerate(chunks): + if self._is_placeholder(chunk): + # 如果是占位符,不需要处理,final_result中该位置保持原样 + continue + else: + translate_indices.append(i) + translate_chunks.append(chunk) + + if self.glossary_agent and translate_chunks: + # 仅对需要翻译的文本提取术语 + self.glossary_dict_gen = self.glossary_agent.send_segments(translate_chunks, self.chunk_size) if self.translate_agent: self.translate_agent.update_glossary_dict(self.glossary_dict_gen) - self.logger.info(f"markdown分为{len(chunks)}块") - if self.translate_agent: - result: list[str] = self.translate_agent.send_chunks(chunks) - else: - result = chunks - content = join_markdown_texts(result) + + self.logger.info(f"markdown分为{len(chunks)}块 (其中需翻译{len(translate_chunks)}块)") + + if self.translate_agent and translate_chunks: + translated_sub_results: list[str] = self.translate_agent.send_chunks(translate_chunks) + + # 将翻译结果回填到对应位置 + for idx, translated_text in zip(translate_indices, translated_sub_results): + final_result[idx] = translated_text + + # 如果没有翻译代理或者没有需要翻译的块,final_result 已经包含了正确的内容(原始chunks) + + content = join_markdown_texts(final_result) # 做一些加强鲁棒性的操作 content = content.replace(r'\(', r'\(') content = content.replace(r'\)', r'\)') @@ -65,19 +94,35 @@ class MDTranslator(AiTranslator): with MDMaskUrisContext(document): chunks: list[str] = split_markdown_text(document.content.decode(), self.chunk_size) - if self.glossary_agent: - self.glossary_dict_gen = await self.glossary_agent.send_segments_async(chunks, self.chunk_size) + # 预处理:分离出需要翻译的文本块和不需要翻译的占位符块 + translate_indices: List[int] = [] + translate_chunks: List[str] = [] + final_result: List[str] = list(chunks) + + for i, chunk in enumerate(chunks): + if self._is_placeholder(chunk): + continue + else: + translate_indices.append(i) + translate_chunks.append(chunk) + + if self.glossary_agent and translate_chunks: + self.glossary_dict_gen = await self.glossary_agent.send_segments_async(translate_chunks, + self.chunk_size) if self.translate_agent: self.translate_agent.update_glossary_dict(self.glossary_dict_gen) - self.logger.info(f"markdown分为{len(chunks)}块") - if self.translate_agent: - result: list[str] = await self.translate_agent.send_chunks_async(chunks) - else: - result = chunks + self.logger.info(f"markdown分为{len(chunks)}块 (其中需翻译{len(translate_chunks)}块)") + + if self.translate_agent and translate_chunks: + translated_sub_results: list[str] = await self.translate_agent.send_chunks_async(translate_chunks) + + # 将翻译结果回填到对应位置 + for idx, translated_text in zip(translate_indices, translated_sub_results): + final_result[idx] = translated_text def run(): - content = join_markdown_texts(result) + content = join_markdown_texts(final_result) # 做一些加强鲁棒性的操作 content = content.replace(r'\(', r'\(') content = content.replace(r'\)', r'\)') @@ -85,4 +130,4 @@ class MDTranslator(AiTranslator): await asyncio.to_thread(run) self.logger.info("翻译完成") - return self + return self \ No newline at end of file diff --git a/docutranslate/utils/markdown_splitter.py b/docutranslate/utils/markdown_splitter.py index 5c216ff..dc3e194 100644 --- a/docutranslate/utils/markdown_splitter.py +++ b/docutranslate/utils/markdown_splitter.py @@ -4,8 +4,6 @@ import re from typing import List - - class MarkdownBlockSplitter: def __init__(self, max_block_size: int = 5000): """ @@ -15,11 +13,17 @@ class MarkdownBlockSplitter: max_block_size: 每个块的最大字节数 """ self.max_block_size = max_block_size + # 匹配占位符的正则,例如 + self.placeholder_pattern = r'()' @staticmethod def _get_bytes(text: str) -> int: return len(text.encode('utf-8')) + def _is_placeholder(self, text: str) -> bool: + """判断文本是否纯粹是一个占位符""" + return bool(re.match(r'^' + self.placeholder_pattern + r'$', text.strip())) + def split_markdown(self, markdown_text: str) -> List[str]: """ 将Markdown文本分割成指定大小的块 @@ -37,6 +41,18 @@ class MarkdownBlockSplitter: for block in logical_blocks: block_size = self._get_bytes(block) + # 检查是否是占位符块(需要单独成块) + if self._is_placeholder(block): + # 如果当前有积累的块,先输出 + if current_chunk_parts: + chunks.append("".join(current_chunk_parts)) + current_chunk_parts = [] + current_size = 0 + + # 占位符单独作为一个chunk + chunks.append(block) + continue + # 情况1:块本身就过大 if block_size > self.max_block_size: # 先将当前积累的块输出 @@ -69,7 +85,7 @@ class MarkdownBlockSplitter: def _split_into_logical_blocks(self, markdown_text: str) -> List[str]: """ - 将Markdown文本分割成逻辑块(标题、段落、代码块、空行分隔符等) + 将Markdown文本分割成逻辑块(标题、段落、代码块、空行分隔符、图片占位符等) """ # 标准化换行符 text = markdown_text.replace('\r\n', '\n') @@ -86,11 +102,21 @@ class MarkdownBlockSplitter: if i % 2 == 1: # 这是一个代码块 blocks.append(part) else: # 这是普通Markdown内容 - # 按一个或多个空行分割,并保留分隔符 - # 这能有效分离段落、列表、标题等,并保留它们之间的空行 - sub_parts = re.split(r'(\n{2,})', part) - # 过滤掉 re.split 可能产生的空字符串 - blocks.extend([p for p in sub_parts if p]) + # 1. 先按占位符分割,确保占位符独立 + ph_parts = re.split(self.placeholder_pattern, part) + + for ph_part in ph_parts: + if not ph_part: + continue + + if self._is_placeholder(ph_part): + blocks.append(ph_part) + else: + # 2. 对非占位符文本,按一个或多个空行分割,并保留分隔符 + # 这能有效分离段落、列表、标题等,并保留它们之间的空行 + sub_parts = re.split(r'(\n{2,})', ph_part) + # 过滤掉 re.split 可能产生的空字符串 + blocks.extend([p for p in sub_parts if p]) return blocks @@ -153,8 +179,8 @@ def split_markdown_text(markdown_text: str, max_block_size=5000) -> List[str]: """ splitter = MarkdownBlockSplitter(max_block_size=max_block_size) chunks = splitter.split_markdown(markdown_text) - # 过滤掉仅由空白字符组成的块 - return [chunk for chunk in chunks if chunk.strip()] + # 过滤掉仅由空白字符组成的块,但保留占位符块 + return [chunk for chunk in chunks if chunk.strip() or splitter._is_placeholder(chunk)] def _needs_single_newline_join(prev_chunk: str, next_chunk: str) -> bool: @@ -165,6 +191,13 @@ def _needs_single_newline_join(prev_chunk: str, next_chunk: str) -> bool: if not prev_chunk.strip() or not next_chunk.strip(): return False + # 如果其中一个是占位符,通常建议使用双换行以确保它是独立的块, + # 除非原格式非常紧凑,但在翻译场景下,分隔开更安全。 + # 这里不额外处理占位符,走默认逻辑(最后会返回False,从而使用\n\n) + if re.match(r'^\s*\s*$', prev_chunk) or \ + re.match(r'^\s*\s*$', next_chunk): + return False + last_line_prev = prev_chunk.rstrip().split('\n')[-1].lstrip() first_line_next = next_chunk.lstrip().split('\n')[0].lstrip() @@ -207,12 +240,3 @@ def join_markdown_texts(markdown_texts: List[str]) -> str: joined_text += separator + current_chunk return joined_text - - -if __name__ == '__main__': - from pathlib import Path - from docutranslate.utils.markdown_utils import clean_markdown_math_block - content=Path(r"C:\Users\jxgm\Desktop\3a8d8999-3e9d-4f32-a32c-5b0830bb4320\full.md").read_text() - content=split_markdown_text(content) - content=join_markdown_texts(content) - diff --git a/docutranslate/utils/markdown_utils.py b/docutranslate/utils/markdown_utils.py index d0973a7..0dbfb87 100644 --- a/docutranslate/utils/markdown_utils.py +++ b/docutranslate/utils/markdown_utils.py @@ -81,7 +81,7 @@ def placeholder2uris(markdown: str, mask_dict: MaskDict): print(f"占位符已还原为图片") return uri - ph_pattern = r"" + ph_pattern = r"<\s*[pP][hH]\s*-\s*([a-zA-Z0-9]+)\s*>" markdown = re.sub(ph_pattern, placeholder2uri, markdown) return markdown