diff --git a/.idea/workspace.xml b/.idea/workspace.xml
index 3077ad6..d06502d 100644
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@@ -7,10 +7,9 @@
-
-
+
@@ -51,9 +50,10 @@
"Python 测试.markdown_mask.Test.test_basic_link_masking 的 Python 测试.executor": "Run",
"Python.PDFtranslater (1).executor": "Run",
"Python.PDFtranslater (2).executor": "Run",
+ "Python.agent.executor": "Debug",
"Python.agent_utils.executor": "Run",
"Python.convert.executor": "Run",
- "Python.markdown_splitter.executor": "Run",
+ "Python.markdown_splitter.executor": "Debug",
"Python.markdown_utils.executor": "Run",
"Python.test.executor": "Run",
"Python.test1.executor": "Run",
@@ -109,9 +109,6 @@
-
-
-
@@ -138,6 +135,29 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
@@ -161,6 +181,29 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
@@ -207,29 +250,6 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
@@ -330,10 +350,10 @@
+
+
-
-
@@ -378,27 +398,42 @@
-
+
+
+
+
+
+
+
+ file://$PROJECT_DIR$/docutranslate/utils/markdown_splitter.py
+ 269
+
+
+
+
+
-
+
+
+
diff --git a/README.md b/README.md
index d822f4a..1d243f9 100644
--- a/README.md
+++ b/README.md
@@ -24,7 +24,7 @@
# 前置条件
-## huggingface换源
+## huggingface换源(不能科学上网的友友看这)
无法访问的huggingface的电脑在以下操作时请换源[点击测试](https://huggingface.co)
@@ -121,13 +121,13 @@ translater.read_file("<文件路径>").save_as_markdown()
```python
from docutranslate import FileTranslater
-translater = FileTranslater(base_url="",# 默认的模型baseurl
- key="",#默认的模型api-key
+translater = FileTranslater(base_url="", # 默认的模型baseurl
+ key="", # 默认的模型api-key
model_id="", # 默认的模型id
chunksize=4000, # markdown分块长度,分块越大效果越好,不建议超过4096
max_concurrent=6, # 并发数,受到ai平台并发量限制,如果文章很长建议适当加大到20以上
- docling_artifact=None, #使用提前下载好的docling模型
- tips=True#开场提示
+ docling_artifact=None, # 使用提前下载好的docling模型
+ tips=True # 开场提示
)
```
@@ -161,6 +161,7 @@ translater.translate_file(r"<要翻译的文件路径>",
| 阿里云百炼 | [点击获取](https://bailian.console.aliyun.com/?tab=model#/api-key) | https://dashscope.aliyuncs.com/compatible-mode/v1 |
| 火山引擎 | [点击获取](https://console.volcengine.com/ark/region:ark+cn-beijing/apiKey?apikey=%7B%7D) | https://ark.cn-beijing.volces.com/api/v3 |
| 硅基流动 | [点击获取](https://cloud.siliconflow.cn/account/ak) | https://api.siliconflow.cn/v1 |
+| DMXAPI | [点击获取](https://www.dmxapi.cn/token) | https://www.dmxapi.cn/v1 |
# FAQ
diff --git a/docutranslate/Agents/markdown_agent.py b/docutranslate/Agents/markdown_agent.py
index 417368d..e2dc4bf 100644
--- a/docutranslate/Agents/markdown_agent.py
+++ b/docutranslate/Agents/markdown_agent.py
@@ -10,16 +10,16 @@ class MDRefineAgent(Agent):
你是一个修正markdown文本的专家。
# 工作
找到markdown片段的不合理之处,对于缺失的句子,应该查看缺失的语句是否可能被错误的放在了其他位置,并通过重组段落、去掉异常字词修复不合理之处。
-尽量忠实于原文。输入文本开头和结尾如有空行请保留,形如的占位符不要改变。code和latex保持原文。
+尽量忠实于原文。形如的占位符不要改变。code和latex保持原文。保留正确的空行。
# 输出
-修正后的markdown纯文本(不能有多余文字)
+修正后的markdown纯文本
# 示例
## 调整顺序
输入:
applications and scenarios becoming more and more extensive.
Blockchain's origination was Bitcoin, the most successful of the digital currencies (cryptocurrencies). Since 1983, when digital currency was first proposed, the Internet has continued to burgeon, with its
输出:
-Blockchain's origination was Bitcoin, the most successful of the digital currencies (cryptocurrencies). Since 1983, when digital currency was first proposed, the Internet has continued to burgeon, with its applications and scenarios becoming more and more extensive.【answer-end】
+Blockchain's origination was Bitcoin, the most successful of the digital currencies (cryptocurrencies). Since 1983, when digital currency was first proposed, the Internet has continued to burgeon, with its applications and scenarios becoming more and more extensive.
## 去掉异常字词
输入:
一道\题@#目:\(x_1+1=2\)
@@ -35,12 +35,11 @@ class MDTranslateAgent(Agent):
你是一个翻译markdown文本的专家。
# 工作
将输入的markdown文本翻译成{to_lang}。
-尽量忠实于原文,修改明显错误的字符。
-输入文本开头和结尾如有空行请保留。
+请忠实于原文。修改明显错误的字符。保留正确的空行。
形如的占位符不要改变。
code和latex保持原文。
# 输出
-翻译后的markdown纯文本(不能有多余文字)
+翻译后的markdown纯文本
# 示例
## 英文翻译为中文:
输入:
diff --git a/docutranslate/translater.py b/docutranslate/translater.py
index 1be2e97..c4621fb 100644
--- a/docutranslate/translater.py
+++ b/docutranslate/translater.py
@@ -82,7 +82,7 @@ class FileTranslater:
print("正在修正markdown")
chuncks = self._split_markdown_into_chunks()
result: list[str] = refine_agent.send_prompts(chuncks, timeout=10000)
- self.markdown = "".join(result)
+ self.markdown = "\n".join(result)
print("markdown已修正")
return self.markdown
@@ -90,7 +90,7 @@ class FileTranslater:
print("正在翻译markdown")
chuncks = self._split_markdown_into_chunks()
result: list[str] = translate_agent.send_prompts(chuncks, timeout=10000)
- self.markdown = "".join(result)
+ self.markdown = "\n".join(result)
print("翻译完成")
return self.markdown
diff --git a/docutranslate/utils/markdown_splitter.py b/docutranslate/utils/markdown_splitter.py
index 2c630f2..77b2d30 100644
--- a/docutranslate/utils/markdown_splitter.py
+++ b/docutranslate/utils/markdown_splitter.py
@@ -15,6 +15,8 @@ class MarkdownBlockSplitter:
def split_markdown(self, markdown_text: str) -> List[str]:
"""
将markdown文本拆分为指定最大大小的块。
+ 确保拆分后可以重新拼接回原文(除了被拆分的代码块)。
+ 尽量保持标题和其内容在同一块中。
参数:
markdown_text: 输入的markdown文本。
@@ -22,40 +24,72 @@ class MarkdownBlockSplitter:
返回:
列表形式的markdown块,每个都是一个字符串。
"""
- # 使用更简单的方法:按Markdown块拆分
- # 这比使用AST解析更可靠
-
- # 模式用于识别markdown块(标题、段落、代码块等)
+ # 按Markdown块拆分
blocks = self._split_into_logical_blocks(markdown_text)
- # 现在合并块以遵守max_block_size
+ # 现在合并块以遵守max_block_size,同时尽量保持标题和内容在一起
result_blocks = []
current_block = ""
+ header_waiting = False # 标记是否有待处理的标题
for block in blocks:
+ # 检查块是否为空行分隔符
+ is_separator = block.strip() == "" and block.count("\n") > 0
+
+ # 检查是否是标题
+ is_header = bool(re.match(r'^#{1,6}\s+.+', block.strip()))
+
# 如果单个块大于最大大小,则进一步拆分
if len(block) > self.max_block_size:
# 如果已有累积内容,先添加
if current_block:
result_blocks.append(current_block)
current_block = ""
+ header_waiting = False
# 拆分大块
large_block_parts = self._split_large_block(block)
result_blocks.extend(large_block_parts)
continue
- # 如果添加此块会超过限制,则开始新的结果块
- if len(current_block) + len(block) + 2 > self.max_block_size and current_block:
+ # 确定适当的连接符
+ connector = "" if is_separator or not current_block else "\n"
+
+ # 如果当前块是标题,且之前的块已经很大,先结束之前的块
+ if is_header and len(current_block) + len(block) + len(
+ connector) > self.max_block_size * 0.9 and current_block:
result_blocks.append(current_block)
current_block = block
+ header_waiting = True
+ continue
+
+ # 如果添加此块会超过限制,则开始新的结果块
+ if len(current_block) + len(block) + len(connector) > self.max_block_size and current_block:
+ # 如果当前块末尾是尚未跟随正文的标题,优先把本块并入当前块(允许略微超出上限),避免标题与正文分离
+ if header_waiting and not is_header:
+ # 检查是否能添加到当前块而不超出太多
+ if len(current_block) + len(block) + len(connector) <= self.max_block_size * 1.1:
+ current_block += connector + block
+ header_waiting = False
+ continue
+
+ result_blocks.append(current_block)
+ current_block = block
+ header_waiting = is_header
else:
- # 添加到当前块并适当换行
+ # 添加到当前块并适当连接
if current_block:
- current_block += "\n\n" + block
+ current_block += connector + block
else:
current_block = block
+ # 如果刚添加了标题,标记等待内容
+ if is_header:
+ header_waiting = True
+ elif header_waiting and not is_separator:
+ # 如果添加了内容到标题后,不再是等待状态
+ header_waiting = False
+
# 如果不为空则添加最后一个块
if current_block:
result_blocks.append(current_block)
@@ -65,6 +99,7 @@ class MarkdownBlockSplitter:
def _split_into_logical_blocks(self, markdown_text: str) -> List[str]:
"""
将markdown文本拆分为逻辑块(标题、段落、代码块等)
+ 保留原始的空行数量,确保能重新拼接回原文
参数:
markdown_text: 输入markdown文本
@@ -87,15 +122,48 @@ class MarkdownBlockSplitter:
if i % 2 == 1:
blocks.append(part)
else:
- # 对于非代码块,按空行拆分
- part_blocks = re.split(r'\n\s*\n', part)
- blocks.extend([b.strip() for b in part_blocks if b.strip()])
+ # 对于非代码块,保留原始文本结构
+ if not part: # 跳过空字符串
+ continue
+
+ # 识别并单独处理标题
+ lines = part.split('\n')
+ current_lines = []
+
+ for line in lines:
+ # 检查是否是标题行
+ if re.match(r'^#{1,6}\s+.+', line):
+ # 如果有累积的内容,先添加到块
+ if current_lines:
+ blocks.append('\n'.join(current_lines))
+ current_lines = []
+
+ # 将标题作为单独的块
+ blocks.append(line)
+ else:
+ current_lines.append(line)
+
+ # 处理剩余的行
+ if current_lines:
+ # 按段落分隔符拆分剩余的内容
+ remaining_content = '\n'.join(current_lines)
+ parts_with_sep = re.split(r'(\n\s*\n)', remaining_content)
+
+ for j, p in enumerate(parts_with_sep):
+ if j % 2 == 0: # 这是正文内容
+ if p.strip(): # 只添加非空内容
+ blocks.append(p)
+ else: # 这是分隔符
+ # 添加分隔符作为单独的块以保持原始格式
+ if j > 0 and parts_with_sep[j - 1].strip(): # 确保前面有内容
+ blocks.append(p)
return blocks
def _split_large_block(self, block: str) -> List[str]:
"""
拆分超过max_block_size的大块。
+ 只按行拆分,不按句子拆分。
参数:
block: 一个大的markdown块
@@ -144,43 +212,36 @@ class MarkdownBlockSplitter:
result.append('\n'.join(current_part))
else:
- # 对于其他块,按句子或行拆分
- if '.' in block or '!' in block or '?' in block:
- # 按句子拆分
- sentences = re.split(r'(?<=[.!?])\s+', block)
+ # 检查是否是标题
+ is_header = bool(re.match(r'^#{1,6}\s+.+', block.strip()))
- current_part = []
- current_size = 0
+ # 对于所有非代码块,统一按行拆分
+ lines = block.split('\n')
- for sentence in sentences:
- if current_size + len(sentence) + 1 > self.max_block_size and current_part:
- result.append(' '.join(current_part))
- current_part = [sentence]
- current_size = len(sentence)
- else:
- current_part.append(sentence)
- current_size += len(sentence) + 1 # +1表示空格
+ current_part = []
+ current_size = 0
- if current_part:
- result.append(' '.join(current_part))
- else:
- # 按行拆分
- lines = block.split('\n')
+ for line in lines:
+ line_size = len(line) + 1 # +1表示换行符
- current_part = []
- current_size = 0
-
- for line in lines:
- if current_size + len(line) + 1 > self.max_block_size and current_part:
+ # 如果这是标题行,并且当前块已经很大,先结束当前块
+ if re.match(r'^#{1,6}\s+.+', line) and current_size > 0:
+ if current_part:
result.append('\n'.join(current_part))
- current_part = [line]
- current_size = len(line)
- else:
- current_part.append(line)
- current_size += len(line) + 1 # +1表示换行符
+ current_part = [line]
+ current_size = line_size
+ continue
- if current_part:
+ if current_size + line_size > self.max_block_size and current_part:
result.append('\n'.join(current_part))
+ current_part = [line]
+ current_size = line_size
+ else:
+ current_part.append(line)
+ current_size += line_size
+
+ if current_part:
+ result.append('\n'.join(current_part))
return result
@@ -188,13 +249,22 @@ class MarkdownBlockSplitter:
def split_markdown_text(markdown_text, max_block_size=4096):
"""
将markdown字符串拆分为不超过max_block_size的块。
+ 拆分后可以通过简单的字符串连接重新组合回原始文本(除了被拆分的代码块)。
+ 尽量保持标题和其内容在同一块中。
参数:
markdown_text: 输入markdown文本
max_block_size: 每个块的最大字符数
返回:
- markdown块列表
+ markdown块列表,可以通过'\n'.join(chunks)重新组合(如果没有代码块被拆分)
"""
splitter = MarkdownBlockSplitter(max_block_size=max_block_size)
- return splitter.split_markdown(markdown_text)
\ No newline at end of file
+ return splitter.split_markdown(markdown_text)
+
+
+if __name__ == '__main__':
+ with open(r"C:\Users\jxgm\Desktop\FileTranslate\tests\resource\regex.md", "r") as f:
+ md = f.read()
+ a = split_markdown_text(md)
+ pass
\ No newline at end of file