使用\n拼接markdown块解决标题正文拼接问题
This commit is contained in:
101
.idea/workspace.xml
generated
101
.idea/workspace.xml
generated
@@ -7,10 +7,9 @@
|
||||
<list default="true" id="6b18b44a-df57-4212-a857-9e291ebe5dd2" name="更改" comment="">
|
||||
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/README.md" beforeDir="false" afterPath="$PROJECT_DIR$/README.md" afterDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/docutranslate/Agents/agent.py" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/Agents/agent.py" afterDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/docutranslate/Agents/markdown_agent.py" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/Agents/markdown_agent.py" afterDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/docutranslate/translater.py" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/translater.py" afterDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/pyproject.toml" beforeDir="false" afterPath="$PROJECT_DIR$/pyproject.toml" afterDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/docutranslate/utils/markdown_splitter.py" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/utils/markdown_splitter.py" afterDir="false" />
|
||||
</list>
|
||||
<option name="SHOW_DIALOG" value="false" />
|
||||
<option name="HIGHLIGHT_CONFLICTS" value="true" />
|
||||
@@ -51,9 +50,10 @@
|
||||
"Python 测试.markdown_mask.Test.test_basic_link_masking 的 Python 测试.executor": "Run",
|
||||
"Python.PDFtranslater (1).executor": "Run",
|
||||
"Python.PDFtranslater (2).executor": "Run",
|
||||
"Python.agent.executor": "Debug",
|
||||
"Python.agent_utils.executor": "Run",
|
||||
"Python.convert.executor": "Run",
|
||||
"Python.markdown_splitter.executor": "Run",
|
||||
"Python.markdown_splitter.executor": "Debug",
|
||||
"Python.markdown_utils.executor": "Run",
|
||||
"Python.test.executor": "Run",
|
||||
"Python.test1.executor": "Run",
|
||||
@@ -109,9 +109,6 @@
|
||||
<option name="OPTIONS" value="" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
<configuration name="毕业论文_英文.html" type="JavascriptDebugType" temporary="true" nameIsGenerated="true" uri="http://localhost:63342/filetranslate/tests/output/毕业论文_英文.html" useBuiltInWebServerPort="true">
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
<configuration name="互联网认证授权机制_英文.html" type="JavascriptDebugType" temporary="true" nameIsGenerated="true" uri="http://localhost:63342/filetranslate/tests/resource/互联网认证授权机制_英文.html" useBuiltInWebServerPort="true">
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
@@ -138,6 +135,29 @@
|
||||
<option name="INPUT_FILE" value="" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
<configuration name="agent" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
|
||||
<module name="FileTranslate" />
|
||||
<option name="ENV_FILES" value="" />
|
||||
<option name="INTERPRETER_OPTIONS" value="" />
|
||||
<option name="PARENT_ENVS" value="true" />
|
||||
<envs>
|
||||
<env name="PYTHONUNBUFFERED" value="1" />
|
||||
</envs>
|
||||
<option name="SDK_HOME" value="" />
|
||||
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/docutranslate/Agents" />
|
||||
<option name="IS_MODULE_SDK" value="true" />
|
||||
<option name="ADD_CONTENT_ROOTS" value="true" />
|
||||
<option name="ADD_SOURCE_ROOTS" value="true" />
|
||||
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
|
||||
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/docutranslate/Agents/agent.py" />
|
||||
<option name="PARAMETERS" value="" />
|
||||
<option name="SHOW_COMMAND_LINE" value="false" />
|
||||
<option name="EMULATE_TERMINAL" value="false" />
|
||||
<option name="MODULE_MODE" value="false" />
|
||||
<option name="REDIRECT_INPUT" value="false" />
|
||||
<option name="INPUT_FILE" value="" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
<configuration name="convert" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
|
||||
<module name="FileTranslate" />
|
||||
<option name="ENV_FILES" value="" />
|
||||
@@ -161,6 +181,29 @@
|
||||
<option name="INPUT_FILE" value="" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
<configuration name="markdown_splitter" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
|
||||
<module name="FileTranslate" />
|
||||
<option name="ENV_FILES" value="" />
|
||||
<option name="INTERPRETER_OPTIONS" value="" />
|
||||
<option name="PARENT_ENVS" value="true" />
|
||||
<envs>
|
||||
<env name="PYTHONUNBUFFERED" value="1" />
|
||||
</envs>
|
||||
<option name="SDK_HOME" value="" />
|
||||
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/docutranslate/utils" />
|
||||
<option name="IS_MODULE_SDK" value="true" />
|
||||
<option name="ADD_CONTENT_ROOTS" value="true" />
|
||||
<option name="ADD_SOURCE_ROOTS" value="true" />
|
||||
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
|
||||
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/docutranslate/utils/markdown_splitter.py" />
|
||||
<option name="PARAMETERS" value="" />
|
||||
<option name="SHOW_COMMAND_LINE" value="false" />
|
||||
<option name="EMULATE_TERMINAL" value="false" />
|
||||
<option name="MODULE_MODE" value="false" />
|
||||
<option name="REDIRECT_INPUT" value="false" />
|
||||
<option name="INPUT_FILE" value="" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
<configuration name="test1" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
|
||||
<module name="FileTranslate" />
|
||||
<option name="ENV_FILES" value="" />
|
||||
@@ -207,29 +250,6 @@
|
||||
<option name="INPUT_FILE" value="" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
<configuration name="translater" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
|
||||
<module name="FileTranslate" />
|
||||
<option name="ENV_FILES" value="" />
|
||||
<option name="INTERPRETER_OPTIONS" value="" />
|
||||
<option name="PARENT_ENVS" value="true" />
|
||||
<envs>
|
||||
<env name="PYTHONUNBUFFERED" value="1" />
|
||||
</envs>
|
||||
<option name="SDK_HOME" value="" />
|
||||
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/docutranslate" />
|
||||
<option name="IS_MODULE_SDK" value="true" />
|
||||
<option name="ADD_CONTENT_ROOTS" value="true" />
|
||||
<option name="ADD_SOURCE_ROOTS" value="true" />
|
||||
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
|
||||
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/docutranslate/translater.py" />
|
||||
<option name="PARAMETERS" value="" />
|
||||
<option name="SHOW_COMMAND_LINE" value="false" />
|
||||
<option name="EMULATE_TERMINAL" value="false" />
|
||||
<option name="MODULE_MODE" value="false" />
|
||||
<option name="REDIRECT_INPUT" value="false" />
|
||||
<option name="INPUT_FILE" value="" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
<configuration default="true" type="Python.FlaskServer">
|
||||
<module name="filetranslate" />
|
||||
<option name="ENV_FILES" value="" />
|
||||
@@ -330,10 +350,10 @@
|
||||
<recent_temporary>
|
||||
<list>
|
||||
<item itemvalue="Python.test1" />
|
||||
<item itemvalue="Python.agent" />
|
||||
<item itemvalue="Python.markdown_splitter" />
|
||||
<item itemvalue="Python.test3" />
|
||||
<item itemvalue="Python.convert" />
|
||||
<item itemvalue="Python.translater" />
|
||||
<item itemvalue="JavaScript 调试.毕业论文_英文.html" />
|
||||
</list>
|
||||
</recent_temporary>
|
||||
</component>
|
||||
@@ -378,27 +398,42 @@
|
||||
<workItem from="1746787566021" duration="77000" />
|
||||
<workItem from="1746787698072" duration="24000" />
|
||||
<workItem from="1746788668813" duration="25000" />
|
||||
<workItem from="1746791230782" duration="2932000" />
|
||||
<workItem from="1746791230782" duration="3129000" />
|
||||
<workItem from="1746799824552" duration="317000" />
|
||||
<workItem from="1746801217905" duration="5663000" />
|
||||
</task>
|
||||
<servers />
|
||||
</component>
|
||||
<component name="TypeScriptGeneratedFilesManager">
|
||||
<option name="version" value="3" />
|
||||
</component>
|
||||
<component name="XDebuggerManager">
|
||||
<breakpoint-manager>
|
||||
<breakpoints>
|
||||
<line-breakpoint enabled="true" suspend="THREAD" type="python-line">
|
||||
<url>file://$PROJECT_DIR$/docutranslate/utils/markdown_splitter.py</url>
|
||||
<line>269</line>
|
||||
<option name="timeStamp" value="3" />
|
||||
</line-breakpoint>
|
||||
</breakpoints>
|
||||
</breakpoint-manager>
|
||||
</component>
|
||||
<component name="com.intellij.coverage.CoverageDataManagerImpl">
|
||||
<SUITE FILE_PATH="coverage/filetranslate$test.coverage" NAME="test 覆盖结果" MODIFIED="1746677277745" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
||||
<SUITE FILE_PATH="coverage/filetranslate$agent_utils.coverage" NAME="agent_utils 覆盖结果" MODIFIED="1746708534311" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/utils" />
|
||||
<SUITE FILE_PATH="coverage/filetranslate$convert.coverage" NAME="convert 覆盖结果" MODIFIED="1746780691113" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/utils" />
|
||||
<SUITE FILE_PATH="coverage/filetranslate$test1.coverage" NAME="test1 覆盖结果" MODIFIED="1746793348041" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
||||
<SUITE FILE_PATH="coverage/filetranslate$test1.coverage" NAME="test1 覆盖结果" MODIFIED="1746806688120" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
||||
<SUITE FILE_PATH="coverage/PDFtranslate$PDFtranslater__1_.coverage" NAME="PDFtranslater (1) 覆盖结果" MODIFIED="1746633258205" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages" />
|
||||
<SUITE FILE_PATH="coverage/PDFtranslate$convert.coverage" NAME="convert 覆盖结果" MODIFIED="1746596984213" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages/utils" />
|
||||
<SUITE FILE_PATH="coverage/PDFtranslate$agent_utils.coverage" NAME="agent_utils 覆盖结果" MODIFIED="1746617703678" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages/utils" />
|
||||
<SUITE FILE_PATH="coverage/PDFtranslate$markdown_splitter.coverage" NAME="markdown_splitter 覆盖结果" MODIFIED="1746599883603" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages/utils" />
|
||||
<SUITE FILE_PATH="coverage/filetranslate$test3.coverage" NAME="test3 覆盖结果" MODIFIED="1746785064481" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
||||
<SUITE FILE_PATH="coverage/PDFtranslate$translater.coverage" NAME="translater 覆盖结果" MODIFIED="1746600434803" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages" />
|
||||
<SUITE FILE_PATH="coverage/filetranslate$markdown_splitter.coverage" NAME="markdown_splitter 覆盖结果" MODIFIED="1746805063874" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/utils" />
|
||||
<SUITE FILE_PATH="coverage/PDFtranslate$markdown_utils.coverage" NAME="markdown_utils 覆盖结果" MODIFIED="1746598797872" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages/utils" />
|
||||
<SUITE FILE_PATH="coverage/PDFtranslate$test.coverage" NAME="test 覆盖结果" MODIFIED="1746629433597" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
||||
<SUITE FILE_PATH="coverage/filetranslate$PDFtranslater__2_.coverage" NAME="PDFtranslater (2) 覆盖结果" MODIFIED="1746679546680" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/filetranslate_packages" />
|
||||
<SUITE FILE_PATH="coverage/filetranslate$agent.coverage" NAME="agent 覆盖结果" MODIFIED="1746805293987" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/Agents" />
|
||||
<SUITE FILE_PATH="coverage/filetranslate$test2.coverage" NAME="test2 覆盖结果" MODIFIED="1746710994589" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
||||
<SUITE FILE_PATH="coverage/filetranslate$translater.coverage" NAME="translater 覆盖结果" MODIFIED="1746779982501" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate" />
|
||||
<SUITE FILE_PATH="coverage/PDFtranslate$.coverage" NAME=" 覆盖结果" MODIFIED="1746588350286" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/decorator" />
|
||||
|
||||
11
README.md
11
README.md
@@ -24,7 +24,7 @@
|
||||
|
||||
# 前置条件
|
||||
|
||||
## huggingface换源
|
||||
## huggingface换源(不能科学上网的友友看这)
|
||||
|
||||
无法访问huggingface的电脑在以下操作时请换源[点击测试](https://huggingface.co)
|
||||
|
||||
@@ -121,13 +121,13 @@ translater.read_file("<文件路径>").save_as_markdown()
|
||||
```python
|
||||
from docutranslate import FileTranslater
|
||||
|
||||
translater = FileTranslater(base_url="<baseurl>",# 默认的模型baseurl
|
||||
key="<key>",#默认的模型api-key
|
||||
translater = FileTranslater(base_url="<baseurl>", # 默认的模型baseurl
|
||||
key="<key>", # 默认的模型api-key
|
||||
model_id="<model-id>", # 默认的模型id
|
||||
chunksize=4000, # markdown分块长度,分块越大效果越好,不建议超过4096
|
||||
max_concurrent=6, # 并发数,受到ai平台并发量限制,如果文章很长建议适当加大到20以上
|
||||
docling_artifact=None, #使用提前下载好的docling模型
|
||||
tips=True#开场提示
|
||||
docling_artifact=None, # 使用提前下载好的docling模型
|
||||
tips=True # 开场提示
|
||||
)
|
||||
|
||||
```
|
||||
@@ -161,6 +161,7 @@ translater.translate_file(r"<要翻译的文件路径>",
|
||||
| 阿里云百炼 | [点击获取](https://bailian.console.aliyun.com/?tab=model#/api-key) | https://dashscope.aliyuncs.com/compatible-mode/v1 |
|
||||
| 火山引擎 | [点击获取](https://console.volcengine.com/ark/region:ark+cn-beijing/apiKey?apikey=%7B%7D) | https://ark.cn-beijing.volces.com/api/v3 |
|
||||
| 硅基流动 | [点击获取](https://cloud.siliconflow.cn/account/ak) | https://api.siliconflow.cn/v1 |
|
||||
| DMXAPI | [点击获取](https://www.dmxapi.cn/token) | https://www.dmxapi.cn/v1 |
|
||||
|
||||
# FAQ
|
||||
|
||||
|
||||
@@ -10,16 +10,16 @@ class MDRefineAgent(Agent):
|
||||
你是一个修正markdown文本的专家。
|
||||
# 工作
|
||||
找到markdown片段的不合理之处,对于缺失的句子,应该查看缺失的语句是否可能被错误的放在了其他位置,并通过重组段落、去掉异常字词修复不合理之处。
|
||||
尽量忠实于原文。输入文本开头和结尾如有空行请保留,形如<ph-abc123>的占位符不要改变。code和latex保持原文。
|
||||
尽量忠实于原文。形如<ph-abc123>的占位符不要改变。code和latex保持原文。保留正确的空行。
|
||||
# 输出
|
||||
修正后的markdown纯文本(不能有多余文字)
|
||||
修正后的markdown纯文本
|
||||
# 示例
|
||||
## 调整顺序
|
||||
输入:
|
||||
applications and scenarios becoming more and more extensive.
|
||||
Blockchain's origination was Bitcoin, the most successful of the digital currencies (cryptocurrencies). Since 1983, when digital currency was first proposed, the Internet has continued to burgeon, with its
|
||||
输出:
|
||||
Blockchain's origination was Bitcoin, the most successful of the digital currencies (cryptocurrencies). Since 1983, when digital currency was first proposed, the Internet has continued to burgeon, with its applications and scenarios becoming more and more extensive.【answer-end】
|
||||
Blockchain's origination was Bitcoin, the most successful of the digital currencies (cryptocurrencies). Since 1983, when digital currency was first proposed, the Internet has continued to burgeon, with its applications and scenarios becoming more and more extensive.
|
||||
## 去掉异常字词
|
||||
输入:
|
||||
一道\题@#目:\(x_1+1=2\)
|
||||
@@ -35,12 +35,11 @@ class MDTranslateAgent(Agent):
|
||||
你是一个翻译markdown文本的专家。
|
||||
# 工作
|
||||
将输入的markdown文本翻译成{to_lang}。
|
||||
尽量忠实于原文,修改明显错误的字符。
|
||||
输入文本开头和结尾如有空行请保留。
|
||||
请忠实于原文。修改明显错误的字符。保留正确的空行。
|
||||
形如<ph-abc123>的占位符不要改变。
|
||||
code和latex保持原文。
|
||||
# 输出
|
||||
翻译后的markdown纯文本(不能有多余文字)
|
||||
翻译后的markdown纯文本
|
||||
# 示例
|
||||
## 英文翻译为中文:
|
||||
输入:
|
||||
|
||||
@@ -82,7 +82,7 @@ class FileTranslater:
|
||||
print("正在修正markdown")
|
||||
chuncks = self._split_markdown_into_chunks()
|
||||
result: list[str] = refine_agent.send_prompts(chuncks, timeout=10000)
|
||||
self.markdown = "".join(result)
|
||||
self.markdown = "\n".join(result)
|
||||
print("markdown已修正")
|
||||
return self.markdown
|
||||
|
||||
@@ -90,7 +90,7 @@ class FileTranslater:
|
||||
print("正在翻译markdown")
|
||||
chuncks = self._split_markdown_into_chunks()
|
||||
result: list[str] = translate_agent.send_prompts(chuncks, timeout=10000)
|
||||
self.markdown = "".join(result)
|
||||
self.markdown = "\n".join(result)
|
||||
print("翻译完成")
|
||||
return self.markdown
|
||||
|
||||
|
||||
@@ -15,6 +15,8 @@ class MarkdownBlockSplitter:
|
||||
def split_markdown(self, markdown_text: str) -> List[str]:
|
||||
"""
|
||||
将markdown文本拆分为指定最大大小的块。
|
||||
确保拆分后可以重新拼接回原文(除了被拆分的代码块)。
|
||||
尽量保持标题和其内容在同一块中。
|
||||
|
||||
参数:
|
||||
markdown_text: 输入的markdown文本。
|
||||
@@ -22,40 +24,72 @@ class MarkdownBlockSplitter:
|
||||
返回:
|
||||
列表形式的markdown块,每个都是一个字符串。
|
||||
"""
|
||||
# 使用更简单的方法:按Markdown块拆分
|
||||
# 这比使用AST解析更可靠
|
||||
|
||||
# 模式用于识别markdown块(标题、段落、代码块等)
|
||||
# 按Markdown块拆分
|
||||
blocks = self._split_into_logical_blocks(markdown_text)
|
||||
|
||||
# 现在合并块以遵守max_block_size
|
||||
# 现在合并块以遵守max_block_size,同时尽量保持标题和内容在一起
|
||||
result_blocks = []
|
||||
current_block = ""
|
||||
header_waiting = False # 标记是否有待处理的标题
|
||||
|
||||
for block in blocks:
|
||||
# 检查块是否为空行分隔符
|
||||
is_separator = block.strip() == "" and block.count("\n") > 0
|
||||
|
||||
# 检查是否是标题
|
||||
is_header = bool(re.match(r'^#{1,6}\s+.+', block.strip()))
|
||||
|
||||
# 如果单个块大于最大大小,则进一步拆分
|
||||
if len(block) > self.max_block_size:
|
||||
# 如果已有累积内容,先添加
|
||||
if current_block:
|
||||
result_blocks.append(current_block)
|
||||
current_block = ""
|
||||
header_waiting = False
|
||||
|
||||
# 拆分大块
|
||||
large_block_parts = self._split_large_block(block)
|
||||
result_blocks.extend(large_block_parts)
|
||||
continue
|
||||
|
||||
# 如果添加此块会超过限制,则开始新的结果块
|
||||
if len(current_block) + len(block) + 2 > self.max_block_size and current_block:
|
||||
# 确定适当的连接符
|
||||
connector = "" if is_separator or not current_block else "\n"
|
||||
|
||||
# 如果当前块是标题,且之前的块已经很大,先结束之前的块
|
||||
if is_header and len(current_block) + len(block) + len(
|
||||
connector) > self.max_block_size * 0.9 and current_block:
|
||||
result_blocks.append(current_block)
|
||||
current_block = block
|
||||
header_waiting = True
|
||||
continue
|
||||
|
||||
# 如果添加此块会超过限制,则开始新的结果块
|
||||
if len(current_block) + len(block) + len(connector) > self.max_block_size and current_block:
|
||||
# 如果当前块以标题开始,我们会尝试将整个块放入下一个块
|
||||
if header_waiting and not is_header:
|
||||
# 检查是否能添加到当前块而不超出太多
|
||||
if len(current_block) + len(block) + len(connector) <= self.max_block_size * 1.1:
|
||||
current_block += connector + block
|
||||
header_waiting = False
|
||||
continue
|
||||
|
||||
result_blocks.append(current_block)
|
||||
current_block = block
|
||||
header_waiting = is_header
|
||||
else:
|
||||
# 添加到当前块并适当换行
|
||||
# 添加到当前块并适当连接
|
||||
if current_block:
|
||||
current_block += "\n\n" + block
|
||||
current_block += connector + block
|
||||
else:
|
||||
current_block = block
|
||||
|
||||
# 如果刚添加了标题,标记等待内容
|
||||
if is_header:
|
||||
header_waiting = True
|
||||
elif header_waiting and not is_separator:
|
||||
# 如果添加了内容到标题后,不再是等待状态
|
||||
header_waiting = False
|
||||
|
||||
# 如果不为空则添加最后一个块
|
||||
if current_block:
|
||||
result_blocks.append(current_block)
|
||||
@@ -65,6 +99,7 @@ class MarkdownBlockSplitter:
|
||||
def _split_into_logical_blocks(self, markdown_text: str) -> List[str]:
|
||||
"""
|
||||
将markdown文本拆分为逻辑块(标题、段落、代码块等)
|
||||
保留原始的空行数量,确保能重新拼接回原文
|
||||
|
||||
参数:
|
||||
markdown_text: 输入markdown文本
|
||||
@@ -87,15 +122,48 @@ class MarkdownBlockSplitter:
|
||||
if i % 2 == 1:
|
||||
blocks.append(part)
|
||||
else:
|
||||
# 对于非代码块,按空行拆分
|
||||
part_blocks = re.split(r'\n\s*\n', part)
|
||||
blocks.extend([b.strip() for b in part_blocks if b.strip()])
|
||||
# 对于非代码块,保留原始文本结构
|
||||
if not part: # 跳过空字符串
|
||||
continue
|
||||
|
||||
# 识别并单独处理标题
|
||||
lines = part.split('\n')
|
||||
current_lines = []
|
||||
|
||||
for line in lines:
|
||||
# 检查是否是标题行
|
||||
if re.match(r'^#{1,6}\s+.+', line):
|
||||
# 如果有累积的内容,先添加到块
|
||||
if current_lines:
|
||||
blocks.append('\n'.join(current_lines))
|
||||
current_lines = []
|
||||
|
||||
# 将标题作为单独的块
|
||||
blocks.append(line)
|
||||
else:
|
||||
current_lines.append(line)
|
||||
|
||||
# 处理剩余的行
|
||||
if current_lines:
|
||||
# 按段落分隔符拆分剩余的内容
|
||||
remaining_content = '\n'.join(current_lines)
|
||||
parts_with_sep = re.split(r'(\n\s*\n)', remaining_content)
|
||||
|
||||
for j, p in enumerate(parts_with_sep):
|
||||
if j % 2 == 0: # 这是正文内容
|
||||
if p.strip(): # 只添加非空内容
|
||||
blocks.append(p)
|
||||
else: # 这是分隔符
|
||||
# 添加分隔符作为单独的块以保持原始格式
|
||||
if j > 0 and parts_with_sep[j - 1].strip(): # 确保前面有内容
|
||||
blocks.append(p)
|
||||
|
||||
return blocks
|
||||
|
||||
def _split_large_block(self, block: str) -> List[str]:
|
||||
"""
|
||||
拆分超过max_block_size的大块。
|
||||
只按行拆分,不按句子拆分。
|
||||
|
||||
参数:
|
||||
block: 一个大的markdown块
|
||||
@@ -144,43 +212,36 @@ class MarkdownBlockSplitter:
|
||||
result.append('\n'.join(current_part))
|
||||
|
||||
else:
|
||||
# 对于其他块,按句子或行拆分
|
||||
if '.' in block or '!' in block or '?' in block:
|
||||
# 按句子拆分
|
||||
sentences = re.split(r'(?<=[.!?])\s+', block)
|
||||
# 检查是否是标题
|
||||
is_header = bool(re.match(r'^#{1,6}\s+.+', block.strip()))
|
||||
|
||||
current_part = []
|
||||
current_size = 0
|
||||
# 对于所有非代码块,统一按行拆分
|
||||
lines = block.split('\n')
|
||||
|
||||
for sentence in sentences:
|
||||
if current_size + len(sentence) + 1 > self.max_block_size and current_part:
|
||||
result.append(' '.join(current_part))
|
||||
current_part = [sentence]
|
||||
current_size = len(sentence)
|
||||
else:
|
||||
current_part.append(sentence)
|
||||
current_size += len(sentence) + 1 # +1表示空格
|
||||
current_part = []
|
||||
current_size = 0
|
||||
|
||||
if current_part:
|
||||
result.append(' '.join(current_part))
|
||||
else:
|
||||
# 按行拆分
|
||||
lines = block.split('\n')
|
||||
for line in lines:
|
||||
line_size = len(line) + 1 # +1表示换行符
|
||||
|
||||
current_part = []
|
||||
current_size = 0
|
||||
|
||||
for line in lines:
|
||||
if current_size + len(line) + 1 > self.max_block_size and current_part:
|
||||
# 如果这是标题行,并且当前块已经很大,先结束当前块
|
||||
if re.match(r'^#{1,6}\s+.+', line) and current_size > 0:
|
||||
if current_part:
|
||||
result.append('\n'.join(current_part))
|
||||
current_part = [line]
|
||||
current_size = len(line)
|
||||
else:
|
||||
current_part.append(line)
|
||||
current_size += len(line) + 1 # +1表示换行符
|
||||
current_part = [line]
|
||||
current_size = line_size
|
||||
continue
|
||||
|
||||
if current_part:
|
||||
if current_size + line_size > self.max_block_size and current_part:
|
||||
result.append('\n'.join(current_part))
|
||||
current_part = [line]
|
||||
current_size = line_size
|
||||
else:
|
||||
current_part.append(line)
|
||||
current_size += line_size
|
||||
|
||||
if current_part:
|
||||
result.append('\n'.join(current_part))
|
||||
|
||||
return result
|
||||
|
||||
@@ -188,13 +249,22 @@ class MarkdownBlockSplitter:
|
||||
def split_markdown_text(markdown_text, max_block_size=4096):
|
||||
"""
|
||||
将markdown字符串拆分为不超过max_block_size的块。
|
||||
拆分后可以通过简单的字符串连接重新组合回原始文本(除了被拆分的代码块)。
|
||||
尽量保持标题和其内容在同一块中。
|
||||
|
||||
参数:
|
||||
markdown_text: 输入markdown文本
|
||||
max_block_size: 每个块的最大字符数
|
||||
|
||||
返回:
|
||||
markdown块列表
|
||||
markdown块列表,可以通过''.join(chunks)重新组合(如果没有代码块被拆分)
|
||||
"""
|
||||
splitter = MarkdownBlockSplitter(max_block_size=max_block_size)
|
||||
return splitter.split_markdown(markdown_text)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
with open(r"C:\Users\jxgm\Desktop\FileTranslate\tests\resource\regex.md", "r") as f:
|
||||
md = f.read()
|
||||
a = split_markdown_text(md)
|
||||
pass
|
||||
Reference in New Issue
Block a user