使用\n拼接markdown块解决标题正文拼接问题

This commit is contained in:
xunbu
2025-05-10 00:09:33 +08:00
parent dadab6057f
commit 18c65cfa58
5 changed files with 195 additions and 90 deletions

101
.idea/workspace.xml generated
View File

@@ -7,10 +7,9 @@
<list default="true" id="6b18b44a-df57-4212-a857-9e291ebe5dd2" name="更改" comment=""> <list default="true" id="6b18b44a-df57-4212-a857-9e291ebe5dd2" name="更改" comment="">
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" /> <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/README.md" beforeDir="false" afterPath="$PROJECT_DIR$/README.md" afterDir="false" /> <change beforePath="$PROJECT_DIR$/README.md" beforeDir="false" afterPath="$PROJECT_DIR$/README.md" afterDir="false" />
<change beforePath="$PROJECT_DIR$/docutranslate/Agents/agent.py" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/Agents/agent.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/docutranslate/Agents/markdown_agent.py" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/Agents/markdown_agent.py" afterDir="false" /> <change beforePath="$PROJECT_DIR$/docutranslate/Agents/markdown_agent.py" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/Agents/markdown_agent.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/docutranslate/translater.py" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/translater.py" afterDir="false" /> <change beforePath="$PROJECT_DIR$/docutranslate/translater.py" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/translater.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/pyproject.toml" beforeDir="false" afterPath="$PROJECT_DIR$/pyproject.toml" afterDir="false" /> <change beforePath="$PROJECT_DIR$/docutranslate/utils/markdown_splitter.py" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/utils/markdown_splitter.py" afterDir="false" />
</list> </list>
<option name="SHOW_DIALOG" value="false" /> <option name="SHOW_DIALOG" value="false" />
<option name="HIGHLIGHT_CONFLICTS" value="true" /> <option name="HIGHLIGHT_CONFLICTS" value="true" />
@@ -51,9 +50,10 @@
"Python 测试.markdown_mask.Test.test_basic_link_masking 的 Python 测试.executor": "Run", "Python 测试.markdown_mask.Test.test_basic_link_masking 的 Python 测试.executor": "Run",
"Python.PDFtranslater (1).executor": "Run", "Python.PDFtranslater (1).executor": "Run",
"Python.PDFtranslater (2).executor": "Run", "Python.PDFtranslater (2).executor": "Run",
"Python.agent.executor": "Debug",
"Python.agent_utils.executor": "Run", "Python.agent_utils.executor": "Run",
"Python.convert.executor": "Run", "Python.convert.executor": "Run",
"Python.markdown_splitter.executor": "Run", "Python.markdown_splitter.executor": "Debug",
"Python.markdown_utils.executor": "Run", "Python.markdown_utils.executor": "Run",
"Python.test.executor": "Run", "Python.test.executor": "Run",
"Python.test1.executor": "Run", "Python.test1.executor": "Run",
@@ -109,9 +109,6 @@
<option name="OPTIONS" value="" /> <option name="OPTIONS" value="" />
<method v="2" /> <method v="2" />
</configuration> </configuration>
<configuration name="毕业论文_英文.html" type="JavascriptDebugType" temporary="true" nameIsGenerated="true" uri="http://localhost:63342/filetranslate/tests/output/毕业论文_英文.html" useBuiltInWebServerPort="true">
<method v="2" />
</configuration>
<configuration name="互联网认证授权机制_英文.html" type="JavascriptDebugType" temporary="true" nameIsGenerated="true" uri="http://localhost:63342/filetranslate/tests/resource/互联网认证授权机制_英文.html" useBuiltInWebServerPort="true"> <configuration name="互联网认证授权机制_英文.html" type="JavascriptDebugType" temporary="true" nameIsGenerated="true" uri="http://localhost:63342/filetranslate/tests/resource/互联网认证授权机制_英文.html" useBuiltInWebServerPort="true">
<method v="2" /> <method v="2" />
</configuration> </configuration>
@@ -138,6 +135,29 @@
<option name="INPUT_FILE" value="" /> <option name="INPUT_FILE" value="" />
<method v="2" /> <method v="2" />
</configuration> </configuration>
<configuration name="agent" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="FileTranslate" />
<option name="ENV_FILES" value="" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/docutranslate/Agents" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/docutranslate/Agents/agent.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="convert" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true"> <configuration name="convert" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="FileTranslate" /> <module name="FileTranslate" />
<option name="ENV_FILES" value="" /> <option name="ENV_FILES" value="" />
@@ -161,6 +181,29 @@
<option name="INPUT_FILE" value="" /> <option name="INPUT_FILE" value="" />
<method v="2" /> <method v="2" />
</configuration> </configuration>
<configuration name="markdown_splitter" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="FileTranslate" />
<option name="ENV_FILES" value="" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/docutranslate/utils" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/docutranslate/utils/markdown_splitter.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="test1" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true"> <configuration name="test1" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="FileTranslate" /> <module name="FileTranslate" />
<option name="ENV_FILES" value="" /> <option name="ENV_FILES" value="" />
@@ -207,29 +250,6 @@
<option name="INPUT_FILE" value="" /> <option name="INPUT_FILE" value="" />
<method v="2" /> <method v="2" />
</configuration> </configuration>
<configuration name="translater" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="FileTranslate" />
<option name="ENV_FILES" value="" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/docutranslate" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/docutranslate/translater.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration default="true" type="Python.FlaskServer"> <configuration default="true" type="Python.FlaskServer">
<module name="filetranslate" /> <module name="filetranslate" />
<option name="ENV_FILES" value="" /> <option name="ENV_FILES" value="" />
@@ -330,10 +350,10 @@
<recent_temporary> <recent_temporary>
<list> <list>
<item itemvalue="Python.test1" /> <item itemvalue="Python.test1" />
<item itemvalue="Python.agent" />
<item itemvalue="Python.markdown_splitter" />
<item itemvalue="Python.test3" /> <item itemvalue="Python.test3" />
<item itemvalue="Python.convert" /> <item itemvalue="Python.convert" />
<item itemvalue="Python.translater" />
<item itemvalue="JavaScript 调试.毕业论文_英文.html" />
</list> </list>
</recent_temporary> </recent_temporary>
</component> </component>
@@ -378,27 +398,42 @@
<workItem from="1746787566021" duration="77000" /> <workItem from="1746787566021" duration="77000" />
<workItem from="1746787698072" duration="24000" /> <workItem from="1746787698072" duration="24000" />
<workItem from="1746788668813" duration="25000" /> <workItem from="1746788668813" duration="25000" />
<workItem from="1746791230782" duration="2932000" /> <workItem from="1746791230782" duration="3129000" />
<workItem from="1746799824552" duration="317000" />
<workItem from="1746801217905" duration="5663000" />
</task> </task>
<servers /> <servers />
</component> </component>
<component name="TypeScriptGeneratedFilesManager"> <component name="TypeScriptGeneratedFilesManager">
<option name="version" value="3" /> <option name="version" value="3" />
</component> </component>
<component name="XDebuggerManager">
<breakpoint-manager>
<breakpoints>
<line-breakpoint enabled="true" suspend="THREAD" type="python-line">
<url>file://$PROJECT_DIR$/docutranslate/utils/markdown_splitter.py</url>
<line>269</line>
<option name="timeStamp" value="3" />
</line-breakpoint>
</breakpoints>
</breakpoint-manager>
</component>
<component name="com.intellij.coverage.CoverageDataManagerImpl"> <component name="com.intellij.coverage.CoverageDataManagerImpl">
<SUITE FILE_PATH="coverage/filetranslate$test.coverage" NAME="test 覆盖结果" MODIFIED="1746677277745" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" /> <SUITE FILE_PATH="coverage/filetranslate$test.coverage" NAME="test 覆盖结果" MODIFIED="1746677277745" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
<SUITE FILE_PATH="coverage/filetranslate$agent_utils.coverage" NAME="agent_utils 覆盖结果" MODIFIED="1746708534311" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/utils" /> <SUITE FILE_PATH="coverage/filetranslate$agent_utils.coverage" NAME="agent_utils 覆盖结果" MODIFIED="1746708534311" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/utils" />
<SUITE FILE_PATH="coverage/filetranslate$convert.coverage" NAME="convert 覆盖结果" MODIFIED="1746780691113" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/utils" /> <SUITE FILE_PATH="coverage/filetranslate$convert.coverage" NAME="convert 覆盖结果" MODIFIED="1746780691113" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/utils" />
<SUITE FILE_PATH="coverage/filetranslate$test1.coverage" NAME="test1 覆盖结果" MODIFIED="1746793348041" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" /> <SUITE FILE_PATH="coverage/filetranslate$test1.coverage" NAME="test1 覆盖结果" MODIFIED="1746806688120" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
<SUITE FILE_PATH="coverage/PDFtranslate$PDFtranslater__1_.coverage" NAME="PDFtranslater (1) 覆盖结果" MODIFIED="1746633258205" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages" /> <SUITE FILE_PATH="coverage/PDFtranslate$PDFtranslater__1_.coverage" NAME="PDFtranslater (1) 覆盖结果" MODIFIED="1746633258205" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages" />
<SUITE FILE_PATH="coverage/PDFtranslate$convert.coverage" NAME="convert 覆盖结果" MODIFIED="1746596984213" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages/utils" /> <SUITE FILE_PATH="coverage/PDFtranslate$convert.coverage" NAME="convert 覆盖结果" MODIFIED="1746596984213" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages/utils" />
<SUITE FILE_PATH="coverage/PDFtranslate$agent_utils.coverage" NAME="agent_utils 覆盖结果" MODIFIED="1746617703678" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages/utils" /> <SUITE FILE_PATH="coverage/PDFtranslate$agent_utils.coverage" NAME="agent_utils 覆盖结果" MODIFIED="1746617703678" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages/utils" />
<SUITE FILE_PATH="coverage/PDFtranslate$markdown_splitter.coverage" NAME="markdown_splitter 覆盖结果" MODIFIED="1746599883603" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages/utils" /> <SUITE FILE_PATH="coverage/PDFtranslate$markdown_splitter.coverage" NAME="markdown_splitter 覆盖结果" MODIFIED="1746599883603" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages/utils" />
<SUITE FILE_PATH="coverage/filetranslate$test3.coverage" NAME="test3 覆盖结果" MODIFIED="1746785064481" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" /> <SUITE FILE_PATH="coverage/filetranslate$test3.coverage" NAME="test3 覆盖结果" MODIFIED="1746785064481" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
<SUITE FILE_PATH="coverage/PDFtranslate$translater.coverage" NAME="translater 覆盖结果" MODIFIED="1746600434803" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages" /> <SUITE FILE_PATH="coverage/PDFtranslate$translater.coverage" NAME="translater 覆盖结果" MODIFIED="1746600434803" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages" />
<SUITE FILE_PATH="coverage/filetranslate$markdown_splitter.coverage" NAME="markdown_splitter 覆盖结果" MODIFIED="1746805063874" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/utils" />
<SUITE FILE_PATH="coverage/PDFtranslate$markdown_utils.coverage" NAME="markdown_utils 覆盖结果" MODIFIED="1746598797872" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages/utils" /> <SUITE FILE_PATH="coverage/PDFtranslate$markdown_utils.coverage" NAME="markdown_utils 覆盖结果" MODIFIED="1746598797872" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages/utils" />
<SUITE FILE_PATH="coverage/PDFtranslate$test.coverage" NAME="test 覆盖结果" MODIFIED="1746629433597" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" /> <SUITE FILE_PATH="coverage/PDFtranslate$test.coverage" NAME="test 覆盖结果" MODIFIED="1746629433597" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
<SUITE FILE_PATH="coverage/filetranslate$PDFtranslater__2_.coverage" NAME="PDFtranslater (2) 覆盖结果" MODIFIED="1746679546680" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/filetranslate_packages" /> <SUITE FILE_PATH="coverage/filetranslate$PDFtranslater__2_.coverage" NAME="PDFtranslater (2) 覆盖结果" MODIFIED="1746679546680" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/filetranslate_packages" />
<SUITE FILE_PATH="coverage/filetranslate$agent.coverage" NAME="agent 覆盖结果" MODIFIED="1746805293987" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/Agents" />
<SUITE FILE_PATH="coverage/filetranslate$test2.coverage" NAME="test2 覆盖结果" MODIFIED="1746710994589" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" /> <SUITE FILE_PATH="coverage/filetranslate$test2.coverage" NAME="test2 覆盖结果" MODIFIED="1746710994589" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
<SUITE FILE_PATH="coverage/filetranslate$translater.coverage" NAME="translater 覆盖结果" MODIFIED="1746779982501" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate" /> <SUITE FILE_PATH="coverage/filetranslate$translater.coverage" NAME="translater 覆盖结果" MODIFIED="1746779982501" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate" />
<SUITE FILE_PATH="coverage/PDFtranslate$.coverage" NAME=" 覆盖结果" MODIFIED="1746588350286" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/decorator" /> <SUITE FILE_PATH="coverage/PDFtranslate$.coverage" NAME=" 覆盖结果" MODIFIED="1746588350286" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/decorator" />

View File

@@ -24,7 +24,7 @@
# 前置条件 # 前置条件
## huggingface换源 ## huggingface换源(不能科学上网的友友看这)
无法访问的huggingface的电脑在以下操作时请换源[点击测试](https://huggingface.co) 无法访问的huggingface的电脑在以下操作时请换源[点击测试](https://huggingface.co)
@@ -161,6 +161,7 @@ translater.translate_file(r"<要翻译的文件路径>",
| 阿里云百炼 | [点击获取](https://bailian.console.aliyun.com/?tab=model#/api-key) | https://dashscope.aliyuncs.com/compatible-mode/v1 | | 阿里云百炼 | [点击获取](https://bailian.console.aliyun.com/?tab=model#/api-key) | https://dashscope.aliyuncs.com/compatible-mode/v1 |
| 火山引擎 | [点击获取](https://console.volcengine.com/ark/region:ark+cn-beijing/apiKey?apikey=%7B%7D) | https://ark.cn-beijing.volces.com/api/v3 | | 火山引擎 | [点击获取](https://console.volcengine.com/ark/region:ark+cn-beijing/apiKey?apikey=%7B%7D) | https://ark.cn-beijing.volces.com/api/v3 |
| 硅基流动 | [点击获取](https://cloud.siliconflow.cn/account/ak) | https://api.siliconflow.cn/v1 | | 硅基流动 | [点击获取](https://cloud.siliconflow.cn/account/ak) | https://api.siliconflow.cn/v1 |
| DMXAPI | [点击获取](https://www.dmxapi.cn/token) | https://www.dmxapi.cn/v1 |
# FAQ # FAQ

View File

@@ -10,16 +10,16 @@ class MDRefineAgent(Agent):
你是一个修正markdown文本的专家。 你是一个修正markdown文本的专家。
# 工作 # 工作
找到markdown片段的不合理之处对于缺失的句子应该查看缺失的语句是否可能被错误的放在了其他位置并通过重组段落、去掉异常字词修复不合理之处。 找到markdown片段的不合理之处对于缺失的句子应该查看缺失的语句是否可能被错误的放在了其他位置并通过重组段落、去掉异常字词修复不合理之处。
尽量忠实于原文。输入文本开头和结尾如有空行请保留,形如<ph-abc123>的占位符不要改变。code和latex保持原文。 尽量忠实于原文。形如<ph-abc123>的占位符不要改变。code和latex保持原文。保留正确的空行。
# 输出 # 输出
修正后的markdown纯文本(不能有多余文字) 修正后的markdown纯文本
# 示例 # 示例
## 调整顺序 ## 调整顺序
输入: 输入:
applications and scenarios becoming more and more extensive. applications and scenarios becoming more and more extensive.
Blockchain's origination was Bitcoin, the most successful of the digital currencies (cryptocurrencies). Since 1983, when digital currency was first proposed, the Internet has continued to burgeon, with its Blockchain's origination was Bitcoin, the most successful of the digital currencies (cryptocurrencies). Since 1983, when digital currency was first proposed, the Internet has continued to burgeon, with its
输出: 输出:
Blockchain's origination was Bitcoin, the most successful of the digital currencies (cryptocurrencies). Since 1983, when digital currency was first proposed, the Internet has continued to burgeon, with its applications and scenarios becoming more and more extensive.【answer-end】 Blockchain's origination was Bitcoin, the most successful of the digital currencies (cryptocurrencies). Since 1983, when digital currency was first proposed, the Internet has continued to burgeon, with its applications and scenarios becoming more and more extensive.
## 去掉异常字词 ## 去掉异常字词
输入: 输入:
一道\题@#目:\(x_1+1=2\) 一道\题@#目:\(x_1+1=2\)
@@ -35,12 +35,11 @@ class MDTranslateAgent(Agent):
你是一个翻译markdown文本的专家。 你是一个翻译markdown文本的专家。
# 工作 # 工作
将输入的markdown文本翻译成{to_lang} 将输入的markdown文本翻译成{to_lang}
尽量忠实于原文修改明显错误的字符。 忠实于原文修改明显错误的字符。保留正确的空行。
输入文本开头和结尾如有空行请保留。
形如<ph-abc123>的占位符不要改变。 形如<ph-abc123>的占位符不要改变。
code和latex保持原文。 code和latex保持原文。
# 输出 # 输出
翻译后的markdown纯文本(不能有多余文字) 翻译后的markdown纯文本
# 示例 # 示例
## 英文翻译为中文: ## 英文翻译为中文:
输入: 输入:

View File

@@ -82,7 +82,7 @@ class FileTranslater:
print("正在修正markdown") print("正在修正markdown")
chuncks = self._split_markdown_into_chunks() chuncks = self._split_markdown_into_chunks()
result: list[str] = refine_agent.send_prompts(chuncks, timeout=10000) result: list[str] = refine_agent.send_prompts(chuncks, timeout=10000)
self.markdown = "".join(result) self.markdown = "\n".join(result)
print("markdown已修正") print("markdown已修正")
return self.markdown return self.markdown
@@ -90,7 +90,7 @@ class FileTranslater:
print("正在翻译markdown") print("正在翻译markdown")
chuncks = self._split_markdown_into_chunks() chuncks = self._split_markdown_into_chunks()
result: list[str] = translate_agent.send_prompts(chuncks, timeout=10000) result: list[str] = translate_agent.send_prompts(chuncks, timeout=10000)
self.markdown = "".join(result) self.markdown = "\n".join(result)
print("翻译完成") print("翻译完成")
return self.markdown return self.markdown

View File

@@ -15,6 +15,8 @@ class MarkdownBlockSplitter:
def split_markdown(self, markdown_text: str) -> List[str]: def split_markdown(self, markdown_text: str) -> List[str]:
""" """
将markdown文本拆分为指定最大大小的块。 将markdown文本拆分为指定最大大小的块。
确保拆分后可以重新拼接回原文(除了被拆分的代码块)。
尽量保持标题和其内容在同一块中。
参数: 参数:
markdown_text: 输入的markdown文本。 markdown_text: 输入的markdown文本。
@@ -22,40 +24,72 @@ class MarkdownBlockSplitter:
返回: 返回:
列表形式的markdown块每个都是一个字符串。 列表形式的markdown块每个都是一个字符串。
""" """
# 使用更简单的方法:按Markdown块拆分 # 按Markdown块拆分
# 这比使用AST解析更可靠
# 模式用于识别markdown块标题、段落、代码块等
blocks = self._split_into_logical_blocks(markdown_text) blocks = self._split_into_logical_blocks(markdown_text)
# 现在合并块以遵守max_block_size # 现在合并块以遵守max_block_size,同时尽量保持标题和内容在一起
result_blocks = [] result_blocks = []
current_block = "" current_block = ""
header_waiting = False # 标记是否有待处理的标题
for block in blocks: for block in blocks:
# 检查块是否为空行分隔符
is_separator = block.strip() == "" and block.count("\n") > 0
# 检查是否是标题
is_header = bool(re.match(r'^#{1,6}\s+.+', block.strip()))
# 如果单个块大于最大大小,则进一步拆分 # 如果单个块大于最大大小,则进一步拆分
if len(block) > self.max_block_size: if len(block) > self.max_block_size:
# 如果已有累积内容,先添加 # 如果已有累积内容,先添加
if current_block: if current_block:
result_blocks.append(current_block) result_blocks.append(current_block)
current_block = "" current_block = ""
header_waiting = False
# 拆分大块 # 拆分大块
large_block_parts = self._split_large_block(block) large_block_parts = self._split_large_block(block)
result_blocks.extend(large_block_parts) result_blocks.extend(large_block_parts)
continue continue
# 如果添加此块会超过限制,则开始新的结果块 # 确定适当的连接符
if len(current_block) + len(block) + 2 > self.max_block_size and current_block: connector = "" if is_separator or not current_block else "\n"
# 如果当前块是标题,且之前的块已经很大,先结束之前的块
if is_header and len(current_block) + len(block) + len(
connector) > self.max_block_size * 0.9 and current_block:
result_blocks.append(current_block) result_blocks.append(current_block)
current_block = block current_block = block
header_waiting = True
continue
# 如果添加此块会超过限制,则开始新的结果块
if len(current_block) + len(block) + len(connector) > self.max_block_size and current_block:
# 如果当前块以标题开始,我们会尝试将整个块放入下一个块
if header_waiting and not is_header:
# 检查是否能添加到当前块而不超出太多
if len(current_block) + len(block) + len(connector) <= self.max_block_size * 1.1:
current_block += connector + block
header_waiting = False
continue
result_blocks.append(current_block)
current_block = block
header_waiting = is_header
else: else:
# 添加到当前块并适当换行 # 添加到当前块并适当连接
if current_block: if current_block:
current_block += "\n\n" + block current_block += connector + block
else: else:
current_block = block current_block = block
# 如果刚添加了标题,标记等待内容
if is_header:
header_waiting = True
elif header_waiting and not is_separator:
# 如果添加了内容到标题后,不再是等待状态
header_waiting = False
# 如果不为空则添加最后一个块 # 如果不为空则添加最后一个块
if current_block: if current_block:
result_blocks.append(current_block) result_blocks.append(current_block)
@@ -65,6 +99,7 @@ class MarkdownBlockSplitter:
def _split_into_logical_blocks(self, markdown_text: str) -> List[str]: def _split_into_logical_blocks(self, markdown_text: str) -> List[str]:
""" """
将markdown文本拆分为逻辑块标题、段落、代码块等 将markdown文本拆分为逻辑块标题、段落、代码块等
保留原始的空行数量,确保能重新拼接回原文
参数: 参数:
markdown_text: 输入markdown文本 markdown_text: 输入markdown文本
@@ -87,15 +122,48 @@ class MarkdownBlockSplitter:
if i % 2 == 1: if i % 2 == 1:
blocks.append(part) blocks.append(part)
else: else:
# 对于非代码块,按空行拆分 # 对于非代码块,保留原始文本结构
part_blocks = re.split(r'\n\s*\n', part) if not part: # 跳过空字符串
blocks.extend([b.strip() for b in part_blocks if b.strip()]) continue
# 识别并单独处理标题
lines = part.split('\n')
current_lines = []
for line in lines:
# 检查是否是标题行
if re.match(r'^#{1,6}\s+.+', line):
# 如果有累积的内容,先添加到块
if current_lines:
blocks.append('\n'.join(current_lines))
current_lines = []
# 将标题作为单独的块
blocks.append(line)
else:
current_lines.append(line)
# 处理剩余的行
if current_lines:
# 按段落分隔符拆分剩余的内容
remaining_content = '\n'.join(current_lines)
parts_with_sep = re.split(r'(\n\s*\n)', remaining_content)
for j, p in enumerate(parts_with_sep):
if j % 2 == 0: # 这是正文内容
if p.strip(): # 只添加非空内容
blocks.append(p)
else: # 这是分隔符
# 添加分隔符作为单独的块以保持原始格式
if j > 0 and parts_with_sep[j - 1].strip(): # 确保前面有内容
blocks.append(p)
return blocks return blocks
def _split_large_block(self, block: str) -> List[str]: def _split_large_block(self, block: str) -> List[str]:
""" """
拆分超过max_block_size的大块。 拆分超过max_block_size的大块。
只按行拆分,不按句子拆分。
参数: 参数:
block: 一个大的markdown块 block: 一个大的markdown块
@@ -144,40 +212,33 @@ class MarkdownBlockSplitter:
result.append('\n'.join(current_part)) result.append('\n'.join(current_part))
else: else:
# 对于其他块,按句子或行拆分 # 检查是否是标题
if '.' in block or '!' in block or '?' in block: is_header = bool(re.match(r'^#{1,6}\s+.+', block.strip()))
# 按句子拆分
sentences = re.split(r'(?<=[.!?])\s+', block)
current_part = [] # 对于所有非代码块,统一按行拆分
current_size = 0
for sentence in sentences:
if current_size + len(sentence) + 1 > self.max_block_size and current_part:
result.append(' '.join(current_part))
current_part = [sentence]
current_size = len(sentence)
else:
current_part.append(sentence)
current_size += len(sentence) + 1 # +1表示空格
if current_part:
result.append(' '.join(current_part))
else:
# 按行拆分
lines = block.split('\n') lines = block.split('\n')
current_part = [] current_part = []
current_size = 0 current_size = 0
for line in lines: for line in lines:
if current_size + len(line) + 1 > self.max_block_size and current_part: line_size = len(line) + 1 # +1表示换行符
# 如果这是标题行,并且当前块已经很大,先结束当前块
if re.match(r'^#{1,6}\s+.+', line) and current_size > 0:
if current_part:
result.append('\n'.join(current_part)) result.append('\n'.join(current_part))
current_part = [line] current_part = [line]
current_size = len(line) current_size = line_size
continue
if current_size + line_size > self.max_block_size and current_part:
result.append('\n'.join(current_part))
current_part = [line]
current_size = line_size
else: else:
current_part.append(line) current_part.append(line)
current_size += len(line) + 1 # +1表示换行符 current_size += line_size
if current_part: if current_part:
result.append('\n'.join(current_part)) result.append('\n'.join(current_part))
@@ -188,13 +249,22 @@ class MarkdownBlockSplitter:
def split_markdown_text(markdown_text, max_block_size=4096): def split_markdown_text(markdown_text, max_block_size=4096):
""" """
将markdown字符串拆分为不超过max_block_size的块。 将markdown字符串拆分为不超过max_block_size的块。
拆分后可以通过简单的字符串连接重新组合回原始文本(除了被拆分的代码块)。
尽量保持标题和其内容在同一块中。
参数: 参数:
markdown_text: 输入markdown文本 markdown_text: 输入markdown文本
max_block_size: 每个块的最大字符数 max_block_size: 每个块的最大字符数
返回: 返回:
markdown块列表 markdown块列表,可以通过''.join(chunks)重新组合(如果没有代码块被拆分)
""" """
splitter = MarkdownBlockSplitter(max_block_size=max_block_size) splitter = MarkdownBlockSplitter(max_block_size=max_block_size)
return splitter.split_markdown(markdown_text) return splitter.split_markdown(markdown_text)
if __name__ == '__main__':
with open(r"C:\Users\jxgm\Desktop\FileTranslate\tests\resource\regex.md", "r") as f:
md = f.read()
a = split_markdown_text(md)
pass