v0.1.0 增加换源说明和多种输入文件
This commit is contained in:
153
.idea/workspace.xml
generated
153
.idea/workspace.xml
generated
@@ -5,8 +5,18 @@
|
||||
</component>
|
||||
<component name="ChangeListManager">
|
||||
<list default="true" id="6b18b44a-df57-4212-a857-9e291ebe5dd2" name="更改" comment="">
|
||||
<change afterPath="$PROJECT_DIR$/docutranslate/Agents/__init__.py" afterDir="false" />
|
||||
<change afterPath="$PROJECT_DIR$/docutranslate/Agents/markdown_agent.py" afterDir="false" />
|
||||
<change afterPath="$PROJECT_DIR$/docutranslate/utils/docling_utils.py" afterDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/README.md" beforeDir="false" afterPath="$PROJECT_DIR$/README.md" afterDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/docutranslate/decorator/__init__.py" beforeDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/docutranslate/decorator/markdown_mask.py" beforeDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/docutranslate/decorator/time.py" beforeDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/docutranslate/translater.py" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/translater.py" afterDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/docutranslate/utils/agent_utils.py" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/Agents/agent.py" afterDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/docutranslate/utils/convert.py" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/utils/convert.py" afterDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/pyproject.toml" beforeDir="false" afterPath="$PROJECT_DIR$/pyproject.toml" afterDir="false" />
|
||||
</list>
|
||||
<option name="SHOW_DIALOG" value="false" />
|
||||
<option name="HIGHLIGHT_CONFLICTS" value="true" />
|
||||
@@ -32,43 +42,43 @@
|
||||
<option name="hideEmptyMiddlePackages" value="true" />
|
||||
<option name="showLibraryContents" value="true" />
|
||||
</component>
|
||||
<component name="PropertiesComponent"><![CDATA[{
|
||||
"keyToString": {
|
||||
"DefaultHtmlFileTemplate": "HTML File",
|
||||
"JavaScript 调试.output.html (1).executor": "Run",
|
||||
"JavaScript 调试.output.html.executor": "Run",
|
||||
"JavaScript 调试.regex_中文.html.executor": "Run",
|
||||
"JavaScript 调试.test2_英文.html.executor": "Run",
|
||||
"JavaScript 调试.test4-1_中文.html.executor": "Run",
|
||||
"JavaScript 调试.互联网认证授权机制_英文.html.executor": "Run",
|
||||
"JavaScript 调试.毕业论文_英文.html.executor": "Run",
|
||||
"ModuleVcsDetector.initialDetectionPerformed": "true",
|
||||
"Python 测试.Python 测试 (markdown_mask.py 内).executor": "Run",
|
||||
"Python 测试.markdown_mask.Test.test_basic_link_masking 的 Python 测试.executor": "Run",
|
||||
"Python.PDFtranslater (1).executor": "Run",
|
||||
"Python.PDFtranslater (2).executor": "Run",
|
||||
"Python.agent_utils.executor": "Run",
|
||||
"Python.convert.executor": "Run",
|
||||
"Python.markdown_splitter.executor": "Run",
|
||||
"Python.markdown_utils.executor": "Run",
|
||||
"Python.test.executor": "Run",
|
||||
"Python.test1.executor": "Run",
|
||||
"Python.test2.executor": "Run",
|
||||
"Python.test3.executor": "Run",
|
||||
"Python.translater.executor": "Debug",
|
||||
"RunOnceActivity.ShowReadmeOnStart": "true",
|
||||
"RunOnceActivity.git.unshallow": "true",
|
||||
"git-widget-placeholder": "main",
|
||||
"last_opened_file_path": "C:/Users/jxgm/Desktop/FileTranslate/tests/resource",
|
||||
"node.js.detected.package.eslint": "true",
|
||||
"node.js.detected.package.tslint": "true",
|
||||
"node.js.selected.package.eslint": "(autodetect)",
|
||||
"node.js.selected.package.tslint": "(autodetect)",
|
||||
"nodejs_package_manager_path": "npm",
|
||||
"settings.editor.selected.configurable": "Errors",
|
||||
"vue.rearranger.settings.migration": "true"
|
||||
<component name="PropertiesComponent">{
|
||||
"keyToString": {
|
||||
"DefaultHtmlFileTemplate": "HTML File",
|
||||
"JavaScript 调试.output.html (1).executor": "Run",
|
||||
"JavaScript 调试.output.html.executor": "Run",
|
||||
"JavaScript 调试.regex_中文.html.executor": "Run",
|
||||
"JavaScript 调试.test2_英文.html.executor": "Run",
|
||||
"JavaScript 调试.test4-1_中文.html.executor": "Run",
|
||||
"JavaScript 调试.互联网认证授权机制_英文.html.executor": "Run",
|
||||
"JavaScript 调试.毕业论文_英文.html.executor": "Run",
|
||||
"ModuleVcsDetector.initialDetectionPerformed": "true",
|
||||
"Python 测试.Python 测试 (markdown_mask.py 内).executor": "Run",
|
||||
"Python 测试.markdown_mask.Test.test_basic_link_masking 的 Python 测试.executor": "Run",
|
||||
"Python.PDFtranslater (1).executor": "Run",
|
||||
"Python.PDFtranslater (2).executor": "Run",
|
||||
"Python.agent_utils.executor": "Run",
|
||||
"Python.convert.executor": "Run",
|
||||
"Python.markdown_splitter.executor": "Run",
|
||||
"Python.markdown_utils.executor": "Run",
|
||||
"Python.test.executor": "Run",
|
||||
"Python.test1.executor": "Run",
|
||||
"Python.test2.executor": "Run",
|
||||
"Python.test3.executor": "Run",
|
||||
"Python.translater.executor": "Run",
|
||||
"RunOnceActivity.ShowReadmeOnStart": "true",
|
||||
"RunOnceActivity.git.unshallow": "true",
|
||||
"git-widget-placeholder": "main",
|
||||
"last_opened_file_path": "C:/Users/jxgm/Desktop/FileTranslate/tests/resource",
|
||||
"node.js.detected.package.eslint": "true",
|
||||
"node.js.detected.package.tslint": "true",
|
||||
"node.js.selected.package.eslint": "(autodetect)",
|
||||
"node.js.selected.package.tslint": "(autodetect)",
|
||||
"nodejs_package_manager_path": "npm",
|
||||
"settings.editor.selected.configurable": "Errors",
|
||||
"vue.rearranger.settings.migration": "true"
|
||||
}
|
||||
}]]></component>
|
||||
}</component>
|
||||
<component name="RecentsManager">
|
||||
<key name="CopyFile.RECENT_KEYS">
|
||||
<recent name="C:\Users\jxgm\Desktop\FileTranslate\tests\resource" />
|
||||
@@ -77,6 +87,8 @@
|
||||
<recent name="C:\Users\jxgm\Desktop\PDFtranslate\pdftranslate_packages\output" />
|
||||
</key>
|
||||
<key name="MoveFile.RECENT_KEYS">
|
||||
<recent name="C:\Users\jxgm\Desktop\FileTranslate\tests\resource" />
|
||||
<recent name="C:\Users\jxgm\Desktop\FileTranslate\docutranslate\Agents" />
|
||||
<recent name="C:\Users\jxgm\Desktop\FileTranslate\tests" />
|
||||
<recent name="C:\Users\jxgm\Desktop\PDFtranslate\pdf" />
|
||||
</key>
|
||||
@@ -106,10 +118,7 @@
|
||||
<configuration name="毕业论文_英文.html" type="JavascriptDebugType" temporary="true" nameIsGenerated="true" uri="http://localhost:63342/filetranslate/tests/output/毕业论文_英文.html" useBuiltInWebServerPort="true">
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
<configuration name="test2_英文.html" type="JavascriptDebugType" temporary="true" nameIsGenerated="true" uri="http://localhost:63342/filetranslate/tests/output/test2_英文.html" useBuiltInWebServerPort="true">
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
<configuration name="test4-1_中文.html" type="JavascriptDebugType" temporary="true" nameIsGenerated="true" uri="http://localhost:63342/filetranslate/tests/output/test4-1_中文.html" useBuiltInWebServerPort="true">
|
||||
<configuration name="互联网认证授权机制_英文.html" type="JavascriptDebugType" temporary="true" nameIsGenerated="true" uri="http://localhost:63342/filetranslate/tests/resource/互联网认证授权机制_英文.html" useBuiltInWebServerPort="true">
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
<configuration default="true" type="PythonConfigurationType" factoryName="Python">
|
||||
@@ -135,6 +144,29 @@
|
||||
<option name="INPUT_FILE" value="" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
<configuration name="convert" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
|
||||
<module name="FileTranslate" />
|
||||
<option name="ENV_FILES" value="" />
|
||||
<option name="INTERPRETER_OPTIONS" value="" />
|
||||
<option name="PARENT_ENVS" value="true" />
|
||||
<envs>
|
||||
<env name="PYTHONUNBUFFERED" value="1" />
|
||||
</envs>
|
||||
<option name="SDK_HOME" value="" />
|
||||
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/docutranslate/utils" />
|
||||
<option name="IS_MODULE_SDK" value="true" />
|
||||
<option name="ADD_CONTENT_ROOTS" value="true" />
|
||||
<option name="ADD_SOURCE_ROOTS" value="true" />
|
||||
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
|
||||
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/docutranslate/utils/convert.py" />
|
||||
<option name="PARAMETERS" value="" />
|
||||
<option name="SHOW_COMMAND_LINE" value="false" />
|
||||
<option name="EMULATE_TERMINAL" value="false" />
|
||||
<option name="MODULE_MODE" value="false" />
|
||||
<option name="REDIRECT_INPUT" value="false" />
|
||||
<option name="INPUT_FILE" value="" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
<configuration name="test1" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
|
||||
<module name="FileTranslate" />
|
||||
<option name="ENV_FILES" value="" />
|
||||
@@ -181,6 +213,29 @@
|
||||
<option name="INPUT_FILE" value="" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
<configuration name="translater" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
|
||||
<module name="FileTranslate" />
|
||||
<option name="ENV_FILES" value="" />
|
||||
<option name="INTERPRETER_OPTIONS" value="" />
|
||||
<option name="PARENT_ENVS" value="true" />
|
||||
<envs>
|
||||
<env name="PYTHONUNBUFFERED" value="1" />
|
||||
</envs>
|
||||
<option name="SDK_HOME" value="" />
|
||||
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/docutranslate" />
|
||||
<option name="IS_MODULE_SDK" value="true" />
|
||||
<option name="ADD_CONTENT_ROOTS" value="true" />
|
||||
<option name="ADD_SOURCE_ROOTS" value="true" />
|
||||
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
|
||||
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/docutranslate/translater.py" />
|
||||
<option name="PARAMETERS" value="" />
|
||||
<option name="SHOW_COMMAND_LINE" value="false" />
|
||||
<option name="EMULATE_TERMINAL" value="false" />
|
||||
<option name="MODULE_MODE" value="false" />
|
||||
<option name="REDIRECT_INPUT" value="false" />
|
||||
<option name="INPUT_FILE" value="" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
<configuration default="true" type="Python.FlaskServer">
|
||||
<module name="filetranslate" />
|
||||
<option name="ENV_FILES" value="" />
|
||||
@@ -281,10 +336,10 @@
|
||||
<recent_temporary>
|
||||
<list>
|
||||
<item itemvalue="Python.test3" />
|
||||
<item itemvalue="JavaScript 调试.毕业论文_英文.html" />
|
||||
<item itemvalue="Python.test1" />
|
||||
<item itemvalue="JavaScript 调试.test2_英文.html" />
|
||||
<item itemvalue="JavaScript 调试.test4-1_中文.html" />
|
||||
<item itemvalue="Python.convert" />
|
||||
<item itemvalue="Python.translater" />
|
||||
<item itemvalue="JavaScript 调试.毕业论文_英文.html" />
|
||||
</list>
|
||||
</recent_temporary>
|
||||
</component>
|
||||
@@ -317,6 +372,12 @@
|
||||
<workItem from="1746715913355" duration="823000" />
|
||||
<workItem from="1746717711857" duration="114000" />
|
||||
<workItem from="1746718953100" duration="3871000" />
|
||||
<workItem from="1746771270710" duration="2828000" />
|
||||
<workItem from="1746774755487" duration="4243000" />
|
||||
<workItem from="1746779030113" duration="1102000" />
|
||||
<workItem from="1746780247620" duration="1776000" />
|
||||
<workItem from="1746782039257" duration="307000" />
|
||||
<workItem from="1746782370978" duration="3228000" />
|
||||
</task>
|
||||
<servers />
|
||||
</component>
|
||||
@@ -326,17 +387,19 @@
|
||||
<component name="com.intellij.coverage.CoverageDataManagerImpl">
|
||||
<SUITE FILE_PATH="coverage/filetranslate$test.coverage" NAME="test 覆盖结果" MODIFIED="1746677277745" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
||||
<SUITE FILE_PATH="coverage/filetranslate$agent_utils.coverage" NAME="agent_utils 覆盖结果" MODIFIED="1746708534311" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/utils" />
|
||||
<SUITE FILE_PATH="coverage/filetranslate$test1.coverage" NAME="test1 覆盖结果" MODIFIED="1746753086503" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
||||
<SUITE FILE_PATH="coverage/filetranslate$convert.coverage" NAME="convert 覆盖结果" MODIFIED="1746780691113" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/utils" />
|
||||
<SUITE FILE_PATH="coverage/filetranslate$test1.coverage" NAME="test1 覆盖结果" MODIFIED="1746782563450" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
||||
<SUITE FILE_PATH="coverage/PDFtranslate$PDFtranslater__1_.coverage" NAME="PDFtranslater (1) 覆盖结果" MODIFIED="1746633258205" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages" />
|
||||
<SUITE FILE_PATH="coverage/PDFtranslate$convert.coverage" NAME="convert 覆盖结果" MODIFIED="1746596984213" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages/utils" />
|
||||
<SUITE FILE_PATH="coverage/PDFtranslate$agent_utils.coverage" NAME="agent_utils 覆盖结果" MODIFIED="1746617703678" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages/utils" />
|
||||
<SUITE FILE_PATH="coverage/PDFtranslate$markdown_splitter.coverage" NAME="markdown_splitter 覆盖结果" MODIFIED="1746599883603" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages/utils" />
|
||||
<SUITE FILE_PATH="coverage/filetranslate$test3.coverage" NAME="test3 覆盖结果" MODIFIED="1746755666962" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
||||
<SUITE FILE_PATH="coverage/filetranslate$test3.coverage" NAME="test3 覆盖结果" MODIFIED="1746785064481" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
||||
<SUITE FILE_PATH="coverage/PDFtranslate$translater.coverage" NAME="translater 覆盖结果" MODIFIED="1746600434803" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages" />
|
||||
<SUITE FILE_PATH="coverage/PDFtranslate$markdown_utils.coverage" NAME="markdown_utils 覆盖结果" MODIFIED="1746598797872" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages/utils" />
|
||||
<SUITE FILE_PATH="coverage/PDFtranslate$test.coverage" NAME="test 覆盖结果" MODIFIED="1746629433597" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
||||
<SUITE FILE_PATH="coverage/filetranslate$PDFtranslater__2_.coverage" NAME="PDFtranslater (2) 覆盖结果" MODIFIED="1746679546680" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/filetranslate_packages" />
|
||||
<SUITE FILE_PATH="coverage/filetranslate$test2.coverage" NAME="test2 覆盖结果" MODIFIED="1746710994589" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
||||
<SUITE FILE_PATH="coverage/filetranslate$translater.coverage" NAME="translater 覆盖结果" MODIFIED="1746779982501" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate" />
|
||||
<SUITE FILE_PATH="coverage/PDFtranslate$.coverage" NAME=" 覆盖结果" MODIFIED="1746588350286" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/decorator" />
|
||||
</component>
|
||||
</project>
|
||||
154
README.md
154
README.md
@@ -1,10 +1,11 @@
|
||||
# 简介
|
||||
## DocuTranslate
|
||||
一个使用大语言模型(llm)翻译pdf和markdown的包
|
||||
|
||||
[github主页](https://github.com/xunbu/docutranslate)
|
||||
## DocuTranslate [<svg height="32" aria-hidden="true" viewBox="0 0 24 24" version="1.1" width="20" data-view-component="true" class="octicon octicon-mark-github v-align-middle"><path d="M12 1C5.9225 1 1 5.9225 1 12C1 16.8675 4.14875 20.9787 8.52125 22.4362C9.07125 22.5325 9.2775 22.2025 9.2775 21.9137C9.2775 21.6525 9.26375 20.7862 9.26375 19.865C6.5 20.3737 5.785 19.1912 5.565 18.5725C5.44125 18.2562 4.905 17.28 4.4375 17.0187C4.0525 16.8125 3.5025 16.3037 4.42375 16.29C5.29 16.2762 5.90875 17.0875 6.115 17.4175C7.105 19.0812 8.68625 18.6137 9.31875 18.325C9.415 17.61 9.70375 17.1287 10.02 16.8537C7.5725 16.5787 5.015 15.63 5.015 11.4225C5.015 10.2262 5.44125 9.23625 6.1425 8.46625C6.0325 8.19125 5.6475 7.06375 6.2525 5.55125C6.2525 5.55125 7.17375 5.2625 9.2775 6.67875C10.1575 6.43125 11.0925 6.3075 12.0275 6.3075C12.9625 6.3075 13.8975 6.43125 14.7775 6.67875C16.8813 5.24875 17.8025 5.55125 17.8025 5.55125C18.4075 7.06375 18.0225 8.19125 17.9125 8.46625C18.6138 9.23625 19.04 10.2125 19.04 11.4225C19.04 15.6437 16.4688 16.5787 14.0213 16.8537C14.42 17.1975 14.7638 17.8575 14.7638 18.8887C14.7638 20.36 14.75 21.5425 14.75 21.9137C14.75 22.2025 14.9563 22.5462 15.5063 22.4362C19.8513 20.9787 23 16.8537 23 12C23 5.9225 18.0775 1 12 1Z"></path></svg>](https://github.com/xunbu/docutranslate)
|
||||
|
||||
文件翻译工具,借助[docling](https://github.com/docling-project/docling)与大语言模型实现多种格式文件的翻译
|
||||
|
||||
# 安装
|
||||
|
||||
使用pip
|
||||
`pip install docutranslate`
|
||||
|
||||
@@ -12,59 +13,109 @@
|
||||
`uv init`
|
||||
`uv add docutranslate`
|
||||
|
||||
# 前置条件(获取大模型平台的baseurl、key、model-id)
|
||||
# 支持的文件格式
|
||||
|
||||
| 输入格式 | 输出格式 |
|
||||
|------------|--------------|
|
||||
| PDF(非扫描版) | Markdown(推荐) |
|
||||
| Markdown | HTML |
|
||||
| HTML、XHTML | |
|
||||
| CSV | |
|
||||
|
||||
# 前置条件
|
||||
|
||||
## huggingface换源
|
||||
|
||||
无法访问huggingface的电脑在进行以下操作前请先换源([点击测试能否访问](https://huggingface.co))
|
||||
|
||||
- 第一次读取非markdown文本
|
||||
- 第一次使用公式识别或代码识别功能
|
||||
|
||||
### 方法1
|
||||
|
||||
设置电脑的环境变量(记得设置后重启IDE)
|
||||
`HF_ENDPOINT=https://hf-mirror.com`
|
||||
|
||||
### 方法2
|
||||
|
||||
在代码开头设置环境变量
|
||||
|
||||
```python
|
||||
import os
|
||||
|
||||
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
|
||||
|
||||
# 其余代码写在下方
|
||||
```
|
||||
|
||||
## 获取大模型平台的baseurl、key、model-id
|
||||
|
||||
由于需要使用大语言模型进行markdown调整与翻译,所以需要预先获取模型的baseurl、key、model-id
|
||||
常见的大模型平台baseurl与api获取方式可见[常用ai平台](#常用ai平台)
|
||||
> 比较推荐的模型有阿里云的qwen-plus、智谱的glm-4-air、glm-z1-flash等。免费的智谱glm-4-flash能用但效果欠佳(2025.5)
|
||||
|
||||
# 使用方式
|
||||
|
||||
## 注意事项
|
||||
|
||||
以下操作会自动从[huggingface](https://huggingface.co)下载模型,windows需要使用**管理员模式**打开IDE运行脚本,并按需换源
|
||||
|
||||
- 第一次读取非markdown文本
|
||||
- 第一次使用公式识别或代码识别功能
|
||||
|
||||
## 翻译文件
|
||||
|
||||
```python
|
||||
from docutranslate.translater import FileTranslater
|
||||
|
||||
|
||||
translater = FileTranslater(base_url="<baseurl>",
|
||||
key="<key>",
|
||||
model_id="<model-id>")
|
||||
# 不开启公式、代码识别
|
||||
translater.translate_pdf_file("<pdf路径>", to_lang="中文")
|
||||
# 不开启公式、代码识别(默认输出为markdown文件)(打开文本修复)
|
||||
translater.translate_file("<文件路径>", to_lang="中文", refine=True)
|
||||
|
||||
# 开启公式、代码识别(需要下载更多模型)
|
||||
translater.translate_pdf_file("<pdf路径>", to_lang="中文",formula=True, code=True)
|
||||
translater.translate_file("<文件路径>", to_lang="中文", formula=True, code=True)
|
||||
|
||||
# 翻译markdown文件
|
||||
translater.translate_markdown_file("<markdown路径>",to_lang="中文")
|
||||
translater.translate_file("<markdown路径>", to_lang="中文", refine=False)
|
||||
```
|
||||
> 第一次使用时需要下载模型(约1G、使用公式、代码识别需要多约0.5G),请稍作等待
|
||||
|
||||
> 下载模型时请用管理员模式打开终端运行文件(windows),并按需换源
|
||||
> 输出文件默认放在`./output`中
|
||||
|
||||
## 使用不同的agent分别进行文本修正和翻译
|
||||
|
||||
```python
|
||||
from docutranslate import FileTranslater
|
||||
from docutranslate.Agents import MDRefineAgent, MDTranslateAgent
|
||||
|
||||
translater = FileTranslater()
|
||||
|
||||
refine_agent = translater.create_refine_agent(baseurl="<baseurl-1>", key="<key-1>", model_id="<model-id-1>")
|
||||
translate_agent = translater.create_translate_agent(baseurl="<baseurl-2>", key="<key-2>", model_id="<model-id-2>")
|
||||
refine_agent = MDRefineAgent(baseurl="<baseurl-1>", key="<key-1>", model_id="<model-id-1>")
|
||||
translate_agent = MDTranslateAgent(baseurl="<baseurl-2>", key="<key-2>", model_id="<model-id-2>")
|
||||
|
||||
translater.translate_pdf_file(pdf_path="<pdf路径>", to_lang="中文", refine_agent=refine_agent,
|
||||
translater.translate_file("<文件路径>", to_lang="中文", refine_agent=refine_agent,
|
||||
translate_agent=translate_agent)
|
||||
```
|
||||
## 文件转换(pdf/markdown->markdown/html)
|
||||
|
||||
## 文件转换(pdf/markdown/HTML/Doc等->markdown/html)
|
||||
|
||||
```python
|
||||
from docutranslate import FileTranslater
|
||||
|
||||
translater = FileTranslater(base_url="<baseurl>",
|
||||
key="<key>",
|
||||
model_id="<model-id>")
|
||||
#markdown转html
|
||||
translater.read_markdown("<markdown路径>").save_as_html()
|
||||
#pdf转markdown
|
||||
translater.read_pdf_as_markdown("<pdf路径>").save_as_markdown()
|
||||
# 文件转html
|
||||
translater.read_file("<文件路径>").save_as_html()
|
||||
# 文件转markdown
|
||||
translater.read_file("<文件路径>").save_as_markdown()
|
||||
```
|
||||
|
||||
## 参数说明
|
||||
### 创建FileTranslate
|
||||
|
||||
### 创建FileTranslater
|
||||
|
||||
```python
|
||||
from docutranslate import FileTranslater
|
||||
@@ -75,34 +126,28 @@ translater = FileTranslater(base_url="<baseurl>",
|
||||
chunksize=4000, # 【可选】markdown分块长度,分块越大效果越好,不建议超过4096
|
||||
max_concurrent=6 # 【可选】并发数,受到ai平台并发量限制,如果文章很长建议适当加大到20以上
|
||||
)
|
||||
|
||||
```
|
||||
### 翻译pdf文件
|
||||
|
||||
### 翻译文件
|
||||
|
||||
```python
|
||||
translater.translate_pdf_file(r"<要翻译的pdf路径>",
|
||||
translater.translate_file(r"<要翻译的文件路径>",
|
||||
to_lang="中文",
|
||||
formula=False, # 是否启用公式识别
|
||||
code=False, # 是否启用代码识别
|
||||
refine=True,#是否在翻译前先修正markdown文本
|
||||
refine=True, # 是否在翻译前先修正markdown文本(较耗时)
|
||||
output_format="markdown", # "markdown"与"html"两种输出格式
|
||||
output_dir="./output"#默认输出文件夹
|
||||
output_dir="./output", # 默认输出文件夹
|
||||
refine_agent=None, # 修正Agent
|
||||
translate_agent=None # 翻译Agent
|
||||
)
|
||||
```
|
||||
|
||||
### 翻译markdown文件
|
||||
```python
|
||||
translater.translate_markdown_file(r"<要翻译的markdown路径>",
|
||||
to_lang="中文",
|
||||
refine=False,#【可选】是否在翻译前先修正markdown文本
|
||||
output_format="markdown",#"markdown"与"html"两种输出格式
|
||||
output_dir="./output"#默认输出文件夹
|
||||
)
|
||||
```
|
||||
|
||||
|
||||
|
||||
# 常用ai平台
|
||||
|
||||
| 平台名称 | 获取APIkey | baseurl |
|
||||
|------------|-----------------------------------------------------|---------------------------------------------------|
|
||||
|------------|---------------------------------------------------------------------------------------|---------------------------------------------------|
|
||||
| ollama | | http://127.0.0.1:11434/v1 |
|
||||
| lm studio | | http://127.0.0.1:1234/v1 |
|
||||
| openrouter | [点击获取](https://openrouter.ai/settings/keys) | https://openrouter.ai/api/v1 |
|
||||
@@ -113,3 +158,40 @@ translater.translate_markdown_file(r"<要翻译的markdown路径>",
|
||||
| 阿里云百炼 | [点击获取](https://bailian.console.aliyun.com/?tab=model#/api-key) | https://dashscope.aliyuncs.com/compatible-mode/v1 |
|
||||
| 火山引擎 | [点击获取](https://console.volcengine.com/ark/region:ark+cn-beijing/apiKey?apikey=%7B%7D) | https://ark.cn-beijing.volces.com/api/v3 |
|
||||
| 硅基流动 | [点击获取](https://cloud.siliconflow.cn/account/ak) | https://api.siliconflow.cn/v1 |
|
||||
|
||||
# FAQ
|
||||
|
||||
1. 是否支持扫描件
|
||||
|
||||
> 暂不支持
|
||||
|
||||
2. 第一次使用很慢是怎么回事
|
||||
|
||||
> 第一次使用时,docling需要从huggingface下载用于将输入文件转换为markdown的模型
|
||||
> 通过设置环境变量换源或科学上网可能有助于提高下载速度
|
||||
|
||||
> huggingface换源,请设置环境变量:`HF_ENDPOINT=https://hf-mirror.com`
|
||||
|
||||
3. 能否在内网(不联网)环境下使用
|
||||
|
||||
> 可以,对于docling提供的解析pdf、html等功能,可以使用以下方式提前下载所需的模型
|
||||
|
||||
```python
|
||||
from docutranslate.utils.docling_utils import get_docling_artifacts
|
||||
|
||||
print(get_docling_artifacts()) # 会显示模型下载文件夹,通常在`C:\Users\<user>\.cache\docling\models`
|
||||
```
|
||||
|
||||
> 创建FileTranslater时携带模型文件夹即可
|
||||
|
||||
```python
|
||||
from docutranslate import FileTranslater
|
||||
|
||||
translater = FileTranslater(base_url="<baseurl>",
|
||||
key="<key>",
|
||||
model_id="<model-id>", # 使用的模型id
|
||||
docling_artifact=r"C:\Users\<user>\.cache\docling\models"
|
||||
)
|
||||
```
|
||||
|
||||
> 对于llm功能,可以使用ollama或lm studio等方式本地部署。
|
||||
2
docutranslate/Agents/__init__.py
Normal file
2
docutranslate/Agents/__init__.py
Normal file
@@ -0,0 +1,2 @@
|
||||
from .agent import Agent,AgentArgs
|
||||
from .markdown_agent import MDRefineAgent,MDTranslateAgent
|
||||
@@ -1,18 +1,28 @@
|
||||
import asyncio
|
||||
|
||||
import re
|
||||
from typing import TypedDict
|
||||
|
||||
import httpx
|
||||
|
||||
class AgentArgs(TypedDict, total=False):
|
||||
baseurl:str
|
||||
key :str
|
||||
model_id:str
|
||||
system_prompt:str
|
||||
temperature:float
|
||||
max_concurrent:int
|
||||
|
||||
|
||||
TIMEOUT=250
|
||||
|
||||
class Agent:
|
||||
def __init__(self, baseurl="", key="", model_id="", system_prompt="", temperature=0.7, max_concurrent=5):
|
||||
def __init__(self, baseurl="", key="", model_id="", system_prompt="", temperature=0.7, max_concurrent=6):
|
||||
self.baseurl = baseurl
|
||||
self.key = key
|
||||
self.model_id = model_id
|
||||
self.system_prompt = system_prompt
|
||||
self.temperature = temperature
|
||||
# self.client=httpx.Client()
|
||||
self.client_async = httpx.AsyncClient()
|
||||
self.max_concurrent = max_concurrent
|
||||
|
||||
@@ -24,7 +34,7 @@ class Agent:
|
||||
data = {
|
||||
"model": self.model_id,
|
||||
"messages": [
|
||||
{"role": "system", "content": "回复必须以【SSS】开头(该规则适用于之后的所有例子),必须遵守。示例:【SSS】这是示例回答\n"+system_prompt},
|
||||
{"role": "system", "content": "重要:所有回复必须以【SSS】开头(该规则适用于之后的所有例子)。示例:【SSS】这是示例回答\n"+system_prompt},
|
||||
{"role": "user", "content": prompt}
|
||||
],
|
||||
"temperature": temperature,
|
||||
@@ -83,7 +93,7 @@ class Agent:
|
||||
result = await self.send_async(
|
||||
prompt=p_text,
|
||||
system_prompt=system_prompt,
|
||||
timeout=TIMEOUT
|
||||
timeout=timeout
|
||||
)
|
||||
nonlocal count
|
||||
count += 1
|
||||
49
docutranslate/Agents/markdown_agent.py
Normal file
49
docutranslate/Agents/markdown_agent.py
Normal file
@@ -0,0 +1,49 @@
|
||||
from typing import Unpack
|
||||
|
||||
from .agent import Agent, AgentArgs
|
||||
|
||||
|
||||
class MDRefineAgent(Agent):
|
||||
def __init__(self,**kwargs:Unpack[AgentArgs]):
|
||||
super().__init__(**kwargs)
|
||||
self.system_prompt=r"""# 角色
|
||||
你是一个修正markdown文本的专家。
|
||||
# 工作
|
||||
找到markdown片段的不合理之处,对于缺失的句子,应该查看缺失的语句是否可能被错误的放在了其他位置,并通过重组段落、去掉异常字词修复不合理之处。
|
||||
尽量忠实于原文。形如<ph-abc123>的占位符不要改变。code和latex保持原文。
|
||||
# 输出
|
||||
修正后的markdown纯文本
|
||||
# 示例
|
||||
## 调整顺序
|
||||
输入:
|
||||
applications and scenarios becoming more and more extensive.
|
||||
Blockchain's origination was Bitcoin, the most successful of the digital currencies (cryptocurrencies). Since 1983, when digital currency was first proposed, the Internet has continued to burgeon, with its
|
||||
输出:
|
||||
Blockchain's origination was Bitcoin, the most successful of the digital currencies (cryptocurrencies). Since 1983, when digital currency was first proposed, the Internet has continued to burgeon, with its applications and scenarios becoming more and more extensive.【answer-end】
|
||||
## 去掉异常字词
|
||||
输入:
|
||||
一道\题@#目:\(x_1+1=2\)
|
||||
输出:
|
||||
一道题目:\(x_1+1=2\)
|
||||
\no_think"""
|
||||
|
||||
|
||||
class MDTranslateAgent(Agent):
|
||||
def __init__(self,to_lang="中文",**kwargs:Unpack[AgentArgs]):
|
||||
super().__init__(**kwargs)
|
||||
self.system_prompt=f"""# 角色
|
||||
你是一个翻译markdown文本的专家。
|
||||
# 工作
|
||||
将输入的markdown文本翻译成{to_lang}。
|
||||
尽量忠实于原文(如空行)。
|
||||
形如<ph-abc123>的占位符不要改变。
|
||||
code和latex保持原文。
|
||||
# 输出
|
||||
翻译后的markdown纯文本
|
||||
# 示例
|
||||
## 英文翻译为中文:
|
||||
输入:
|
||||
hello<ph-aaaaaa>, what's your name?
|
||||
输出:
|
||||
你好<ph-aaaaaa>,你叫什么名字?
|
||||
\\no_think"""
|
||||
@@ -1,32 +0,0 @@
|
||||
from functools import wraps
|
||||
from typing import Concatenate, ParamSpec, Callable
|
||||
import re
|
||||
|
||||
from docutranslate.utils.markdown_utils import MaskDict
|
||||
|
||||
P=ParamSpec("P")
|
||||
def mask_uris_temp(func:Callable[Concatenate[str, P], str]) -> Callable[Concatenate[str, P], str]:
|
||||
|
||||
|
||||
@wraps(func)
|
||||
def wrapper(markdown: str, *args: P.args, **kwargs: P.kwargs) -> str:
|
||||
mask_dict=MaskDict()
|
||||
def uri2placeholder(match:re.Match):
|
||||
id=mask_dict.create_id()
|
||||
mask_dict.set(id,match.group())
|
||||
return f"<ph-{id}>"
|
||||
def placeholder2uri(match:re.Match):
|
||||
id=match.group(1)
|
||||
uri=mask_dict.get(id)
|
||||
if uri is None:
|
||||
return match.group()
|
||||
return uri
|
||||
uri_pattern=r'!?\[.*?\]\(.*?\)'
|
||||
markdown=re.sub(uri_pattern,uri2placeholder,markdown)
|
||||
result=func(markdown, *args, **kwargs)
|
||||
ph_pattern=r"<ph-([a-zA-Z0-9]+)>"
|
||||
result=re.sub(ph_pattern,placeholder2uri,result)
|
||||
return result
|
||||
return wrapper
|
||||
if __name__ == '__main__':
|
||||
pass
|
||||
@@ -3,16 +3,16 @@ from typing import Literal
|
||||
|
||||
import markdown2
|
||||
|
||||
from docutranslate.decorator.markdown_mask import MaskDict
|
||||
from docutranslate.utils.agent_utils import Agent
|
||||
from docutranslate.utils.convert import pdf2markdown_embed_images
|
||||
from docutranslate.Agents import MDRefineAgent, MDTranslateAgent
|
||||
from docutranslate.Agents.agent import Agent, AgentArgs
|
||||
from docutranslate.utils.convert import file2markdown_embed_images
|
||||
from docutranslate.utils.markdown_splitter import split_markdown_text
|
||||
from docutranslate.utils.markdown_utils import uris2placeholder, placeholder2_uris
|
||||
from docutranslate.utils.markdown_utils import uris2placeholder, placeholder2_uris, MaskDict
|
||||
|
||||
|
||||
class FileTranslater:
|
||||
def __init__(self, file_path: Path | str | None = None, chunksize: int = 4096, base_url="", key=None,
|
||||
model_id="", temperature=0.7, max_concurrent=6):
|
||||
model_id="", temperature=0.7, max_concurrent=6, docling_artifact: Path | str | None = None):
|
||||
if isinstance(file_path, str):
|
||||
file_path = Path(file_path)
|
||||
self.file_path: Path = file_path
|
||||
@@ -25,6 +25,7 @@ class FileTranslater:
|
||||
self.key: str = key if key is not None else "xx"
|
||||
self.model_id: str = model_id
|
||||
self.temperature = temperature
|
||||
self.docling_artifact=docling_artifact
|
||||
|
||||
def _mask_uris_in_markdown(self):
|
||||
self.markdown = uris2placeholder(self.markdown, self._mask_dict)
|
||||
@@ -39,110 +40,62 @@ class FileTranslater:
|
||||
print(f"markdown分为{len(chunks)}块")
|
||||
return chunks
|
||||
|
||||
def create_refine_agent(self, baseurl=None, key=None, model_id=None, temperature=None):
    """Build an Agent whose system prompt asks for repair of malformed markdown.

    Any of ``baseurl``, ``key``, ``model_id`` or ``temperature`` left as None is
    taken from the matching attribute on this instance; ``max_concurrent``
    always comes from the instance.

    Returns:
        The configured Agent.
    """
    settings = {
        "baseurl": baseurl if baseurl is not None else self.base_url,
        "key": key if key is not None else self.key,
        "model_id": model_id if model_id is not None else self.model_id,
        "temperature": temperature if temperature is not None else self.temperature,
        "max_concurrent": self.max_concurrent,
    }
    refiner = Agent(**settings)
    # Raw string: the backslashes in the prompt are literal characters.
    refiner.system_prompt = r"""# 角色
你是一个修正markdown文本的专家。
# 工作
找到markdown片段的不合理之处,对于缺失的句子,应该查看缺失的语句是否可能被错误的放在了其他位置,并通过重组段落、去掉异常字词修复不合理之处。
尽量忠实于原文。形如<ph-abc123>的占位符不要改变。code和latex保持原文。
# 输出
修正后的markdown纯文本
# 示例
## 调整顺序
输入:
applications and scenarios becoming more and more extensive.
Blockchain's origination was Bitcoin, the most successful of the digital currencies (cryptocurrencies). Since 1983, when digital currency was first proposed, the Internet has continued to burgeon, with its
输出:
Blockchain's origination was Bitcoin, the most successful of the digital currencies (cryptocurrencies). Since 1983, when digital currency was first proposed, the Internet has continued to burgeon, with its applications and scenarios becoming more and more extensive.【answer-end】
## 去掉异常字词
输入:
你@好,你叫什么\名#字
输出:
你好,你叫什么名字\no_think"""
    return refiner
|
||||
def default_agent_params(self) -> AgentArgs:
    """Return this translator's connection and model settings as agent keyword arguments.

    The mapping is built fresh on every call from ``base_url``, ``key``,
    ``model_id``, ``temperature`` and ``max_concurrent``.
    """
    params: AgentArgs = {
        "baseurl": self.base_url,
        "key": self.key,
        "model_id": self.model_id,
        "temperature": self.temperature,
        "max_concurrent": self.max_concurrent,
    }
    return params
|
||||
|
||||
def create_translate_agent(self, baseurl=None, key=None, model_id=None, temperature=None, to_lang="中文"):
    """Build an Agent whose system prompt asks for markdown translation into ``to_lang``.

    Any of ``baseurl``, ``key``, ``model_id`` or ``temperature`` left as None is
    taken from the matching attribute on this instance; ``max_concurrent``
    always comes from the instance.

    Returns:
        The configured Agent.
    """
    settings = {
        "baseurl": baseurl if baseurl is not None else self.base_url,
        "key": key if key is not None else self.key,
        "model_id": model_id if model_id is not None else self.model_id,
        "temperature": temperature if temperature is not None else self.temperature,
        "max_concurrent": self.max_concurrent,
    }
    translator = Agent(**settings)
    # Raw string template; "{0}" is the only format field and receives to_lang.
    prompt_template = r"""# 角色
你是一个翻译markdown文本的专家。
# 工作
将输入的markdown文本翻译成{0}。
尽量忠实于原文(如空行)。
形如<ph-abc123>的占位符不要改变。
code和latex保持原文。
# 输出
翻译后的markdown纯文本
# 示例
## 英文翻译为中文:
输入:
hello<ph-aaaaaa>, what's your name?
输出:
你好<ph-aaaaaa>,你叫什么名字?\no_think"""
    translator.system_prompt = prompt_template.format(to_lang)
    return translator
|
||||
|
||||
def read_pdf_as_markdown(self, pdf: Path | str | None = None, formula=False, code=False, save=False):
    """Convert a PDF to markdown and store the result in ``self.markdown``.

    Args:
        pdf: Path of the PDF; when None, ``self.file_path`` is used.
        formula: Forwarded to ``pdf2markdown_embed_images``.
        code: Forwarded to ``pdf2markdown_embed_images``.
        save: When true, also write the markdown out as ``<pdf stem>.md``.

    Returns:
        This instance, so calls can be chained.
    """
    print("正在将pdf转换为markdown")
    source = self.file_path if pdf is None else pdf
    if isinstance(source, str):
        source = Path(source)
    self.markdown = pdf2markdown_embed_images(source, formula, code)
    print("pdf已转换")
    if save:
        self.save_as_markdown(filename=f"{source.stem}.md")
    return self
|
||||
|
||||
def read_markdown(self, markdown_path: Path | str):
|
||||
if isinstance(markdown_path, str):
|
||||
markdown_path = Path(markdown_path)
|
||||
self.file_path = markdown_path
|
||||
with open(markdown_path, "r") as f:
|
||||
def read_file(self, file_path: Path | str | None = None, formula=False, code=False, save=False):
|
||||
if file_path is None:
|
||||
if self.file_path is None:
|
||||
raise Exception("未设置文件路径")
|
||||
file_path = self.file_path
|
||||
if isinstance(file_path, str):
|
||||
file_path = Path(file_path)
|
||||
print(f"读取文件:{file_path.name}")
|
||||
# 如果是markdown,直接读取
|
||||
if file_path.suffix == ".md":
|
||||
with open(file_path, "r") as f:
|
||||
self.markdown = f.read()
|
||||
else:
|
||||
print(f"正在将{file_path.resolve().name}转换为markdown")
|
||||
self.markdown = file2markdown_embed_images(file_path, formula, code, artifacts_path=self.docling_artifact)
|
||||
print("已转换为markdown")
|
||||
if save:
|
||||
self.save_as_markdown(filename=f"{file_path.stem}.md")
|
||||
return self
|
||||
|
||||
def refine_markdown(self, refine_agent: Agent | None = None) -> str:
|
||||
def refine_markdown_by_agent(self, refine_agent: Agent | None = None) -> str:
|
||||
print("正在修正markdown")
|
||||
if refine_agent is None:
|
||||
refine_agent = self.create_refine_agent(self.base_url, self.key, self.model_id, self.temperature)
|
||||
chuncks = self._split_markdown_into_chunks()
|
||||
result: list[str] = refine_agent.send_prompts(chuncks, timeout=10000)
|
||||
self.markdown = "".join(result)
|
||||
print("markdown已修正")
|
||||
return self.markdown
|
||||
|
||||
def translate_markdown(self, translate_agent: Agent | None = None):
|
||||
def translate_markdown_by_agent(self, translate_agent: Agent):
|
||||
print("正在翻译markdown")
|
||||
if translate_agent is None:
|
||||
translate_agent = self.create_translate_agent()
|
||||
chuncks = self._split_markdown_into_chunks()
|
||||
result: list[str] = translate_agent.send_prompts(chuncks, timeout=10000)
|
||||
self.markdown = "".join(result)
|
||||
print("翻译完成")
|
||||
return self.markdown
|
||||
|
||||
def save_as_markdown(self, filename: str | Path = "output.md", output_dir: str | Path = "./output"):
|
||||
def save_as_markdown(self, filename: str | Path | None = None, output_dir: str | Path = "./output"):
|
||||
if isinstance(filename, str):
|
||||
filename = Path(filename)
|
||||
if isinstance(output_dir, str):
|
||||
output_dir = Path(output_dir)
|
||||
|
||||
if filename is None:
|
||||
if self.file_path is not None:
|
||||
filename = self.file_path.name
|
||||
else:
|
||||
filename = "output.md"
|
||||
# 确保输出目录存在
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
full_name = output_dir / filename
|
||||
@@ -154,12 +107,16 @@ hello<ph-aaaaaa>, what's your name?
|
||||
def export_to_markdown(self):
|
||||
return self.markdown
|
||||
|
||||
def save_as_html(self, filename: str | Path = "output.html", output_dir: str | Path = "./output"):
|
||||
def save_as_html(self, filename: str | Path | None = None, output_dir: str | Path = "./output"):
|
||||
if isinstance(filename, str):
|
||||
filename = Path(filename)
|
||||
if isinstance(output_dir, str):
|
||||
output_dir = Path(output_dir)
|
||||
|
||||
if filename is None:
|
||||
if self.file_path is not None:
|
||||
filename = self.file_path.name
|
||||
else:
|
||||
filename = "output.html"
|
||||
# 确保输出目录存在
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
full_name = output_dir / filename
|
||||
@@ -223,62 +180,29 @@ hello<ph-aaaaaa>, what's your name?
|
||||
"""
|
||||
return html
|
||||
|
||||
def translate_pdf_file(self, pdf_path: Path | str | None = None, to_lang="中文", output_dir="./output",
|
||||
def translate_file(self, file_path: Path | str | None = None, to_lang="中文", output_dir="./output",
|
||||
formula=False,
|
||||
code=False, output_format: Literal["markdown", "html"] = "markdown", refine=True,
|
||||
refine_agent: Agent | None = None, translate_agent: Agent | None = None):
|
||||
assert output_format in ("markdown", "html"), "output_format格式错误"
|
||||
if pdf_path is None:
|
||||
if file_path is None:
|
||||
assert self.file_path is not None, "未输入文件路径"
|
||||
pdf_path = self.file_path
|
||||
if isinstance(pdf_path, str):
|
||||
pdf_path = Path(pdf_path)
|
||||
self.read_pdf_as_markdown(pdf_path, formula=formula, code=code)
|
||||
file_path = self.file_path
|
||||
if isinstance(file_path, str):
|
||||
file_path = Path(file_path)
|
||||
self.read_file(file_path, formula=formula, code=code)
|
||||
self._mask_uris_in_markdown()
|
||||
if refine:
|
||||
if refine_agent is None:
|
||||
refine_agent = self.create_refine_agent()
|
||||
self.refine_markdown(refine_agent)
|
||||
refine_agent = MDRefineAgent(**self.default_agent_params())
|
||||
self.refine_markdown_by_agent(refine_agent)
|
||||
if translate_agent is None:
|
||||
translate_agent = self.create_translate_agent(to_lang=to_lang)
|
||||
self.translate_markdown(translate_agent)
|
||||
translate_agent = MDTranslateAgent(to_lang=to_lang, **self.default_agent_params())
|
||||
self.translate_markdown_by_agent(translate_agent)
|
||||
self._unmask_uris_in_markdown()
|
||||
if output_format == "markdown":
|
||||
filename = f"{pdf_path.stem}_{to_lang}.md"
|
||||
filename = f"{file_path.stem}_{to_lang}.md"
|
||||
self.save_as_markdown(filename=filename, output_dir=output_dir)
|
||||
elif output_format == "html":
|
||||
filename = f"{pdf_path.stem}_{to_lang}.html"
|
||||
filename = f"{file_path.stem}_{to_lang}.html"
|
||||
self.save_as_html(filename=filename, output_dir=output_dir)
|
||||
return self
|
||||
|
||||
def translate_markdown_file(self, markdown_path: Path | str | None = None, to_lang="中文", output_dir="./output",
                            output_format: Literal["markdown", "html"] = "markdown",
                            refine=False, refine_agent: Agent | None = None, translate_agent: Agent | None = None):
    """Translate a markdown file and save the result as markdown or HTML.

    Steps: read the file, mask URIs, optionally refine, translate, unmask
    URIs, then save as ``<stem>_<to_lang>.md`` or ``<stem>_<to_lang>.html``
    under ``output_dir``.

    Args:
        markdown_path: File to translate; when None, ``self.file_path`` is used.
        to_lang: Target language; also used in the output file name.
        output_dir: Directory handed to the save methods.
        output_format: ``"markdown"`` or ``"html"``.
        refine: When true, run the refine step before translating.
        refine_agent: Agent for refining; created with defaults when None.
        translate_agent: Agent for translating; created with defaults when None.

    Returns:
        This instance, so calls can be chained.

    Raises:
        AssertionError: If ``output_format`` is invalid or no path is available.
    """
    assert output_format in ("markdown", "html"), "output_format格式错误"
    if markdown_path is None:
        assert self.file_path is not None, "未输入文件路径"
        markdown_path = self.file_path
    elif isinstance(markdown_path, str):
        markdown_path = Path(markdown_path)
    try:
        # Decode as UTF-8 explicitly: the platform default encoding is
        # locale-dependent and is often not UTF-8 on Windows.
        with open(markdown_path, "r", encoding="utf-8") as f:
            self.markdown = f.read()
    except UnicodeDecodeError:
        # Not valid UTF-8: fall back to the locale default used previously.
        with open(markdown_path, "r") as f:
            self.markdown = f.read()
    self._mask_uris_in_markdown()
    if refine:
        if refine_agent is None:
            refine_agent = self.create_refine_agent()
        self.refine_markdown(refine_agent)
    if translate_agent is None:
        translate_agent = self.create_translate_agent(to_lang=to_lang)
    self.translate_markdown(translate_agent)
    self._unmask_uris_in_markdown()
    if output_format == "markdown":
        filename = f"{markdown_path.stem}_{to_lang}.md"
        self.save_as_markdown(filename=filename, output_dir=output_dir)
    elif output_format == "html":
        filename = f"{markdown_path.stem}_{to_lang}.html"
        self.save_as_html(filename=filename, output_dir=output_dir)
    return self
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
pass
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
import os
|
||||
from huggingface_hub.errors import LocalEntryNotFoundError
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling_core.types.doc import ImageRefMode
|
||||
@@ -7,8 +9,11 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
IMAGE_RESOLUTION_SCALE = 4
|
||||
|
||||
|
||||
def pdf2markdown_embed_images(pdf: Path | str, formula=False, code=False) -> str:
|
||||
pipeline_options = PdfPipelineOptions()
|
||||
def file2markdown_embed_images(file_path: Path | str, formula=False, code=False,artifacts_path:Path|str|None=None) -> str:
|
||||
if isinstance(file_path,str):
|
||||
file_path=Path(file_path)
|
||||
pipeline_options = PdfPipelineOptions(artifacts_path=artifacts_path)
|
||||
# pipeline_options.do_ocr=False
|
||||
pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
|
||||
pipeline_options.generate_picture_images = True
|
||||
if formula:
|
||||
@@ -18,8 +23,10 @@ def pdf2markdown_embed_images(pdf: Path | str, formula=False, code=False) -> str
|
||||
converter = DocumentConverter(format_options={
|
||||
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
|
||||
})
|
||||
result = converter.convert(pdf).document.export_to_markdown( image_mode=ImageRefMode.EMBEDDED)
|
||||
try:
|
||||
result = converter.convert(file_path).document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED)
|
||||
except LocalEntryNotFoundError:
|
||||
print(f"无法连接huggingface,正在尝试换源")
|
||||
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
|
||||
result = converter.convert(file_path).document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED)
|
||||
return result
|
||||
|
||||
if __name__ == '__main__':
|
||||
pass
|
||||
5
docutranslate/utils/docling_utils.py
Normal file
5
docutranslate/utils/docling_utils.py
Normal file
@@ -0,0 +1,5 @@
|
||||
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
||||
|
||||
def get_docling_artifacts():
    """Return the result of ``StandardPdfPipeline.download_models_hf()``.

    NOTE(review): presumably this downloads the docling models and returns
    their local directory; confirm against the docling documentation.
    """
    return StandardPdfPipeline.download_models_hf()
|
||||
@@ -1,7 +1,7 @@
|
||||
[project]
|
||||
name = "docutranslate"
|
||||
version = "0.0.8"
|
||||
description = "能翻译pdf和markdown的工具"
|
||||
version = "0.1.0"
|
||||
description = "文件翻译工具"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.10"
|
||||
dependencies = [
|
||||
|
||||
Reference in New Issue
Block a user