增加对mineru的支持

This commit is contained in:
xunbu
2025-05-20 18:16:58 +08:00
parent a55d20af77
commit 85573561e4
10 changed files with 814 additions and 263 deletions

253
.idea/workspace.xml generated
View File

@@ -5,7 +5,16 @@
</component> </component>
<component name="ChangeListManager"> <component name="ChangeListManager">
<list default="true" id="6b18b44a-df57-4212-a857-9e291ebe5dd2" name="更改" comment=""> <list default="true" id="6b18b44a-df57-4212-a857-9e291ebe5dd2" name="更改" comment="">
<change beforePath="$PROJECT_DIR$/app.spec" beforeDir="false" afterPath="$PROJECT_DIR$/app.spec" afterDir="false" /> <change afterPath="$PROJECT_DIR$/docutranslate/converter/__init__.py" afterDir="false" />
<change afterPath="$PROJECT_DIR$/docutranslate/converter/converter.py" afterDir="false" />
<change afterPath="$PROJECT_DIR$/docutranslate/converter/converter_docling.py" afterDir="false" />
<change afterPath="$PROJECT_DIR$/docutranslate/converter/converter_mineru.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/README.md" beforeDir="false" afterPath="$PROJECT_DIR$/README.md" afterDir="false" />
<change beforePath="$PROJECT_DIR$/docutranslate/app.py" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/app.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/docutranslate/static/index.html" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/static/index.html" afterDir="false" />
<change beforePath="$PROJECT_DIR$/docutranslate/translater.py" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/translater.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/pyproject.toml" beforeDir="false" afterPath="$PROJECT_DIR$/pyproject.toml" afterDir="false" />
</list> </list>
<option name="SHOW_DIALOG" value="false" /> <option name="SHOW_DIALOG" value="false" />
<option name="HIGHLIGHT_CONFLICTS" value="true" /> <option name="HIGHLIGHT_CONFLICTS" value="true" />
@@ -35,59 +44,62 @@
<option name="hideEmptyMiddlePackages" value="true" /> <option name="hideEmptyMiddlePackages" value="true" />
<option name="showLibraryContents" value="true" /> <option name="showLibraryContents" value="true" />
</component> </component>
<component name="PropertiesComponent">{ <component name="PropertiesComponent"><![CDATA[{
&quot;keyToString&quot;: { "keyToString": {
&quot;DefaultHtmlFileTemplate&quot;: &quot;HTML File&quot;, "DefaultHtmlFileTemplate": "HTML File",
&quot;JavaScript 调试.output.html (1).executor&quot;: &quot;Run&quot;, "JavaScript 调试.output.html (1).executor": "Run",
&quot;JavaScript 调试.output.html.executor&quot;: &quot;Run&quot;, "JavaScript 调试.output.html.executor": "Run",
&quot;JavaScript 调试.regex.md_中文.html.executor&quot;: &quot;Run&quot;, "JavaScript 调试.regex.md_中文.html.executor": "Run",
&quot;JavaScript 调试.regex_中文.html.executor&quot;: &quot;Run&quot;, "JavaScript 调试.regex_中文.html.executor": "Run",
&quot;JavaScript 调试.test.html.executor&quot;: &quot;Run&quot;, "JavaScript 调试.test.html.executor": "Run",
&quot;JavaScript 调试.test2.html.executor&quot;: &quot;Run&quot;, "JavaScript 调试.test2.html.executor": "Run",
&quot;JavaScript 调试.test2_英文.html.executor&quot;: &quot;Run&quot;, "JavaScript 调试.test2_英文.html.executor": "Run",
&quot;JavaScript 调试.test4-1_中文.html.executor&quot;: &quot;Run&quot;, "JavaScript 调试.test4-1_中文.html.executor": "Run",
&quot;JavaScript 调试.互联网认证授权机制.html.executor&quot;: &quot;Run&quot;, "JavaScript 调试.互联网认证授权机制.html.executor": "Run",
&quot;JavaScript 调试.互联网认证授权机制_英文.html.executor&quot;: &quot;Run&quot;, "JavaScript 调试.互联网认证授权机制_英文.html.executor": "Run",
&quot;JavaScript 调试.毕业论文_英文.html.executor&quot;: &quot;Run&quot;, "JavaScript 调试.毕业论文_英文.html.executor": "Run",
&quot;ModuleVcsDetector.initialDetectionPerformed&quot;: &quot;true&quot;, "ModuleVcsDetector.initialDetectionPerformed": "true",
&quot;Python 测试.Python 测试 (markdown_mask.py 内).executor&quot;: &quot;Run&quot;, "Python 测试.Python 测试 (markdown_mask.py 内).executor": "Run",
&quot;Python 测试.markdown_mask.Test.test_basic_link_masking 的 Python 测试.executor&quot;: &quot;Run&quot;, "Python 测试.markdown_mask.Test.test_basic_link_masking 的 Python 测试.executor": "Run",
&quot;Python 测试.pytest (test_html.py 内).executor&quot;: &quot;Run&quot;, "Python 测试.pytest (test_html.py 内).executor": "Run",
&quot;Python.2test2 (1).executor&quot;: &quot;Run&quot;, "Python.1test.executor": "Run",
&quot;Python.PDFtranslater (1).executor&quot;: &quot;Run&quot;, "Python.2test2 (1).executor": "Run",
&quot;Python.PDFtranslater (2).executor&quot;: &quot;Run&quot;, "Python.PDFtranslater (1).executor": "Run",
&quot;Python.agent.executor&quot;: &quot;Debug&quot;, "Python.PDFtranslater (2).executor": "Run",
&quot;Python.agent_utils.executor&quot;: &quot;Run&quot;, "Python.agent.executor": "Debug",
&quot;Python.app (1).executor&quot;: &quot;Run&quot;, "Python.agent_utils.executor": "Run",
&quot;Python.app.executor&quot;: &quot;Run&quot;, "Python.app (1).executor": "Run",
&quot;Python.app2.executor&quot;: &quot;Run&quot;, "Python.app.executor": "Run",
&quot;Python.app_test (1).executor&quot;: &quot;Run&quot;, "Python.app2.executor": "Run",
&quot;Python.convert.executor&quot;: &quot;Run&quot;, "Python.app_test (1).executor": "Run",
&quot;Python.markdown_splitter.executor&quot;: &quot;Debug&quot;, "Python.convert.executor": "Run",
&quot;Python.markdown_utils.executor&quot;: &quot;Run&quot;, "Python.converter_docling.executor": "Run",
&quot;Python.test.executor&quot;: &quot;Run&quot;, "Python.converter_mineru.executor": "Run",
&quot;Python.test1.executor&quot;: &quot;Run&quot;, "Python.markdown_splitter.executor": "Debug",
&quot;Python.test2.executor&quot;: &quot;Run&quot;, "Python.markdown_utils.executor": "Run",
&quot;Python.test3.executor&quot;: &quot;Run&quot;, "Python.test.executor": "Run",
&quot;Python.test4.executor&quot;: &quot;Run&quot;, "Python.test1.executor": "Run",
&quot;Python.testhtml.executor&quot;: &quot;Run&quot;, "Python.test2.executor": "Run",
&quot;Python.translater.executor&quot;: &quot;Run&quot;, "Python.test3.executor": "Run",
&quot;Python.切分测试.executor&quot;: &quot;Run&quot;, "Python.test4.executor": "Run",
&quot;RunOnceActivity.ShowReadmeOnStart&quot;: &quot;true&quot;, "Python.testhtml.executor": "Run",
&quot;RunOnceActivity.TerminalTabsStorage.copyFrom.TerminalArrangementManager&quot;: &quot;true&quot;, "Python.translater.executor": "Run",
&quot;RunOnceActivity.git.unshallow&quot;: &quot;true&quot;, "Python.切分测试.executor": "Run",
&quot;git-widget-placeholder&quot;: &quot;main&quot;, "RunOnceActivity.ShowReadmeOnStart": "true",
&quot;last_opened_file_path&quot;: &quot;C:/Users/jxgm/Desktop/FileTranslate/dist/DocuTranslate&quot;, "RunOnceActivity.TerminalTabsStorage.copyFrom.TerminalArrangementManager": "true",
&quot;list.type.of.created.stylesheet&quot;: &quot;CSS&quot;, "RunOnceActivity.git.unshallow": "true",
&quot;node.js.detected.package.eslint&quot;: &quot;true&quot;, "git-widget-placeholder": "main",
&quot;node.js.detected.package.tslint&quot;: &quot;true&quot;, "last_opened_file_path": "C:/Users/jxgm/Desktop/FileTranslate/dist/DocuTranslate",
&quot;node.js.selected.package.eslint&quot;: &quot;(autodetect)&quot;, "list.type.of.created.stylesheet": "CSS",
&quot;node.js.selected.package.tslint&quot;: &quot;(autodetect)&quot;, "node.js.detected.package.eslint": "true",
&quot;nodejs_package_manager_path&quot;: &quot;npm&quot;, "node.js.detected.package.tslint": "true",
&quot;settings.editor.selected.configurable&quot;: &quot;preferences.pluginManager&quot;, "node.js.selected.package.eslint": "(autodetect)",
&quot;vue.rearranger.settings.migration&quot;: &quot;true&quot; "node.js.selected.package.tslint": "(autodetect)",
"nodejs_package_manager_path": "npm",
"settings.editor.selected.configurable": "preferences.pluginManager",
"vue.rearranger.settings.migration": "true"
} }
}</component> }]]></component>
<component name="RecentsManager"> <component name="RecentsManager">
<key name="CopyFile.RECENT_KEYS"> <key name="CopyFile.RECENT_KEYS">
<recent name="C:\Users\jxgm\Desktop\FileTranslate\dist\DocuTranslate" /> <recent name="C:\Users\jxgm\Desktop\FileTranslate\dist\DocuTranslate" />
@@ -267,7 +279,27 @@
<option name="OPTIONS" value="" /> <option name="OPTIONS" value="" />
<method v="2" /> <method v="2" />
</configuration> </configuration>
<configuration name="test.html" type="JavascriptDebugType" temporary="true" nameIsGenerated="true" uri="http://localhost:63342/filetranslate/tests/test.html" useBuiltInWebServerPort="true"> <configuration name="1test" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="FileTranslate" />
<option name="ENV_FILES" value="" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/tests" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/tests/1test.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" /> <method v="2" />
</configuration> </configuration>
<configuration name="2test2 (1)" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true"> <configuration name="2test2 (1)" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
@@ -293,29 +325,6 @@
<option name="INPUT_FILE" value="" /> <option name="INPUT_FILE" value="" />
<method v="2" /> <method v="2" />
</configuration> </configuration>
<configuration name="3testhtml" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="FileTranslate" />
<option name="ENV_FILES" value="" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/tests" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="C:\Users\jxgm\Desktop\FileTranslate\tests\3testhtml.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration default="true" type="PythonConfigurationType" factoryName="Python"> <configuration default="true" type="PythonConfigurationType" factoryName="Python">
<module name="filetranslate" /> <module name="filetranslate" />
<option name="ENV_FILES" value="" /> <option name="ENV_FILES" value="" />
@@ -362,6 +371,52 @@
<option name="INPUT_FILE" value="" /> <option name="INPUT_FILE" value="" />
<method v="2" /> <method v="2" />
</configuration> </configuration>
<configuration name="converter_docling" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="FileTranslate" />
<option name="ENV_FILES" value="" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/docutranslate/converter" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/docutranslate/converter/converter_docling.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="converter_mineru" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="FileTranslate" />
<option name="ENV_FILES" value="" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/docutranslate/converter" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/docutranslate/converter/converter_mineru.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration default="true" type="Python.FlaskServer"> <configuration default="true" type="Python.FlaskServer">
<module name="filetranslate" /> <module name="filetranslate" />
<option name="ENV_FILES" value="" /> <option name="ENV_FILES" value="" />
@@ -459,31 +514,13 @@
<option name="USE_PATTERN" value="false" /> <option name="USE_PATTERN" value="false" />
<method v="2" /> <method v="2" />
</configuration> </configuration>
<configuration name="pytest (3testhtml.py 内)" type="tests" factoryName="py.test" temporary="true" nameIsGenerated="true">
<module name="FileTranslate" />
<option name="ENV_FILES" value="" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/tests" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="_new_keywords" value="&quot;&quot;" />
<option name="_new_parameters" value="&quot;&quot;" />
<option name="_new_additionalArguments" value="&quot;&quot;" />
<option name="_new_target" value="&quot;$PROJECT_DIR$/tests/3testhtml.py&quot;" />
<option name="_new_targetType" value="&quot;PATH&quot;" />
<method v="2" />
</configuration>
<recent_temporary> <recent_temporary>
<list> <list>
<item itemvalue="Python.app_test (1)" /> <item itemvalue="Python.app_test (1)" />
<item itemvalue="Python.1test" />
<item itemvalue="Python.converter_docling" />
<item itemvalue="Python.converter_mineru" />
<item itemvalue="Python.2test2 (1)" /> <item itemvalue="Python.2test2 (1)" />
<item itemvalue="Python.3testhtml" />
<item itemvalue="JavaScript 调试.test.html" />
<item itemvalue="Python 测试.pytest (3testhtml.py 内)" />
</list> </list>
</recent_temporary> </recent_temporary>
</component> </component>
@@ -568,14 +605,26 @@
<workItem from="1747628254543" duration="7347000" /> <workItem from="1747628254543" duration="7347000" />
<workItem from="1747635705571" duration="391000" /> <workItem from="1747635705571" duration="391000" />
<workItem from="1747650908714" duration="121000" /> <workItem from="1747650908714" duration="121000" />
<workItem from="1747731199599" duration="4260000" />
</task> </task>
<servers /> <servers />
</component> </component>
<component name="TypeScriptGeneratedFilesManager"> <component name="TypeScriptGeneratedFilesManager">
<option name="version" value="3" /> <option name="version" value="3" />
</component> </component>
<component name="Vcs.Log.Tabs.Properties">
<option name="TAB_STATES">
<map>
<entry key="MAIN">
<value>
<State />
</value>
</entry>
</map>
</option>
</component>
<component name="com.intellij.coverage.CoverageDataManagerImpl"> <component name="com.intellij.coverage.CoverageDataManagerImpl">
<SUITE FILE_PATH="coverage/filetranslate$app_test__1_.coverage" NAME="app_test (1) 覆盖结果" MODIFIED="1747634037187" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" /> <SUITE FILE_PATH="coverage/filetranslate$app_test__1_.coverage" NAME="app_test (1) 覆盖结果" MODIFIED="1747733748258" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
<SUITE FILE_PATH="coverage/filetranslate$test.coverage" NAME="test 覆盖结果" MODIFIED="1747472297913" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" /> <SUITE FILE_PATH="coverage/filetranslate$test.coverage" NAME="test 覆盖结果" MODIFIED="1747472297913" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
<SUITE FILE_PATH="coverage/filetranslate$convert.coverage" NAME="convert 覆盖结果" MODIFIED="1746963490689" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/utils" /> <SUITE FILE_PATH="coverage/filetranslate$convert.coverage" NAME="convert 覆盖结果" MODIFIED="1746963490689" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/utils" />
<SUITE FILE_PATH="coverage/PDFtranslate$PDFtranslater__1_.coverage" NAME="PDFtranslater (1) 覆盖结果" MODIFIED="1746633258205" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages" /> <SUITE FILE_PATH="coverage/PDFtranslate$PDFtranslater__1_.coverage" NAME="PDFtranslater (1) 覆盖结果" MODIFIED="1746633258205" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages" />
@@ -583,14 +632,17 @@
<SUITE FILE_PATH="coverage/PDFtranslate$agent_utils.coverage" NAME="agent_utils 覆盖结果" MODIFIED="1746617703678" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages/utils" /> <SUITE FILE_PATH="coverage/PDFtranslate$agent_utils.coverage" NAME="agent_utils 覆盖结果" MODIFIED="1746617703678" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages/utils" />
<SUITE FILE_PATH="coverage/filetranslate$app2.coverage" NAME="app2 覆盖结果" MODIFIED="1747108180309" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate" /> <SUITE FILE_PATH="coverage/filetranslate$app2.coverage" NAME="app2 覆盖结果" MODIFIED="1747108180309" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate" />
<SUITE FILE_PATH="coverage/filetranslate$app.coverage" NAME="app 覆盖结果" MODIFIED="1747448464521" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate" /> <SUITE FILE_PATH="coverage/filetranslate$app.coverage" NAME="app 覆盖结果" MODIFIED="1747448464521" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate" />
<SUITE FILE_PATH="coverage/filetranslate$converter_mineru.coverage" NAME="converter_mineru 覆盖结果" MODIFIED="1747726229881" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/converter" />
<SUITE FILE_PATH="coverage/PDFtranslate$markdown_splitter.coverage" NAME="markdown_splitter 覆盖结果" MODIFIED="1746599883603" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages/utils" /> <SUITE FILE_PATH="coverage/PDFtranslate$markdown_splitter.coverage" NAME="markdown_splitter 覆盖结果" MODIFIED="1746599883603" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages/utils" />
<SUITE FILE_PATH="coverage/filetranslate$converter_docling.coverage" NAME="converter_docling 覆盖结果" MODIFIED="1747726654277" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/converter" />
<SUITE FILE_PATH="coverage/filetranslate$1test.coverage" NAME="1test 覆盖结果" MODIFIED="1747732504752" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
<SUITE FILE_PATH="coverage/filetranslate$test3.coverage" NAME="test3 覆盖结果" MODIFIED="1746884110572" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" /> <SUITE FILE_PATH="coverage/filetranslate$test3.coverage" NAME="test3 覆盖结果" MODIFIED="1746884110572" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
<SUITE FILE_PATH="coverage/filetranslate$app__1_.coverage" NAME="app (1) 覆盖结果" MODIFIED="1747136094477" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" /> <SUITE FILE_PATH="coverage/filetranslate$app__1_.coverage" NAME="app (1) 覆盖结果" MODIFIED="1747136094477" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
<SUITE FILE_PATH="coverage/filetranslate$markdown_splitter.coverage" NAME="markdown_splitter 覆盖结果" MODIFIED="1746805063874" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/utils" /> <SUITE FILE_PATH="coverage/filetranslate$markdown_splitter.coverage" NAME="markdown_splitter 覆盖结果" MODIFIED="1746805063874" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/utils" />
<SUITE FILE_PATH="coverage/filetranslate$agent.coverage" NAME="agent 覆盖结果" MODIFIED="1746805293987" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/Agents" /> <SUITE FILE_PATH="coverage/filetranslate$agent.coverage" NAME="agent 覆盖结果" MODIFIED="1746805293987" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/Agents" />
<SUITE FILE_PATH="coverage/filetranslate$PDFtranslater__2_.coverage" NAME="PDFtranslater (2) 覆盖结果" MODIFIED="1746679546680" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/filetranslate_packages" /> <SUITE FILE_PATH="coverage/filetranslate$PDFtranslater__2_.coverage" NAME="PDFtranslater (2) 覆盖结果" MODIFIED="1746679546680" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/filetranslate_packages" />
<SUITE FILE_PATH="coverage/PDFtranslate$test.coverage" NAME="test 覆盖结果" MODIFIED="1746629433597" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" /> <SUITE FILE_PATH="coverage/PDFtranslate$test.coverage" NAME="test 覆盖结果" MODIFIED="1746629433597" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
<SUITE FILE_PATH="coverage/filetranslate$2test2__1_.coverage" NAME="2test2 (1) 覆盖结果" MODIFIED="1747579915531" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" /> <SUITE FILE_PATH="coverage/filetranslate$2test2__1_.coverage" NAME="2test2 (1) 覆盖结果" MODIFIED="1747722801777" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
<SUITE FILE_PATH="coverage/filetranslate$agent_utils.coverage" NAME="agent_utils 覆盖结果" MODIFIED="1746708534311" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/utils" /> <SUITE FILE_PATH="coverage/filetranslate$agent_utils.coverage" NAME="agent_utils 覆盖结果" MODIFIED="1746708534311" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/utils" />
<SUITE FILE_PATH="coverage/filetranslate$.coverage" NAME="切分测试 覆盖结果" MODIFIED="1747187128847" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" /> <SUITE FILE_PATH="coverage/filetranslate$.coverage" NAME="切分测试 覆盖结果" MODIFIED="1747187128847" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
<SUITE FILE_PATH="coverage/filetranslate$test1.coverage" NAME="test1 覆盖结果" MODIFIED="1746936018440" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" /> <SUITE FILE_PATH="coverage/filetranslate$test1.coverage" NAME="test1 覆盖结果" MODIFIED="1746936018440" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
@@ -601,6 +653,7 @@
<SUITE FILE_PATH="coverage/PDFtranslate$markdown_utils.coverage" NAME="markdown_utils 覆盖结果" MODIFIED="1746598797872" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages/utils" /> <SUITE FILE_PATH="coverage/PDFtranslate$markdown_utils.coverage" NAME="markdown_utils 覆盖结果" MODIFIED="1746598797872" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages/utils" />
<SUITE FILE_PATH="coverage/filetranslate$test2.coverage" NAME="test2 覆盖结果" MODIFIED="1747553897731" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" /> <SUITE FILE_PATH="coverage/filetranslate$test2.coverage" NAME="test2 覆盖结果" MODIFIED="1747553897731" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
<SUITE FILE_PATH="coverage/filetranslate$translater.coverage" NAME="translater 覆盖结果" MODIFIED="1746843159560" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate" /> <SUITE FILE_PATH="coverage/filetranslate$translater.coverage" NAME="translater 覆盖结果" MODIFIED="1746843159560" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate" />
<SUITE FILE_PATH="coverage/filetranslate$docling_utils.coverage" NAME="docling_utils 覆盖结果" MODIFIED="1747710836730" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/utils" />
<SUITE FILE_PATH="coverage/PDFtranslate$.coverage" NAME=" 覆盖结果" MODIFIED="1746588350286" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/decorator" /> <SUITE FILE_PATH="coverage/PDFtranslate$.coverage" NAME=" 覆盖结果" MODIFIED="1746588350286" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/decorator" />
</component> </component>
</project> </project>

View File

@@ -4,12 +4,15 @@
[![image](https://img.shields.io/badge/github-DocuTranslate-blue)](https://github.com/xunbu/docutranslate) [![image](https://img.shields.io/badge/github-DocuTranslate-blue)](https://github.com/xunbu/docutranslate)
文件翻译工具,借助[docling](https://github.com/docling-project/docling)与大语言模型实现多种格式文件的翻译 文件翻译工具,借助[docling](https://github.com/docling-project/docling)、[minerU](https://mineru.net/)与大语言模型实现多种格式文件的翻译
> QQ交流群1047781902
# 整合包 # 整合包
对于只使用基本翻译功能的用户,可以在[github releases](https://github.com/xunbu/docutranslate/releases) 对于只使用基本翻译功能的用户,可以在[github releases](https://github.com/xunbu/docutranslate/releases)
上下载最新的整合包该整合包点击即用您所需的只是获取某个ai平台的api-key。 上下载最新的整合包该整合包点击即用您所需的只是获取某个ai平台的api-key。
以及可以在mineru申请token进行pdf识别【可选】
# 安装 # 安装
@@ -34,7 +37,16 @@
# 前置条件 # 前置条件
## huggingface换源 本翻译工具的翻译流程总体如下:
1. 使用文本转换引擎将文档转换成 markdown,有 docling(本地)和 minerU(联网)两种引擎可选。
2. 使用大语言模型翻译 markdown 文本(需要申请 api-key 或本地部署模型)。
## 使用docling引擎注意事项
使用 docling 将文档转换为 markdown 时,需要先将模型下载到本地(也可以提前下载,见 FAQ),因此可能会遇到一些网络问题。
### huggingface换源
> 不能科学上网的友友注意了 > 不能科学上网的友友注意了
@@ -43,12 +55,12 @@
- 第一次读取非markdown文本 - 第一次读取非markdown文本
- 第一次使用公式识别或代码识别功能 - 第一次使用公式识别或代码识别功能
### 方法1 #### 方法1
设置电脑的环境变量(记得设置后重启IDE) 设置电脑的环境变量(记得设置后重启IDE)
`HF_ENDPOINT=https://hf-mirror.com` `HF_ENDPOINT=https://hf-mirror.com`
### 方法2 #### 方法2
在代码开头设置环境变量 在代码开头设置环境变量
@@ -60,6 +72,13 @@ os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
###其余代码写在下方 ###其余代码写在下方
``` ```
## 使用minerU引擎注意事项
使用 minerU 将文档转换为 markdown 时,需要先在 minerU 平台申请 token:
1. 打开[minerU官网](https://mineru.net/apiManage/docs)申请token
2. 申请成功后,在[API Token管理界面](https://mineru.net/apiManage/token)创建API Token
## 获取大模型平台的baseurl、key、model-id ## 获取大模型平台的baseurl、key、model-id
由于需要使用大语言模型进行markdown调整与翻译所以需要预先获取模型的baseurl、key、model-id 由于需要使用大语言模型进行markdown调整与翻译所以需要预先获取模型的baseurl、key、model-id
@@ -90,7 +109,12 @@ from docutranslate.translater import FileTranslater
translater = FileTranslater(base_url="<baseurl>", translater = FileTranslater(base_url="<baseurl>",
key="<key>", key="<key>",
model_id="<model-id>") model_id="<model-id>",
convert_engin="docling" # 默认使用docling
# convert_engin="mineru",# 使用mineru
# mineru_token="<申请的mineru_token>"#使用mineru时必填
)
# 不开启公式、代码识别默认输出为markdown文件 # 不开启公式、代码识别默认输出为markdown文件
translater.translate_file("<文件路径>", to_lang="中文") translater.translate_file("<文件路径>", to_lang="中文")
@@ -141,12 +165,14 @@ translater.read_file("<文件路径>").save_as_markdown()
from docutranslate import FileTranslater from docutranslate import FileTranslater
translater = FileTranslater(base_url="<baseurl>", # 默认的模型baseurl translater = FileTranslater(base_url="<baseurl>", # 默认的模型baseurl
key="<key>", # 默认的模型api-key key="<api-key>", # 默认的大语言模型平台api-key
model_id="<model-id>", # 默认的模型id model_id="<model-id>", # 默认的模型id
chunksize=2000, # markdown分块长度单位byte分块越大效果越好也越慢不建议超过8000 chunksize=2000, # markdown分块长度单位byte分块越大效果越好也越慢不建议超过8000
max_concurrent=20, # 并发数受到ai平台并发量限制如果文章很长建议适当加大到20以上 max_concurrent=20, # 并发数受到ai平台并发量限制如果文章很长建议适当加大到20以上
docling_artifact=None, # 使用提前下载好的docling模型
timeout=2000, # 调用api的超时时间 timeout=2000, # 调用api的超时时间
docling_artifact=None, # 使用提前下载好的docling模型
convert_engin="mineru", # 可选docling或minerU
mineru_token="<mineru-token>", # minerU的token使用minerU时必填
tips=True # 开场提示 tips=True # 开场提示
) )
@@ -206,7 +232,8 @@ from docutranslate.utils.docling_utils import get_docling_artifacts
print(get_docling_artifacts()) # 会显示模型下载文件夹,通常在`C:\Users\<user>\.cache\docling\models` print(get_docling_artifacts()) # 会显示模型下载文件夹,通常在`C:\Users\<user>\.cache\docling\models`
``` ```
> 创建FileTranslater时携带模型文件夹即可 > 将模型文件夹命名为docling_artifact放置在项目下
> 或创建FileTranslater时docling_artifact参数设置为文件夹位置
```python ```python
from docutranslate import FileTranslater from docutranslate import FileTranslater

View File

@@ -11,7 +11,7 @@ from fastapi import FastAPI, File, Form, UploadFile, Request, HTTPException
from fastapi.responses import HTMLResponse, JSONResponse, StreamingResponse,FileResponse from fastapi.responses import HTMLResponse, JSONResponse, StreamingResponse,FileResponse
from fastapi.templating import Jinja2Templates from fastapi.templating import Jinja2Templates
from fastapi.staticfiles import StaticFiles from fastapi.staticfiles import StaticFiles
from docutranslate import FileTranslater from docutranslate import FileTranslater # Assuming FileTranslater is in docutranslate module
from docutranslate.logger import translater_logger from docutranslate.logger import translater_logger
from docutranslate.utils.resource_utils import resource_path from docutranslate.utils.resource_utils import resource_path
@@ -19,11 +19,10 @@ app = FastAPI()
STATIC_DIR=resource_path("static") STATIC_DIR=resource_path("static")
# print(f"__file__:{Path(__file__).resolve()}")
app.mount("/static",StaticFiles(directory=STATIC_DIR), name="static") app.mount("/static",StaticFiles(directory=STATIC_DIR), name="static")
# --- 全局配置 --- # --- 全局配置 ---
log_queue: Optional[asyncio.Queue] = None # Will be initialized in startup_event log_queue: Optional[asyncio.Queue] = None
current_state: Dict[str, Any] = { current_state: Dict[str, Any] = {
"is_processing": False, "is_processing": False,
"status_message": "空闲", "status_message": "空闲",
@@ -36,9 +35,9 @@ current_state: Dict[str, Any] = {
"task_end_time": 0, "task_end_time": 0,
"current_task_ref": None, "current_task_ref": None,
} }
templates = Jinja2Templates(directory=".") templates = Jinja2Templates(directory=".") # Not strictly used if index.html is served as FileResponse
MAX_LOG_HISTORY = 200 # Max items for the persistent log_history list MAX_LOG_HISTORY = 200
log_history: List[str] = [] # Keeps a longer history, not directly for "unread" log_history: List[str] = []
# --- 日志处理器 --- # --- 日志处理器 ---
@@ -51,25 +50,20 @@ class QueueAndHistoryHandler(logging.Handler):
def emit(self, record: logging.LogRecord): def emit(self, record: logging.LogRecord):
log_entry = self.format(record) log_entry = self.format(record)
print(log_entry) # Keep console log for server visibility
# Add to the persistent history (capped)
self.history_list.append(log_entry) self.history_list.append(log_entry)
if len(self.history_list) > self.max_history: if len(self.history_list) > self.max_history:
del self.history_list[:len(self.history_list) - self.max_history] del self.history_list[:len(self.history_list) - self.max_history]
# Add to the "unread" queue for frontend consumption
try:
# Ensure self.queue is not None (it's initialized at startup)
if self.queue is not None: if self.queue is not None:
try:
main_loop = getattr(app.state, "main_event_loop", None) main_loop = getattr(app.state, "main_event_loop", None)
if main_loop and main_loop.is_running(): if main_loop and main_loop.is_running():
main_loop.call_soon_threadsafe(self.queue.put_nowait, log_entry) main_loop.call_soon_threadsafe(self.queue.put_nowait, log_entry)
else: else:
self.queue.put_nowait(log_entry) # Fallback self.queue.put_nowait(log_entry)
else:
print(f"CRITICAL: Log queue not initialized. Log: {log_entry}")
except asyncio.QueueFull: except asyncio.QueueFull:
print(f"Log queue is full. Log dropped: {log_entry}") # Or handle differently print(f"Log queue is full. Log dropped: {log_entry}")
except Exception as e: except Exception as e:
print(f"Error putting log to queue: {e}. Log: {log_entry}") print(f"Error putting log to queue: {e}. Log: {log_entry}")
@@ -79,7 +73,7 @@ class QueueAndHistoryHandler(logging.Handler):
async def startup_event(): async def startup_event():
global log_queue global log_queue
app.state.main_event_loop = asyncio.get_running_loop() app.state.main_event_loop = asyncio.get_running_loop()
log_queue = asyncio.Queue() # Initialize the global log_queue log_queue = asyncio.Queue()
for handler in translater_logger.handlers[:]: for handler in translater_logger.handlers[:]:
translater_logger.removeHandler(handler) translater_logger.removeHandler(handler)
@@ -93,7 +87,7 @@ async def startup_event():
translater_logger.setLevel(logging.INFO) translater_logger.setLevel(logging.INFO)
log_history.clear() log_history.clear()
while not log_queue.empty(): # Clear queue just in case while not log_queue.empty():
try: try:
log_queue.get_nowait() log_queue.get_nowait()
except asyncio.QueueEmpty: except asyncio.QueueEmpty:
@@ -112,6 +106,7 @@ async def _perform_translation(params: Dict[str, Any], file_contents: bytes, ori
try: try:
translater_logger.info(f"使用 Base URL: {params['base_url']}, Model: {params['model_id']}") translater_logger.info(f"使用 Base URL: {params['base_url']}, Model: {params['model_id']}")
translater_logger.info(f"文件大小: {len(file_contents)} 字节。目标语言: {params['to_lang']}") translater_logger.info(f"文件大小: {len(file_contents)} 字节。目标语言: {params['to_lang']}")
translater_logger.info(f"使用转换引擎: {params['convert_engin']}")
translater_logger.info( translater_logger.info(
f"选项 - 公式: {params['formula_ocr']}, 代码: {params['code_ocr']}, 修正: {params['refine_markdown']}") f"选项 - 公式: {params['formula_ocr']}, 代码: {params['code_ocr']}, 修正: {params['refine_markdown']}")
@@ -119,7 +114,9 @@ async def _perform_translation(params: Dict[str, Any], file_contents: bytes, ori
base_url=params['base_url'], base_url=params['base_url'],
key=params['apikey'], key=params['apikey'],
model_id=params['model_id'], model_id=params['model_id'],
tips=False convert_engin=params['convert_engin'],
mineru_token=params['mineru_token'],
tips=False # Assuming tips are not needed for server-side processing
) )
await ft.translate_bytes_async( await ft.translate_bytes_async(
name=original_filename, name=original_filename,
@@ -152,7 +149,7 @@ async def _perform_translation(params: Dict[str, Any], file_contents: bytes, ori
translater_logger.info(f"翻译任务 '{original_filename}' 已被取消 (用时 {duration:.2f} 秒).") translater_logger.info(f"翻译任务 '{original_filename}' 已被取消 (用时 {duration:.2f} 秒).")
current_state.update({ current_state.update({
"status_message": f"翻译任务已取消(若有转换任务仍会后台进行) (用时 {duration:.2f} 秒).", "status_message": f"翻译任务已取消(若有转换任务仍会后台进行) (用时 {duration:.2f} 秒).",
"error_flag": False, # Cancellation is not an error in this context "error_flag": False,
"download_ready": False, "download_ready": False,
"markdown_content": None, "markdown_content": None,
"html_content": None, "html_content": None,
@@ -180,11 +177,25 @@ async def _perform_translation(params: Dict[str, Any], file_contents: bytes, ori
# --- API Endpoints --- # --- API Endpoints ---
@app.get("/", response_class=HTMLResponse) @app.get("/", response_class=HTMLResponse)
async def main_page(request: Request): async def main_page(request: Request):
return FileResponse(STATIC_DIR/"index.html") # Serve index.html from the static directory or root project directory
# Assuming index.html is at the same level as app.py or in STATIC_DIR
# For simplicity, if index.html is at root:
# return FileResponse(Path(__file__).parent / "index.html")
# If using Jinja2Templates and index.html is in "templates" folder:
# return templates.TemplateResponse("index.html", {"request": request})
# Using FileResponse for index.html directly:
index_path = Path("index.html") # Adjust if index.html is elsewhere
if not index_path.exists():
# Fallback to static dir if not in root
index_path = STATIC_DIR / "index.html"
if not index_path.exists():
raise HTTPException(status_code=404, detail="index.html not found")
return FileResponse(index_path)
@app.post("/translate") @app.post("/translate")
async def handle_translate( async def handle_translate(
request: Request, # Added request for potential future use, not strictly needed now
base_url: str = Form(...), base_url: str = Form(...),
apikey: str = Form(...), apikey: str = Form(...),
model_id: str = Form(...), model_id: str = Form(...),
@@ -192,6 +203,8 @@ async def handle_translate(
formula_ocr: bool = Form(False), formula_ocr: bool = Form(False),
code_ocr: bool = Form(False), code_ocr: bool = Form(False),
refine_markdown: bool = Form(False), refine_markdown: bool = Form(False),
convert_engin: str = Form(...), # New parameter
mineru_token: Optional[str] = Form(None), # New parameter
file: UploadFile = File(...) file: UploadFile = File(...)
): ):
global current_state, log_queue, log_history global current_state, log_queue, log_history
@@ -209,6 +222,12 @@ async def handle_translate(
content={"task_started": False, "message": "没有选择文件或文件无效。"} content={"task_started": False, "message": "没有选择文件或文件无效。"}
) )
if convert_engin == "mineru" and (not mineru_token or not mineru_token.strip()):
return JSONResponse(
status_code=400,
content={"task_started": False, "message": "使用 Mineru 引擎时必须提供有效的 Mineru Token。"}
)
current_state["is_processing"] = True current_state["is_processing"] = True
original_filename_for_init = file.filename or "uploaded_file" original_filename_for_init = file.filename or "uploaded_file"
@@ -224,26 +243,22 @@ async def handle_translate(
"current_task_ref": None, "current_task_ref": None,
}) })
# Clear logs for the new task
log_history.clear() log_history.clear()
if log_queue: # Ensure log_queue is initialized if log_queue:
while not log_queue.empty(): while not log_queue.empty():
try: try:
log_queue.get_nowait() log_queue.get_nowait()
except asyncio.QueueEmpty: except asyncio.QueueEmpty:
break break
# Add initial log entry for the new task
# We create a LogRecord manually to ensure it goes through the formatter and handler
initial_log_msg = f"收到新的翻译请求: {original_filename_for_init}" initial_log_msg = f"收到新的翻译请求: {original_filename_for_init}"
if translater_logger.handlers and isinstance(translater_logger.handlers[0], QueueAndHistoryHandler): if translater_logger.handlers and isinstance(translater_logger.handlers[0], QueueAndHistoryHandler):
# Use the existing handler to format and queue/store the log
record = logging.LogRecord( record = logging.LogRecord(
name=translater_logger.name, level=logging.INFO, pathname="", lineno=0, name=translater_logger.name, level=logging.INFO, pathname="", lineno=0,
msg=initial_log_msg, args=(), exc_info=None, func="" msg=initial_log_msg, args=(), exc_info=None, func=""
) )
translater_logger.handlers[0].emit(record) # This will add to both queue and history translater_logger.handlers[0].emit(record)
else: # Fallback if handler setup is unusual else:
translater_logger.info(initial_log_msg) translater_logger.info(initial_log_msg)
try: try:
@@ -255,6 +270,8 @@ async def handle_translate(
"base_url": base_url, "apikey": apikey, "model_id": model_id, "base_url": base_url, "apikey": apikey, "model_id": model_id,
"to_lang": to_lang, "formula_ocr": formula_ocr, "to_lang": to_lang, "formula_ocr": formula_ocr,
"code_ocr": code_ocr, "refine_markdown": refine_markdown, "code_ocr": code_ocr, "refine_markdown": refine_markdown,
"convert_engin": convert_engin, # Pass to task
"mineru_token": mineru_token, # Pass to task
} }
loop = asyncio.get_running_loop() loop = asyncio.get_running_loop()
@@ -332,18 +349,17 @@ async def get_status():
@app.get("/get-logs") @app.get("/get-logs")
async def get_logs_from_queue(): # Renamed for clarity, though path is the same async def get_logs_from_queue():
global log_queue global log_queue
new_logs = [] new_logs = []
if log_queue: # Ensure log_queue is initialized if log_queue:
while not log_queue.empty(): while not log_queue.empty():
try: try:
log_entry = log_queue.get_nowait() # Consume from queue log_entry = log_queue.get_nowait()
new_logs.append(log_entry) new_logs.append(log_entry)
log_queue.task_done() # Important for queue management if using join() elsewhere log_queue.task_done()
except asyncio.QueueEmpty: except asyncio.QueueEmpty:
break break
# No total_count, as the frontend just appends what it receives
return JSONResponse(content={"logs": new_logs}) return JSONResponse(content={"logs": new_logs})
@@ -384,10 +400,11 @@ async def download_html(filename_with_ext: str):
def run_app(): def run_app():
print("正在启动 DocuTranslate") print("正在启动 DocuTranslate WebUI")
print("请访问 http://127.0.0.1:8010") print("请访问 http://127.0.0.1:8010")
uvicorn.run(app, host="127.0.0.1", port=8010, workers=1) uvicorn.run(app, host="127.0.0.1", port=8010, workers=1)
if __name__ == "__main__": if __name__ == "__main__":
run_app() run_app()

View File

@@ -0,0 +1,3 @@
from .converter import Document,Converter
from .converter_mineru import ConverterMineru
from .converter_docling import ConverterDocling

View File

@@ -0,0 +1,25 @@
from typing import Protocol
from pathlib import Path
class Document:
def __init__(self,path:Path|str=None,filename:str=None,filebytes:bytes=None):
if path is None and (filename is None or filebytes is None):
raise Exception("Document的路径或filename、filebytes不能同时为空")
self.filebytes = filebytes
self.filename = filename
self.path = path
if path:
if isinstance(path,str):
path=Path(path)
self.path=path
self.filename=path.name
self.filebytes=path.read_bytes()
class Converter(Protocol):
    """Structural interface for engines that turn a ``Document`` into markdown.

    Any class providing both methods below satisfies this protocol; the
    method bodies here are placeholders only.
    """

    # Convert the document to markdown.
    def convert(self,document:Document)->str:
        """Return the markdown text produced from ``document`` (blocking call)."""
        ...

    async def convert_async(self,document:Document)->str:
        """Awaitable counterpart of ``convert`` with the same argument and result."""
        ...

View File

@@ -0,0 +1,80 @@
import os
import time
from io import BytesIO
from pathlib import Path
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import DocumentStream
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling_core.types.doc import ImageRefMode
from huggingface_hub.errors import LocalEntryNotFoundError
from docutranslate.logger import translater_logger
from docutranslate.converter import Converter, Document
import asyncio
IMAGE_RESOLUTION_SCALE = 4
def file2markdown_embed_images(file_path: Path | str | DocumentStream, formula=False, code=False,
                               artifacts_path: Path | str | None = None) -> str:
    """Convert a file to markdown with docling, embedding pictures in the text.

    Args:
        file_path: Source to convert; a path or a docling ``DocumentStream``.
        formula: When truthy, switch on docling's formula enrichment.
        code: When truthy, switch on docling's code enrichment.
        artifacts_path: Passed to ``PdfPipelineOptions`` as ``artifacts_path``.

    Returns:
        The markdown exported with ``ImageRefMode.EMBEDDED``.
    """
    options = PdfPipelineOptions(artifacts_path=artifacts_path)
    options.do_ocr = False
    options.images_scale = IMAGE_RESOLUTION_SCALE
    options.generate_picture_images = True
    options.table_structure_options.do_cell_matching = False
    # The enrichment flags are only ever turned on here; when the arguments
    # are falsy the pipeline defaults are left untouched.
    if formula:
        options.do_formula_enrichment = True
    if code:
        options.do_code_enrichment = True

    # Ask docling to record pipeline timings.
    settings.debug.profile_pipeline_timings = True
    doc_converter = DocumentConverter(format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=options)
    })

    def _export_markdown() -> str:
        # One full conversion followed by the markdown export.
        converted = doc_converter.convert(file_path)
        return converted.document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED)

    try:
        return _export_markdown()
    except LocalEntryNotFoundError:
        # Point HF_ENDPOINT at the mirror and try exactly one more time.
        translater_logger.info(f"无法连接huggingface正在尝试换源")
        os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
        return _export_markdown()
class ConverterDocling(Converter):
    """Converter that produces markdown through ``file2markdown_embed_images``."""

    def __init__(self, code=True, formula=True, artifact=None):
        """Store the conversion switches.

        Args:
            code: Forwarded as the ``code`` flag of the conversion.
            formula: Forwarded as the ``formula`` flag of the conversion.
            artifact: Forwarded as ``artifacts_path`` of the conversion.
        """
        self.code, self.formula, self.artifact = code, formula, artifact

    def convert(self, document):
        """Convert ``document`` to markdown and log how long it took."""
        assert isinstance(document.filename, str)
        translater_logger.info(f"正在将文档转换为markdown")
        started_at = time.time()
        # The document bytes are wrapped in an in-memory stream for docling.
        stream = DocumentStream(name=document.filename, stream=BytesIO(document.filebytes))
        markdown = file2markdown_embed_images(
            stream,
            formula=self.formula,
            code=self.code,
            artifacts_path=self.artifact,
        )
        elapsed = time.time() - started_at
        translater_logger.info(f"已转换为markdown耗时{elapsed}")
        return markdown

    async def convert_async(self, document: Document) -> str:
        """Run the blocking ``convert`` in a worker thread and await its result."""
        markdown = await asyncio.to_thread(self.convert, document)
        return markdown
if __name__ == '__main__':
pass

View File

@@ -0,0 +1,220 @@
import asyncio
import base64
import io
import mimetypes
import os
import re
import time
import zipfile
import httpx
from docutranslate.converter import Converter, Document
from docutranslate.logger import translater_logger
URL = 'https://mineru.net/api/v4/file-urls/batch'
client=httpx.Client(trust_env=False)
#TODO: 提供更详细的logger
class ConverterMineru(Converter):
    """Converter that turns a document into markdown through the minerU web API.

    The flow is: request an upload URL, PUT the file bytes to it, poll the
    batch result until extraction is done, then download the result ZIP and
    read its markdown with images inlined.
    """

    # Seconds to wait between two polls of the extraction result.
    POLL_INTERVAL_SECONDS = 3

    def __init__(self, token: str, formula=True):
        """Store the API token and options.

        Args:
            token: minerU API token; surrounding whitespace is stripped.
            formula: Sent to the API as ``enable_formula``.
        """
        self.mineru_token = token.strip()
        # NOTE(review): no method in this class uses this client yet and it is
        # never closed; kept so the attribute remains available to callers.
        self.client_async = httpx.AsyncClient()
        self.formula = formula

    def _get_header(self):
        """Return the JSON request headers carrying the bearer token."""
        return {
            'Content-Type': 'application/json',
            "Authorization": f"Bearer {self.mineru_token}"
        }

    def _get_upload_data(self, document: Document):
        """Return the request payload that asks for an upload URL for ``document``."""
        return {
            "enable_formula": self.formula,
            "language": "auto",
            "enable_table": True,
            "files": [
                {"name": f"{document.filename}", "is_ocr": True}
            ]
        }

    def upload(self, document: Document):
        """Upload ``document`` to minerU and return the batch id of the task.

        Raises:
            httpx.HTTPStatusError: If either HTTP request returns an error status.
            RuntimeError: If the API answers with a non-zero ``code``.
        """
        # Step 1: ask the API for a one-off upload URL.
        response = client.post(URL, headers=self._get_header(), json=self._get_upload_data(document))
        response.raise_for_status()
        result = response.json()
        if result["code"] != 0:
            # ``result`` is a plain dict, so the message must be read by key.
            raise RuntimeError('apply upload url failed,reason:{}'.format(result.get("msg")))
        batch_id = result["data"]["batch_id"]
        urls = result["data"]["file_urls"]
        # Step 2: send the raw file bytes to the returned URL.
        res_upload = client.put(urls[0], content=document.filebytes)
        res_upload.raise_for_status()
        return batch_id

    def get_file_url(self, batch_id: str) -> str:
        """Poll the batch result until it is done and return the result ZIP URL.

        Raises:
            httpx.HTTPStatusError: If a poll request returns an error status.
            RuntimeError: If the task reports the ``failed`` state.
        """
        url = f'https://mineru.net/api/v4/extract-results/batch/{batch_id}'
        header = self._get_header()
        while True:
            res = client.get(url, headers=header)
            res.raise_for_status()
            fileinfo = res.json()["data"]["extract_result"][0]
            state = fileinfo["state"]
            if state == "done":
                return fileinfo["full_zip_url"]
            # Without this check a failed extraction would be polled forever.
            # NOTE(review): the "failed" state and "err_msg" field follow the
            # minerU API docs as recalled; confirm against the current docs.
            if state == "failed":
                raise RuntimeError(
                    'mineru extract failed,reason:{}'.format(fileinfo.get("err_msg")))
            time.sleep(self.POLL_INTERVAL_SECONDS)

    def convert(self, document: Document) -> str:
        """Convert ``document`` to markdown via minerU (blocking).

        Raises:
            RuntimeError: If the upload is rejected, the extraction fails, or
                the markdown cannot be read from the result ZIP.
            httpx.HTTPStatusError: If an API request returns an error status.
        """
        translater_logger.info(f"正在将文档转换为markdown")
        time1 = time.time()
        batch_id = self.upload(document)
        file_url = self.get_file_url(batch_id)
        result = get_md_from_zip_url_with_inline_images(zip_url=file_url)
        if result is None:
            # The ZIP helper signals every failure by returning None; fail here
            # rather than hand None to callers that expect markdown text.
            raise RuntimeError('failed to read markdown from mineru result zip')
        translater_logger.info(f"已转换为markdown耗时{time.time()-time1}")
        return result

    # TODO: implement finer-grained coroutines instead of one worker thread.
    async def convert_async(self, document: Document) -> str:
        """Run the blocking ``convert`` in a worker thread and await its result."""
        return await asyncio.to_thread(
            self.convert,
            document
        )
# Markdown image syntax: ![alt text](path/to/image.ext), both groups non-greedy.
_MD_IMAGE_PATTERN = re.compile(r"!\[(.*?)\]\((.*?)\)")

# Extension-based fallback used when ``mimetypes`` cannot identify an image.
_FALLBACK_IMAGE_MIME_TYPES = {
    '.png': 'image/png',
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.gif': 'image/gif',
    '.svg': 'image/svg+xml',
    '.webp': 'image/webp',
}


def _guess_image_mime_type(image_path: str) -> str | None:
    """Return the MIME type for ``image_path``, or None when it is unknown."""
    mime_type, _ = mimetypes.guess_type(image_path)
    if mime_type:
        return mime_type
    ext = os.path.splitext(image_path)[1].lower()
    return _FALLBACK_IMAGE_MIME_TYPES.get(ext)


def _inline_markdown_images(md_text: str, archive: zipfile.ZipFile, base_dir: str) -> str:
    """Replace relative image links in ``md_text`` with base64 data URIs.

    Image paths are resolved relative to ``base_dir`` inside ``archive``.
    Links that cannot be inlined are left exactly as they were.
    """

    def replace_image_with_base64(match):
        alt_text = match.group(1)
        original_image_path = match.group(2)

        # External links and already-inlined images are kept untouched.
        if original_image_path.startswith(('http://', 'https://', 'data:')):
            translater_logger.debug(f"跳过外部或已内联图片: {original_image_path}")
            return match.group(0)

        # ZIP member names use forward slashes relative to the archive root.
        # normpath also drops any leading "./", so no extra stripping is needed.
        image_path_in_zip = os.path.normpath(
            os.path.join(base_dir, original_image_path)).replace(os.sep, '/')

        try:
            image_bytes = archive.read(image_path_in_zip)
            mime_type = _guess_image_mime_type(image_path_in_zip)
            if not mime_type:
                translater_logger.warning(
                    f"警告: 无法确定图片 '{image_path_in_zip}' 的MIME类型。跳过内联。")
                return match.group(0)
            base64_encoded_data = base64.b64encode(image_bytes).decode('utf-8')
            return f"![{alt_text}](data:{mime_type};base64,{base64_encoded_data})"
        except KeyError:
            # ZipFile.read raises KeyError for a member that does not exist.
            translater_logger.warning(
                f"警告: 图片 '{image_path_in_zip}' 在ZIP压缩包中未找到。原始链接将被保留。")
            return match.group(0)
        except Exception as e_img:
            # Best effort: one broken image must not abort the whole document.
            translater_logger.error(
                f"错误: 处理图片 '{image_path_in_zip}' 时发生错误: {e_img}。原始链接将被保留。")
            return match.group(0)

    return _MD_IMAGE_PATTERN.sub(replace_image_with_base64, md_text)


def get_md_from_zip_url_with_inline_images(
        zip_url: str,
        filename_in_zip: str = "full.md",
        encoding: str = "utf-8"
) -> str | None:
    """Download a ZIP and return one markdown file from it with images inlined.

    Relative image links in the markdown are replaced by base64 data URIs
    built from the image files stored in the same archive. Diagnostics go to
    ``translater_logger`` so they appear wherever the rest of the log does.

    Args:
        zip_url: Download URL of the ZIP file.
        filename_in_zip: Name (including path) of the markdown file inside
            the archive. Defaults to "full.md".
        encoding: Expected text encoding of the markdown file. Defaults to "utf-8".

    Returns:
        The processed markdown text, or None when the download, the archive
        lookup or the decoding fails (the reason is logged).
    """
    try:
        translater_logger.debug(f"正在从 {zip_url} 下载ZIP文件 (使用 httpx.get)...")
        response = client.get(zip_url, timeout=60.0)  # generous timeout for large archives
        response.raise_for_status()
        translater_logger.debug("ZIP文件下载完成。")

        with zipfile.ZipFile(io.BytesIO(response.content), 'r') as archive:
            if filename_in_zip not in archive.namelist():
                translater_logger.error(f"错误: 文件 '{filename_in_zip}' 在ZIP压缩包中未找到。")
                translater_logger.error(f"压缩包中的可用文件列表: {archive.namelist()}")
                return None

            md_content_text = archive.read(filename_in_zip).decode(encoding)

            # Image links are relative to the markdown file's own directory in
            # the archive: "docs/guide/full.md" -> "docs/guide", "full.md" -> "".
            base_md_path_in_zip = os.path.dirname(filename_in_zip)
            modified_md_content = _inline_markdown_images(
                md_content_text, archive, base_md_path_in_zip)
            translater_logger.debug("图片处理完成。")
            return modified_md_content

    except httpx.HTTPStatusError as e:
        translater_logger.error(f"HTTP 错误 (httpx): {e.response.status_code} - {e.request.url}")
        translater_logger.error(f"响应内容: {e.response.text[:200]}...")
        return None
    except httpx.RequestError as e:
        translater_logger.error(f"下载ZIP文件时发生错误 (httpx): {e}")
        return None
    except zipfile.BadZipFile:
        translater_logger.error("错误: 下载的文件不是一个有效的ZIP压缩文件或已损坏。")
        return None
    except UnicodeDecodeError:
        translater_logger.error(f"错误: 无法使用 '{encoding}' 编码解码文件 '{filename_in_zip}' 的内容。")
        translater_logger.error("请尝试其他编码,如 'gbk', 'latin1' 等,或确认文件本身的编码。")
        return None
    except Exception as e:
        # logger.exception records the full traceback for debugging.
        translater_logger.exception(f"发生未知错误: {e}")
        return None
if __name__ == '__main__':
pass

View File

@@ -28,11 +28,11 @@
} }
.error-message { .error-message {
color: #d32f2f; color: #d32f2f; /* Pico invalid color */
} }
.success-message { .success-message {
color: #2e7d32; color: #2e7d32; /* Pico valid color */
} }
.form-group { .form-group {
@@ -65,9 +65,15 @@
.checkbox-group { .checkbox-group {
display: flex; display: flex;
flex-wrap: wrap; flex-wrap: wrap;
gap: 1rem; /* Added gap for better spacing */
margin-bottom: 1rem; margin-bottom: 1rem;
} }
.checkbox-group label { /* Ensure checkboxes are aligned */
margin-right: 10px;
}
#resultArea { #resultArea {
margin-top: 1.5rem; margin-top: 1.5rem;
padding-top: 1rem; padding-top: 1rem;
@@ -116,7 +122,6 @@
display: none; display: none;
} }
/* Styles for drag and drop area */
#fileDropArea { #fileDropArea {
border: 2px dashed #ccc; border: 2px dashed #ccc;
padding: 20px; padding: 20px;
@@ -126,16 +131,16 @@
} }
#fileDropArea.drag-over { #fileDropArea.drag-over {
border-color: #1095c1; /* Pico primary color (定量替换 var(--pico-primary-focus)) */ border-color: #1095c1;
background-color: #e7f5fa; /* Pico primary background (定量替换 var(--pico-primary-background)) */ background-color: #e7f5fa;
} }
#fileDropArea.file-selected { #fileDropArea.file-selected {
border-color: #2e7d32; /* Pico success color (定量替换 var(--pico-form-element-valid-border-color, #2e7d32)) */ border-color: #2e7d32;
background-color: #e8f5e9; /* Light green (定量替换 var(--pico-form-element-valid-background-color, #e8f5e9)) */ background-color: #e8f5e9;
} }
#fileDropArea p { /* General style for <p> inside drop area */ #fileDropArea p {
margin: 0.5rem 0; margin: 0.5rem 0;
color: #555; color: #555;
} }
@@ -149,19 +154,18 @@
#fileNameDisplay.has-file { #fileNameDisplay.has-file {
font-style: normal; font-style: normal;
font-weight: bold; font-weight: bold;
color: #1a531d; /* Darker green or success color (定量替换 var(--pico-form-element-valid-border-color, #1a531d)) */ color: #1a531d;
} }
#fileDropArea.input-error { #fileDropArea.input-error, input.input-error, select.input-error { /* Extended to input/select */
border-color: #d32f2f !important; /* (定量替换 var(--pico-form-element-invalid-border-color, #d32f2f)) */ border-color: #d32f2f !important;
} }
#fileNameDisplay.input-error-text { #fileNameDisplay.input-error-text {
color: #d32f2f !important; /* (定量替换 var(--pico-form-element-invalid-border-color, #d32f2f)) */ color: #d32f2f !important;
font-weight: bold; font-weight: bold;
} }
@media (max-width: 768px) { @media (max-width: 768px) {
.form-grid { .form-grid {
grid-template-columns: 1fr; grid-template-columns: 1fr;
@@ -176,7 +180,6 @@
</h1> </h1>
<form id="translateForm"> <form id="translateForm">
<!-- Modified File Input Area -->
<div class="form-group"> <div class="form-group">
<label for="file">文档选择</label> <label for="file">文档选择</label>
<div id="fileDropArea"> <div id="fileDropArea">
@@ -206,17 +209,34 @@
</select> </select>
</div> </div>
<div class="form-group"> <div class="form-group">
<label>高级选项</label> <label>选项</label>
<div class="checkbox-group"> <div class="checkbox-group">
<label for="formula_ocr"><input type="checkbox" id="formula_ocr" name="formula_ocr">公式识别</label> <label for="formula_ocr"><input type="checkbox" id="formula_ocr" name="formula_ocr" role="switch">公式识别</label>
<label for="code_ocr"><input type="checkbox" id="code_ocr" name="code_ocr">代码识别</label> <label for="code_ocr"><input type="checkbox" id="code_ocr" name="code_ocr"
<label for="refine_markdown"><input type="checkbox" id="refine_markdown" role="switch">代码识别</label>
name="refine_markdown">修正文本(耗时)</label> <label for="refine_markdown"><input type="checkbox" id="refine_markdown" name="refine_markdown"
role="switch">修正文本(耗时)</label>
</div> </div>
</div> </div>
</div> </div>
<details> <details>
<summary>API 配置</summary> <summary>文档转换引擎配置</summary>
<div class="form-group">
<label for="convert_engin">转换引擎</label>
<select id="convert_engin" name="convert_engin">
<option value="mineru" selected>Mineru</option>
<option value="docling">Docling</option>
</select>
</div>
<div class="form-group hidden" id="mineruTokenGroup">
<label for="mineru_token">Mineru Token</label>
<input type="password" id="mineru_token" name="mineru_token" placeholder="使用 Mineru 引擎时必须填写">
</div>
</details>
<details>
<summary>翻译API配置</summary>
<div class="form-grid"> <div class="form-grid">
<div class="form-group"> <div class="form-group">
<label for="platform_select">AI 平台</label> <label for="platform_select">AI 平台</label>
@@ -225,8 +245,7 @@
<option value="https://api.openai.com/v1">OpenAI</option> <option value="https://api.openai.com/v1">OpenAI</option>
<option value="https://open.bigmodel.cn/api/paas/v4">智谱AI</option> <option value="https://open.bigmodel.cn/api/paas/v4">智谱AI</option>
<option value="https://api.deepseek.com/v1">DeepSeek</option> <option value="https://api.deepseek.com/v1">DeepSeek</option>
<option value="https://dashscope.aliyuncs.com/compatible-mode/v1">阿里云百炼 <option value="https://dashscope.aliyuncs.com/compatible-mode/v1">阿里云百炼</option>
</option>
<option value="https://www.dmxapi.cn/v1">DMXAPI</option> <option value="https://www.dmxapi.cn/v1">DMXAPI</option>
<option value="https://openrouter.ai/api/v1">OpenRouter</option> <option value="https://openrouter.ai/api/v1">OpenRouter</option>
<option value="https://ark.cn-beijing.volces.com/api/v3">火山引擎</option> <option value="https://ark.cn-beijing.volces.com/api/v3">火山引擎</option>
@@ -235,14 +254,12 @@
</div> </div>
<div class="form-group hidden" id="baseUrlGroup"> <div class="form-group hidden" id="baseUrlGroup">
<label for="base_url">API 地址 (Base URL)</label> <label for="base_url">API 地址 (Base URL)</label>
<input type="text" id="base_url" name="base_url" <input type="text" id="base_url" name="base_url" placeholder="https://api.openai.com/v1">
placeholder="https://api.openai.com/v1">
</div> </div>
</div> </div>
<div class="form-group"> <div class="form-group">
<label for="apikey">API 密钥</label> <label for="apikey">API 密钥</label>
<input type="password" id="apikey" name="apikey" placeholder="平台对应的API Key" <input type="password" id="apikey" name="apikey" placeholder="平台对应的API Key" required>
required>
</div> </div>
<div class="form-group"> <div class="form-group">
<label for="model_id">模型 ID</label> <label for="model_id">模型 ID</label>
@@ -267,7 +284,7 @@
</main> </main>
<div id="previewModal" class="modal"> <div id="previewModal" class="modal">
<div class="modal-content"> <div class="modal-content">
<span id="closeModalBtn" style="cursor:pointer; float:right;">×</span> <span id="closeModalBtn" style="cursor:pointer; float:right; font-size: 1.5rem; line-height: 1;">×</span>
<h3>HTML 预览</h3> <h3>HTML 预览</h3>
<iframe id="previewFrame"></iframe> <iframe id="previewFrame"></iframe>
<div class="button-group"> <div class="button-group">
@@ -277,7 +294,6 @@
</div> </div>
</div> </div>
<iframe id="printFrame" style="display:none;"></iframe> <iframe id="printFrame" style="display:none;"></iframe>
<script> <script>
const platformSelect = document.getElementById('platform_select'); const platformSelect = document.getElementById('platform_select');
const baseUrlGroup = document.getElementById('baseUrlGroup'); const baseUrlGroup = document.getElementById('baseUrlGroup');
@@ -288,6 +304,11 @@
const formulaCheckbox = document.getElementById('formula_ocr'); const formulaCheckbox = document.getElementById('formula_ocr');
const codeCheckbox = document.getElementById('code_ocr'); const codeCheckbox = document.getElementById('code_ocr');
const refineCheckbox = document.getElementById('refine_markdown'); const refineCheckbox = document.getElementById('refine_markdown');
const convertEnginSelect = document.getElementById('convert_engin');
const mineruTokenGroup = document.getElementById('mineruTokenGroup');
const mineruTokenInput = document.getElementById('mineru_token');
const form = document.getElementById('translateForm'); const form = document.getElementById('translateForm');
const submitButton = document.getElementById('submitButton'); const submitButton = document.getElementById('submitButton');
const logArea = document.getElementById('logArea'); const logArea = document.getElementById('logArea');
@@ -311,7 +332,6 @@
let logPollIntervalId = null; let logPollIntervalId = null;
let statusPollIntervalId = null; let statusPollIntervalId = null;
// let lastLogCount = 0; // No longer needed for fetching logs
let isTranslating = false; let isTranslating = false;
function saveToStorage(key, value) { function saveToStorage(key, value) {
@@ -347,7 +367,34 @@
saveToStorage('translator_last_platform', selectedPlatformValue); saveToStorage('translator_last_platform', selectedPlatformValue);
} }
loadSettings(); function updateConvertEnginUI() {
const selectedEngin = convertEnginSelect.value;
if (selectedEngin === 'mineru') {
mineruTokenGroup.classList.remove('hidden');
mineruTokenInput.required = true;
mineruTokenInput.value = getFromStorage('translator_mineru_token');
} else {
mineruTokenGroup.classList.add('hidden');
mineruTokenInput.required = false;
// Optionally clear if not needed: mineruTokenInput.value = '';
}
saveToStorage('translator_convert_engin', selectedEngin);
}
/**
 * Populate the form controls from values previously saved under the
 * `translator_*` keys (read through getFromStorage).
 *
 * Each select is assigned first and its UI-sync function is called right
 * after, because those functions read the select's current value.
 */
function loadSettings() {
    platformSelect.value = getFromStorage('translator_last_platform', 'custom');
    updatePlatformUI();

    convertEnginSelect.value = getFromStorage('translator_convert_engin', 'mineru');
    updateConvertEnginUI();

    toLangSelect.value = getFromStorage('translator_to_lang', '中文');

    // Checkbox states are persisted as the strings 'true' / 'false'.
    const checkboxSettings = [
        [formulaCheckbox, 'translator_formula_ocr'],
        [codeCheckbox, 'translator_code_ocr'],
        [refineCheckbox, 'translator_refine_markdown'],
    ];
    for (const [checkbox, storageKey] of checkboxSettings) {
        checkbox.checked = getFromStorage(storageKey) === 'true';
    }
}
loadSettings(); // Initial load
platformSelect.addEventListener('change', updatePlatformUI); platformSelect.addEventListener('change', updatePlatformUI);
apikeyInput.addEventListener('input', (e) => saveToStorage(`translator_platform_${platformSelect.value}_apikey`, e.target.value)); apikeyInput.addEventListener('input', (e) => saveToStorage(`translator_platform_${platformSelect.value}_apikey`, e.target.value));
@@ -355,10 +402,14 @@
baseUrlInput.addEventListener('input', (e) => { baseUrlInput.addEventListener('input', (e) => {
if (platformSelect.value === 'custom') saveToStorage('translator_platform_custom_base_url', e.target.value); if (platformSelect.value === 'custom') saveToStorage('translator_platform_custom_base_url', e.target.value);
}); });
convertEnginSelect.addEventListener('change', updateConvertEnginUI);
mineruTokenInput.addEventListener('input', (e) => saveToStorage('translator_mineru_token', e.target.value));
toLangSelect.addEventListener('change', e => saveToStorage('translator_to_lang', e.target.value)); toLangSelect.addEventListener('change', e => saveToStorage('translator_to_lang', e.target.value));
formulaCheckbox.addEventListener('change', e => saveToStorage('translator_formula_ocr', e.target.checked)); formulaCheckbox.addEventListener('change', e => saveToStorage('translator_formula_ocr', e.target.checked.toString()));
codeCheckbox.addEventListener('change', e => saveToStorage('translator_code_ocr', e.target.checked)); codeCheckbox.addEventListener('change', e => saveToStorage('translator_code_ocr', e.target.checked.toString()));
refineCheckbox.addEventListener('change', e => saveToStorage('translator_refine_markdown', e.target.checked)); refineCheckbox.addEventListener('change', e => saveToStorage('translator_refine_markdown', e.target.checked.toString()));
[closeModalButton, closePreviewBtn].forEach(elem => elem.addEventListener('click', () => modal.style.display = 'none')); [closeModalButton, closePreviewBtn].forEach(elem => elem.addEventListener('click', () => modal.style.display = 'none'));
window.addEventListener('click', (event) => { window.addEventListener('click', (event) => {
@@ -374,9 +425,7 @@
} }
}); });
fileDropArea.addEventListener('click', () => { fileDropArea.addEventListener('click', () => fileInput.click());
fileInput.click();
});
fileInput.addEventListener('change', () => { fileInput.addEventListener('change', () => {
if (fileInput.files.length > 0) { if (fileInput.files.length > 0) {
@@ -422,7 +471,6 @@
fileDropArea.addEventListener('drop', (e) => { fileDropArea.addEventListener('drop', (e) => {
const dt = e.dataTransfer; const dt = e.dataTransfer;
const files = dt.files; const files = dt.files;
if (files.length > 0) { if (files.length > 0) {
fileInput.files = files; fileInput.files = files;
const event = new Event('change', {bubbles: true}); const event = new Event('change', {bubbles: true});
@@ -432,8 +480,7 @@
async function pollLogs() { async function pollLogs() {
try { try {
// const response = await fetch(`/get-logs?since=${lastLogCount}`); // OLD const response = await fetch('/get-logs');
const response = await fetch('/get-logs'); // NEW: No 'since' parameter
if (!response.ok) { if (!response.ok) {
console.warn(`Log polling failed: ${response.status}`); console.warn(`Log polling failed: ${response.status}`);
return; return;
@@ -444,9 +491,8 @@
const escapedLog = log.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;"); const escapedLog = log.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;");
logArea.innerHTML += escapedLog + "<br>"; logArea.innerHTML += escapedLog + "<br>";
}); });
logArea.scrollTop = logArea.scrollHeight; // Scroll to bottom logArea.scrollTop = logArea.scrollHeight;
} }
// lastLogCount = data.total_count; // OLD: No longer tracking count this way
} catch (error) { } catch (error) {
console.warn("Error polling logs:", error); console.warn("Error polling logs:", error);
} }
@@ -561,7 +607,7 @@
} else { } else {
submitButton.textContent = '取消翻译'; submitButton.textContent = '取消翻译';
submitButton.classList.remove('primary'); submitButton.classList.remove('primary');
submitButton.classList.add('secondary'); submitButton.classList.add('secondary', 'contrast'); // Using contrast for cancel
isTranslating = true; isTranslating = true;
submitButton.disabled = false; submitButton.disabled = false;
submitButton.removeAttribute('aria-busy'); submitButton.removeAttribute('aria-busy');
@@ -576,10 +622,9 @@
function startPolling() { function startPolling() {
stopPolling(); stopPolling();
// lastLogCount = 0; // No longer needed logArea.innerHTML = '';
logArea.innerHTML = ''; // Clear log area for new task pollLogs();
pollLogs(); // Initial poll for logs pollStatus();
pollStatus(); // Initial poll for status
logPollIntervalId = setInterval(pollLogs, 2000); logPollIntervalId = setInterval(pollLogs, 2000);
statusPollIntervalId = setInterval(pollStatus, 1500); statusPollIntervalId = setInterval(pollStatus, 1500);
} }
@@ -589,16 +634,7 @@
if (statusPollIntervalId) clearInterval(statusPollIntervalId); if (statusPollIntervalId) clearInterval(statusPollIntervalId);
logPollIntervalId = null; logPollIntervalId = null;
statusPollIntervalId = null; statusPollIntervalId = null;
setTimeout(pollLogs, 500); setTimeout(pollLogs, 500); // One last poll for logs
}
function loadSettings() {
platformSelect.value = getFromStorage('translator_last_platform', 'custom');
updatePlatformUI();
toLangSelect.value = getFromStorage('translator_to_lang', '中文');
formulaCheckbox.checked = getFromStorage('translator_formula_ocr') === 'true';
codeCheckbox.checked = getFromStorage('translator_code_ocr') === 'true';
refineCheckbox.checked = getFromStorage('translator_refine_markdown') === 'true';
} }
async function cancelTranslation() { async function cancelTranslation() {
@@ -609,13 +645,13 @@
try { try {
const response = await fetch('/cancel-translate', {method: 'POST'}); const response = await fetch('/cancel-translate', {method: 'POST'});
const result = await response.json(); const result = await response.json();
if (response.ok && result.cancelled) { if (response.ok && result.cancelled) {
statusMsg.textContent = result.message || '取消请求已发送。'; statusMsg.textContent = result.message || '取消请求已发送。';
statusMsg.className = ''; statusMsg.className = ''; // Clear error class
} else { } else {
statusMsg.textContent = result.message || '取消失败。'; statusMsg.textContent = result.message || '取消失败。';
statusMsg.className = 'error-message'; statusMsg.className = 'error-message';
// Re-enable button if cancellation failed to register server-side
submitButton.disabled = false; submitButton.disabled = false;
submitButton.textContent = '取消翻译'; submitButton.textContent = '取消翻译';
submitButton.removeAttribute('aria-busy'); submitButton.removeAttribute('aria-busy');
@@ -628,6 +664,7 @@
submitButton.textContent = '取消翻译'; submitButton.textContent = '取消翻译';
submitButton.removeAttribute('aria-busy'); submitButton.removeAttribute('aria-busy');
} }
// Status poller will eventually update the button state correctly
} }
form.addEventListener('submit', async function (event) { form.addEventListener('submit', async function (event) {
@@ -638,6 +675,10 @@
return; return;
} }
// Clear previous input errors
[fileDropArea, mineruTokenInput].forEach(el => el.classList.remove('input-error'));
fileNameDisplay.classList.remove('input-error-text');
if (fileInput.files.length === 0) { if (fileInput.files.length === 0) {
statusMsg.textContent = '请选择一个文件进行翻译。'; statusMsg.textContent = '请选择一个文件进行翻译。';
statusMsg.className = 'error-message'; statusMsg.className = 'error-message';
@@ -648,14 +689,18 @@
setTimeout(() => { setTimeout(() => {
fileDropArea.classList.remove('input-error'); fileDropArea.classList.remove('input-error');
fileNameDisplay.classList.remove('input-error-text'); fileNameDisplay.classList.remove('input-error-text');
if (fileNameDisplay.textContent === '请选择文件!') { if (fileNameDisplay.textContent === '请选择文件!') fileNameDisplay.textContent = '未选择文件';
fileNameDisplay.textContent = '未选择文件'; if (fileInput.files.length === 0) fileDropPrompt.classList.remove('hidden');
} }, 3000);
if (fileInput.files.length === 0) { return;
fileDropPrompt.classList.remove('hidden');
} }
}, 3000); if (convertEnginSelect.value === 'mineru' && !mineruTokenInput.value.trim()) {
statusMsg.textContent = '使用 Mineru 引擎时,必须填写 Mineru Token。';
statusMsg.className = 'error-message';
mineruTokenInput.classList.add('input-error');
mineruTokenInput.focus();
setTimeout(() => mineruTokenInput.classList.remove('input-error'), 3000);
return; return;
} }
@@ -667,9 +712,10 @@
statusMsg.textContent = '正在提交任务...'; statusMsg.textContent = '正在提交任务...';
statusMsg.className = ''; statusMsg.className = '';
downloadBtns.style.display = 'none'; downloadBtns.style.display = 'none';
// lastLogCount = 0; // No longer needed
const formData = new FormData(form); const formData = new FormData(form);
// FormData automatically includes convert_engin and mineru_token due to 'name' attributes
try { try {
const response = await fetch('/translate', {method: 'POST', body: formData}); const response = await fetch('/translate', {method: 'POST', body: formData});
const result = await response.json(); const result = await response.json();
@@ -678,8 +724,9 @@
statusMsg.className = ''; statusMsg.className = '';
submitButton.textContent = '取消翻译'; submitButton.textContent = '取消翻译';
submitButton.classList.remove('primary'); submitButton.classList.remove('primary');
submitButton.classList.add('secondary'); submitButton.classList.add('secondary', 'contrast');
isTranslating = true; isTranslating = true;
submitButton.disabled = false; // Enable cancel button
submitButton.removeAttribute('aria-busy'); submitButton.removeAttribute('aria-busy');
startPolling(); startPolling();
} else { } else {

View File

@@ -1,24 +1,25 @@
import asyncio import asyncio
from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import Literal from typing import Literal
import markdown2 import markdown2
from docling.datamodel.document import DocumentStream
from docutranslate.agents import Agent, AgentArgs from docutranslate.agents import Agent, AgentArgs
from docutranslate.agents import MDRefineAgent, MDTranslateAgent from docutranslate.agents import MDRefineAgent, MDTranslateAgent
from docutranslate.utils.convert import file2markdown_embed_images from docutranslate.converter import Document, ConverterDocling, ConverterMineru
from docutranslate.utils.markdown_splitter import split_markdown_text,join_markdown_texts from docutranslate.utils.markdown_splitter import split_markdown_text, join_markdown_texts
from docutranslate.utils.markdown_utils import uris2placeholder, placeholder2_uris, MaskDict from docutranslate.utils.markdown_utils import uris2placeholder, placeholder2_uris, MaskDict
from docutranslate.logger import translater_logger from docutranslate.logger import translater_logger
class FileTranslater: class FileTranslater:
def __init__(self, file_path: Path | str | None = None, chunksize: int = 2000, base_url="", key=None, def __init__(self, file_path: Path | str | None = None, chunksize: int = 2000,
model_id="", temperature=0.7, max_concurrent=20, docling_artifact: Path | str | None = None, base_url="", key=None, model_id="", temperature=0.7,
timeout=2000, tips=True): max_concurrent=20, timeout=2000,
convert_engin: Literal["docling", "mineru"] = "docling",
docling_artifact: Path | str | None = None,
mineru_token: str = None,
tips=True):
self.convert_engin = convert_engin
self.mineru_token = mineru_token.strip() if mineru_token is not None else None
if isinstance(file_path, str): if isinstance(file_path, str):
file_path = Path(file_path) file_path = Path(file_path)
self.file_path: Path = file_path self.file_path: Path = file_path
@@ -33,10 +34,10 @@ class FileTranslater:
self.temperature = temperature self.temperature = temperature
self.docling_artifact = docling_artifact self.docling_artifact = docling_artifact
if docling_artifact is None: if docling_artifact is None:
artifact_path=Path("./artifact") artifact_path = Path("./docling_artifact")
if artifact_path.is_dir(): if artifact_path.is_dir():
translater_logger.info("检测到artifact文件夹") translater_logger.info("检测到docling_artifact文件夹")
self.docling_artifact=artifact_path self.docling_artifact = artifact_path
self.timeout = timeout self.timeout = timeout
if tips: if tips:
print(""" print("""
@@ -66,7 +67,7 @@ class FileTranslater:
translater_logger.info(f"markdown分为{len(chunks)}") translater_logger.info(f"markdown分为{len(chunks)}")
return chunks return chunks
def default_agent_params(self) -> AgentArgs: def _default_agent_params(self) -> AgentArgs:
result: AgentArgs = { result: AgentArgs = {
"baseurl": self.base_url, "baseurl": self.base_url,
"key": self.key, "key": self.key,
@@ -77,16 +78,48 @@ class FileTranslater:
} }
return result return result
def _convert2markdown(self, document: Document, formula: bool, code: bool,
                      artifact: Path | None = None) -> str:
    """Convert ``document`` to markdown with the configured engine.

    Args:
        document: The document handed to the converter.
        formula: Forwarded to the converter as its ``formula`` option.
        code: Forwarded to the docling converter as its ``code`` option.
            The mineru branch only logs that code recognition is unsupported.
        artifact: Artifact path for the docling converter. Falls back to
            ``self.docling_artifact`` when ``None``. Not used by mineru.

    Returns:
        Whatever the selected converter's ``convert`` call returns.

    Raises:
        ValueError: If a non-docling engine is selected and
            ``self.mineru_token`` is missing or empty.
    """
    translater_logger.info(f"正在使用{self.convert_engin}转换文件为markdown")
    if self.convert_engin == "docling":
        if artifact is None:
            artifact = self.docling_artifact
        mdconverter = ConverterDocling(formula=formula, code=code, artifact=artifact)
    else:
        # The constructor strips the token, so a blank input ends up as ""
        # rather than None; treat both as "not configured".
        if not self.mineru_token:
            raise ValueError("mineru_token未配置")
        if code:
            translater_logger.info("mineru暂不支持code识别")
        mdconverter = ConverterMineru(token=self.mineru_token, formula=formula)
    return mdconverter.convert(document)
async def _convert2markdown_async(self, document: Document, formula: bool, code: bool,
                                  artifact: Path | None = None) -> str:
    """Asynchronously convert ``document`` to markdown with the configured engine.

    Args:
        document: The document handed to the converter.
        formula: Forwarded to the converter as its ``formula`` option.
        code: Forwarded to the docling converter as its ``code`` option.
            The mineru branch only logs that code recognition is unsupported.
        artifact: Artifact path for the docling converter. Falls back to
            ``self.docling_artifact`` when ``None``. Not used by mineru.

    Returns:
        Whatever the selected converter's ``convert_async`` call returns.

    Raises:
        ValueError: If a non-docling engine is selected and
            ``self.mineru_token`` is missing or empty.
    """
    # Same progress message as the synchronous conversion path.
    translater_logger.info(f"正在使用{self.convert_engin}转换文件为markdown")
    if self.convert_engin == "docling":
        if artifact is None:
            artifact = self.docling_artifact
        mdconverter = ConverterDocling(formula=formula, code=code, artifact=artifact)
    else:
        # The constructor strips the token, so a blank input ends up as ""
        # rather than None; treat both as "not configured".
        if not self.mineru_token:
            raise ValueError("mineru_token未配置")
        if code:
            translater_logger.info("mineru暂不支持code识别")
        mdconverter = ConverterMineru(token=self.mineru_token, formula=formula)
    return await mdconverter.convert_async(document)
def read_bytes(self, name: str, file: bytes, formula=True, code=True, save=False, def read_bytes(self, name: str, file: bytes, formula=True, code=True, save=False,
save_format: Literal["markdown", "html"] = "markdown", refine=False, save_format: Literal["markdown", "html"] = "markdown", refine=False,
refine_agent: Agent | None = None): refine_agent: Agent | None = None):
ds = DocumentStream(name=name, stream=BytesIO(file)) document = Document(filename=name, filebytes=file)
file_path = Path(name) file_path = Path(name)
# 如果是markdown直接读取 # 如果是markdown直接读取
if file_path.suffix == ".md": if file_path.suffix == ".md":
self.markdown = file.decode() self.markdown = file.decode()
else: else:
self.markdown = file2markdown_embed_images(ds, formula, code, artifacts_path=self.docling_artifact) self.markdown = self._convert2markdown(document, formula=formula, code=code, artifact=self.docling_artifact)
if refine: if refine:
self.refine_markdown_by_agent(refine_agent) self.refine_markdown_by_agent(refine_agent)
if save: if save:
@@ -96,6 +129,26 @@ class FileTranslater:
self.save_as_markdown(filename=f"{file_path.stem}.md") self.save_as_markdown(filename=f"{file_path.stem}.md")
return self return self
async def read_bytes_async(self, name: str, file: bytes, formula=True, code=True, save=False,
                           save_format: Literal["markdown", "html"] = "markdown", refine=False,
                           refine_agent: Agent | None = None):
    """Load in-memory file content into ``self.markdown`` (async variant).

    Args:
        name: File name; its suffix decides whether conversion is needed and
            its stem names the saved output.
        file: Raw file content.
        formula: Forwarded to the markdown conversion.
        code: Forwarded to the markdown conversion.
        save: When true, write the result via ``save_as_html`` or
            ``save_as_markdown``.
        save_format: ``"html"`` selects HTML output; anything else markdown.
        refine: When true, run ``refine_markdown_by_agent_async`` afterwards.
        refine_agent: Agent passed through to the refine step.

    Returns:
        ``self``, so calls can be chained.
    """
    source = Document(filename=name, filebytes=file)
    named_path = Path(name)

    if named_path.suffix == ".md":
        # Markdown input needs no conversion: decode the bytes directly.
        self.markdown = file.decode()
    else:
        self.markdown = await self._convert2markdown_async(
            source, formula=formula, code=code, artifact=self.docling_artifact
        )

    if refine:
        await self.refine_markdown_by_agent_async(refine_agent)

    if not save:
        return self

    if save_format == "html":
        self.save_as_html(filename=f"{named_path.stem}.html")
    else:
        self.save_as_markdown(filename=f"{named_path.stem}.md")
    return self
def read_file(self, file_path: Path | str | None = None, formula=True, code=True, save=False, def read_file(self, file_path: Path | str | None = None, formula=True, code=True, save=False,
save_format: Literal["markdown", "html"] = "markdown", refine=False, save_format: Literal["markdown", "html"] = "markdown", refine=False,
refine_agent: Agent | None = None): refine_agent: Agent | None = None):
@@ -112,7 +165,8 @@ class FileTranslater:
with open(file_path, "r") as f: with open(file_path, "r") as f:
self.markdown = f.read() self.markdown = f.read()
else: else:
self.markdown = file2markdown_embed_images(file_path, formula, code, artifacts_path=self.docling_artifact) document = Document(file_path)
self.markdown = self._convert2markdown(document, formula=formula, code=code, artifact=self.docling_artifact)
if refine: if refine:
self.refine_markdown_by_agent(refine_agent) self.refine_markdown_by_agent(refine_agent)
if save: if save:
@@ -122,14 +176,42 @@ class FileTranslater:
self.save_as_markdown(filename=f"{file_path.stem}.md") self.save_as_markdown(filename=f"{file_path.stem}.md")
return self return self
async def read_file_async(self, file_path: Path | str | None = None, formula=True, code=True, save=False,
                          save_format: Literal["markdown", "html"] = "markdown", refine=False,
                          refine_agent: Agent | None = None):
    """Load a file from disk into ``self.markdown`` (async variant).

    Args:
        file_path: File to read; falls back to ``self.file_path`` when ``None``.
        formula: Forwarded to the markdown conversion.
        code: Forwarded to the markdown conversion.
        save: When true, write the result via ``save_as_html`` or
            ``save_as_markdown``.
        save_format: ``"html"`` selects HTML output; anything else markdown.
        refine: When true, run ``refine_markdown_by_agent_async`` afterwards.
        refine_agent: Agent passed through to the refine step.

    Returns:
        ``self``, so calls can be chained.

    Raises:
        Exception: If neither ``file_path`` nor ``self.file_path`` is set.
    """
    if file_path is None:
        if self.file_path is None:
            translater_logger.debug("未设置文件路径")
            raise Exception("未设置文件路径")
        file_path = self.file_path
    if isinstance(file_path, str):
        file_path = Path(file_path)
    translater_logger.info(f"读取文件:{file_path.name}")
    if file_path.suffix == ".md":
        # Markdown input needs no conversion. Read it as UTF-8 explicitly:
        # the platform default encoding varies by locale, while the
        # byte-based readers decode markdown with bytes.decode() (UTF-8).
        self.markdown = file_path.read_text(encoding="utf-8")
    else:
        document = Document(file_path)
        self.markdown = await self._convert2markdown_async(document, formula=formula, code=code,
                                                           artifact=self.docling_artifact)
    if refine:
        await self.refine_markdown_by_agent_async(refine_agent)
    if save:
        if save_format == "html":
            self.save_as_html(filename=f"{file_path.stem}.html")
        else:
            self.save_as_markdown(filename=f"{file_path.stem}.md")
    return self
def refine_markdown_by_agent(self, refine_agent: Agent | None = None) -> str: def refine_markdown_by_agent(self, refine_agent: Agent | None = None) -> str:
translater_logger.info("正在修正markdown") translater_logger.info("正在修正markdown")
self._mask_uris_in_markdown() self._mask_uris_in_markdown()
chuncks = self._split_markdown_into_chunks() chuncks = self._split_markdown_into_chunks()
if refine_agent is None: if refine_agent is None:
refine_agent = MDRefineAgent(**self.default_agent_params()) refine_agent = MDRefineAgent(**self._default_agent_params())
result: list[str] = refine_agent.send_prompts(chuncks) result: list[str] = refine_agent.send_prompts(chuncks)
self.markdown=join_markdown_texts(result) self.markdown = join_markdown_texts(result)
self._unmask_uris_in_markdown() self._unmask_uris_in_markdown()
translater_logger.info("markdown已修正") translater_logger.info("markdown已修正")
return self.markdown return self.markdown
@@ -139,22 +221,21 @@ class FileTranslater:
self._mask_uris_in_markdown() self._mask_uris_in_markdown()
chuncks = self._split_markdown_into_chunks() chuncks = self._split_markdown_into_chunks()
if translate_agent is None: if translate_agent is None:
translate_agent = MDTranslateAgent(to_lang=to_lang, **self.default_agent_params()) translate_agent = MDTranslateAgent(to_lang=to_lang, **self._default_agent_params())
result: list[str] = translate_agent.send_prompts(chuncks) result: list[str] = translate_agent.send_prompts(chuncks)
self.markdown=join_markdown_texts(result) self.markdown = join_markdown_texts(result)
self._unmask_uris_in_markdown() self._unmask_uris_in_markdown()
translater_logger.info("翻译完成") translater_logger.info("翻译完成")
return self.markdown return self.markdown
async def refine_markdown_by_agent_async(self, refine_agent: Agent | None = None) -> str: async def refine_markdown_by_agent_async(self, refine_agent: Agent | None = None) -> str:
translater_logger.info("正在修正markdown") translater_logger.info("正在修正markdown")
self._mask_uris_in_markdown() self._mask_uris_in_markdown()
chuncks = self._split_markdown_into_chunks() chuncks = self._split_markdown_into_chunks()
if refine_agent is None: if refine_agent is None:
refine_agent = MDRefineAgent(**self.default_agent_params()) refine_agent = MDRefineAgent(**self._default_agent_params())
result: list[str] = await refine_agent.send_prompts_async(chuncks) result: list[str] = await refine_agent.send_prompts_async(chuncks)
self.markdown=join_markdown_texts(result) self.markdown = join_markdown_texts(result)
self._unmask_uris_in_markdown() self._unmask_uris_in_markdown()
translater_logger.info("markdown已修正") translater_logger.info("markdown已修正")
return self.markdown return self.markdown
@@ -164,9 +245,9 @@ class FileTranslater:
self._mask_uris_in_markdown() self._mask_uris_in_markdown()
chuncks = self._split_markdown_into_chunks() chuncks = self._split_markdown_into_chunks()
if translate_agent is None: if translate_agent is None:
translate_agent = MDTranslateAgent(to_lang=to_lang, **self.default_agent_params()) translate_agent = MDTranslateAgent(to_lang=to_lang, **self._default_agent_params())
result: list[str] = await translate_agent.send_prompts_async(chuncks) result: list[str] = await translate_agent.send_prompts_async(chuncks)
self.markdown=join_markdown_texts(result) self.markdown = join_markdown_texts(result)
self._unmask_uris_in_markdown() self._unmask_uris_in_markdown()
translater_logger.info("翻译完成") translater_logger.info("翻译完成")
return self.markdown return self.markdown
@@ -217,6 +298,7 @@ class FileTranslater:
def export_to_html(self, title="title") -> str: def export_to_html(self, title="title") -> str:
markdowner = markdown2.Markdown(extras=['tables', 'fenced-code-blocks', 'mermaid', "code-friendly"]) markdowner = markdown2.Markdown(extras=['tables', 'fenced-code-blocks', 'mermaid', "code-friendly"])
# TODO:实现完全本地化css和js
# language=html # language=html
html = f"""<!DOCTYPE html> html = f"""<!DOCTYPE html>
<html lang="en"> <html lang="en">
@@ -245,7 +327,7 @@ class FileTranslater:
</script> </script>
</head> </head>
<body> <body>
{markdowner.convert(self.markdown.replace("\\","\\\\"))} {markdowner.convert(self.markdown.replace("\\", "\\\\"))}
</body> </body>
<script type="module" defer> <script type="module" defer>
import mermaid from 'https://cdn.jsdelivr.net/npm/mermaid@9/dist/mermaid.esm.min.mjs'; import mermaid from 'https://cdn.jsdelivr.net/npm/mermaid@9/dist/mermaid.esm.min.mjs';
@@ -273,7 +355,7 @@ class FileTranslater:
def translate_file(self, file_path: Path | str | None = None, to_lang="中文", output_dir="./output", def translate_file(self, file_path: Path | str | None = None, to_lang="中文", output_dir="./output",
formula=True, formula=True,
code=True, output_format: Literal["markdown", "html"] = "markdown", refine=False, code=True, output_format: Literal["markdown", "html"] = "markdown", refine=False,
refine_agent: Agent | None = None, translate_agent: Agent | None = None,save=True): refine_agent: Agent | None = None, translate_agent: Agent | None = None, save=True):
if file_path is None: if file_path is None:
assert self.file_path is not None, "未输入文件路径" assert self.file_path is not None, "未输入文件路径"
file_path = self.file_path file_path = self.file_path
@@ -291,10 +373,11 @@ class FileTranslater:
filename = f"{file_path.stem}_{to_lang}.html" filename = f"{file_path.stem}_{to_lang}.html"
self.save_as_html(filename=filename, output_dir=output_dir) self.save_as_html(filename=filename, output_dir=output_dir)
return self return self
async def translate_file_async(self, file_path: Path | str | None = None, to_lang="中文", output_dir="./output", async def translate_file_async(self, file_path: Path | str | None = None, to_lang="中文", output_dir="./output",
formula=True, formula=True,
code=True, output_format: Literal["markdown", "html"] = "markdown", refine=False, code=True, output_format: Literal["markdown", "html"] = "markdown", refine=False,
refine_agent: Agent | None = None, translate_agent: Agent | None = None,save=True): refine_agent: Agent | None = None, translate_agent: Agent | None = None, save=True):
if file_path is None: if file_path is None:
assert self.file_path is not None, "未输入文件路径" assert self.file_path is not None, "未输入文件路径"
file_path = self.file_path file_path = self.file_path
@@ -317,11 +400,12 @@ class FileTranslater:
filename = f"{file_path.stem}_{to_lang}.html" filename = f"{file_path.stem}_{to_lang}.html"
self.save_as_html(filename=filename, output_dir=output_dir) self.save_as_html(filename=filename, output_dir=output_dir)
return self return self
def translate_bytes(self, name:str,file: bytes, to_lang="中文", output_dir="./output",
def translate_bytes(self, name: str, file: bytes, to_lang="中文", output_dir="./output",
formula=True, formula=True,
code=True, output_format: Literal["markdown", "html"] = "markdown", refine=False, code=True, output_format: Literal["markdown", "html"] = "markdown", refine=False,
refine_agent: Agent | None = None, translate_agent: Agent | None = None,save=True): refine_agent: Agent | None = None, translate_agent: Agent | None = None, save=True):
self.read_bytes(name=name,file=file, formula=formula, code=code) self.read_bytes(name=name, file=file, formula=formula, code=code)
if refine: if refine:
self.refine_markdown_by_agent(refine_agent) self.refine_markdown_by_agent(refine_agent)
self.translate_markdown_by_agent(translate_agent, to_lang=to_lang) self.translate_markdown_by_agent(translate_agent, to_lang=to_lang)
@@ -334,17 +418,12 @@ class FileTranslater:
self.save_as_html(filename=filename, output_dir=output_dir) self.save_as_html(filename=filename, output_dir=output_dir)
return self return self
async def translate_bytes_async(self, name:str,file: bytes, to_lang="中文", output_dir="./output", async def translate_bytes_async(self, name: str, file: bytes, to_lang="中文", output_dir="./output",
formula=True, formula=True,
code=True, output_format: Literal["markdown", "html"] = "markdown", refine=False, code=True, output_format: Literal["markdown", "html"] = "markdown", refine=False,
refine_agent: Agent | None = None, translate_agent: Agent | None = None,save=True): refine_agent: Agent | None = None, translate_agent: Agent | None = None, save=True):
await asyncio.to_thread( await self.read_bytes_async(name=name, file=file, formula=formula, code=code)
self.read_bytes,
name=name,
file=file,
formula=formula,
code=code
)
if refine: if refine:
await self.refine_markdown_by_agent_async(refine_agent) await self.refine_markdown_by_agent_async(refine_agent)
await self.translate_markdown_by_agent_async(translate_agent, to_lang=to_lang) await self.translate_markdown_by_agent_async(translate_agent, to_lang=to_lang)

View File

@@ -1,6 +1,6 @@
[project] [project]
name = "docutranslate" name = "docutranslate"
version = "0.2.19" version = "0.2.20"
description = "文件翻译工具" description = "文件翻译工具"
readme = "README.md" readme = "README.md"
requires-python = ">=3.10" requires-python = ">=3.10"