增加对mineru的支持
This commit is contained in:
253
.idea/workspace.xml
generated
253
.idea/workspace.xml
generated
@@ -5,7 +5,16 @@
|
|||||||
</component>
|
</component>
|
||||||
<component name="ChangeListManager">
|
<component name="ChangeListManager">
|
||||||
<list default="true" id="6b18b44a-df57-4212-a857-9e291ebe5dd2" name="更改" comment="">
|
<list default="true" id="6b18b44a-df57-4212-a857-9e291ebe5dd2" name="更改" comment="">
|
||||||
<change beforePath="$PROJECT_DIR$/app.spec" beforeDir="false" afterPath="$PROJECT_DIR$/app.spec" afterDir="false" />
|
<change afterPath="$PROJECT_DIR$/docutranslate/converter/__init__.py" afterDir="false" />
|
||||||
|
<change afterPath="$PROJECT_DIR$/docutranslate/converter/converter.py" afterDir="false" />
|
||||||
|
<change afterPath="$PROJECT_DIR$/docutranslate/converter/converter_docling.py" afterDir="false" />
|
||||||
|
<change afterPath="$PROJECT_DIR$/docutranslate/converter/converter_mineru.py" afterDir="false" />
|
||||||
|
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
|
||||||
|
<change beforePath="$PROJECT_DIR$/README.md" beforeDir="false" afterPath="$PROJECT_DIR$/README.md" afterDir="false" />
|
||||||
|
<change beforePath="$PROJECT_DIR$/docutranslate/app.py" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/app.py" afterDir="false" />
|
||||||
|
<change beforePath="$PROJECT_DIR$/docutranslate/static/index.html" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/static/index.html" afterDir="false" />
|
||||||
|
<change beforePath="$PROJECT_DIR$/docutranslate/translater.py" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/translater.py" afterDir="false" />
|
||||||
|
<change beforePath="$PROJECT_DIR$/pyproject.toml" beforeDir="false" afterPath="$PROJECT_DIR$/pyproject.toml" afterDir="false" />
|
||||||
</list>
|
</list>
|
||||||
<option name="SHOW_DIALOG" value="false" />
|
<option name="SHOW_DIALOG" value="false" />
|
||||||
<option name="HIGHLIGHT_CONFLICTS" value="true" />
|
<option name="HIGHLIGHT_CONFLICTS" value="true" />
|
||||||
@@ -35,59 +44,62 @@
|
|||||||
<option name="hideEmptyMiddlePackages" value="true" />
|
<option name="hideEmptyMiddlePackages" value="true" />
|
||||||
<option name="showLibraryContents" value="true" />
|
<option name="showLibraryContents" value="true" />
|
||||||
</component>
|
</component>
|
||||||
<component name="PropertiesComponent">{
|
<component name="PropertiesComponent"><![CDATA[{
|
||||||
"keyToString": {
|
"keyToString": {
|
||||||
"DefaultHtmlFileTemplate": "HTML File",
|
"DefaultHtmlFileTemplate": "HTML File",
|
||||||
"JavaScript 调试.output.html (1).executor": "Run",
|
"JavaScript 调试.output.html (1).executor": "Run",
|
||||||
"JavaScript 调试.output.html.executor": "Run",
|
"JavaScript 调试.output.html.executor": "Run",
|
||||||
"JavaScript 调试.regex.md_中文.html.executor": "Run",
|
"JavaScript 调试.regex.md_中文.html.executor": "Run",
|
||||||
"JavaScript 调试.regex_中文.html.executor": "Run",
|
"JavaScript 调试.regex_中文.html.executor": "Run",
|
||||||
"JavaScript 调试.test.html.executor": "Run",
|
"JavaScript 调试.test.html.executor": "Run",
|
||||||
"JavaScript 调试.test2.html.executor": "Run",
|
"JavaScript 调试.test2.html.executor": "Run",
|
||||||
"JavaScript 调试.test2_英文.html.executor": "Run",
|
"JavaScript 调试.test2_英文.html.executor": "Run",
|
||||||
"JavaScript 调试.test4-1_中文.html.executor": "Run",
|
"JavaScript 调试.test4-1_中文.html.executor": "Run",
|
||||||
"JavaScript 调试.互联网认证授权机制.html.executor": "Run",
|
"JavaScript 调试.互联网认证授权机制.html.executor": "Run",
|
||||||
"JavaScript 调试.互联网认证授权机制_英文.html.executor": "Run",
|
"JavaScript 调试.互联网认证授权机制_英文.html.executor": "Run",
|
||||||
"JavaScript 调试.毕业论文_英文.html.executor": "Run",
|
"JavaScript 调试.毕业论文_英文.html.executor": "Run",
|
||||||
"ModuleVcsDetector.initialDetectionPerformed": "true",
|
"ModuleVcsDetector.initialDetectionPerformed": "true",
|
||||||
"Python 测试.Python 测试 (markdown_mask.py 内).executor": "Run",
|
"Python 测试.Python 测试 (markdown_mask.py 内).executor": "Run",
|
||||||
"Python 测试.markdown_mask.Test.test_basic_link_masking 的 Python 测试.executor": "Run",
|
"Python 测试.markdown_mask.Test.test_basic_link_masking 的 Python 测试.executor": "Run",
|
||||||
"Python 测试.pytest (test_html.py 内).executor": "Run",
|
"Python 测试.pytest (test_html.py 内).executor": "Run",
|
||||||
"Python.2test2 (1).executor": "Run",
|
"Python.1test.executor": "Run",
|
||||||
"Python.PDFtranslater (1).executor": "Run",
|
"Python.2test2 (1).executor": "Run",
|
||||||
"Python.PDFtranslater (2).executor": "Run",
|
"Python.PDFtranslater (1).executor": "Run",
|
||||||
"Python.agent.executor": "Debug",
|
"Python.PDFtranslater (2).executor": "Run",
|
||||||
"Python.agent_utils.executor": "Run",
|
"Python.agent.executor": "Debug",
|
||||||
"Python.app (1).executor": "Run",
|
"Python.agent_utils.executor": "Run",
|
||||||
"Python.app.executor": "Run",
|
"Python.app (1).executor": "Run",
|
||||||
"Python.app2.executor": "Run",
|
"Python.app.executor": "Run",
|
||||||
"Python.app_test (1).executor": "Run",
|
"Python.app2.executor": "Run",
|
||||||
"Python.convert.executor": "Run",
|
"Python.app_test (1).executor": "Run",
|
||||||
"Python.markdown_splitter.executor": "Debug",
|
"Python.convert.executor": "Run",
|
||||||
"Python.markdown_utils.executor": "Run",
|
"Python.converter_docling.executor": "Run",
|
||||||
"Python.test.executor": "Run",
|
"Python.converter_mineru.executor": "Run",
|
||||||
"Python.test1.executor": "Run",
|
"Python.markdown_splitter.executor": "Debug",
|
||||||
"Python.test2.executor": "Run",
|
"Python.markdown_utils.executor": "Run",
|
||||||
"Python.test3.executor": "Run",
|
"Python.test.executor": "Run",
|
||||||
"Python.test4.executor": "Run",
|
"Python.test1.executor": "Run",
|
||||||
"Python.testhtml.executor": "Run",
|
"Python.test2.executor": "Run",
|
||||||
"Python.translater.executor": "Run",
|
"Python.test3.executor": "Run",
|
||||||
"Python.切分测试.executor": "Run",
|
"Python.test4.executor": "Run",
|
||||||
"RunOnceActivity.ShowReadmeOnStart": "true",
|
"Python.testhtml.executor": "Run",
|
||||||
"RunOnceActivity.TerminalTabsStorage.copyFrom.TerminalArrangementManager": "true",
|
"Python.translater.executor": "Run",
|
||||||
"RunOnceActivity.git.unshallow": "true",
|
"Python.切分测试.executor": "Run",
|
||||||
"git-widget-placeholder": "main",
|
"RunOnceActivity.ShowReadmeOnStart": "true",
|
||||||
"last_opened_file_path": "C:/Users/jxgm/Desktop/FileTranslate/dist/DocuTranslate",
|
"RunOnceActivity.TerminalTabsStorage.copyFrom.TerminalArrangementManager": "true",
|
||||||
"list.type.of.created.stylesheet": "CSS",
|
"RunOnceActivity.git.unshallow": "true",
|
||||||
"node.js.detected.package.eslint": "true",
|
"git-widget-placeholder": "main",
|
||||||
"node.js.detected.package.tslint": "true",
|
"last_opened_file_path": "C:/Users/jxgm/Desktop/FileTranslate/dist/DocuTranslate",
|
||||||
"node.js.selected.package.eslint": "(autodetect)",
|
"list.type.of.created.stylesheet": "CSS",
|
||||||
"node.js.selected.package.tslint": "(autodetect)",
|
"node.js.detected.package.eslint": "true",
|
||||||
"nodejs_package_manager_path": "npm",
|
"node.js.detected.package.tslint": "true",
|
||||||
"settings.editor.selected.configurable": "preferences.pluginManager",
|
"node.js.selected.package.eslint": "(autodetect)",
|
||||||
"vue.rearranger.settings.migration": "true"
|
"node.js.selected.package.tslint": "(autodetect)",
|
||||||
|
"nodejs_package_manager_path": "npm",
|
||||||
|
"settings.editor.selected.configurable": "preferences.pluginManager",
|
||||||
|
"vue.rearranger.settings.migration": "true"
|
||||||
}
|
}
|
||||||
}</component>
|
}]]></component>
|
||||||
<component name="RecentsManager">
|
<component name="RecentsManager">
|
||||||
<key name="CopyFile.RECENT_KEYS">
|
<key name="CopyFile.RECENT_KEYS">
|
||||||
<recent name="C:\Users\jxgm\Desktop\FileTranslate\dist\DocuTranslate" />
|
<recent name="C:\Users\jxgm\Desktop\FileTranslate\dist\DocuTranslate" />
|
||||||
@@ -267,7 +279,27 @@
|
|||||||
<option name="OPTIONS" value="" />
|
<option name="OPTIONS" value="" />
|
||||||
<method v="2" />
|
<method v="2" />
|
||||||
</configuration>
|
</configuration>
|
||||||
<configuration name="test.html" type="JavascriptDebugType" temporary="true" nameIsGenerated="true" uri="http://localhost:63342/filetranslate/tests/test.html" useBuiltInWebServerPort="true">
|
<configuration name="1test" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
|
||||||
|
<module name="FileTranslate" />
|
||||||
|
<option name="ENV_FILES" value="" />
|
||||||
|
<option name="INTERPRETER_OPTIONS" value="" />
|
||||||
|
<option name="PARENT_ENVS" value="true" />
|
||||||
|
<envs>
|
||||||
|
<env name="PYTHONUNBUFFERED" value="1" />
|
||||||
|
</envs>
|
||||||
|
<option name="SDK_HOME" value="" />
|
||||||
|
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/tests" />
|
||||||
|
<option name="IS_MODULE_SDK" value="true" />
|
||||||
|
<option name="ADD_CONTENT_ROOTS" value="true" />
|
||||||
|
<option name="ADD_SOURCE_ROOTS" value="true" />
|
||||||
|
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
|
||||||
|
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/tests/1test.py" />
|
||||||
|
<option name="PARAMETERS" value="" />
|
||||||
|
<option name="SHOW_COMMAND_LINE" value="false" />
|
||||||
|
<option name="EMULATE_TERMINAL" value="false" />
|
||||||
|
<option name="MODULE_MODE" value="false" />
|
||||||
|
<option name="REDIRECT_INPUT" value="false" />
|
||||||
|
<option name="INPUT_FILE" value="" />
|
||||||
<method v="2" />
|
<method v="2" />
|
||||||
</configuration>
|
</configuration>
|
||||||
<configuration name="2test2 (1)" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
|
<configuration name="2test2 (1)" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
|
||||||
@@ -293,29 +325,6 @@
|
|||||||
<option name="INPUT_FILE" value="" />
|
<option name="INPUT_FILE" value="" />
|
||||||
<method v="2" />
|
<method v="2" />
|
||||||
</configuration>
|
</configuration>
|
||||||
<configuration name="3testhtml" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
|
|
||||||
<module name="FileTranslate" />
|
|
||||||
<option name="ENV_FILES" value="" />
|
|
||||||
<option name="INTERPRETER_OPTIONS" value="" />
|
|
||||||
<option name="PARENT_ENVS" value="true" />
|
|
||||||
<envs>
|
|
||||||
<env name="PYTHONUNBUFFERED" value="1" />
|
|
||||||
</envs>
|
|
||||||
<option name="SDK_HOME" value="" />
|
|
||||||
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/tests" />
|
|
||||||
<option name="IS_MODULE_SDK" value="true" />
|
|
||||||
<option name="ADD_CONTENT_ROOTS" value="true" />
|
|
||||||
<option name="ADD_SOURCE_ROOTS" value="true" />
|
|
||||||
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
|
|
||||||
<option name="SCRIPT_NAME" value="C:\Users\jxgm\Desktop\FileTranslate\tests\3testhtml.py" />
|
|
||||||
<option name="PARAMETERS" value="" />
|
|
||||||
<option name="SHOW_COMMAND_LINE" value="false" />
|
|
||||||
<option name="EMULATE_TERMINAL" value="false" />
|
|
||||||
<option name="MODULE_MODE" value="false" />
|
|
||||||
<option name="REDIRECT_INPUT" value="false" />
|
|
||||||
<option name="INPUT_FILE" value="" />
|
|
||||||
<method v="2" />
|
|
||||||
</configuration>
|
|
||||||
<configuration default="true" type="PythonConfigurationType" factoryName="Python">
|
<configuration default="true" type="PythonConfigurationType" factoryName="Python">
|
||||||
<module name="filetranslate" />
|
<module name="filetranslate" />
|
||||||
<option name="ENV_FILES" value="" />
|
<option name="ENV_FILES" value="" />
|
||||||
@@ -362,6 +371,52 @@
|
|||||||
<option name="INPUT_FILE" value="" />
|
<option name="INPUT_FILE" value="" />
|
||||||
<method v="2" />
|
<method v="2" />
|
||||||
</configuration>
|
</configuration>
|
||||||
|
<configuration name="converter_docling" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
|
||||||
|
<module name="FileTranslate" />
|
||||||
|
<option name="ENV_FILES" value="" />
|
||||||
|
<option name="INTERPRETER_OPTIONS" value="" />
|
||||||
|
<option name="PARENT_ENVS" value="true" />
|
||||||
|
<envs>
|
||||||
|
<env name="PYTHONUNBUFFERED" value="1" />
|
||||||
|
</envs>
|
||||||
|
<option name="SDK_HOME" value="" />
|
||||||
|
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/docutranslate/converter" />
|
||||||
|
<option name="IS_MODULE_SDK" value="true" />
|
||||||
|
<option name="ADD_CONTENT_ROOTS" value="true" />
|
||||||
|
<option name="ADD_SOURCE_ROOTS" value="true" />
|
||||||
|
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
|
||||||
|
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/docutranslate/converter/converter_docling.py" />
|
||||||
|
<option name="PARAMETERS" value="" />
|
||||||
|
<option name="SHOW_COMMAND_LINE" value="false" />
|
||||||
|
<option name="EMULATE_TERMINAL" value="false" />
|
||||||
|
<option name="MODULE_MODE" value="false" />
|
||||||
|
<option name="REDIRECT_INPUT" value="false" />
|
||||||
|
<option name="INPUT_FILE" value="" />
|
||||||
|
<method v="2" />
|
||||||
|
</configuration>
|
||||||
|
<configuration name="converter_mineru" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
|
||||||
|
<module name="FileTranslate" />
|
||||||
|
<option name="ENV_FILES" value="" />
|
||||||
|
<option name="INTERPRETER_OPTIONS" value="" />
|
||||||
|
<option name="PARENT_ENVS" value="true" />
|
||||||
|
<envs>
|
||||||
|
<env name="PYTHONUNBUFFERED" value="1" />
|
||||||
|
</envs>
|
||||||
|
<option name="SDK_HOME" value="" />
|
||||||
|
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/docutranslate/converter" />
|
||||||
|
<option name="IS_MODULE_SDK" value="true" />
|
||||||
|
<option name="ADD_CONTENT_ROOTS" value="true" />
|
||||||
|
<option name="ADD_SOURCE_ROOTS" value="true" />
|
||||||
|
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
|
||||||
|
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/docutranslate/converter/converter_mineru.py" />
|
||||||
|
<option name="PARAMETERS" value="" />
|
||||||
|
<option name="SHOW_COMMAND_LINE" value="false" />
|
||||||
|
<option name="EMULATE_TERMINAL" value="false" />
|
||||||
|
<option name="MODULE_MODE" value="false" />
|
||||||
|
<option name="REDIRECT_INPUT" value="false" />
|
||||||
|
<option name="INPUT_FILE" value="" />
|
||||||
|
<method v="2" />
|
||||||
|
</configuration>
|
||||||
<configuration default="true" type="Python.FlaskServer">
|
<configuration default="true" type="Python.FlaskServer">
|
||||||
<module name="filetranslate" />
|
<module name="filetranslate" />
|
||||||
<option name="ENV_FILES" value="" />
|
<option name="ENV_FILES" value="" />
|
||||||
@@ -459,31 +514,13 @@
|
|||||||
<option name="USE_PATTERN" value="false" />
|
<option name="USE_PATTERN" value="false" />
|
||||||
<method v="2" />
|
<method v="2" />
|
||||||
</configuration>
|
</configuration>
|
||||||
<configuration name="pytest (3testhtml.py 内)" type="tests" factoryName="py.test" temporary="true" nameIsGenerated="true">
|
|
||||||
<module name="FileTranslate" />
|
|
||||||
<option name="ENV_FILES" value="" />
|
|
||||||
<option name="INTERPRETER_OPTIONS" value="" />
|
|
||||||
<option name="PARENT_ENVS" value="true" />
|
|
||||||
<option name="SDK_HOME" value="" />
|
|
||||||
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/tests" />
|
|
||||||
<option name="IS_MODULE_SDK" value="true" />
|
|
||||||
<option name="ADD_CONTENT_ROOTS" value="true" />
|
|
||||||
<option name="ADD_SOURCE_ROOTS" value="true" />
|
|
||||||
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
|
|
||||||
<option name="_new_keywords" value="""" />
|
|
||||||
<option name="_new_parameters" value="""" />
|
|
||||||
<option name="_new_additionalArguments" value="""" />
|
|
||||||
<option name="_new_target" value=""$PROJECT_DIR$/tests/3testhtml.py"" />
|
|
||||||
<option name="_new_targetType" value=""PATH"" />
|
|
||||||
<method v="2" />
|
|
||||||
</configuration>
|
|
||||||
<recent_temporary>
|
<recent_temporary>
|
||||||
<list>
|
<list>
|
||||||
<item itemvalue="Python.app_test (1)" />
|
<item itemvalue="Python.app_test (1)" />
|
||||||
|
<item itemvalue="Python.1test" />
|
||||||
|
<item itemvalue="Python.converter_docling" />
|
||||||
|
<item itemvalue="Python.converter_mineru" />
|
||||||
<item itemvalue="Python.2test2 (1)" />
|
<item itemvalue="Python.2test2 (1)" />
|
||||||
<item itemvalue="Python.3testhtml" />
|
|
||||||
<item itemvalue="JavaScript 调试.test.html" />
|
|
||||||
<item itemvalue="Python 测试.pytest (3testhtml.py 内)" />
|
|
||||||
</list>
|
</list>
|
||||||
</recent_temporary>
|
</recent_temporary>
|
||||||
</component>
|
</component>
|
||||||
@@ -568,14 +605,26 @@
|
|||||||
<workItem from="1747628254543" duration="7347000" />
|
<workItem from="1747628254543" duration="7347000" />
|
||||||
<workItem from="1747635705571" duration="391000" />
|
<workItem from="1747635705571" duration="391000" />
|
||||||
<workItem from="1747650908714" duration="121000" />
|
<workItem from="1747650908714" duration="121000" />
|
||||||
|
<workItem from="1747731199599" duration="4260000" />
|
||||||
</task>
|
</task>
|
||||||
<servers />
|
<servers />
|
||||||
</component>
|
</component>
|
||||||
<component name="TypeScriptGeneratedFilesManager">
|
<component name="TypeScriptGeneratedFilesManager">
|
||||||
<option name="version" value="3" />
|
<option name="version" value="3" />
|
||||||
</component>
|
</component>
|
||||||
|
<component name="Vcs.Log.Tabs.Properties">
|
||||||
|
<option name="TAB_STATES">
|
||||||
|
<map>
|
||||||
|
<entry key="MAIN">
|
||||||
|
<value>
|
||||||
|
<State />
|
||||||
|
</value>
|
||||||
|
</entry>
|
||||||
|
</map>
|
||||||
|
</option>
|
||||||
|
</component>
|
||||||
<component name="com.intellij.coverage.CoverageDataManagerImpl">
|
<component name="com.intellij.coverage.CoverageDataManagerImpl">
|
||||||
<SUITE FILE_PATH="coverage/filetranslate$app_test__1_.coverage" NAME="app_test (1) 覆盖结果" MODIFIED="1747634037187" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
<SUITE FILE_PATH="coverage/filetranslate$app_test__1_.coverage" NAME="app_test (1) 覆盖结果" MODIFIED="1747733748258" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
||||||
<SUITE FILE_PATH="coverage/filetranslate$test.coverage" NAME="test 覆盖结果" MODIFIED="1747472297913" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
<SUITE FILE_PATH="coverage/filetranslate$test.coverage" NAME="test 覆盖结果" MODIFIED="1747472297913" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
||||||
<SUITE FILE_PATH="coverage/filetranslate$convert.coverage" NAME="convert 覆盖结果" MODIFIED="1746963490689" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/utils" />
|
<SUITE FILE_PATH="coverage/filetranslate$convert.coverage" NAME="convert 覆盖结果" MODIFIED="1746963490689" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/utils" />
|
||||||
<SUITE FILE_PATH="coverage/PDFtranslate$PDFtranslater__1_.coverage" NAME="PDFtranslater (1) 覆盖结果" MODIFIED="1746633258205" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages" />
|
<SUITE FILE_PATH="coverage/PDFtranslate$PDFtranslater__1_.coverage" NAME="PDFtranslater (1) 覆盖结果" MODIFIED="1746633258205" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages" />
|
||||||
@@ -583,14 +632,17 @@
|
|||||||
<SUITE FILE_PATH="coverage/PDFtranslate$agent_utils.coverage" NAME="agent_utils 覆盖结果" MODIFIED="1746617703678" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages/utils" />
|
<SUITE FILE_PATH="coverage/PDFtranslate$agent_utils.coverage" NAME="agent_utils 覆盖结果" MODIFIED="1746617703678" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages/utils" />
|
||||||
<SUITE FILE_PATH="coverage/filetranslate$app2.coverage" NAME="app2 覆盖结果" MODIFIED="1747108180309" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate" />
|
<SUITE FILE_PATH="coverage/filetranslate$app2.coverage" NAME="app2 覆盖结果" MODIFIED="1747108180309" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate" />
|
||||||
<SUITE FILE_PATH="coverage/filetranslate$app.coverage" NAME="app 覆盖结果" MODIFIED="1747448464521" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate" />
|
<SUITE FILE_PATH="coverage/filetranslate$app.coverage" NAME="app 覆盖结果" MODIFIED="1747448464521" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate" />
|
||||||
|
<SUITE FILE_PATH="coverage/filetranslate$converter_mineru.coverage" NAME="converter_mineru 覆盖结果" MODIFIED="1747726229881" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/converter" />
|
||||||
<SUITE FILE_PATH="coverage/PDFtranslate$markdown_splitter.coverage" NAME="markdown_splitter 覆盖结果" MODIFIED="1746599883603" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages/utils" />
|
<SUITE FILE_PATH="coverage/PDFtranslate$markdown_splitter.coverage" NAME="markdown_splitter 覆盖结果" MODIFIED="1746599883603" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages/utils" />
|
||||||
|
<SUITE FILE_PATH="coverage/filetranslate$converter_docling.coverage" NAME="converter_docling 覆盖结果" MODIFIED="1747726654277" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/converter" />
|
||||||
|
<SUITE FILE_PATH="coverage/filetranslate$1test.coverage" NAME="1test 覆盖结果" MODIFIED="1747732504752" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
||||||
<SUITE FILE_PATH="coverage/filetranslate$test3.coverage" NAME="test3 覆盖结果" MODIFIED="1746884110572" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
<SUITE FILE_PATH="coverage/filetranslate$test3.coverage" NAME="test3 覆盖结果" MODIFIED="1746884110572" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
||||||
<SUITE FILE_PATH="coverage/filetranslate$app__1_.coverage" NAME="app (1) 覆盖结果" MODIFIED="1747136094477" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
<SUITE FILE_PATH="coverage/filetranslate$app__1_.coverage" NAME="app (1) 覆盖结果" MODIFIED="1747136094477" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
||||||
<SUITE FILE_PATH="coverage/filetranslate$markdown_splitter.coverage" NAME="markdown_splitter 覆盖结果" MODIFIED="1746805063874" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/utils" />
|
<SUITE FILE_PATH="coverage/filetranslate$markdown_splitter.coverage" NAME="markdown_splitter 覆盖结果" MODIFIED="1746805063874" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/utils" />
|
||||||
<SUITE FILE_PATH="coverage/filetranslate$agent.coverage" NAME="agent 覆盖结果" MODIFIED="1746805293987" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/Agents" />
|
<SUITE FILE_PATH="coverage/filetranslate$agent.coverage" NAME="agent 覆盖结果" MODIFIED="1746805293987" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/Agents" />
|
||||||
<SUITE FILE_PATH="coverage/filetranslate$PDFtranslater__2_.coverage" NAME="PDFtranslater (2) 覆盖结果" MODIFIED="1746679546680" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/filetranslate_packages" />
|
<SUITE FILE_PATH="coverage/filetranslate$PDFtranslater__2_.coverage" NAME="PDFtranslater (2) 覆盖结果" MODIFIED="1746679546680" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/filetranslate_packages" />
|
||||||
<SUITE FILE_PATH="coverage/PDFtranslate$test.coverage" NAME="test 覆盖结果" MODIFIED="1746629433597" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
<SUITE FILE_PATH="coverage/PDFtranslate$test.coverage" NAME="test 覆盖结果" MODIFIED="1746629433597" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
||||||
<SUITE FILE_PATH="coverage/filetranslate$2test2__1_.coverage" NAME="2test2 (1) 覆盖结果" MODIFIED="1747579915531" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
<SUITE FILE_PATH="coverage/filetranslate$2test2__1_.coverage" NAME="2test2 (1) 覆盖结果" MODIFIED="1747722801777" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
||||||
<SUITE FILE_PATH="coverage/filetranslate$agent_utils.coverage" NAME="agent_utils 覆盖结果" MODIFIED="1746708534311" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/utils" />
|
<SUITE FILE_PATH="coverage/filetranslate$agent_utils.coverage" NAME="agent_utils 覆盖结果" MODIFIED="1746708534311" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/utils" />
|
||||||
<SUITE FILE_PATH="coverage/filetranslate$.coverage" NAME="切分测试 覆盖结果" MODIFIED="1747187128847" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
<SUITE FILE_PATH="coverage/filetranslate$.coverage" NAME="切分测试 覆盖结果" MODIFIED="1747187128847" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
||||||
<SUITE FILE_PATH="coverage/filetranslate$test1.coverage" NAME="test1 覆盖结果" MODIFIED="1746936018440" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
<SUITE FILE_PATH="coverage/filetranslate$test1.coverage" NAME="test1 覆盖结果" MODIFIED="1746936018440" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
||||||
@@ -601,6 +653,7 @@
|
|||||||
<SUITE FILE_PATH="coverage/PDFtranslate$markdown_utils.coverage" NAME="markdown_utils 覆盖结果" MODIFIED="1746598797872" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages/utils" />
|
<SUITE FILE_PATH="coverage/PDFtranslate$markdown_utils.coverage" NAME="markdown_utils 覆盖结果" MODIFIED="1746598797872" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages/utils" />
|
||||||
<SUITE FILE_PATH="coverage/filetranslate$test2.coverage" NAME="test2 覆盖结果" MODIFIED="1747553897731" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
<SUITE FILE_PATH="coverage/filetranslate$test2.coverage" NAME="test2 覆盖结果" MODIFIED="1747553897731" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
||||||
<SUITE FILE_PATH="coverage/filetranslate$translater.coverage" NAME="translater 覆盖结果" MODIFIED="1746843159560" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate" />
|
<SUITE FILE_PATH="coverage/filetranslate$translater.coverage" NAME="translater 覆盖结果" MODIFIED="1746843159560" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate" />
|
||||||
|
<SUITE FILE_PATH="coverage/filetranslate$docling_utils.coverage" NAME="docling_utils 覆盖结果" MODIFIED="1747710836730" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/utils" />
|
||||||
<SUITE FILE_PATH="coverage/PDFtranslate$.coverage" NAME=" 覆盖结果" MODIFIED="1746588350286" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/decorator" />
|
<SUITE FILE_PATH="coverage/PDFtranslate$.coverage" NAME=" 覆盖结果" MODIFIED="1746588350286" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/decorator" />
|
||||||
</component>
|
</component>
|
||||||
</project>
|
</project>
|
||||||
43
README.md
43
README.md
@@ -4,12 +4,15 @@
|
|||||||
|
|
||||||
[](https://github.com/xunbu/docutranslate)
|
[](https://github.com/xunbu/docutranslate)
|
||||||
|
|
||||||
文件翻译工具,借助[docling](https://github.com/docling-project/docling)与大语言模型实现多种格式文件的翻译
|
文件翻译工具,借助[docling](https://github.com/docling-project/docling)、[minerU](https://mineru.net/)与大语言模型实现多种格式文件的翻译
|
||||||
|
|
||||||
|
> QQ交流群:1047781902
|
||||||
|
|
||||||
# 整合包
|
# 整合包
|
||||||
|
|
||||||
对于只使用基本翻译功能的用户,可以在[github releases](https://github.com/xunbu/docutranslate/releases)
|
对于只使用基本翻译功能的用户,可以在[github releases](https://github.com/xunbu/docutranslate/releases)
|
||||||
上下载最新的整合包,该整合包点击即用,您所需的只是获取某个ai平台的api-key。
|
上下载最新的整合包,该整合包点击即用,您所需的只是获取某个ai平台的api-key。
|
||||||
|
以及可以在mineru申请token进行pdf识别【可选】
|
||||||
|
|
||||||
# 安装
|
# 安装
|
||||||
|
|
||||||
@@ -34,7 +37,16 @@
|
|||||||
|
|
||||||
# 前置条件
|
# 前置条件
|
||||||
|
|
||||||
## huggingface换源
|
本翻译工具的翻译流程总体如下:
|
||||||
|
|
||||||
|
1. 使用文本转换引擎将文档转换成markdown(有docling(本地)、minerU(联网)两种引擎)
|
||||||
|
2. 使用大语言模型翻译markdown文本(需要申请api-key或本地部署)
|
||||||
|
|
||||||
|
## 使用docling引擎注意事项
|
||||||
|
|
||||||
|
使用docling将文档转换为markdown时,需要下载模型到本地(也可以提前下载,见FAQ),因此可能会遇到一些网络问题
|
||||||
|
|
||||||
|
### huggingface换源
|
||||||
|
|
||||||
> 不能科学上网的友友注意了
|
> 不能科学上网的友友注意了
|
||||||
|
|
||||||
@@ -43,12 +55,12 @@
|
|||||||
- 第一次读取非markdown文本
|
- 第一次读取非markdown文本
|
||||||
- 第一次使用公式识别或代码识别功能
|
- 第一次使用公式识别或代码识别功能
|
||||||
|
|
||||||
### 方法1
|
#### 方法1
|
||||||
|
|
||||||
设置电脑的环境变量(记得设置后重启IDE)
|
设置电脑的环境变量(记得设置后重启IDE)
|
||||||
`HF_ENDPOINT=https://hf-mirror.com`
|
`HF_ENDPOINT=https://hf-mirror.com`
|
||||||
|
|
||||||
### 方法2
|
#### 方法2
|
||||||
|
|
||||||
在代码开头设置环境变量
|
在代码开头设置环境变量
|
||||||
|
|
||||||
@@ -60,6 +72,13 @@ os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
|
|||||||
###其余代码写在下方
|
###其余代码写在下方
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## 使用minerU引擎注意事项
|
||||||
|
|
||||||
|
使用minerU将文档转换为markdown时,需要在minerU平台申请token
|
||||||
|
|
||||||
|
1. 打开[minerU官网](https://mineru.net/apiManage/docs)申请token
|
||||||
|
2. 申请成功后,在[API Token管理界面](https://mineru.net/apiManage/token)创建API Token
|
||||||
|
|
||||||
## 获取大模型平台的baseurl、key、model-id
|
## 获取大模型平台的baseurl、key、model-id
|
||||||
|
|
||||||
由于需要使用大语言模型进行markdown调整与翻译,所以需要预先获取模型的baseurl、key、model-id
|
由于需要使用大语言模型进行markdown调整与翻译,所以需要预先获取模型的baseurl、key、model-id
|
||||||
@@ -90,7 +109,12 @@ from docutranslate.translater import FileTranslater
|
|||||||
|
|
||||||
translater = FileTranslater(base_url="<baseurl>",
|
translater = FileTranslater(base_url="<baseurl>",
|
||||||
key="<key>",
|
key="<key>",
|
||||||
model_id="<model-id>")
|
model_id="<model-id>",
|
||||||
|
convert_engin="docling" # 默认使用docling
|
||||||
|
# convert_engin="mineru",# 使用mineru
|
||||||
|
# mineru_token="<申请的mineru_token>"#使用mineru时必填
|
||||||
|
)
|
||||||
|
|
||||||
# 不开启公式、代码识别(默认输出为markdown文件)
|
# 不开启公式、代码识别(默认输出为markdown文件)
|
||||||
translater.translate_file("<文件路径>", to_lang="中文")
|
translater.translate_file("<文件路径>", to_lang="中文")
|
||||||
|
|
||||||
@@ -141,12 +165,14 @@ translater.read_file("<文件路径>").save_as_markdown()
|
|||||||
from docutranslate import FileTranslater
|
from docutranslate import FileTranslater
|
||||||
|
|
||||||
translater = FileTranslater(base_url="<baseurl>", # 默认的模型baseurl
|
translater = FileTranslater(base_url="<baseurl>", # 默认的模型baseurl
|
||||||
key="<key>", # 默认的模型api-key
|
key="<api-key>", # 默认的大语言模型平台api-key
|
||||||
model_id="<model-id>", # 默认的模型id
|
model_id="<model-id>", # 默认的模型id
|
||||||
chunksize=2000, # markdown分块长度(单位byte),分块越大效果越好(也越慢),不建议超过8000
|
chunksize=2000, # markdown分块长度(单位byte),分块越大效果越好(也越慢),不建议超过8000
|
||||||
max_concurrent=20, # 并发数,受到ai平台并发量限制,如果文章很长建议适当加大到20以上
|
max_concurrent=20, # 并发数,受到ai平台并发量限制,如果文章很长建议适当加大到20以上
|
||||||
docling_artifact=None, # 使用提前下载好的docling模型
|
|
||||||
timeout=2000, # 调用api的超时时间
|
timeout=2000, # 调用api的超时时间
|
||||||
|
docling_artifact=None, # 使用提前下载好的docling模型
|
||||||
|
convert_engin="mineru", # 可选docling或minerU
|
||||||
|
mineru_token="<mineru-token>", # minerU的token,使用minerU时必填
|
||||||
tips=True # 开场提示
|
tips=True # 开场提示
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -206,7 +232,8 @@ from docutranslate.utils.docling_utils import get_docling_artifacts
|
|||||||
print(get_docling_artifacts()) # 会显示模型下载文件夹,通常在`C:\Users\<user>\.cache\docling\models`
|
print(get_docling_artifacts()) # 会显示模型下载文件夹,通常在`C:\Users\<user>\.cache\docling\models`
|
||||||
```
|
```
|
||||||
|
|
||||||
> 创建FileTranslater时携带模型文件夹即可
|
> 将模型文件夹命名为docling_artifact放置在项目下
|
||||||
|
> 或创建FileTranslater时docling_artifact参数设置为文件夹位置
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from docutranslate import FileTranslater
|
from docutranslate import FileTranslater
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ from fastapi import FastAPI, File, Form, UploadFile, Request, HTTPException
|
|||||||
from fastapi.responses import HTMLResponse, JSONResponse, StreamingResponse,FileResponse
|
from fastapi.responses import HTMLResponse, JSONResponse, StreamingResponse,FileResponse
|
||||||
from fastapi.templating import Jinja2Templates
|
from fastapi.templating import Jinja2Templates
|
||||||
from fastapi.staticfiles import StaticFiles
|
from fastapi.staticfiles import StaticFiles
|
||||||
from docutranslate import FileTranslater
|
from docutranslate import FileTranslater # Assuming FileTranslater is in docutranslate module
|
||||||
from docutranslate.logger import translater_logger
|
from docutranslate.logger import translater_logger
|
||||||
from docutranslate.utils.resource_utils import resource_path
|
from docutranslate.utils.resource_utils import resource_path
|
||||||
|
|
||||||
@@ -19,11 +19,10 @@ app = FastAPI()
|
|||||||
|
|
||||||
STATIC_DIR=resource_path("static")
|
STATIC_DIR=resource_path("static")
|
||||||
|
|
||||||
# print(f"__file__:{Path(__file__).resolve()}")
|
|
||||||
app.mount("/static",StaticFiles(directory=STATIC_DIR), name="static")
|
app.mount("/static",StaticFiles(directory=STATIC_DIR), name="static")
|
||||||
|
|
||||||
# --- 全局配置 ---
|
# --- 全局配置 ---
|
||||||
log_queue: Optional[asyncio.Queue] = None # Will be initialized in startup_event
|
log_queue: Optional[asyncio.Queue] = None
|
||||||
current_state: Dict[str, Any] = {
|
current_state: Dict[str, Any] = {
|
||||||
"is_processing": False,
|
"is_processing": False,
|
||||||
"status_message": "空闲",
|
"status_message": "空闲",
|
||||||
@@ -36,9 +35,9 @@ current_state: Dict[str, Any] = {
|
|||||||
"task_end_time": 0,
|
"task_end_time": 0,
|
||||||
"current_task_ref": None,
|
"current_task_ref": None,
|
||||||
}
|
}
|
||||||
templates = Jinja2Templates(directory=".")
|
templates = Jinja2Templates(directory=".") # Not strictly used if index.html is served as FileResponse
|
||||||
MAX_LOG_HISTORY = 200 # Max items for the persistent log_history list
|
MAX_LOG_HISTORY = 200
|
||||||
log_history: List[str] = [] # Keeps a longer history, not directly for "unread"
|
log_history: List[str] = []
|
||||||
|
|
||||||
|
|
||||||
# --- 日志处理器 ---
|
# --- 日志处理器 ---
|
||||||
@@ -51,25 +50,20 @@ class QueueAndHistoryHandler(logging.Handler):
|
|||||||
|
|
||||||
def emit(self, record: logging.LogRecord):
|
def emit(self, record: logging.LogRecord):
|
||||||
log_entry = self.format(record)
|
log_entry = self.format(record)
|
||||||
|
print(log_entry) # Keep console log for server visibility
|
||||||
# Add to the persistent history (capped)
|
|
||||||
self.history_list.append(log_entry)
|
self.history_list.append(log_entry)
|
||||||
if len(self.history_list) > self.max_history:
|
if len(self.history_list) > self.max_history:
|
||||||
del self.history_list[:len(self.history_list) - self.max_history]
|
del self.history_list[:len(self.history_list) - self.max_history]
|
||||||
|
|
||||||
# Add to the "unread" queue for frontend consumption
|
|
||||||
try:
|
|
||||||
# Ensure self.queue is not None (it's initialized at startup)
|
|
||||||
if self.queue is not None:
|
if self.queue is not None:
|
||||||
|
try:
|
||||||
main_loop = getattr(app.state, "main_event_loop", None)
|
main_loop = getattr(app.state, "main_event_loop", None)
|
||||||
if main_loop and main_loop.is_running():
|
if main_loop and main_loop.is_running():
|
||||||
main_loop.call_soon_threadsafe(self.queue.put_nowait, log_entry)
|
main_loop.call_soon_threadsafe(self.queue.put_nowait, log_entry)
|
||||||
else:
|
else:
|
||||||
self.queue.put_nowait(log_entry) # Fallback
|
self.queue.put_nowait(log_entry)
|
||||||
else:
|
|
||||||
print(f"CRITICAL: Log queue not initialized. Log: {log_entry}")
|
|
||||||
except asyncio.QueueFull:
|
except asyncio.QueueFull:
|
||||||
print(f"Log queue is full. Log dropped: {log_entry}") # Or handle differently
|
print(f"Log queue is full. Log dropped: {log_entry}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error putting log to queue: {e}. Log: {log_entry}")
|
print(f"Error putting log to queue: {e}. Log: {log_entry}")
|
||||||
|
|
||||||
@@ -79,7 +73,7 @@ class QueueAndHistoryHandler(logging.Handler):
|
|||||||
async def startup_event():
|
async def startup_event():
|
||||||
global log_queue
|
global log_queue
|
||||||
app.state.main_event_loop = asyncio.get_running_loop()
|
app.state.main_event_loop = asyncio.get_running_loop()
|
||||||
log_queue = asyncio.Queue() # Initialize the global log_queue
|
log_queue = asyncio.Queue()
|
||||||
|
|
||||||
for handler in translater_logger.handlers[:]:
|
for handler in translater_logger.handlers[:]:
|
||||||
translater_logger.removeHandler(handler)
|
translater_logger.removeHandler(handler)
|
||||||
@@ -93,7 +87,7 @@ async def startup_event():
|
|||||||
translater_logger.setLevel(logging.INFO)
|
translater_logger.setLevel(logging.INFO)
|
||||||
|
|
||||||
log_history.clear()
|
log_history.clear()
|
||||||
while not log_queue.empty(): # Clear queue just in case
|
while not log_queue.empty():
|
||||||
try:
|
try:
|
||||||
log_queue.get_nowait()
|
log_queue.get_nowait()
|
||||||
except asyncio.QueueEmpty:
|
except asyncio.QueueEmpty:
|
||||||
@@ -112,6 +106,7 @@ async def _perform_translation(params: Dict[str, Any], file_contents: bytes, ori
|
|||||||
try:
|
try:
|
||||||
translater_logger.info(f"使用 Base URL: {params['base_url']}, Model: {params['model_id']}")
|
translater_logger.info(f"使用 Base URL: {params['base_url']}, Model: {params['model_id']}")
|
||||||
translater_logger.info(f"文件大小: {len(file_contents)} 字节。目标语言: {params['to_lang']}")
|
translater_logger.info(f"文件大小: {len(file_contents)} 字节。目标语言: {params['to_lang']}")
|
||||||
|
translater_logger.info(f"使用转换引擎: {params['convert_engin']}")
|
||||||
translater_logger.info(
|
translater_logger.info(
|
||||||
f"选项 - 公式: {params['formula_ocr']}, 代码: {params['code_ocr']}, 修正: {params['refine_markdown']}")
|
f"选项 - 公式: {params['formula_ocr']}, 代码: {params['code_ocr']}, 修正: {params['refine_markdown']}")
|
||||||
|
|
||||||
@@ -119,7 +114,9 @@ async def _perform_translation(params: Dict[str, Any], file_contents: bytes, ori
|
|||||||
base_url=params['base_url'],
|
base_url=params['base_url'],
|
||||||
key=params['apikey'],
|
key=params['apikey'],
|
||||||
model_id=params['model_id'],
|
model_id=params['model_id'],
|
||||||
tips=False
|
convert_engin=params['convert_engin'],
|
||||||
|
mineru_token=params['mineru_token'],
|
||||||
|
tips=False # Assuming tips are not needed for server-side processing
|
||||||
)
|
)
|
||||||
await ft.translate_bytes_async(
|
await ft.translate_bytes_async(
|
||||||
name=original_filename,
|
name=original_filename,
|
||||||
@@ -152,7 +149,7 @@ async def _perform_translation(params: Dict[str, Any], file_contents: bytes, ori
|
|||||||
translater_logger.info(f"翻译任务 '{original_filename}' 已被取消 (用时 {duration:.2f} 秒).")
|
translater_logger.info(f"翻译任务 '{original_filename}' 已被取消 (用时 {duration:.2f} 秒).")
|
||||||
current_state.update({
|
current_state.update({
|
||||||
"status_message": f"翻译任务已取消(若有转换任务仍会后台进行) (用时 {duration:.2f} 秒).",
|
"status_message": f"翻译任务已取消(若有转换任务仍会后台进行) (用时 {duration:.2f} 秒).",
|
||||||
"error_flag": False, # Cancellation is not an error in this context
|
"error_flag": False,
|
||||||
"download_ready": False,
|
"download_ready": False,
|
||||||
"markdown_content": None,
|
"markdown_content": None,
|
||||||
"html_content": None,
|
"html_content": None,
|
||||||
@@ -180,11 +177,25 @@ async def _perform_translation(params: Dict[str, Any], file_contents: bytes, ori
|
|||||||
# --- API Endpoints ---
|
# --- API Endpoints ---
|
||||||
@app.get("/", response_class=HTMLResponse)
|
@app.get("/", response_class=HTMLResponse)
|
||||||
async def main_page(request: Request):
|
async def main_page(request: Request):
|
||||||
return FileResponse(STATIC_DIR/"index.html")
|
# Serve index.html from the static directory or root project directory
|
||||||
|
# Assuming index.html is at the same level as app.py or in STATIC_DIR
|
||||||
|
# For simplicity, if index.html is at root:
|
||||||
|
# return FileResponse(Path(__file__).parent / "index.html")
|
||||||
|
# If using Jinja2Templates and index.html is in "templates" folder:
|
||||||
|
# return templates.TemplateResponse("index.html", {"request": request})
|
||||||
|
# Using FileResponse for index.html directly:
|
||||||
|
index_path = Path("index.html") # Adjust if index.html is elsewhere
|
||||||
|
if not index_path.exists():
|
||||||
|
# Fallback to static dir if not in root
|
||||||
|
index_path = STATIC_DIR / "index.html"
|
||||||
|
if not index_path.exists():
|
||||||
|
raise HTTPException(status_code=404, detail="index.html not found")
|
||||||
|
return FileResponse(index_path)
|
||||||
|
|
||||||
|
|
||||||
@app.post("/translate")
|
@app.post("/translate")
|
||||||
async def handle_translate(
|
async def handle_translate(
|
||||||
|
request: Request, # Added request for potential future use, not strictly needed now
|
||||||
base_url: str = Form(...),
|
base_url: str = Form(...),
|
||||||
apikey: str = Form(...),
|
apikey: str = Form(...),
|
||||||
model_id: str = Form(...),
|
model_id: str = Form(...),
|
||||||
@@ -192,6 +203,8 @@ async def handle_translate(
|
|||||||
formula_ocr: bool = Form(False),
|
formula_ocr: bool = Form(False),
|
||||||
code_ocr: bool = Form(False),
|
code_ocr: bool = Form(False),
|
||||||
refine_markdown: bool = Form(False),
|
refine_markdown: bool = Form(False),
|
||||||
|
convert_engin: str = Form(...), # New parameter
|
||||||
|
mineru_token: Optional[str] = Form(None), # New parameter
|
||||||
file: UploadFile = File(...)
|
file: UploadFile = File(...)
|
||||||
):
|
):
|
||||||
global current_state, log_queue, log_history
|
global current_state, log_queue, log_history
|
||||||
@@ -209,6 +222,12 @@ async def handle_translate(
|
|||||||
content={"task_started": False, "message": "没有选择文件或文件无效。"}
|
content={"task_started": False, "message": "没有选择文件或文件无效。"}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if convert_engin == "mineru" and (not mineru_token or not mineru_token.strip()):
|
||||||
|
return JSONResponse(
|
||||||
|
status_code=400,
|
||||||
|
content={"task_started": False, "message": "使用 Mineru 引擎时必须提供有效的 Mineru Token。"}
|
||||||
|
)
|
||||||
|
|
||||||
current_state["is_processing"] = True
|
current_state["is_processing"] = True
|
||||||
original_filename_for_init = file.filename or "uploaded_file"
|
original_filename_for_init = file.filename or "uploaded_file"
|
||||||
|
|
||||||
@@ -224,26 +243,22 @@ async def handle_translate(
|
|||||||
"current_task_ref": None,
|
"current_task_ref": None,
|
||||||
})
|
})
|
||||||
|
|
||||||
# Clear logs for the new task
|
|
||||||
log_history.clear()
|
log_history.clear()
|
||||||
if log_queue: # Ensure log_queue is initialized
|
if log_queue:
|
||||||
while not log_queue.empty():
|
while not log_queue.empty():
|
||||||
try:
|
try:
|
||||||
log_queue.get_nowait()
|
log_queue.get_nowait()
|
||||||
except asyncio.QueueEmpty:
|
except asyncio.QueueEmpty:
|
||||||
break
|
break
|
||||||
|
|
||||||
# Add initial log entry for the new task
|
|
||||||
# We create a LogRecord manually to ensure it goes through the formatter and handler
|
|
||||||
initial_log_msg = f"收到新的翻译请求: {original_filename_for_init}"
|
initial_log_msg = f"收到新的翻译请求: {original_filename_for_init}"
|
||||||
if translater_logger.handlers and isinstance(translater_logger.handlers[0], QueueAndHistoryHandler):
|
if translater_logger.handlers and isinstance(translater_logger.handlers[0], QueueAndHistoryHandler):
|
||||||
# Use the existing handler to format and queue/store the log
|
|
||||||
record = logging.LogRecord(
|
record = logging.LogRecord(
|
||||||
name=translater_logger.name, level=logging.INFO, pathname="", lineno=0,
|
name=translater_logger.name, level=logging.INFO, pathname="", lineno=0,
|
||||||
msg=initial_log_msg, args=(), exc_info=None, func=""
|
msg=initial_log_msg, args=(), exc_info=None, func=""
|
||||||
)
|
)
|
||||||
translater_logger.handlers[0].emit(record) # This will add to both queue and history
|
translater_logger.handlers[0].emit(record)
|
||||||
else: # Fallback if handler setup is unusual
|
else:
|
||||||
translater_logger.info(initial_log_msg)
|
translater_logger.info(initial_log_msg)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -255,6 +270,8 @@ async def handle_translate(
|
|||||||
"base_url": base_url, "apikey": apikey, "model_id": model_id,
|
"base_url": base_url, "apikey": apikey, "model_id": model_id,
|
||||||
"to_lang": to_lang, "formula_ocr": formula_ocr,
|
"to_lang": to_lang, "formula_ocr": formula_ocr,
|
||||||
"code_ocr": code_ocr, "refine_markdown": refine_markdown,
|
"code_ocr": code_ocr, "refine_markdown": refine_markdown,
|
||||||
|
"convert_engin": convert_engin, # Pass to task
|
||||||
|
"mineru_token": mineru_token, # Pass to task
|
||||||
}
|
}
|
||||||
|
|
||||||
loop = asyncio.get_running_loop()
|
loop = asyncio.get_running_loop()
|
||||||
@@ -332,18 +349,17 @@ async def get_status():
|
|||||||
|
|
||||||
|
|
||||||
@app.get("/get-logs")
|
@app.get("/get-logs")
|
||||||
async def get_logs_from_queue(): # Renamed for clarity, though path is the same
|
async def get_logs_from_queue():
|
||||||
global log_queue
|
global log_queue
|
||||||
new_logs = []
|
new_logs = []
|
||||||
if log_queue: # Ensure log_queue is initialized
|
if log_queue:
|
||||||
while not log_queue.empty():
|
while not log_queue.empty():
|
||||||
try:
|
try:
|
||||||
log_entry = log_queue.get_nowait() # Consume from queue
|
log_entry = log_queue.get_nowait()
|
||||||
new_logs.append(log_entry)
|
new_logs.append(log_entry)
|
||||||
log_queue.task_done() # Important for queue management if using join() elsewhere
|
log_queue.task_done()
|
||||||
except asyncio.QueueEmpty:
|
except asyncio.QueueEmpty:
|
||||||
break
|
break
|
||||||
# No total_count, as the frontend just appends what it receives
|
|
||||||
return JSONResponse(content={"logs": new_logs})
|
return JSONResponse(content={"logs": new_logs})
|
||||||
|
|
||||||
|
|
||||||
@@ -384,10 +400,11 @@ async def download_html(filename_with_ext: str):
|
|||||||
|
|
||||||
|
|
||||||
def run_app():
|
def run_app():
|
||||||
print("正在启动 DocuTranslate")
|
print("正在启动 DocuTranslate WebUI")
|
||||||
print("请访问 http://127.0.0.1:8010")
|
print("请访问 http://127.0.0.1:8010")
|
||||||
uvicorn.run(app, host="127.0.0.1", port=8010, workers=1)
|
uvicorn.run(app, host="127.0.0.1", port=8010, workers=1)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
||||||
run_app()
|
run_app()
|
||||||
3
docutranslate/converter/__init__.py
Normal file
3
docutranslate/converter/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
from .converter import Document,Converter
|
||||||
|
from .converter_mineru import ConverterMineru
|
||||||
|
from .converter_docling import ConverterDocling
|
||||||
25
docutranslate/converter/converter.py
Normal file
25
docutranslate/converter/converter.py
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
from typing import Protocol
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
class Document:
|
||||||
|
def __init__(self,path:Path|str=None,filename:str=None,filebytes:bytes=None):
|
||||||
|
if path is None and (filename is None or filebytes is None):
|
||||||
|
raise Exception("Document的路径或filename、filebytes不能同时为空")
|
||||||
|
self.filebytes = filebytes
|
||||||
|
self.filename = filename
|
||||||
|
self.path = path
|
||||||
|
if path:
|
||||||
|
if isinstance(path,str):
|
||||||
|
path=Path(path)
|
||||||
|
self.path=path
|
||||||
|
self.filename=path.name
|
||||||
|
self.filebytes=path.read_bytes()
|
||||||
|
|
||||||
|
class Converter(Protocol):
    """Structural interface for engines that turn a Document into Markdown."""

    # Convert the document to Markdown (blocking variant).
    def convert(self,document:Document)->str:
        """Return the Markdown text produced from ``document``."""
        ...

    async def convert_async(self,document:Document)->str:
        """Awaitable counterpart of ``convert``; returns the Markdown text."""
        ...
|
||||||
80
docutranslate/converter/converter_docling.py
Normal file
80
docutranslate/converter/converter_docling.py
Normal file
@@ -0,0 +1,80 @@
|
|||||||
|
import os
|
||||||
|
import time
|
||||||
|
from io import BytesIO
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from docling.datamodel.base_models import InputFormat
|
||||||
|
from docling.datamodel.document import DocumentStream
|
||||||
|
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||||
|
from docling.datamodel.settings import settings
|
||||||
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
|
from docling_core.types.doc import ImageRefMode
|
||||||
|
from huggingface_hub.errors import LocalEntryNotFoundError
|
||||||
|
|
||||||
|
from docutranslate.logger import translater_logger
|
||||||
|
|
||||||
|
from docutranslate.converter import Converter, Document
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
|
||||||
|
IMAGE_RESOLUTION_SCALE = 4
|
||||||
|
|
||||||
|
|
||||||
|
def file2markdown_embed_images(file_path: Path | str | DocumentStream, formula=False, code=False,
                               artifacts_path: Path | str | None = None) -> str:
    """Convert a document to Markdown with docling, embedding images inline.

    Args:
        file_path: Source document, as a path or an in-memory ``DocumentStream``.
        formula: Turn on docling's formula enrichment.
        code: Turn on docling's code enrichment.
        artifacts_path: Passed to ``PdfPipelineOptions`` as ``artifacts_path``.

    Returns:
        The Markdown export of the converted document, with images embedded.
    """
    options = PdfPipelineOptions(artifacts_path=artifacts_path)
    options.do_ocr = False
    options.images_scale = IMAGE_RESOLUTION_SCALE
    options.generate_picture_images = True
    # Alternatives the author left disabled: TableFormerMode.FAST for the
    # table structure mode, and explicit AcceleratorOptions.
    options.table_structure_options.do_cell_matching = False
    if formula:
        options.do_formula_enrichment = True
    if code:
        options.do_code_enrichment = True
    # Ask docling to record pipeline timings.
    settings.debug.profile_pipeline_timings = True
    doc_converter = DocumentConverter(
        format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=options)}
    )

    def export_markdown() -> str:
        # One full convert + export pass; used for both the first attempt and the retry.
        converted = doc_converter.convert(file_path)
        return converted.document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED)

    try:
        return export_markdown()
    except LocalEntryNotFoundError:
        # Retry once after pointing HF_ENDPOINT at the hf-mirror.com mirror.
        translater_logger.info(f"无法连接huggingface,正在尝试换源")
        os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
        return export_markdown()
|
||||||
|
|
||||||
|
|
||||||
|
class ConverterDocling(Converter):
    """Converter that produces Markdown locally via ``file2markdown_embed_images``.

    Args:
        code: Forwarded as the ``code`` flag of ``file2markdown_embed_images``.
        formula: Forwarded as the ``formula`` flag of ``file2markdown_embed_images``.
        artifact: Forwarded as ``artifacts_path``; ``None`` leaves it unset.
    """

    def __init__(self, code=True, formula=True, artifact=None):
        self.code = code
        self.formula = formula
        self.artifact = artifact

    def convert(self, document):
        """Convert ``document`` to Markdown and log the elapsed time.

        Raises:
            TypeError: If ``document.filename`` is not a string.
        """
        # Explicit check instead of ``assert``: asserts are removed when Python
        # runs with -O, which would let a bad filename through.
        if not isinstance(document.filename, str):
            raise TypeError("document.filename must be a str")
        translater_logger.info("正在将文档转换为markdown")
        time1 = time.time()
        # The document is handed over as an in-memory stream built from its bytes.
        document_stream = DocumentStream(name=document.filename, stream=BytesIO(document.filebytes))
        result = file2markdown_embed_images(document_stream, formula=self.formula, code=self.code,
                                            artifacts_path=self.artifact)
        translater_logger.info(f"已转换为markdown,耗时{time.time() - time1}秒")
        return result

    async def convert_async(self, document: Document) -> str:
        """Run the blocking ``convert`` in a worker thread and await its result."""
        return await asyncio.to_thread(
            self.convert,
            document
        )
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
pass
|
||||||
220
docutranslate/converter/converter_mineru.py
Normal file
220
docutranslate/converter/converter_mineru.py
Normal file
@@ -0,0 +1,220 @@
|
|||||||
|
import asyncio
|
||||||
|
import base64
|
||||||
|
import io
|
||||||
|
import mimetypes
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
import zipfile
|
||||||
|
import httpx
|
||||||
|
from docutranslate.converter import Converter, Document
|
||||||
|
from docutranslate.logger import translater_logger
|
||||||
|
|
||||||
|
URL = 'https://mineru.net/api/v4/file-urls/batch'
|
||||||
|
|
||||||
|
client=httpx.Client(trust_env=False)
|
||||||
|
|
||||||
|
#TODO: 提供更详细的logger
|
||||||
|
class ConverterMineru(Converter):
    """Converter that produces Markdown through the hosted MinerU HTTP API.

    Workflow of ``convert``: request an upload URL, PUT the file bytes to it,
    poll the batch result endpoint until extraction is done, then fetch the
    result ZIP and extract its Markdown.

    Args:
        token: MinerU API token; surrounding whitespace is stripped.
        formula: Sent to the API as ``enable_formula``.
    """

    def __init__(self, token: str, formula=True):
        self.mineru_token = token.strip()
        # Not used by any method of this class; presumably reserved for the
        # finer-grained async implementation mentioned in the TODO below.
        self.client_async = httpx.AsyncClient()
        self.formula = formula

    def _get_header(self):
        """Return the JSON content-type and bearer-token headers for API calls."""
        return {
            'Content-Type': 'application/json',
            "Authorization": f"Bearer {self.mineru_token}"
        }

    def _get_upload_data(self, document: Document):
        """Build the JSON payload that requests an upload URL for ``document``."""
        return {
            "enable_formula": self.formula,
            "language": "auto",
            "enable_table": True,
            "files": [
                {"name": f"{document.filename}", "is_ocr": True}
            ]
        }

    def upload(self, document: Document):
        """Upload ``document`` and return the ``batch_id`` of the extraction task.

        Raises:
            httpx.HTTPStatusError: If either HTTP request returns an error status.
            Exception: If the API response carries a non-zero ``code``.
        """
        # Step 1: ask the API for an upload URL.
        response = client.post(URL, headers=self._get_header(), json=self._get_upload_data(document))
        response.raise_for_status()
        result = response.json()
        if result["code"] != 0:
            # ``result`` is a plain dict, so the reason has to be read by key;
            # attribute access would raise AttributeError and hide the cause.
            raise Exception('apply upload url failed,reason:{}'.format(result.get("msg")))
        batch_id = result["data"]["batch_id"]
        urls = result["data"]["file_urls"]
        # Step 2: send the raw file bytes to the first returned URL.
        res_upload = client.put(urls[0], content=document.filebytes)
        res_upload.raise_for_status()
        return batch_id

    def get_file_url(self, batch_id: str) -> str:
        """Poll the batch result endpoint and return the result ZIP URL.

        Blocks, sleeping 3 seconds between polls, until the task is done.

        Raises:
            httpx.HTTPStatusError: If a poll request returns an error status.
            Exception: If the task reports that extraction failed.
        """
        # Both values are the same for every poll, so build them once.
        url = f'https://mineru.net/api/v4/extract-results/batch/{batch_id}'
        header = self._get_header()
        while True:
            res = client.get(url, headers=header)
            res.raise_for_status()
            fileinfo = res.json()["data"]["extract_result"][0]
            state = fileinfo["state"]
            if state == "done":
                return fileinfo["full_zip_url"]
            if state == "failed":
                # Terminal state: without this the loop would poll forever.
                # NOTE(review): "failed" / "err_msg" follow the MinerU API
                # docs as I recall them -- confirm against the current docs.
                raise Exception(f"mineru extraction failed, reason: {fileinfo.get('err_msg')}")
            time.sleep(3)

    def convert(self, document: Document) -> str:
        """Convert ``document`` to Markdown via MinerU and log the elapsed time."""
        translater_logger.info("正在将文档转换为markdown")
        time1 = time.time()
        batch_id = self.upload(document)
        file_url = self.get_file_url(batch_id)
        result = get_md_from_zip_url_with_inline_images(zip_url=file_url)
        translater_logger.info(f"已转换为markdown,耗时{time.time()-time1}秒")
        return result

    # TODO: implement finer-grained coroutines instead of wrapping the blocking call.
    async def convert_async(self, document: Document) -> str:
        """Run the blocking ``convert`` in a worker thread and await its result."""
        # To be optimized.
        return await asyncio.to_thread(
            self.convert,
            document
        )
|
||||||
|
|
||||||
|
|
||||||
|
def get_md_from_zip_url_with_inline_images(
        zip_url: str,
        filename_in_zip: str = "full.md",
        encoding: str = "utf-8"
) -> str | None:
    """Download a ZIP archive and return one Markdown file from it with images inlined.

    The archive is fetched into memory, ``filename_in_zip`` is read and decoded,
    and every Markdown image whose target is a path inside the archive is
    rewritten as a Base64 ``data:`` URI so the returned text is self-contained.
    Images that are external links, already inlined, missing from the archive,
    or of an unknown MIME type are left exactly as they were.

    Args:
        zip_url: Download URL of the ZIP archive.
        filename_in_zip: Name (including any directory part) of the Markdown
            file inside the archive. Defaults to ``"full.md"``.
        encoding: Encoding used to decode the Markdown file. Defaults to
            ``"utf-8"``.

    Returns:
        The processed Markdown text, or ``None`` if the download fails, the
        archive is invalid, the file is missing, or decoding fails. Failures
        are reported with ``print`` and never raised to the caller.
    """
    # Fallback MIME types for when mimetypes.guess_type() cannot decide.
    fallback_mime_types = {
        '.png': 'image/png',
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.gif': 'image/gif',
        '.svg': 'image/svg+xml',
        '.webp': 'image/webp',
    }

    try:
        print(f"正在从 {zip_url} 下载ZIP文件 (使用 httpx.get)...")
        # NOTE(review): `client` is not defined in this block; presumably a
        # module-level httpx client -- confirm. The timeout is raised to 60s.
        response = client.get(zip_url, timeout=60.0)
        response.raise_for_status()
        print("ZIP文件下载完成。")

        zip_file_bytes = io.BytesIO(response.content)

        print("正在尝试打开内存中的ZIP存档...")
        with zipfile.ZipFile(zip_file_bytes, 'r') as archive:
            print(f"ZIP存档已打开。正在查找文件 '{filename_in_zip}'...")

            if filename_in_zip not in archive.namelist():
                print(f"错误: 文件 '{filename_in_zip}' 在ZIP压缩包中未找到。")
                print(f"压缩包中的可用文件列表: {archive.namelist()}")
                return None

            md_content_bytes = archive.read(filename_in_zip)
            print(f"文件 '{filename_in_zip}' 已找到并读取。")
            md_content_text = md_content_bytes.decode(encoding)
            print(f"文件内容已使用 '{encoding}' 编码成功解码。")

            # --- Image handling ---
            print("开始处理Markdown中的图片...")
            # Directory of the Markdown file inside the archive; relative image
            # paths are resolved against it. "docs/guide/full.md" gives
            # "docs/guide", and "full.md" gives "".
            base_md_path_in_zip = os.path.dirname(filename_in_zip)

            def replace_image_with_base64(match):
                """Return the inlined replacement for one Markdown image match."""
                alt_text = match.group(1)
                original_image_path = match.group(2)

                # External links and existing data URIs are left untouched.
                if original_image_path.startswith(('http://', 'https://', 'data:')):
                    print(f" 跳过外部或已内联图片: {original_image_path}")
                    return match.group(0)

                # Resolve the image path relative to the Markdown file.
                # os.path.join copes with an empty base directory, and normpath
                # collapses "./" and "a/../b", so no leading "./" can remain.
                # ZIP member names use forward slashes, hence the replace.
                image_path_in_zip = os.path.join(base_md_path_in_zip, original_image_path)
                image_path_in_zip = os.path.normpath(image_path_in_zip).replace(os.sep, '/')

                try:
                    image_bytes = archive.read(image_path_in_zip)

                    mime_type, _ = mimetypes.guess_type(image_path_in_zip)
                    if not mime_type:
                        ext = os.path.splitext(image_path_in_zip)[1].lower()
                        mime_type = fallback_mime_types.get(ext)
                        if mime_type is None:
                            print(f" 警告: 无法确定图片 '{image_path_in_zip}' 的MIME类型。跳过内联。")
                            return match.group(0)

                    base64_encoded_data = base64.b64encode(image_bytes).decode('utf-8')
                    # Emit a complete Markdown image whose target is a data URI.
                    # An empty replacement here would delete the image.
                    return f"![{alt_text}](data:{mime_type};base64,{base64_encoded_data})"
                except KeyError:
                    # ZipFile.read raises KeyError for a member that is absent.
                    print(f" 警告: 图片 '{image_path_in_zip}' 在ZIP压缩包中未找到。原始链接将被保留。")
                    return match.group(0)
                except Exception as e_img:
                    # Best effort: one bad image must not abort the document.
                    print(f" 错误: 处理图片 '{image_path_in_zip}' 时发生错误: {e_img}。原始链接将被保留。")
                    return match.group(0)

            # Matches Markdown images of the form ![alt text](path); both groups
            # are non-greedy so adjacent images on one line are matched separately.
            image_regex = r"!\[(.*?)\]\((.*?)\)"
            modified_md_content = re.sub(image_regex, replace_image_with_base64, md_content_text)

            print("图片处理完成。")
            return modified_md_content

    except httpx.HTTPStatusError as e:
        print(f"HTTP 错误 (httpx): {e.response.status_code} - {e.request.url}")
        print(f"响应内容: {e.response.text[:200]}...")
        return None
    except httpx.RequestError as e:
        print(f"下载ZIP文件时发生错误 (httpx): {e}")
        return None
    except zipfile.BadZipFile:
        print("错误: 下载的文件不是一个有效的ZIP压缩文件或已损坏。")
        return None
    except UnicodeDecodeError:
        print(f"错误: 无法使用 '{encoding}' 编码解码文件 '{filename_in_zip}' 的内容。")
        print("请尝试其他编码,如 'gbk', 'latin1' 等,或确认文件本身的编码。")
        return None
    except Exception as e:
        print(f"发生未知错误: {e}")
        import traceback
        traceback.print_exc()  # print the full stack trace to aid debugging
        return None
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
pass
|
||||||
@@ -28,11 +28,11 @@
|
|||||||
}
|
}
|
||||||
|
|
||||||
.error-message {
|
.error-message {
|
||||||
color: #d32f2f;
|
color: #d32f2f; /* Pico invalid color */
|
||||||
}
|
}
|
||||||
|
|
||||||
.success-message {
|
.success-message {
|
||||||
color: #2e7d32;
|
color: #2e7d32; /* Pico valid color */
|
||||||
}
|
}
|
||||||
|
|
||||||
.form-group {
|
.form-group {
|
||||||
@@ -65,9 +65,15 @@
|
|||||||
.checkbox-group {
|
.checkbox-group {
|
||||||
display: flex;
|
display: flex;
|
||||||
flex-wrap: wrap;
|
flex-wrap: wrap;
|
||||||
|
gap: 1rem; /* Added gap for better spacing */
|
||||||
margin-bottom: 1rem;
|
margin-bottom: 1rem;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
.checkbox-group label { /* Ensure checkboxes are aligned */
|
||||||
|
margin-right: 10px;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
#resultArea {
|
#resultArea {
|
||||||
margin-top: 1.5rem;
|
margin-top: 1.5rem;
|
||||||
padding-top: 1rem;
|
padding-top: 1rem;
|
||||||
@@ -116,7 +122,6 @@
|
|||||||
display: none;
|
display: none;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Styles for drag and drop area */
|
|
||||||
#fileDropArea {
|
#fileDropArea {
|
||||||
border: 2px dashed #ccc;
|
border: 2px dashed #ccc;
|
||||||
padding: 20px;
|
padding: 20px;
|
||||||
@@ -126,16 +131,16 @@
|
|||||||
}
|
}
|
||||||
|
|
||||||
#fileDropArea.drag-over {
|
#fileDropArea.drag-over {
|
||||||
border-color: #1095c1; /* Pico primary color (定量替换 var(--pico-primary-focus)) */
|
border-color: #1095c1;
|
||||||
background-color: #e7f5fa; /* Pico primary background (定量替换 var(--pico-primary-background)) */
|
background-color: #e7f5fa;
|
||||||
}
|
}
|
||||||
|
|
||||||
#fileDropArea.file-selected {
|
#fileDropArea.file-selected {
|
||||||
border-color: #2e7d32; /* Pico success color (定量替换 var(--pico-form-element-valid-border-color, #2e7d32)) */
|
border-color: #2e7d32;
|
||||||
background-color: #e8f5e9; /* Light green (定量替换 var(--pico-form-element-valid-background-color, #e8f5e9)) */
|
background-color: #e8f5e9;
|
||||||
}
|
}
|
||||||
|
|
||||||
#fileDropArea p { /* General style for <p> inside drop area */
|
#fileDropArea p {
|
||||||
margin: 0.5rem 0;
|
margin: 0.5rem 0;
|
||||||
color: #555;
|
color: #555;
|
||||||
}
|
}
|
||||||
@@ -149,19 +154,18 @@
|
|||||||
#fileNameDisplay.has-file {
|
#fileNameDisplay.has-file {
|
||||||
font-style: normal;
|
font-style: normal;
|
||||||
font-weight: bold;
|
font-weight: bold;
|
||||||
color: #1a531d; /* Darker green or success color (定量替换 var(--pico-form-element-valid-border-color, #1a531d)) */
|
color: #1a531d;
|
||||||
}
|
}
|
||||||
|
|
||||||
#fileDropArea.input-error {
|
#fileDropArea.input-error, input.input-error, select.input-error { /* Extended to input/select */
|
||||||
border-color: #d32f2f !important; /* (定量替换 var(--pico-form-element-invalid-border-color, #d32f2f)) */
|
border-color: #d32f2f !important;
|
||||||
}
|
}
|
||||||
|
|
||||||
#fileNameDisplay.input-error-text {
|
#fileNameDisplay.input-error-text {
|
||||||
color: #d32f2f !important; /* (定量替换 var(--pico-form-element-invalid-border-color, #d32f2f)) */
|
color: #d32f2f !important;
|
||||||
font-weight: bold;
|
font-weight: bold;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@media (max-width: 768px) {
|
@media (max-width: 768px) {
|
||||||
.form-grid {
|
.form-grid {
|
||||||
grid-template-columns: 1fr;
|
grid-template-columns: 1fr;
|
||||||
@@ -176,7 +180,6 @@
|
|||||||
</h1>
|
</h1>
|
||||||
<form id="translateForm">
|
<form id="translateForm">
|
||||||
|
|
||||||
<!-- Modified File Input Area -->
|
|
||||||
<div class="form-group">
|
<div class="form-group">
|
||||||
<label for="file">文档选择</label>
|
<label for="file">文档选择</label>
|
||||||
<div id="fileDropArea">
|
<div id="fileDropArea">
|
||||||
@@ -206,17 +209,34 @@
|
|||||||
</select>
|
</select>
|
||||||
</div>
|
</div>
|
||||||
<div class="form-group">
|
<div class="form-group">
|
||||||
<label>高级选项</label>
|
<label>选项</label>
|
||||||
<div class="checkbox-group">
|
<div class="checkbox-group">
|
||||||
<label for="formula_ocr"><input type="checkbox" id="formula_ocr" name="formula_ocr">公式识别</label>
|
<label for="formula_ocr"><input type="checkbox" id="formula_ocr" name="formula_ocr" role="switch">公式识别</label>
|
||||||
<label for="code_ocr"><input type="checkbox" id="code_ocr" name="code_ocr">代码识别</label>
|
<label for="code_ocr"><input type="checkbox" id="code_ocr" name="code_ocr"
|
||||||
<label for="refine_markdown"><input type="checkbox" id="refine_markdown"
|
role="switch">代码识别</label>
|
||||||
name="refine_markdown">修正文本(耗时)</label>
|
<label for="refine_markdown"><input type="checkbox" id="refine_markdown" name="refine_markdown"
|
||||||
|
role="switch">修正文本(耗时)</label>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<details>
|
<details>
|
||||||
<summary>API 配置</summary>
|
<summary>文档转换引擎配置</summary>
|
||||||
|
<div class="form-group">
|
||||||
|
<label for="convert_engin">转换引擎</label>
|
||||||
|
<select id="convert_engin" name="convert_engin">
|
||||||
|
<option value="mineru" selected>Mineru</option>
|
||||||
|
<option value="docling">Docling</option>
|
||||||
|
</select>
|
||||||
|
</div>
|
||||||
|
<div class="form-group hidden" id="mineruTokenGroup">
|
||||||
|
<label for="mineru_token">Mineru Token</label>
|
||||||
|
<input type="password" id="mineru_token" name="mineru_token" placeholder="使用 Mineru 引擎时必须填写">
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>翻译API配置</summary>
|
||||||
<div class="form-grid">
|
<div class="form-grid">
|
||||||
<div class="form-group">
|
<div class="form-group">
|
||||||
<label for="platform_select">AI 平台</label>
|
<label for="platform_select">AI 平台</label>
|
||||||
@@ -225,8 +245,7 @@
|
|||||||
<option value="https://api.openai.com/v1">OpenAI</option>
|
<option value="https://api.openai.com/v1">OpenAI</option>
|
||||||
<option value="https://open.bigmodel.cn/api/paas/v4">智谱AI</option>
|
<option value="https://open.bigmodel.cn/api/paas/v4">智谱AI</option>
|
||||||
<option value="https://api.deepseek.com/v1">DeepSeek</option>
|
<option value="https://api.deepseek.com/v1">DeepSeek</option>
|
||||||
<option value="https://dashscope.aliyuncs.com/compatible-mode/v1">阿里云百炼
|
<option value="https://dashscope.aliyuncs.com/compatible-mode/v1">阿里云百炼</option>
|
||||||
</option>
|
|
||||||
<option value="https://www.dmxapi.cn/v1">DMXAPI</option>
|
<option value="https://www.dmxapi.cn/v1">DMXAPI</option>
|
||||||
<option value="https://openrouter.ai/api/v1">OpenRouter</option>
|
<option value="https://openrouter.ai/api/v1">OpenRouter</option>
|
||||||
<option value="https://ark.cn-beijing.volces.com/api/v3">火山引擎</option>
|
<option value="https://ark.cn-beijing.volces.com/api/v3">火山引擎</option>
|
||||||
@@ -235,14 +254,12 @@
|
|||||||
</div>
|
</div>
|
||||||
<div class="form-group hidden" id="baseUrlGroup">
|
<div class="form-group hidden" id="baseUrlGroup">
|
||||||
<label for="base_url">API 地址 (Base URL)</label>
|
<label for="base_url">API 地址 (Base URL)</label>
|
||||||
<input type="text" id="base_url" name="base_url"
|
<input type="text" id="base_url" name="base_url" placeholder="https://api.openai.com/v1">
|
||||||
placeholder="https://api.openai.com/v1">
|
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<div class="form-group">
|
<div class="form-group">
|
||||||
<label for="apikey">API 密钥</label>
|
<label for="apikey">API 密钥</label>
|
||||||
<input type="password" id="apikey" name="apikey" placeholder="平台对应的API Key"
|
<input type="password" id="apikey" name="apikey" placeholder="平台对应的API Key" required>
|
||||||
required>
|
|
||||||
</div>
|
</div>
|
||||||
<div class="form-group">
|
<div class="form-group">
|
||||||
<label for="model_id">模型 ID</label>
|
<label for="model_id">模型 ID</label>
|
||||||
@@ -267,7 +284,7 @@
|
|||||||
</main>
|
</main>
|
||||||
<div id="previewModal" class="modal">
|
<div id="previewModal" class="modal">
|
||||||
<div class="modal-content">
|
<div class="modal-content">
|
||||||
<span id="closeModalBtn" style="cursor:pointer; float:right;">×</span>
|
<span id="closeModalBtn" style="cursor:pointer; float:right; font-size: 1.5rem; line-height: 1;">×</span>
|
||||||
<h3>HTML 预览</h3>
|
<h3>HTML 预览</h3>
|
||||||
<iframe id="previewFrame"></iframe>
|
<iframe id="previewFrame"></iframe>
|
||||||
<div class="button-group">
|
<div class="button-group">
|
||||||
@@ -277,7 +294,6 @@
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<iframe id="printFrame" style="display:none;"></iframe>
|
<iframe id="printFrame" style="display:none;"></iframe>
|
||||||
|
|
||||||
<script>
|
<script>
|
||||||
const platformSelect = document.getElementById('platform_select');
|
const platformSelect = document.getElementById('platform_select');
|
||||||
const baseUrlGroup = document.getElementById('baseUrlGroup');
|
const baseUrlGroup = document.getElementById('baseUrlGroup');
|
||||||
@@ -288,6 +304,11 @@
|
|||||||
const formulaCheckbox = document.getElementById('formula_ocr');
|
const formulaCheckbox = document.getElementById('formula_ocr');
|
||||||
const codeCheckbox = document.getElementById('code_ocr');
|
const codeCheckbox = document.getElementById('code_ocr');
|
||||||
const refineCheckbox = document.getElementById('refine_markdown');
|
const refineCheckbox = document.getElementById('refine_markdown');
|
||||||
|
|
||||||
|
const convertEnginSelect = document.getElementById('convert_engin');
|
||||||
|
const mineruTokenGroup = document.getElementById('mineruTokenGroup');
|
||||||
|
const mineruTokenInput = document.getElementById('mineru_token');
|
||||||
|
|
||||||
const form = document.getElementById('translateForm');
|
const form = document.getElementById('translateForm');
|
||||||
const submitButton = document.getElementById('submitButton');
|
const submitButton = document.getElementById('submitButton');
|
||||||
const logArea = document.getElementById('logArea');
|
const logArea = document.getElementById('logArea');
|
||||||
@@ -311,7 +332,6 @@
|
|||||||
|
|
||||||
let logPollIntervalId = null;
|
let logPollIntervalId = null;
|
||||||
let statusPollIntervalId = null;
|
let statusPollIntervalId = null;
|
||||||
// let lastLogCount = 0; // No longer needed for fetching logs
|
|
||||||
let isTranslating = false;
|
let isTranslating = false;
|
||||||
|
|
||||||
function saveToStorage(key, value) {
|
function saveToStorage(key, value) {
|
||||||
@@ -347,7 +367,34 @@
|
|||||||
saveToStorage('translator_last_platform', selectedPlatformValue);
|
saveToStorage('translator_last_platform', selectedPlatformValue);
|
||||||
}
|
}
|
||||||
|
|
||||||
loadSettings();
|
function updateConvertEnginUI() {
|
||||||
|
const selectedEngin = convertEnginSelect.value;
|
||||||
|
if (selectedEngin === 'mineru') {
|
||||||
|
mineruTokenGroup.classList.remove('hidden');
|
||||||
|
mineruTokenInput.required = true;
|
||||||
|
mineruTokenInput.value = getFromStorage('translator_mineru_token');
|
||||||
|
} else {
|
||||||
|
mineruTokenGroup.classList.add('hidden');
|
||||||
|
mineruTokenInput.required = false;
|
||||||
|
// Optionally clear if not needed: mineruTokenInput.value = '';
|
||||||
|
}
|
||||||
|
saveToStorage('translator_convert_engin', selectedEngin);
|
||||||
|
}
|
||||||
|
|
||||||
|
function loadSettings() {
|
||||||
|
platformSelect.value = getFromStorage('translator_last_platform', 'custom');
|
||||||
|
updatePlatformUI();
|
||||||
|
|
||||||
|
convertEnginSelect.value = getFromStorage('translator_convert_engin', 'mineru');
|
||||||
|
updateConvertEnginUI(); // Must be after setting convertEnginSelect.value
|
||||||
|
|
||||||
|
toLangSelect.value = getFromStorage('translator_to_lang', '中文');
|
||||||
|
formulaCheckbox.checked = getFromStorage('translator_formula_ocr') === 'true';
|
||||||
|
codeCheckbox.checked = getFromStorage('translator_code_ocr') === 'true';
|
||||||
|
refineCheckbox.checked = getFromStorage('translator_refine_markdown') === 'true';
|
||||||
|
}
|
||||||
|
|
||||||
|
loadSettings(); // Initial load
|
||||||
|
|
||||||
platformSelect.addEventListener('change', updatePlatformUI);
|
platformSelect.addEventListener('change', updatePlatformUI);
|
||||||
apikeyInput.addEventListener('input', (e) => saveToStorage(`translator_platform_${platformSelect.value}_apikey`, e.target.value));
|
apikeyInput.addEventListener('input', (e) => saveToStorage(`translator_platform_${platformSelect.value}_apikey`, e.target.value));
|
||||||
@@ -355,10 +402,14 @@
|
|||||||
baseUrlInput.addEventListener('input', (e) => {
|
baseUrlInput.addEventListener('input', (e) => {
|
||||||
if (platformSelect.value === 'custom') saveToStorage('translator_platform_custom_base_url', e.target.value);
|
if (platformSelect.value === 'custom') saveToStorage('translator_platform_custom_base_url', e.target.value);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
convertEnginSelect.addEventListener('change', updateConvertEnginUI);
|
||||||
|
mineruTokenInput.addEventListener('input', (e) => saveToStorage('translator_mineru_token', e.target.value));
|
||||||
|
|
||||||
toLangSelect.addEventListener('change', e => saveToStorage('translator_to_lang', e.target.value));
|
toLangSelect.addEventListener('change', e => saveToStorage('translator_to_lang', e.target.value));
|
||||||
formulaCheckbox.addEventListener('change', e => saveToStorage('translator_formula_ocr', e.target.checked));
|
formulaCheckbox.addEventListener('change', e => saveToStorage('translator_formula_ocr', e.target.checked.toString()));
|
||||||
codeCheckbox.addEventListener('change', e => saveToStorage('translator_code_ocr', e.target.checked));
|
codeCheckbox.addEventListener('change', e => saveToStorage('translator_code_ocr', e.target.checked.toString()));
|
||||||
refineCheckbox.addEventListener('change', e => saveToStorage('translator_refine_markdown', e.target.checked));
|
refineCheckbox.addEventListener('change', e => saveToStorage('translator_refine_markdown', e.target.checked.toString()));
|
||||||
|
|
||||||
[closeModalButton, closePreviewBtn].forEach(elem => elem.addEventListener('click', () => modal.style.display = 'none'));
|
[closeModalButton, closePreviewBtn].forEach(elem => elem.addEventListener('click', () => modal.style.display = 'none'));
|
||||||
window.addEventListener('click', (event) => {
|
window.addEventListener('click', (event) => {
|
||||||
@@ -374,9 +425,7 @@
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
fileDropArea.addEventListener('click', () => {
|
fileDropArea.addEventListener('click', () => fileInput.click());
|
||||||
fileInput.click();
|
|
||||||
});
|
|
||||||
|
|
||||||
fileInput.addEventListener('change', () => {
|
fileInput.addEventListener('change', () => {
|
||||||
if (fileInput.files.length > 0) {
|
if (fileInput.files.length > 0) {
|
||||||
@@ -422,7 +471,6 @@
|
|||||||
fileDropArea.addEventListener('drop', (e) => {
|
fileDropArea.addEventListener('drop', (e) => {
|
||||||
const dt = e.dataTransfer;
|
const dt = e.dataTransfer;
|
||||||
const files = dt.files;
|
const files = dt.files;
|
||||||
|
|
||||||
if (files.length > 0) {
|
if (files.length > 0) {
|
||||||
fileInput.files = files;
|
fileInput.files = files;
|
||||||
const event = new Event('change', {bubbles: true});
|
const event = new Event('change', {bubbles: true});
|
||||||
@@ -432,8 +480,7 @@
|
|||||||
|
|
||||||
async function pollLogs() {
|
async function pollLogs() {
|
||||||
try {
|
try {
|
||||||
// const response = await fetch(`/get-logs?since=${lastLogCount}`); // OLD
|
const response = await fetch('/get-logs');
|
||||||
const response = await fetch('/get-logs'); // NEW: No 'since' parameter
|
|
||||||
if (!response.ok) {
|
if (!response.ok) {
|
||||||
console.warn(`Log polling failed: ${response.status}`);
|
console.warn(`Log polling failed: ${response.status}`);
|
||||||
return;
|
return;
|
||||||
@@ -444,9 +491,8 @@
|
|||||||
const escapedLog = log.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;");
|
const escapedLog = log.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;");
|
||||||
logArea.innerHTML += escapedLog + "<br>";
|
logArea.innerHTML += escapedLog + "<br>";
|
||||||
});
|
});
|
||||||
logArea.scrollTop = logArea.scrollHeight; // Scroll to bottom
|
logArea.scrollTop = logArea.scrollHeight;
|
||||||
}
|
}
|
||||||
// lastLogCount = data.total_count; // OLD: No longer tracking count this way
|
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.warn("Error polling logs:", error);
|
console.warn("Error polling logs:", error);
|
||||||
}
|
}
|
||||||
@@ -561,7 +607,7 @@
|
|||||||
} else {
|
} else {
|
||||||
submitButton.textContent = '取消翻译';
|
submitButton.textContent = '取消翻译';
|
||||||
submitButton.classList.remove('primary');
|
submitButton.classList.remove('primary');
|
||||||
submitButton.classList.add('secondary');
|
submitButton.classList.add('secondary', 'contrast'); // Using contrast for cancel
|
||||||
isTranslating = true;
|
isTranslating = true;
|
||||||
submitButton.disabled = false;
|
submitButton.disabled = false;
|
||||||
submitButton.removeAttribute('aria-busy');
|
submitButton.removeAttribute('aria-busy');
|
||||||
@@ -576,10 +622,9 @@
|
|||||||
|
|
||||||
function startPolling() {
|
function startPolling() {
|
||||||
stopPolling();
|
stopPolling();
|
||||||
// lastLogCount = 0; // No longer needed
|
logArea.innerHTML = '';
|
||||||
logArea.innerHTML = ''; // Clear log area for new task
|
pollLogs();
|
||||||
pollLogs(); // Initial poll for logs
|
pollStatus();
|
||||||
pollStatus(); // Initial poll for status
|
|
||||||
logPollIntervalId = setInterval(pollLogs, 2000);
|
logPollIntervalId = setInterval(pollLogs, 2000);
|
||||||
statusPollIntervalId = setInterval(pollStatus, 1500);
|
statusPollIntervalId = setInterval(pollStatus, 1500);
|
||||||
}
|
}
|
||||||
@@ -589,16 +634,7 @@
|
|||||||
if (statusPollIntervalId) clearInterval(statusPollIntervalId);
|
if (statusPollIntervalId) clearInterval(statusPollIntervalId);
|
||||||
logPollIntervalId = null;
|
logPollIntervalId = null;
|
||||||
statusPollIntervalId = null;
|
statusPollIntervalId = null;
|
||||||
setTimeout(pollLogs, 500);
|
setTimeout(pollLogs, 500); // One last poll for logs
|
||||||
}
|
|
||||||
|
|
||||||
function loadSettings() {
|
|
||||||
platformSelect.value = getFromStorage('translator_last_platform', 'custom');
|
|
||||||
updatePlatformUI();
|
|
||||||
toLangSelect.value = getFromStorage('translator_to_lang', '中文');
|
|
||||||
formulaCheckbox.checked = getFromStorage('translator_formula_ocr') === 'true';
|
|
||||||
codeCheckbox.checked = getFromStorage('translator_code_ocr') === 'true';
|
|
||||||
refineCheckbox.checked = getFromStorage('translator_refine_markdown') === 'true';
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async function cancelTranslation() {
|
async function cancelTranslation() {
|
||||||
@@ -609,13 +645,13 @@
|
|||||||
try {
|
try {
|
||||||
const response = await fetch('/cancel-translate', {method: 'POST'});
|
const response = await fetch('/cancel-translate', {method: 'POST'});
|
||||||
const result = await response.json();
|
const result = await response.json();
|
||||||
|
|
||||||
if (response.ok && result.cancelled) {
|
if (response.ok && result.cancelled) {
|
||||||
statusMsg.textContent = result.message || '取消请求已发送。';
|
statusMsg.textContent = result.message || '取消请求已发送。';
|
||||||
statusMsg.className = '';
|
statusMsg.className = ''; // Clear error class
|
||||||
} else {
|
} else {
|
||||||
statusMsg.textContent = result.message || '取消失败。';
|
statusMsg.textContent = result.message || '取消失败。';
|
||||||
statusMsg.className = 'error-message';
|
statusMsg.className = 'error-message';
|
||||||
|
// Re-enable button if cancellation failed to register server-side
|
||||||
submitButton.disabled = false;
|
submitButton.disabled = false;
|
||||||
submitButton.textContent = '取消翻译';
|
submitButton.textContent = '取消翻译';
|
||||||
submitButton.removeAttribute('aria-busy');
|
submitButton.removeAttribute('aria-busy');
|
||||||
@@ -628,6 +664,7 @@
|
|||||||
submitButton.textContent = '取消翻译';
|
submitButton.textContent = '取消翻译';
|
||||||
submitButton.removeAttribute('aria-busy');
|
submitButton.removeAttribute('aria-busy');
|
||||||
}
|
}
|
||||||
|
// Status poller will eventually update the button state correctly
|
||||||
}
|
}
|
||||||
|
|
||||||
form.addEventListener('submit', async function (event) {
|
form.addEventListener('submit', async function (event) {
|
||||||
@@ -638,6 +675,10 @@
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Clear previous input errors
|
||||||
|
[fileDropArea, mineruTokenInput].forEach(el => el.classList.remove('input-error'));
|
||||||
|
fileNameDisplay.classList.remove('input-error-text');
|
||||||
|
|
||||||
if (fileInput.files.length === 0) {
|
if (fileInput.files.length === 0) {
|
||||||
statusMsg.textContent = '请选择一个文件进行翻译。';
|
statusMsg.textContent = '请选择一个文件进行翻译。';
|
||||||
statusMsg.className = 'error-message';
|
statusMsg.className = 'error-message';
|
||||||
@@ -648,14 +689,18 @@
|
|||||||
setTimeout(() => {
|
setTimeout(() => {
|
||||||
fileDropArea.classList.remove('input-error');
|
fileDropArea.classList.remove('input-error');
|
||||||
fileNameDisplay.classList.remove('input-error-text');
|
fileNameDisplay.classList.remove('input-error-text');
|
||||||
if (fileNameDisplay.textContent === '请选择文件!') {
|
if (fileNameDisplay.textContent === '请选择文件!') fileNameDisplay.textContent = '未选择文件';
|
||||||
fileNameDisplay.textContent = '未选择文件';
|
if (fileInput.files.length === 0) fileDropPrompt.classList.remove('hidden');
|
||||||
}
|
}, 3000);
|
||||||
if (fileInput.files.length === 0) {
|
return;
|
||||||
fileDropPrompt.classList.remove('hidden');
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}, 3000);
|
if (convertEnginSelect.value === 'mineru' && !mineruTokenInput.value.trim()) {
|
||||||
|
statusMsg.textContent = '使用 Mineru 引擎时,必须填写 Mineru Token。';
|
||||||
|
statusMsg.className = 'error-message';
|
||||||
|
mineruTokenInput.classList.add('input-error');
|
||||||
|
mineruTokenInput.focus();
|
||||||
|
setTimeout(() => mineruTokenInput.classList.remove('input-error'), 3000);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -667,9 +712,10 @@
|
|||||||
statusMsg.textContent = '正在提交任务...';
|
statusMsg.textContent = '正在提交任务...';
|
||||||
statusMsg.className = '';
|
statusMsg.className = '';
|
||||||
downloadBtns.style.display = 'none';
|
downloadBtns.style.display = 'none';
|
||||||
// lastLogCount = 0; // No longer needed
|
|
||||||
|
|
||||||
const formData = new FormData(form);
|
const formData = new FormData(form);
|
||||||
|
// FormData automatically includes convert_engin and mineru_token due to 'name' attributes
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const response = await fetch('/translate', {method: 'POST', body: formData});
|
const response = await fetch('/translate', {method: 'POST', body: formData});
|
||||||
const result = await response.json();
|
const result = await response.json();
|
||||||
@@ -678,8 +724,9 @@
|
|||||||
statusMsg.className = '';
|
statusMsg.className = '';
|
||||||
submitButton.textContent = '取消翻译';
|
submitButton.textContent = '取消翻译';
|
||||||
submitButton.classList.remove('primary');
|
submitButton.classList.remove('primary');
|
||||||
submitButton.classList.add('secondary');
|
submitButton.classList.add('secondary', 'contrast');
|
||||||
isTranslating = true;
|
isTranslating = true;
|
||||||
|
submitButton.disabled = false; // Enable cancel button
|
||||||
submitButton.removeAttribute('aria-busy');
|
submitButton.removeAttribute('aria-busy');
|
||||||
startPolling();
|
startPolling();
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
@@ -1,24 +1,25 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
from io import BytesIO
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Literal
|
from typing import Literal
|
||||||
|
|
||||||
import markdown2
|
import markdown2
|
||||||
from docling.datamodel.document import DocumentStream
|
|
||||||
|
|
||||||
from docutranslate.agents import Agent, AgentArgs
|
from docutranslate.agents import Agent, AgentArgs
|
||||||
from docutranslate.agents import MDRefineAgent, MDTranslateAgent
|
from docutranslate.agents import MDRefineAgent, MDTranslateAgent
|
||||||
from docutranslate.utils.convert import file2markdown_embed_images
|
from docutranslate.converter import Document, ConverterDocling, ConverterMineru
|
||||||
from docutranslate.utils.markdown_splitter import split_markdown_text, join_markdown_texts
|
from docutranslate.utils.markdown_splitter import split_markdown_text, join_markdown_texts
|
||||||
from docutranslate.utils.markdown_utils import uris2placeholder, placeholder2_uris, MaskDict
|
from docutranslate.utils.markdown_utils import uris2placeholder, placeholder2_uris, MaskDict
|
||||||
from docutranslate.logger import translater_logger
|
from docutranslate.logger import translater_logger
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class FileTranslater:
|
class FileTranslater:
|
||||||
def __init__(self, file_path: Path | str | None = None, chunksize: int = 2000, base_url="", key=None,
|
def __init__(self, file_path: Path | str | None = None, chunksize: int = 2000,
|
||||||
model_id="", temperature=0.7, max_concurrent=20, docling_artifact: Path | str | None = None,
|
base_url="", key=None, model_id="", temperature=0.7,
|
||||||
timeout=2000, tips=True):
|
max_concurrent=20, timeout=2000,
|
||||||
|
convert_engin: Literal["docling", "mineru"] = "docling",
|
||||||
|
docling_artifact: Path | str | None = None,
|
||||||
|
mineru_token: str = None,
|
||||||
|
tips=True):
|
||||||
|
self.convert_engin = convert_engin
|
||||||
|
self.mineru_token = mineru_token.strip() if mineru_token is not None else None
|
||||||
if isinstance(file_path, str):
|
if isinstance(file_path, str):
|
||||||
file_path = Path(file_path)
|
file_path = Path(file_path)
|
||||||
self.file_path: Path = file_path
|
self.file_path: Path = file_path
|
||||||
@@ -33,9 +34,9 @@ class FileTranslater:
|
|||||||
self.temperature = temperature
|
self.temperature = temperature
|
||||||
self.docling_artifact = docling_artifact
|
self.docling_artifact = docling_artifact
|
||||||
if docling_artifact is None:
|
if docling_artifact is None:
|
||||||
artifact_path=Path("./artifact")
|
artifact_path = Path("./docling_artifact")
|
||||||
if artifact_path.is_dir():
|
if artifact_path.is_dir():
|
||||||
translater_logger.info("检测到artifact文件夹")
|
translater_logger.info("检测到docling_artifact文件夹")
|
||||||
self.docling_artifact = artifact_path
|
self.docling_artifact = artifact_path
|
||||||
self.timeout = timeout
|
self.timeout = timeout
|
||||||
if tips:
|
if tips:
|
||||||
@@ -66,7 +67,7 @@ class FileTranslater:
|
|||||||
translater_logger.info(f"markdown分为{len(chunks)}块")
|
translater_logger.info(f"markdown分为{len(chunks)}块")
|
||||||
return chunks
|
return chunks
|
||||||
|
|
||||||
def default_agent_params(self) -> AgentArgs:
|
def _default_agent_params(self) -> AgentArgs:
|
||||||
result: AgentArgs = {
|
result: AgentArgs = {
|
||||||
"baseurl": self.base_url,
|
"baseurl": self.base_url,
|
||||||
"key": self.key,
|
"key": self.key,
|
||||||
@@ -77,16 +78,48 @@ class FileTranslater:
|
|||||||
}
|
}
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
def _convert2markdown(self, document: Document, formula: bool, code: bool, artifact: Path | None = None) -> str:
    """Convert ``document`` to Markdown using the configured conversion engine.

    When ``self.convert_engin`` is ``"docling"`` a ``ConverterDocling`` is
    used; any other value selects ``ConverterMineru``.

    Args:
        document: The document to convert.
        formula: Passed to the converter as its ``formula`` option.
        code: Passed to ``ConverterDocling`` as its ``code`` option. It is not
            passed to ``ConverterMineru``; a notice is logged instead.
        artifact: Artifact path for the docling engine. Falls back to
            ``self.docling_artifact`` when ``None``. Unused by the mineru engine.

    Returns:
        Whatever the selected converter's ``convert`` returns (annotated ``str``).

    Raises:
        ValueError: If the mineru engine is selected and no token is configured.
    """
    translater_logger.info(f"正在使用{self.convert_engin}转换文件为markdown")
    if self.convert_engin == "docling":
        if artifact is None:
            artifact = self.docling_artifact
        mdconverter = ConverterDocling(formula=formula, code=code, artifact=artifact)
    else:
        # __init__ strips the token, so a blank input arrives here as "".
        # Reject it together with None instead of passing it to the converter.
        # ValueError subclasses Exception, so existing handlers still catch it.
        if not self.mineru_token:
            raise ValueError("mineru_token未配置")
        if code:
            translater_logger.info("mineru暂不支持code识别")
        mdconverter = ConverterMineru(token=self.mineru_token, formula=formula)
    return mdconverter.convert(document)
|
||||||
|
|
||||||
|
async def _convert2markdown_async(self, document: Document, formula: bool, code: bool,
                                  artifact: Path | None = None) -> str:
    """Asynchronously convert ``document`` to Markdown using the configured engine.

    When ``self.convert_engin`` is ``"docling"`` a ``ConverterDocling`` is
    used; any other value selects ``ConverterMineru``. The selected
    converter's ``convert_async`` is awaited.

    Args:
        document: The document to convert.
        formula: Passed to the converter as its ``formula`` option.
        code: Passed to ``ConverterDocling`` as its ``code`` option. It is not
            passed to ``ConverterMineru``; a notice is logged instead.
        artifact: Artifact path for the docling engine. Falls back to
            ``self.docling_artifact`` when ``None``. Unused by the mineru engine.

    Returns:
        Whatever the selected converter's ``convert_async`` returns
        (annotated ``str``).

    Raises:
        ValueError: If the mineru engine is selected and no token is configured.
    """
    # Log the engine in use, matching the synchronous _convert2markdown.
    translater_logger.info(f"正在使用{self.convert_engin}转换文件为markdown")
    if self.convert_engin == "docling":
        if artifact is None:
            artifact = self.docling_artifact
        mdconverter = ConverterDocling(formula=formula, code=code, artifact=artifact)
    else:
        # __init__ strips the token, so a blank input arrives here as "".
        # Reject it together with None instead of passing it to the converter.
        # ValueError subclasses Exception, so existing handlers still catch it.
        if not self.mineru_token:
            raise ValueError("mineru_token未配置")
        if code:
            translater_logger.info("mineru暂不支持code识别")
        mdconverter = ConverterMineru(token=self.mineru_token, formula=formula)
    return await mdconverter.convert_async(document)
|
||||||
|
|
||||||
def read_bytes(self, name: str, file: bytes, formula=True, code=True, save=False,
|
def read_bytes(self, name: str, file: bytes, formula=True, code=True, save=False,
|
||||||
save_format: Literal["markdown", "html"] = "markdown", refine=False,
|
save_format: Literal["markdown", "html"] = "markdown", refine=False,
|
||||||
refine_agent: Agent | None = None):
|
refine_agent: Agent | None = None):
|
||||||
ds = DocumentStream(name=name, stream=BytesIO(file))
|
document = Document(filename=name, filebytes=file)
|
||||||
file_path = Path(name)
|
file_path = Path(name)
|
||||||
# 如果是markdown,直接读取
|
# 如果是markdown,直接读取
|
||||||
if file_path.suffix == ".md":
|
if file_path.suffix == ".md":
|
||||||
self.markdown = file.decode()
|
self.markdown = file.decode()
|
||||||
else:
|
else:
|
||||||
self.markdown = file2markdown_embed_images(ds, formula, code, artifacts_path=self.docling_artifact)
|
self.markdown = self._convert2markdown(document, formula=formula, code=code, artifact=self.docling_artifact)
|
||||||
if refine:
|
if refine:
|
||||||
self.refine_markdown_by_agent(refine_agent)
|
self.refine_markdown_by_agent(refine_agent)
|
||||||
if save:
|
if save:
|
||||||
@@ -96,6 +129,26 @@ class FileTranslater:
|
|||||||
self.save_as_markdown(filename=f"{file_path.stem}.md")
|
self.save_as_markdown(filename=f"{file_path.stem}.md")
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
async def read_bytes_async(self, name: str, file: bytes, formula=True, code=True, save=False,
|
||||||
|
save_format: Literal["markdown", "html"] = "markdown", refine=False,
|
||||||
|
refine_agent: Agent | None = None):
|
||||||
|
document = Document(filename=name, filebytes=file)
|
||||||
|
file_path = Path(name)
|
||||||
|
# 如果是markdown,直接读取
|
||||||
|
if file_path.suffix == ".md":
|
||||||
|
self.markdown = file.decode()
|
||||||
|
else:
|
||||||
|
self.markdown = await self._convert2markdown_async(document, formula=formula, code=code,
|
||||||
|
artifact=self.docling_artifact)
|
||||||
|
if refine:
|
||||||
|
await self.refine_markdown_by_agent_async(refine_agent)
|
||||||
|
if save:
|
||||||
|
if save_format == "html":
|
||||||
|
self.save_as_html(filename=f"{file_path.stem}.html")
|
||||||
|
else:
|
||||||
|
self.save_as_markdown(filename=f"{file_path.stem}.md")
|
||||||
|
return self
|
||||||
|
|
||||||
def read_file(self, file_path: Path | str | None = None, formula=True, code=True, save=False,
|
def read_file(self, file_path: Path | str | None = None, formula=True, code=True, save=False,
|
||||||
save_format: Literal["markdown", "html"] = "markdown", refine=False,
|
save_format: Literal["markdown", "html"] = "markdown", refine=False,
|
||||||
refine_agent: Agent | None = None):
|
refine_agent: Agent | None = None):
|
||||||
@@ -112,7 +165,8 @@ class FileTranslater:
|
|||||||
with open(file_path, "r") as f:
|
with open(file_path, "r") as f:
|
||||||
self.markdown = f.read()
|
self.markdown = f.read()
|
||||||
else:
|
else:
|
||||||
self.markdown = file2markdown_embed_images(file_path, formula, code, artifacts_path=self.docling_artifact)
|
document = Document(file_path)
|
||||||
|
self.markdown = self._convert2markdown(document, formula=formula, code=code, artifact=self.docling_artifact)
|
||||||
if refine:
|
if refine:
|
||||||
self.refine_markdown_by_agent(refine_agent)
|
self.refine_markdown_by_agent(refine_agent)
|
||||||
if save:
|
if save:
|
||||||
@@ -122,12 +176,40 @@ class FileTranslater:
|
|||||||
self.save_as_markdown(filename=f"{file_path.stem}.md")
|
self.save_as_markdown(filename=f"{file_path.stem}.md")
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
async def read_file_async(self, file_path: Path | str | None = None, formula=True, code=True, save=False,
|
||||||
|
save_format: Literal["markdown", "html"] = "markdown", refine=False,
|
||||||
|
refine_agent: Agent | None = None):
|
||||||
|
if file_path is None:
|
||||||
|
if self.file_path is None:
|
||||||
|
translater_logger.debug("未设置文件路径")
|
||||||
|
raise Exception("未设置文件路径")
|
||||||
|
file_path = self.file_path
|
||||||
|
if isinstance(file_path, str):
|
||||||
|
file_path = Path(file_path)
|
||||||
|
translater_logger.info(f"读取文件:{file_path.name}")
|
||||||
|
# 如果是markdown,直接读取
|
||||||
|
if file_path.suffix == ".md":
|
||||||
|
with open(file_path, "r") as f:
|
||||||
|
self.markdown = f.read()
|
||||||
|
else:
|
||||||
|
document = Document(file_path)
|
||||||
|
self.markdown = await self._convert2markdown_async(document, formula=formula, code=code,
|
||||||
|
artifact=self.docling_artifact)
|
||||||
|
if refine:
|
||||||
|
await self.refine_markdown_by_agent_async(refine_agent)
|
||||||
|
if save:
|
||||||
|
if save_format == "html":
|
||||||
|
self.save_as_html(filename=f"{file_path.stem}.html")
|
||||||
|
else:
|
||||||
|
self.save_as_markdown(filename=f"{file_path.stem}.md")
|
||||||
|
return self
|
||||||
|
|
||||||
def refine_markdown_by_agent(self, refine_agent: Agent | None = None) -> str:
|
def refine_markdown_by_agent(self, refine_agent: Agent | None = None) -> str:
|
||||||
translater_logger.info("正在修正markdown")
|
translater_logger.info("正在修正markdown")
|
||||||
self._mask_uris_in_markdown()
|
self._mask_uris_in_markdown()
|
||||||
chuncks = self._split_markdown_into_chunks()
|
chuncks = self._split_markdown_into_chunks()
|
||||||
if refine_agent is None:
|
if refine_agent is None:
|
||||||
refine_agent = MDRefineAgent(**self.default_agent_params())
|
refine_agent = MDRefineAgent(**self._default_agent_params())
|
||||||
result: list[str] = refine_agent.send_prompts(chuncks)
|
result: list[str] = refine_agent.send_prompts(chuncks)
|
||||||
self.markdown = join_markdown_texts(result)
|
self.markdown = join_markdown_texts(result)
|
||||||
self._unmask_uris_in_markdown()
|
self._unmask_uris_in_markdown()
|
||||||
@@ -139,20 +221,19 @@ class FileTranslater:
|
|||||||
self._mask_uris_in_markdown()
|
self._mask_uris_in_markdown()
|
||||||
chuncks = self._split_markdown_into_chunks()
|
chuncks = self._split_markdown_into_chunks()
|
||||||
if translate_agent is None:
|
if translate_agent is None:
|
||||||
translate_agent = MDTranslateAgent(to_lang=to_lang, **self.default_agent_params())
|
translate_agent = MDTranslateAgent(to_lang=to_lang, **self._default_agent_params())
|
||||||
result: list[str] = translate_agent.send_prompts(chuncks)
|
result: list[str] = translate_agent.send_prompts(chuncks)
|
||||||
self.markdown = join_markdown_texts(result)
|
self.markdown = join_markdown_texts(result)
|
||||||
self._unmask_uris_in_markdown()
|
self._unmask_uris_in_markdown()
|
||||||
translater_logger.info("翻译完成")
|
translater_logger.info("翻译完成")
|
||||||
return self.markdown
|
return self.markdown
|
||||||
|
|
||||||
|
|
||||||
async def refine_markdown_by_agent_async(self, refine_agent: Agent | None = None) -> str:
|
async def refine_markdown_by_agent_async(self, refine_agent: Agent | None = None) -> str:
|
||||||
translater_logger.info("正在修正markdown")
|
translater_logger.info("正在修正markdown")
|
||||||
self._mask_uris_in_markdown()
|
self._mask_uris_in_markdown()
|
||||||
chuncks = self._split_markdown_into_chunks()
|
chuncks = self._split_markdown_into_chunks()
|
||||||
if refine_agent is None:
|
if refine_agent is None:
|
||||||
refine_agent = MDRefineAgent(**self.default_agent_params())
|
refine_agent = MDRefineAgent(**self._default_agent_params())
|
||||||
result: list[str] = await refine_agent.send_prompts_async(chuncks)
|
result: list[str] = await refine_agent.send_prompts_async(chuncks)
|
||||||
self.markdown = join_markdown_texts(result)
|
self.markdown = join_markdown_texts(result)
|
||||||
self._unmask_uris_in_markdown()
|
self._unmask_uris_in_markdown()
|
||||||
@@ -164,7 +245,7 @@ class FileTranslater:
|
|||||||
self._mask_uris_in_markdown()
|
self._mask_uris_in_markdown()
|
||||||
chuncks = self._split_markdown_into_chunks()
|
chuncks = self._split_markdown_into_chunks()
|
||||||
if translate_agent is None:
|
if translate_agent is None:
|
||||||
translate_agent = MDTranslateAgent(to_lang=to_lang, **self.default_agent_params())
|
translate_agent = MDTranslateAgent(to_lang=to_lang, **self._default_agent_params())
|
||||||
result: list[str] = await translate_agent.send_prompts_async(chuncks)
|
result: list[str] = await translate_agent.send_prompts_async(chuncks)
|
||||||
self.markdown = join_markdown_texts(result)
|
self.markdown = join_markdown_texts(result)
|
||||||
self._unmask_uris_in_markdown()
|
self._unmask_uris_in_markdown()
|
||||||
@@ -217,6 +298,7 @@ class FileTranslater:
|
|||||||
|
|
||||||
def export_to_html(self, title="title") -> str:
|
def export_to_html(self, title="title") -> str:
|
||||||
markdowner = markdown2.Markdown(extras=['tables', 'fenced-code-blocks', 'mermaid', "code-friendly"])
|
markdowner = markdown2.Markdown(extras=['tables', 'fenced-code-blocks', 'mermaid', "code-friendly"])
|
||||||
|
# TODO:实现完全本地化css和js
|
||||||
# language=html
|
# language=html
|
||||||
html = f"""<!DOCTYPE html>
|
html = f"""<!DOCTYPE html>
|
||||||
<html lang="en">
|
<html lang="en">
|
||||||
@@ -291,6 +373,7 @@ class FileTranslater:
|
|||||||
filename = f"{file_path.stem}_{to_lang}.html"
|
filename = f"{file_path.stem}_{to_lang}.html"
|
||||||
self.save_as_html(filename=filename, output_dir=output_dir)
|
self.save_as_html(filename=filename, output_dir=output_dir)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
async def translate_file_async(self, file_path: Path | str | None = None, to_lang="中文", output_dir="./output",
|
async def translate_file_async(self, file_path: Path | str | None = None, to_lang="中文", output_dir="./output",
|
||||||
formula=True,
|
formula=True,
|
||||||
code=True, output_format: Literal["markdown", "html"] = "markdown", refine=False,
|
code=True, output_format: Literal["markdown", "html"] = "markdown", refine=False,
|
||||||
@@ -317,6 +400,7 @@ class FileTranslater:
|
|||||||
filename = f"{file_path.stem}_{to_lang}.html"
|
filename = f"{file_path.stem}_{to_lang}.html"
|
||||||
self.save_as_html(filename=filename, output_dir=output_dir)
|
self.save_as_html(filename=filename, output_dir=output_dir)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def translate_bytes(self, name: str, file: bytes, to_lang="中文", output_dir="./output",
|
def translate_bytes(self, name: str, file: bytes, to_lang="中文", output_dir="./output",
|
||||||
formula=True,
|
formula=True,
|
||||||
code=True, output_format: Literal["markdown", "html"] = "markdown", refine=False,
|
code=True, output_format: Literal["markdown", "html"] = "markdown", refine=False,
|
||||||
@@ -338,13 +422,8 @@ class FileTranslater:
|
|||||||
formula=True,
|
formula=True,
|
||||||
code=True, output_format: Literal["markdown", "html"] = "markdown", refine=False,
|
code=True, output_format: Literal["markdown", "html"] = "markdown", refine=False,
|
||||||
refine_agent: Agent | None = None, translate_agent: Agent | None = None, save=True):
|
refine_agent: Agent | None = None, translate_agent: Agent | None = None, save=True):
|
||||||
await asyncio.to_thread(
|
await self.read_bytes_async(name=name, file=file, formula=formula, code=code)
|
||||||
self.read_bytes,
|
|
||||||
name=name,
|
|
||||||
file=file,
|
|
||||||
formula=formula,
|
|
||||||
code=code
|
|
||||||
)
|
|
||||||
if refine:
|
if refine:
|
||||||
await self.refine_markdown_by_agent_async(refine_agent)
|
await self.refine_markdown_by_agent_async(refine_agent)
|
||||||
await self.translate_markdown_by_agent_async(translate_agent, to_lang=to_lang)
|
await self.translate_markdown_by_agent_async(translate_agent, to_lang=to_lang)
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[project]
|
[project]
|
||||||
name = "docutranslate"
|
name = "docutranslate"
|
||||||
version = "0.2.19"
|
version = "0.2.20"
|
||||||
description = "文件翻译工具"
|
description = "文件翻译工具"
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
requires-python = ">=3.10"
|
requires-python = ">=3.10"
|
||||||
|
|||||||
Reference in New Issue
Block a user