diff --git a/.idea/workspace.xml b/.idea/workspace.xml
index c1aad58..fd2a3c0 100644
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@@ -6,6 +6,8 @@
+
+
@@ -31,54 +33,54 @@
- {
+ "keyToString": {
+ "DefaultHtmlFileTemplate": "HTML File",
+ "JavaScript 调试.output.html (1).executor": "Run",
+ "JavaScript 调试.output.html.executor": "Run",
+ "JavaScript 调试.regex.md_中文.html.executor": "Run",
+ "JavaScript 调试.regex_中文.html.executor": "Run",
+ "JavaScript 调试.test2.html.executor": "Run",
+ "JavaScript 调试.test2_英文.html.executor": "Run",
+ "JavaScript 调试.test4-1_中文.html.executor": "Run",
+ "JavaScript 调试.互联网认证授权机制.html.executor": "Run",
+ "JavaScript 调试.互联网认证授权机制_英文.html.executor": "Run",
+ "JavaScript 调试.毕业论文_英文.html.executor": "Run",
+ "ModuleVcsDetector.initialDetectionPerformed": "true",
+ "Python 测试.Python 测试 (markdown_mask.py 内).executor": "Run",
+ "Python 测试.markdown_mask.Test.test_basic_link_masking 的 Python 测试.executor": "Run",
+ "Python.PDFtranslater (1).executor": "Run",
+ "Python.PDFtranslater (2).executor": "Run",
+ "Python.agent.executor": "Debug",
+ "Python.agent_utils.executor": "Run",
+ "Python.app (1).executor": "Run",
+ "Python.app.executor": "Run",
+ "Python.app2.executor": "Run",
+ "Python.app_test (1).executor": "Run",
+ "Python.convert.executor": "Run",
+ "Python.markdown_splitter.executor": "Debug",
+ "Python.markdown_utils.executor": "Run",
+ "Python.test.executor": "Run",
+ "Python.test1.executor": "Run",
+ "Python.test2.executor": "Run",
+ "Python.test3.executor": "Run",
+ "Python.test4.executor": "Run",
+ "Python.translater.executor": "Run",
+ "Python.切分测试.executor": "Run",
+ "RunOnceActivity.ShowReadmeOnStart": "true",
+ "RunOnceActivity.TerminalTabsStorage.copyFrom.TerminalArrangementManager": "true",
+ "RunOnceActivity.git.unshallow": "true",
+ "git-widget-placeholder": "main",
+ "last_opened_file_path": "C:/Users/jxgm/Desktop/FileTranslate/dist/DocuTranslate",
+ "node.js.detected.package.eslint": "true",
+ "node.js.detected.package.tslint": "true",
+ "node.js.selected.package.eslint": "(autodetect)",
+ "node.js.selected.package.tslint": "(autodetect)",
+ "nodejs_package_manager_path": "npm",
+ "settings.editor.selected.configurable": "preferences.pluginManager",
+ "vue.rearranger.settings.migration": "true"
}
-}]]>
+}
@@ -95,7 +97,7 @@
-
+
@@ -541,11 +543,11 @@
+
-
@@ -612,7 +614,9 @@
-
+
+
+
@@ -621,7 +625,7 @@
-
+
@@ -631,9 +635,9 @@
-
-
+
+
diff --git a/README.md b/README.md
index 8977137..ee2ebda 100644
--- a/README.md
+++ b/README.md
@@ -6,6 +6,11 @@
文件翻译工具,借助[docling](https://github.com/docling-project/docling)与大语言模型实现多种格式文件的翻译
+# 整合包
+
+对于只使用基本翻译功能的用户,可以在 [GitHub Releases](https://github.com/xunbu/docutranslate/releases)
+上下载最新的整合包。该整合包点击即用,您只需获取某个 AI 平台的 API Key。
+
# 安装
使用pip
@@ -17,12 +22,15 @@
# 支持的文件格式
-| 输入格式 | 输出格式 |
-|------------|--------------|
-| PDF(非扫描版) | Markdown(推荐) |
-| Markdown | HTML |
-| HTML、XHTML | |
-| CSV | |
+| 输入格式 | 输出格式 |
+|----------------|--------------|
+| PDF(非扫描版) | Markdown(推荐) |
+| Markdown | HTML |
+| HTML、XHTML | PDF(仅交互界面支持) |
+| CSV | |
+| DOC、DOCX(部分支持) | |
+
+> 如果想在不使用交互界面的情况下获取 PDF,可以先下载 HTML 文件,再用浏览器打开并打印为 PDF。
# 前置条件
diff --git a/docutranslate/utils/convert.py b/docutranslate/utils/convert.py
index 668f92f..74abec1 100644
--- a/docutranslate/utils/convert.py
+++ b/docutranslate/utils/convert.py
@@ -1,11 +1,13 @@
import os
from huggingface_hub.errors import LocalEntryNotFoundError
from docling.datamodel.base_models import InputFormat
-from docling.datamodel.pipeline_options import PdfPipelineOptions
+from docling.datamodel.pipeline_options import PdfPipelineOptions, AcceleratorOptions, AcceleratorDevice
from docling_core.types.doc import ImageRefMode
from pathlib import Path
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.document import DocumentStream
+
+from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
from docutranslate.logger import translater_logger
IMAGE_RESOLUTION_SCALE = 4
@@ -13,15 +15,19 @@ IMAGE_RESOLUTION_SCALE = 4
def file2markdown_embed_images(file_path: Path | str|DocumentStream, formula=False, code=False,artifacts_path:Path|str|None=None) -> str:
translater_logger.info(f"正在将文档转换为markdown")
pipeline_options = PdfPipelineOptions(artifacts_path=artifacts_path)
- # pipeline_options.do_ocr=False
+ pipeline_options.do_ocr=False
pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
pipeline_options.generate_picture_images = True
if formula:
pipeline_options.do_formula_enrichment=True
if code:
pipeline_options.do_code_enrichment=True
+ pipeline_options.accelerator_options= AcceleratorOptions(
+ num_threads=8, device=AcceleratorDevice.AUTO
+ )
converter = DocumentConverter(format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
+
})
try:
result = converter.convert(file_path).document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED)
diff --git a/pyproject.toml b/pyproject.toml
index c70c156..6b960f4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "docutranslate"
-version = "0.2.6"
+version = "0.2.6.post1"
description = "文件翻译工具"
readme = "README.md"
requires-python = ">=3.10"