diff --git a/.idea/workspace.xml b/.idea/workspace.xml index c1aad58..fd2a3c0 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -6,6 +6,8 @@ + + - { + "keyToString": { + "DefaultHtmlFileTemplate": "HTML File", + "JavaScript 调试.output.html (1).executor": "Run", + "JavaScript 调试.output.html.executor": "Run", + "JavaScript 调试.regex.md_中文.html.executor": "Run", + "JavaScript 调试.regex_中文.html.executor": "Run", + "JavaScript 调试.test2.html.executor": "Run", + "JavaScript 调试.test2_英文.html.executor": "Run", + "JavaScript 调试.test4-1_中文.html.executor": "Run", + "JavaScript 调试.互联网认证授权机制.html.executor": "Run", + "JavaScript 调试.互联网认证授权机制_英文.html.executor": "Run", + "JavaScript 调试.毕业论文_英文.html.executor": "Run", + "ModuleVcsDetector.initialDetectionPerformed": "true", + "Python 测试.Python 测试 (markdown_mask.py 内).executor": "Run", + "Python 测试.markdown_mask.Test.test_basic_link_masking 的 Python 测试.executor": "Run", + "Python.PDFtranslater (1).executor": "Run", + "Python.PDFtranslater (2).executor": "Run", + "Python.agent.executor": "Debug", + "Python.agent_utils.executor": "Run", + "Python.app (1).executor": "Run", + "Python.app.executor": "Run", + "Python.app2.executor": "Run", + "Python.app_test (1).executor": "Run", + "Python.convert.executor": "Run", + "Python.markdown_splitter.executor": "Debug", + "Python.markdown_utils.executor": "Run", + "Python.test.executor": "Run", + "Python.test1.executor": "Run", + "Python.test2.executor": "Run", + "Python.test3.executor": "Run", + "Python.test4.executor": "Run", + "Python.translater.executor": "Run", + "Python.切分测试.executor": "Run", + "RunOnceActivity.ShowReadmeOnStart": "true", + "RunOnceActivity.TerminalTabsStorage.copyFrom.TerminalArrangementManager": "true", + "RunOnceActivity.git.unshallow": "true", + "git-widget-placeholder": "main", + "last_opened_file_path": "C:/Users/jxgm/Desktop/FileTranslate/dist/DocuTranslate", + "node.js.detected.package.eslint": "true", + "node.js.detected.package.tslint": "true", + "node.js.selected.package.eslint": "(autodetect)", + "node.js.selected.package.tslint": "(autodetect)", + "nodejs_package_manager_path": "npm", + "settings.editor.selected.configurable": "preferences.pluginManager", + "vue.rearranger.settings.migration": "true" } -}]]> +} @@ -95,7 +97,7 @@ - + + - @@ -612,7 +614,9 @@ - + + + @@ -621,7 +625,7 @@ - + @@ -631,9 +635,9 @@ - - + + diff --git a/README.md b/README.md index 8977137..ee2ebda 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,11 @@ 文件翻译工具,借助[docling](https://github.com/docling-project/docling)与大语言模型实现多种格式文件的翻译 +# 整合包 + +对于只使用基本翻译功能的用户,可以在[github releases](https://github.com/xunbu/docutranslate/releases) +上下载最新的整合包,该整合包点击即用,您所需的只是获取某个ai平台的api-key。 + # 安装 使用pip @@ -17,12 +22,15 @@ # 支持的文件格式 -| 输入格式 | 输出格式 | -|------------|--------------| -| PDF(非扫描版) | Markdown(推荐) | -| Markdown | HTML | -| HTML、XHTML | | -| CSV | | +| 输入格式 | 输出格式 | +|----------------|--------------| +| PDF(非扫描版) | Markdown(推荐) | +| Markdown | HTML | +| HTML、XHTML | PDF(仅交互界面支持) | +| CSV | | +| DOC、DOCX(部分支持) | | + +> 如果想不使用交互界面获取pdf,可以先下载HTML文件,用浏览器打开并打印 # 前置条件 diff --git a/docutranslate/utils/convert.py b/docutranslate/utils/convert.py index 668f92f..74abec1 100644 --- a/docutranslate/utils/convert.py +++ b/docutranslate/utils/convert.py @@ -1,11 +1,13 @@ import os from huggingface_hub.errors import LocalEntryNotFoundError from docling.datamodel.base_models import InputFormat -from docling.datamodel.pipeline_options import PdfPipelineOptions +from docling.datamodel.pipeline_options import PdfPipelineOptions, AcceleratorOptions, AcceleratorDevice from docling_core.types.doc import ImageRefMode from pathlib import Path from docling.document_converter import DocumentConverter, PdfFormatOption from docling.datamodel.document import DocumentStream + +from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend from docutranslate.logger import translater_logger IMAGE_RESOLUTION_SCALE = 4 @@ -13,15 +15,19 @@ IMAGE_RESOLUTION_SCALE = 4 def file2markdown_embed_images(file_path: Path | str|DocumentStream, formula=False, code=False,artifacts_path:Path|str|None=None) -> str: translater_logger.info(f"正在将文档转换为markdown") pipeline_options = PdfPipelineOptions(artifacts_path=artifacts_path) - # pipeline_options.do_ocr=False + pipeline_options.do_ocr=False pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE pipeline_options.generate_picture_images = True if formula: pipeline_options.do_formula_enrichment=True if code: pipeline_options.do_code_enrichment=True + pipeline_options.accelerator_options= AcceleratorOptions( + num_threads=8, device=AcceleratorDevice.AUTO + ) converter = DocumentConverter(format_options={ InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options) + }) try: result = converter.convert(file_path).document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED) diff --git a/pyproject.toml b/pyproject.toml index c70c156..6b960f4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "docutranslate" -version = "0.2.6" +version = "0.2.6.post1" description = "文件翻译工具" readme = "README.md" requires-python = ">=3.10"