diff --git a/.idea/workspace.xml b/.idea/workspace.xml
index a3b8fbf..e6cf8ca 100644
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@@ -6,10 +6,8 @@
-
-
+
-
@@ -39,62 +37,62 @@
- {
+ "keyToString": {
+ "DefaultHtmlFileTemplate": "HTML File",
+ "JavaScript 调试.output.html (1).executor": "Run",
+ "JavaScript 调试.output.html.executor": "Run",
+ "JavaScript 调试.regex.md_中文.html.executor": "Run",
+ "JavaScript 调试.regex_中文.html.executor": "Run",
+ "JavaScript 调试.test.html.executor": "Run",
+ "JavaScript 调试.test2.html.executor": "Run",
+ "JavaScript 调试.test2_英文.html.executor": "Run",
+ "JavaScript 调试.test4-1_中文.html.executor": "Run",
+ "JavaScript 调试.互联网认证授权机制.html.executor": "Run",
+ "JavaScript 调试.互联网认证授权机制_英文.html.executor": "Run",
+ "JavaScript 调试.毕业论文_英文.html.executor": "Run",
+ "ModuleVcsDetector.initialDetectionPerformed": "true",
+ "Python 测试.Python 测试 (markdown_mask.py 内).executor": "Run",
+ "Python 测试.markdown_mask.Test.test_basic_link_masking 的 Python 测试.executor": "Run",
+ "Python 测试.pytest (test_html.py 内).executor": "Run",
+ "Python.2test2 (1).executor": "Run",
+ "Python.PDFtranslater (1).executor": "Run",
+ "Python.PDFtranslater (2).executor": "Run",
+ "Python.agent.executor": "Debug",
+ "Python.agent_utils.executor": "Run",
+ "Python.app (1).executor": "Run",
+ "Python.app.executor": "Run",
+ "Python.app2.executor": "Run",
+ "Python.app_test (1).executor": "Run",
+ "Python.convert.executor": "Run",
+ "Python.markdown_splitter.executor": "Debug",
+ "Python.markdown_utils.executor": "Run",
+ "Python.test.executor": "Run",
+ "Python.test1.executor": "Run",
+ "Python.test2.executor": "Run",
+ "Python.test3.executor": "Run",
+ "Python.test4.executor": "Run",
+ "Python.testhtml.executor": "Run",
+ "Python.translater.executor": "Run",
+ "Python.切分测试.executor": "Run",
+ "RunOnceActivity.ShowReadmeOnStart": "true",
+ "RunOnceActivity.TerminalTabsStorage.copyFrom.TerminalArrangementManager": "true",
+ "RunOnceActivity.git.unshallow": "true",
+ "git-widget-placeholder": "main",
+ "last_opened_file_path": "C:/Users/jxgm/Desktop/FileTranslate/dist/DocuTranslate",
+ "node.js.detected.package.eslint": "true",
+ "node.js.detected.package.tslint": "true",
+ "node.js.selected.package.eslint": "(autodetect)",
+ "node.js.selected.package.tslint": "(autodetect)",
+ "nodejs_package_manager_path": "npm",
+ "settings.editor.selected.configurable": "preferences.pluginManager",
+ "vue.rearranger.settings.migration": "true"
}
-}]]>
+}
-
+
@@ -562,7 +560,9 @@
-
+
+
+
@@ -570,11 +570,11 @@
-
+
-
+
@@ -582,9 +582,9 @@
-
-
+
+
diff --git a/docutranslate/app.py b/docutranslate/app.py
index d50141a..10e1fb4 100644
--- a/docutranslate/app.py
+++ b/docutranslate/app.py
@@ -2,8 +2,10 @@ import asyncio
import io
import logging
import time
+import urllib
from pathlib import Path
from typing import List, Dict, Any, Optional
+from urllib.parse import quote
import uvicorn
from fastapi import FastAPI, File, Form, UploadFile, Request, HTTPException
@@ -1061,7 +1063,7 @@ async def download_markdown(filename_with_ext: str):
return StreamingResponse(
io.StringIO(current_state["markdown_content"]),
media_type="text/markdown",
- headers={"Content-Disposition": f"attachment; filename=\"{actual_filename}\""}
+ headers={"Content-Disposition": f"attachment; filename*=UTF-8''{quote(actual_filename, safe='', encoding='utf-8')}"}
)
@@ -1079,7 +1081,7 @@ async def download_html(filename_with_ext: str):
return HTMLResponse(
content=current_state["html_content"],
media_type="text/html",
- headers={"Content-Disposition": f"attachment; filename=\"{actual_filename}\""}
+ headers={"Content-Disposition": f"attachment; filename*=UTF-8''{quote(actual_filename, safe='', encoding='utf-8')}"}
)
diff --git a/docutranslate/utils/convert.py b/docutranslate/utils/convert.py
index f3d23fb..6b2b633 100644
--- a/docutranslate/utils/convert.py
+++ b/docutranslate/utils/convert.py
@@ -1,32 +1,37 @@
import os
-from huggingface_hub.errors import LocalEntryNotFoundError
-from docling.datamodel.base_models import InputFormat
-from docling.datamodel.pipeline_options import PdfPipelineOptions, AcceleratorOptions, AcceleratorDevice
-from docling_core.types.doc import ImageRefMode
from pathlib import Path
-from docling.document_converter import DocumentConverter, PdfFormatOption
+
+from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import DocumentStream
+from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode
from docling.datamodel.settings import settings
+from docling.document_converter import DocumentConverter, PdfFormatOption
+from docling_core.types.doc import ImageRefMode
+from huggingface_hub.errors import LocalEntryNotFoundError
+
from docutranslate.logger import translater_logger
+
IMAGE_RESOLUTION_SCALE = 4
-def file2markdown_embed_images(file_path: Path | str|DocumentStream, formula=False, code=False,artifacts_path:Path|str|None=None) -> str:
+def file2markdown_embed_images(file_path: Path | str | DocumentStream, formula=False, code=False,
+ artifacts_path: Path | str | None = None) -> str:
translater_logger.info(f"正在将文档转换为markdown")
pipeline_options = PdfPipelineOptions(artifacts_path=artifacts_path)
- pipeline_options.do_ocr=False
+ pipeline_options.do_ocr = False
pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
pipeline_options.generate_picture_images = True
- pipeline_options.table_structure_options.do_cell_matching=False
+ # pipeline_options.table_structure_options.mode = TableFormerMode.FAST
+ pipeline_options.table_structure_options.do_cell_matching = False
if formula:
- pipeline_options.do_formula_enrichment=True
+ pipeline_options.do_formula_enrichment = True
if code:
- pipeline_options.do_code_enrichment=True
+ pipeline_options.do_code_enrichment = True
# pipeline_options.accelerator_options= AcceleratorOptions(
# num_threads=4, device=AcceleratorDevice.AUTO
# )
- #打印时间
- settings.debug.profile_pipeline_timings=True
+ # 打印时间
+ settings.debug.profile_pipeline_timings = True
converter = DocumentConverter(format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
@@ -38,10 +43,11 @@ def file2markdown_embed_images(file_path: Path | str|DocumentStream, formula=Fal
translater_logger.info(f"无法连接huggingface,正在尝试换源")
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
conversion_result = converter.convert(file_path)
- result=conversion_result.document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED)
+ result = conversion_result.document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED)
translater_logger.info(f"已转换为markdown")
translater_logger.info(f"pdf转换耗时: {conversion_result.timings["pipeline_total"].times}")
return result
+
if __name__ == '__main__':
- pass
\ No newline at end of file
+ pass
diff --git a/pyproject.toml b/pyproject.toml
index 70ab53a..89b1df8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "docutranslate"
-version = "0.2.16"
+version = "0.2.17"
description = "文件翻译工具"
readme = "README.md"
requires-python = ">=3.10"