small fix
This commit is contained in:
6
.idea/workspace.xml
generated
6
.idea/workspace.xml
generated
@@ -6,7 +6,9 @@
|
|||||||
<component name="ChangeListManager">
|
<component name="ChangeListManager">
|
||||||
<list default="true" id="6b18b44a-df57-4212-a857-9e291ebe5dd2" name="更改" comment="">
|
<list default="true" id="6b18b44a-df57-4212-a857-9e291ebe5dd2" name="更改" comment="">
|
||||||
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
|
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
|
||||||
|
<change beforePath="$PROJECT_DIR$/docutranslate/app.py" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/app.py" afterDir="false" />
|
||||||
<change beforePath="$PROJECT_DIR$/docutranslate/static/index.html" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/static/index.html" afterDir="false" />
|
<change beforePath="$PROJECT_DIR$/docutranslate/static/index.html" beforeDir="false" afterPath="$PROJECT_DIR$/docutranslate/static/index.html" afterDir="false" />
|
||||||
|
<change beforePath="$PROJECT_DIR$/docutranslate/utils/convert.py" beforeDir="false" />
|
||||||
<change beforePath="$PROJECT_DIR$/pyproject.toml" beforeDir="false" afterPath="$PROJECT_DIR$/pyproject.toml" afterDir="false" />
|
<change beforePath="$PROJECT_DIR$/pyproject.toml" beforeDir="false" afterPath="$PROJECT_DIR$/pyproject.toml" afterDir="false" />
|
||||||
</list>
|
</list>
|
||||||
<option name="SHOW_DIALOG" value="false" />
|
<option name="SHOW_DIALOG" value="false" />
|
||||||
@@ -602,7 +604,7 @@
|
|||||||
<workItem from="1747739438735" duration="826000" />
|
<workItem from="1747739438735" duration="826000" />
|
||||||
<workItem from="1747740341909" duration="145000" />
|
<workItem from="1747740341909" duration="145000" />
|
||||||
<workItem from="1747752718385" duration="81000" />
|
<workItem from="1747752718385" duration="81000" />
|
||||||
<workItem from="1747754618316" duration="14549000" />
|
<workItem from="1747754618316" duration="16566000" />
|
||||||
</task>
|
</task>
|
||||||
<servers />
|
<servers />
|
||||||
</component>
|
</component>
|
||||||
@@ -621,7 +623,7 @@
|
|||||||
</option>
|
</option>
|
||||||
</component>
|
</component>
|
||||||
<component name="com.intellij.coverage.CoverageDataManagerImpl">
|
<component name="com.intellij.coverage.CoverageDataManagerImpl">
|
||||||
<SUITE FILE_PATH="coverage/filetranslate$app_test__1_.coverage" NAME="app_test (1) 覆盖结果" MODIFIED="1747798850500" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
<SUITE FILE_PATH="coverage/filetranslate$app_test__1_.coverage" NAME="app_test (1) 覆盖结果" MODIFIED="1747802494815" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
||||||
<SUITE FILE_PATH="coverage/filetranslate$test.coverage" NAME="test 覆盖结果" MODIFIED="1747472297913" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
<SUITE FILE_PATH="coverage/filetranslate$test.coverage" NAME="test 覆盖结果" MODIFIED="1747472297913" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
|
||||||
<SUITE FILE_PATH="coverage/filetranslate$convert.coverage" NAME="convert 覆盖结果" MODIFIED="1746963490689" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/utils" />
|
<SUITE FILE_PATH="coverage/filetranslate$convert.coverage" NAME="convert 覆盖结果" MODIFIED="1746963490689" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/docutranslate/utils" />
|
||||||
<SUITE FILE_PATH="coverage/PDFtranslate$PDFtranslater__1_.coverage" NAME="PDFtranslater (1) 覆盖结果" MODIFIED="1746633258205" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages" />
|
<SUITE FILE_PATH="coverage/PDFtranslate$PDFtranslater__1_.coverage" NAME="PDFtranslater (1) 覆盖结果" MODIFIED="1746633258205" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/pdftranslate_packages" />
|
||||||
|
|||||||
@@ -177,20 +177,18 @@ async def _perform_translation(params: Dict[str, Any], file_contents: bytes, ori
|
|||||||
# --- API Endpoints ---
|
# --- API Endpoints ---
|
||||||
@app.get("/", response_class=HTMLResponse)
|
@app.get("/", response_class=HTMLResponse)
|
||||||
async def main_page(request: Request):
|
async def main_page(request: Request):
|
||||||
# Serve index.html from the static directory or root project directory
|
|
||||||
# Assuming index.html is at the same level as app.py or in STATIC_DIR
|
|
||||||
# For simplicity, if index.html is at root:
|
|
||||||
# return FileResponse(Path(__file__).parent / "index.html")
|
|
||||||
# If using Jinja2Templates and index.html is in "templates" folder:
|
|
||||||
# return templates.TemplateResponse("index.html", {"request": request})
|
|
||||||
# Using FileResponse for index.html directly:
|
|
||||||
index_path = Path("index.html") # Adjust if index.html is elsewhere
|
index_path = Path("index.html") # Adjust if index.html is elsewhere
|
||||||
if not index_path.exists():
|
if not index_path.exists():
|
||||||
# Fallback to static dir if not in root
|
# Fallback to static dir if not in root
|
||||||
index_path = STATIC_DIR / "index.html"
|
index_path = STATIC_DIR / "index.html"
|
||||||
if not index_path.exists():
|
if not index_path.exists():
|
||||||
raise HTTPException(status_code=404, detail="index.html not found")
|
raise HTTPException(status_code=404, detail="index.html not found")
|
||||||
return FileResponse(index_path)
|
no_cache_headers = {
|
||||||
|
"Cache-Control": "no-store, no-cache, must-revalidate, max-age=0",
|
||||||
|
"Pragma": "no-cache", # 兼容 HTTP/1.0
|
||||||
|
"Expires": "0", # 兼容旧版代理/缓存
|
||||||
|
}
|
||||||
|
return FileResponse(index_path,headers=no_cache_headers)
|
||||||
|
|
||||||
|
|
||||||
@app.post("/translate")
|
@app.post("/translate")
|
||||||
@@ -408,7 +406,7 @@ async def download_html(filename_with_ext: str):
|
|||||||
|
|
||||||
def run_app():
|
def run_app():
|
||||||
print("正在启动 DocuTranslate WebUI")
|
print("正在启动 DocuTranslate WebUI")
|
||||||
print("请访问 http://127.0.0.1:8010")
|
print("请访问 http://127.0.0.1:8010 (ctrl+点击链接即可打开)")
|
||||||
uvicorn.run(app, host="127.0.0.1", port=8010, workers=1)
|
uvicorn.run(app, host="127.0.0.1", port=8010, workers=1)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -229,7 +229,7 @@
|
|||||||
<div class="form-group">
|
<div class="form-group">
|
||||||
<label for="convert_engin">转换引擎</label>
|
<label for="convert_engin">转换引擎</label>
|
||||||
<select id="convert_engin" name="convert_engin">
|
<select id="convert_engin" name="convert_engin">
|
||||||
<option value="mineru" selected>Mineru</option>
|
<option value="mineru" selected>minerU</option>
|
||||||
<option value="docling" id="docling">Docling</option>
|
<option value="docling" id="docling">Docling</option>
|
||||||
</select>
|
</select>
|
||||||
</div>
|
</div>
|
||||||
@@ -360,6 +360,7 @@
|
|||||||
return defaultValue;
|
return defaultValue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//api访问地址到获取地址的映射
|
//api访问地址到获取地址的映射
|
||||||
const apiHrefMap = {
|
const apiHrefMap = {
|
||||||
"https://openrouter.ai/api/v1": "https://openrouter.ai/settings/keys",
|
"https://openrouter.ai/api/v1": "https://openrouter.ai/settings/keys",
|
||||||
@@ -513,12 +514,14 @@
|
|||||||
console.warn(`get engine list failed: ${response.status}`);
|
console.warn(`get engine list failed: ${response.status}`);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
const status = await response.json();
|
const enginList = await response.json();
|
||||||
statusMsg.textContent = '正在初始化';
|
statusMsg.textContent = '正在初始化';
|
||||||
status.forEach((engin) => {
|
let options = convertEnginSelect.querySelectors(`option[value="${engin}"]`);
|
||||||
let option = convertEnginSelect.querySelector(`option[value="${engin}"]`);
|
options.forEach((option) => {
|
||||||
|
if (!enginList.includes(option.value)) {
|
||||||
option.disabled = true;
|
option.disabled = true;
|
||||||
option.textContent += "(不可用)"
|
option.textContent += "(不可用)"
|
||||||
|
}
|
||||||
})
|
})
|
||||||
if (status.includes(convertEnginSelect.value)) {
|
if (status.includes(convertEnginSelect.value)) {
|
||||||
convertEnginSelect.value = "mineru";
|
convertEnginSelect.value = "mineru";
|
||||||
|
|||||||
@@ -1,53 +0,0 @@
|
|||||||
import os
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from docling.datamodel.base_models import InputFormat
|
|
||||||
from docling.datamodel.document import DocumentStream
|
|
||||||
from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode
|
|
||||||
from docling.datamodel.settings import settings
|
|
||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
|
||||||
from docling_core.types.doc import ImageRefMode
|
|
||||||
from huggingface_hub.errors import LocalEntryNotFoundError
|
|
||||||
|
|
||||||
from docutranslate.logger import translater_logger
|
|
||||||
|
|
||||||
IMAGE_RESOLUTION_SCALE = 4
|
|
||||||
|
|
||||||
|
|
||||||
def file2markdown_embed_images(file_path: Path | str | DocumentStream, formula=False, code=False,
|
|
||||||
artifacts_path: Path | str | None = None) -> str:
|
|
||||||
translater_logger.info(f"正在将文档转换为markdown")
|
|
||||||
pipeline_options = PdfPipelineOptions(artifacts_path=artifacts_path)
|
|
||||||
pipeline_options.do_ocr = False
|
|
||||||
pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
|
|
||||||
pipeline_options.generate_picture_images = True
|
|
||||||
# pipeline_options.table_structure_options.mode = TableFormerMode.FAST
|
|
||||||
pipeline_options.table_structure_options.do_cell_matching = False
|
|
||||||
if formula:
|
|
||||||
pipeline_options.do_formula_enrichment = True
|
|
||||||
if code:
|
|
||||||
pipeline_options.do_code_enrichment = True
|
|
||||||
# pipeline_options.accelerator_options= AcceleratorOptions(
|
|
||||||
# num_threads=4, device=AcceleratorDevice.AUTO
|
|
||||||
# )
|
|
||||||
# 打印时间
|
|
||||||
settings.debug.profile_pipeline_timings = True
|
|
||||||
converter = DocumentConverter(format_options={
|
|
||||||
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
|
|
||||||
|
|
||||||
})
|
|
||||||
try:
|
|
||||||
conversion_result = converter.convert(file_path)
|
|
||||||
result = conversion_result.document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED)
|
|
||||||
except LocalEntryNotFoundError:
|
|
||||||
translater_logger.info(f"无法连接huggingface,正在尝试换源")
|
|
||||||
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
|
|
||||||
conversion_result = converter.convert(file_path)
|
|
||||||
result = conversion_result.document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED)
|
|
||||||
translater_logger.info(f"已转换为markdown")
|
|
||||||
translater_logger.info(f"pdf转换耗时: {conversion_result.timings["pipeline_total"].times}")
|
|
||||||
return result
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
pass
|
|
||||||
@@ -8,7 +8,6 @@ dependencies = [
|
|||||||
"httpx",
|
"httpx",
|
||||||
"markdown2",
|
"markdown2",
|
||||||
"fastapi[standard]>=0.115.12",
|
"fastapi[standard]>=0.115.12",
|
||||||
"docling>=2.33.0",
|
|
||||||
]
|
]
|
||||||
[project.scripts]
|
[project.scripts]
|
||||||
docutranslate = "docutranslate.cli:main"
|
docutranslate = "docutranslate.cli:main"
|
||||||
|
|||||||
Reference in New Issue
Block a user