修正版本号
This commit is contained in:
12
README.md
12
README.md
@@ -28,9 +28,9 @@
|
||||
|
||||
1. `uv init`
|
||||
2. `uv add docutranslate`
|
||||
3. `uv add docling`#如果需要使用docling进行文档解析
|
||||
3. `uv add docutranslate[docling]` # 如果需要使用docling进行文档解析
|
||||
|
||||
使用git
|
||||
使用git(需下载uv)
|
||||
|
||||
1. `git clone https://github.com/xunbu/docutranslate.git`
|
||||
2. `uv sync`
|
||||
@@ -59,9 +59,11 @@
|
||||
|
||||
使用minerU将文档转换为markdown时,需要在minerU平台申请token
|
||||
|
||||
1. 打开[minerU官网](https://mineru.net/apiManage/docs)申请token
|
||||
1. 打开[minerU官网](https://mineru.net/apiManage/docs)申请API
|
||||
2. 申请成功后,在[API Token管理界面](https://mineru.net/apiManage/token)创建API Token
|
||||
|
||||
> mineru token有14天有效期,若过期请创建新的token
|
||||
|
||||
## 使用docling引擎注意事项
|
||||
|
||||
使用docling将文档转换为markdown时,需要下载模型到本地(也可以提前下载,见FAQ),因此可能会遇到一些网络问题
|
||||
@@ -185,8 +187,8 @@ from docutranslate import FileTranslater
|
||||
translater = FileTranslater(base_url="<baseurl>", # 默认的模型baseurl
|
||||
key="<api-key>", # 默认的大语言模型平台api-key
|
||||
model_id="<model-id>", # 默认的模型id
|
||||
chunksize=3000, # markdown分块长度(单位byte),分块越大效果越好(也越慢),不建议超过8000
|
||||
max_concurrent=30, # 并发数,受到ai平台并发量限制,如果文章很长建议适当加大到20以上
|
||||
chunk_size=3000, # markdown分块长度(单位byte),分块越大效果越好(也越慢),不建议超过8000
|
||||
concurrent=30, # 并发数,受到ai平台并发量限制,如果文章很长建议适当加大到20以上
|
||||
timeout=2000, # 调用api的超时时间
|
||||
docling_artifact=None, # 使用提前下载好的docling模型
|
||||
convert_engin="mineru", # 可选minerU或docling
|
||||
|
||||
@@ -15,6 +15,7 @@ from fastapi.responses import HTMLResponse, JSONResponse, StreamingResponse, Fil
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
from docutranslate import FileTranslater, __version__
|
||||
from docutranslate.logger import translater_logger
|
||||
from docutranslate.translater import default_params
|
||||
from docutranslate.utils.resource_utils import resource_path
|
||||
from docutranslate.global_values import available_packages
|
||||
|
||||
@@ -119,6 +120,9 @@ async def _perform_translation(params: Dict[str, Any], file_contents: bytes, ori
|
||||
base_url=params['base_url'],
|
||||
key=params['apikey'],
|
||||
model_id=params['model_id'],
|
||||
chunk_size=params['chunk_size'],
|
||||
concurrent=params['concurrent'],
|
||||
temperature=params['temperature'],
|
||||
convert_engin=params['convert_engin'],
|
||||
mineru_token=params['mineru_token'],
|
||||
)
|
||||
@@ -135,7 +139,8 @@ async def _perform_translation(params: Dict[str, Any], file_contents: bytes, ori
|
||||
|
||||
md_content = ft.export_to_markdown()
|
||||
try:
|
||||
await httpx_client.head("https://s4.zstatic.net/ajax/libs/KaTeX/0.16.9/contrib/auto-render.min.js", timeout=3)
|
||||
await httpx_client.head("https://s4.zstatic.net/ajax/libs/KaTeX/0.16.9/contrib/auto-render.min.js",
|
||||
timeout=3)
|
||||
html_content = ft.export_to_html(title=current_state["original_filename_stem"], cdn=True)
|
||||
except (httpx.TimeoutException, httpx.RequestError) as e:
|
||||
translater_logger.info(f"连接s4.zstatic.net失败,错误信息:{e}")
|
||||
@@ -213,6 +218,9 @@ async def handle_translate(
|
||||
refine_markdown: bool = Form(False),
|
||||
convert_engin: str = Form(...),
|
||||
mineru_token: Optional[str] = Form(None),
|
||||
chunk_size: int = Form(...),
|
||||
concurrent: int = Form(...),
|
||||
temperature: float = Form(...),
|
||||
custom_prompt_translate: Optional[str] = Form(None),
|
||||
file: UploadFile = File(...)
|
||||
):
|
||||
@@ -283,6 +291,9 @@ async def handle_translate(
|
||||
"code_ocr": code_ocr, "refine_markdown": refine_markdown,
|
||||
"convert_engin": convert_engin,
|
||||
"mineru_token": mineru_token,
|
||||
"chunk_size":chunk_size,
|
||||
"concurrent":concurrent,
|
||||
"temperature":temperature,
|
||||
"custom_prompt_translate": custom_prompt_translate,
|
||||
}
|
||||
|
||||
@@ -420,6 +431,11 @@ async def download_html(filename_with_ext: str):
|
||||
)
|
||||
|
||||
|
||||
@app.get("/translate/default_param")
def get_default_param():
    """Serve the module-level ``default_params`` mapping as a JSON response."""
    payload = default_params
    return JSONResponse(content=payload)
|
||||
|
||||
|
||||
@app.get("/meta")
async def get_app_version():
    """Return ``{"version": __version__}`` as a JSON response."""
    meta = {"version": __version__}
    return JSONResponse(content=meta)
|
||||
|
||||
@@ -267,6 +267,10 @@
|
||||
font-weight: bold;
|
||||
}
|
||||
|
||||
/* Shows the pointer cursor over elements that act as click targets. */
.clickable { cursor: pointer; }
|
||||
|
||||
@media (max-width: 768px) {
|
||||
.form-grid {
|
||||
grid-template-columns: 1fr;
|
||||
@@ -409,6 +413,28 @@
|
||||
<details>
|
||||
<summary>高级选项</summary>
|
||||
<div class="form-group">
|
||||
<div>
|
||||
<label for="chunk-size-slider"><span
|
||||
style="display:inline-block;min-width: 15vw">分块大小(bytes):</span><span
|
||||
id="chunk-size-display"></span><span class="clickable" id="chunk-size-reset"
|
||||
style="color: #2e7d32">🗘</span></label>
|
||||
<input type="range" id="chunk-size-slider" name="chunk_size" min="1000" max="6000" step="100">
|
||||
</div>
|
||||
<div>
|
||||
<label for="concurrent-slider"><span
|
||||
style="display:inline-block;min-width: 15vw">并发请求数:</span><span
|
||||
id="concurrent-display"></span><span class="clickable" id="concurrent-reset"
|
||||
style="color: #2e7d32">🗘</span></label>
|
||||
<input type="range" id="concurrent-slider" name="concurrent" min="1" max="60" step="1">
|
||||
</div>
|
||||
<div>
|
||||
<label for="temperature-slider"><span
|
||||
style="display:inline-block;min-width: 15vw">temperature:</span><span
|
||||
id="temperature-display"></span><span class="clickable" id="temperature-reset"
|
||||
style="color: #2e7d32">🗘</span></label>
|
||||
<input type="range" id="temperature-slider" name="temperature" min="0" max="1" step="0.1">
|
||||
</div>
|
||||
|
||||
<label for="custom_prompt_translate"></label>
|
||||
<textarea class="prompt-area" type="text" id="custom_prompt_translate"
|
||||
name="custom_prompt_translate" placeholder="翻译提示"></textarea>
|
||||
@@ -470,7 +496,7 @@
|
||||
</div>
|
||||
|
||||
<iframe id="printFrame" style="display:none;"></iframe>
|
||||
<script>
|
||||
<script type="module">
|
||||
const platformSelect = document.getElementById('platform_select');
|
||||
const apiHref = document.getElementById('api_href')
|
||||
const baseUrlGroup = document.getElementById('baseUrlGroup');
|
||||
@@ -486,6 +512,17 @@
|
||||
const mineruTokenGroup = document.getElementById('mineruTokenGroup');
|
||||
const mineruTokenInput = document.getElementById('mineru_token');
|
||||
|
||||
const chunkSizeSlider = document.getElementById('chunk-size-slider')
|
||||
const chunkSizeDisplay = document.getElementById('chunk-size-display')
|
||||
const chunkSizeReset = document.getElementById('chunk-size-reset')
|
||||
const concurrentSlider = document.getElementById('concurrent-slider')
|
||||
const concurrentDisplay = document.getElementById('concurrent-display')
|
||||
const concurrentReset = document.getElementById("concurrent-reset")
|
||||
const temperatureSlider = document.getElementById('temperature-slider')
|
||||
const temperatureDisplay = document.getElementById('temperature-display')
|
||||
const temperatureReset = document.getElementById("temperature-reset")
|
||||
|
||||
|
||||
const form = document.getElementById('translateForm');
|
||||
const submitButton = document.getElementById('submitButton');
|
||||
const logArea = document.getElementById('logArea');
|
||||
@@ -521,6 +558,64 @@
|
||||
let statusPollIntervalId = null;
|
||||
let isTranslating = false;
|
||||
|
||||
|
||||
let default_param;
|
||||
|
||||
// 初始化调用
|
||||
/**
 * One-time page bootstrap. Performs three independent steps, each with its
 * own error handling so that a failure in one does not prevent the others:
 *   1. fetch /meta and display the version string;
 *   2. fetch /get-engin-list and disable convert-engine options that are
 *      not in the returned list, moving the selection if it became disabled;
 *   3. fetch /translate/default_param into the outer `default_param` variable.
 *
 * Never rejects: every failure is logged and/or shown in `statusMsg`.
 */
async function init() {
    // Step 1: version banner. A failure here is only logged.
    try {
        const response = await fetch("/meta")
        const meta = await response.json();
        versionDisplay.textContent = `版本号:${meta.version}`;
    } catch (error) {
        console.warn("获取版本号失败", error);
    }

    // Step 2: engine availability.
    try {
        const response = await fetch('/get-engin-list')
        if (!response.ok) {
            // Deliberately no early `return`: returning here would skip step 3
            // and leave `default_param` undefined for the code that reads it.
            console.warn(`get engine list failed: ${response.status}`);
        } else {
            const enginList = await response.json();
            statusMsg.textContent = '正在初始化';
            const options = convertEnginSelect.querySelectorAll(`option`);
            let currentEngineDisabled = false;
            options.forEach((option) => {
                if (!enginList.includes(option.value)) {
                    option.disabled = true;
                    option.textContent += " (不可用)";
                    if (option.value === convertEnginSelect.value) {
                        currentEngineDisabled = true;
                    }
                }
            });
            if (currentEngineDisabled) {
                // Prefer mineru when it is still enabled, else the first enabled option.
                const mineruOption = convertEnginSelect.querySelector('option[value="mineru"]');
                if (mineruOption && !mineruOption.disabled) {
                    convertEnginSelect.value = "mineru";
                } else {
                    const firstAvailable = convertEnginSelect.querySelector('option:not([disabled])');
                    if (firstAvailable) convertEnginSelect.value = firstAvailable.value;
                }
                updateConvertEnginUI();
            }
            statusMsg.textContent = '初始化完成';
        }
    } catch (error) {
        console.warn("Error get engin-list", error);
        statusMsg.textContent = '引擎列表初始化失败';
        statusMsg.className = 'error-message';
    }

    // Step 3: default slider parameters.
    try {
        const response = await fetch("/translate/default_param")
        if (!response.ok) {
            // fetch() resolves on HTTP error statuses too; do not store an
            // error payload as if it were the defaults.
            throw new Error(`get default param failed: ${response.status}`);
        }
        default_param = await response.json();
    } catch (error) {
        statusMsg.textContent = error.toString();
        statusMsg.className = 'error-message';
    }
}
|
||||
|
||||
await init()
|
||||
|
||||
function saveToStorage(key, value) {
|
||||
try {
|
||||
localStorage.setItem(key, value);
|
||||
@@ -572,6 +667,7 @@
|
||||
saveToStorage('translator_last_platform', selectedPlatformValue);
|
||||
}
|
||||
|
||||
|
||||
function updateConvertEnginUI() {
|
||||
const selectedEngin = convertEnginSelect.value;
|
||||
if (selectedEngin === 'mineru') {
|
||||
@@ -585,11 +681,50 @@
|
||||
saveToStorage('translator_convert_engin', selectedEngin);
|
||||
}
|
||||
|
||||
/**
 * Mirrors the chunk-size slider into its readout, shows the reset control
 * only when the value differs from the loaded default, and stores the value
 * under the 'chunk_size' key via saveToStorage.
 */
function updateChunkSizeUI() {
    const value = chunkSizeSlider.value
    chunkSizeDisplay.textContent = value;
    // `default_param` is undefined if the defaults could not be loaded; with
    // no known default there is nothing to reset to, so hide the control
    // instead of throwing on the property access.
    const defaultValue = default_param?.["chunk_size"];
    const differs = defaultValue != null && value !== defaultValue.toString();
    chunkSizeReset.style.visibility = differs ? 'visible' : 'hidden';
    saveToStorage('chunk_size', value)
}
|
||||
|
||||
/**
 * Mirrors the temperature slider into its readout, shows the reset control
 * only when the value differs from the loaded default, and stores the value
 * under the 'temperature' key via saveToStorage.
 */
function updateTemperatureUI() {
    const value = temperatureSlider.value
    temperatureDisplay.textContent = value;
    // `default_param` is undefined if the defaults could not be loaded; with
    // no known default there is nothing to reset to, so hide the control
    // instead of throwing on the property access.
    const defaultValue = default_param?.["temperature"];
    const differs = defaultValue != null && value !== defaultValue.toString();
    temperatureReset.style.visibility = differs ? 'visible' : 'hidden';
    saveToStorage('temperature', value)
}
|
||||
|
||||
/**
 * Mirrors the concurrency slider into its readout, shows the reset control
 * only when the value differs from the loaded default, and stores the value
 * under the 'concurrent' key via saveToStorage.
 */
function updateConcurrentUI() {
    const value = concurrentSlider.value
    concurrentDisplay.textContent = value;
    // `default_param` is undefined if the defaults could not be loaded; with
    // no known default there is nothing to reset to, so hide the control
    // instead of throwing on the property access.
    const defaultValue = default_param?.["concurrent"];
    const differs = defaultValue != null && value !== defaultValue.toString();
    concurrentReset.style.visibility = differs ? 'visible' : 'hidden';
    saveToStorage('concurrent', value)
}
|
||||
|
||||
function loadSettings() {
|
||||
platformSelect.value = getFromStorage('translator_last_platform', 'custom');
|
||||
updatePlatformUI();
|
||||
convertEnginSelect.value = getFromStorage('translator_convert_engin', 'mineru');
|
||||
updateConvertEnginUI();
|
||||
chunkSizeSlider.value = getFromStorage("chunk_size", default_param["chunk_size"])
|
||||
updateChunkSizeUI()
|
||||
concurrentSlider.value = getFromStorage("concurrent", default_param["concurrent"])
|
||||
updateConcurrentUI()
|
||||
temperatureSlider.value = getFromStorage("temperature", default_param["temperature"])
|
||||
updateTemperatureUI()
|
||||
toLangSelect.value = getFromStorage('translator_to_lang', '中文');
|
||||
formulaCheckbox.checked = getFromStorage('translator_formula_ocr') === 'true';
|
||||
codeCheckbox.checked = getFromStorage('translator_code_ocr') === 'true';
|
||||
@@ -710,52 +845,6 @@
|
||||
}
|
||||
}, false);
|
||||
|
||||
// 初始化调用
|
||||
// Start-up routine: display the version from /meta, then mark convert-engine
// options missing from /get-engin-list as unavailable and, if the selected
// engine was one of them, switch to an enabled one.
(async () => {
    try {
        const metaResponse = await fetch("/meta")
        const info = await metaResponse.json();
        versionDisplay.textContent = `版本号:${info.version}`;
    } catch (err) {
        console.warn("获取版本号失败", err);
    }

    try {
        const listResponse = await fetch('/get-engin-list')
        if (!listResponse.ok) {
            console.warn(`get engine list failed: ${listResponse.status}`);
            return;
        }
        const available = await listResponse.json();
        statusMsg.textContent = '正在初始化';

        let selectionUnavailable = false;
        for (const opt of convertEnginSelect.querySelectorAll(`option`)) {
            if (available.includes(opt.value)) {
                continue;
            }
            opt.disabled = true;
            opt.textContent += " (不可用)";
            if (opt.value === convertEnginSelect.value) {
                selectionUnavailable = true;
            }
        }

        if (selectionUnavailable) {
            // mineru is the preferred replacement; otherwise take the first enabled option.
            const mineru = convertEnginSelect.querySelector('option[value="mineru"]');
            if (mineru && !mineru.disabled) {
                convertEnginSelect.value = "mineru";
            } else {
                const fallback = convertEnginSelect.querySelector('option:not([disabled])');
                if (fallback) {
                    convertEnginSelect.value = fallback.value;
                }
            }
            updateConvertEnginUI();
        }
        statusMsg.textContent = '初始化完成';
    } catch (err) {
        console.warn("Error get engin-list", err);
        statusMsg.textContent = '引擎列表初始化失败';
        statusMsg.className = 'error-message';
    }
})();
|
||||
|
||||
|
||||
async function pollLogs() {
|
||||
try {
|
||||
@@ -990,6 +1079,23 @@
|
||||
}
|
||||
}
|
||||
|
||||
// Wire the three advanced-option sliders. For each one, 'input' refreshes its
// UI, and a click on the reset control restores the default value and then
// refreshes the UI.
for (const [slider, resetControl, paramKey, refresh] of [
    [chunkSizeSlider, chunkSizeReset, "chunk_size", updateChunkSizeUI],
    [concurrentSlider, concurrentReset, "concurrent", updateConcurrentUI],
    [temperatureSlider, temperatureReset, "temperature", updateTemperatureUI],
]) {
    slider.addEventListener('input', refresh)
    resetControl.addEventListener('click', () => {
        // The default is looked up at click time, not captured at wiring time.
        slider.value = default_param[paramKey]
        refresh()
    })
}
|
||||
|
||||
submitButton.addEventListener('click', async function (event) {
|
||||
event.preventDefault();
|
||||
console.log(fileInput)
|
||||
|
||||
@@ -18,11 +18,16 @@ DOCLING_FLAG = True if available_packages.get("docling") else False
|
||||
if DOCLING_FLAG:
|
||||
from docutranslate.converter import ConverterDocling
|
||||
|
||||
# Default tuning values; FileTranslater.__init__ reads its keyword defaults
# from this mapping.
default_params = dict(
    chunk_size=3000,
    concurrent=30,
    temperature=0.7,
)
|
||||
|
||||
class FileTranslater:
|
||||
def __init__(self, file_path: Path | str | None = None, chunksize: int = 3000,
|
||||
base_url="", key=None, model_id="", temperature=0.7,
|
||||
max_concurrent=30, timeout=2000,
|
||||
def __init__(self, file_path: Path | str | None = None, chunk_size: int = default_params["chunk_size"],
|
||||
base_url:str|None=None, key=None, model_id:str|None=None, temperature=default_params["temperature"],
|
||||
concurrent:int=default_params["concurrent"], timeout=2000,
|
||||
convert_engin: Literal["docling", "mineru"] = "mineru",
|
||||
docling_artifact: Path | str | None = None,
|
||||
mineru_token: str = None, cache=True):
|
||||
@@ -30,11 +35,11 @@ class FileTranslater:
|
||||
self.mineru_token = mineru_token.strip() if mineru_token is not None else None
|
||||
self._mask_dict = MaskDict()
|
||||
self.markdown: str = ""
|
||||
self.chunksize = chunksize
|
||||
self.max_concurrent = max_concurrent
|
||||
self.base_url: str = base_url
|
||||
self.key: str = key if key is not None else "xx"
|
||||
self.model_id: str = model_id
|
||||
self.chunk_size = chunk_size
|
||||
self.concurrent = concurrent
|
||||
self.base_url= base_url
|
||||
self.key = key if key is not None else "xx"
|
||||
self.model_id = model_id
|
||||
self.temperature = temperature
|
||||
self.docling_artifact = docling_artifact
|
||||
if docling_artifact is None:
|
||||
@@ -67,17 +72,21 @@ class FileTranslater:
|
||||
return self
|
||||
|
||||
def _split_markdown_into_chunks(self) -> list[str]:
    """Split ``self.markdown`` into chunks and log how many were produced.

    The text is split exactly once by ``split_markdown_text``, with
    ``self.chunk_size`` passed as the size argument.

    Returns:
        The list of chunks returned by ``split_markdown_text``.
    """
    # Single split keyed on `chunk_size`; a second call using the legacy
    # `chunksize` attribute would be wasted work, since its result is discarded.
    chunks: list[str] = split_markdown_text(self.markdown, self.chunk_size)
    translater_logger.info(f"markdown分为{len(chunks)}块")
    return chunks
|
||||
|
||||
def _default_agent_params(self) -> AgentArgs:
    """Assemble the agent arguments from this instance's settings.

    Returns:
        A mapping with the keys ``baseurl``, ``key``, ``model_id``,
        ``temperature``, ``max_concurrent`` (taken from ``self.concurrent``)
        and ``timeout``.

    Raises:
        Exception: if ``self.base_url`` or ``self.model_id`` is ``None``.
    """
    # Fail early with a clear message rather than passing None downstream.
    if self.base_url is None:
        raise Exception("base_url为空")
    if self.model_id is None:
        raise Exception("model_id为空")
    result: AgentArgs = {
        "baseurl": self.base_url,
        "key": self.key,
        "model_id": self.model_id,
        "temperature": self.temperature,
        # Exactly one "max_concurrent" entry: in a dict literal a repeated key
        # is silently overridden by the later occurrence.
        "max_concurrent": self.concurrent,
        "timeout": self.timeout
    }
    return result
|
||||
|
||||
Reference in New Issue
Block a user