feat:兼容qwen-mt模型

This commit is contained in:
toy
2026-02-10 15:50:42 +08:00
parent 86a9958f58
commit 9e82daa2a1
3 changed files with 400 additions and 14 deletions

39
AGENTS.md Normal file
View File

@@ -0,0 +1,39 @@
# Repository Guidelines
## Project Structure & Module Organization
Core Python code is in `docutranslate/`.
- `docutranslate/workflow/`: file-type workflows (`*_workflow.py`) that orchestrate conversion, translation, and export.
- `docutranslate/converter/`, `docutranslate/translator/`, `docutranslate/exporter/`: pipeline stages.
- `docutranslate/app.py`: FastAPI/Web UI backend entrypoint.
- `docutranslate/cli.py`: CLI entry (`docutranslate` command).
- `docutranslate/static/` and `docutranslate/template/`: bundled frontend/static assets.
- Packaging/build files live at root: `pyproject.toml`, `Dockerfile`, `*.spec`, `.github/workflows/`.
## Build, Test, and Development Commands
- `uv sync`: install project dependencies from `pyproject.toml`/`uv.lock`.
- `uv run docutranslate -i`: start local Web UI + API (default `127.0.0.1:8010`).
- `uv run docutranslate -i -p 8011 --cors`: run on a custom port with CORS enabled.
- `docker run -d -p 8010:8010 xunbu/docutranslate:latest`: run the published container locally.
- `uv pip install pyinstaller && uv run pyinstaller lite.spec --noconfirm --clean -y`: build a lightweight desktop package (see also `full.spec`, `lite_mac.spec`).
## Coding Style & Naming Conventions
- Follow Python 3.11+ conventions and PEP 8: 4-space indentation, clearly typed config classes, and small, focused functions.
- Use `snake_case` for modules/functions/variables and `PascalCase` for classes.
- Keep workflow naming consistent: `xxx_workflow.py`, matching config and workflow class names.
- Prefer explicit, composable configs over hard-coded provider values.
## Testing Guidelines
There is currently no first-party `tests/` suite or enforced coverage gate in this repository.
- For behavior changes, run a manual smoke test: start `docutranslate -i`, open `/docs`, and execute at least one translation path you touched.
- If you add automated tests, place them under `tests/` with `test_*.py` names and keep fixtures small and file-type specific.
## Commit & Pull Request Guidelines
Recent history favors short, imperative commit subjects (Chinese or English), for example: `Fix Gemini provider tag`, `Add regex dependency`, `Add Vietnamese`.
- Keep subject lines concise and action-focused.
- In PRs, include: what changed, why, how you validated it, and UI screenshots when `docutranslate/static/` or interface behavior changes.
- Link related issues and note any new env vars/API provider requirements.
## Security & Configuration Tips
- Never commit real API keys or tokens.
- Keep provider credentials in environment variables or local untracked config.
- For LAN exposure, use `--host 0.0.0.0` intentionally and restrict network access as needed.

View File

@@ -290,6 +290,223 @@ _COMPLEX_SCRIPT_PATTERN = re.compile(
r'[\u2e80-\u9fff\u0400-\u04ff\u0600-\u06ff\u0e00-\u0e7f\u0900-\u097f]' r'[\u2e80-\u9fff\u0400-\u04ff\u0600-\u06ff\u0e00-\u0e7f\u0900-\u097f]'
) )
def _normalize_mt_lang_key(lang: str) -> str:
key = str(lang).strip().lower()
key = key.replace("_", "-")
key = key.replace("'", "'").replace("'", "'")
key = key.replace("", "-").replace("", "-")
key = re.sub(r"\s+", " ", key)
return key
# Language code -> English display name used for MT-mode requests.
# Keys are compared against the output of _normalize_mt_lang_key(), so they
# must stay lower-case and use "-" (not "_") as the separator.
# The values are also the source of _MT_LANG_BY_NAME.
# NOTE(review): the values are presumably the exact language names accepted by
# the qwen-mt `translation_options` API; confirm against the provider docs
# before editing any spelling.
_MT_LANG_BY_CODE = {
    "en": "English",
    "zh": "Chinese",
    "zh-tw": "Traditional Chinese",
    "ru": "Russian",
    "ja": "Japanese",
    "ko": "Korean",
    "es": "Spanish",
    "fr": "French",
    "pt": "Portuguese",
    "de": "German",
    "it": "Italian",
    "th": "Thai",
    "vi": "Vietnamese",
    "id": "Indonesian",
    "ms": "Malay",
    "ar": "Arabic",
    "hi": "Hindi",
    "he": "Hebrew",
    "my": "Burmese",
    "ta": "Tamil",
    "ur": "Urdu",
    "bn": "Bengali",
    "pl": "Polish",
    "nl": "Dutch",
    "ro": "Romanian",
    "tr": "Turkish",
    "km": "Khmer",
    "lo": "Lao",
    "yue": "Cantonese",
    "cs": "Czech",
    "el": "Greek",
    "sv": "Swedish",
    "hu": "Hungarian",
    "da": "Danish",
    "fi": "Finnish",
    "uk": "Ukrainian",
    "bg": "Bulgarian",
    "sr": "Serbian",
    "te": "Telugu",
    "af": "Afrikaans",
    "hy": "Armenian",
    "as": "Assamese",
    "ast": "Asturian",
    "eu": "Basque",
    "be": "Belarusian",
    "bs": "Bosnian",
    "ca": "Catalan",
    "ceb": "Cebuano",
    "hr": "Croatian",
    "arz": "Egyptian Arabic",
    "et": "Estonian",
    "gl": "Galician",
    "ka": "Georgian",
    "gu": "Gujarati",
    "is": "Icelandic",
    "jv": "Javanese",
    "kn": "Kannada",
    "kk": "Kazakh",
    "lv": "Latvian",
    "lt": "Lithuanian",
    "lb": "Luxembourgish",
    "mk": "Macedonian",
    "mai": "Maithili",
    "mt": "Maltese",
    "mr": "Marathi",
    "acm": "Mesopotamian Arabic",
    "ary": "Moroccan Arabic",
    "ars": "Najdi Arabic",
    "ne": "Nepali",
    "az": "North Azerbaijani",
    "apc": "North Levantine Arabic",
    "uz": "Northern Uzbek",
    "nb": "Norwegian Bokmål",
    "nn": "Norwegian Nynorsk",
    "oc": "Occitan",
    "or": "Odia",
    "pag": "Pangasinan",
    "scn": "Sicilian",
    "sd": "Sindhi",
    "si": "Sinhala",
    "sk": "Slovak",
    "sl": "Slovenian",
    "ajp": "South Levantine Arabic",
    "sw": "Swahili",
    "tl": "Tagalog",
    "acq": "Ta'izzi-Adeni Arabic",
    "sq": "Tosk Albanian",
    "aeb": "Tunisian Arabic",
    "vec": "Venetian",
    "war": "Waray",
    "cy": "Welsh",
    "fa": "Western Persian",
}
# Reverse index: normalized English display name -> canonical display name,
# built from the distinct values of _MT_LANG_BY_CODE so that a caller passing
# e.g. "english" or "ENGLISH" resolves to "English".
_MT_LANG_BY_NAME = dict(
    (_normalize_mt_lang_key(display_name), display_name)
    for display_name in set(_MT_LANG_BY_CODE.values())
)
# Extra spellings (UI labels, Chinese names, locale tags, ASCII variants)
# -> canonical English display name.
# _normalize_mt_lang consults this table last, after _MT_LANG_BY_CODE and
# _MT_LANG_BY_NAME, using a key produced by _normalize_mt_lang_key(); keys
# therefore need to be written in their normalized (lower-case, "-") form.
_MT_LANG_ALIASES = {
    # Existing UI/common aliases
    "english": "English",
    "英语": "English",
    "英文": "English",
    "简体中文": "Chinese",
    "中文": "Chinese",
    "simplified chinese": "Chinese",
    "chinese": "Chinese",
    "traditional chinese": "Traditional Chinese",
    "繁体中文": "Traditional Chinese",
    "zh-hans": "Chinese",
    "zh-cn": "Chinese",
    "zh-hant": "Traditional Chinese",
    # Full Chinese aliases from qwen-mt language list
    "俄语": "Russian",
    "日语": "Japanese",
    "韩语": "Korean",
    "西班牙语": "Spanish",
    "法语": "French",
    "葡萄牙语": "Portuguese",
    "德语": "German",
    "意大利语": "Italian",
    "泰语": "Thai",
    "越南语": "Vietnamese",
    "印度尼西亚语": "Indonesian",
    "马来语": "Malay",
    "阿拉伯语": "Arabic",
    "印地语": "Hindi",
    "希伯来语": "Hebrew",
    "缅甸语": "Burmese",
    "泰米尔语": "Tamil",
    "乌尔都语": "Urdu",
    "孟加拉语": "Bengali",
    "波兰语": "Polish",
    "荷兰语": "Dutch",
    "罗马尼亚语": "Romanian",
    "土耳其语": "Turkish",
    "高棉语": "Khmer",
    "老挝语": "Lao",
    "粤语": "Cantonese",
    "捷克语": "Czech",
    "希腊语": "Greek",
    "瑞典语": "Swedish",
    "匈牙利语": "Hungarian",
    "丹麦语": "Danish",
    "芬兰语": "Finnish",
    "乌克兰语": "Ukrainian",
    "保加利亚语": "Bulgarian",
    "塞尔维亚语": "Serbian",
    "泰卢固语": "Telugu",
    "南非荷兰语": "Afrikaans",
    "亚美尼亚语": "Armenian",
    "阿萨姆语": "Assamese",
    "阿斯图里亚斯语": "Asturian",
    "巴斯克语": "Basque",
    "白俄罗斯语": "Belarusian",
    "波斯尼亚语": "Bosnian",
    "加泰罗尼亚语": "Catalan",
    "宿务语": "Cebuano",
    "克罗地亚语": "Croatian",
    "埃及阿拉伯语": "Egyptian Arabic",
    "爱沙尼亚语": "Estonian",
    "加利西亚语": "Galician",
    "格鲁吉亚语": "Georgian",
    "古吉拉特语": "Gujarati",
    "冰岛语": "Icelandic",
    "爪哇语": "Javanese",
    "卡纳达语": "Kannada",
    "哈萨克语": "Kazakh",
    "拉脱维亚语": "Latvian",
    "立陶宛语": "Lithuanian",
    "卢森堡语": "Luxembourgish",
    "马其顿语": "Macedonian",
    # NOTE(review): 马加希语 usually denotes Magahi, while Maithili is commonly
    # written 迈蒂利语 -- confirm this pairing against the qwen-mt language list.
    "马加希语": "Maithili",
    "马耳他语": "Maltese",
    "马拉地语": "Marathi",
    "美索不达米亚阿拉伯语": "Mesopotamian Arabic",
    "摩洛哥阿拉伯语": "Moroccan Arabic",
    "内志阿拉伯语": "Najdi Arabic",
    "尼泊尔语": "Nepali",
    "北阿塞拜疆语": "North Azerbaijani",
    "北黎凡特阿拉伯语": "North Levantine Arabic",
    "北乌兹别克语": "Northern Uzbek",
    "书面语挪威语": "Norwegian Bokmål",
    "新挪威语": "Norwegian Nynorsk",
    "奥克语": "Occitan",
    "奥里亚语": "Odia",
    "邦阿西楠语": "Pangasinan",
    "西西里语": "Sicilian",
    "信德语": "Sindhi",
    "僧伽罗语": "Sinhala",
    "斯洛伐克语": "Slovak",
    "斯洛文尼亚语": "Slovenian",
    "南黎凡特阿拉伯语": "South Levantine Arabic",
    "斯瓦希里语": "Swahili",
    "他加禄语": "Tagalog",
    "塔伊兹-亚丁阿拉伯语": "Ta'izzi-Adeni Arabic",
    "托斯克阿尔巴尼亚语": "Tosk Albanian",
    "突尼斯阿拉伯语": "Tunisian Arabic",
    "威尼斯语": "Venetian",
    "瓦莱语": "Waray",
    "威尔士语": "Welsh",
    "西波斯语": "Western Persian",
    # English punctuation/variant aliases
    # ASCII spelling without the ring on "å".
    "norwegian bokmal": "Norwegian Bokmål",
    "ta'izzi-adeni arabic": "Ta'izzi-Adeni Arabic",
}
class Agent: class Agent:
def __init__(self, config: AgentConfig): def __init__(self, config: AgentConfig):
@@ -316,6 +533,10 @@ class Agent:
self.rate_limiter = RateLimiter(rpm=config.rpm, tpm=config.tpm) self.rate_limiter = RateLimiter(rpm=config.rpm, tpm=config.tpm)
self.provider = config.provider if config.provider is not None else get_provider_by_domain(self.domain) self.provider = config.provider if config.provider is not None else get_provider_by_domain(self.domain)
self.is_mt_mode = "mt" in self.model_id.lower()
self.mt_source_lang = getattr(config, "source_lang", "auto")
self.mt_target_lang = getattr(config, "to_lang", None)
self.mt_domains = getattr(config, "custom_prompt", None)
def _estimate_tokens(self, text: str) -> int: def _estimate_tokens(self, text: str) -> int:
""" """
@@ -352,6 +573,43 @@ class Agent:
elif self.thinking == "disable": elif self.thinking == "disable":
data[field_thinking] = val_disable data[field_thinking] = val_disable
def _normalize_mt_lang(self, lang: str | None) -> str | None:
if lang is None:
return None
lang_text = str(lang).strip()
if not lang_text:
return None
key = _normalize_mt_lang_key(lang_text)
if key in _MT_LANG_BY_CODE:
return _MT_LANG_BY_CODE[key]
if key in _MT_LANG_BY_NAME:
return _MT_LANG_BY_NAME[key]
if key in _MT_LANG_ALIASES:
return _MT_LANG_ALIASES[key]
return lang_text
def _build_mt_translation_options(self) -> dict:
translation_options = {}
source_lang = self._normalize_mt_lang(self.mt_source_lang)
if source_lang:
translation_options["source_lang"] = source_lang
target_lang = self._normalize_mt_lang(self.mt_target_lang)
if target_lang:
translation_options["target_lang"] = target_lang
domains = str(self.mt_domains).strip() if self.mt_domains is not None else ""
if domains:
translation_options["domains"] = domains
return translation_options
def _build_mt_user_prompt(self, prompt: str, system_prompt: str) -> str:
    """Return the user-message content for an MT-mode request.

    Args:
        prompt: The text to send as the user message.
        system_prompt: Accepted but intentionally unused in MT mode.

    Returns:
        ``prompt`` unchanged.
    """
    # In MT mode the original prompt is returned as-is, with no system prompt added.
    # The MT model treats the entire user prompt as the content to translate.
    return prompt
def _prepare_request_data( def _prepare_request_data(
self, prompt: str, system_prompt: str, temperature=None, top_p=0.9, json_format=False self, prompt: str, system_prompt: str, temperature=None, top_p=0.9, json_format=False
): ):
@@ -361,6 +619,19 @@ class Agent:
"Content-Type": "application/json", "Content-Type": "application/json",
"Authorization": f"Bearer {self.key}", "Authorization": f"Bearer {self.key}",
} }
if self.is_mt_mode:
data = {
"model": self.model_id,
"messages": [
{"role": "user", "content": self._build_mt_user_prompt(prompt, system_prompt)},
],
}
translation_options = self._build_mt_translation_options()
if translation_options:
data["translation_options"] = translation_options
return headers, data
data = { data = {
"model": self.model_id, "model": self.model_id,
"messages": [ "messages": [

View File

@@ -32,15 +32,15 @@ For each Key-Value Pair in the JSON, translate the contents of the value into {t
> The segment IDs in the output must exactly match those in the input. And all segment IDs in input must appear in the output. > The segment IDs in the output must exactly match those in the input. And all segment IDs in input must appear in the output.
> If necessary, two segments can only be translated together, the translation should be proportionally allocated to the corresponding key's value based on the word count ratio of the segments. > If necessary, two segments can only be translated together, the translation should be proportionally allocated to the corresponding key's value based on the word count ratio of the segments.
Here is an example of the expected format: Here is an example of the expected format (Note: This is ONLY a format example, do NOT translate the example content):
<example> <example>
Input: Input:
```json ```json
{{ {{
"3":source, "EXAMPLE_KEY_1": "source text",
"4":source, "EXAMPLE_KEY_2": "source text"
}} }}
``` ```
@@ -48,8 +48,8 @@ Output(target language: {to_lang}):
```json ```json
{{ {{
"3":translation, "EXAMPLE_KEY_1": "translated text",
"4":translation, "EXAMPLE_KEY_2": "translated text"
}} }}
``` ```
For statements that must be combined during translation, employ merging at the minimal structural level. The total number of keys must remain unchanged after merging, and any empty values should be retained. For statements that must be combined during translation, employ merging at the minimal structural level. The total number of keys must remain unchanged after merging, and any empty values should be retained.
@@ -58,18 +58,20 @@ Below is an example of how merging should be done when necessary:
input: input:
```json ```json
{{ {{
"3":"汤姆说:“杰克你", "EXAMPLE_KEY_1":"汤姆说:“杰克你",
"4":"好”。" "EXAMPLE_KEY_2":"好”。"
}} }}
``` ```
output: output:
```json ```json
{{ {{
"3":"Tom says:\"Hello Jack.\"", "EXAMPLE_KEY_1":"Tom says:\"Hello Jack.\"",
"4":"" "EXAMPLE_KEY_2":""
}} }}
``` ```
</example> </example>
IMPORTANT: Only translate the content in the <input> section above. Do NOT include or translate the example content from this <example> section in your output.
Please return the translated JSON directly without including any additional information and preserve special tags or untranslatable elements (such as code, brand names, technical terms) as they are. Please return the translated JSON directly without including any additional information and preserve special tags or untranslatable elements (such as code, brand names, technical terms) as they are.
""" """
@@ -124,7 +126,12 @@ class SegmentsTranslateAgent(Agent):
- 如果键完全匹配,返回翻译结果。 - 如果键完全匹配,返回翻译结果。
- 如果键不匹配,构造一个部分成功的结果,并通过 PartialTranslationError 异常抛出,以触发重试。 - 如果键不匹配,构造一个部分成功的结果,并通过 PartialTranslationError 异常抛出,以触发重试。
- 其他错误如JSON解析失败、模型偷懒则抛出普通 ValueError 触发重试。 - 其他错误如JSON解析失败、模型偷懒则抛出普通 ValueError 触发重试。
- MT模式下如果返回的是纯文本而非JSON将其按行分割并映射到原始键。
""" """
# MT模式下直接解析origin_prompt为JSON纯净JSON没有<input>包装)
if self.is_mt_mode:
original_segments = origin_prompt
else:
original_segments = get_original_segments(origin_prompt) original_segments = get_original_segments(origin_prompt)
result = get_target_segments(result) result = get_target_segments(result)
if result == "": if result == "":
@@ -137,6 +144,37 @@ class SegmentsTranslateAgent(Agent):
original_chunk = json_repair.loads(original_segments) original_chunk = json_repair.loads(original_segments)
repaired_result = json_repair.loads(result) repaired_result = json_repair.loads(result)
# MT模式兼容处理各种非标准返回格式
if self.is_mt_mode:
# 如果是列表,尝试合并所有字典
if isinstance(repaired_result, list):
logger.debug(f"[MT模式] 返回结果是列表,包含 {len(repaired_result)} 个元素")
merged_result = {}
for item in repaired_result:
if isinstance(item, dict):
merged_result.update(item)
repaired_result = merged_result
# 如果返回的是纯文本(字符串),尝试将其映射到原始键
if isinstance(repaired_result, str):
original_keys = list(original_chunk.keys())
# 按行分割结果,去除空行
result_lines = [line.strip() for line in repaired_result.split('\n') if line.strip()]
# 如果只有一行结果但多个键,将整个结果分配给第一个键,其余为空
if len(result_lines) == 1 and len(original_keys) > 1:
repaired_result = {original_keys[0]: result_lines[0]}
for key in original_keys[1:]:
repaired_result[key] = ""
# 如果结果行数与键数匹配,逐行对应
elif len(result_lines) == len(original_keys):
repaired_result = {original_keys[i]: result_lines[i] for i in range(len(original_keys))}
# 如果结果行数不匹配,将所有结果合并给第一个键
else:
repaired_result = {original_keys[0]: repaired_result}
for key in original_keys[1:]:
repaired_result[key] = ""
if not isinstance(repaired_result, dict): if not isinstance(repaired_result, dict):
raise AgentResultError(f"Agent返回结果不是dict的json形式, result: {result}") raise AgentResultError(f"Agent返回结果不是dict的json形式, result: {result}")
@@ -174,6 +212,32 @@ class SegmentsTranslateAgent(Agent):
return repaired_result return repaired_result
except (RuntimeError, JSONDecodeError) as e: except (RuntimeError, JSONDecodeError) as e:
# MT模式兼容如果JSON解析失败尝试将结果作为纯文本处理
if self.is_mt_mode:
try:
original_chunk = json_repair.loads(original_segments)
original_keys = list(original_chunk.keys())
result_lines = [line.strip() for line in result.split('\n') if line.strip()]
if len(result_lines) == 1 and len(original_keys) > 1:
repaired_result = {original_keys[0]: result_lines[0]}
for key in original_keys[1:]:
repaired_result[key] = ""
elif len(result_lines) == len(original_keys):
repaired_result = {original_keys[i]: result_lines[i] for i in range(len(original_keys))}
else:
repaired_result = {original_keys[0]: result}
for key in original_keys[1:]:
repaired_result[key] = ""
# 验证结果
if set(repaired_result.keys()) != set(original_chunk.keys()):
raise AgentResultError(f"MT模式解析后键不匹配")
return repaired_result
except Exception as mt_e:
raise AgentResultError(f"MT模式纯文本处理失败: {mt_e.__repr__()}")
# 对于JSON解析等硬性错误继续抛出普通ValueError # 对于JSON解析等硬性错误继续抛出普通ValueError
raise AgentResultError(f"结果处理失败: {e.__repr__()}") raise AgentResultError(f"结果处理失败: {e.__repr__()}")
@@ -182,6 +246,10 @@ class SegmentsTranslateAgent(Agent):
处理在所有重试后仍然失败的请求。 处理在所有重试后仍然失败的请求。
作为备用方案,返回原文内容,并将所有值转换为字符串。 作为备用方案,返回原文内容,并将所有值转换为字符串。
""" """
# MT模式下直接解析origin_prompt为JSON纯净JSON没有<input>包装)
if self.is_mt_mode:
original_segments = origin_prompt
else:
original_segments = get_original_segments(origin_prompt) original_segments = get_original_segments(origin_prompt)
if original_segments == "": if original_segments == "":
return {} return {}
@@ -198,6 +266,10 @@ class SegmentsTranslateAgent(Agent):
def send_segments(self, segments: list[str], chunk_size: int) -> list[str]: def send_segments(self, segments: list[str], chunk_size: int) -> list[str]:
indexed_originals, chunks, merged_indices_list = segments2json_chunks(segments, chunk_size) indexed_originals, chunks, merged_indices_list = segments2json_chunks(segments, chunk_size)
# MT模式下直接发送纯净JSON不添加额外提示词
if self.is_mt_mode:
prompts = [json.dumps(chunk, ensure_ascii=False, indent=0) for chunk in chunks]
else:
prompts = [generate_prompt(json.dumps(chunk, ensure_ascii=False, indent=0), self.to_lang) for chunk in chunks] prompts = [generate_prompt(json.dumps(chunk, ensure_ascii=False, indent=0), self.to_lang) for chunk in chunks]
translated_chunks = super().send_prompts(prompts=prompts, json_format=self.force_json, translated_chunks = super().send_prompts(prompts=prompts, json_format=self.force_json,
pre_send_handler=self._pre_send_handler, pre_send_handler=self._pre_send_handler,
@@ -236,6 +308,10 @@ class SegmentsTranslateAgent(Agent):
async def send_segments_async(self, segments: list[str], chunk_size: int) -> list[str]: async def send_segments_async(self, segments: list[str], chunk_size: int) -> list[str]:
indexed_originals, chunks, merged_indices_list = await asyncio.to_thread(segments2json_chunks, segments, indexed_originals, chunks, merged_indices_list = await asyncio.to_thread(segments2json_chunks, segments,
chunk_size) chunk_size)
# MT模式下直接发送纯净JSON不添加额外提示词
if self.is_mt_mode:
prompts = [json.dumps(chunk, ensure_ascii=False, indent=0) for chunk in chunks]
else:
prompts = [generate_prompt(json.dumps(chunk, ensure_ascii=False, indent=0), self.to_lang) for chunk in chunks] prompts = [generate_prompt(json.dumps(chunk, ensure_ascii=False, indent=0), self.to_lang) for chunk in chunks]
translated_chunks = await super().send_prompts_async(prompts=prompts, force_json=self.force_json, translated_chunks = await super().send_prompts_async(prompts=prompts, force_json=self.force_json,