From 9e82daa2a162abb20b511400b2ce330ef872bfbd Mon Sep 17 00:00:00 2001 From: toy Date: Tue, 10 Feb 2026 15:50:42 +0800 Subject: [PATCH] =?UTF-8?q?feat:=E5=85=BC=E5=AE=B9qwen-mt=E6=A8=A1?= =?UTF-8?q?=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- AGENTS.md | 39 ++++ docutranslate/agents/agent.py | 273 ++++++++++++++++++++++++- docutranslate/agents/segments_agent.py | 102 +++++++-- 3 files changed, 400 insertions(+), 14 deletions(-) create mode 100644 AGENTS.md diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..825f43b --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,39 @@ +# Repository Guidelines + +## Project Structure & Module Organization +Core Python code is in `docutranslate/`. +- `docutranslate/workflow/`: file-type workflows (`*_workflow.py`) that orchestrate conversion, translation, and export. +- `docutranslate/converter/`, `docutranslate/translator/`, `docutranslate/exporter/`: pipeline stages. +- `docutranslate/app.py`: FastAPI/Web UI backend entrypoint. +- `docutranslate/cli.py`: CLI entry (`docutranslate` command). +- `docutranslate/static/` and `docutranslate/template/`: bundled frontend/static assets. +- Packaging/build files live at root: `pyproject.toml`, `Dockerfile`, `*.spec`, `.github/workflows/`. + +## Build, Test, and Development Commands +- `uv sync`: install project dependencies from `pyproject.toml`/`uv.lock`. +- `uv run docutranslate -i`: start local Web UI + API (default `127.0.0.1:8010`). +- `uv run docutranslate -i -p 8011 --cors`: run on a custom port with CORS enabled. +- `docker run -d -p 8010:8010 xunbu/docutranslate:latest`: run the published container locally. +- `uv pip install pyinstaller && uv run pyinstaller lite.spec --noconfirm --clean -y`: build a lightweight desktop package (see also `full.spec`, `lite_mac.spec`). + +## Coding Style & Naming Conventions +- Follow Python 3.11+ conventions and PEP 8: 4-space indentation, clear type-oriented config classes, small focused functions. +- Use `snake_case` for modules/functions/variables and `PascalCase` for classes. +- Keep workflow naming consistent: `xxx_workflow.py`, matching config and workflow class names. +- Prefer explicit, composable configs over hard-coded provider values. + +## Testing Guidelines +There is currently no first-party `tests/` suite or enforced coverage gate in this repository. +- For behavior changes, run a manual smoke test: start `docutranslate -i`, open `/docs`, and execute at least one translation path you touched. +- If you add automated tests, place them under `tests/` with `test_*.py` names and keep fixtures small and file-type specific. + +## Commit & Pull Request Guidelines +Recent history favors short, imperative commit subjects (Chinese or English), for example: `Fix Gemini provider tag`, `Add regex dependency`, `Add Vietnamese`. +- Keep subject lines concise and action-focused. +- In PRs, include: what changed, why, how you validated it, and UI screenshots when `docutranslate/static/` or interface behavior changes. +- Link related issues and note any new env vars/API provider requirements. + +## Security & Configuration Tips +- Never commit real API keys or tokens. +- Keep provider credentials in environment variables or local untracked config. +- For LAN exposure, use `--host 0.0.0.0` intentionally and restrict network access as needed. diff --git a/docutranslate/agents/agent.py b/docutranslate/agents/agent.py index be405d5..762361b 100644 --- a/docutranslate/agents/agent.py +++ b/docutranslate/agents/agent.py @@ -290,6 +290,223 @@ _COMPLEX_SCRIPT_PATTERN = re.compile( r'[\u2e80-\u9fff\u0400-\u04ff\u0600-\u06ff\u0e00-\u0e7f\u0900-\u097f]' ) +def _normalize_mt_lang_key(lang: str) -> str: + key = str(lang).strip().lower() + key = key.replace("_", "-") + key = key.replace("'", "'").replace("'", "'") + key = key.replace("–", "-").replace("—", "-") + key = re.sub(r"\s+", " ", key) + return key + + +_MT_LANG_BY_CODE = { + "en": "English", + "zh": "Chinese", + "zh-tw": "Traditional Chinese", + "ru": "Russian", + "ja": "Japanese", + "ko": "Korean", + "es": "Spanish", + "fr": "French", + "pt": "Portuguese", + "de": "German", + "it": "Italian", + "th": "Thai", + "vi": "Vietnamese", + "id": "Indonesian", + "ms": "Malay", + "ar": "Arabic", + "hi": "Hindi", + "he": "Hebrew", + "my": "Burmese", + "ta": "Tamil", + "ur": "Urdu", + "bn": "Bengali", + "pl": "Polish", + "nl": "Dutch", + "ro": "Romanian", + "tr": "Turkish", + "km": "Khmer", + "lo": "Lao", + "yue": "Cantonese", + "cs": "Czech", + "el": "Greek", + "sv": "Swedish", + "hu": "Hungarian", + "da": "Danish", + "fi": "Finnish", + "uk": "Ukrainian", + "bg": "Bulgarian", + "sr": "Serbian", + "te": "Telugu", + "af": "Afrikaans", + "hy": "Armenian", + "as": "Assamese", + "ast": "Asturian", + "eu": "Basque", + "be": "Belarusian", + "bs": "Bosnian", + "ca": "Catalan", + "ceb": "Cebuano", + "hr": "Croatian", + "arz": "Egyptian Arabic", + "et": "Estonian", + "gl": "Galician", + "ka": "Georgian", + "gu": "Gujarati", + "is": "Icelandic", + "jv": "Javanese", + "kn": "Kannada", + "kk": "Kazakh", + "lv": "Latvian", + "lt": "Lithuanian", + "lb": "Luxembourgish", + "mk": "Macedonian", + "mai": "Maithili", + "mt": "Maltese", + "mr": "Marathi", + "acm": "Mesopotamian Arabic", + "ary": "Moroccan Arabic", + "ars": "Najdi Arabic", + "ne": "Nepali", + "az": "North Azerbaijani", + "apc": "North Levantine Arabic", + "uz": "Northern Uzbek", + "nb": "Norwegian Bokmål", + "nn": "Norwegian Nynorsk", + "oc": "Occitan", + "or": "Odia", + "pag": "Pangasinan", + "scn": "Sicilian", + "sd": "Sindhi", + "si": "Sinhala", + "sk": "Slovak", + "sl": "Slovenian", + "ajp": "South Levantine Arabic", + "sw": "Swahili", + "tl": "Tagalog", + "acq": "Ta'izzi-Adeni Arabic", + "sq": "Tosk Albanian", + "aeb": "Tunisian Arabic", + "vec": "Venetian", + "war": "Waray", + "cy": "Welsh", + "fa": "Western Persian", +} + +_MT_LANG_BY_NAME = { + _normalize_mt_lang_key(name): name for name in set(_MT_LANG_BY_CODE.values()) +} + +_MT_LANG_ALIASES = { + # Existing UI/common aliases + "english": "English", + "英语": "English", + "英文": "English", + "简体中文": "Chinese", + "中文": "Chinese", + "simplified chinese": "Chinese", + "chinese": "Chinese", + "traditional chinese": "Traditional Chinese", + "繁体中文": "Traditional Chinese", + "zh-hans": "Chinese", + "zh-cn": "Chinese", + "zh-hant": "Traditional Chinese", + # Full Chinese aliases from qwen-mt language list + "俄语": "Russian", + "日语": "Japanese", + "韩语": "Korean", + "西班牙语": "Spanish", + "法语": "French", + "葡萄牙语": "Portuguese", + "德语": "German", + "意大利语": "Italian", + "泰语": "Thai", + "越南语": "Vietnamese", + "印度尼西亚语": "Indonesian", + "马来语": "Malay", + "阿拉伯语": "Arabic", + "印地语": "Hindi", + "希伯来语": "Hebrew", + "缅甸语": "Burmese", + "泰米尔语": "Tamil", + "乌尔都语": "Urdu", + "孟加拉语": "Bengali", + "波兰语": "Polish", + "荷兰语": "Dutch", + "罗马尼亚语": "Romanian", + "土耳其语": "Turkish", + "高棉语": "Khmer", + "老挝语": "Lao", + "粤语": "Cantonese", + "捷克语": "Czech", + "希腊语": "Greek", + "瑞典语": "Swedish", + "匈牙利语": "Hungarian", + "丹麦语": "Danish", + "芬兰语": "Finnish", + "乌克兰语": "Ukrainian", + "保加利亚语": "Bulgarian", + "塞尔维亚语": "Serbian", + "泰卢固语": "Telugu", + "南非荷兰语": "Afrikaans", + "亚美尼亚语": "Armenian", + "阿萨姆语": "Assamese", + "阿斯图里亚斯语": "Asturian", + "巴斯克语": "Basque", + "白俄罗斯语": "Belarusian", + "波斯尼亚语": "Bosnian", + "加泰罗尼亚语": "Catalan", + "宿务语": "Cebuano", + "克罗地亚语": "Croatian", + "埃及阿拉伯语": "Egyptian Arabic", + "爱沙尼亚语": "Estonian", + "加利西亚语": "Galician", + "格鲁吉亚语": "Georgian", + "古吉拉特语": "Gujarati", + "冰岛语": "Icelandic", + "爪哇语": "Javanese", + "卡纳达语": "Kannada", + "哈萨克语": "Kazakh", + "拉脱维亚语": "Latvian", + "立陶宛语": "Lithuanian", + "卢森堡语": "Luxembourgish", + "马其顿语": "Macedonian", + "马加希语": "Maithili", + "马耳他语": "Maltese", + "马拉地语": "Marathi", + "美索不达米亚阿拉伯语": "Mesopotamian Arabic", + "摩洛哥阿拉伯语": "Moroccan Arabic", + "内志阿拉伯语": "Najdi Arabic", + "尼泊尔语": "Nepali", + "北阿塞拜疆语": "North Azerbaijani", + "北黎凡特阿拉伯语": "North Levantine Arabic", + "北乌兹别克语": "Northern Uzbek", + "书面语挪威语": "Norwegian Bokmål", + "新挪威语": "Norwegian Nynorsk", + "奥克语": "Occitan", + "奥里亚语": "Odia", + "邦阿西楠语": "Pangasinan", + "西西里语": "Sicilian", + "信德语": "Sindhi", + "僧伽罗语": "Sinhala", + "斯洛伐克语": "Slovak", + "斯洛文尼亚语": "Slovenian", + "南黎凡特阿拉伯语": "South Levantine Arabic", + "斯瓦希里语": "Swahili", + "他加禄语": "Tagalog", + "塔伊兹-亚丁阿拉伯语": "Ta'izzi-Adeni Arabic", + "托斯克阿尔巴尼亚语": "Tosk Albanian", + "突尼斯阿拉伯语": "Tunisian Arabic", + "威尼斯语": "Venetian", + "瓦莱语": "Waray", + "威尔士语": "Welsh", + "西波斯语": "Western Persian", + # English punctuation/variant aliases + "norwegian bokmal": "Norwegian Bokmål", + "ta'izzi-adeni arabic": "Ta'izzi-Adeni Arabic", +} + class Agent: def __init__(self, config: AgentConfig): @@ -316,6 +533,10 @@ class Agent: self.rate_limiter = RateLimiter(rpm=config.rpm, tpm=config.tpm) self.provider = config.provider if config.provider is not None else get_provider_by_domain(self.domain) + self.is_mt_mode = "mt" in self.model_id.lower() + self.mt_source_lang = getattr(config, "source_lang", "auto") + self.mt_target_lang = getattr(config, "to_lang", None) + self.mt_domains = getattr(config, "custom_prompt", None) def _estimate_tokens(self, text: str) -> int: """ @@ -352,6 +573,43 @@ class Agent: elif self.thinking == "disable": data[field_thinking] = val_disable + def _normalize_mt_lang(self, lang: str | None) -> str | None: + if lang is None: + return None + lang_text = str(lang).strip() + if not lang_text: + return None + key = _normalize_mt_lang_key(lang_text) + if key in _MT_LANG_BY_CODE: + return _MT_LANG_BY_CODE[key] + if key in _MT_LANG_BY_NAME: + return _MT_LANG_BY_NAME[key] + if key in _MT_LANG_ALIASES: + return _MT_LANG_ALIASES[key] + return lang_text + + def _build_mt_translation_options(self) -> dict: + translation_options = {} + + source_lang = self._normalize_mt_lang(self.mt_source_lang) + if source_lang: + translation_options["source_lang"] = source_lang + + target_lang = self._normalize_mt_lang(self.mt_target_lang) + if target_lang: + translation_options["target_lang"] = target_lang + + domains = str(self.mt_domains).strip() if self.mt_domains is not None else "" + if domains: + translation_options["domains"] = domains + + return translation_options + + def _build_mt_user_prompt(self, prompt: str, system_prompt: str) -> str: + # MT模式下,直接返回原始prompt,不添加任何system prompt + # MT模型会把整个user prompt当作待翻译内容 + return prompt + def _prepare_request_data( self, prompt: str, system_prompt: str, temperature=None, top_p=0.9, json_format=False ): @@ -361,6 +619,19 @@ class Agent: "Content-Type": "application/json", "Authorization": f"Bearer {self.key}", } + + if self.is_mt_mode: + data = { + "model": self.model_id, + "messages": [ + {"role": "user", "content": self._build_mt_user_prompt(prompt, system_prompt)}, + ], + } + translation_options = self._build_mt_translation_options() + if translation_options: + data["translation_options"] = translation_options + return headers, data + data = { "model": self.model_id, "messages": [ @@ -1150,4 +1421,4 @@ class Agent: f"总计: {token_stats['total_tokens'] / 1000:.2f}K" ) - return output_list \ No newline at end of file + return output_list diff --git a/docutranslate/agents/segments_agent.py b/docutranslate/agents/segments_agent.py index bd01804..a684f02 100644 --- a/docutranslate/agents/segments_agent.py +++ b/docutranslate/agents/segments_agent.py @@ -32,15 +32,15 @@ For each Key-Value Pair in the JSON, translate the contents of the value into {t > The segment IDs in the output must exactly match those in the input. And all segment IDs in input must appear in the output. > If necessary, two segments can only be translated together, the translation should be proportionally allocated to the corresponding key's value based on the word count ratio of the segments. -Here is an example of the expected format: +Here is an example of the expected format (Note: This is ONLY a format example, do NOT translate the example content): Input: ```json {{ -"3":source, -"4":source, +"EXAMPLE_KEY_1": "source text", +"EXAMPLE_KEY_2": "source text" }} ``` @@ -48,8 +48,8 @@ Output(target language: {to_lang}): ```json {{ -"3":translation, -"4":translation, +"EXAMPLE_KEY_1": "translated text", +"EXAMPLE_KEY_2": "translated text" }} ``` For statements that must be combined during translation, employ merging at the minimal structural level. The total number of keys must remain unchanged after merging, and any empty values should be retained. @@ -58,18 +58,20 @@ Below is an example of how merging should be done when necessary: input: ```json {{ -"3":"汤姆说:“杰克你", -"4":"好”。" +"EXAMPLE_KEY_1":"汤姆说:“杰克你", +"EXAMPLE_KEY_2":"好”。" }} ``` output: ```json {{ -"3":"Tom says:\"Hello Jack.\"", -"4":"" +"EXAMPLE_KEY_1":"Tom says:\"Hello Jack.\"", +"EXAMPLE_KEY_2":"" }} ``` + +IMPORTANT: Only translate the content in the section above. Do NOT include or translate the example content from this section in your output. Please return the translated JSON directly without including any additional information and preserve special tags or untranslatable elements (such as code, brand names, technical terms) as they are. """ @@ -124,8 +126,13 @@ class SegmentsTranslateAgent(Agent): - 如果键完全匹配,返回翻译结果。 - 如果键不匹配,构造一个部分成功的结果,并通过 PartialTranslationError 异常抛出,以触发重试。 - 其他错误(如JSON解析失败、模型偷懒)则抛出普通 ValueError 触发重试。 + - MT模式下,如果返回的是纯文本而非JSON,将其按行分割并映射到原始键。 """ - original_segments = get_original_segments(origin_prompt) + # MT模式下直接解析origin_prompt为JSON(纯净JSON,没有包装) + if self.is_mt_mode: + original_segments = origin_prompt + else: + original_segments = get_original_segments(origin_prompt) result = get_target_segments(result) if result == "": if original_segments.strip() != "": @@ -137,6 +144,37 @@ class SegmentsTranslateAgent(Agent): original_chunk = json_repair.loads(original_segments) repaired_result = json_repair.loads(result) + # MT模式兼容:处理各种非标准返回格式 + if self.is_mt_mode: + # 如果是列表,尝试合并所有字典 + if isinstance(repaired_result, list): + logger.debug(f"[MT模式] 返回结果是列表,包含 {len(repaired_result)} 个元素") + merged_result = {} + for item in repaired_result: + if isinstance(item, dict): + merged_result.update(item) + repaired_result = merged_result + + # 如果返回的是纯文本(字符串),尝试将其映射到原始键 + if isinstance(repaired_result, str): + original_keys = list(original_chunk.keys()) + # 按行分割结果,去除空行 + result_lines = [line.strip() for line in repaired_result.split('\n') if line.strip()] + + # 如果只有一行结果但多个键,将整个结果分配给第一个键,其余为空 + if len(result_lines) == 1 and len(original_keys) > 1: + repaired_result = {original_keys[0]: result_lines[0]} + for key in original_keys[1:]: + repaired_result[key] = "" + # 如果结果行数与键数匹配,逐行对应 + elif len(result_lines) == len(original_keys): + repaired_result = {original_keys[i]: result_lines[i] for i in range(len(original_keys))} + # 如果结果行数不匹配,将所有结果合并给第一个键 + else: + repaired_result = {original_keys[0]: repaired_result} + for key in original_keys[1:]: + repaired_result[key] = "" + if not isinstance(repaired_result, dict): raise AgentResultError(f"Agent返回结果不是dict的json形式, result: {result}") @@ -174,6 +212,32 @@ class SegmentsTranslateAgent(Agent): return repaired_result except (RuntimeError, JSONDecodeError) as e: + # MT模式兼容:如果JSON解析失败,尝试将结果作为纯文本处理 + if self.is_mt_mode: + try: + original_chunk = json_repair.loads(original_segments) + original_keys = list(original_chunk.keys()) + result_lines = [line.strip() for line in result.split('\n') if line.strip()] + + if len(result_lines) == 1 and len(original_keys) > 1: + repaired_result = {original_keys[0]: result_lines[0]} + for key in original_keys[1:]: + repaired_result[key] = "" + elif len(result_lines) == len(original_keys): + repaired_result = {original_keys[i]: result_lines[i] for i in range(len(original_keys))} + else: + repaired_result = {original_keys[0]: result} + for key in original_keys[1:]: + repaired_result[key] = "" + + # 验证结果 + if set(repaired_result.keys()) != set(original_chunk.keys()): + raise AgentResultError(f"MT模式解析后键不匹配") + + return repaired_result + except Exception as mt_e: + raise AgentResultError(f"MT模式纯文本处理失败: {mt_e.__repr__()}") + # 对于JSON解析等硬性错误,继续抛出普通ValueError raise AgentResultError(f"结果处理失败: {e.__repr__()}") @@ -182,7 +246,11 @@ class SegmentsTranslateAgent(Agent): 处理在所有重试后仍然失败的请求。 作为备用方案,返回原文内容,并将所有值转换为字符串。 """ - original_segments = get_original_segments(origin_prompt) + # MT模式下直接解析origin_prompt为JSON(纯净JSON,没有包装) + if self.is_mt_mode: + original_segments = origin_prompt + else: + original_segments = get_original_segments(origin_prompt) if original_segments == "": return {} try: @@ -198,7 +266,11 @@ class SegmentsTranslateAgent(Agent): def send_segments(self, segments: list[str], chunk_size: int) -> list[str]: indexed_originals, chunks, merged_indices_list = segments2json_chunks(segments, chunk_size) - prompts = [generate_prompt(json.dumps(chunk, ensure_ascii=False, indent=0), self.to_lang) for chunk in chunks] + # MT模式下直接发送纯净JSON,不添加额外提示词 + if self.is_mt_mode: + prompts = [json.dumps(chunk, ensure_ascii=False, indent=0) for chunk in chunks] + else: + prompts = [generate_prompt(json.dumps(chunk, ensure_ascii=False, indent=0), self.to_lang) for chunk in chunks] translated_chunks = super().send_prompts(prompts=prompts, json_format=self.force_json, pre_send_handler=self._pre_send_handler, result_handler=self._result_handler, @@ -236,7 +308,11 @@ class SegmentsTranslateAgent(Agent): async def send_segments_async(self, segments: list[str], chunk_size: int) -> list[str]: indexed_originals, chunks, merged_indices_list = await asyncio.to_thread(segments2json_chunks, segments, chunk_size) - prompts = [generate_prompt(json.dumps(chunk, ensure_ascii=False, indent=0), self.to_lang) for chunk in chunks] + # MT模式下直接发送纯净JSON,不添加额外提示词 + if self.is_mt_mode: + prompts = [json.dumps(chunk, ensure_ascii=False, indent=0) for chunk in chunks] + else: + prompts = [generate_prompt(json.dumps(chunk, ensure_ascii=False, indent=0), self.to_lang) for chunk in chunks] translated_chunks = await super().send_prompts_async(prompts=prompts, force_json=self.force_json, pre_send_handler=self._pre_send_handler,