From 8780b25ae4c908cc9bafd59278449001220a45d6 Mon Sep 17 00:00:00 2001 From: xunbu Date: Thu, 16 Oct 2025 22:20:44 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BC=98=E5=8C=96glossary=5Fagent=E5=92=8Cmark?= =?UTF-8?q?down=5Fagent=E7=9A=84=E6=8F=90=E7=A4=BA=E8=AF=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docutranslate/__init__.py | 2 +- docutranslate/agents/glossary_agent.py | 88 ++++++++++++++++++-------- docutranslate/agents/markdown_agent.py | 54 ++++++---------- 3 files changed, 83 insertions(+), 61 deletions(-) diff --git a/docutranslate/__init__.py b/docutranslate/__init__.py index 59470a8..0ac4d14 100644 --- a/docutranslate/__init__.py +++ b/docutranslate/__init__.py @@ -1,3 +1,3 @@ # SPDX-FileCopyrightText: 2025 QinHan # SPDX-License-Identifier: MPL-2.0 -__version__="1.4.12" \ No newline at end of file +__version__="1.4.13a1" \ No newline at end of file diff --git a/docutranslate/agents/glossary_agent.py b/docutranslate/agents/glossary_agent.py index a5c3fa1..7ace10a 100644 --- a/docutranslate/agents/glossary_agent.py +++ b/docutranslate/agents/glossary_agent.py @@ -3,6 +3,7 @@ import asyncio import json +import re from dataclasses import dataclass from json import JSONDecodeError from logging import Logger @@ -14,6 +15,62 @@ from docutranslate.agents.agent import AgentResultError from docutranslate.utils.json_utils import segments2json_chunks +def generate_prompt(json_segments: str, to_lang: str): + return f""" +You will receive a JSON-formatted list of paragraphs where keys are paragraph numbers and values are paragraph contents. +Here is the input: + + +```json +{json_segments} +``` + +You need to extract person names and location names from these paragraphs and translate these terms into {to_lang}. +Finally, output a glossary of Source Nouns:Target Nouns +> The source noun in the output glossary must exactly match the original term in original language, while target noun is the {to_lang} translation of the term +> Do not extract special tags or untranslatable elements (such as code, brand names, technical terms) +> The same source noun should only appear once in the glossary without repetition +> The Target Nouns + +Here is an example of the expected format: + + +Input: + +```json +{{ +"3":"text", +"4":"text" +}} +``` + +Output + +```json +{'[{"src": "Source Noun1", "dst": "Target Noun1"},\n {"src": "Source Noun2", "dst": "Target Noun2"}, \n{"src": "Source Noun3", "dst": "Target Noun3"}]'} +``` + + +Please return the translated JSON Array directly without including any additional information. +""" + + +def get_original_segments(prompt: str): + match = re.search(r'(.*)', prompt, re.DOTALL) + if match: + return match.group(1) + else: + raise ValueError("无法从prompt中提取初始文本") + + +def get_target_segments(result: str): + match = re.search(r'```json(.*)```', result, re.DOTALL) + if match: + return match.group(1) + else: + return result + + @dataclass class GlossaryAgentConfig(AgentConfig): to_lang: str @@ -27,34 +84,13 @@ class GlossaryAgent(Agent): self.system_prompt = f""" # Role You are a professional glossary extractor - -# Task -You will receive a JSON-formatted list of paragraphs where keys are paragraph numbers and values are paragraph contents. -You need to extract person names and location names from these paragraphs and translate these terms into {self.to_lang}. -Finally, output a glossary of original terms:translated terms - -# Requirements -- The original language is identified based on the context.The target language is {self.to_lang} -- The src in the output glossary must exactly match the original term in original language, while dst is the {self.to_lang} translation of the term -- Do not include special tags or tags formatted as `` in the glossary -- The same src should only appear once in the glossary without repetition -- Do not include common nouns in the glossary. - -# Output -The output format should be plain JSON text in a list format -{[{"src": "", "dst": ""}]} - -# Example1(Assuming the source language is English and the target language is Chinese in the example) -## Input -{{"0":"Jobs likes apples","1":"Bill Gates is sunbathing in Shanghai."}} -## Output -{r'[{"src": "Jobs", "dst": "乔布斯"}, {"src": "Bill Gates", "dst": "比尔盖茨"}, {"src": "Shanghai", "dst": "上海"}]'} """ self.custom_prompt = config.custom_prompt if config.custom_prompt: self.system_prompt += "\n# **Important rules or background** \n" + self.custom_prompt + '\nEND\n' def _result_handler(self, result: str, origin_prompt: str, logger: Logger): + result = get_target_segments(result) if result == "": if origin_prompt.strip() != "": logger.error("result为空值但原文不为空") @@ -66,11 +102,11 @@ The output format should be plain JSON text in a list format raise AgentResultError(f"GlossaryAgent返回结果不是list的json形式, result: {result}") return repaired_result except (RuntimeError, JSONDecodeError) as e: - # 将解析错误包装成 ValueError 以便被 send 方法捕获并重试 raise AgentResultError(f"结果不能正确解析: {e.__repr__()}") def _error_result_handler(self, origin_prompt: str, logger: Logger): - if origin_prompt == "": + origin_prompt = get_original_segments(origin_prompt) + if origin_prompt.strip() == "": return [] try: return json_repair.loads(origin_prompt) @@ -82,7 +118,7 @@ The output format should be plain JSON text in a list format self.logger.info(f"开始提取术语表,to_lang:{self.to_lang}") result = {} indexed_originals, chunks, merged_indices_list = segments2json_chunks(segments, chunk_size) - prompts = [json.dumps(chunk, ensure_ascii=False) for chunk in chunks] + prompts = [generate_prompt(json.dumps(chunk, ensure_ascii=False), self.to_lang) for chunk in chunks] translated_chunks = super().send_prompts(prompts=prompts, result_handler=self._result_handler, error_result_handler=self._error_result_handler) @@ -106,7 +142,7 @@ The output format should be plain JSON text in a list format result = {} indexed_originals, chunks, merged_indices_list = await asyncio.to_thread(segments2json_chunks, segments, chunk_size) - prompts = [json.dumps(chunk, ensure_ascii=False) for chunk in chunks] + prompts = [generate_prompt(json.dumps(chunk, ensure_ascii=False), self.to_lang) for chunk in chunks] translated_chunks = await super().send_prompts_async(prompts=prompts, result_handler=self._result_handler, error_result_handler=self._error_result_handler) diff --git a/docutranslate/agents/markdown_agent.py b/docutranslate/agents/markdown_agent.py index 0d20f10..ef64b94 100644 --- a/docutranslate/agents/markdown_agent.py +++ b/docutranslate/agents/markdown_agent.py @@ -6,6 +6,22 @@ from dataclasses import dataclass from .agent import Agent, AgentConfig from ..glossary.glossary import Glossary +def generate_prompt(markdown_text: str, to_lang: str): + return f""" +Treat the text input as markdown text and translate it into {to_lang},output translation ONLY. +- NO explanations. NO notes. +- Do not change placeholders in the format of ``. +- For special tags or other non-translatable elements (like codes, brand names, specific jargon), keep them in their original form. +- All formulas, regardless of length, must be represented as valid, parsable LaTeX. They must be correctly enclosed by `$`, `\\(\\)`, or `$$`. If a formula is not formatted correctly, you must fix it. +- Remove or correct any obviously abnormal characters, but without altering the original meaning. +- When citing references, strictly preserve the original text; do not translate them. Examples of reference formats are as follows: + [1] Author A, Author B. "Original Title". Journal, 2023. + [2] 作者C. 《中文标题》. 期刊, 2022. +- Output the translated markdown text as plain text (not in a markdown code block, with no extraneous text). + +The markdown text input: + {markdown_text} +""" @dataclass class MDTranslateAgentConfig(AgentConfig): @@ -17,43 +33,11 @@ class MDTranslateAgentConfig(AgentConfig): class MDTranslateAgent(Agent): def __init__(self, config: MDTranslateAgentConfig): super().__init__(config) + self.to_lang=config.to_lang self.system_prompt = f""" # Role You are a professional machine translation engine. - -# Task -Translate the input markdown text. -Target language: {config.to_lang} - -# Requirements -- The translation must be professional and accurate. -- Do not output any explanations or annotations. -- For personal names and proper nouns, use the most commonly used words for translation. If there are multiple common translations, choose the word that comes first in dictionary order. -- For special tags or other non-translatable elements (like codes, brand names, specific jargon), keep them in their original form. -- Do not change placeholders in the format of ``. -- All formulas, regardless of length, must be represented as valid, parsable LaTeX. They must be correctly enclosed by `$`, `\\(\\)`, or `$$`. If a formula is not formatted correctly, you must fix it. -- Remove or correct any obviously abnormal characters, but without altering the original meaning. -- When citing references, strictly preserve the original text; do not translate them. Examples of reference formats are as follows: - [1] Author A, Author B. "Original Title". Journal, 2023. - [2] 作者C. 《中文标题》. 期刊, 2022. - -# Output -The translated markdown text as plain text (not in a markdown code block, with no extraneous text). - -# Example(Assuming the target language is Chinese in the example, {config.to_lang} is the actual target language) -Input: -hello, what's your nam*@e? -![photo title]() -The equation is E=mc 2. This is famous. -1+1=2$$ -(c_0,c_1_1,c_2^2)is a coordinate. - -Output: -你好,你叫什么名字? -![图像标题]() -这个方程是 $E=mc^2$。这很有名。 -$$1+1=2$$ -\\((c_0,c_1,c_2^2)\\)是一个坐标。""" +""" self.custom_prompt = config.custom_prompt if config.custom_prompt: self.system_prompt += "\n# **Important rules or background** \n" + self.custom_prompt + '\nEND\n' @@ -66,9 +50,11 @@ $$1+1=2$$ return system_prompt, prompt def send_chunks(self, prompts: list[str]): + prompts=[generate_prompt(prompt,self.to_lang) for prompt in prompts] return super().send_prompts(prompts=prompts, pre_send_handler=self._pre_send_handler) async def send_chunks_async(self, prompts: list[str]): + prompts = [generate_prompt(prompt, self.to_lang) for prompt in prompts] return await super().send_prompts_async(prompts=prompts, pre_send_handler=self._pre_send_handler) def update_glossary_dict(self, update_dict: dict | None):