优化glossary_agent和markdown_agent的提示词

2025-10-16 22:20:44 +08:00
parent 324ad77a2e
commit 8780b25ae4
3 changed files with 83 additions and 61 deletions
--- a/docutranslate/__init__.py
+++ b/docutranslate/__init__.py
@@ -1,3 +1,3 @@
 # SPDX-FileCopyrightText: 2025 QinHan
 # SPDX-License-Identifier: MPL-2.0
-__version__="1.4.12"
+__version__="1.4.13a1"
--- a/docutranslate/agents/glossary_agent.py
+++ b/docutranslate/agents/glossary_agent.py
@@ -3,6 +3,7 @@
 import asyncio
 import json
 import re
 from dataclasses import dataclass
 from json import JSONDecodeError
 from logging import Logger
@@ -14,6 +15,62 @@ from docutranslate.agents.agent import AgentResultError
 from docutranslate.utils.json_utils import segments2json_chunks
 def generate_prompt(json_segments: str, to_lang: str):
    """Build the user prompt asking the model for a glossary of names.

    The paragraphs are embedded verbatim between ``<input>`` and ``</input>``
    inside a fenced ``json`` block, followed by the extraction rules and an
    example of the expected output format.

    Args:
        json_segments: JSON text whose keys are paragraph numbers and whose
            values are paragraph contents.
        to_lang: Target language the extracted terms are translated into.

    Returns:
        The complete prompt text.
    """
    # Built outside the f-string: a backslash inside an f-string replacement
    # field is a SyntaxError before Python 3.12 (PEP 701). The rendered text
    # is identical to embedding the literal directly in the template.
    example_output = (
        '[{"src": "Source Noun1", "dst": "Target Noun1"},\n'
        ' {"src": "Source Noun2", "dst": "Target Noun2"}, \n'
        '{"src": "Source Noun3", "dst": "Target Noun3"}]'
    )
    # NOTE(review): the rule "> The Target Nouns" in the template is an
    # unfinished sentence; the intended wording is unknown, so it is kept as is.
    return f"""
 You will receive a JSON-formatted list of paragraphs where keys are paragraph numbers and values are paragraph contents.
 Here is the input:
 <input>
 ```json
 {json_segments}
 ```
 </input>
 You need to extract person names and location names from these paragraphs and translate these terms into {to_lang}.
 Finally, output a glossary of Source Nouns:Target Nouns
 > The source noun in the output glossary must exactly match the original term in original language, while target noun is the {to_lang} translation of the term
 > Do not extract special tags or untranslatable elements (such as code, brand names, technical terms)
 > The same source noun should only appear once in the glossary without repetition
 > The Target Nouns
 Here is an example of the expected format:
 <example>
 Input:
 ```json
 {{
 "3":"text",
 "4":"text"
 }}
 ```
 Output
 ```json
 {example_output}
 ```
 </example>
 Please return the translated JSON Array directly without including any additional information.
 """
 def get_original_segments(prompt: str):
    """Return the text found between ``<input>`` and ``</input>`` in *prompt*.

    The match is greedy and spans newlines, so the result runs from the first
    ``<input>`` to the last ``</input>`` and includes everything in between.

    Raises:
        ValueError: If *prompt* contains no ``<input>...</input>`` pair.
    """
    found = re.search(r'<input>(.*)</input>', prompt, re.DOTALL)
    if found is None:
        raise ValueError("无法从prompt中提取初始文本")
    return found.group(1)
 def get_target_segments(result: str):
    """Return the contents of a fenced ``json`` block in *result*, if any.

    The match is greedy and spans newlines: it runs from the first
    ```` ```json ```` marker to the last ```` ``` ````. When no such fence is
    present, *result* is returned unchanged.
    """
    fenced = re.search(r'```json(.*)```', result, re.DOTALL)
    return fenced.group(1) if fenced else result
@dataclass
 class GlossaryAgentConfig(AgentConfig):
    to_lang: str
@@ -27,34 +84,13 @@ class GlossaryAgent(Agent):
        self.system_prompt = f"""
 # Role
 You are a professional glossary extractor
 # Task
 You will receive a JSON-formatted list of paragraphs where keys are paragraph numbers and values are paragraph contents.
 You need to extract person names and location names from these paragraphs and translate these terms into {self.to_lang}.
 Finally, output a glossary of original terms:translated terms
 # Requirements
 - The original language is identified based on the context.The target language is {self.to_lang}
 - The src in the output glossary must exactly match the original term in original language, while dst is the {self.to_lang} translation of the term
 - Do not include special tags or tags formatted as `<ph-xxxxxx>` in the glossary
 - The same src should only appear once in the glossary without repetition
 - Do not include common nouns in the glossary.
 # Output
 The output format should be plain JSON text in a list format
 {[{"src": "<Original Term>", "dst": "<Translated Term>"}]}
 # Example1(Assuming the source language is English and the target language is Chinese in the example)
 ## Input
 {{"0":"Jobs likes apples","1":"Bill Gates is sunbathing in Shanghai."}}
 ## Output
 {r'[{"src": "Jobs", "dst": "乔布斯"}, {"src": "Bill Gates", "dst": "比尔盖茨"}, {"src": "Shanghai", "dst": "上海"}]'}
 """
        self.custom_prompt = config.custom_prompt
        if config.custom_prompt:
            self.system_prompt += "\n# **Important rules or background** \n" + self.custom_prompt + '\nEND\n'
    def _result_handler(self, result: str, origin_prompt: str, logger: Logger):
        result = get_target_segments(result)
        if result == "":
            if origin_prompt.strip() != "":
                logger.error("result为空值但原文不为空")
@@ -66,11 +102,11 @@ The output format should be plain JSON text in a list format
                raise AgentResultError(f"GlossaryAgent返回结果不是list的json形式, result: {result}")
            return repaired_result
        except (RuntimeError, JSONDecodeError) as e:
            # 将解析错误包装成 ValueError 以便被 send 方法捕获并重试
            raise AgentResultError(f"结果不能正确解析: {e.__repr__()}")
    def _error_result_handler(self, origin_prompt: str, logger: Logger):
-        if origin_prompt == "":
+        origin_prompt = get_original_segments(origin_prompt)
        if origin_prompt.strip() == "":
            return []
        try:
            return json_repair.loads(origin_prompt)
@@ -82,7 +118,7 @@ The output format should be plain JSON text in a list format
        self.logger.info(f"开始提取术语表,to_lang:{self.to_lang}")
        result = {}
        indexed_originals, chunks, merged_indices_list = segments2json_chunks(segments, chunk_size)
-        prompts = [json.dumps(chunk, ensure_ascii=False) for chunk in chunks]
+        prompts = [generate_prompt(json.dumps(chunk, ensure_ascii=False), self.to_lang) for chunk in chunks]
        translated_chunks = super().send_prompts(prompts=prompts,
                                                 result_handler=self._result_handler,
                                                 error_result_handler=self._error_result_handler)
@@ -106,7 +142,7 @@ The output format should be plain JSON text in a list format
        result = {}
        indexed_originals, chunks, merged_indices_list = await asyncio.to_thread(segments2json_chunks, segments,
                                                                                 chunk_size)
-        prompts = [json.dumps(chunk, ensure_ascii=False) for chunk in chunks]
+        prompts = [generate_prompt(json.dumps(chunk, ensure_ascii=False), self.to_lang) for chunk in chunks]
        translated_chunks = await super().send_prompts_async(prompts=prompts,
                                                             result_handler=self._result_handler,
                                                             error_result_handler=self._error_result_handler)
--- a/docutranslate/agents/markdown_agent.py
+++ b/docutranslate/agents/markdown_agent.py
@@ -6,6 +6,22 @@ from dataclasses import dataclass
 from .agent import Agent, AgentConfig
 from ..glossary.glossary import Glossary
 def generate_prompt(markdown_text: str, to_lang: str) -> str:
    """Build the user prompt that asks the model to translate markdown text.

    The prompt states the translation rules (placeholders, non-translatable
    elements, LaTeX formulas, references, plain-text output) and then appends
    the markdown to translate.

    Args:
        markdown_text: Markdown source, inserted verbatim at the end of the
            prompt.
        to_lang: Target language named in the instruction.

    Returns:
        The complete prompt text.
    """
    return f"""
 Treat the text input as markdown text and translate it into {to_lang},output translation ONLY. 
 - NO explanations. NO notes. 
 - Do not change placeholders in the format of `<ph-xxxxxx>`.
 - For special tags or other non-translatable elements (like codes, brand names, specific jargon), keep them in their original form.
 - All formulas, regardless of length, must be represented as valid, parsable LaTeX. They must be correctly enclosed by `$`, `\\(\\)`, or `$$`. If a formula is not formatted correctly, you must fix it.
 - Remove or correct any obviously abnormal characters, but without altering the original meaning.
 - When citing references, strictly preserve the original text; do not translate them. Examples of reference formats are as follows:
  [1] Author A, Author B. "Original Title". Journal, 2023.
  [2] 作者C. 《中文标题》. 期刊, 2022.
 - Output the translated markdown text as plain text (not in a markdown code block, with no extraneous text).
 The markdown text input:
 {markdown_text}
 """
@dataclass
 class MDTranslateAgentConfig(AgentConfig):
@@ -17,43 +33,11 @@ class MDTranslateAgentConfig(AgentConfig):
 class MDTranslateAgent(Agent):
    def __init__(self, config: MDTranslateAgentConfig):
        super().__init__(config)
        self.to_lang=config.to_lang
        self.system_prompt = f"""
 # Role
 You are a professional machine translation engine.
-
+"""
 # Task
 Translate the input markdown text.
 Target language: {config.to_lang}
 # Requirements
 - The translation must be professional and accurate.
 - Do not output any explanations or annotations.
 - For personal names and proper nouns, use the most commonly used words for translation. If there are multiple common translations, choose the word that comes first in dictionary order.
 - For special tags or other non-translatable elements (like codes, brand names, specific jargon), keep them in their original form.
 - Do not change placeholders in the format of `<ph-xxxxxx>`.
 - All formulas, regardless of length, must be represented as valid, parsable LaTeX. They must be correctly enclosed by `$`, `\\(\\)`, or `$$`. If a formula is not formatted correctly, you must fix it.
 - Remove or correct any obviously abnormal characters, but without altering the original meaning.
 - When citing references, strictly preserve the original text; do not translate them. Examples of reference formats are as follows:
  [1] Author A, Author B. "Original Title". Journal, 2023.
  [2] 作者C. 《中文标题》. 期刊, 2022.
 # Output
 The translated markdown text as plain text (not in a markdown code block, with no extraneous text).
 # Example(Assuming the target language is Chinese in the example, {config.to_lang} is the actual target language)
 Input:
 hello, what's your nam*@e?
 ![photo title](<ph-abcdde>)
 The equation is E=mc 2. This is famous.
 1+1=2$$
 (c_0,c_1_1,c_2^2)is a coordinate.
 Output:
 你好，你叫什么名字？
 ![图像标题](<ph-abcdde>)
 这个方程是 $E=mc^2$。这很有名。
 $$1+1=2$$
 \\((c_0,c_1,c_2^2)\\)是一个坐标。"""
        self.custom_prompt = config.custom_prompt
        if config.custom_prompt:
            self.system_prompt += "\n# **Important rules or background** \n" + self.custom_prompt + '\nEND\n'
@@ -66,9 +50,11 @@ $$1+1=2$$
        return system_prompt, prompt
    def send_chunks(self, prompts: list[str]):
        """Wrap each markdown chunk in the translation prompt and send them all.

        Every item of *prompts* is passed through ``generate_prompt`` together
        with ``self.to_lang``; the wrapped prompts go to the parent class's
        ``send_prompts`` with ``self._pre_send_handler`` as the pre-send hook.
        """
        wrapped = [generate_prompt(chunk, self.to_lang) for chunk in prompts]
        return super().send_prompts(prompts=wrapped, pre_send_handler=self._pre_send_handler)
    async def send_chunks_async(self, prompts: list[str]):
        """Async variant: wrap each markdown chunk in the prompt and send them.

        Every item of *prompts* is passed through ``generate_prompt`` together
        with ``self.to_lang``; the wrapped prompts are awaited through the
        parent class's ``send_prompts_async`` with ``self._pre_send_handler``
        as the pre-send hook.
        """
        wrapped = [generate_prompt(chunk, self.to_lang) for chunk in prompts]
        return await super().send_prompts_async(prompts=wrapped, pre_send_handler=self._pre_send_handler)
    def update_glossary_dict(self, update_dict: dict | None):