优化glossary_agent和markdown_agent的提示词

This commit is contained in:
xunbu
2025-10-16 22:20:44 +08:00
parent 324ad77a2e
commit 8780b25ae4
3 changed files with 83 additions and 61 deletions

View File

@@ -1,3 +1,3 @@
# SPDX-FileCopyrightText: 2025 QinHan # SPDX-FileCopyrightText: 2025 QinHan
# SPDX-License-Identifier: MPL-2.0 # SPDX-License-Identifier: MPL-2.0
__version__="1.4.12" __version__="1.4.13a1"

View File

@@ -3,6 +3,7 @@
import asyncio import asyncio
import json import json
import re
from dataclasses import dataclass from dataclasses import dataclass
from json import JSONDecodeError from json import JSONDecodeError
from logging import Logger from logging import Logger
@@ -14,6 +15,62 @@ from docutranslate.agents.agent import AgentResultError
from docutranslate.utils.json_utils import segments2json_chunks from docutranslate.utils.json_utils import segments2json_chunks
def generate_prompt(json_segments: str, to_lang: str):
    """Build the user prompt that asks the model to extract a glossary.

    The JSON paragraphs are embedded between ``<input>`` and ``</input>``
    inside a ```` ```json ```` fence, followed by the extraction rules and a
    worked example of the expected output format.

    Args:
        json_segments: JSON text mapping paragraph numbers to paragraph
            contents; inserted into the prompt verbatim.
        to_lang: Name of the language the extracted terms are translated into.

    Returns:
        The complete prompt text.
    """
    # The example contains "\n" escapes, so it is built outside the f-string:
    # a backslash inside an f-string replacement field is a SyntaxError on
    # Python versions before 3.12 (PEP 701).
    example_output = (
        '[{"src": "Source Noun1", "dst": "Target Noun1"},\n'
        ' {"src": "Source Noun2", "dst": "Target Noun2"}, \n'
        '{"src": "Source Noun3", "dst": "Target Noun3"}]'
    )
    # NOTE(review): the rule "> The Target Nouns" below looks truncated; it is
    # kept as-is so the prompt text is unchanged - confirm the intended wording.
    return f"""
You will receive a JSON-formatted list of paragraphs where keys are paragraph numbers and values are paragraph contents.
Here is the input:
<input>
```json
{json_segments}
```
</input>
You need to extract person names and location names from these paragraphs and translate these terms into {to_lang}.
Finally, output a glossary of Source Nouns:Target Nouns
> The source noun in the output glossary must exactly match the original term in original language, while target noun is the {to_lang} translation of the term
> Do not extract special tags or untranslatable elements (such as code, brand names, technical terms)
> The same source noun should only appear once in the glossary without repetition
> The Target Nouns
Here is an example of the expected format:
<example>
Input:
```json
{{
"3":"text",
"4":"text"
}}
```
Output
```json
{example_output}
```
</example>
Please return the translated JSON Array directly without including any additional information.
"""
def get_original_segments(prompt: str):
    """Recover the JSON segments that were embedded in a generated prompt.

    The text between ``<input>`` and ``</input>`` is extracted, and a
    surrounding ```` ``` ````/```` ```json ```` fence plus any surrounding
    whitespace is removed, so the caller receives only the JSON text. An empty
    payload yields ``""``.

    Args:
        prompt: Prompt text containing an ``<input>...</input>`` section.

    Returns:
        The unfenced, whitespace-stripped content of the ``<input>`` section.

    Raises:
        ValueError: If the prompt has no ``<input>...</input>`` section.
    """
    # Greedy on purpose: the payload itself may contain "</input>", so the
    # match must extend to the last closing tag in the prompt.
    match = re.search(r'<input>(.*)</input>', prompt, re.DOTALL)
    if match is None:
        raise ValueError("无法从prompt中提取初始文本")
    payload = match.group(1).strip()
    # Drop the markdown fence so emptiness checks and JSON parsing see only
    # the payload; fullmatch anchors the closing fence at the very end.
    fenced = re.fullmatch(r'```(?:json)?\s*(.*?)\s*```', payload, re.DOTALL)
    if fenced is None:
        return payload
    return fenced.group(1)
def get_target_segments(result: str):
    """Extract the JSON payload from a model reply.

    If the reply contains a ```` ```json ... ``` ```` fenced block, the text
    inside the fence is returned with surrounding whitespace removed, so a
    fenced-but-empty reply yields ``""``. Otherwise the reply is returned
    unchanged.

    Args:
        result: Raw text returned by the model.

    Returns:
        The stripped content of the fenced block, or ``result`` itself when no
        fenced block is present.
    """
    # Greedy on purpose: the span runs from the first "```json" to the last
    # "```" in the reply.
    match = re.search(r'```json(.*)```', result, re.DOTALL)
    if match is None:
        return result
    # Strip the newlines next to the fence markers so an empty block compares
    # equal to "".
    return match.group(1).strip()
@dataclass @dataclass
class GlossaryAgentConfig(AgentConfig): class GlossaryAgentConfig(AgentConfig):
to_lang: str to_lang: str
@@ -27,34 +84,13 @@ class GlossaryAgent(Agent):
self.system_prompt = f""" self.system_prompt = f"""
# Role # Role
You are a professional glossary extractor You are a professional glossary extractor
# Task
You will receive a JSON-formatted list of paragraphs where keys are paragraph numbers and values are paragraph contents.
You need to extract person names and location names from these paragraphs and translate these terms into {self.to_lang}.
Finally, output a glossary of original terms:translated terms
# Requirements
- The original language is identified based on the context.The target language is {self.to_lang}
- The src in the output glossary must exactly match the original term in original language, while dst is the {self.to_lang} translation of the term
- Do not include special tags or tags formatted as `<ph-xxxxxx>` in the glossary
- The same src should only appear once in the glossary without repetition
- Do not include common nouns in the glossary.
# Output
The output format should be plain JSON text in a list format
{[{"src": "<Original Term>", "dst": "<Translated Term>"}]}
# Example1(Assuming the source language is English and the target language is Chinese in the example)
## Input
{{"0":"Jobs likes apples","1":"Bill Gates is sunbathing in Shanghai."}}
## Output
{r'[{"src": "Jobs", "dst": "乔布斯"}, {"src": "Bill Gates", "dst": "比尔盖茨"}, {"src": "Shanghai", "dst": "上海"}]'}
""" """
self.custom_prompt = config.custom_prompt self.custom_prompt = config.custom_prompt
if config.custom_prompt: if config.custom_prompt:
self.system_prompt += "\n# **Important rules or background** \n" + self.custom_prompt + '\nEND\n' self.system_prompt += "\n# **Important rules or background** \n" + self.custom_prompt + '\nEND\n'
def _result_handler(self, result: str, origin_prompt: str, logger: Logger): def _result_handler(self, result: str, origin_prompt: str, logger: Logger):
result = get_target_segments(result)
if result == "": if result == "":
if origin_prompt.strip() != "": if origin_prompt.strip() != "":
logger.error("result为空值但原文不为空") logger.error("result为空值但原文不为空")
@@ -66,11 +102,11 @@ The output format should be plain JSON text in a list format
raise AgentResultError(f"GlossaryAgent返回结果不是list的json形式, result: {result}") raise AgentResultError(f"GlossaryAgent返回结果不是list的json形式, result: {result}")
return repaired_result return repaired_result
except (RuntimeError, JSONDecodeError) as e: except (RuntimeError, JSONDecodeError) as e:
# 将解析错误包装成 ValueError 以便被 send 方法捕获并重试
raise AgentResultError(f"结果不能正确解析: {e.__repr__()}") raise AgentResultError(f"结果不能正确解析: {e.__repr__()}")
def _error_result_handler(self, origin_prompt: str, logger: Logger): def _error_result_handler(self, origin_prompt: str, logger: Logger):
if origin_prompt == "": origin_prompt = get_original_segments(origin_prompt)
if origin_prompt.strip() == "":
return [] return []
try: try:
return json_repair.loads(origin_prompt) return json_repair.loads(origin_prompt)
@@ -82,7 +118,7 @@ The output format should be plain JSON text in a list format
self.logger.info(f"开始提取术语表,to_lang:{self.to_lang}") self.logger.info(f"开始提取术语表,to_lang:{self.to_lang}")
result = {} result = {}
indexed_originals, chunks, merged_indices_list = segments2json_chunks(segments, chunk_size) indexed_originals, chunks, merged_indices_list = segments2json_chunks(segments, chunk_size)
prompts = [json.dumps(chunk, ensure_ascii=False) for chunk in chunks] prompts = [generate_prompt(json.dumps(chunk, ensure_ascii=False), self.to_lang) for chunk in chunks]
translated_chunks = super().send_prompts(prompts=prompts, translated_chunks = super().send_prompts(prompts=prompts,
result_handler=self._result_handler, result_handler=self._result_handler,
error_result_handler=self._error_result_handler) error_result_handler=self._error_result_handler)
@@ -106,7 +142,7 @@ The output format should be plain JSON text in a list format
result = {} result = {}
indexed_originals, chunks, merged_indices_list = await asyncio.to_thread(segments2json_chunks, segments, indexed_originals, chunks, merged_indices_list = await asyncio.to_thread(segments2json_chunks, segments,
chunk_size) chunk_size)
prompts = [json.dumps(chunk, ensure_ascii=False) for chunk in chunks] prompts = [generate_prompt(json.dumps(chunk, ensure_ascii=False), self.to_lang) for chunk in chunks]
translated_chunks = await super().send_prompts_async(prompts=prompts, translated_chunks = await super().send_prompts_async(prompts=prompts,
result_handler=self._result_handler, result_handler=self._result_handler,
error_result_handler=self._error_result_handler) error_result_handler=self._error_result_handler)

View File

@@ -6,6 +6,22 @@ from dataclasses import dataclass
from .agent import Agent, AgentConfig from .agent import Agent, AgentConfig
from ..glossary.glossary import Glossary from ..glossary.glossary import Glossary
def generate_prompt(markdown_text: str, to_lang: str):
    """Assemble the user prompt that asks the model to translate markdown.

    The prompt starts with a newline, lists the translation rules, and ends
    with the markdown text followed by a newline.

    Args:
        markdown_text: Markdown text to translate; inserted verbatim.
        to_lang: Name of the target language.

    Returns:
        The complete prompt text.
    """
    prompt_lines = [
        "",  # the prompt opens with a newline
        f"Treat the text input as markdown text and translate it into {to_lang},output translation ONLY.",
        "- NO explanations. NO notes.",
        "- Do not change placeholders in the format of `<ph-xxxxxx>`.",
        "- For special tags or other non-translatable elements (like codes, brand names, specific jargon), keep them in their original form.",
        "- All formulas, regardless of length, must be represented as valid, parsable LaTeX. They must be correctly enclosed by `$`, `\\(\\)`, or `$$`. If a formula is not formatted correctly, you must fix it.",
        "- Remove or correct any obviously abnormal characters, but without altering the original meaning.",
        "- When citing references, strictly preserve the original text; do not translate them. Examples of reference formats are as follows:",
        '[1] Author A, Author B. "Original Title". Journal, 2023.',
        "[2] 作者C. 《中文标题》. 期刊, 2022.",
        "- Output the translated markdown text as plain text (not in a markdown code block, with no extraneous text).",
        "The markdown text input:",
        f"{markdown_text}",
        "",  # the prompt closes with a newline
    ]
    return "\n".join(prompt_lines)
@dataclass @dataclass
class MDTranslateAgentConfig(AgentConfig): class MDTranslateAgentConfig(AgentConfig):
@@ -17,43 +33,11 @@ class MDTranslateAgentConfig(AgentConfig):
class MDTranslateAgent(Agent): class MDTranslateAgent(Agent):
def __init__(self, config: MDTranslateAgentConfig): def __init__(self, config: MDTranslateAgentConfig):
super().__init__(config) super().__init__(config)
self.to_lang=config.to_lang
self.system_prompt = f""" self.system_prompt = f"""
# Role # Role
You are a professional machine translation engine. You are a professional machine translation engine.
"""
# Task
Translate the input markdown text.
Target language: {config.to_lang}
# Requirements
- The translation must be professional and accurate.
- Do not output any explanations or annotations.
- For personal names and proper nouns, use the most commonly used words for translation. If there are multiple common translations, choose the word that comes first in dictionary order.
- For special tags or other non-translatable elements (like codes, brand names, specific jargon), keep them in their original form.
- Do not change placeholders in the format of `<ph-xxxxxx>`.
- All formulas, regardless of length, must be represented as valid, parsable LaTeX. They must be correctly enclosed by `$`, `\\(\\)`, or `$$`. If a formula is not formatted correctly, you must fix it.
- Remove or correct any obviously abnormal characters, but without altering the original meaning.
- When citing references, strictly preserve the original text; do not translate them. Examples of reference formats are as follows:
[1] Author A, Author B. "Original Title". Journal, 2023.
[2] 作者C. 《中文标题》. 期刊, 2022.
# Output
The translated markdown text as plain text (not in a markdown code block, with no extraneous text).
# Example(Assuming the target language is Chinese in the example, {config.to_lang} is the actual target language)
Input:
hello, what's your nam*@e?
![photo title](<ph-abcdde>)
The equation is E=mc 2. This is famous.
1+1=2$$
(c_0,c_1_1,c_2^2)is a coordinate.
Output:
你好,你叫什么名字?
![图像标题](<ph-abcdde>)
这个方程是 $E=mc^2$。这很有名。
$$1+1=2$$
\\((c_0,c_1,c_2^2)\\)是一个坐标。"""
self.custom_prompt = config.custom_prompt self.custom_prompt = config.custom_prompt
if config.custom_prompt: if config.custom_prompt:
self.system_prompt += "\n# **Important rules or background** \n" + self.custom_prompt + '\nEND\n' self.system_prompt += "\n# **Important rules or background** \n" + self.custom_prompt + '\nEND\n'
@@ -66,9 +50,11 @@ $$1+1=2$$
return system_prompt, prompt return system_prompt, prompt
def send_chunks(self, prompts: list[str]): def send_chunks(self, prompts: list[str]):
prompts=[generate_prompt(prompt,self.to_lang) for prompt in prompts]
return super().send_prompts(prompts=prompts, pre_send_handler=self._pre_send_handler) return super().send_prompts(prompts=prompts, pre_send_handler=self._pre_send_handler)
async def send_chunks_async(self, prompts: list[str]): async def send_chunks_async(self, prompts: list[str]):
prompts = [generate_prompt(prompt, self.to_lang) for prompt in prompts]
return await super().send_prompts_async(prompts=prompts, pre_send_handler=self._pre_send_handler) return await super().send_prompts_async(prompts=prompts, pre_send_handler=self._pre_send_handler)
def update_glossary_dict(self, update_dict: dict | None): def update_glossary_dict(self, update_dict: dict | None):