优化glossary_agent和markdown_agent的提示词
This commit is contained in:
@@ -1,3 +1,3 @@
|
|||||||
# SPDX-FileCopyrightText: 2025 QinHan
|
# SPDX-FileCopyrightText: 2025 QinHan
|
||||||
# SPDX-License-Identifier: MPL-2.0
|
# SPDX-License-Identifier: MPL-2.0
|
||||||
__version__="1.4.12"
|
__version__="1.4.13a1"
|
||||||
@@ -3,6 +3,7 @@
|
|||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
import json
|
import json
|
||||||
|
import re
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from json import JSONDecodeError
|
from json import JSONDecodeError
|
||||||
from logging import Logger
|
from logging import Logger
|
||||||
@@ -14,6 +15,62 @@ from docutranslate.agents.agent import AgentResultError
|
|||||||
from docutranslate.utils.json_utils import segments2json_chunks
|
from docutranslate.utils.json_utils import segments2json_chunks
|
||||||
|
|
||||||
|
|
||||||
|
def generate_prompt(json_segments: str, to_lang: str) -> str:
    """Build the user prompt asking the model to extract a glossary.

    The paragraphs are wrapped in ``<input>...</input>`` so the JSON payload
    can later be recovered from the prompt text with a regular expression.

    Args:
        json_segments: JSON text mapping paragraph numbers to paragraph
            contents; inserted verbatim inside a fenced ``json`` block.
        to_lang: Name of the language the extracted terms are translated into.

    Returns:
        The complete prompt text.
    """
    # The example output is written as literal text with doubled braces
    # rather than as a string expression inside a replacement field: a
    # backslash escape inside an f-string expression is a SyntaxError on
    # Python versions before 3.12.
    return f"""
You will receive a JSON-formatted list of paragraphs where keys are paragraph numbers and values are paragraph contents.
Here is the input:

<input>
```json
{json_segments}
```
</input>
You need to extract person names and location names from these paragraphs and translate these terms into {to_lang}.
Finally, output a glossary of Source Nouns:Target Nouns
> The source noun in the output glossary must exactly match the original term in original language, while target noun is the {to_lang} translation of the term
> Do not extract special tags or untranslatable elements (such as code, brand names, technical terms)
> The same source noun should only appear once in the glossary without repetition

Here is an example of the expected format:

<example>
Input:

```json
{{
"3":"text",
"4":"text"
}}
```

Output

```json
[{{"src": "Source Noun1", "dst": "Target Noun1"}},
 {{"src": "Source Noun2", "dst": "Target Noun2"}},
{{"src": "Source Noun3", "dst": "Target Noun3"}}]
```

</example>
Please return the translated JSON Array directly without including any additional information.
"""
|
||||||
|
|
||||||
|
|
||||||
|
def get_original_segments(prompt: str):
    """Return the text enclosed by ``<input>`` and ``</input>`` in *prompt*.

    The match is greedy and spans newlines, so the result runs from the first
    ``<input>`` to the last ``</input>`` and includes everything in between,
    surrounding whitespace and code fences included.

    Raises:
        ValueError: If *prompt* contains no ``<input>...</input>`` section.
    """
    found = re.search(r'<input>(.*)</input>', prompt, re.DOTALL)
    if found is None:
        raise ValueError("无法从prompt中提取初始文本")
    return found.group(1)
|
||||||
|
|
||||||
|
|
||||||
|
def get_target_segments(result: str) -> str:
    """Extract the payload of a fenced ``json`` code block from a model reply.

    Args:
        result: Raw text returned by the model.

    Returns:
        The content between the first ```` ```json ```` marker and the last
        ```` ``` ```` marker, with surrounding whitespace removed. If the
        reply has no such fenced block, *result* is returned unchanged.
    """
    # Greedy on purpose: a backtick run inside the JSON payload must not end
    # the capture early, so the capture extends to the last closing fence.
    fenced = re.search(r'```json(.*)```', result, re.DOTALL)
    if fenced is None:
        return result
    # Drop the newlines that sit next to the fence markers so that an empty
    # fenced block yields "" and can be detected as an empty result.
    return fenced.group(1).strip()
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class GlossaryAgentConfig(AgentConfig):
|
class GlossaryAgentConfig(AgentConfig):
|
||||||
to_lang: str
|
to_lang: str
|
||||||
@@ -27,34 +84,13 @@ class GlossaryAgent(Agent):
|
|||||||
self.system_prompt = f"""
|
self.system_prompt = f"""
|
||||||
# Role
|
# Role
|
||||||
You are a professional glossary extractor
|
You are a professional glossary extractor
|
||||||
|
|
||||||
# Task
|
|
||||||
You will receive a JSON-formatted list of paragraphs where keys are paragraph numbers and values are paragraph contents.
|
|
||||||
You need to extract person names and location names from these paragraphs and translate these terms into {self.to_lang}.
|
|
||||||
Finally, output a glossary of original terms:translated terms
|
|
||||||
|
|
||||||
# Requirements
|
|
||||||
- The original language is identified based on the context.The target language is {self.to_lang}
|
|
||||||
- The src in the output glossary must exactly match the original term in original language, while dst is the {self.to_lang} translation of the term
|
|
||||||
- Do not include special tags or tags formatted as `<ph-xxxxxx>` in the glossary
|
|
||||||
- The same src should only appear once in the glossary without repetition
|
|
||||||
- Do not include common nouns in the glossary.
|
|
||||||
|
|
||||||
# Output
|
|
||||||
The output format should be plain JSON text in a list format
|
|
||||||
{[{"src": "<Original Term>", "dst": "<Translated Term>"}]}
|
|
||||||
|
|
||||||
# Example1(Assuming the source language is English and the target language is Chinese in the example)
|
|
||||||
## Input
|
|
||||||
{{"0":"Jobs likes apples","1":"Bill Gates is sunbathing in Shanghai."}}
|
|
||||||
## Output
|
|
||||||
{r'[{"src": "Jobs", "dst": "乔布斯"}, {"src": "Bill Gates", "dst": "比尔盖茨"}, {"src": "Shanghai", "dst": "上海"}]'}
|
|
||||||
"""
|
"""
|
||||||
self.custom_prompt = config.custom_prompt
|
self.custom_prompt = config.custom_prompt
|
||||||
if config.custom_prompt:
|
if config.custom_prompt:
|
||||||
self.system_prompt += "\n# **Important rules or background** \n" + self.custom_prompt + '\nEND\n'
|
self.system_prompt += "\n# **Important rules or background** \n" + self.custom_prompt + '\nEND\n'
|
||||||
|
|
||||||
def _result_handler(self, result: str, origin_prompt: str, logger: Logger):
|
def _result_handler(self, result: str, origin_prompt: str, logger: Logger):
|
||||||
|
result = get_target_segments(result)
|
||||||
if result == "":
|
if result == "":
|
||||||
if origin_prompt.strip() != "":
|
if origin_prompt.strip() != "":
|
||||||
logger.error("result为空值但原文不为空")
|
logger.error("result为空值但原文不为空")
|
||||||
@@ -66,11 +102,11 @@ The output format should be plain JSON text in a list format
|
|||||||
raise AgentResultError(f"GlossaryAgent返回结果不是list的json形式, result: {result}")
|
raise AgentResultError(f"GlossaryAgent返回结果不是list的json形式, result: {result}")
|
||||||
return repaired_result
|
return repaired_result
|
||||||
except (RuntimeError, JSONDecodeError) as e:
|
except (RuntimeError, JSONDecodeError) as e:
|
||||||
# 将解析错误包装成 ValueError 以便被 send 方法捕获并重试
|
|
||||||
raise AgentResultError(f"结果不能正确解析: {e.__repr__()}")
|
raise AgentResultError(f"结果不能正确解析: {e.__repr__()}")
|
||||||
|
|
||||||
def _error_result_handler(self, origin_prompt: str, logger: Logger):
|
def _error_result_handler(self, origin_prompt: str, logger: Logger):
|
||||||
if origin_prompt == "":
|
origin_prompt = get_original_segments(origin_prompt)
|
||||||
|
if origin_prompt.strip() == "":
|
||||||
return []
|
return []
|
||||||
try:
|
try:
|
||||||
return json_repair.loads(origin_prompt)
|
return json_repair.loads(origin_prompt)
|
||||||
@@ -82,7 +118,7 @@ The output format should be plain JSON text in a list format
|
|||||||
self.logger.info(f"开始提取术语表,to_lang:{self.to_lang}")
|
self.logger.info(f"开始提取术语表,to_lang:{self.to_lang}")
|
||||||
result = {}
|
result = {}
|
||||||
indexed_originals, chunks, merged_indices_list = segments2json_chunks(segments, chunk_size)
|
indexed_originals, chunks, merged_indices_list = segments2json_chunks(segments, chunk_size)
|
||||||
prompts = [json.dumps(chunk, ensure_ascii=False) for chunk in chunks]
|
prompts = [generate_prompt(json.dumps(chunk, ensure_ascii=False), self.to_lang) for chunk in chunks]
|
||||||
translated_chunks = super().send_prompts(prompts=prompts,
|
translated_chunks = super().send_prompts(prompts=prompts,
|
||||||
result_handler=self._result_handler,
|
result_handler=self._result_handler,
|
||||||
error_result_handler=self._error_result_handler)
|
error_result_handler=self._error_result_handler)
|
||||||
@@ -106,7 +142,7 @@ The output format should be plain JSON text in a list format
|
|||||||
result = {}
|
result = {}
|
||||||
indexed_originals, chunks, merged_indices_list = await asyncio.to_thread(segments2json_chunks, segments,
|
indexed_originals, chunks, merged_indices_list = await asyncio.to_thread(segments2json_chunks, segments,
|
||||||
chunk_size)
|
chunk_size)
|
||||||
prompts = [json.dumps(chunk, ensure_ascii=False) for chunk in chunks]
|
prompts = [generate_prompt(json.dumps(chunk, ensure_ascii=False), self.to_lang) for chunk in chunks]
|
||||||
translated_chunks = await super().send_prompts_async(prompts=prompts,
|
translated_chunks = await super().send_prompts_async(prompts=prompts,
|
||||||
result_handler=self._result_handler,
|
result_handler=self._result_handler,
|
||||||
error_result_handler=self._error_result_handler)
|
error_result_handler=self._error_result_handler)
|
||||||
|
|||||||
@@ -6,6 +6,22 @@ from dataclasses import dataclass
|
|||||||
from .agent import Agent, AgentConfig
|
from .agent import Agent, AgentConfig
|
||||||
from ..glossary.glossary import Glossary
|
from ..glossary.glossary import Glossary
|
||||||
|
|
||||||
|
def generate_prompt(markdown_text: str, to_lang: str):
    """Wrap a markdown chunk in the translation instructions.

    Args:
        markdown_text: The markdown text to translate; appended verbatim
            after the list of rules.
        to_lang: Name of the target language.

    Returns:
        The prompt text: the translation rules followed by *markdown_text*.
    """
    prompt = f"""
Treat the text input as markdown text and translate it into {to_lang},output translation ONLY.
- NO explanations. NO notes.
- Do not change placeholders in the format of `<ph-xxxxxx>`.
- For special tags or other non-translatable elements (like codes, brand names, specific jargon), keep them in their original form.
- All formulas, regardless of length, must be represented as valid, parsable LaTeX. They must be correctly enclosed by `$`, `\\(\\)`, or `$$`. If a formula is not formatted correctly, you must fix it.
- Remove or correct any obviously abnormal characters, but without altering the original meaning.
- When citing references, strictly preserve the original text; do not translate them. Examples of reference formats are as follows:
[1] Author A, Author B. "Original Title". Journal, 2023.
[2] 作者C. 《中文标题》. 期刊, 2022.
- Output the translated markdown text as plain text (not in a markdown code block, with no extraneous text).

The markdown text input:
{markdown_text}
"""
    return prompt
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class MDTranslateAgentConfig(AgentConfig):
|
class MDTranslateAgentConfig(AgentConfig):
|
||||||
@@ -17,43 +33,11 @@ class MDTranslateAgentConfig(AgentConfig):
|
|||||||
class MDTranslateAgent(Agent):
|
class MDTranslateAgent(Agent):
|
||||||
def __init__(self, config: MDTranslateAgentConfig):
|
def __init__(self, config: MDTranslateAgentConfig):
|
||||||
super().__init__(config)
|
super().__init__(config)
|
||||||
|
self.to_lang=config.to_lang
|
||||||
self.system_prompt = f"""
|
self.system_prompt = f"""
|
||||||
# Role
|
# Role
|
||||||
You are a professional machine translation engine.
|
You are a professional machine translation engine.
|
||||||
|
"""
|
||||||
# Task
|
|
||||||
Translate the input markdown text.
|
|
||||||
Target language: {config.to_lang}
|
|
||||||
|
|
||||||
# Requirements
|
|
||||||
- The translation must be professional and accurate.
|
|
||||||
- Do not output any explanations or annotations.
|
|
||||||
- For personal names and proper nouns, use the most commonly used words for translation. If there are multiple common translations, choose the word that comes first in dictionary order.
|
|
||||||
- For special tags or other non-translatable elements (like codes, brand names, specific jargon), keep them in their original form.
|
|
||||||
- Do not change placeholders in the format of `<ph-xxxxxx>`.
|
|
||||||
- All formulas, regardless of length, must be represented as valid, parsable LaTeX. They must be correctly enclosed by `$`, `\\(\\)`, or `$$`. If a formula is not formatted correctly, you must fix it.
|
|
||||||
- Remove or correct any obviously abnormal characters, but without altering the original meaning.
|
|
||||||
- When citing references, strictly preserve the original text; do not translate them. Examples of reference formats are as follows:
|
|
||||||
[1] Author A, Author B. "Original Title". Journal, 2023.
|
|
||||||
[2] 作者C. 《中文标题》. 期刊, 2022.
|
|
||||||
|
|
||||||
# Output
|
|
||||||
The translated markdown text as plain text (not in a markdown code block, with no extraneous text).
|
|
||||||
|
|
||||||
# Example(Assuming the target language is Chinese in the example, {config.to_lang} is the actual target language)
|
|
||||||
Input:
|
|
||||||
hello, what's your nam*@e?
|
|
||||||

|
|
||||||
The equation is E=mc 2. This is famous.
|
|
||||||
1+1=2$$
|
|
||||||
(c_0,c_1_1,c_2^2)is a coordinate.
|
|
||||||
|
|
||||||
Output:
|
|
||||||
你好,你叫什么名字?
|
|
||||||

|
|
||||||
这个方程是 $E=mc^2$。这很有名。
|
|
||||||
$$1+1=2$$
|
|
||||||
\\((c_0,c_1,c_2^2)\\)是一个坐标。"""
|
|
||||||
self.custom_prompt = config.custom_prompt
|
self.custom_prompt = config.custom_prompt
|
||||||
if config.custom_prompt:
|
if config.custom_prompt:
|
||||||
self.system_prompt += "\n# **Important rules or background** \n" + self.custom_prompt + '\nEND\n'
|
self.system_prompt += "\n# **Important rules or background** \n" + self.custom_prompt + '\nEND\n'
|
||||||
@@ -66,9 +50,11 @@ $$1+1=2$$
|
|||||||
return system_prompt, prompt
|
return system_prompt, prompt
|
||||||
|
|
||||||
def send_chunks(self, prompts: list[str]):
|
def send_chunks(self, prompts: list[str]):
|
||||||
|
prompts=[generate_prompt(prompt,self.to_lang) for prompt in prompts]
|
||||||
return super().send_prompts(prompts=prompts, pre_send_handler=self._pre_send_handler)
|
return super().send_prompts(prompts=prompts, pre_send_handler=self._pre_send_handler)
|
||||||
|
|
||||||
async def send_chunks_async(self, prompts: list[str]):
|
async def send_chunks_async(self, prompts: list[str]):
|
||||||
|
prompts = [generate_prompt(prompt, self.to_lang) for prompt in prompts]
|
||||||
return await super().send_prompts_async(prompts=prompts, pre_send_handler=self._pre_send_handler)
|
return await super().send_prompts_async(prompts=prompts, pre_send_handler=self._pre_send_handler)
|
||||||
|
|
||||||
def update_glossary_dict(self, update_dict: dict | None):
|
def update_glossary_dict(self, update_dict: dict | None):
|
||||||
|
|||||||
Reference in New Issue
Block a user