80 lines
3.4 KiB
Python
80 lines
3.4 KiB
Python
# SPDX-FileCopyrightText: 2025 QinHan
|
|
# SPDX-License-Identifier: MPL-2.0
|
|
import re
|
|
from dataclasses import dataclass
|
|
|
|
from .agent import Agent, AgentConfig
|
|
from ..glossary.glossary import Glossary
|
|
|
|
|
|
def get_original_markdown(prompt: str):
    """Return the text found between the ``<input>`` and ``</input>`` lines of *prompt*.

    The capture is greedy and spans newlines, so it runs from the first
    ``<input>`` line up to the last ``</input>`` line in the prompt.

    Raises:
        ValueError: if the prompt contains no ``<input>`` ... ``</input>`` section.
    """
    found = re.search(r'<input>\n(.*)\n</input>', prompt, re.DOTALL)
    if found is None:
        raise ValueError("无法从prompt中提取初始文本")
    return found.group(1)
|
|
|
|
|
|
def generate_prompt(markdown_text: str, to_lang: str):
    """Build the user prompt asking for *markdown_text* to be translated into *to_lang*.

    The result starts with a newline, lists the translation rules, and ends
    with the source text placed on its own lines between ``<input>`` and
    ``</input>`` followed by a trailing newline.
    """
    prompt_lines = (
        "",  # the prompt opens with an empty line
        f"Treat the text input as markdown text and translate it into {to_lang},output translation ONLY.",
        "- NO explanations. NO notes.",
        "- Do not change placeholders in the format of `<ph-xxxxxx>`.",
        "- For special tags or other non-translatable elements (like codes, brand names, specific jargon), keep them in their original form.",
        "- All formulas, regardless of length, must be represented as valid, parsable LaTeX. They must be correctly enclosed by `$`, `\\(\\)`, or `$$`. If a formula is not formatted correctly, you must fix it.",
        "- Remove or correct any obviously abnormal characters, but without altering the original meaning.",
        "- When citing references, strictly preserve the original text; do not translate them. Examples of reference formats are as follows:",
        '[1] Author A, Author B. "Original Title". Journal, 2023.',
        "[2] 作者C. 《中文标题》. 期刊, 2022.",
        "- Output the translated markdown text as plain text (not in a markdown code block, with no extraneous text).",
        "",
        "The markdown text input:",
        "<input>",
        markdown_text,
        "</input>",
        "",  # yields the trailing newline after </input>
    )
    return "\n".join(prompt_lines)
|
|
|
|
|
|
@dataclass
class MDTranslateAgentConfig(AgentConfig):
    """Configuration for the markdown translation agent, extending ``AgentConfig``."""

    # Target language; interpolated into every translation prompt.
    to_lang: str
    # Optional extra rules/background; appended to the system prompt when set.
    custom_prompt: str | None = None
    # Optional glossary mapping handed to ``Glossary`` before each request
    # (presumably source term -> preferred translation; confirm against Glossary).
    glossary_dict: dict[str, str] | None = None
|
|
|
|
|
|
class MDTranslateAgent(Agent):
    """Agent that sends markdown chunks to the model for translation into ``to_lang``."""

    def __init__(self, config: MDTranslateAgentConfig):
        """Initialise the base agent and assemble the system prompt.

        Args:
            config: supplies the target language, an optional custom prompt
                and an optional glossary dictionary.
        """
        super().__init__(config)
        self.to_lang = config.to_lang
        # Plain literal: the text has no placeholders, so no f-string is needed.
        self.system_prompt = (
            "\n"
            "# Role\n"
            "You are a professional machine translation engine.\n"
        )
        self.custom_prompt = config.custom_prompt
        if config.custom_prompt:
            self.system_prompt += "\n# **Important rules or background** \n" + self.custom_prompt + '\nEND\n'
        self.glossary_dict = config.glossary_dict

    def _pre_send_handler(self, system_prompt, prompt):
        """Return ``(system_prompt, prompt)``, extending the system prompt with glossary text.

        When a non-empty glossary is set, the string returned by
        ``Glossary.append_system_prompt(prompt)`` is appended to the system
        prompt; the user prompt is passed through unchanged.
        """
        if self.glossary_dict:
            # Built per call so later update_glossary_dict() changes are picked up.
            glossary = Glossary(glossary_dict=self.glossary_dict)
            system_prompt += glossary.append_system_prompt(prompt)
        return system_prompt, prompt

    def _build_prompts(self, chunks: list[str]) -> list[str]:
        """Wrap every markdown chunk in the translation prompt for ``self.to_lang``."""
        return [generate_prompt(chunk, self.to_lang) for chunk in chunks]

    @staticmethod
    def _recover_chunk(prompt, logger):
        """Error fallback: return the untranslated chunk embedded in *prompt*.

        ``logger`` is accepted only to match the handler signature and is unused.
        """
        return get_original_markdown(prompt)

    def send_chunks(self, prompts: list[str]):
        """Translate markdown chunks via the base class's ``send_prompts``.

        Args:
            prompts: raw markdown chunks (not yet wrapped in a prompt).

        Returns:
            Whatever ``Agent.send_prompts`` returns. The error handler passed
            to it yields the original chunk text for a given prompt.
        """
        return super().send_prompts(
            prompts=self._build_prompts(prompts),
            pre_send_handler=self._pre_send_handler,
            error_result_handler=self._recover_chunk,
        )

    async def send_chunks_async(self, prompts: list[str]):
        """Async counterpart of ``send_chunks``, delegating to ``send_prompts_async``."""
        return await super().send_prompts_async(
            prompts=self._build_prompts(prompts),
            pre_send_handler=self._pre_send_handler,
            error_result_handler=self._recover_chunk,
        )

    def update_glossary_dict(self, update_dict: dict | None):
        """Merge *update_dict* into the agent's glossary.

        Entries already present in the glossary take precedence: on a key
        clash the existing value is kept and the value from *update_dict* is
        ignored. A missing glossary is initialised to ``{}`` even when
        *update_dict* is ``None``.
        """
        if self.glossary_dict is None:
            self.glossary_dict = {}
        if update_dict is not None:
            # Right-hand operand wins in a dict union, so existing entries win.
            # A new dict is created; the previous one is not mutated.
            self.glossary_dict = update_dict | self.glossary_dict
|