From ea622086b9644427f0c595a84aa429ae9e160466 Mon Sep 17 00:00:00 2001 From: xunbu Date: Thu, 16 Oct 2025 23:21:34 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BC=98=E5=8C=96glossary=E6=8F=90=E7=A4=BA?= =?UTF-8?q?=E8=AF=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docutranslate/agents/glossary_agent.py | 39 +++++++++++--------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/docutranslate/agents/glossary_agent.py b/docutranslate/agents/glossary_agent.py index 7ace10a..e83064f 100644 --- a/docutranslate/agents/glossary_agent.py +++ b/docutranslate/agents/glossary_agent.py @@ -18,6 +18,9 @@ from docutranslate.utils.json_utils import segments2json_chunks def generate_prompt(json_segments: str, to_lang: str): return f""" You will receive a JSON-formatted list of paragraphs where keys are paragraph numbers and values are paragraph contents. +You need to extract person names and location names from these paragraphs and translate these terms into {to_lang}. +Finally, output a glossary of original terms:translated terms + Here is the input: @@ -25,33 +28,25 @@ Here is the input: {json_segments} ``` -You need to extract person names and location names from these paragraphs and translate these terms into {to_lang}. -Finally, output a glossary of Source Nouns:Target Nouns -> The source noun in the output glossary must exactly match the original term in original language, while target noun is the {to_lang} translation of the term -> Do not extract special tags or untranslatable elements (such as code, brand names, technical terms) -> The same source noun should only appear once in the glossary without repetition -> The Target Nouns -Here is an example of the expected format: + +- The original language is identified based on the context.The target language is {to_lang} +- The same src should only appear once in the glossary without repetition +- Do not include special tags or tags formatted as `` in the glossary +- Do not include common nouns in the glossary. +- No explanation in Translated Term. + + +The output format should be plain JSON text in a JSON array format: +{[{"src": "", "dst": ""}]} +Assuming the source language is English and the target language is Chinese in the example Input: - -```json -{{ -"3":"text", -"4":"text" -}} -``` - -Output - -```json -{'[{"src": "Source Noun1", "dst": "Target Noun1"},\n {"src": "Source Noun2", "dst": "Target Noun2"}, \n{"src": "Source Noun3", "dst": "Target Noun3"}]'} -``` - +{{"0":"Jobs likes apples","1":"Bill Gates is sunbathing in Shanghai."}} +Output: +{r'[{"src": "Jobs", "dst": "乔布斯"}, {"src": "Bill Gates", "dst": "比尔盖茨"}, {"src": "Shanghai", "dst": "上海"}]'} -Please return the translated JSON Array directly without including any additional information. """