From 8780b25ae4c908cc9bafd59278449001220a45d6 Mon Sep 17 00:00:00 2001
From: xunbu <xunbu3@qq.com>
Date: Thu, 16 Oct 2025 22:20:44 +0800
Subject: [PATCH] =?UTF-8?q?=E4=BC=98=E5=8C=96glossary=5Fagent=E5=92=8Cmark?=
 =?UTF-8?q?down=5Fagent=E7=9A=84=E6=8F=90=E7=A4=BA=E8=AF=8D?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 docutranslate/__init__.py              |  2 +-
 docutranslate/agents/glossary_agent.py | 88 ++++++++++++++++++--------
 docutranslate/agents/markdown_agent.py | 54 ++++++----------
 3 files changed, 83 insertions(+), 61 deletions(-)
diff --git a/docutranslate/__init__.py b/docutranslate/__init__.py
index 59470a8..0ac4d14 100644
--- a/docutranslate/__init__.py
+++ b/docutranslate/__init__.py
@@ -1,3 +1,3 @@
 # SPDX-FileCopyrightText: 2025 QinHan
 # SPDX-License-Identifier: MPL-2.0
-__version__="1.4.12"
\ No newline at end of file
+__version__="1.4.13a1"
\ No newline at end of file
diff --git a/docutranslate/agents/glossary_agent.py b/docutranslate/agents/glossary_agent.py
index a5c3fa1..7ace10a 100644
--- a/docutranslate/agents/glossary_agent.py
+++ b/docutranslate/agents/glossary_agent.py
@@ -3,6 +3,7 @@
 
 import asyncio
 import json
+import re
 from dataclasses import dataclass
 from json import JSONDecodeError
 from logging import Logger
@@ -14,6 +15,62 @@ from docutranslate.agents.agent import AgentResultError
 from docutranslate.utils.json_utils import segments2json_chunks
 
 
+def generate_prompt(json_segments: str, to_lang: str) -> str:  # Returns the user prompt: json_segments wrapped in <input> tags plus instructions to emit a src/dst glossary translated into to_lang.
+    return f"""
+You will receive a JSON-formatted list of paragraphs where keys are paragraph numbers and values are paragraph contents.
+Here is the input:
+
+<input>
+```json
+{json_segments}
+```
+</input>
+You need to extract person names and location names from these paragraphs and translate these terms into {to_lang}.
+Finally, output a glossary of Source Nouns:Target Nouns
+> The source noun in the output glossary must exactly match the original term in original language, while target noun is the {to_lang} translation of the term
+> Do not extract special tags or untranslatable elements (such as code, brand names, technical terms)
+> The same source noun should only appear once in the glossary without repetition
+> The Target Nouns
+
+Here is an example of the expected format:
+
+<example>
+Input:
+
+```json
+{{
+"3":"text",
+"4":"text"
+}}
+```
+
+Output
+
+```json
+[{{"src": "Source Noun1", "dst": "Target Noun1"}},\n {{"src": "Source Noun2", "dst": "Target Noun2"}}, \n{{"src": "Source Noun3", "dst": "Target Noun3"}}]
+```
+
+</example>
+Please return the translated JSON Array directly without including any additional information.
+"""
+
+
+def get_original_segments(prompt: str):
+    """Return, verbatim, the text between the first <input> and the last </input> in *prompt*; raise ValueError if absent."""
+    found = re.search(r'<input>(.*)</input>', prompt, re.DOTALL)
+    if found is None:
+        raise ValueError("无法从prompt中提取初始文本")
+    return found.group(1)
+
+
+def get_target_segments(result: str):
+    """Return the text inside a ```json ... ``` fence found in *result*, or *result* unchanged when no such fence exists."""
+    fenced = re.search(r'```json(.*)```', result, re.DOTALL)
+    if fenced is None:
+        return result
+    return fenced.group(1)
+
+
 @dataclass
 class GlossaryAgentConfig(AgentConfig):
     to_lang: str
@@ -27,34 +84,13 @@ class GlossaryAgent(Agent):
         self.system_prompt = f"""
 # Role
 You are a professional glossary extractor
-
-# Task
-You will receive a JSON-formatted list of paragraphs where keys are paragraph numbers and values are paragraph contents.
-You need to extract person names and location names from these paragraphs and translate these terms into {self.to_lang}.
-Finally, output a glossary of original terms:translated terms
-
-# Requirements
-- The original language is identified based on the context.The target language is {self.to_lang}
-- The src in the output glossary must exactly match the original term in original language, while dst is the {self.to_lang} translation of the term
-- Do not include special tags or tags formatted as `<ph-xxxxxx>` in the glossary
-- The same src should only appear once in the glossary without repetition
-- Do not include common nouns in the glossary.
-
-# Output
-The output format should be plain JSON text in a list format
-{[{"src": "<Original Term>", "dst": "<Translated Term>"}]}
-
-# Example1(Assuming the source language is English and the target language is Chinese in the example)
-## Input
-{{"0":"Jobs likes apples","1":"Bill Gates is sunbathing in Shanghai."}}
-## Output
-{r'[{"src": "Jobs", "dst": "乔布斯"}, {"src": "Bill Gates", "dst": "比尔盖茨"}, {"src": "Shanghai", "dst": "上海"}]'}
 """
         self.custom_prompt = config.custom_prompt
         if config.custom_prompt:
             self.system_prompt += "\n# **Important rules or background** \n" + self.custom_prompt + '\nEND\n'
 
     def _result_handler(self, result: str, origin_prompt: str, logger: Logger):
+        result = get_target_segments(result)
         if result == "":
             if origin_prompt.strip() != "":
                 logger.error("result为空值但原文不为空")
@@ -66,11 +102,11 @@ The output format should be plain JSON text in a list format
                 raise AgentResultError(f"GlossaryAgent返回结果不是list的json形式, result: {result}")
             return repaired_result
         except (RuntimeError, JSONDecodeError) as e:
-            # 将解析错误包装成 ValueError 以便被 send 方法捕获并重试
             raise AgentResultError(f"结果不能正确解析: {e.__repr__()}")
 
     def _error_result_handler(self, origin_prompt: str, logger: Logger):
-        if origin_prompt == "":
+        origin_prompt = get_original_segments(origin_prompt)
+        if origin_prompt.strip() == "":
             return []
         try:
             return json_repair.loads(origin_prompt)
@@ -82,7 +118,7 @@ The output format should be plain JSON text in a list format
         self.logger.info(f"开始提取术语表,to_lang:{self.to_lang}")
         result = {}
         indexed_originals, chunks, merged_indices_list = segments2json_chunks(segments, chunk_size)
-        prompts = [json.dumps(chunk, ensure_ascii=False) for chunk in chunks]
+        prompts = [generate_prompt(json.dumps(chunk, ensure_ascii=False), self.to_lang) for chunk in chunks]
         translated_chunks = super().send_prompts(prompts=prompts,
                                                  result_handler=self._result_handler,
                                                  error_result_handler=self._error_result_handler)
@@ -106,7 +142,7 @@ The output format should be plain JSON text in a list format
         result = {}
         indexed_originals, chunks, merged_indices_list = await asyncio.to_thread(segments2json_chunks, segments,
                                                                                  chunk_size)
-        prompts = [json.dumps(chunk, ensure_ascii=False) for chunk in chunks]
+        prompts = [generate_prompt(json.dumps(chunk, ensure_ascii=False), self.to_lang) for chunk in chunks]
         translated_chunks = await super().send_prompts_async(prompts=prompts,
                                                              result_handler=self._result_handler,
                                                              error_result_handler=self._error_result_handler)
diff --git a/docutranslate/agents/markdown_agent.py b/docutranslate/agents/markdown_agent.py
index 0d20f10..ef64b94 100644
--- a/docutranslate/agents/markdown_agent.py
+++ b/docutranslate/agents/markdown_agent.py
@@ -6,6 +6,22 @@ from dataclasses import dataclass
 from .agent import Agent, AgentConfig
 from ..glossary.glossary import Glossary
 
+def generate_prompt(markdown_text: str, to_lang: str) -> str:  # Returns the user prompt: the translation rules for to_lang followed by markdown_text.
+    return f"""
+Treat the text input as markdown text and translate it into {to_lang},output translation ONLY. 
+- NO explanations. NO notes. 
+- Do not change placeholders in the format of `<ph-xxxxxx>`.
+- For special tags or other non-translatable elements (like codes, brand names, specific jargon), keep them in their original form.
+- All formulas, regardless of length, must be represented as valid, parsable LaTeX. They must be correctly enclosed by `$`, `\\(\\)`, or `$$`. If a formula is not formatted correctly, you must fix it.
+- Remove or correct any obviously abnormal characters, but without altering the original meaning.
+- When citing references, strictly preserve the original text; do not translate them. Examples of reference formats are as follows:
+  [1] Author A, Author B. "Original Title". Journal, 2023.
+  [2] 作者C. 《中文标题》. 期刊, 2022.
+- Output the translated markdown text as plain text (not in a markdown code block, with no extraneous text).
+
+The markdown text input:
+ {markdown_text}
+"""
 
 @dataclass
 class MDTranslateAgentConfig(AgentConfig):
@@ -17,43 +33,11 @@ class MDTranslateAgentConfig(AgentConfig):
 class MDTranslateAgent(Agent):
     def __init__(self, config: MDTranslateAgentConfig):
         super().__init__(config)
+        self.to_lang=config.to_lang
         self.system_prompt = f"""
 # Role
 You are a professional machine translation engine.
-
-# Task
-Translate the input markdown text.
-Target language: {config.to_lang}
-
-# Requirements
-- The translation must be professional and accurate.
-- Do not output any explanations or annotations.
-- For personal names and proper nouns, use the most commonly used words for translation. If there are multiple common translations, choose the word that comes first in dictionary order.
-- For special tags or other non-translatable elements (like codes, brand names, specific jargon), keep them in their original form.
-- Do not change placeholders in the format of `<ph-xxxxxx>`.
-- All formulas, regardless of length, must be represented as valid, parsable LaTeX. They must be correctly enclosed by `$`, `\\(\\)`, or `$$`. If a formula is not formatted correctly, you must fix it.
-- Remove or correct any obviously abnormal characters, but without altering the original meaning.
-- When citing references, strictly preserve the original text; do not translate them. Examples of reference formats are as follows:
-  [1] Author A, Author B. "Original Title". Journal, 2023.
-  [2] 作者C. 《中文标题》. 期刊, 2022.
-
-# Output
-The translated markdown text as plain text (not in a markdown code block, with no extraneous text).
-
-# Example(Assuming the target language is Chinese in the example, {config.to_lang} is the actual target language)
-Input:
-hello, what's your nam*@e?
-![photo title](<ph-abcdde>)
-The equation is E=mc 2. This is famous.
-1+1=2$$
-(c_0,c_1_1,c_2^2)is a coordinate.
-
-Output:
-你好，你叫什么名字？
-![图像标题](<ph-abcdde>)
-这个方程是 $E=mc^2$。这很有名。
-$$1+1=2$$
-\\((c_0,c_1,c_2^2)\\)是一个坐标。"""
+"""
         self.custom_prompt = config.custom_prompt
         if config.custom_prompt:
             self.system_prompt += "\n# **Important rules or background** \n" + self.custom_prompt + '\nEND\n'
@@ -66,9 +50,11 @@ $$1+1=2$$
         return system_prompt, prompt
 
     def send_chunks(self, prompts: list[str]):
+        prompts = [generate_prompt(chunk, self.to_lang) for chunk in prompts]  # wrap every raw chunk in the translation prompt before sending
         return super().send_prompts(prompts=prompts, pre_send_handler=self._pre_send_handler)
 
     async def send_chunks_async(self, prompts: list[str]):
+        prompts = [generate_prompt(chunk, self.to_lang) for chunk in prompts]  # wrap every raw chunk in the translation prompt before sending
         return await super().send_prompts_async(prompts=prompts, pre_send_handler=self._pre_send_handler)
 
     def update_glossary_dict(self, update_dict: dict | None):