From 9b478ebc4ae86fc7c408fe6f5f680504e25684f7 Mon Sep 17 00:00:00 2001 From: xunbu Date: Sat, 6 Sep 2025 18:13:07 +0800 Subject: [PATCH] =?UTF-8?q?=E5=A2=9E=E5=8A=A0json=E6=96=87=E6=9C=AC?= =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E5=A4=84=E7=90=86=EF=BC=8C=E5=A4=A7=E5=B9=85?= =?UTF-8?q?=E5=87=8F=E5=B0=91=E7=BC=BA=E9=94=AE=E9=94=99=E8=AF=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docutranslate/agents/agent.py | 2 +- docutranslate/agents/segments_agent.py | 15 +++++++++------ docutranslate/utils/json_utils.py | 18 ++++++++++++++++++ 3 files changed, 28 insertions(+), 7 deletions(-) diff --git a/docutranslate/agents/agent.py b/docutranslate/agents/agent.py index c8efae5..8e4b03a 100644 --- a/docutranslate/agents/agent.py +++ b/docutranslate/agents/agent.py @@ -180,7 +180,6 @@ class Agent: ) response.raise_for_status() result = response.json()["choices"][0]["message"]["content"] - if retry_count > 0: self.logger.info(f"重试成功 (第 {retry_count}/{MAX_RETRY_COUNT} 次尝试)。") @@ -192,6 +191,7 @@ class Agent: should_retry = True # 专门捕获部分翻译错误(软错误) except PartialAgentResultError as e: + # print(f"【测试】\nprompt:\n{prompt}\nresp:\n{result}") self.logger.error(f"收到部分返回结果,将尝试重试: {e}") current_partial_result = e.partial_result should_retry = True diff --git a/docutranslate/agents/segments_agent.py b/docutranslate/agents/segments_agent.py index b78b5a8..162a160 100644 --- a/docutranslate/agents/segments_agent.py +++ b/docutranslate/agents/segments_agent.py @@ -12,7 +12,7 @@ from json_repair import json_repair from docutranslate.agents import AgentConfig, Agent from docutranslate.agents.agent import PartialAgentResultError, AgentResultError from docutranslate.glossary.glossary import Glossary -from docutranslate.utils.json_utils import segments2json_chunks +from docutranslate.utils.json_utils import segments2json_chunks, fix_json_string @dataclass @@ -42,13 +42,15 @@ class SegmentsTranslateAgent(Agent): # Output - The translated sequence of 
segments, represented as JSON text (note: not a code block). The keys are the segment IDs, and the values are the translated segments. - The returned JSON text must be a dictionary of the form {{: }}. -- The segment IDs in the output must **exactly** match those in the input. And all segment IDs in input must appear in the output. +- (very important) The segment IDs in the output must exactly match those in the input. And all segment IDs in input must appear in the output. # Example(Assuming the target language is Chinese in the example, {config.to_lang} is the actual target language) ## Input -{r'{"10":"hello","11":"apple","12":true,"13":"false","14":null}'} -## Output -{r'{"10":"你好","11":"苹果","12":true,"13":"错误","14":null}'} -> Warning: Never wrap the JSON text in ```. +{{"10":"Tom say:\"hello\"","11":“apple”,"12":true,"13":"false","14":null}} +## Correct Output +{{"10":"汤姆说:“你好”","11":"苹果","12":true,"13":"错误","14":null}} +## Incorrect Output +{{"10":"汤姆说:“你好”,"11":“苹果”,"12":true,"13":"错误"}} +> Warning: Never wrap the JSON text in ```, Never miss segment Translation. 
# Boundary between one value and the next numeric segment key, tolerating the
# full-width / curly punctuation that LLMs tend to emit in CJK output:
#   group 1 - optional quote (" or ”) that closes the previous string value
#   group 2 - the digits of the next segment ID
#   group 3 - optional quote (" or “) that opens the next string value
# NOTE: inside a character class `|` is a literal pipe, not alternation, so the
# classes below list only the characters that are really meant to match.
_KEY_BOUNDARY_RE = re.compile(
    r'([”"])?\s?[,,]\s?["“]\s?(\d+)\s?["”]\s?[::]\s?(["“])?'
)


def fix_json_string(json_string: str) -> str:
    """Normalize the punctuation around numeric keys in JSON-like text.

    Every ``<value>,"<digits>":`` boundary is rewritten so that it uses an
    ASCII comma, ASCII double quotes around the key and an ASCII colon, even
    when the input used full-width (``,``/``:``) or curly (``“``/``”``)
    characters. A curly quote directly before the comma is treated as the
    closing quote of the previous value, and one directly after the colon as
    the opening quote of the next value; both become ``"``.

    Only boundaries that follow a comma are handled, so the first key of the
    object is left untouched. A closing quote that is entirely missing before
    the comma is not invented here, because that position is indistinguishable
    from a non-string value such as ``true`` or ``null``.

    Args:
        json_string: Raw JSON-like text.

    Returns:
        The text with every matched key boundary normalized. Text that does
        not contain such a boundary is returned unchanged.
    """

    def _rebuild(match: re.Match) -> str:
        # Re-emit a closing quote only if the previous value had one, but
        # always keep the comma: dropping it would glue a non-string value
        # (true / null / number) onto the following key.
        closing = '"' if match.group(1) else ''
        opening = '"' if match.group(3) else ''
        return f'{closing},"{match.group(2)}":{opening}'

    return _KEY_BOUNDARY_RE.sub(_rebuild, json_string)