增加json文本修复处理,大幅减少缺键错误

This commit is contained in:
xunbu
2025-09-06 18:13:07 +08:00
parent 2e6d45b7b6
commit 9b478ebc4a
3 changed files with 28 additions and 7 deletions

View File

@@ -180,7 +180,6 @@ class Agent:
) )
response.raise_for_status() response.raise_for_status()
result = response.json()["choices"][0]["message"]["content"] result = response.json()["choices"][0]["message"]["content"]
if retry_count > 0: if retry_count > 0:
self.logger.info(f"重试成功 (第 {retry_count}/{MAX_RETRY_COUNT} 次尝试)。") self.logger.info(f"重试成功 (第 {retry_count}/{MAX_RETRY_COUNT} 次尝试)。")
@@ -192,6 +191,7 @@ class Agent:
should_retry = True should_retry = True
# 专门捕获部分翻译错误(软错误) # 专门捕获部分翻译错误(软错误)
except PartialAgentResultError as e: except PartialAgentResultError as e:
# print(f"【测试】\nprompt:\n{prompt}\nresp:\n{result}")
self.logger.error(f"收到部分返回结果,将尝试重试: {e}") self.logger.error(f"收到部分返回结果,将尝试重试: {e}")
current_partial_result = e.partial_result current_partial_result = e.partial_result
should_retry = True should_retry = True

View File

@@ -12,7 +12,7 @@ from json_repair import json_repair
from docutranslate.agents import AgentConfig, Agent from docutranslate.agents import AgentConfig, Agent
from docutranslate.agents.agent import PartialAgentResultError, AgentResultError from docutranslate.agents.agent import PartialAgentResultError, AgentResultError
from docutranslate.glossary.glossary import Glossary from docutranslate.glossary.glossary import Glossary
from docutranslate.utils.json_utils import segments2json_chunks from docutranslate.utils.json_utils import segments2json_chunks, fix_json_string
@dataclass @dataclass
@@ -42,13 +42,15 @@ class SegmentsTranslateAgent(Agent):
# Output # Output
- The translated sequence of segments, represented as JSON text (note: not a code block). The keys are the segment IDs, and the values are the translated segments. - The translated sequence of segments, represented as JSON text (note: not a code block). The keys are the segment IDs, and the values are the translated segments.
- The returned JSON text must be a dictionary of the form {{<segment_id>: <translation>}}. - The returned JSON text must be a dictionary of the form {{<segment_id>: <translation>}}.
- The segment IDs in the output must **exactly** match those in the input. And all segment IDs in input must appear in the output. - (very important) The segment IDs in the output must exactly match those in the input. And all segment IDs in input must appear in the output.
# Example(Assuming the target language is Chinese in the example, {config.to_lang} is the actual target language) # Example(Assuming the target language is Chinese in the example, {config.to_lang} is the actual target language)
## Input ## Input
{r'{"10":"hello","11":"apple","12":true,"13":"false","14":null}'} {{"10":"Tom say:\"hello\"","11":“apple”"12":true,"13":"false","14":null}}
## Output ## Correct Output
{r'{"10":"你好","11":"苹果","12":true,"13":"错误","14":null}'} {{"10":"汤姆说:“你好”","11":"苹果","12":true,"13":"错误","14":null}}
> Warning: Never wrap the JSON text in ```. ## Incorrect Output
{{"10":"汤姆说:“你好”,"11":“苹果”,"12":true,"13":"错误"}}
> Warning: Never wrap the JSON text in ```, Never miss segment Translation.
""" """
self.custom_prompt = config.custom_prompt self.custom_prompt = config.custom_prompt
if config.custom_prompt: if config.custom_prompt:
@@ -73,6 +75,7 @@ class SegmentsTranslateAgent(Agent):
raise AgentResultError("result为空值但原文不为空") raise AgentResultError("result为空值但原文不为空")
return {} return {}
try: try:
result=fix_json_string(result)
original_chunk = json.loads(origin_prompt) original_chunk = json.loads(origin_prompt)
repaired_result = json_repair.loads(result) repaired_result = json_repair.loads(result)

View File

@@ -1,6 +1,7 @@
# SPDX-FileCopyrightText: 2025 QinHan # SPDX-FileCopyrightText: 2025 QinHan
# SPDX-License-Identifier: MPL-2.0 # SPDX-License-Identifier: MPL-2.0
import json import json
import re
def get_json_size(js: dict) -> int: def get_json_size(js: dict) -> int:
@@ -84,5 +85,22 @@ list[dict[str, str]], list[tuple[int, int]]]:
return js, json_chunks_list, merged_indices_list return js, json_chunks_list, merged_indices_list
# Boundary between one value and the next numeric segment-ID key, e.g. `","11":"`.
# Straight (") and curly (\u201c = left, \u201d = right) double quotes are both
# accepted, with at most one whitespace character between tokens.
#   group 1: closing quote of the preceding string value, if present
#   group 2: the digits of the segment ID
#   group 3: opening quote of the following string value, if present
# NOTE: "|" is deliberately absent from the character classes; inside [...] it is
# a literal pipe (not alternation) and would make plain text such as "||12||"
# look like a key boundary.
_KEY_BOUNDARY_RE = re.compile(
    r'([\u201d"])?\s?,\s?["\u201c]\s?(\d+)\s?["\u201d]\s?:\s?(["\u201c])?'
)


def fix_json_string(json_string: str) -> str:
    """Normalize the quotes around numeric keys in JSON-like text.

    Every ``<value>,"<digits>":<value>`` boundary is rewritten so that the key
    and the string delimiters next to it use straight double quotes. Curly
    quotes used as delimiters at such a boundary are converted; any other text
    is left untouched.

    Args:
        json_string: JSON-like text, possibly using curly quotes as string
            delimiters around or next to numeric keys.

    Returns:
        The text with each matched key boundary normalized. Input that is
        already well-formed at those boundaries is returned unchanged.
    """

    def repl(m: re.Match) -> str:
        # Re-emit a closing/opening quote only where the input had one, so
        # non-string neighbours (true, null, numbers) are not wrapped in quotes.
        closing = '"' if m.group(1) else ''
        opening = '"' if m.group(3) else ''
        # The separating comma is always part of the match, so always keep it;
        # otherwise `true,"13":` would be turned into the malformed `true"13":`.
        return f'{closing},"{m.group(2)}":{opening}'

    return _KEY_BOUNDARY_RE.sub(repl, json_string)
if __name__ == '__main__': if __name__ == '__main__':
print(get_json_size({"0": ""})) print(get_json_size({"0": ""}))