增加json文本修复处理，大幅减少缺键错误

2025-09-06 18:13:07 +08:00
parent 2e6d45b7b6
commit 9b478ebc4a
3 changed files with 28 additions and 7 deletions
--- a/docutranslate/agents/agent.py
+++ b/docutranslate/agents/agent.py
@@ -180,7 +180,6 @@ class Agent:
            )
            response.raise_for_status()
            result = response.json()["choices"][0]["message"]["content"]
            if retry_count > 0:
                self.logger.info(f"重试成功 (第 {retry_count}/{MAX_RETRY_COUNT} 次尝试)。")
@@ -192,6 +191,7 @@ class Agent:
            should_retry = True
        # 专门捕获部分翻译错误（软错误）
        except PartialAgentResultError as e:
            # print(f"【测试】\nprompt:\n{prompt}\nresp:\n{result}")
            self.logger.error(f"收到部分返回结果，将尝试重试: {e}")
            current_partial_result = e.partial_result
            should_retry = True
--- a/docutranslate/agents/segments_agent.py
+++ b/docutranslate/agents/segments_agent.py
@@ -12,7 +12,7 @@ from json_repair import json_repair
 from docutranslate.agents import AgentConfig, Agent
 from docutranslate.agents.agent import PartialAgentResultError, AgentResultError
 from docutranslate.glossary.glossary import Glossary
-from docutranslate.utils.json_utils import segments2json_chunks
+from docutranslate.utils.json_utils import segments2json_chunks, fix_json_string
@dataclass
@@ -42,13 +42,15 @@ class SegmentsTranslateAgent(Agent):
 # Output
 - The translated sequence of segments, represented as JSON text (note: not a code block). The keys are the segment IDs, and the values are the translated segments.
 - The returned JSON text must be a dictionary of the form {{<segment_id>: <translation>}}.
- The segment IDs in the output must **exactly** match those in the input. And all segment IDs in input must appear in the output.
+- (very important) The segment IDs in the output must exactly match those in the input. And all segment IDs in input must appear in the output.
 # Example(Assuming the target language is Chinese in the example, {config.to_lang} is the actual target language)
 ## Input
-{r'{"10":"hello","11":"apple","12":true,"13":"false","14":null}'}
+{{"10":"Tom say:\"hello\"","11":“apple”，"12":true,"13":"false","14":null}}
-## Output
+## Correct Output
-{r'{"10":"你好","11":"苹果","12":true,"13":"错误","14":null}'}
+{{"10":"汤姆说：“你好”","11":"苹果","12":true,"13":"错误","14":null}}
-> Warning: Never wrap the JSON text in ```.
+## Incorrect Output
 {{"10":"汤姆说:“你好”，"11":“苹果”，"12":true,"13":"错误"}}
 > Warning: Never wrap the JSON text in ```, Never miss segment Translation.
 """
        self.custom_prompt = config.custom_prompt
        if config.custom_prompt:
@@ -73,6 +75,7 @@ class SegmentsTranslateAgent(Agent):
                raise AgentResultError("result为空值但原文不为空")
            return {}
        try:
            result=fix_json_string(result)
            original_chunk = json.loads(origin_prompt)
            repaired_result = json_repair.loads(result)
--- a/docutranslate/utils/json_utils.py
+++ b/docutranslate/utils/json_utils.py
@@ -1,6 +1,7 @@
 # SPDX-FileCopyrightText: 2025 QinHan
 # SPDX-License-Identifier: MPL-2.0
 import json
 import re
 def get_json_size(js: dict) -> int:
@@ -84,5 +85,22 @@ list[dict[str, str]], list[tuple[int, int]]]:
    return js, json_chunks_list, merged_indices_list
 def fix_json_string(json_string: str) -> str:
     """Normalize malformed separators between entries of a ``{"<id>": <value>}`` JSON text.

     The function looks for the boundary between one entry and the next numeric
     key, i.e. ``<closing quote?> , "<digits>" : <opening quote?>``, where the
     comma, colon and quotes may be the full-width / curly variants
     (``，`` ``：`` ``“`` ``”``). Each boundary is rewritten with ASCII
     punctuation so that a JSON parser can read it.

     Args:
         json_string: Raw JSON-like text to clean up.

     Returns:
         The text with every matched boundary rewritten; text that does not
         match the boundary pattern is returned unchanged.
     """

     def repl(m: re.Match) -> str:
         # Re-emit a closing/opening quote only where the input had one, so
         # non-string values (true, null, numbers) are left unquoted.
         closing = '"' if m.group(1) else ''
         opening = '"' if m.group(3) else ''
         # The comma is always part of the match, so it must always be written
         # back; otherwise `true,"12":` would collapse into `true"12":`.
         return f'{closing},"{m.group(2)}":{opening}'

     # Character classes list the accepted characters directly: a `|` inside
     # `[...]` is a literal pipe, not alternation, so it must not appear here.
     return re.sub(
         r'([”"])?\s?[，,]\s?["“]\s?(\d+)\s?["”]\s?[：:]\s?(["“])?',
         repl,
         json_string,
     )
 # Manual smoke check: when this module is run directly, print the value
 # returned by get_json_size for a one-entry dict whose value is an empty string.
 if __name__ == '__main__':
    print(get_json_size({"0": ""}))