diff --git a/docutranslate/agents/glossary_agent.py b/docutranslate/agents/glossary_agent.py index e83064f..8f2b132 100644 --- a/docutranslate/agents/glossary_agent.py +++ b/docutranslate/agents/glossary_agent.py @@ -51,7 +51,7 @@ Output: def get_original_segments(prompt: str): - match = re.search(r'(.*)', prompt, re.DOTALL) + match = re.search(r'\n```json\n(.*)\n```\n', prompt, re.DOTALL) if match: return match.group(1) else: diff --git a/docutranslate/agents/markdown_agent.py b/docutranslate/agents/markdown_agent.py index ef64b94..6fb2dd9 100644 --- a/docutranslate/agents/markdown_agent.py +++ b/docutranslate/agents/markdown_agent.py @@ -1,11 +1,20 @@ # SPDX-FileCopyrightText: 2025 QinHan # SPDX-License-Identifier: MPL-2.0 - +import re from dataclasses import dataclass from .agent import Agent, AgentConfig from ..glossary.glossary import Glossary + +def get_original_markdown(prompt: str): + match = re.search(r'\n(.*)\n', prompt, re.DOTALL) + if match: + return match.group(1) + else: + raise ValueError("无法从prompt中提取初始文本") + + def generate_prompt(markdown_text: str, to_lang: str): return f""" Treat the text input as markdown text and translate it into {to_lang},output translation ONLY. @@ -20,9 +29,12 @@ Treat the text input as markdown text and translate it into {to_lang},output tra - Output the translated markdown text as plain text (not in a markdown code block, with no extraneous text). The markdown text input: + {markdown_text} + """ + @dataclass class MDTranslateAgentConfig(AgentConfig): to_lang: str @@ -33,7 +45,7 @@ class MDTranslateAgentConfig(AgentConfig): class MDTranslateAgent(Agent): def __init__(self, config: MDTranslateAgentConfig): super().__init__(config) - self.to_lang=config.to_lang + self.to_lang = config.to_lang self.system_prompt = f""" # Role You are a professional machine translation engine. @@ -50,12 +62,15 @@ You are a professional machine translation engine. return system_prompt, prompt def send_chunks(self, prompts: list[str]): - prompts=[generate_prompt(prompt,self.to_lang) for prompt in prompts] - return super().send_prompts(prompts=prompts, pre_send_handler=self._pre_send_handler) + prompts = [generate_prompt(prompt, self.to_lang) for prompt in prompts] + return super().send_prompts(prompts=prompts, pre_send_handler=self._pre_send_handler, + error_result_handler=lambda prompt, logger: get_original_markdown(prompt)) async def send_chunks_async(self, prompts: list[str]): prompts = [generate_prompt(prompt, self.to_lang) for prompt in prompts] - return await super().send_prompts_async(prompts=prompts, pre_send_handler=self._pre_send_handler) + return await super().send_prompts_async(prompts=prompts, pre_send_handler=self._pre_send_handler, + error_result_handler=lambda prompt, logger: get_original_markdown( + prompt)) def update_glossary_dict(self, update_dict: dict | None): if self.glossary_dict is None: diff --git a/docutranslate/agents/segments_agent.py b/docutranslate/agents/segments_agent.py index 46d1faf..3c6921b 100644 --- a/docutranslate/agents/segments_agent.py +++ b/docutranslate/agents/segments_agent.py @@ -22,11 +22,9 @@ You will receive a sequence of original text segments to be translated, represen Here is the input: - ```json {json_segments} ``` - For each Key-Value Pair in the JSON, translate the contents of the value into {to_lang}, Write the translation back into the value for that JSON. @@ -60,7 +58,7 @@ Please return the translated JSON directly without including any additional info """ def get_original_segments(prompt:str): - match = re.search(r'(.*)', prompt, re.DOTALL) + match = re.search(r'\n```json\n(.*)\n```\n', prompt, re.DOTALL) if match: return match.group(1) else: diff --git a/docutranslate/cacher/md_based_convert_cacher.py b/docutranslate/cacher/md_based_convert_cacher.py index 62f88da..de5ca9d 100644 --- a/docutranslate/cacher/md_based_convert_cacher.py +++ b/docutranslate/cacher/md_based_convert_cacher.py @@ -16,25 +16,29 @@ class MDBasedCovertCacher: self.cache_dict = OrderedDict() @staticmethod - def _get_hashcode(document: Document, convert_engin: str, convert_config: ConverterConfig|None) -> str: - if convert_config : - convert_config_hash=convert_config.gethash() + def _get_hashcode(document: Document, convert_engin: str, convert_config: ConverterConfig | None) -> str: + if convert_config: + convert_config_hash = convert_config.gethash() else: - convert_config_hash=None + convert_config_hash = None obj = (document.suffix, document.content, convert_engin, convert_config_hash) return str(hash(obj)) def get_cached_result(self, document: Document, convert_engin: str, convert_config: ConverterConfig) -> MarkdownDocument | None: - return self.cache_dict.get(self._get_hashcode(document, convert_engin, convert_config)) + d: MarkdownDocument | None = self.cache_dict.get(self._get_hashcode(document, convert_engin, convert_config)) + if d: + return d.copy() + else: + return None def cache_result(self, convert_result: MarkdownDocument, document: Document, convert_engin: str, convert_config: ConverterConfig) -> MarkdownDocument: hash_code = self._get_hashcode(document, convert_engin, convert_config) if len(self.cache_dict) > int(CACHE_NUM): self.cache_dict.popitem(last=False) - self.cache_dict[hash_code] = convert_result + self.cache_dict[hash_code] = convert_result.copy() return convert_result def clear(self): diff --git a/docutranslate/workflow/md_based_workflow.py b/docutranslate/workflow/md_based_workflow.py index d4c7b6b..df8382d 100644 --- a/docutranslate/workflow/md_based_workflow.py +++ b/docutranslate/workflow/md_based_workflow.py @@ -83,7 +83,7 @@ class MarkdownBasedWorkflow(Workflow[MarkdownBasedWorkflowConfig, Document, Mark for attachment in converter.attachments: self.attachment.add_attachment(attachment) # 缓存解析后文件 - md_based_convert_cacher.cache_result(document_md.copy(), self.document_original, convert_engin, convert_config) + md_based_convert_cacher.cache_result(document_md, self.document_original, convert_engin, convert_config) return document_md