From 56f5f4023e4ed864af2147949c1c1d6467c30541 Mon Sep 17 00:00:00 2001 From: xunbu Date: Thu, 28 Aug 2025 11:52:25 +0800 Subject: [PATCH] =?UTF-8?q?=E5=AE=9E=E7=8E=B0=E8=87=AA=E5=8A=A8=E7=94=9F?= =?UTF-8?q?=E6=88=90=E6=9C=AF=E8=AF=AD=E8=A1=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docutranslate/agents/agent.py | 3 +- docutranslate/agents/glossary_agent.py | 104 ++++++++++++++++++ docutranslate/agents/markdown_agent.py | 10 +- docutranslate/agents/segments_agent.py | 6 + docutranslate/agents/txt_agent.py | 8 ++ .../translator/ai_translator/base.py | 24 +++- .../ai_translator/docx_translator.py | 9 ++ .../ai_translator/epub_translator.py | 10 +- .../ai_translator/html_translator.py | 8 ++ .../ai_translator/json_translator.py | 8 +- .../translator/ai_translator/md_translator.py | 8 ++ .../ai_translator/srt_translator.py | 9 +- .../ai_translator/txt_translator.py | 8 ++ .../ai_translator/xlsx_translator.py | 8 ++ docutranslate/workflow/base.py | 11 +- 15 files changed, 221 insertions(+), 13 deletions(-) create mode 100644 docutranslate/agents/glossary_agent.py diff --git a/docutranslate/agents/agent.py b/docutranslate/agents/agent.py index 0003bc7..9921b6f 100644 --- a/docutranslate/agents/agent.py +++ b/docutranslate/agents/agent.py @@ -255,9 +255,10 @@ class Agent: return prompt if error_result_handler is None else error_result_handler(prompt, self.logger) def _send_prompt_count(self, client: httpx.Client, prompt: str, system_prompt: None | str, count: PromptsCounter, + pre_send_handler, result_handler, error_result_handler) -> Any: - result = self.send(client, prompt, system_prompt, result_handler=result_handler, + result = self.send(client, prompt, system_prompt, pre_send_handler=pre_send_handler,result_handler=result_handler, error_result_handler=error_result_handler) count.add() return result diff --git a/docutranslate/agents/glossary_agent.py b/docutranslate/agents/glossary_agent.py new file mode 100644 index 0000000..321e2d0 --- /dev/null +++ b/docutranslate/agents/glossary_agent.py @@ -0,0 +1,104 @@ +import asyncio +import json +from dataclasses import dataclass +from json import JSONDecodeError +from logging import Logger + +import json_repair + +from docutranslate.agents import AgentConfig, Agent +from docutranslate.utils.json_utils import segments2json_chunks + + +@dataclass +class GlossaryAgentConfig(AgentConfig): + to_lang: str + + +class GlossaryAgent(Agent): + def __init__(self, config: GlossaryAgentConfig): + super().__init__(config) + self.system_prompt = f""" +# Role +You are a professional machine translation engine. +# 角色 +你是一个专业的术语表提取器 + +# Task +你会收到一个json格式的段落表,其中键是段落的序号,值是段落的内容。 +你需要从这些段落中提取**人名**和**地名**,并翻译这些名词为{config.to_lang}语言。 +最终输出一个名词原文:名词译文的术语表 + +# Requirements +- 特殊标签、形如``的标签不要添加到术语表 +- 输出术语表的src必须与名词原文完全一致,dst是该名词的{config.to_lang}的译文 +- 相同的src仅在术语表中添加一次,不能重复 + +# Output +输出格式是列表的json纯文本 +{[{"src": "<名词原文>", "dst": "<名词译文>"}]} + +#示例 +## 输入(翻译为中文): +{{"0":"Jobs likes apples","1":"Bill Gates is sunbathing in Shanghai."}} +## 输出 +{r'[{"src": "Jobs", "dst": "乔布斯"}, {"src": "Bill Gates", "dst": "比尔盖茨"}, {"src": "Shanghai", "dst": "上海"}]'} +""" + + def _result_handler(self, result: str, origin_prompt: str, logger: Logger): + try: + result = json_repair.loads(result) + if not isinstance(result, list): + raise ValueError("GlossaryAgent返回结果不是list的json形式") + except: + logger.error("结果不能正确解析") + return self._error_result_handler(origin_prompt, logger) + return result + + def _error_result_handler(self, origin_prompt: str, logger: Logger): + try: + return json_repair.loads(origin_prompt) + except: + logger.error("prompt不是json格式") + return origin_prompt + + def send_segments(self, segments: list[str], chunk_size: int): + self.logger.info("开始提取术语表") + result = {} + indexed_originals, chunks, merged_indices_list = segments2json_chunks(segments, chunk_size) + prompts = [json.dumps(chunk, ensure_ascii=False) for chunk in chunks] + translated_chunks = super().send_prompts(prompts=prompts, + result_handler=self._result_handler, + error_result_handler=self._error_result_handler) + for chunk in translated_chunks: + chunk: list[dict[str, str]] + try: + glossary_dict = {d["src"]: d["dst"] for d in chunk} + result = result | glossary_dict + except JSONDecodeError as e: + self.logger.info(f"json解析错误,解析文本:{chunk},错误:{e.__repr__()}") + except Exception as e: + self.logger.info(f"send_segments发生错误:{e.__repr__()}") + + return result + + async def send_segments_async(self, segments: list[str], chunk_size: int): + self.logger.info("开始提取术语表") + result = {} + indexed_originals, chunks, merged_indices_list = await asyncio.to_thread(segments2json_chunks, segments, + chunk_size) + prompts = [json.dumps(chunk, ensure_ascii=False) for chunk in chunks] + translated_chunks = await super().send_prompts_async(prompts=prompts, + result_handler=self._result_handler, + error_result_handler=self._error_result_handler) + for chunk in translated_chunks: + chunk: list[dict[str, str]] + try: + glossary_dict = {d["src"]: d["dst"] for d in chunk} + result = result | glossary_dict + except JSONDecodeError as e: + self.logger.info(f"json解析错误,解析文本:{chunk},错误:{e.__repr__()}") + except Exception as e: + self.logger.info(f"send_segments发生错误:{e.__repr__()}") + print(f"术语表:\n{result}") + return result diff --git a/docutranslate/agents/markdown_agent.py b/docutranslate/agents/markdown_agent.py index 55a12df..d46f446 100644 --- a/docutranslate/agents/markdown_agent.py +++ b/docutranslate/agents/markdown_agent.py @@ -62,7 +62,15 @@ $$1+1=2$$ glossary = Glossary(glossary_dict=self.glossary_dict) system_prompt += glossary.append_system_prompt(prompt) return system_prompt, prompt + def send_chunks(self, prompts: list[str]): return super().send_prompts(prompts=prompts, pre_send_handler=self._pre_send_handler) + async def send_chunks_async(self, prompts: list[str]): - return await super().send_prompts_async(prompts=prompts,pre_send_handler=self._pre_send_handler) \ No newline at end of file + return await super().send_prompts_async(prompts=prompts, pre_send_handler=self._pre_send_handler) + + def update_glossary_dict(self, update_dict: dict | None): + if self.glossary_dict is None: + self.glossary_dict = {} + if update_dict is not None: + self.glossary_dict = self.glossary_dict | update_dict diff --git a/docutranslate/agents/segments_agent.py b/docutranslate/agents/segments_agent.py index d0eef49..3ad034a 100644 --- a/docutranslate/agents/segments_agent.py +++ b/docutranslate/agents/segments_agent.py @@ -144,3 +144,9 @@ Warning: Never wrap the entire JSON object in quotes to make it a single string. # 添加剩余部分 result.extend(ls[last_end:]) return result + + def update_glossary_dict(self, update_dict: dict|None): + if self.glossary_dict is None: + self.glossary_dict = {} + if update_dict is not None: + self.glossary_dict = self.glossary_dict | update_dict diff --git a/docutranslate/agents/txt_agent.py b/docutranslate/agents/txt_agent.py index 44a5142..24156f1 100644 --- a/docutranslate/agents/txt_agent.py +++ b/docutranslate/agents/txt_agent.py @@ -42,7 +42,15 @@ The translated txt text as plain text. glossary = Glossary(glossary_dict=self.glossary_dict) system_prompt += glossary.append_system_prompt(prompt) return system_prompt, prompt + def send_chunks(self, prompts: list[str]): return super().send_prompts(prompts=prompts, pre_send_handler=self._pre_send_handler) + async def send_chunks_async(self, prompts: list[str]): return await super().send_prompts_async(prompts=prompts, pre_send_handler=self._pre_send_handler) + + def update_glossary_dict(self, update_dict: dict|None): + if self.glossary_dict is None: + self.glossary_dict = {} + if update_dict is not None: + self.glossary_dict = self.glossary_dict | update_dict \ No newline at end of file diff --git a/docutranslate/translator/ai_translator/base.py b/docutranslate/translator/ai_translator/base.py index d79ca9c..144de03 100644 --- a/docutranslate/translator/ai_translator/base.py +++ b/docutranslate/translator/ai_translator/base.py @@ -1,8 +1,9 @@ from abc import abstractmethod -from dataclasses import dataclass, field +from dataclasses import dataclass from typing import TypeVar from docutranslate.agents.agent import ThinkingMode +from docutranslate.agents.glossary_agent import GlossaryAgentConfig, GlossaryAgent from docutranslate.ir.document import Document from docutranslate.translator.base import Translator, TranslatorConfig @@ -20,6 +21,8 @@ class AiTranslatorConfig(TranslatorConfig): chunk_size: int = 3000 concurrent: int = 30 glossary_dict: dict[str:str] | None = None + glossary_generate_enable: bool = True + glossary_agent_config: GlossaryAgentConfig | None = None T = TypeVar('T', bound=Document) @@ -32,7 +35,24 @@ class AiTranslator(Translator[T]): def __init__(self, config: AiTranslatorConfig): super().__init__(config=config) - + self.glossary_agent = None + if config.glossary_generate_enable: + if config.glossary_agent_config: + self.glossary_agent = GlossaryAgent(config.glossary_agent_config) + else: + glossary_agent_config = GlossaryAgentConfig( + to_lang=config.to_lang, + baseurl=config.base_url, + key=config.api_key, + model_id=config.model_id, + system_prompt=None, + temperature=config.temperature, + thinking=config.thinking, + max_concurrent=config.concurrent, + timeout=config.timeout, + logger=self.logger, + ) + self.glossary_agent = GlossaryAgent(glossary_agent_config) @abstractmethod def translate(self, document: T) -> Document: diff --git a/docutranslate/translator/ai_translator/docx_translator.py b/docutranslate/translator/ai_translator/docx_translator.py index 36c615b..4943c67 100644 --- a/docutranslate/translator/ai_translator/docx_translator.py +++ b/docutranslate/translator/ai_translator/docx_translator.py @@ -8,6 +8,7 @@ from docx.document import Document as DocumentObject from docx.text.paragraph import Paragraph from docx.text.run import Run +from docutranslate.agents.glossary_agent import GlossaryAgent, GlossaryAgentConfig from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent from docutranslate.ir.document import Document from docutranslate.translator.ai_translator.base import AiTranslatorConfig, AiTranslator @@ -159,6 +160,10 @@ class DocxTranslator(AiTranslator): document.content = output_stream.getvalue() return self + if self.glossary_agent: + glossary_dict = self.glossary_agent.send_segments(original_texts, self.chunk_size) + self.translate_agent.update_glossary_dict(glossary_dict) + # 调用翻译 agent translated_texts = self.translate_agent.send_segments(original_texts, self.chunk_size) @@ -179,6 +184,10 @@ class DocxTranslator(AiTranslator): document.content = output_stream.getvalue() return self + if self.glossary_agent: + glossary_dict = await self.glossary_agent.send_segments_async(original_texts, self.chunk_size) + self.translate_agent.update_glossary_dict(glossary_dict) + # 异步调用翻译 agent translated_texts = await self.translate_agent.send_segments_async(original_texts, self.chunk_size) diff --git a/docutranslate/translator/ai_translator/epub_translator.py b/docutranslate/translator/ai_translator/epub_translator.py index 39b0190..d5e23b1 100644 --- a/docutranslate/translator/ai_translator/epub_translator.py +++ b/docutranslate/translator/ai_translator/epub_translator.py @@ -8,6 +8,7 @@ from typing import Self, Literal, List, Dict, Any from bs4 import BeautifulSoup +from docutranslate.agents.glossary_agent import GlossaryAgent, GlossaryAgentConfig from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent from docutranslate.ir.document import Document from docutranslate.translator.ai_translator.base import AiTranslatorConfig, AiTranslator @@ -39,6 +40,7 @@ class EpubTranslator(AiTranslator): self.insert_mode = config.insert_mode self.separator = config.separator + def _pre_translate(self, document: Document) -> tuple[ Dict[str, bytes], List[Dict[str, Any]], List[str] ]: @@ -173,7 +175,9 @@ class EpubTranslator(AiTranslator): if not items_to_translate: self.logger.info("\n文件中没有找到需要翻译的纯文本内容。") return self - + if self.glossary_agent: + glossary_dict = self.glossary_agent.send_segments(original_texts, self.chunk_size) + self.translate_agent.update_glossary_dict(glossary_dict) translated_texts = self.translate_agent.send_segments(original_texts, self.chunk_size) document.content = self._after_translate( all_files, items_to_translate, translated_texts, original_texts @@ -191,6 +195,10 @@ class EpubTranslator(AiTranslator): self.logger.info("\n文件中没有找到需要翻译的纯文本内容。") return self + if self.glossary_agent: + glossary_dict = await self.glossary_agent.send_segments_async(original_texts, self.chunk_size) + self.translate_agent.update_glossary_dict(glossary_dict) + translated_texts = await self.translate_agent.send_segments_async( original_texts, self.chunk_size ) diff --git a/docutranslate/translator/ai_translator/html_translator.py b/docutranslate/translator/ai_translator/html_translator.py index 5f8af48..f8d6d2e 100644 --- a/docutranslate/translator/ai_translator/html_translator.py +++ b/docutranslate/translator/ai_translator/html_translator.py @@ -198,6 +198,10 @@ class HtmlTranslator(AiTranslator): document.content = soup.encode('utf-8') return self + if self.glossary_agent: + glossary_dict = self.glossary_agent.send_segments(original_texts, self.chunk_size) + self.translate_agent.update_glossary_dict(glossary_dict) + translated_texts = self.translate_agent.send_segments(original_texts, self.chunk_size) document.content = self._after_translate(soup, translatable_items, translated_texts, original_texts) return self @@ -213,6 +217,10 @@ class HtmlTranslator(AiTranslator): document.content = await asyncio.to_thread(soup.encode, 'utf-8') return self + if self.glossary_agent: + glossary_dict = await self.glossary_agent.send_segments_async(original_texts, self.chunk_size) + self.translate_agent.update_glossary_dict(glossary_dict) + translated_texts = await self.translate_agent.send_segments_async(original_texts, self.chunk_size) document.content = await asyncio.to_thread( self._after_translate, soup, translatable_items, translated_texts, original_texts diff --git a/docutranslate/translator/ai_translator/json_translator.py b/docutranslate/translator/ai_translator/json_translator.py index 57c016f..0011d29 100644 --- a/docutranslate/translator/ai_translator/json_translator.py +++ b/docutranslate/translator/ai_translator/json_translator.py @@ -75,7 +75,9 @@ class JsonTranslator(AiTranslator): return self original_texts = [match.value for match in all_matches] - + if self.glossary_agent: + glossary_dict = self.glossary_agent.send_segments(original_texts, self.chunk_size) + self.translate_agent.update_glossary_dict(glossary_dict) # 步骤 2: 批量翻译提取出的文本 translated_texts = self.translate_agent.send_segments(original_texts, self.chunk_size) @@ -104,6 +106,10 @@ class JsonTranslator(AiTranslator): original_texts = [match.value for match in all_matches] + if self.glossary_agent: + glossary_dict = await self.glossary_agent.send_segments_async(original_texts, self.chunk_size) + self.translate_agent.update_glossary_dict(glossary_dict) + # 步骤 2: 批量翻译提取出的文本 translated_texts = await self.translate_agent.send_segments_async(original_texts, self.chunk_size) diff --git a/docutranslate/translator/ai_translator/md_translator.py b/docutranslate/translator/ai_translator/md_translator.py index af0de34..7cacb32 100644 --- a/docutranslate/translator/ai_translator/md_translator.py +++ b/docutranslate/translator/ai_translator/md_translator.py @@ -37,6 +37,9 @@ class MDTranslator(AiTranslator): self.logger.info("正在翻译markdown") with MDMaskUrisContext(document): chunks: list[str] = split_markdown_text(document.content.decode(), self.chunk_size) + if self.glossary_agent: + glossary_dict = self.glossary_agent.send_segments(chunks, self.chunk_size) + self.translate_agent.update_glossary_dict(glossary_dict) self.logger.info(f"markdown分为{len(chunks)}块") result: list[str] = self.translate_agent.send_chunks(chunks) content = join_markdown_texts(result) @@ -52,6 +55,11 @@ class MDTranslator(AiTranslator): self.logger.info("正在翻译markdown") with MDMaskUrisContext(document): chunks: list[str] = split_markdown_text(document.content.decode(), self.chunk_size) + + if self.glossary_agent: + glossary_dict = await self.glossary_agent.send_segments_async(chunks, self.chunk_size) + self.translate_agent.update_glossary_dict(glossary_dict) + self.logger.info(f"markdown分为{len(chunks)}块") result: list[str] = await self.translate_agent.send_chunks_async(chunks) diff --git a/docutranslate/translator/ai_translator/srt_translator.py b/docutranslate/translator/ai_translator/srt_translator.py index de8a24b..415f56c 100644 --- a/docutranslate/translator/ai_translator/srt_translator.py +++ b/docutranslate/translator/ai_translator/srt_translator.py @@ -4,6 +4,7 @@ from typing import Self, Literal import srt # 导入srt库来处理字幕文件 +from docutranslate.agents.glossary_agent import GlossaryAgentConfig, GlossaryAgent from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent from docutranslate.ir.document import Document from docutranslate.translator.ai_translator.base import AiTranslatorConfig, AiTranslator @@ -108,7 +109,9 @@ class SrtTranslator(AiTranslator): if not original_texts: self.logger.info("\n文件中没有找到需要翻译的字幕内容。") return self - + if self.glossary_agent: + glossary_dict = self.glossary_agent.send_segments(original_texts, self.chunk_size) + self.translate_agent.update_glossary_dict(glossary_dict) # --- 步骤 2: 调用翻译Agent --- translated_texts = self.translate_agent.send_segments(original_texts, self.chunk_size) @@ -127,6 +130,10 @@ class SrtTranslator(AiTranslator): self.logger.info("\n文件中没有找到需要翻译的字幕内容。") return self + if self.glossary_agent: + glossary_dict = await self.glossary_agent.send_segments_async(original_texts, self.chunk_size) + self.translate_agent.update_glossary_dict(glossary_dict) + # --- 步骤 2: 调用翻译Agent (异步) --- translated_texts = await self.translate_agent.send_segments_async(original_texts, self.chunk_size) diff --git a/docutranslate/translator/ai_translator/txt_translator.py b/docutranslate/translator/ai_translator/txt_translator.py index e162a1d..576074c 100644 --- a/docutranslate/translator/ai_translator/txt_translator.py +++ b/docutranslate/translator/ai_translator/txt_translator.py @@ -33,6 +33,9 @@ class TXTTranslator(AiTranslator): def translate(self, document: Document) -> Self: self.logger.info("正在翻译txt") chunks: list[str] = split_markdown_text(document.content.decode(), max_block_size=self.chunk_size) + if self.glossary_agent: + glossary_dict = self.glossary_agent.send_segments(chunks, self.chunk_size) + self.translate_agent.update_glossary_dict(glossary_dict) self.logger.info(f"txt分为{len(chunks)}块") result: list[str] = self.translate_agent.send_chunks(chunks) content = "\n".join(result) @@ -43,6 +46,11 @@ class TXTTranslator(AiTranslator): async def translate_async(self, document: Document) -> Self: self.logger.info("正在翻译txt") chunks: list[str] = split_markdown_text(document.content.decode(), max_block_size=self.chunk_size) + + if self.glossary_agent: + glossary_dict = await self.glossary_agent.send_segments_async(chunks, self.chunk_size) + self.translate_agent.update_glossary_dict(glossary_dict) + self.logger.info(f"txt分为{len(chunks)}块") result: list[str] = await self.translate_agent.send_chunks_async(chunks) content = "\n".join(result) diff --git a/docutranslate/translator/ai_translator/xlsx_translator.py b/docutranslate/translator/ai_translator/xlsx_translator.py index 7015bde..f90ae61 100644 --- a/docutranslate/translator/ai_translator/xlsx_translator.py +++ b/docutranslate/translator/ai_translator/xlsx_translator.py @@ -155,6 +155,9 @@ class XlsxTranslator(AiTranslator): print("\n在指定区域中没有找到需要翻译的纯文本内容。") workbook.close() return self + if self.glossary_agent: + glossary_dict = self.glossary_agent.send_segments(original_texts, self.chunk_size) + self.translate_agent.update_glossary_dict(glossary_dict) # --- 步骤 2: 调用翻译函数 --- translated_texts = self.translate_agent.send_segments(original_texts, self.chunk_size) @@ -168,6 +171,11 @@ class XlsxTranslator(AiTranslator): print("\n在指定区域中没有找到需要翻译的纯文本内容。") workbook.close() return self + + if self.glossary_agent: + glossary_dict = await self.glossary_agent.send_segments_async(original_texts, self.chunk_size) + self.translate_agent.update_glossary_dict(glossary_dict) + # --- 步骤 2: 调用翻译函数 --- translated_texts = await self.translate_agent.send_segments_async(original_texts, self.chunk_size) diff --git a/docutranslate/workflow/base.py b/docutranslate/workflow/base.py index a6b8a09..5009051 100644 --- a/docutranslate/workflow/base.py +++ b/docutranslate/workflow/base.py @@ -12,15 +12,16 @@ from docutranslate.ir.document import Document class WorkflowConfig: logger: Logger | None = None + T_Config = TypeVar("T_Config", bound=WorkflowConfig) T_original = TypeVar('T_original', bound=Document) T_Translated = TypeVar('T_Translated', bound=Document) -class Workflow(ABC, Generic[T_Config,T_original, T_Translated]): - def __init__(self, config:T_Config): - self.config=config - self.logger=self.config.logger +class Workflow(ABC, Generic[T_Config, T_original, T_Translated]): + def __init__(self, config: T_Config): + self.config = config + self.logger = self.config.logger self.document_original: T_original | None = None self.document_translated: T_Translated | None = None @@ -56,5 +57,3 @@ class Workflow(ABC, Generic[T_Config,T_original, T_Translated]): output_path.write_bytes(docu.content) self.logger.info(f"文件已保存到{output_path.resolve()}") return self - -