实现自动生成术语表
This commit is contained in:
@@ -255,9 +255,10 @@ class Agent:
|
|||||||
return prompt if error_result_handler is None else error_result_handler(prompt, self.logger)
|
return prompt if error_result_handler is None else error_result_handler(prompt, self.logger)
|
||||||
|
|
||||||
def _send_prompt_count(self, client: httpx.Client, prompt: str, system_prompt: None | str, count: PromptsCounter,
|
def _send_prompt_count(self, client: httpx.Client, prompt: str, system_prompt: None | str, count: PromptsCounter,
|
||||||
|
pre_send_handler,
|
||||||
result_handler,
|
result_handler,
|
||||||
error_result_handler) -> Any:
|
error_result_handler) -> Any:
|
||||||
result = self.send(client, prompt, system_prompt, result_handler=result_handler,
|
result = self.send(client, prompt, system_prompt, pre_send_handler=pre_send_handler,result_handler=result_handler,
|
||||||
error_result_handler=error_result_handler)
|
error_result_handler=error_result_handler)
|
||||||
count.add()
|
count.add()
|
||||||
return result
|
return result
|
||||||
|
|||||||
104
docutranslate/agents/glossary_agent.py
Normal file
104
docutranslate/agents/glossary_agent.py
Normal file
@@ -0,0 +1,104 @@
|
|||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from json import JSONDecodeError
|
||||||
|
from logging import Logger
|
||||||
|
|
||||||
|
import json_repair
|
||||||
|
|
||||||
|
from docutranslate.agents import AgentConfig, Agent
|
||||||
|
from docutranslate.utils.json_utils import segments2json_chunks
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class GlossaryAgentConfig(AgentConfig):
    """Settings for ``GlossaryAgent``: the ``AgentConfig`` fields plus a target language."""

    # Language the extracted names are translated into; it is interpolated
    # into the glossary-extraction system prompt.
    to_lang: str
|
||||||
|
|
||||||
|
|
||||||
|
class GlossaryAgent(Agent):
    """Agent that asks the model to extract a glossary of proper nouns.

    The system prompt tells the model to pull person and place names out of a
    JSON table of numbered paragraphs and to translate each name into
    ``config.to_lang``. Every reply is expected to be a JSON list of
    ``{"src": ..., "dst": ...}`` objects; ``send_segments`` and
    ``send_segments_async`` merge those lists into one ``{src: dst}`` dict.
    """

    def __init__(self, config: GlossaryAgentConfig):
        """Initialise the base agent, then install the glossary system prompt."""
        super().__init__(config)
        # Literal braces are doubled so the f-string emits real JSON with
        # double quotes. Embedding a Python list/dict expression here would
        # render its repr (single quotes), which is not valid JSON and would
        # contradict the example further down.
        self.system_prompt = f"""
# 角色
你是一个专业的术语表提取器

# Task
你会收到一个json格式的段落表,其中键是段落的序号,值是段落的内容。
你需要从这些段落中提取**人名**和**地名**,并翻译这些名词为{config.to_lang}语言。
最终输出一个名词原文:名词译文的术语表

# Requirements
- 特殊标签、形如`<ph-xxxxxx>`的标签不要添加到术语表
- 输出术语表的src必须与名词原文完全一致,dst是该名词的{config.to_lang}的译文
- 相同的src仅在术语表中添加一次,不能重复

# Output
输出格式是列表的json纯文本
[{{"src": "<名词原文>", "dst": "<名词译文>"}}]

#示例
## 输入(翻译为中文):
{{"0":"Jobs likes apples","1":"Bill Gates is sunbathing in Shanghai."}}
## 输出
[{{"src": "Jobs", "dst": "乔布斯"}}, {{"src": "Bill Gates", "dst": "比尔盖茨"}}, {{"src": "Shanghai", "dst": "上海"}}]
"""

    def _result_handler(self, result: str, origin_prompt: str, logger: Logger):
        """Parse one model reply into a list of glossary entries.

        Args:
            result: Raw text returned by the model.
            origin_prompt: The prompt that produced ``result``; forwarded to
                the fallback handler.
            logger: Logger used to report an unusable reply.

        Returns:
            The parsed list, or the fallback value of
            ``_error_result_handler`` when the reply cannot be parsed or is
            not a JSON list.
        """
        try:
            parsed = json_repair.loads(result)
        except Exception:
            parsed = None
        if not isinstance(parsed, list):
            logger.error("结果不能正确解析")
            return self._error_result_handler(origin_prompt, logger)
        return parsed

    def _error_result_handler(self, origin_prompt: str, logger: Logger):
        """Return the fallback for a failed request: an empty entry list.

        A chunk that failed simply contributes no terms. Echoing the input
        paragraphs back (as a translation agent would) only makes the merge
        step choke on data that is not a glossary.
        """
        return []

    def _merge_glossary_chunks(self, chunks) -> dict[str, str]:
        """Fold per-chunk entry lists into a single ``{src: dst}`` dict.

        Malformed chunks or entries are skipped one by one, so a single bad
        item does not discard the rest of its chunk. When the same ``src``
        occurs more than once, the last occurrence wins.
        """
        glossary: dict[str, str] = {}
        for chunk in chunks:
            if not isinstance(chunk, list):
                self.logger.warning(f"术语表结果不是list,已跳过:{chunk!r}")
                continue
            for entry in chunk:
                if not isinstance(entry, dict):
                    continue
                src, dst = entry.get("src"), entry.get("dst")
                if isinstance(src, str) and isinstance(dst, str) and src:
                    glossary[src] = dst
        self.logger.info(f"术语表:\n{glossary}")
        return glossary

    def send_segments(self, segments: list[str], chunk_size: int):
        """Extract a glossary from ``segments`` synchronously.

        Args:
            segments: Source text segments to scan for names.
            chunk_size: Size limit handed to ``segments2json_chunks`` when
                grouping segments into prompts.

        Returns:
            A dict mapping each extracted source term to its translation;
            empty when there is nothing to scan or nothing was extracted.
        """
        self.logger.info("开始提取术语表")
        if not segments:
            return {}
        _, chunks, _ = segments2json_chunks(segments, chunk_size)
        prompts = [json.dumps(chunk, ensure_ascii=False) for chunk in chunks]
        replies = super().send_prompts(prompts=prompts,
                                       result_handler=self._result_handler,
                                       error_result_handler=self._error_result_handler)
        return self._merge_glossary_chunks(replies)

    async def send_segments_async(self, segments: list[str], chunk_size: int):
        """Asynchronous counterpart of ``send_segments`` with the same result."""
        self.logger.info("开始提取术语表")
        if not segments:
            return {}
        # Chunking runs in a worker thread so it does not block the event loop.
        _, chunks, _ = await asyncio.to_thread(segments2json_chunks, segments, chunk_size)
        prompts = [json.dumps(chunk, ensure_ascii=False) for chunk in chunks]
        replies = await super().send_prompts_async(prompts=prompts,
                                                   result_handler=self._result_handler,
                                                   error_result_handler=self._error_result_handler)
        return self._merge_glossary_chunks(replies)
|
||||||
@@ -62,7 +62,15 @@ $$1+1=2$$
|
|||||||
glossary = Glossary(glossary_dict=self.glossary_dict)
|
glossary = Glossary(glossary_dict=self.glossary_dict)
|
||||||
system_prompt += glossary.append_system_prompt(prompt)
|
system_prompt += glossary.append_system_prompt(prompt)
|
||||||
return system_prompt, prompt
|
return system_prompt, prompt
|
||||||
|
|
||||||
def send_chunks(self, prompts: list[str]):
|
def send_chunks(self, prompts: list[str]):
|
||||||
return super().send_prompts(prompts=prompts, pre_send_handler=self._pre_send_handler)
|
return super().send_prompts(prompts=prompts, pre_send_handler=self._pre_send_handler)
|
||||||
|
|
||||||
async def send_chunks_async(self, prompts: list[str]):
|
async def send_chunks_async(self, prompts: list[str]):
|
||||||
return await super().send_prompts_async(prompts=prompts, pre_send_handler=self._pre_send_handler)
|
return await super().send_prompts_async(prompts=prompts, pre_send_handler=self._pre_send_handler)
|
||||||
|
|
||||||
|
def update_glossary_dict(self, update_dict: dict | None):
|
||||||
|
if self.glossary_dict is None:
|
||||||
|
self.glossary_dict = {}
|
||||||
|
if update_dict is not None:
|
||||||
|
self.glossary_dict = self.glossary_dict | update_dict
|
||||||
|
|||||||
@@ -144,3 +144,9 @@ Warning: Never wrap the entire JSON object in quotes to make it a single string.
|
|||||||
# 添加剩余部分
|
# 添加剩余部分
|
||||||
result.extend(ls[last_end:])
|
result.extend(ls[last_end:])
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
def update_glossary_dict(self, update_dict: dict|None):
|
||||||
|
if self.glossary_dict is None:
|
||||||
|
self.glossary_dict = {}
|
||||||
|
if update_dict is not None:
|
||||||
|
self.glossary_dict = self.glossary_dict | update_dict
|
||||||
|
|||||||
@@ -42,7 +42,15 @@ The translated txt text as plain text.
|
|||||||
glossary = Glossary(glossary_dict=self.glossary_dict)
|
glossary = Glossary(glossary_dict=self.glossary_dict)
|
||||||
system_prompt += glossary.append_system_prompt(prompt)
|
system_prompt += glossary.append_system_prompt(prompt)
|
||||||
return system_prompt, prompt
|
return system_prompt, prompt
|
||||||
|
|
||||||
def send_chunks(self, prompts: list[str]):
|
def send_chunks(self, prompts: list[str]):
|
||||||
return super().send_prompts(prompts=prompts, pre_send_handler=self._pre_send_handler)
|
return super().send_prompts(prompts=prompts, pre_send_handler=self._pre_send_handler)
|
||||||
|
|
||||||
async def send_chunks_async(self, prompts: list[str]):
|
async def send_chunks_async(self, prompts: list[str]):
|
||||||
return await super().send_prompts_async(prompts=prompts, pre_send_handler=self._pre_send_handler)
|
return await super().send_prompts_async(prompts=prompts, pre_send_handler=self._pre_send_handler)
|
||||||
|
|
||||||
|
def update_glossary_dict(self, update_dict: dict|None):
|
||||||
|
if self.glossary_dict is None:
|
||||||
|
self.glossary_dict = {}
|
||||||
|
if update_dict is not None:
|
||||||
|
self.glossary_dict = self.glossary_dict | update_dict
|
||||||
@@ -1,8 +1,9 @@
|
|||||||
from abc import abstractmethod
|
from abc import abstractmethod
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass
|
||||||
from typing import TypeVar
|
from typing import TypeVar
|
||||||
|
|
||||||
from docutranslate.agents.agent import ThinkingMode
|
from docutranslate.agents.agent import ThinkingMode
|
||||||
|
from docutranslate.agents.glossary_agent import GlossaryAgentConfig, GlossaryAgent
|
||||||
from docutranslate.ir.document import Document
|
from docutranslate.ir.document import Document
|
||||||
from docutranslate.translator.base import Translator, TranslatorConfig
|
from docutranslate.translator.base import Translator, TranslatorConfig
|
||||||
|
|
||||||
@@ -20,6 +21,8 @@ class AiTranslatorConfig(TranslatorConfig):
|
|||||||
chunk_size: int = 3000
|
chunk_size: int = 3000
|
||||||
concurrent: int = 30
|
concurrent: int = 30
|
||||||
glossary_dict: dict[str:str] | None = None
|
glossary_dict: dict[str:str] | None = None
|
||||||
|
glossary_generate_enable: bool = True
|
||||||
|
glossary_agent_config: GlossaryAgentConfig | None = None
|
||||||
|
|
||||||
|
|
||||||
T = TypeVar('T', bound=Document)
|
T = TypeVar('T', bound=Document)
|
||||||
@@ -32,7 +35,24 @@ class AiTranslator(Translator[T]):
|
|||||||
|
|
||||||
def __init__(self, config: AiTranslatorConfig):
|
def __init__(self, config: AiTranslatorConfig):
|
||||||
super().__init__(config=config)
|
super().__init__(config=config)
|
||||||
|
self.glossary_agent = None
|
||||||
|
if config.glossary_generate_enable:
|
||||||
|
if config.glossary_agent_config:
|
||||||
|
self.glossary_agent = GlossaryAgent(config.glossary_agent_config)
|
||||||
|
else:
|
||||||
|
glossary_agent_config = GlossaryAgentConfig(
|
||||||
|
to_lang=config.to_lang,
|
||||||
|
baseurl=config.base_url,
|
||||||
|
key=config.api_key,
|
||||||
|
model_id=config.model_id,
|
||||||
|
system_prompt=None,
|
||||||
|
temperature=config.temperature,
|
||||||
|
thinking=config.thinking,
|
||||||
|
max_concurrent=config.concurrent,
|
||||||
|
timeout=config.timeout,
|
||||||
|
logger=self.logger,
|
||||||
|
)
|
||||||
|
self.glossary_agent = GlossaryAgent(glossary_agent_config)
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def translate(self, document: T) -> Document:
|
def translate(self, document: T) -> Document:
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ from docx.document import Document as DocumentObject
|
|||||||
from docx.text.paragraph import Paragraph
|
from docx.text.paragraph import Paragraph
|
||||||
from docx.text.run import Run
|
from docx.text.run import Run
|
||||||
|
|
||||||
|
from docutranslate.agents.glossary_agent import GlossaryAgent, GlossaryAgentConfig
|
||||||
from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent
|
from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent
|
||||||
from docutranslate.ir.document import Document
|
from docutranslate.ir.document import Document
|
||||||
from docutranslate.translator.ai_translator.base import AiTranslatorConfig, AiTranslator
|
from docutranslate.translator.ai_translator.base import AiTranslatorConfig, AiTranslator
|
||||||
@@ -159,6 +160,10 @@ class DocxTranslator(AiTranslator):
|
|||||||
document.content = output_stream.getvalue()
|
document.content = output_stream.getvalue()
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
if self.glossary_agent:
|
||||||
|
glossary_dict = self.glossary_agent.send_segments(original_texts, self.chunk_size)
|
||||||
|
self.translate_agent.update_glossary_dict(glossary_dict)
|
||||||
|
|
||||||
# 调用翻译 agent
|
# 调用翻译 agent
|
||||||
translated_texts = self.translate_agent.send_segments(original_texts, self.chunk_size)
|
translated_texts = self.translate_agent.send_segments(original_texts, self.chunk_size)
|
||||||
|
|
||||||
@@ -179,6 +184,10 @@ class DocxTranslator(AiTranslator):
|
|||||||
document.content = output_stream.getvalue()
|
document.content = output_stream.getvalue()
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
if self.glossary_agent:
|
||||||
|
glossary_dict = await self.glossary_agent.send_segments_async(original_texts, self.chunk_size)
|
||||||
|
self.translate_agent.update_glossary_dict(glossary_dict)
|
||||||
|
|
||||||
# 异步调用翻译 agent
|
# 异步调用翻译 agent
|
||||||
translated_texts = await self.translate_agent.send_segments_async(original_texts, self.chunk_size)
|
translated_texts = await self.translate_agent.send_segments_async(original_texts, self.chunk_size)
|
||||||
|
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ from typing import Self, Literal, List, Dict, Any
|
|||||||
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
from docutranslate.agents.glossary_agent import GlossaryAgent, GlossaryAgentConfig
|
||||||
from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent
|
from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent
|
||||||
from docutranslate.ir.document import Document
|
from docutranslate.ir.document import Document
|
||||||
from docutranslate.translator.ai_translator.base import AiTranslatorConfig, AiTranslator
|
from docutranslate.translator.ai_translator.base import AiTranslatorConfig, AiTranslator
|
||||||
@@ -39,6 +40,7 @@ class EpubTranslator(AiTranslator):
|
|||||||
self.insert_mode = config.insert_mode
|
self.insert_mode = config.insert_mode
|
||||||
self.separator = config.separator
|
self.separator = config.separator
|
||||||
|
|
||||||
|
|
||||||
def _pre_translate(self, document: Document) -> tuple[
|
def _pre_translate(self, document: Document) -> tuple[
|
||||||
Dict[str, bytes], List[Dict[str, Any]], List[str]
|
Dict[str, bytes], List[Dict[str, Any]], List[str]
|
||||||
]:
|
]:
|
||||||
@@ -173,7 +175,9 @@ class EpubTranslator(AiTranslator):
|
|||||||
if not items_to_translate:
|
if not items_to_translate:
|
||||||
self.logger.info("\n文件中没有找到需要翻译的纯文本内容。")
|
self.logger.info("\n文件中没有找到需要翻译的纯文本内容。")
|
||||||
return self
|
return self
|
||||||
|
if self.glossary_agent:
|
||||||
|
glossary_dict = self.glossary_agent.send_segments(original_texts, self.chunk_size)
|
||||||
|
self.translate_agent.update_glossary_dict(glossary_dict)
|
||||||
translated_texts = self.translate_agent.send_segments(original_texts, self.chunk_size)
|
translated_texts = self.translate_agent.send_segments(original_texts, self.chunk_size)
|
||||||
document.content = self._after_translate(
|
document.content = self._after_translate(
|
||||||
all_files, items_to_translate, translated_texts, original_texts
|
all_files, items_to_translate, translated_texts, original_texts
|
||||||
@@ -191,6 +195,10 @@ class EpubTranslator(AiTranslator):
|
|||||||
self.logger.info("\n文件中没有找到需要翻译的纯文本内容。")
|
self.logger.info("\n文件中没有找到需要翻译的纯文本内容。")
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
if self.glossary_agent:
|
||||||
|
glossary_dict = await self.glossary_agent.send_segments_async(original_texts, self.chunk_size)
|
||||||
|
self.translate_agent.update_glossary_dict(glossary_dict)
|
||||||
|
|
||||||
translated_texts = await self.translate_agent.send_segments_async(
|
translated_texts = await self.translate_agent.send_segments_async(
|
||||||
original_texts, self.chunk_size
|
original_texts, self.chunk_size
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -198,6 +198,10 @@ class HtmlTranslator(AiTranslator):
|
|||||||
document.content = soup.encode('utf-8')
|
document.content = soup.encode('utf-8')
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
if self.glossary_agent:
|
||||||
|
glossary_dict = self.glossary_agent.send_segments(original_texts, self.chunk_size)
|
||||||
|
self.translate_agent.update_glossary_dict(glossary_dict)
|
||||||
|
|
||||||
translated_texts = self.translate_agent.send_segments(original_texts, self.chunk_size)
|
translated_texts = self.translate_agent.send_segments(original_texts, self.chunk_size)
|
||||||
document.content = self._after_translate(soup, translatable_items, translated_texts, original_texts)
|
document.content = self._after_translate(soup, translatable_items, translated_texts, original_texts)
|
||||||
return self
|
return self
|
||||||
@@ -213,6 +217,10 @@ class HtmlTranslator(AiTranslator):
|
|||||||
document.content = await asyncio.to_thread(soup.encode, 'utf-8')
|
document.content = await asyncio.to_thread(soup.encode, 'utf-8')
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
if self.glossary_agent:
|
||||||
|
glossary_dict = await self.glossary_agent.send_segments_async(original_texts, self.chunk_size)
|
||||||
|
self.translate_agent.update_glossary_dict(glossary_dict)
|
||||||
|
|
||||||
translated_texts = await self.translate_agent.send_segments_async(original_texts, self.chunk_size)
|
translated_texts = await self.translate_agent.send_segments_async(original_texts, self.chunk_size)
|
||||||
document.content = await asyncio.to_thread(
|
document.content = await asyncio.to_thread(
|
||||||
self._after_translate, soup, translatable_items, translated_texts, original_texts
|
self._after_translate, soup, translatable_items, translated_texts, original_texts
|
||||||
|
|||||||
@@ -75,7 +75,9 @@ class JsonTranslator(AiTranslator):
|
|||||||
return self
|
return self
|
||||||
|
|
||||||
original_texts = [match.value for match in all_matches]
|
original_texts = [match.value for match in all_matches]
|
||||||
|
if self.glossary_agent:
|
||||||
|
glossary_dict = self.glossary_agent.send_segments(original_texts, self.chunk_size)
|
||||||
|
self.translate_agent.update_glossary_dict(glossary_dict)
|
||||||
# 步骤 2: 批量翻译提取出的文本
|
# 步骤 2: 批量翻译提取出的文本
|
||||||
translated_texts = self.translate_agent.send_segments(original_texts, self.chunk_size)
|
translated_texts = self.translate_agent.send_segments(original_texts, self.chunk_size)
|
||||||
|
|
||||||
@@ -104,6 +106,10 @@ class JsonTranslator(AiTranslator):
|
|||||||
|
|
||||||
original_texts = [match.value for match in all_matches]
|
original_texts = [match.value for match in all_matches]
|
||||||
|
|
||||||
|
if self.glossary_agent:
|
||||||
|
glossary_dict = await self.glossary_agent.send_segments_async(original_texts, self.chunk_size)
|
||||||
|
self.translate_agent.update_glossary_dict(glossary_dict)
|
||||||
|
|
||||||
# 步骤 2: 批量翻译提取出的文本
|
# 步骤 2: 批量翻译提取出的文本
|
||||||
translated_texts = await self.translate_agent.send_segments_async(original_texts, self.chunk_size)
|
translated_texts = await self.translate_agent.send_segments_async(original_texts, self.chunk_size)
|
||||||
|
|
||||||
|
|||||||
@@ -37,6 +37,9 @@ class MDTranslator(AiTranslator):
|
|||||||
self.logger.info("正在翻译markdown")
|
self.logger.info("正在翻译markdown")
|
||||||
with MDMaskUrisContext(document):
|
with MDMaskUrisContext(document):
|
||||||
chunks: list[str] = split_markdown_text(document.content.decode(), self.chunk_size)
|
chunks: list[str] = split_markdown_text(document.content.decode(), self.chunk_size)
|
||||||
|
if self.glossary_agent:
|
||||||
|
glossary_dict = self.glossary_agent.send_segments(chunks, self.chunk_size)
|
||||||
|
self.translate_agent.update_glossary_dict(glossary_dict)
|
||||||
self.logger.info(f"markdown分为{len(chunks)}块")
|
self.logger.info(f"markdown分为{len(chunks)}块")
|
||||||
result: list[str] = self.translate_agent.send_chunks(chunks)
|
result: list[str] = self.translate_agent.send_chunks(chunks)
|
||||||
content = join_markdown_texts(result)
|
content = join_markdown_texts(result)
|
||||||
@@ -52,6 +55,11 @@ class MDTranslator(AiTranslator):
|
|||||||
self.logger.info("正在翻译markdown")
|
self.logger.info("正在翻译markdown")
|
||||||
with MDMaskUrisContext(document):
|
with MDMaskUrisContext(document):
|
||||||
chunks: list[str] = split_markdown_text(document.content.decode(), self.chunk_size)
|
chunks: list[str] = split_markdown_text(document.content.decode(), self.chunk_size)
|
||||||
|
|
||||||
|
if self.glossary_agent:
|
||||||
|
glossary_dict = await self.glossary_agent.send_segments_async(chunks, self.chunk_size)
|
||||||
|
self.translate_agent.update_glossary_dict(glossary_dict)
|
||||||
|
|
||||||
self.logger.info(f"markdown分为{len(chunks)}块")
|
self.logger.info(f"markdown分为{len(chunks)}块")
|
||||||
result: list[str] = await self.translate_agent.send_chunks_async(chunks)
|
result: list[str] = await self.translate_agent.send_chunks_async(chunks)
|
||||||
|
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ from typing import Self, Literal
|
|||||||
|
|
||||||
import srt # 导入srt库来处理字幕文件
|
import srt # 导入srt库来处理字幕文件
|
||||||
|
|
||||||
|
from docutranslate.agents.glossary_agent import GlossaryAgentConfig, GlossaryAgent
|
||||||
from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent
|
from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent
|
||||||
from docutranslate.ir.document import Document
|
from docutranslate.ir.document import Document
|
||||||
from docutranslate.translator.ai_translator.base import AiTranslatorConfig, AiTranslator
|
from docutranslate.translator.ai_translator.base import AiTranslatorConfig, AiTranslator
|
||||||
@@ -108,7 +109,9 @@ class SrtTranslator(AiTranslator):
|
|||||||
if not original_texts:
|
if not original_texts:
|
||||||
self.logger.info("\n文件中没有找到需要翻译的字幕内容。")
|
self.logger.info("\n文件中没有找到需要翻译的字幕内容。")
|
||||||
return self
|
return self
|
||||||
|
if self.glossary_agent:
|
||||||
|
glossary_dict = self.glossary_agent.send_segments(original_texts, self.chunk_size)
|
||||||
|
self.translate_agent.update_glossary_dict(glossary_dict)
|
||||||
# --- 步骤 2: 调用翻译Agent ---
|
# --- 步骤 2: 调用翻译Agent ---
|
||||||
translated_texts = self.translate_agent.send_segments(original_texts, self.chunk_size)
|
translated_texts = self.translate_agent.send_segments(original_texts, self.chunk_size)
|
||||||
|
|
||||||
@@ -127,6 +130,10 @@ class SrtTranslator(AiTranslator):
|
|||||||
self.logger.info("\n文件中没有找到需要翻译的字幕内容。")
|
self.logger.info("\n文件中没有找到需要翻译的字幕内容。")
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
if self.glossary_agent:
|
||||||
|
glossary_dict = await self.glossary_agent.send_segments_async(original_texts, self.chunk_size)
|
||||||
|
self.translate_agent.update_glossary_dict(glossary_dict)
|
||||||
|
|
||||||
# --- 步骤 2: 调用翻译Agent (异步) ---
|
# --- 步骤 2: 调用翻译Agent (异步) ---
|
||||||
translated_texts = await self.translate_agent.send_segments_async(original_texts, self.chunk_size)
|
translated_texts = await self.translate_agent.send_segments_async(original_texts, self.chunk_size)
|
||||||
|
|
||||||
|
|||||||
@@ -33,6 +33,9 @@ class TXTTranslator(AiTranslator):
|
|||||||
def translate(self, document: Document) -> Self:
|
def translate(self, document: Document) -> Self:
|
||||||
self.logger.info("正在翻译txt")
|
self.logger.info("正在翻译txt")
|
||||||
chunks: list[str] = split_markdown_text(document.content.decode(), max_block_size=self.chunk_size)
|
chunks: list[str] = split_markdown_text(document.content.decode(), max_block_size=self.chunk_size)
|
||||||
|
if self.glossary_agent:
|
||||||
|
glossary_dict = self.glossary_agent.send_segments(chunks, self.chunk_size)
|
||||||
|
self.translate_agent.update_glossary_dict(glossary_dict)
|
||||||
self.logger.info(f"txt分为{len(chunks)}块")
|
self.logger.info(f"txt分为{len(chunks)}块")
|
||||||
result: list[str] = self.translate_agent.send_chunks(chunks)
|
result: list[str] = self.translate_agent.send_chunks(chunks)
|
||||||
content = "\n".join(result)
|
content = "\n".join(result)
|
||||||
@@ -43,6 +46,11 @@ class TXTTranslator(AiTranslator):
|
|||||||
async def translate_async(self, document: Document) -> Self:
|
async def translate_async(self, document: Document) -> Self:
|
||||||
self.logger.info("正在翻译txt")
|
self.logger.info("正在翻译txt")
|
||||||
chunks: list[str] = split_markdown_text(document.content.decode(), max_block_size=self.chunk_size)
|
chunks: list[str] = split_markdown_text(document.content.decode(), max_block_size=self.chunk_size)
|
||||||
|
|
||||||
|
if self.glossary_agent:
|
||||||
|
glossary_dict = await self.glossary_agent.send_segments_async(chunks, self.chunk_size)
|
||||||
|
self.translate_agent.update_glossary_dict(glossary_dict)
|
||||||
|
|
||||||
self.logger.info(f"txt分为{len(chunks)}块")
|
self.logger.info(f"txt分为{len(chunks)}块")
|
||||||
result: list[str] = await self.translate_agent.send_chunks_async(chunks)
|
result: list[str] = await self.translate_agent.send_chunks_async(chunks)
|
||||||
content = "\n".join(result)
|
content = "\n".join(result)
|
||||||
|
|||||||
@@ -155,6 +155,9 @@ class XlsxTranslator(AiTranslator):
|
|||||||
print("\n在指定区域中没有找到需要翻译的纯文本内容。")
|
print("\n在指定区域中没有找到需要翻译的纯文本内容。")
|
||||||
workbook.close()
|
workbook.close()
|
||||||
return self
|
return self
|
||||||
|
if self.glossary_agent:
|
||||||
|
glossary_dict = self.glossary_agent.send_segments(original_texts, self.chunk_size)
|
||||||
|
self.translate_agent.update_glossary_dict(glossary_dict)
|
||||||
# --- 步骤 2: 调用翻译函数 ---
|
# --- 步骤 2: 调用翻译函数 ---
|
||||||
translated_texts = self.translate_agent.send_segments(original_texts, self.chunk_size)
|
translated_texts = self.translate_agent.send_segments(original_texts, self.chunk_size)
|
||||||
|
|
||||||
@@ -168,6 +171,11 @@ class XlsxTranslator(AiTranslator):
|
|||||||
print("\n在指定区域中没有找到需要翻译的纯文本内容。")
|
print("\n在指定区域中没有找到需要翻译的纯文本内容。")
|
||||||
workbook.close()
|
workbook.close()
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
if self.glossary_agent:
|
||||||
|
glossary_dict = await self.glossary_agent.send_segments_async(original_texts, self.chunk_size)
|
||||||
|
self.translate_agent.update_glossary_dict(glossary_dict)
|
||||||
|
|
||||||
# --- 步骤 2: 调用翻译函数 ---
|
# --- 步骤 2: 调用翻译函数 ---
|
||||||
translated_texts = await self.translate_agent.send_segments_async(original_texts, self.chunk_size)
|
translated_texts = await self.translate_agent.send_segments_async(original_texts, self.chunk_size)
|
||||||
|
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ from docutranslate.ir.document import Document
|
|||||||
class WorkflowConfig:
|
class WorkflowConfig:
|
||||||
logger: Logger | None = None
|
logger: Logger | None = None
|
||||||
|
|
||||||
|
|
||||||
T_Config = TypeVar("T_Config", bound=WorkflowConfig)
|
T_Config = TypeVar("T_Config", bound=WorkflowConfig)
|
||||||
T_original = TypeVar('T_original', bound=Document)
|
T_original = TypeVar('T_original', bound=Document)
|
||||||
T_Translated = TypeVar('T_Translated', bound=Document)
|
T_Translated = TypeVar('T_Translated', bound=Document)
|
||||||
@@ -56,5 +57,3 @@ class Workflow(ABC, Generic[T_Config,T_original, T_Translated]):
|
|||||||
output_path.write_bytes(docu.content)
|
output_path.write_bytes(docu.content)
|
||||||
self.logger.info(f"文件已保存到{output_path.resolve()}")
|
self.logger.info(f"文件已保存到{output_path.resolve()}")
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user