This commit is contained in:
xunbu
2025-08-27 13:39:40 +08:00
parent 3e39035b3c
commit b1d68c2fc0
8 changed files with 36 additions and 47 deletions

View File

@@ -1,20 +1,16 @@
import asyncio
from dataclasses import dataclass, field
from dataclasses import dataclass
from io import BytesIO
from typing import Self, Literal, List, Dict, Any, Tuple
import docx
from docx.document import Document as DocumentObject
from docx.oxml.ns import nsdecls
from docx.oxml import OxmlElement
from docx.table import _Cell, Table
from docx.text.paragraph import Paragraph
from docx.text.run import Run
from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent
from docutranslate.ir.document import Document
from docutranslate.translator.ai_translator.base import AiTranslatorConfig
from docutranslate.translator.base import Translator
from docutranslate.translator.ai_translator.base import AiTranslatorConfig, AiTranslator
def is_image_run(run: Run) -> bool:
@@ -32,7 +28,7 @@ class DocxTranslatorConfig(AiTranslatorConfig):
separator: str = "\n"
class DocxTranslator(Translator):
class DocxTranslator(AiTranslator):
"""
用于翻译 .docx 文件的翻译器。
此版本经过优化,可以处理图文混排的段落而不会丢失图片。
@@ -188,4 +184,4 @@ class DocxTranslator(Translator):
# 将翻译结果写回文档
document.content = await asyncio.to_thread(self._after_translate, doc, elements_to_translate, translated_texts,
original_texts)
return self
return self

View File

@@ -1,7 +1,7 @@
import asyncio
import os
import zipfile
import xml.etree.ElementTree as ET
import zipfile
from dataclasses import dataclass
from io import BytesIO
from typing import Self, Literal, List, Dict, Any
@@ -10,8 +10,7 @@ from bs4 import BeautifulSoup
from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent
from docutranslate.ir.document import Document
from docutranslate.translator.ai_translator.base import AiTranslatorConfig
from docutranslate.translator.base import Translator
from docutranslate.translator.ai_translator.base import AiTranslatorConfig, AiTranslator
@dataclass
@@ -20,7 +19,7 @@ class EpubTranslatorConfig(AiTranslatorConfig):
separator: str = "\n"
class EpubTranslator(Translator):
class EpubTranslator(AiTranslator):
"""
一个用于翻译 EPUB 文件中内容的翻译器。
此版本使用内置的 `zipfile` 和 `xml` 库,不依赖 `ebooklib`。
@@ -197,4 +196,4 @@ class EpubTranslator(Translator):
document.content = await asyncio.to_thread(
self._after_translate, all_files, items_to_translate, translated_texts, original_texts
)
return self
return self

View File

@@ -2,12 +2,11 @@ import asyncio
from dataclasses import dataclass
from typing import Self, Literal, Set, Dict, List, Tuple
from bs4 import BeautifulSoup, NavigableString, Tag, Comment
from bs4 import BeautifulSoup, NavigableString, Comment
from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent
from docutranslate.ir.document import Document
from docutranslate.translator.ai_translator.base import AiTranslatorConfig
from docutranslate.translator.base import Translator
from docutranslate.translator.ai_translator.base import AiTranslatorConfig, AiTranslator
# --- 规则定义 ---
@@ -16,16 +15,16 @@ from docutranslate.translator.base import Translator
# 在预处理阶段,这些标签及其所有子元素将被直接从文档中移除,以确保它们不会被意外修改。
NON_TRANSLATABLE_TAGS: Set[str] = {
'script', # JavaScript代码
'style', # CSS样式
'pre', # 预格式化文本,通常用于代码块
'code', # 行内代码
'kbd', # 键盘输入
'samp', # 示例输出
'var', # 变量
'noscript',# script未启用时的内容
'meta', # 元数据
'link', # 外部资源链接
'head', # 文档头部,通常不包含可见的可翻译内容
'style', # CSS样式
'pre', # 预格式化文本,通常用于代码块
'code', # 行内代码
'kbd', # 键盘输入
'samp', # 示例输出
'var', # 变量
'noscript', # script未启用时的内容
'meta', # 元数据
'link', # 外部资源链接
'head', # 文档头部,通常不包含可见的可翻译内容
}
# 2. 可翻译标签(白名单)
@@ -73,7 +72,7 @@ class HtmlTranslatorConfig(AiTranslatorConfig):
separator: str = " " # HTML中用空格作为默认分隔符可能更合适
class HtmlTranslator(Translator):
class HtmlTranslator(AiTranslator):
"""
一个用于翻译 HTML 文件内容的翻译器。
它采用黑白名单结合的策略,以最大程度地保留页面样式和功能:
@@ -133,7 +132,7 @@ class HtmlTranslator(Translator):
# --- 2b. 翻译安全标签内的安全属性 ---
attributes_to_check = SAFE_ATTRIBUTES.get(tag.name, []) + SAFE_ATTRIBUTES.get('*', [])
for attr in set(attributes_to_check): # 使用set去重
for attr in set(attributes_to_check): # 使用set去重
if tag.has_attr(attr) and tag[attr].strip():
value = tag[attr]
translatable_items.append({'type': 'attribute', 'tag': tag, 'attribute': attr})
@@ -171,7 +170,7 @@ class HtmlTranslator(Translator):
new_content = translated_text + self.separator + original_text
else:
self.logger.error(f"不正确的HtmlTranslatorConfig参数: insert_mode='{self.insert_mode}'")
new_content = original_text # 出错时恢复原文
new_content = original_text # 出错时恢复原文
# 根据类型将内容写回
if item['type'] == 'node':
@@ -217,4 +216,4 @@ class HtmlTranslator(Translator):
document.content = await asyncio.to_thread(
self._after_translate, soup, translatable_items, translated_texts, original_texts
)
return self
return self

View File

@@ -6,8 +6,7 @@ from jsonpath_ng.ext import parse
from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent
from docutranslate.ir.document import Document
from docutranslate.translator.ai_translator.base import AiTranslatorConfig
from docutranslate.translator.base import Translator
from docutranslate.translator.ai_translator.base import AiTranslatorConfig, AiTranslator
@dataclass
@@ -15,7 +14,7 @@ class JsonTranslatorConfig(AiTranslatorConfig):
json_paths: list[str]
class JsonTranslator(Translator):
class JsonTranslator(AiTranslator):
def __init__(self, config: JsonTranslatorConfig):
super().__init__(config=config)
self.chunk_size = config.chunk_size

View File

@@ -6,8 +6,7 @@ from docutranslate.agents import MDTranslateAgent
from docutranslate.agents.markdown_agent import MDTranslateAgentConfig
from docutranslate.context.md_mask_context import MDMaskUrisContext
from docutranslate.ir.markdown_document import MarkdownDocument
from docutranslate.translator.ai_translator.base import AiTranslatorConfig
from docutranslate.translator.base import Translator
from docutranslate.translator.ai_translator.base import AiTranslatorConfig, AiTranslator
from docutranslate.utils.markdown_splitter import split_markdown_text, join_markdown_texts
@@ -16,7 +15,7 @@ class MDTranslatorConfig(AiTranslatorConfig):
...
class MDTranslator(Translator):
class MDTranslator(AiTranslator):
def __init__(self, config: MDTranslatorConfig):
super().__init__(config=config)
self.chunk_size = config.chunk_size
@@ -64,4 +63,4 @@ class MDTranslator(Translator):
await asyncio.to_thread(run)
self.logger.info("翻译完成")
return self
return self

View File

@@ -6,8 +6,7 @@ import srt # 导入srt库来处理字幕文件
from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent
from docutranslate.ir.document import Document
from docutranslate.translator.ai_translator.base import AiTranslatorConfig
from docutranslate.translator.base import Translator
from docutranslate.translator.ai_translator.base import AiTranslatorConfig, AiTranslator
@dataclass
@@ -16,7 +15,7 @@ class SrtTranslatorConfig(AiTranslatorConfig):
separator: str = "\n"
class SrtTranslator(Translator):
class SrtTranslator(AiTranslator):
"""
一个用于翻译 SRT (.srt) 字幕文件的翻译器。
它会提取每个字幕块的文本内容,进行翻译,然后根据配置将译文写回。
@@ -134,4 +133,4 @@ class SrtTranslator(Translator):
document.content = await asyncio.to_thread(
self._after_translate, subtitles, translated_texts, original_texts
)
return self
return self

View File

@@ -3,8 +3,7 @@ from typing import Self
from docutranslate.agents.txt_agent import TXTTranslateAgent, TXTTranslateAgentConfig
from docutranslate.ir.document import Document
from docutranslate.translator.ai_translator.base import AiTranslatorConfig
from docutranslate.translator.base import Translator
from docutranslate.translator.ai_translator.base import AiTranslatorConfig, AiTranslator
from docutranslate.utils.markdown_splitter import split_markdown_text
@@ -13,7 +12,7 @@ class TXTTranslatorConfig(AiTranslatorConfig):
...
class TXTTranslator(Translator):
class TXTTranslator(AiTranslator):
def __init__(self, config: TXTTranslatorConfig):
super().__init__(config=config)
self.chunk_size = config.chunk_size

View File

@@ -8,8 +8,7 @@ from openpyxl.cell import Cell
from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent
from docutranslate.ir.document import Document
from docutranslate.translator.ai_translator.base import AiTranslatorConfig
from docutranslate.translator.base import Translator
from docutranslate.translator.ai_translator.base import AiTranslatorConfig, AiTranslator
@dataclass
@@ -23,7 +22,7 @@ class XlsxTranslatorConfig(AiTranslatorConfig):
translate_regions: Optional[List[str]] = None
class XlsxTranslator(Translator):
class XlsxTranslator(AiTranslator):
def __init__(self, config: XlsxTranslatorConfig):
super().__init__(config=config)
self.chunk_size = config.chunk_size