This commit is contained in:
xunbu
2025-08-27 13:39:40 +08:00
parent 3e39035b3c
commit b1d68c2fc0
8 changed files with 36 additions and 47 deletions

View File

@@ -1,20 +1,16 @@
import asyncio import asyncio
from dataclasses import dataclass, field from dataclasses import dataclass
from io import BytesIO from io import BytesIO
from typing import Self, Literal, List, Dict, Any, Tuple from typing import Self, Literal, List, Dict, Any, Tuple
import docx import docx
from docx.document import Document as DocumentObject from docx.document import Document as DocumentObject
from docx.oxml.ns import nsdecls
from docx.oxml import OxmlElement
from docx.table import _Cell, Table
from docx.text.paragraph import Paragraph from docx.text.paragraph import Paragraph
from docx.text.run import Run from docx.text.run import Run
from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent
from docutranslate.ir.document import Document from docutranslate.ir.document import Document
from docutranslate.translator.ai_translator.base import AiTranslatorConfig from docutranslate.translator.ai_translator.base import AiTranslatorConfig, AiTranslator
from docutranslate.translator.base import Translator
def is_image_run(run: Run) -> bool: def is_image_run(run: Run) -> bool:
@@ -32,7 +28,7 @@ class DocxTranslatorConfig(AiTranslatorConfig):
separator: str = "\n" separator: str = "\n"
class DocxTranslator(Translator): class DocxTranslator(AiTranslator):
""" """
用于翻译 .docx 文件的翻译器。 用于翻译 .docx 文件的翻译器。
此版本经过优化,可以处理图文混排的段落而不会丢失图片。 此版本经过优化,可以处理图文混排的段落而不会丢失图片。

View File

@@ -1,7 +1,7 @@
import asyncio import asyncio
import os import os
import zipfile
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
import zipfile
from dataclasses import dataclass from dataclasses import dataclass
from io import BytesIO from io import BytesIO
from typing import Self, Literal, List, Dict, Any from typing import Self, Literal, List, Dict, Any
@@ -10,8 +10,7 @@ from bs4 import BeautifulSoup
from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent
from docutranslate.ir.document import Document from docutranslate.ir.document import Document
from docutranslate.translator.ai_translator.base import AiTranslatorConfig from docutranslate.translator.ai_translator.base import AiTranslatorConfig, AiTranslator
from docutranslate.translator.base import Translator
@dataclass @dataclass
@@ -20,7 +19,7 @@ class EpubTranslatorConfig(AiTranslatorConfig):
separator: str = "\n" separator: str = "\n"
class EpubTranslator(Translator): class EpubTranslator(AiTranslator):
""" """
一个用于翻译 EPUB 文件中内容的翻译器。 一个用于翻译 EPUB 文件中内容的翻译器。
此版本使用内置的 `zipfile` 和 `xml` 库,不依赖 `ebooklib`。 此版本使用内置的 `zipfile` 和 `xml` 库,不依赖 `ebooklib`。

View File

@@ -2,12 +2,11 @@ import asyncio
from dataclasses import dataclass from dataclasses import dataclass
from typing import Self, Literal, Set, Dict, List, Tuple from typing import Self, Literal, Set, Dict, List, Tuple
from bs4 import BeautifulSoup, NavigableString, Tag, Comment from bs4 import BeautifulSoup, NavigableString, Comment
from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent
from docutranslate.ir.document import Document from docutranslate.ir.document import Document
from docutranslate.translator.ai_translator.base import AiTranslatorConfig from docutranslate.translator.ai_translator.base import AiTranslatorConfig, AiTranslator
from docutranslate.translator.base import Translator
# --- 规则定义 --- # --- 规则定义 ---
@@ -16,16 +15,16 @@ from docutranslate.translator.base import Translator
# 在预处理阶段,这些标签及其所有子元素将被直接从文档中移除,以确保它们不会被意外修改。 # 在预处理阶段,这些标签及其所有子元素将被直接从文档中移除,以确保它们不会被意外修改。
NON_TRANSLATABLE_TAGS: Set[str] = { NON_TRANSLATABLE_TAGS: Set[str] = {
'script', # JavaScript代码 'script', # JavaScript代码
'style', # CSS样式 'style', # CSS样式
'pre', # 预格式化文本,通常用于代码块 'pre', # 预格式化文本,通常用于代码块
'code', # 行内代码 'code', # 行内代码
'kbd', # 键盘输入 'kbd', # 键盘输入
'samp', # 示例输出 'samp', # 示例输出
'var', # 变量 'var', # 变量
'noscript',# script未启用时的内容 'noscript', # script未启用时的内容
'meta', # 元数据 'meta', # 元数据
'link', # 外部资源链接 'link', # 外部资源链接
'head', # 文档头部,通常不包含可见的可翻译内容 'head', # 文档头部,通常不包含可见的可翻译内容
} }
# 2. 可翻译标签(白名单) # 2. 可翻译标签(白名单)
@@ -73,7 +72,7 @@ class HtmlTranslatorConfig(AiTranslatorConfig):
separator: str = " " # HTML中用空格作为默认分隔符可能更合适 separator: str = " " # HTML中用空格作为默认分隔符可能更合适
class HtmlTranslator(Translator): class HtmlTranslator(AiTranslator):
""" """
一个用于翻译 HTML 文件内容的翻译器。 一个用于翻译 HTML 文件内容的翻译器。
它采用黑白名单结合的策略,以最大程度地保留页面样式和功能: 它采用黑白名单结合的策略,以最大程度地保留页面样式和功能:
@@ -133,7 +132,7 @@ class HtmlTranslator(Translator):
# --- 2b. 翻译安全标签内的安全属性 --- # --- 2b. 翻译安全标签内的安全属性 ---
attributes_to_check = SAFE_ATTRIBUTES.get(tag.name, []) + SAFE_ATTRIBUTES.get('*', []) attributes_to_check = SAFE_ATTRIBUTES.get(tag.name, []) + SAFE_ATTRIBUTES.get('*', [])
for attr in set(attributes_to_check): # 使用set去重 for attr in set(attributes_to_check): # 使用set去重
if tag.has_attr(attr) and tag[attr].strip(): if tag.has_attr(attr) and tag[attr].strip():
value = tag[attr] value = tag[attr]
translatable_items.append({'type': 'attribute', 'tag': tag, 'attribute': attr}) translatable_items.append({'type': 'attribute', 'tag': tag, 'attribute': attr})
@@ -171,7 +170,7 @@ class HtmlTranslator(Translator):
new_content = translated_text + self.separator + original_text new_content = translated_text + self.separator + original_text
else: else:
self.logger.error(f"不正确的HtmlTranslatorConfig参数: insert_mode='{self.insert_mode}'") self.logger.error(f"不正确的HtmlTranslatorConfig参数: insert_mode='{self.insert_mode}'")
new_content = original_text # 出错时恢复原文 new_content = original_text # 出错时恢复原文
# 根据类型将内容写回 # 根据类型将内容写回
if item['type'] == 'node': if item['type'] == 'node':

View File

@@ -6,8 +6,7 @@ from jsonpath_ng.ext import parse
from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent
from docutranslate.ir.document import Document from docutranslate.ir.document import Document
from docutranslate.translator.ai_translator.base import AiTranslatorConfig from docutranslate.translator.ai_translator.base import AiTranslatorConfig, AiTranslator
from docutranslate.translator.base import Translator
@dataclass @dataclass
@@ -15,7 +14,7 @@ class JsonTranslatorConfig(AiTranslatorConfig):
json_paths: list[str] json_paths: list[str]
class JsonTranslator(Translator): class JsonTranslator(AiTranslator):
def __init__(self, config: JsonTranslatorConfig): def __init__(self, config: JsonTranslatorConfig):
super().__init__(config=config) super().__init__(config=config)
self.chunk_size = config.chunk_size self.chunk_size = config.chunk_size

View File

@@ -6,8 +6,7 @@ from docutranslate.agents import MDTranslateAgent
from docutranslate.agents.markdown_agent import MDTranslateAgentConfig from docutranslate.agents.markdown_agent import MDTranslateAgentConfig
from docutranslate.context.md_mask_context import MDMaskUrisContext from docutranslate.context.md_mask_context import MDMaskUrisContext
from docutranslate.ir.markdown_document import MarkdownDocument from docutranslate.ir.markdown_document import MarkdownDocument
from docutranslate.translator.ai_translator.base import AiTranslatorConfig from docutranslate.translator.ai_translator.base import AiTranslatorConfig, AiTranslator
from docutranslate.translator.base import Translator
from docutranslate.utils.markdown_splitter import split_markdown_text, join_markdown_texts from docutranslate.utils.markdown_splitter import split_markdown_text, join_markdown_texts
@@ -16,7 +15,7 @@ class MDTranslatorConfig(AiTranslatorConfig):
... ...
class MDTranslator(Translator): class MDTranslator(AiTranslator):
def __init__(self, config: MDTranslatorConfig): def __init__(self, config: MDTranslatorConfig):
super().__init__(config=config) super().__init__(config=config)
self.chunk_size = config.chunk_size self.chunk_size = config.chunk_size

View File

@@ -6,8 +6,7 @@ import srt # 导入srt库来处理字幕文件
from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent
from docutranslate.ir.document import Document from docutranslate.ir.document import Document
from docutranslate.translator.ai_translator.base import AiTranslatorConfig from docutranslate.translator.ai_translator.base import AiTranslatorConfig, AiTranslator
from docutranslate.translator.base import Translator
@dataclass @dataclass
@@ -16,7 +15,7 @@ class SrtTranslatorConfig(AiTranslatorConfig):
separator: str = "\n" separator: str = "\n"
class SrtTranslator(Translator): class SrtTranslator(AiTranslator):
""" """
一个用于翻译 SRT (.srt) 字幕文件的翻译器。 一个用于翻译 SRT (.srt) 字幕文件的翻译器。
它会提取每个字幕块的文本内容,进行翻译,然后根据配置将译文写回。 它会提取每个字幕块的文本内容,进行翻译,然后根据配置将译文写回。

View File

@@ -3,8 +3,7 @@ from typing import Self
from docutranslate.agents.txt_agent import TXTTranslateAgent, TXTTranslateAgentConfig from docutranslate.agents.txt_agent import TXTTranslateAgent, TXTTranslateAgentConfig
from docutranslate.ir.document import Document from docutranslate.ir.document import Document
from docutranslate.translator.ai_translator.base import AiTranslatorConfig from docutranslate.translator.ai_translator.base import AiTranslatorConfig, AiTranslator
from docutranslate.translator.base import Translator
from docutranslate.utils.markdown_splitter import split_markdown_text from docutranslate.utils.markdown_splitter import split_markdown_text
@@ -13,7 +12,7 @@ class TXTTranslatorConfig(AiTranslatorConfig):
... ...
class TXTTranslator(Translator): class TXTTranslator(AiTranslator):
def __init__(self, config: TXTTranslatorConfig): def __init__(self, config: TXTTranslatorConfig):
super().__init__(config=config) super().__init__(config=config)
self.chunk_size = config.chunk_size self.chunk_size = config.chunk_size

View File

@@ -8,8 +8,7 @@ from openpyxl.cell import Cell
from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent
from docutranslate.ir.document import Document from docutranslate.ir.document import Document
from docutranslate.translator.ai_translator.base import AiTranslatorConfig from docutranslate.translator.ai_translator.base import AiTranslatorConfig, AiTranslator
from docutranslate.translator.base import Translator
@dataclass @dataclass
@@ -23,7 +22,7 @@ class XlsxTranslatorConfig(AiTranslatorConfig):
translate_regions: Optional[List[str]] = None translate_regions: Optional[List[str]] = None
class XlsxTranslator(Translator): class XlsxTranslator(AiTranslator):
def __init__(self, config: XlsxTranslatorConfig): def __init__(self, config: XlsxTranslatorConfig):
super().__init__(config=config) super().__init__(config=config)
self.chunk_size = config.chunk_size self.chunk_size = config.chunk_size