pptx支持
This commit is contained in:
0
docutranslate/exporter/pptx/__init__.py
Normal file
0
docutranslate/exporter/pptx/__init__.py
Normal file
11
docutranslate/exporter/pptx/base.py
Normal file
11
docutranslate/exporter/pptx/base.py
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
# SPDX-FileCopyrightText: 2025 QinHan
|
||||||
|
# SPDX-License-Identifier: MPL-2.0
|
||||||
|
|
||||||
|
from docutranslate.exporter.base import Exporter
|
||||||
|
from docutranslate.ir.document import Document
|
||||||
|
|
||||||
|
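# TODO: decide whether a dedicated Document type is needed here (the original note says "for json" - presumably carried over from the JSON exporter; confirm it should read PPTX)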
|
||||||
|
class PPTXExporter(Exporter[Document]):
    """Common base for exporters that take a .pptx ``Document`` as input."""

    def export(self, document: Document) -> Document:
        """Convert ``document`` into the exporter's output ``Document``.

        This base implementation is a placeholder; subclasses provide the
        actual conversion.
        """
        ...
|
||||||
93
docutranslate/exporter/pptx/pptx2html_exporter.py
Normal file
93
docutranslate/exporter/pptx/pptx2html_exporter.py
Normal file
@@ -0,0 +1,93 @@
|
|||||||
|
# SPDX-FileCopyrightText: 2025 QinHan
|
||||||
|
# SPDX-License-Identifier: MPL-2.0
|
||||||
|
|
||||||
|
import html
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from io import BytesIO
|
||||||
|
|
||||||
|
from pptx import Presentation
|
||||||
|
|
||||||
|
from docutranslate.exporter.base import ExporterConfig
|
||||||
|
from docutranslate.exporter.pptx.base import PPTXExporter
|
||||||
|
from docutranslate.ir.document import Document
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class PPTX2HTMLExporterConfig(ExporterConfig):
    """Options for :class:`PPTX2HTMLExporter`."""

    # Stored on the exporter as ``self.cdn``; presumably selects CDN-hosted
    # assets, as in the other HTML exporters (confirm).
    cdn: bool = True
    # Whether slides marked hidden in the presentation should be exported.
    include_hidden_slides: bool = False
|
||||||
|
|
||||||
|
|
||||||
|
class PPTX2HTMLExporter(PPTXExporter):
    """Render the text content of a .pptx document as a plain HTML page.

    Every exported slide becomes a ``<div class="slide">`` containing the
    slide title (if any), one ``<p>`` per non-empty paragraph and one
    ``<table>`` per table shape. Slides flagged as hidden are skipped unless
    ``include_hidden_slides`` is enabled in the config.
    """

    def __init__(self, config: PPTX2HTMLExporterConfig | None = None):
        """Store the exporter options, falling back to the default config."""
        config = config or PPTX2HTMLExporterConfig()
        super().__init__(config=config)
        self.cdn = config.cdn
        self.include_hidden_slides = getattr(config, 'include_hidden_slides', False)

    @staticmethod
    def _is_hidden_slide(slide) -> bool:
        """Return True when the slide's ``<p:sld>`` element carries ``show="0"``.

        python-pptx exposes no public "hidden" property, so the XML attribute
        is read directly. ``show`` is an xsd:boolean that defaults to true, so
        both ``"0"`` and ``"false"`` mean hidden.
        """
        return slide.element.get("show") in ("0", "false")

    @staticmethod
    def _render_table(table) -> list[str]:
        """Return the HTML fragments for one table, one ``<td>`` per cell."""
        fragments = ['<table border="1" style="border-collapse: collapse; width: 100%;">']
        for row in table.rows:
            fragments.append('<tr>')
            for cell in row.cells:
                cell_text = html.escape(cell.text_frame.text) if cell.text_frame else ""
                fragments.append(f'<td style="padding: 5px;">{cell_text}</td>')
            fragments.append('</tr>')
        fragments.append('</table>')
        return fragments

    def _render_slide(self, slide, number: int) -> str:
        """Return the HTML for a single slide.

        Args:
            slide: The python-pptx slide to render.
            number: 1-based position of the slide in the presentation; used
                for the ``id`` attribute of the slide container.
        """
        slide_html = [f'<div class="slide" id="slide-{number}">']

        # 1. Title placeholder, rendered first with its own CSS class.
        title = slide.shapes.title
        if title is not None and title.has_text_frame and title.text.strip():
            escaped_title = html.escape(title.text)
            slide_html.append(f'<div class="slide-title">{escaped_title}</div>')

        # 2. Every other shape: paragraphs first, then table content.
        for shape in slide.shapes:
            if shape == title:
                # The title has already been handled above.
                continue

            if hasattr(shape, "has_text_frame") and shape.has_text_frame:
                for paragraph in shape.text_frame.paragraphs:
                    text = paragraph.text.strip()
                    if text:
                        # Each paragraph becomes a <p>; paragraph.level
                        # (list indentation) is not reflected.
                        slide_html.append(f'<p>{html.escape(text)}</p>')

            if shape.has_table:
                slide_html.extend(self._render_table(shape.table))

        slide_html.append('</div>')
        return "".join(slide_html)

    def export(self, document: Document) -> Document:
        """Convert a .pptx ``Document`` into an ``.html`` ``Document``.

        Args:
            document: Document whose ``content`` holds the raw .pptx bytes.

        Returns:
            A new Document with UTF-8 encoded HTML, suffix ``.html`` and the
            same stem as the input.
        """
        # python-pptx loads the presentation from the in-memory bytes.
        prs = Presentation(BytesIO(document.content))

        html_parts = [
            "<!DOCTYPE html><html><head><meta charset='utf-8'>",
            "<style>",
            ".slide { border: 1px solid #ccc; margin: 20px auto; padding: 20px; max-width: 800px; }",
            ".slide-title { font-size: 1.5em; font-weight: bold; margin-bottom: 10px; }",
            "</style>",
            "</head><body>",
        ]

        for i, slide in enumerate(prs.slides):
            # Honour the include_hidden_slides option. Slide ids keep the
            # slide's position in the deck even when earlier slides are skipped.
            if not self.include_hidden_slides and self._is_hidden_slide(slide):
                continue
            html_parts.append(self._render_slide(slide, i + 1))

        html_parts.append("</body></html>")
        full_html = "\n".join(html_parts)

        return Document.from_bytes(
            content=full_html.encode("utf-8"),
            suffix=".html",
            stem=document.stem,
        )
|
||||||
10
docutranslate/exporter/pptx/pptx2pptx_exporter.py
Normal file
10
docutranslate/exporter/pptx/pptx2pptx_exporter.py
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
# SPDX-FileCopyrightText: 2025 QinHan
|
||||||
|
# SPDX-License-Identifier: MPL-2.0
|
||||||
|
|
||||||
|
from docutranslate.exporter.docx.base import DocxExporter
|
||||||
|
from docutranslate.ir.document import Document
|
||||||
|
|
||||||
|
|
||||||
|
class PPTX2PPTXExporter(DocxExporter):
    """Pass-through exporter: the translated .pptx is already the output."""

    # NOTE(review): this derives from DocxExporter although a PPTXExporter
    # base exists in docutranslate.exporter.pptx.base - looks like a
    # copy-paste from the docx exporter; confirm which base is intended.

    def export(self, document: Document) -> Document:
        """Return a copy of ``document`` without modifying its content."""
        return document.copy()
|
||||||
356
docutranslate/translator/ai_translator/pptx_translator.py
Normal file
356
docutranslate/translator/ai_translator/pptx_translator.py
Normal file
@@ -0,0 +1,356 @@
|
|||||||
|
# SPDX-FileCopyrightText: 2025 QinHan
|
||||||
|
# SPDX-License-Identifier: MPL-2.0
|
||||||
|
import asyncio
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from io import BytesIO
|
||||||
|
from typing import Self, Literal, List, Dict, Any, Tuple, Optional
|
||||||
|
|
||||||
|
from pptx import Presentation
|
||||||
|
from pptx.enum.shapes import MSO_SHAPE_TYPE
|
||||||
|
from pptx.enum.text import MSO_AUTO_SIZE
|
||||||
|
from pptx.enum.dml import MSO_COLOR_TYPE
|
||||||
|
from pptx.text.text import _Paragraph, TextFrame
|
||||||
|
from pptx.oxml.ns import qn
|
||||||
|
|
||||||
|
from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent
|
||||||
|
from docutranslate.ir.document import Document
|
||||||
|
from docutranslate.translator.ai_translator.base import AiTranslatorConfig, AiTranslator
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------- 配置类 ----------------
|
||||||
|
@dataclass
class PPTXTranslatorConfig(AiTranslatorConfig):
    """Options specific to :class:`PPTXTranslator`."""

    # How the translation is written back: replace the original text, or
    # place it after ("append") / before ("prepend") the original.
    insert_mode: Literal["replace", "append", "prepend"] = "replace"
    # Text placed between original and translation in append/prepend mode.
    separator: str = "\n"
    # East Asian (CJK) typeface applied to translated runs, so that the text
    # does not fall back to a default font such as SimSun.
    # "Microsoft YaHei" or "DengXian" are the recommended choices.
    target_cjk_font: str = "Microsoft YaHei"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------- 主类 ----------------
|
||||||
|
class PPTXTranslator(AiTranslator):
    """Translator for .pptx files built on python-pptx.

    Features:
    1. Deep traversal: slides, notes, slide masters, layouts, and text that
       sits inside ``mc:AlternateContent`` (markup-compatibility) blocks.
    2. Formula protection: runs separated by a non-run element (e.g. a math
       formula) are never merged, so text is written back in place.
    3. Fonts: only the East Asian typeface of translated runs is set; the
       Latin typeface is left untouched.
    4. Layout: text frames with auto-size NONE are switched to
       "shrink text to fit shape" after translation.
    """

    def __init__(self, config: PPTXTranslatorConfig):
        """Build the segment translation agent (unless translation is skipped)."""
        super().__init__(config=config)
        self.chunk_size = config.chunk_size
        self.translate_agent = None
        if not self.skip_translate:
            agent_config = SegmentsTranslateAgentConfig(
                custom_prompt=config.custom_prompt, to_lang=config.to_lang, base_url=config.base_url,
                api_key=config.api_key, model_id=config.model_id, temperature=config.temperature,
                thinking=config.thinking, concurrent=config.concurrent, timeout=config.timeout,
                logger=self.logger, glossary_dict=config.glossary_dict, retry=config.retry,
                system_proxy_enable=config.system_proxy_enable, force_json=config.force_json
            )
            self.translate_agent = SegmentsTranslateAgent(agent_config)
        self.insert_mode = config.insert_mode
        self.separator = config.separator
        self.target_cjk_font = config.target_cjk_font

    # ---------------- Helpers: style and fonts ----------------

    def _get_font_signature(self, run) -> Tuple:
        """Return a tuple describing the run's visible font style.

        Two runs with equal signatures are candidates for being merged into
        one translation segment.
        """
        # Local import: MSO_FILL is only needed here.
        from pptx.enum.dml import MSO_FILL

        font = run.font
        color_key = None

        # Read the colour through font.fill rather than font.color:
        # Font.color converts a non-solid fill into a solid one as a side
        # effect, which would rewrite the run's XML just by inspecting it.
        try:
            fill = font.fill
            if fill.type == MSO_FILL.SOLID:
                color = fill.fore_color
                if color.type == MSO_COLOR_TYPE.RGB:
                    color_key = str(color.rgb)
                elif color.type == MSO_COLOR_TYPE.THEME:
                    color_key = f"THEME_{color.theme_color}_{color.brightness}"
        except AttributeError:
            # Colour kinds without rgb/theme accessors are treated as "no colour".
            pass

        return (
            font.name,
            font.size,
            font.bold,
            font.italic,
            font.underline,
            color_key
        )

    def _have_same_significant_styles(self, run1, run2) -> bool:
        """Return True when two runs share a style and are XML-adjacent.

        Adjacency matters: if something (e.g. an ``<m:oMath>`` formula) sits
        between the two ``<a:r>`` elements, merging them would move text
        across that element when the translation is written back.
        """
        if run1 is None or run2 is None:
            return False

        # 1. Visual style must match.
        if self._get_font_signature(run1) != self._get_font_signature(run2):
            return False

        # 2. The run elements must be direct neighbours under the same parent.
        try:
            r1_element = run1._r
            r2_element = run2._r
            parent = r1_element.getparent()

            if parent == r2_element.getparent():
                index1 = parent.index(r1_element)
                index2 = parent.index(r2_element)
                if index2 != index1 + 1:
                    return False  # something sits in between; do not merge
        except Exception:
            # If the low-level lookup fails, be conservative and do not merge.
            return False

        return True

    def _set_east_asian_font(self, run, font_name: str):
        """Set the East Asian (``a:ea``) typeface of a run.

        Only the ``a:ea`` child of the run properties is touched, so the
        Latin typeface keeps its current setting. Does nothing when
        ``font_name`` is empty.
        """
        if not font_name:
            return
        try:
            rPr = run._r.get_or_add_rPr()
            ea = rPr.find(qn('a:ea'))
            if ea is None:
                ea = rPr.makeelement(qn('a:ea'), {})
                # Schema order inside a:rPr is ... latin, ea, cs, sym,
                # hlinkClick, hlinkMouseOver, rtl, extLst; insert a:ea before
                # the first of its successors (or append when none exist).
                rPr.insert_element_before(
                    ea, 'a:cs', 'a:sym', 'a:hlinkClick', 'a:hlinkMouseOver', 'a:rtl', 'a:extLst'
                )
            # typeface is an un-namespaced attribute.
            ea.set('typeface', font_name)
        except Exception as e:
            # Best effort: a font failure must not abort the translation.
            self.logger.warning(f"设置东亚字体时出错: {e}")

    # ---------------- Core traversal ----------------

    def _process_text_frame(self, text_frame: TextFrame, elements: List[Dict[str, Any]], texts: List[str]):
        """Collect translatable segments from every paragraph of a text frame."""
        for paragraph in text_frame.paragraphs:
            self._process_paragraph(paragraph, elements, texts)

    def _process_paragraph(self, paragraph: _Paragraph, elements: List[Dict[str, Any]], texts: List[str]):
        """Split one paragraph into segments of mergeable runs.

        For every segment with non-blank text, an entry is appended to
        ``elements`` (the runs plus their paragraph and text frame) and the
        concatenated run text is appended to ``texts`` at the same index.
        """
        if not paragraph.runs:
            return

        current_runs = []

        def flush_segment():
            """Emit the runs gathered so far as one segment and reset."""
            if not current_runs:
                return
            full_text = "".join(r.text for r in current_runs)
            # Whitespace-only segments are not sent for translation.
            if full_text.strip():
                elements.append({
                    "type": "text_runs",
                    "runs": list(current_runs),
                    "paragraph": paragraph,
                    "text_frame": paragraph._parent
                })
                texts.append(full_text)
            current_runs.clear()

        for run in paragraph.runs:
            # run.text is plain text only; formula content is not part of it.
            if not run.text:
                continue

            last_run = current_runs[-1] if current_runs else None

            # A style change or a gap between the runs starts a new segment.
            if last_run and not self._have_same_significant_styles(last_run, run):
                flush_segment()

            current_runs.append(run)

        flush_segment()

    def _process_shape(self, shape, elements: List[Dict[str, Any]], texts: List[str]):
        """Collect text from a shape, recursing into group shapes."""
        # 1. Group shape: walk the children.
        if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
            for child_shape in shape.shapes:
                self._process_shape(child_shape, elements, texts)
            return

        # 2. Table: walk every cell.
        if shape.has_table:
            for row in shape.table.rows:
                for cell in row.cells:
                    if hasattr(cell, "text_frame") and cell.text_frame:
                        self._process_text_frame(cell.text_frame, elements, texts)
            return

        # 3. Ordinary text box.
        if shape.has_text_frame:
            try:
                self._process_text_frame(shape.text_frame, elements, texts)
            except Exception as e:
                # Skip the shape but leave a trace instead of failing silently.
                self.logger.warning(f"处理形状文本框时出错: {e}")

    def _scan_deep_xml_for_text(self, slide_element, elements: List[Dict[str, Any]], texts: List[str]):
        """Walk the XML tree for text the shape API does not reach.

        Looks at ``p:sp`` shapes inside the ``mc:Choice`` branch of every
        ``mc:AlternateContent`` block. The ``mc`` tag names are built by hand
        rather than with ``qn()``, which raised ``KeyError: 'mc'``.
        """
        MC_NS = "http://schemas.openxmlformats.org/markup-compatibility/2006"
        MC_ALT = f"{{{MC_NS}}}AlternateContent"
        MC_CHOICE = f"{{{MC_NS}}}Choice"

        # The 'p' (PresentationML) prefix is known to qn().
        P_SP = qn('p:sp')
        P_TXBODY = qn('p:txBody')

        for alt_content in slide_element.iter(MC_ALT):
            choice = alt_content.find(MC_CHOICE)
            if choice is None:
                continue

            for sp in choice.iter(P_SP):
                txBody = sp.find(P_TXBODY)
                if txBody is not None:
                    try:
                        # Wrap the raw element in a TextFrame by hand; a None
                        # parent is normally fine for reading/writing text.
                        tf = TextFrame(txBody, None)
                        self._process_text_frame(tf, elements, texts)
                    except Exception as e:
                        self.logger.warning(f"处理深度 XML 文本框时出错: {e}")

    def _scan_presentation_content(self, prs: Presentation, elements: List[Dict[str, Any]], texts: List[str]):
        """Collect segments from slides, notes, slide masters and layouts."""

        def scan_slide_object(slide_obj):
            """Scan one slide-like object (slide, master or layout)."""
            # 1. Shape API: plain text, tables, groups.
            for shape in slide_obj.shapes:
                self._process_shape(shape, elements, texts)

            # 2. Raw XML: text inside AlternateContent blocks.
            self._scan_deep_xml_for_text(slide_obj.element, elements, texts)

        # 1. Slides and their notes pages.
        for slide in prs.slides:
            scan_slide_object(slide)
            if slide.has_notes_slide:
                notes = slide.notes_slide
                if notes.notes_text_frame:
                    self._process_text_frame(notes.notes_text_frame, elements, texts)

        # 2. Slide masters, and 3. the layouts that belong to each master.
        for master in prs.slide_masters:
            scan_slide_object(master)

            for layout in master.slide_layouts:
                scan_slide_object(layout)

    # ---------------- Pre/post translation ----------------

    def _pre_translate(self, document: Document) -> Tuple[Presentation, List[Dict[str, Any]], List[str]]:
        """Parse the .pptx bytes and extract the segments to translate.

        Returns:
            The loaded presentation, the segment descriptors, and the segment
            texts (parallel to the descriptors).
        """
        prs = Presentation(BytesIO(document.content))
        elements, texts = [], []

        self._scan_presentation_content(prs, elements, texts)
        self.logger.info(f"共提取了 {len(texts)} 个文本片段 (包含隐藏的公式文本)。")
        return prs, elements, texts

    def _apply_translation(self, element_info: Dict[str, Any], final_text: str):
        """Write one translated segment back into its runs.

        The whole text goes into the first run of the segment; the remaining
        runs are emptied, which has the effect of merging them.
        """
        runs = element_info["runs"]
        if not runs:
            return

        original_text = "".join(r.text for r in runs)

        text_to_set = final_text
        if self.insert_mode == "append":
            text_to_set = original_text + self.separator + final_text
        elif self.insert_mode == "prepend":
            text_to_set = final_text + self.separator + original_text

        primary_run = runs[0]

        try:
            # 1. Write the text.
            primary_run.text = text_to_set

            # 2. Set the East Asian typeface (Latin typeface is kept).
            if self.target_cjk_font:
                self._set_east_asian_font(primary_run, self.target_cjk_font)

            # 3. Guard against overflow: let fixed-size frames shrink text.
            text_frame = element_info.get("text_frame")
            if text_frame and hasattr(text_frame, 'auto_size'):
                if text_frame.auto_size == MSO_AUTO_SIZE.NONE:
                    text_frame.auto_size = MSO_AUTO_SIZE.TEXT_TO_FIT_SHAPE

        except Exception as e:
            self.logger.warning(f"应用翻译到 Run 时出错: {e}")
            # Leave the remaining runs untouched so no original text is lost.
            return

        # Empty the trailing runs; their text now lives in the first run.
        for trailing_run in runs[1:]:
            trailing_run.text = ""

    def _after_translate(self, prs: Presentation, elements: List[Dict[str, Any]], translated: List[str],
                         originals: List[str]) -> bytes:
        """Apply the translations and serialise the presentation.

        Args:
            prs: The presentation the segments were extracted from.
            elements: Segment descriptors produced by ``_pre_translate``.
            translated: Translated texts, expected to be parallel to ``elements``.
            originals: Original texts; currently not used here.

        Returns:
            The bytes of the saved .pptx file.
        """
        if len(elements) != len(translated):
            # Pair up as many as possible, but make the mismatch visible.
            self.logger.warning(
                f"翻译结果数量({len(translated)})与文本片段数量({len(elements)})不一致,将按较短者截断。"
            )
            min_len = min(len(elements), len(translated))
            elements = elements[:min_len]
            translated = translated[:min_len]

        for info, trans in zip(elements, translated):
            self._apply_translation(info, trans)

        output_stream = BytesIO()
        prs.save(output_stream)
        return output_stream.getvalue()

    # ---------------- Public interface ----------------

    def translate(self, document: Document) -> Self:
        """Translate ``document`` in place (synchronous variant).

        ``document.content`` is replaced with the translated .pptx bytes.
        """
        prs, elements, originals = self._pre_translate(document)
        if not originals:
            self.logger.info("未找到可翻译文本。")
            document.content = self._after_translate(prs, elements, [], [])
            return self

        if self.glossary_agent:
            self.glossary_dict_gen = self.glossary_agent.send_segments(originals, self.chunk_size)
            if self.translate_agent:
                self.translate_agent.update_glossary_dict(self.glossary_dict_gen)

        # Without a translate agent the original texts are written back.
        translated = self.translate_agent.send_segments(originals,
                                                        self.chunk_size) if self.translate_agent else originals
        document.content = self._after_translate(prs, elements, translated, originals)
        return self

    async def translate_async(self, document: Document) -> Self:
        """Translate ``document`` in place (asynchronous variant).

        Parsing and saving run in a worker thread via ``asyncio.to_thread``.
        """
        prs, elements, originals = await asyncio.to_thread(self._pre_translate, document)
        if not originals:
            self.logger.info("未找到可翻译文本。")
            document.content = await asyncio.to_thread(self._after_translate, prs, elements, [], [])
            return self

        if self.glossary_agent:
            self.glossary_dict_gen = await self.glossary_agent.send_segments_async(originals, self.chunk_size)
            if self.translate_agent:
                self.translate_agent.update_glossary_dict(self.glossary_dict_gen)

        # Without a translate agent the original texts are written back.
        translated = await self.translate_agent.send_segments_async(originals,
                                                                    self.chunk_size) if self.translate_agent else originals
        document.content = await asyncio.to_thread(self._after_translate, prs, elements, translated, originals)
        return self
|
||||||
@@ -111,3 +111,12 @@ class AssExportable(Protocol[T_ExporterConfig]):
|
|||||||
|
|
||||||
def save_as_ass(self, name: str, output_dir: Path | str, config: T_ExporterConfig | None = None) -> Self:
|
def save_as_ass(self, name: str, output_dir: Path | str, config: T_ExporterConfig | None = None) -> Self:
|
||||||
...
|
...
|
||||||
|
|
||||||
|
|
||||||
|
@runtime_checkable
class PPTXExportable(Protocol[T_ExporterConfig]):
    """Structural interface for workflows that can produce .pptx output."""

    def export_to_pptx(self, config: T_ExporterConfig | None = None) -> bytes:
        """Return the exported .pptx file as bytes."""
        ...

    def save_as_pptx(self, name: str, output_dir: Path | str, config: T_ExporterConfig | None = None) -> Self:
        """Write the exported .pptx under ``output_dir`` and return ``self``."""
        ...
|
||||||
|
|||||||
72
docutranslate/workflow/pptx_workflow.py
Normal file
72
docutranslate/workflow/pptx_workflow.py
Normal file
@@ -0,0 +1,72 @@
|
|||||||
|
# SPDX-FileCopyrightText: 2025 QinHan
|
||||||
|
# SPDX-License-Identifier: MPL-2.0
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Self
|
||||||
|
|
||||||
|
from docutranslate.exporter.base import ExporterConfig
|
||||||
|
from docutranslate.exporter.pptx.pptx2html_exporter import PPTX2HTMLExporterConfig, PPTX2HTMLExporter
|
||||||
|
from docutranslate.exporter.pptx.pptx2pptx_exporter import PPTX2PPTXExporter
|
||||||
|
from docutranslate.glossary.glossary import Glossary
|
||||||
|
from docutranslate.ir.document import Document
|
||||||
|
from docutranslate.translator.ai_translator.pptx_translator import PPTXTranslatorConfig, PPTXTranslator
|
||||||
|
from docutranslate.workflow.base import WorkflowConfig, Workflow
|
||||||
|
from docutranslate.workflow.interfaces import HTMLExportable, PPTXExportable
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(kw_only=True)
class PPTXWorkflowConfig(WorkflowConfig):
    """Configuration bundle for :class:`PPTXWorkflow` (keyword-only fields)."""

    # Settings handed to PPTXTranslator.
    translator_config: PPTXTranslatorConfig
    # Default settings for HTML export when none are passed explicitly.
    html_exporter_config: PPTX2HTMLExporterConfig
|
||||||
|
|
||||||
|
|
||||||
|
class PPTXWorkflow(Workflow[PPTXWorkflowConfig, Document, Document], HTMLExportable[PPTX2HTMLExporterConfig],
                   PPTXExportable[ExporterConfig]):
    """Workflow that translates a .pptx document and exports it as HTML or PPTX."""

    def __init__(self, config: PPTXWorkflowConfig):
        """Initialise the workflow and share its logger with the translator config."""
        super().__init__(config=config)
        translator_config = self.config.translator_config
        if config.logger and translator_config:
            translator_config.logger = config.logger

    def _pre_translate(self, document_original: Document):
        """Return a working copy of the document and a fresh translator.

        The original document is left untouched; the translator modifies the
        copy in place.
        """
        document = document_original.copy()
        translator = PPTXTranslator(self.config.translator_config)
        return document, translator

    def translate(self) -> Self:
        """Translate the original document (synchronous variant).

        Stores the result in ``self.document_translated``. If the translator
        produced a glossary, it is attached as a CSV named "glossary".
        """
        document, translator = self._pre_translate(self.document_original)
        translator.translate(document)
        if translator.glossary_dict_gen:
            self.attachment.add_document("glossary", Glossary.glossary_dict2csv(translator.glossary_dict_gen))
        self.document_translated = document
        return self

    async def translate_async(self) -> Self:
        """Translate the original document (asynchronous variant).

        Same effects as :meth:`translate`, awaiting the translator instead.
        """
        document, translator = self._pre_translate(self.document_original)
        await translator.translate_async(document)
        if translator.glossary_dict_gen:
            self.attachment.add_document("glossary", Glossary.glossary_dict2csv(translator.glossary_dict_gen))
        self.document_translated = document
        return self

    def export_to_html(self, config: PPTX2HTMLExporterConfig | None = None) -> str:
        """Return the HTML export as a string.

        Falls back to the workflow's ``html_exporter_config`` when ``config``
        is not given.
        """
        config = config or self.config.html_exporter_config
        docu = self._export(PPTX2HTMLExporter(config))
        return docu.content.decode()

    def export_to_pptx(self, _: ExporterConfig | None = None) -> bytes:
        """Return the .pptx export as bytes; the config argument is ignored."""
        docu = self._export(PPTX2PPTXExporter())
        return docu.content

    def save_as_html(self, name: str | None = None, output_dir: Path | str = "./output",
                     config: PPTX2HTMLExporterConfig | None = None) -> Self:
        """Save the HTML export via ``self._save`` and return ``self``.

        Falls back to the workflow's ``html_exporter_config`` when ``config``
        is not given.
        """
        config = config or self.config.html_exporter_config
        self._save(exporter=PPTX2HTMLExporter(config), name=name, output_dir=output_dir)
        return self

    def save_as_pptx(self, name: str | None = None, output_dir: Path | str = "./output",
                     _: ExporterConfig | None = None) -> Self:
        """Save the .pptx export via ``self._save``; the config argument is ignored."""
        self._save(exporter=PPTX2PPTXExporter(), name=name, output_dir=output_dir)
        return self
|
||||||
@@ -19,6 +19,7 @@ dependencies = [
|
|||||||
"chardet>=5.2.0",
|
"chardet>=5.2.0",
|
||||||
"pysubs2>=1.8.0",
|
"pysubs2>=1.8.0",
|
||||||
"httpx>=0.28.1",
|
"httpx>=0.28.1",
|
||||||
|
"python-pptx>=1.0.2",
|
||||||
]
|
]
|
||||||
dynamic = ["version"]
|
dynamic = ["version"]
|
||||||
|
|
||||||
|
|||||||
2
uv.lock
generated
2
uv.lock
generated
@@ -386,6 +386,7 @@ dependencies = [
|
|||||||
{ name = "pymdown-extensions" },
|
{ name = "pymdown-extensions" },
|
||||||
{ name = "pysubs2" },
|
{ name = "pysubs2" },
|
||||||
{ name = "python-docx" },
|
{ name = "python-docx" },
|
||||||
|
{ name = "python-pptx" },
|
||||||
{ name = "srt" },
|
{ name = "srt" },
|
||||||
{ name = "xlsx2html" },
|
{ name = "xlsx2html" },
|
||||||
]
|
]
|
||||||
@@ -422,6 +423,7 @@ requires-dist = [
|
|||||||
{ name = "pymdown-extensions", specifier = ">=10.16.1" },
|
{ name = "pymdown-extensions", specifier = ">=10.16.1" },
|
||||||
{ name = "pysubs2", specifier = ">=1.8.0" },
|
{ name = "pysubs2", specifier = ">=1.8.0" },
|
||||||
{ name = "python-docx", specifier = ">=1.2.0" },
|
{ name = "python-docx", specifier = ">=1.2.0" },
|
||||||
|
{ name = "python-pptx", specifier = ">=1.0.2" },
|
||||||
{ name = "srt", specifier = ">=3.5.3" },
|
{ name = "srt", specifier = ">=3.5.3" },
|
||||||
{ name = "xlsx2html", specifier = ">=0.6.2" },
|
{ name = "xlsx2html", specifier = ">=0.6.2" },
|
||||||
]
|
]
|
||||||
|
|||||||
Reference in New Issue
Block a user