优化xlsx\epub翻译

This commit is contained in:
xunbu
2025-11-12 18:58:54 +08:00
parent 555090bd48
commit 98576a8714
4 changed files with 53 additions and 83 deletions

View File

@@ -1,3 +1,3 @@
# SPDX-FileCopyrightText: 2025 QinHan # SPDX-FileCopyrightText: 2025 QinHan
# SPDX-License-Identifier: MPL-2.0 # SPDX-License-Identifier: MPL-2.0
__version__="1.5.1" __version__="1.5.2a1"

View File

@@ -2,13 +2,11 @@
# SPDX-License-Identifier: MPL-2.0 # SPDX-License-Identifier: MPL-2.0
import asyncio import asyncio
import os import os
import re
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
import zipfile import zipfile
from collections import defaultdict
from dataclasses import dataclass from dataclasses import dataclass
from io import BytesIO from io import BytesIO
from typing import Self, Literal, List, Dict, Any, Tuple from typing import Self, Literal, List, Dict, Any
from bs4 import BeautifulSoup, Tag, NavigableString from bs4 import BeautifulSoup, Tag, NavigableString
@@ -105,7 +103,7 @@ class EpubTranslator(AiTranslator):
continue continue
if file_path not in soups: if file_path not in soups:
soups[file_path] = BeautifulSoup(content_bytes, "html.parser") soups[file_path] = BeautifulSoup(content_bytes, "lxml")
soup = soups[file_path] soup = soups[file_path]
@@ -154,7 +152,7 @@ class EpubTranslator(AiTranslator):
if self.insert_mode == "replace": if self.insert_mode == "replace":
original_tag.clear() original_tag.clear()
new_content_soup = BeautifulSoup(translated_html, 'html.parser') new_content_soup = BeautifulSoup(translated_html, 'lxml')
for node in list(new_content_soup.children): for node in list(new_content_soup.children):
original_tag.append(node.extract()) original_tag.append(node.extract())
@@ -163,8 +161,8 @@ class EpubTranslator(AiTranslator):
original_tag.clear() original_tag.clear()
# 解析HTML片段 # 解析HTML片段
original_nodes = BeautifulSoup(original_html, 'html.parser').contents original_nodes = BeautifulSoup(original_html, 'lxml').contents
translated_nodes = BeautifulSoup(translated_html, 'html.parser').contents translated_nodes = BeautifulSoup(translated_html, 'lxml').contents
# 创建分隔符节点 # 创建分隔符节点
separator_nodes = [] separator_nodes = []
@@ -189,7 +187,7 @@ class EpubTranslator(AiTranslator):
else: else:
# --- 常规块级元素处理:创建新标签 --- # --- 常规块级元素处理:创建新标签 ---
translated_tag = soup.new_tag(original_tag.name, attrs=original_tag.attrs) translated_tag = soup.new_tag(original_tag.name, attrs=original_tag.attrs)
new_content_soup = BeautifulSoup(translated_html, 'html.parser') new_content_soup = BeautifulSoup(translated_html, 'lxml')
for node in list(new_content_soup.children): for node in list(new_content_soup.children):
translated_tag.append(node.extract()) translated_tag.append(node.extract())

View File

@@ -7,7 +7,7 @@ from typing import Self, Literal, List, Optional
import zipfile import zipfile
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
import openpyxl import openpyxl # openpyxl 仍然保留,以备将来可能需要混合模式或用于其他目的
from openpyxl.cell import Cell from openpyxl.cell import Cell
from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent
@@ -54,84 +54,51 @@ class XlsxTranslator(AiTranslator):
self.translate_regions = config.translate_regions self.translate_regions = config.translate_regions
def _get_texts_to_translate(self, document: Document) -> List[str]: def _get_texts_to_translate(self, document: Document) -> List[str]:
"""使用 openpyxl 识别指定区域内需要翻译的文本。""" """
【已修改】通过直接解析内部XML文件来识别需要翻译的文本。
这种方法可以正确处理包含富文本的单元格,并确保与重建逻辑一致,但不支持按区域翻译。
"""
if self.translate_regions:
self.logger.warning("当前文本提取方法直接解析XML不支持 'translate_regions'。将翻译文件中的所有文本内容。")
texts_to_translate = set() texts_to_translate = set()
ns = {'main': 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'}
try: try:
# 使用 data_only=True 来获取单元格的计算值,而不是公式 with zipfile.ZipFile(BytesIO(document.content), 'r') as original_zip:
workbook = openpyxl.load_workbook(BytesIO(document.content), data_only=True) # --- 1. 处理共享字符串 (sharedStrings.xml) ---
# 如果未指定区域,则翻译所有文本 # 这是所有文本(包括富文本片段)的主要存储位置。
if not self.translate_regions: if "xl/sharedStrings.xml" in original_zip.namelist():
for sheet in workbook.worksheets: with original_zip.open("xl/sharedStrings.xml") as f:
for row in sheet.iter_rows(): root = ET.fromstring(f.read())
for cell in row: # 查找所有 <t> 元素,无论它们在哪个层级,这能正确捕获富文本片段
# 仅处理共享字符串类型,这是最常见的文本存储方式 text_nodes = root.findall('.//main:t', ns)
if isinstance(cell.value, str) and cell.data_type == "s": for node in text_nodes:
texts_to_translate.add(cell.value) # 确保节点有文本内容且不是纯粹的空白
# 同时也要检查表格的标题 if node.text and node.text.strip():
for sheet in workbook.worksheets: texts_to_translate.add(node.text)
for table in sheet._tables:
for column in table.tableColumns:
if column.name:
texts_to_translate.add(column.name)
# 如果指定了区域 # --- 2. 处理表格标题 (tableX.xml) ---
else: # 表格的列名不存储在 sharedStrings.xml 中,需要单独处理。
processed_coordinates = set() for item in original_zip.infolist():
regions_by_sheet = {} if item.filename.startswith("xl/tables/table"):
all_sheet_regions = [] with original_zip.open(item.filename) as f:
for region in self.translate_regions: root = ET.fromstring(f.read())
if '!' in region: table_columns = root.findall('.//main:tableColumn', ns)
sheet_name, cell_range = region.split('!', 1) for col in table_columns:
# 支持带引号的工作表名称 original_name = col.get('name')
sheet_name = sheet_name.strip("'") if original_name and original_name.strip():
if sheet_name not in regions_by_sheet: texts_to_translate.add(original_name)
regions_by_sheet[sheet_name] = []
regions_by_sheet[sheet_name].append(cell_range)
else:
all_sheet_regions.append(region)
for sheet in workbook.worksheets:
sheet_specific_ranges = regions_by_sheet.get(sheet.title, [])
total_ranges_for_this_sheet = sheet_specific_ranges + all_sheet_regions
if not total_ranges_for_this_sheet:
continue
# 检查此区域内的表格标题
for table in sheet._tables:
# openpyxl 没有提供简单的方法来检查表格是否与区域相交
# 为简单起见,我们假设如果指定了工作表,则翻译该工作表上的所有表格标题
for column in table.tableColumns:
if column.name:
texts_to_translate.add(column.name)
for cell_range in total_ranges_for_this_sheet:
try:
cells_in_range = sheet[cell_range]
flat_cells = []
if isinstance(cells_in_range, Cell):
flat_cells.append(cells_in_range)
elif isinstance(cells_in_range, tuple):
for item in cells_in_range:
if isinstance(item, Cell):
flat_cells.append(item)
elif isinstance(item, tuple):
flat_cells.extend(item)
for cell in flat_cells:
if isinstance(cell.value, str) and cell.data_type == "s":
texts_to_translate.add(cell.value)
except Exception as e: except Exception as e:
self.logger.warning(f"跳过无效的区域 '{cell_range}' 在工作表 '{sheet.title}'. 错误: {e}") self.logger.error(f"直接解析XLSX的XML文件失败: {e}")
workbook.close()
except Exception as e:
self.logger.error(f"使用 openpyxl 预处理文件失败: {e}")
return list(texts_to_translate) return list(texts_to_translate)
def _rebuild_xlsx_with_translated_content(self, original_content_bytes: bytes, translation_map: dict) -> bytes: def _rebuild_xlsx_with_translated_content(self, original_content_bytes: bytes, translation_map: dict) -> bytes:
""" """
通过替换 sharedStrings.xml 和 tableX.xml 中的文本内容来重构 XLSX 文件。 【无需修改】通过替换 sharedStrings.xml 和 tableX.xml 中的文本内容来重构 XLSX 文件。
此函数的逻辑与新的读取逻辑完全匹配。
""" """
# 注册命名空间以正确解析和生成XML # 注册命名空间以正确解析和生成XML
ns = { ns = {
@@ -192,7 +159,7 @@ class XlsxTranslator(AiTranslator):
original_texts = self._get_texts_to_translate(document) original_texts = self._get_texts_to_translate(document)
if not original_texts: if not original_texts:
print("\n指定区域中没有找到需要翻译的文本内容。") print("\n文件中没有找到需要翻译的文本内容。")
return self return self
if self.glossary_agent: if self.glossary_agent:
@@ -215,7 +182,7 @@ class XlsxTranslator(AiTranslator):
original_texts = await asyncio.to_thread(self._get_texts_to_translate, document) original_texts = await asyncio.to_thread(self._get_texts_to_translate, document)
if not original_texts: if not original_texts:
print("\n指定区域中没有找到需要翻译的文本内容。") print("\n文件中没有找到需要翻译的文本内容。")
return self return self
if self.glossary_agent: if self.glossary_agent:

View File

@@ -1,8 +1,13 @@
更新日志 更新日志
---------------- ----------------
v1.5.1版 v1.5.2-alpha1版
优化
- 优化xlsx翻译效果
- 优化epub的性能开销
----------------
v1.5.1版 2025.11.10
特性 特性
- 支持mineru部署服务 - 支持使用mineru部署服务
- txt翻译支持不分段、按连续行分段 - txt翻译支持不分段、按连续行分段
- 提供强制json输出的选项 - 提供强制json输出的选项
优化 优化