优化xlsx\epub翻译
This commit is contained in:
@@ -1,3 +1,3 @@
|
|||||||
# SPDX-FileCopyrightText: 2025 QinHan
|
# SPDX-FileCopyrightText: 2025 QinHan
|
||||||
# SPDX-License-Identifier: MPL-2.0
|
# SPDX-License-Identifier: MPL-2.0
|
||||||
__version__="1.5.1"
|
__version__="1.5.2a1"
|
||||||
@@ -2,13 +2,11 @@
|
|||||||
# SPDX-License-Identifier: MPL-2.0
|
# SPDX-License-Identifier: MPL-2.0
|
||||||
import asyncio
|
import asyncio
|
||||||
import os
|
import os
|
||||||
import re
|
|
||||||
import xml.etree.ElementTree as ET
|
import xml.etree.ElementTree as ET
|
||||||
import zipfile
|
import zipfile
|
||||||
from collections import defaultdict
|
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from typing import Self, Literal, List, Dict, Any, Tuple
|
from typing import Self, Literal, List, Dict, Any
|
||||||
|
|
||||||
from bs4 import BeautifulSoup, Tag, NavigableString
|
from bs4 import BeautifulSoup, Tag, NavigableString
|
||||||
|
|
||||||
@@ -105,7 +103,7 @@ class EpubTranslator(AiTranslator):
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
if file_path not in soups:
|
if file_path not in soups:
|
||||||
soups[file_path] = BeautifulSoup(content_bytes, "html.parser")
|
soups[file_path] = BeautifulSoup(content_bytes, "lxml")
|
||||||
|
|
||||||
soup = soups[file_path]
|
soup = soups[file_path]
|
||||||
|
|
||||||
@@ -154,7 +152,7 @@ class EpubTranslator(AiTranslator):
|
|||||||
|
|
||||||
if self.insert_mode == "replace":
|
if self.insert_mode == "replace":
|
||||||
original_tag.clear()
|
original_tag.clear()
|
||||||
new_content_soup = BeautifulSoup(translated_html, 'html.parser')
|
new_content_soup = BeautifulSoup(translated_html, 'lxml')
|
||||||
for node in list(new_content_soup.children):
|
for node in list(new_content_soup.children):
|
||||||
original_tag.append(node.extract())
|
original_tag.append(node.extract())
|
||||||
|
|
||||||
@@ -163,8 +161,8 @@ class EpubTranslator(AiTranslator):
|
|||||||
original_tag.clear()
|
original_tag.clear()
|
||||||
|
|
||||||
# 解析HTML片段
|
# 解析HTML片段
|
||||||
original_nodes = BeautifulSoup(original_html, 'html.parser').contents
|
original_nodes = BeautifulSoup(original_html, 'lxml').contents
|
||||||
translated_nodes = BeautifulSoup(translated_html, 'html.parser').contents
|
translated_nodes = BeautifulSoup(translated_html, 'lxml').contents
|
||||||
|
|
||||||
# 创建分隔符节点
|
# 创建分隔符节点
|
||||||
separator_nodes = []
|
separator_nodes = []
|
||||||
@@ -189,7 +187,7 @@ class EpubTranslator(AiTranslator):
|
|||||||
else:
|
else:
|
||||||
# --- 常规块级元素处理:创建新标签 ---
|
# --- 常规块级元素处理:创建新标签 ---
|
||||||
translated_tag = soup.new_tag(original_tag.name, attrs=original_tag.attrs)
|
translated_tag = soup.new_tag(original_tag.name, attrs=original_tag.attrs)
|
||||||
new_content_soup = BeautifulSoup(translated_html, 'html.parser')
|
new_content_soup = BeautifulSoup(translated_html, 'lxml')
|
||||||
for node in list(new_content_soup.children):
|
for node in list(new_content_soup.children):
|
||||||
translated_tag.append(node.extract())
|
translated_tag.append(node.extract())
|
||||||
|
|
||||||
@@ -268,4 +266,4 @@ class EpubTranslator(AiTranslator):
|
|||||||
document.content = await asyncio.to_thread(
|
document.content = await asyncio.to_thread(
|
||||||
self._after_translate, all_files, soups, items_to_translate, translated_texts, original_texts
|
self._after_translate, all_files, soups, items_to_translate, translated_texts, original_texts
|
||||||
)
|
)
|
||||||
return self
|
return self
|
||||||
|
|||||||
@@ -7,7 +7,7 @@ from typing import Self, Literal, List, Optional
|
|||||||
import zipfile
|
import zipfile
|
||||||
import xml.etree.ElementTree as ET
|
import xml.etree.ElementTree as ET
|
||||||
|
|
||||||
import openpyxl
|
import openpyxl # openpyxl 仍然保留,以备将来可能需要混合模式或用于其他目的
|
||||||
from openpyxl.cell import Cell
|
from openpyxl.cell import Cell
|
||||||
|
|
||||||
from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent
|
from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent
|
||||||
@@ -54,84 +54,51 @@ class XlsxTranslator(AiTranslator):
|
|||||||
self.translate_regions = config.translate_regions
|
self.translate_regions = config.translate_regions
|
||||||
|
|
||||||
def _get_texts_to_translate(self, document: Document) -> List[str]:
|
def _get_texts_to_translate(self, document: Document) -> List[str]:
|
||||||
"""使用 openpyxl 识别指定区域内需要翻译的文本。"""
|
"""
|
||||||
|
【已修改】通过直接解析内部XML文件来识别需要翻译的文本。
|
||||||
|
这种方法可以正确处理包含富文本的单元格,并确保与重建逻辑一致,但不支持按区域翻译。
|
||||||
|
"""
|
||||||
|
if self.translate_regions:
|
||||||
|
self.logger.warning("当前文本提取方法直接解析XML,不支持 'translate_regions'。将翻译文件中的所有文本内容。")
|
||||||
|
|
||||||
texts_to_translate = set()
|
texts_to_translate = set()
|
||||||
|
ns = {'main': 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# 使用 data_only=True 来获取单元格的计算值,而不是公式
|
with zipfile.ZipFile(BytesIO(document.content), 'r') as original_zip:
|
||||||
workbook = openpyxl.load_workbook(BytesIO(document.content), data_only=True)
|
# --- 1. 处理共享字符串 (sharedStrings.xml) ---
|
||||||
# 如果未指定区域,则翻译所有文本
|
# 这是所有文本(包括富文本片段)的主要存储位置。
|
||||||
if not self.translate_regions:
|
if "xl/sharedStrings.xml" in original_zip.namelist():
|
||||||
for sheet in workbook.worksheets:
|
with original_zip.open("xl/sharedStrings.xml") as f:
|
||||||
for row in sheet.iter_rows():
|
root = ET.fromstring(f.read())
|
||||||
for cell in row:
|
# 查找所有 <t> 元素,无论它们在哪个层级,这能正确捕获富文本片段
|
||||||
# 仅处理共享字符串类型,这是最常见的文本存储方式
|
text_nodes = root.findall('.//main:t', ns)
|
||||||
if isinstance(cell.value, str) and cell.data_type == "s":
|
for node in text_nodes:
|
||||||
texts_to_translate.add(cell.value)
|
# 确保节点有文本内容且不是纯粹的空白
|
||||||
# 同时也要检查表格的标题
|
if node.text and node.text.strip():
|
||||||
for sheet in workbook.worksheets:
|
texts_to_translate.add(node.text)
|
||||||
for table in sheet._tables:
|
|
||||||
for column in table.tableColumns:
|
|
||||||
if column.name:
|
|
||||||
texts_to_translate.add(column.name)
|
|
||||||
|
|
||||||
# 如果指定了区域
|
# --- 2. 处理表格标题 (tableX.xml) ---
|
||||||
else:
|
# 表格的列名不存储在 sharedStrings.xml 中,需要单独处理。
|
||||||
processed_coordinates = set()
|
for item in original_zip.infolist():
|
||||||
regions_by_sheet = {}
|
if item.filename.startswith("xl/tables/table"):
|
||||||
all_sheet_regions = []
|
with original_zip.open(item.filename) as f:
|
||||||
for region in self.translate_regions:
|
root = ET.fromstring(f.read())
|
||||||
if '!' in region:
|
table_columns = root.findall('.//main:tableColumn', ns)
|
||||||
sheet_name, cell_range = region.split('!', 1)
|
for col in table_columns:
|
||||||
# 支持带引号的工作表名称
|
original_name = col.get('name')
|
||||||
sheet_name = sheet_name.strip("'")
|
if original_name and original_name.strip():
|
||||||
if sheet_name not in regions_by_sheet:
|
texts_to_translate.add(original_name)
|
||||||
regions_by_sheet[sheet_name] = []
|
|
||||||
regions_by_sheet[sheet_name].append(cell_range)
|
|
||||||
else:
|
|
||||||
all_sheet_regions.append(region)
|
|
||||||
|
|
||||||
for sheet in workbook.worksheets:
|
|
||||||
sheet_specific_ranges = regions_by_sheet.get(sheet.title, [])
|
|
||||||
total_ranges_for_this_sheet = sheet_specific_ranges + all_sheet_regions
|
|
||||||
|
|
||||||
if not total_ranges_for_this_sheet:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# 检查此区域内的表格标题
|
|
||||||
for table in sheet._tables:
|
|
||||||
# openpyxl 没有提供简单的方法来检查表格是否与区域相交
|
|
||||||
# 为简单起见,我们假设如果指定了工作表,则翻译该工作表上的所有表格标题
|
|
||||||
for column in table.tableColumns:
|
|
||||||
if column.name:
|
|
||||||
texts_to_translate.add(column.name)
|
|
||||||
|
|
||||||
for cell_range in total_ranges_for_this_sheet:
|
|
||||||
try:
|
|
||||||
cells_in_range = sheet[cell_range]
|
|
||||||
flat_cells = []
|
|
||||||
if isinstance(cells_in_range, Cell):
|
|
||||||
flat_cells.append(cells_in_range)
|
|
||||||
elif isinstance(cells_in_range, tuple):
|
|
||||||
for item in cells_in_range:
|
|
||||||
if isinstance(item, Cell):
|
|
||||||
flat_cells.append(item)
|
|
||||||
elif isinstance(item, tuple):
|
|
||||||
flat_cells.extend(item)
|
|
||||||
|
|
||||||
for cell in flat_cells:
|
|
||||||
if isinstance(cell.value, str) and cell.data_type == "s":
|
|
||||||
texts_to_translate.add(cell.value)
|
|
||||||
except Exception as e:
|
|
||||||
self.logger.warning(f"跳过无效的区域 '{cell_range}' 在工作表 '{sheet.title}'. 错误: {e}")
|
|
||||||
workbook.close()
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.logger.error(f"使用 openpyxl 预处理文件失败: {e}")
|
self.logger.error(f"直接解析XLSX的XML文件失败: {e}")
|
||||||
|
|
||||||
return list(texts_to_translate)
|
return list(texts_to_translate)
|
||||||
|
|
||||||
def _rebuild_xlsx_with_translated_content(self, original_content_bytes: bytes, translation_map: dict) -> bytes:
|
def _rebuild_xlsx_with_translated_content(self, original_content_bytes: bytes, translation_map: dict) -> bytes:
|
||||||
"""
|
"""
|
||||||
通过替换 sharedStrings.xml 和 tableX.xml 中的文本内容来重构 XLSX 文件。
|
【无需修改】通过替换 sharedStrings.xml 和 tableX.xml 中的文本内容来重构 XLSX 文件。
|
||||||
|
此函数的逻辑与新的读取逻辑完全匹配。
|
||||||
"""
|
"""
|
||||||
# 注册命名空间以正确解析和生成XML
|
# 注册命名空间以正确解析和生成XML
|
||||||
ns = {
|
ns = {
|
||||||
@@ -192,7 +159,7 @@ class XlsxTranslator(AiTranslator):
|
|||||||
original_texts = self._get_texts_to_translate(document)
|
original_texts = self._get_texts_to_translate(document)
|
||||||
|
|
||||||
if not original_texts:
|
if not original_texts:
|
||||||
print("\n在指定区域中没有找到需要翻译的纯文本内容。")
|
print("\n在文件中没有找到需要翻译的文本内容。")
|
||||||
return self
|
return self
|
||||||
|
|
||||||
if self.glossary_agent:
|
if self.glossary_agent:
|
||||||
@@ -215,7 +182,7 @@ class XlsxTranslator(AiTranslator):
|
|||||||
original_texts = await asyncio.to_thread(self._get_texts_to_translate, document)
|
original_texts = await asyncio.to_thread(self._get_texts_to_translate, document)
|
||||||
|
|
||||||
if not original_texts:
|
if not original_texts:
|
||||||
print("\n在指定区域中没有找到需要翻译的纯文本内容。")
|
print("\n在文件中没有找到需要翻译的文本内容。")
|
||||||
return self
|
return self
|
||||||
|
|
||||||
if self.glossary_agent:
|
if self.glossary_agent:
|
||||||
|
|||||||
Reference in New Issue
Block a user