From d292f2bbd207576a467373be7259f9953a915d32 Mon Sep 17 00:00:00 2001 From: xunbu Date: Thu, 30 Oct 2025 19:03:10 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E5=A4=8Dxlsx=E5=B7=A5=E4=BD=9C?= =?UTF-8?q?=E6=B5=81=E4=B8=8D=E8=83=BD=E4=BF=9D=E7=95=99=E5=9B=BE=E7=89=87?= =?UTF-8?q?=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ai_translator/xlsx_translator.py | 265 ++++++++++-------- 1 file changed, 151 insertions(+), 114 deletions(-) diff --git a/docutranslate/translator/ai_translator/xlsx_translator.py b/docutranslate/translator/ai_translator/xlsx_translator.py index 8d21df6..c5916e8 100644 --- a/docutranslate/translator/ai_translator/xlsx_translator.py +++ b/docutranslate/translator/ai_translator/xlsx_translator.py @@ -4,6 +4,8 @@ import asyncio from dataclasses import dataclass from io import BytesIO from typing import Self, Literal, List, Optional +import zipfile +import xml.etree.ElementTree as ET import openpyxl from openpyxl.cell import Cell @@ -48,139 +50,171 @@ class XlsxTranslator(AiTranslator): self.translate_agent = SegmentsTranslateAgent(agent_config) self.insert_mode = config.insert_mode self.separator = config.separator - # --- 新增功能 --- self.translate_regions = config.translate_regions - def _pre_translate(self, document: Document): - workbook = openpyxl.load_workbook(BytesIO(document.content)) - cells_to_translate = [] - - # --- 步骤 1: 根据是否指定区域,收集需要翻译的文本单元格 --- - - # 如果未指定翻译区域,则沿用旧逻辑,翻译所有单元格 - if not self.translate_regions: # 也处理 None 或空列表的情况 - for sheet in workbook.worksheets: - for row in sheet.iter_rows(): - for cell in row: - if isinstance(cell.value, str) and cell.data_type == "s": - cells_to_translate.append({ - "sheet_name": sheet.title, - "coordinate": cell.coordinate, - "original_text": cell.value, - }) - # 如果指定了翻译区域,则只在这些区域内查找 - else: - processed_coordinates = set() - - regions_by_sheet = {} - all_sheet_regions = [] - for region in self.translate_regions: - if '!' in region: - sheet_name, cell_range = region.split('!', 1) - if sheet_name not in regions_by_sheet: - regions_by_sheet[sheet_name] = [] - regions_by_sheet[sheet_name].append(cell_range) - else: - all_sheet_regions.append(region) - - for sheet in workbook.worksheets: - sheet_specific_ranges = regions_by_sheet.get(sheet.title, []) - total_ranges_for_this_sheet = sheet_specific_ranges + all_sheet_regions - - if not total_ranges_for_this_sheet: - continue - - for cell_range in total_ranges_for_this_sheet: - try: - cells_in_range = sheet[cell_range] - - # --- START: 这是修改的关键部分 --- - # 无论返回的是单个cell、一维元组(行/列)还是二维元组(矩形),都将其展平为一维列表 - flat_cells = [] - if isinstance(cells_in_range, Cell): - flat_cells.append(cells_in_range) - elif isinstance(cells_in_range, tuple): - for item in cells_in_range: - if isinstance(item, Cell): - flat_cells.append(item) # 处理一维元组 - elif isinstance(item, tuple): - for cell in item: # 处理二维元组 - flat_cells.append(cell) - # --- END: 修改结束 --- - - # 使用简化后的单层循环 - for cell in flat_cells: - full_coordinate = (sheet.title, cell.coordinate) - if full_coordinate in processed_coordinates: - continue - - if isinstance(cell.value, str) and cell.data_type == "s": - cell_info = { - "sheet_name": sheet.title, - "coordinate": cell.coordinate, - "original_text": cell.value, - } - cells_to_translate.append(cell_info) - processed_coordinates.add(full_coordinate) - - except Exception as e: - self.logger.warning(f"跳过无效的区域 '{cell_range}' 在工作表 '{sheet.title}'. 错误: {e}") - - original_texts = [cell["original_text"] for cell in cells_to_translate] - return workbook, cells_to_translate, original_texts - - def _after_translate(self, workbook, cells_to_translate, translated_texts, original_texts): - for i, cell_info in enumerate(cells_to_translate): - sheet_name = cell_info["sheet_name"] - coordinate = cell_info["coordinate"] - translated_text = translated_texts[i] - original_text = original_texts[i] - - # 定位到工作表和单元格 - sheet = workbook[sheet_name] - if self.insert_mode == "replace": - sheet[coordinate] = translated_text - elif self.insert_mode == "append": - sheet[coordinate] = original_text + self.separator + translated_text - elif self.insert_mode == "prepend": - sheet[coordinate] = translated_text + self.separator + original_text - else: - self.logger.error("不正确的XlsxTranslatorConfig参数") - - workbook_output_stream = BytesIO() - # 保存修改后的工作簿到新文件 + def _get_texts_to_translate(self, document: Document) -> List[str]: + """使用 openpyxl 识别指定区域内需要翻译的文本。""" + texts_to_translate = set() try: - workbook.save(workbook_output_stream) - finally: + # 使用 data_only=True 来获取单元格的计算值,而不是公式 + workbook = openpyxl.load_workbook(BytesIO(document.content), data_only=True) + # 如果未指定区域,则翻译所有文本 + if not self.translate_regions: + for sheet in workbook.worksheets: + for row in sheet.iter_rows(): + for cell in row: + # 仅处理共享字符串类型,这是最常见的文本存储方式 + if isinstance(cell.value, str) and cell.data_type == "s": + texts_to_translate.add(cell.value) + # 同时也要检查表格的标题 + for sheet in workbook.worksheets: + for table in sheet._tables: + for column in table.tableColumns: + if column.name: + texts_to_translate.add(column.name) + + # 如果指定了区域 + else: + processed_coordinates = set() + regions_by_sheet = {} + all_sheet_regions = [] + for region in self.translate_regions: + if '!' in region: + sheet_name, cell_range = region.split('!', 1) + # 支持带引号的工作表名称 + sheet_name = sheet_name.strip("'") + if sheet_name not in regions_by_sheet: + regions_by_sheet[sheet_name] = [] + regions_by_sheet[sheet_name].append(cell_range) + else: + all_sheet_regions.append(region) + + for sheet in workbook.worksheets: + sheet_specific_ranges = regions_by_sheet.get(sheet.title, []) + total_ranges_for_this_sheet = sheet_specific_ranges + all_sheet_regions + + if not total_ranges_for_this_sheet: + continue + + # 检查此区域内的表格标题 + for table in sheet._tables: + # openpyxl 没有提供简单的方法来检查表格是否与区域相交 + # 为简单起见,我们假设如果指定了工作表,则翻译该工作表上的所有表格标题 + for column in table.tableColumns: + if column.name: + texts_to_translate.add(column.name) + + for cell_range in total_ranges_for_this_sheet: + try: + cells_in_range = sheet[cell_range] + flat_cells = [] + if isinstance(cells_in_range, Cell): + flat_cells.append(cells_in_range) + elif isinstance(cells_in_range, tuple): + for item in cells_in_range: + if isinstance(item, Cell): + flat_cells.append(item) + elif isinstance(item, tuple): + flat_cells.extend(item) + + for cell in flat_cells: + if isinstance(cell.value, str) and cell.data_type == "s": + texts_to_translate.add(cell.value) + except Exception as e: + self.logger.warning(f"跳过无效的区域 '{cell_range}' 在工作表 '{sheet.title}'. 错误: {e}") workbook.close() - return workbook_output_stream.getvalue() + except Exception as e: + self.logger.error(f"使用 openpyxl 预处理文件失败: {e}") + + return list(texts_to_translate) + + def _rebuild_xlsx_with_translated_content(self, original_content_bytes: bytes, translation_map: dict) -> bytes: + """ + 通过替换 sharedStrings.xml 和 tableX.xml 中的文本内容来重构 XLSX 文件。 + """ + # 注册命名空间以正确解析和生成XML + ns = { + 'main': 'http://schemas.openxmlformats.org/spreadsheetml/2006/main' + } + ET.register_namespace('', ns['main']) + + original_zip_io = BytesIO(original_content_bytes) + output_zip_io = BytesIO() + + try: + with zipfile.ZipFile(original_zip_io, 'r') as original_zip: + with zipfile.ZipFile(output_zip_io, 'w', zipfile.ZIP_DEFLATED) as output_zip: + for item in original_zip.infolist(): + file_content = original_zip.read(item.filename) + + # --- 1. 处理共享字符串文件 --- + if item.filename == "xl/sharedStrings.xml": + root = ET.fromstring(file_content) + text_nodes = root.findall('.//main:t', ns) + for node in text_nodes: + original_text = node.text + if original_text in translation_map: + translated_text = translation_map[original_text] + if self.insert_mode == "replace": + node.text = translated_text + elif self.insert_mode == "append": + node.text = original_text + self.separator + translated_text + elif self.insert_mode == "prepend": + node.text = translated_text + self.separator + original_text + file_content = ET.tostring(root, encoding='utf-8', xml_declaration=True) + + # --- 2. 处理表格定义文件 --- + elif item.filename.startswith("xl/tables/table"): + root = ET.fromstring(file_content) + table_columns = root.findall('.//main:tableColumn', ns) + for col in table_columns: + original_name = col.get('name') + if original_name in translation_map: + translated_name = translation_map[original_name] + if self.insert_mode == "replace": + col.set('name', translated_name) + elif self.insert_mode == "append": + col.set('name', original_name + self.separator + translated_name) + elif self.insert_mode == "prepend": + col.set('name', translated_name + self.separator + original_name) + file_content = ET.tostring(root, encoding='utf-8', xml_declaration=True) + + output_zip.writestr(item, file_content) + + return output_zip_io.getvalue() + + except (zipfile.BadZipFile, ET.ParseError) as e: + self.logger.error(f"处理XLSX文件失败: {e}. 返回原始文件。") + return original_content_bytes def translate(self, document: Document) -> Self: + original_texts = self._get_texts_to_translate(document) - workbook, cells_to_translate, original_texts = self._pre_translate(document) - if not cells_to_translate: + if not original_texts: print("\n在指定区域中没有找到需要翻译的纯文本内容。") - workbook.close() return self + if self.glossary_agent: self.glossary_dict_gen = self.glossary_agent.send_segments(original_texts, self.chunk_size) if self.translate_agent: self.translate_agent.update_glossary_dict(self.glossary_dict_gen) - # --- 步骤 2: 调用翻译函数 --- + if self.translate_agent: translated_texts = self.translate_agent.send_segments(original_texts, self.chunk_size) else: translated_texts = original_texts - document.content = self._after_translate(workbook, cells_to_translate, translated_texts, original_texts) + translation_map = dict(zip(original_texts, translated_texts)) + + document.content = self._rebuild_xlsx_with_translated_content(document.content, translation_map) + return self async def translate_async(self, document: Document) -> Self: + original_texts = await asyncio.to_thread(self._get_texts_to_translate, document) - workbook, cells_to_translate, original_texts = await asyncio.to_thread(self._pre_translate, document) - if not cells_to_translate: + if not original_texts: print("\n在指定区域中没有找到需要翻译的纯文本内容。") - workbook.close() return self if self.glossary_agent: @@ -188,11 +222,14 @@ class XlsxTranslator(AiTranslator): if self.translate_agent: self.translate_agent.update_glossary_dict(self.glossary_dict_gen) - # --- 步骤 2: 调用翻译函数 --- if self.translate_agent: translated_texts = await self.translate_agent.send_segments_async(original_texts, self.chunk_size) else: translated_texts = original_texts - document.content = await asyncio.to_thread(self._after_translate, workbook, cells_to_translate, - translated_texts, original_texts) - return self + + translation_map = dict(zip(original_texts, translated_texts)) + + document.content = await asyncio.to_thread(self._rebuild_xlsx_with_translated_content, document.content, + translation_map) + + return self \ No newline at end of file