增加xlsx翻译区域选项

This commit is contained in:
xunbu
2025-08-23 14:05:01 +08:00
parent e89c5c95af
commit cdceccdbed
4 changed files with 92 additions and 23 deletions

View File

@@ -252,6 +252,10 @@ class XlsxWorkflowParams(BaseWorkflowParams):
"\n",
description="当 insert_mode 为 'append''prepend' 时,用于分隔原文和译文的分隔符。"
)
translate_regions: Optional[List[str]] = Field(
None,
description="指定翻译区域列表。示例: ['Sheet1!A1:B10', 'C:D', 'E5']。如果不指定表名 (如 'C:D'),则应用于所有表。如果为 None则翻译整个文件中的所有文本。"
)
class DocxWorkflowParams(BaseWorkflowParams):
@@ -367,7 +371,8 @@ class TranslateServiceRequest(BaseModel):
"insert_mode": "append",
"separator": " \n---翻译---\n ",
"chunk_size": 2000,
"concurrent": 5
"concurrent": 5,
"translate_regions": ["Sheet1!A1:B10", "C:D"]
}
}
},
@@ -530,7 +535,7 @@ async def _perform_translation(
**payload.model_dump(include={
'base_url', 'api_key', 'model_id', 'to_lang', 'custom_prompt',
'temperature', 'thinking', 'chunk_size', 'concurrent',
'insert_mode', 'separator'
'insert_mode', 'separator', 'translate_regions'
}, exclude_none=True)
)
html_exporter_config = Xlsx2HTMLExporterConfig(cdn=True)

View File

@@ -26,6 +26,8 @@
"separatorHelp": "当插入模式为附加或前置时,用于分隔原文和译文的字符。<code>\\n</code> 代表换行。",
"xlsxSettingsTitleText": "XLSX翻译选项",
"insertModeHelpXlsx": "选择如何将翻译后的文本插入到单元格中。",
"xlsxTranslateRegionsLabel": "翻译区域 (可选)",
"xlsxTranslateRegionsPlaceholder": "每行一个区域, 例如:Sheet1!A1:B10不指定表名则对所有表生效",
"srtSettingsTitleText": "SRT翻译选项",
"insertModeHelpSrt": "选择如何将翻译后的文本插入。",
"epubSettingsTitleText": "EPUB翻译选项",
@@ -175,6 +177,8 @@
"separatorHelp": "Characters to separate original and translated text in append/prepend modes. <code>\\n</code> for new line.",
"xlsxSettingsTitleText": "XLSX Translation Options",
"insertModeHelpXlsx": "Choose how to insert translated text into cells.",
"xlsxTranslateRegionsLabel": "Translation area (optional)",
"xlsxTranslateRegionsPlaceholder": "One area per line, for example: Sheet1!A1:B10 (if no sheet name is specified, it applies to all sheets)",
"srtSettingsTitleText": "SRT Translation Options",
"insertModeHelpSrt": "Choose how to insert the translated text.",
"epubSettingsTitleText": "EPUB Translation Options",

File diff suppressed because one or more lines are too long

View File

@@ -1,9 +1,10 @@
import asyncio
from dataclasses import dataclass
from dataclasses import dataclass, field
from io import BytesIO
from typing import Self, Literal
from typing import Self, Literal, List, Optional
import openpyxl
from openpyxl.cell import Cell
from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent
from docutranslate.ir.document import Document
@@ -15,6 +16,11 @@ from docutranslate.translator.base import Translator
class XlsxTranslatorConfig(AiTranslatorConfig):
    """Configuration for the XLSX translator, extending the shared AI translator options."""

    # How translated text is written back into a cell: replace the original,
    # or append/prepend the translation to it.
    insert_mode: Literal["replace", "append", "prepend"] = "replace"
    # Separator placed between the original text and the translation
    # (presumably only relevant for "append"/"prepend" — confirm in _after_translate).
    separator: str = "\n"
    # List of regions to translate.
    # Example: ["Sheet1!A1:B10", "C:D", "E5"]
    # An entry without a sheet name (e.g. "C:D") applies to every worksheet.
    # If None, all text in the whole file is translated.
    # NOTE(review): an empty list is intended to behave like None as well —
    # confirm that _pre_translate actually honors this.
    translate_regions: Optional[List[str]] = None
class XlsxTranslator(Translator):
@@ -35,25 +41,80 @@ class XlsxTranslator(Translator):
self.translate_agent = SegmentsTranslateAgent(agent_config)
self.insert_mode = config.insert_mode
self.separator = config.separator
# --- 新增功能 ---
self.translate_regions = config.translate_regions
def _pre_translate(self, document: Document):
    """Load the workbook and collect the plain-text cells that need translating.

    When ``self.translate_regions`` is ``None``, empty, or holds only blank
    entries, every worksheet is scanned in full. Otherwise only the listed
    regions are scanned: an entry of the form ``"Sheet!range"`` applies to the
    worksheet with that title, and an entry without ``!`` applies to every
    worksheet. A cell covered by several regions is collected only once.

    Only cells whose value is a ``str`` and whose ``data_type`` is ``"s"`` are
    collected, which leaves formula cells (data type ``"f"``) untouched.

    Args:
        document: Document whose ``content`` holds the bytes of the xlsx file.

    Returns:
        A ``(workbook, cells_to_translate, original_texts)`` tuple: the loaded
        workbook, one dict per collected cell with the keys ``sheet_name``,
        ``coordinate`` and ``original_text``, and the original texts in the
        same order as the dicts.
    """
    workbook = openpyxl.load_workbook(BytesIO(document.content))
    cells_to_translate = []

    def is_plain_text(cell):
        # A str value with data type 's' is literal text; formulas are 'f'.
        return isinstance(cell.value, str) and cell.data_type == "s"

    def record(sheet, cell):
        cells_to_translate.append({
            "sheet_name": sheet.title,
            "coordinate": cell.coordinate,
            "original_text": cell.value,
        })

    def iter_cells(selection):
        # sheet[key] has three shapes: a single cell ("E5"), a flat tuple of
        # cells (one whole row or column such as "C", "C:C" or "5"), or a
        # tuple of tuples ("A1:B10", "C:D", "1:3"). Flatten all of them.
        if not isinstance(selection, tuple):
            yield selection
            return
        for item in selection:
            if isinstance(item, tuple):
                yield from item
            else:
                yield item

    # Blank entries carry no information; an empty result means "no restriction".
    regions = [
        region for region in (self.translate_regions or [])
        if region and region.strip()
    ]

    if not regions:
        # No regions requested: translate every text cell of every worksheet.
        for sheet in workbook.worksheets:
            for row in sheet.iter_rows():
                for cell in row:
                    if is_plain_text(cell):
                        record(sheet, cell)
    else:
        # 1. Split the regions into sheet-specific ones and ones for all sheets.
        regions_by_sheet = {}
        all_sheet_regions = []
        for region in regions:
            if "!" in region:
                sheet_name, cell_range = region.split("!", 1)
                regions_by_sheet.setdefault(sheet_name, []).append(cell_range.strip())
            else:
                all_sheet_regions.append(region.strip())

        # Prevents overlapping regions from yielding the same cell twice.
        processed_coordinates = set()

        # 2. Apply the sheet-specific regions first, then the global ones.
        for sheet in workbook.worksheets:
            ranges_for_sheet = regions_by_sheet.get(sheet.title, []) + all_sheet_regions
            for cell_range in ranges_for_sheet:
                try:
                    selection = sheet[cell_range]
                except Exception as e:
                    # Deliberate best effort: a malformed region is reported
                    # and skipped so the remaining regions are still handled.
                    self.logger.warning(f"跳过无效的区域 '{cell_range}' 在工作表 '{sheet.title}'. 错误: {e}")
                    continue

                # 3. Collect the text cells inside the region.
                for cell in iter_cells(selection):
                    full_coordinate = (sheet.title, cell.coordinate)
                    if full_coordinate in processed_coordinates:
                        continue
                    if is_plain_text(cell):
                        record(sheet, cell)
                        processed_coordinates.add(full_coordinate)

    # Extract the original texts so they can be translated in one batch.
    original_texts = [cell["original_text"] for cell in cells_to_translate]
    return workbook, cells_to_translate, original_texts
@@ -88,7 +149,7 @@ class XlsxTranslator(Translator):
workbook, cells_to_translate, original_texts = self._pre_translate(document)
if not cells_to_translate:
print("\n文件中没有找到需要翻译的纯文本内容。")
print("\n在指定区域中没有找到需要翻译的纯文本内容。")
workbook.close()
return self
# --- 步骤 2: 调用翻译函数 ---
@@ -101,7 +162,7 @@ class XlsxTranslator(Translator):
workbook, cells_to_translate, original_texts = await asyncio.to_thread(self._pre_translate, document)
if not cells_to_translate:
print("\n文件中没有找到需要翻译的纯文本内容。")
print("\n在指定区域中没有找到需要翻译的纯文本内容。")
workbook.close()
return self
# --- 步骤 2: 调用翻译函数 ---
@@ -110,4 +171,3 @@ class XlsxTranslator(Translator):
document.content = await asyncio.to_thread(self._after_translate, workbook, cells_to_translate,
translated_texts, original_texts)
return self