修复部分文本未翻译问题

This commit is contained in:
xunbu
2025-12-07 21:12:48 +08:00
parent 99c5d2c7c7
commit e9de5c9ab9

View File

@@ -5,10 +5,9 @@ from dataclasses import dataclass
from io import BytesIO from io import BytesIO
from typing import Self, Literal, List, Optional, Dict, Tuple, Set from typing import Self, Literal, List, Optional, Dict, Tuple, Set
import zipfile import zipfile
import re # 引入正则用于解析简写 import re
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
# 仅导入 openpyxl 的工具函数用于坐标计算,不加载 workbook 对象
from openpyxl.utils.cell import coordinate_to_tuple, range_boundaries from openpyxl.utils.cell import coordinate_to_tuple, range_boundaries
from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent
@@ -21,9 +20,6 @@ class XlsxTranslatorConfig(AiTranslatorConfig):
insert_mode: Literal["replace", "append", "prepend"] = "replace" insert_mode: Literal["replace", "append", "prepend"] = "replace"
separator: str = "\n" separator: str = "\n"
# 指定翻译区域列表。 # 指定翻译区域列表。
# 示例: ["Sheet1!A1:B10", "C", "3"] (支持简写: C代表C列, 3代表第3行)
# 如果不指定表名 (如 "C"),则应用于所有表。
# 如果为 None 或空列表,则翻译整个文件中的所有文本。
translate_regions: Optional[List[str]] = None translate_regions: Optional[List[str]] = None
@@ -54,44 +50,81 @@ class XlsxTranslator(AiTranslator):
self.separator = config.separator self.separator = config.separator
self.translate_regions = config.translate_regions self.translate_regions = config.translate_regions
# 命名空间定义 # 我们虽然不依赖它查找,但写入新节点时最好还是带上标准命名空间
self.ns = { self.NS_MAIN = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'
'main': 'http://schemas.openxmlformats.org/spreadsheetml/2006/main', ET.register_namespace('', self.NS_MAIN)
'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'
}
ET.register_namespace('', self.ns['main'])
# ========================================================================= # =========================================================================
# 辅助方法:无需加载 Workbook 即可解析结构 # 核心辅助方法:忽略命名空间查找
# =========================================================================
def _tag_is(self, elem: ET.Element, tag_name: str) -> bool:
"""判断元素的标签名是否匹配(忽略命名空间)。"""
return elem.tag.endswith(f"}}{tag_name}") or elem.tag == tag_name
def _find_child(self, parent: ET.Element, tag_name: str) -> Optional[ET.Element]:
"""在直接子节点中查找指定标签(忽略命名空间)。"""
for child in parent:
if self._tag_is(child, tag_name):
return child
return None
def _find_all_children(self, parent: ET.Element, tag_name: str) -> List[ET.Element]:
"""查找所有匹配的直接子节点(忽略命名空间)。"""
return [child for child in parent if self._tag_is(child, tag_name)]
def _get_child_text(self, parent: ET.Element, tag_name: str) -> Optional[str]:
"""获取子节点的文本内容(忽略命名空间)。"""
child = self._find_child(parent, tag_name)
return child.text if child is not None else None
# =========================================================================
# 辅助逻辑
# ========================================================================= # =========================================================================
def _get_shared_strings(self, zf: zipfile.ZipFile) -> List[str]: def _get_shared_strings(self, zf: zipfile.ZipFile) -> List[str]:
"""解析共享字符串表,返回字符串列表。"""
if "xl/sharedStrings.xml" not in zf.namelist(): if "xl/sharedStrings.xml" not in zf.namelist():
return [] return []
shared_strings = [] shared_strings = []
with zf.open("xl/sharedStrings.xml") as f: with zf.open("xl/sharedStrings.xml") as f:
context = ET.iterparse(f, events=("end",)) context = ET.iterparse(f, events=("end",))
for event, elem in context: for event, elem in context:
if elem.tag.endswith('}si'): # shared item # 匹配 <si>
texts = [t.text for t in elem.findall('.//main:t', self.ns) if t.text] if self._tag_is(elem, "si"):
# 查找所有 <t>
# 注意xml.etree 的 findall 只能简单的路径查找,
# 既然我们要忽略命名空间,最好手动遍历子树,但 si 结构简单,
# 这里简化处理:直接遍历 iter 出来的 t 元素(如果在 si 内部)
pass
# 由于 iterparse 是扁平流,很难直接关联 si 和 t。
# 更稳妥的方式是:当 elem 是 si 时,遍历 elem 的 children
texts = []
for child in elem.iter():
if self._tag_is(child, "t") and child.text:
texts.append(child.text)
shared_strings.append("".join(texts)) shared_strings.append("".join(texts))
elem.clear() elem.clear()
return shared_strings return shared_strings
def _get_sheet_mapping(self, zf: zipfile.ZipFile) -> Dict[str, str]: def _get_sheet_mapping(self, zf: zipfile.ZipFile) -> Dict[str, str]:
""" """建立 SheetName -> ZipFilename 的映射(稳健版)"""
获取 Sheet 名称到文件路径的映射。
例如: {'Sheet1': 'xl/worksheets/sheet1.xml'}
"""
sheet_name_to_rid = {} sheet_name_to_rid = {}
try: try:
with zf.open("xl/workbook.xml") as f: with zf.open("xl/workbook.xml") as f:
root = ET.fromstring(f.read()) root = ET.fromstring(f.read())
for sheet in root.findall(".//main:sheet", self.ns): # 查找所有 sheet 标签
# 需递归查找,因为 sheet 通常在 sheets 节点下
for sheet in root.iter():
if self._tag_is(sheet, "sheet"):
name = sheet.get("name") name = sheet.get("name")
rid = sheet.get(f"{{{self.ns['r']}}}id") # id 的属性名通常带有 r: 前缀,这很难忽略命名空间,
# 但 openpyxl 规范里 id 属性几乎总是依赖 relationships 命名空间
# 这里我们尝试遍历属性找到 key 包含 id 的
rid = None
for k, v in sheet.attrib.items():
if k.endswith("id"): # 匹配 r:id
rid = v
break
if name and rid: if name and rid:
sheet_name_to_rid[name] = rid sheet_name_to_rid[name] = rid
except Exception: except Exception:
@@ -100,13 +133,16 @@ class XlsxTranslator(AiTranslator):
rid_to_target = {} rid_to_target = {}
try: try:
with zf.open("xl/_rels/workbook.xml.rels") as f: with zf.open("xl/_rels/workbook.xml.rels") as f:
tree = ET.parse(f) root = ET.fromstring(f.read())
root = tree.getroot()
for child in root: for child in root:
# Relationships 里的 tag 也是 Relationship
rid = child.get("Id") rid = child.get("Id")
target = child.get("Target") target = child.get("Target")
if rid and target: if rid and target:
if not target.startswith("/"): target = target.replace("\\", "/")
if target.startswith("/"):
target = target.lstrip("/")
else:
target = "xl/" + target target = "xl/" + target
rid_to_target[rid] = target rid_to_target[rid] = target
except Exception: except Exception:
@@ -116,50 +152,32 @@ class XlsxTranslator(AiTranslator):
for name, rid in sheet_name_to_rid.items(): for name, rid in sheet_name_to_rid.items():
if rid in rid_to_target: if rid in rid_to_target:
mapping[name] = rid_to_target[rid] mapping[name] = rid_to_target[rid]
return mapping return mapping
def _normalize_range(self, range_str: str) -> str: def _normalize_range(self, range_str: str) -> str:
"""
将用户输入的简写转换为标准范围格式。
"C" -> "C:C"
"3" -> "3:3"
"A1" -> "A1:A1" (openpyxl range_boundaries 实际上支持 A1但这里统一处理更安全)
"""
# 纯字母 (例如 "C", "AA") -> 整列
if re.match(r"^[A-Za-z]+$", range_str): if re.match(r"^[A-Za-z]+$", range_str):
return f"{range_str}:{range_str}" return f"{range_str}:{range_str}"
# 纯数字 (例如 "3", "10") -> 整行
if re.match(r"^\d+$", range_str): if re.match(r"^\d+$", range_str):
return f"{range_str}:{range_str}" return f"{range_str}:{range_str}"
return range_str return range_str
def _parse_region_boundaries(self, sheet_mapping: Dict[str, str]) -> Dict[str, List[Tuple]]: def _parse_region_boundaries(self, sheet_mapping: Dict[str, str]) -> Dict[str, List[Tuple]]:
"""
解析配置的 translate_regions。
返回: { 'xl/worksheets/sheet1.xml': [(min_col, min_row, max_col, max_row), ...], ... }
"""
if not self.translate_regions: if not self.translate_regions:
return {} return {}
region_map = {}
region_map = {} # filename -> list of boundaries global_regions = []
global_regions = [] # list of boundaries for all sheets
for region in self.translate_regions: for region in self.translate_regions:
sheet_name = None sheet_name = None
raw_range = region.strip() raw_range = region.strip()
if "!" in raw_range: if "!" in raw_range:
parts = raw_range.split("!", 1) parts = raw_range.split("!", 1)
sheet_name = parts[0].strip("'") sheet_name = parts[0].strip("'")
range_part = self._normalize_range(parts[1]) range_part = self._normalize_range(parts[1])
else: else:
range_part = self._normalize_range(raw_range) range_part = self._normalize_range(raw_range)
try: try:
# boundaries: (min_col, min_row, max_col, max_row)
boundaries = range_boundaries(range_part) boundaries = range_boundaries(range_part)
if sheet_name: if sheet_name:
filename = sheet_mapping.get(sheet_name) filename = sheet_mapping.get(sheet_name)
if filename: if filename:
@@ -171,18 +189,15 @@ class XlsxTranslator(AiTranslator):
except Exception as e: except Exception as e:
self.logger.warning(f"无法解析区域 '{region}': {e}") self.logger.warning(f"无法解析区域 '{region}': {e}")
# 将全局区域添加到所有已知 Sheet
if global_regions: if global_regions:
all_files = set(sheet_mapping.values()) target_files = set(sheet_mapping.values())
for f in all_files: for f in target_files:
if f not in region_map: if f not in region_map:
region_map[f] = [] region_map[f] = []
region_map[f].extend(global_regions) region_map[f].extend(global_regions)
return region_map return region_map
def _is_in_boundaries(self, col: int, row: int, boundaries_list: List[Tuple]) -> bool: def _is_in_boundaries(self, col: int, row: int, boundaries_list: List[Tuple]) -> bool:
"""检查坐标 (col, row) 是否在给定的边界列表中。"""
for (min_col, min_row, max_col, max_row) in boundaries_list: for (min_col, min_row, max_col, max_row) in boundaries_list:
if min_col is not None and col < min_col: continue if min_col is not None and col < min_col: continue
if min_row is not None and row < min_row: continue if min_row is not None and row < min_row: continue
@@ -191,20 +206,30 @@ class XlsxTranslator(AiTranslator):
return True return True
return False return False
def _apply_insert_mode(self, original: str, translated: str) -> str:
if self.insert_mode == "append":
return f"{original}{self.separator}{translated}"
elif self.insert_mode == "prepend":
return f"{translated}{self.separator}{original}"
else:
return translated
# ========================================================================= # =========================================================================
# 高效 XML 区域提取与重构 # 区域处理 (使用 Helper 方法,完全解耦命名空间)
# ========================================================================= # =========================================================================
def _get_texts_xml_regions(self, document: Document) -> List[str]: def _get_texts_xml_regions(self, document: Document) -> List[str]:
"""使用纯 XML 解析(结合 SharedStrings提取指定区域文本。"""
texts_to_translate = set() texts_to_translate = set()
with zipfile.ZipFile(BytesIO(document.content), 'r') as zf: with zipfile.ZipFile(BytesIO(document.content), 'r') as zf:
shared_strings = self._get_shared_strings(zf) shared_strings = self._get_shared_strings(zf)
if not shared_strings:
return []
sheet_mapping = self._get_sheet_mapping(zf) sheet_mapping = self._get_sheet_mapping(zf)
if not sheet_mapping:
all_sheets = [n for n in zf.namelist() if n.startswith("xl/worksheets/sheet") and n.endswith(".xml")]
for s in all_sheets:
sheet_mapping[f"Unknown_{s}"] = s
boundaries_map = self._parse_region_boundaries(sheet_mapping) boundaries_map = self._parse_region_boundaries(sheet_mapping)
for filename, boundaries in boundaries_map.items(): for filename, boundaries in boundaries_map.items():
@@ -214,79 +239,107 @@ class XlsxTranslator(AiTranslator):
with zf.open(filename) as f: with zf.open(filename) as f:
context = ET.iterparse(f, events=("end",)) context = ET.iterparse(f, events=("end",))
for event, elem in context: for event, elem in context:
if elem.tag.endswith('}c'): # Cell # 匹配 <c>
r_attr = elem.get('r') # e.g. "C5" if self._tag_is(elem, "c"):
t_attr = elem.get('t') # e.g. "s" r_attr = elem.get('r')
t_attr = elem.get('t')
if r_attr and t_attr == 's': if r_attr:
try: try:
# 【修正】coordinate_to_tuple 返回 (row, col)
row, col = coordinate_to_tuple(r_attr) row, col = coordinate_to_tuple(r_attr)
if self._is_in_boundaries(col, row, boundaries): if self._is_in_boundaries(col, row, boundaries):
v_node = elem.find('main:v', self.ns) text_found = None
if v_node is not None and v_node.text:
idx = int(v_node.text) # Shared String
if t_attr == 's':
v_text = self._get_child_text(elem, "v")
if v_text:
idx = int(v_text)
if 0 <= idx < len(shared_strings): if 0 <= idx < len(shared_strings):
texts_to_translate.add(shared_strings[idx]) text_found = shared_strings[idx]
# Inline String
elif t_attr == 'inlineStr':
is_node = self._find_child(elem, "is")
if is_node is not None:
t_text = self._get_child_text(is_node, "t")
if t_text:
text_found = t_text
if text_found:
texts_to_translate.add(text_found)
except Exception: except Exception:
pass pass
elem.clear() elem.clear()
elif self._tag_is(elem, "row"):
elem.clear()
return list(texts_to_translate) return list(texts_to_translate)
def _rebuild_xml_regions(self, original_content_bytes: bytes, translation_map: dict) -> bytes: def _rebuild_xml_regions(self, original_content_bytes: bytes, translation_map: dict) -> bytes:
"""使用纯 XML 重构,修正了坐标解包顺序。"""
output_zip_io = BytesIO() output_zip_io = BytesIO()
with zipfile.ZipFile(BytesIO(original_content_bytes), 'r') as zf_in: with zipfile.ZipFile(BytesIO(original_content_bytes), 'r') as zf_in:
with zipfile.ZipFile(output_zip_io, 'w', zipfile.ZIP_DEFLATED) as zf_out: with zipfile.ZipFile(output_zip_io, 'w', zipfile.ZIP_DEFLATED) as zf_out:
shared_strings = self._get_shared_strings(zf_in) shared_strings = self._get_shared_strings(zf_in)
sheet_mapping = self._get_sheet_mapping(zf_in) sheet_mapping = self._get_sheet_mapping(zf_in)
if not sheet_mapping:
all_sheets = [n for n in zf_in.namelist() if
n.startswith("xl/worksheets/sheet") and n.endswith(".xml")]
for s in all_sheets:
sheet_mapping[f"Unknown_{s}"] = s
boundaries_map = self._parse_region_boundaries(sheet_mapping) boundaries_map = self._parse_region_boundaries(sheet_mapping)
for item in zf_in.infolist(): for item in zf_in.infolist():
if item.filename in boundaries_map: if item.filename in boundaries_map:
boundaries = boundaries_map[item.filename] boundaries = boundaries_map[item.filename]
with zf_in.open(item.filename) as f: with zf_in.open(item.filename) as f:
tree = ET.parse(f) tree = ET.parse(f)
root = tree.getroot() root = tree.getroot()
cells_modified = False cells_modified = False
for cell in root.findall(".//main:c", self.ns):
# 查找所有 <c>
# 因为 findall 不支持复杂的 tag.endswith这里我们遍历所有节点
# 对于 parse 加载的树iter() 是高效的
for cell in root.iter():
if not self._tag_is(cell, "c"):
continue
r_attr = cell.get('r') r_attr = cell.get('r')
t_attr = cell.get('t') t_attr = cell.get('t')
if r_attr:
if r_attr and t_attr == 's':
try: try:
# 【修正】coordinate_to_tuple 返回 (row, col)
row, col = coordinate_to_tuple(r_attr) row, col = coordinate_to_tuple(r_attr)
if self._is_in_boundaries(col, row, boundaries): if self._is_in_boundaries(col, row, boundaries):
v_node = cell.find('main:v', self.ns) original_text = None
if v_node is not None and v_node.text:
idx = int(v_node.text) if t_attr == 's':
v_text = self._get_child_text(cell, "v")
if v_text:
idx = int(v_text)
if 0 <= idx < len(shared_strings): if 0 <= idx < len(shared_strings):
original_text = shared_strings[idx] original_text = shared_strings[idx]
elif t_attr == 'inlineStr':
is_node = self._find_child(cell, "is")
if is_node is not None:
original_text = self._get_child_text(is_node, "t")
if original_text in translation_map: if original_text and original_text in translation_map:
translated_text = translation_map[original_text] final_text = self._apply_insert_mode(original_text,
translation_map[original_text])
final_text = translated_text # 清空旧内容
if self.insert_mode == "append": for child in list(cell):
final_text = original_text + self.separator + translated_text cell.remove(child)
elif self.insert_mode == "prepend":
final_text = translated_text + self.separator + original_text
# 转换为 inlineStr # 写入新内容 (这里必须使用带有命名空间的标签名否则Excel不认)
cell.set('t', 'inlineStr') cell.set('t', 'inlineStr')
cell.remove(v_node) # 注意:写入时使用 self.NS_MAIN 是必须的,因为这是标准
is_node = ET.Element(f"{{{self.ns['main']}}}is") is_node = ET.Element(f"{{{self.NS_MAIN}}}is")
t_node = ET.SubElement(is_node, f"{{{self.ns['main']}}}t") t_node = ET.SubElement(is_node, f"{{{self.NS_MAIN}}}t")
t_node.text = final_text t_node.text = final_text
cell.append(is_node) cell.append(is_node)
cells_modified = True cells_modified = True
except Exception: except Exception:
pass pass
@@ -298,11 +351,10 @@ class XlsxTranslator(AiTranslator):
zf_out.writestr(item, zf_in.read(item.filename)) zf_out.writestr(item, zf_in.read(item.filename))
else: else:
zf_out.writestr(item, zf_in.read(item.filename)) zf_out.writestr(item, zf_in.read(item.filename))
return output_zip_io.getvalue() return output_zip_io.getvalue()
# ========================================================================= # =========================================================================
# 原有全文档逻辑 (针对全文档翻译保持极致速度) # 全文档处理 (同样使用 Helper)
# ========================================================================= # =========================================================================
def _get_texts_xml_all(self, document: Document) -> List[str]: def _get_texts_xml_all(self, document: Document) -> List[str]:
@@ -311,20 +363,39 @@ class XlsxTranslator(AiTranslator):
with zipfile.ZipFile(BytesIO(document.content), 'r') as zf: with zipfile.ZipFile(BytesIO(document.content), 'r') as zf:
if "xl/sharedStrings.xml" in zf.namelist(): if "xl/sharedStrings.xml" in zf.namelist():
with zf.open("xl/sharedStrings.xml") as f: with zf.open("xl/sharedStrings.xml") as f:
root = ET.fromstring(f.read()) context = ET.iterparse(f, events=("end",))
for node in root.findall('.//main:t', self.ns): for event, elem in context:
if node.text and node.text.strip(): if self._tag_is(elem, "t"):
texts_to_translate.add(node.text) if elem.text and elem.text.strip():
texts_to_translate.add(elem.text)
elem.clear()
sheet_files = [n for n in zf.namelist() if n.startswith("xl/worksheets/sheet") and n.endswith(".xml")]
for sheet_file in sheet_files:
with zf.open(sheet_file) as f:
context = ET.iterparse(f, events=("end",))
for event, elem in context:
if self._tag_is(elem, "c"):
if elem.get('t') == 'inlineStr':
is_node = self._find_child(elem, "is")
if is_node is not None:
t_text = self._get_child_text(is_node, "t")
if t_text and t_text.strip():
texts_to_translate.add(t_text)
elem.clear()
elif self._tag_is(elem, "row"):
elem.clear()
for item in zf.infolist(): for item in zf.infolist():
if item.filename.startswith("xl/tables/table"): if item.filename.startswith("xl/tables/table"):
with zf.open(item.filename) as f: with zf.open(item.filename) as f:
root = ET.fromstring(f.read()) root = ET.fromstring(f.read())
for col in root.findall('.//main:tableColumn', self.ns): for col in root.iter():
if self._tag_is(col, "tableColumn"):
if col.get('name'): if col.get('name'):
texts_to_translate.add(col.get('name')) texts_to_translate.add(col.get('name'))
except Exception as e: except Exception as e:
self.logger.error(f"XML解析失败: {e}") self.logger.error(f"XML解析失败: {e}", exc_info=True)
return list(texts_to_translate) return list(texts_to_translate)
def _rebuild_xml_all(self, original_content_bytes: bytes, translation_map: dict) -> bytes: def _rebuild_xml_all(self, original_content_bytes: bytes, translation_map: dict) -> bytes:
@@ -337,41 +408,54 @@ class XlsxTranslator(AiTranslator):
if item.filename == "xl/sharedStrings.xml": if item.filename == "xl/sharedStrings.xml":
root = ET.fromstring(content) root = ET.fromstring(content)
for node in root.findall('.//main:t', self.ns): modified = False
for node in root.iter():
if self._tag_is(node, "t"):
if node.text in translation_map: if node.text in translation_map:
trans = translation_map[node.text] node.text = self._apply_insert_mode(node.text, translation_map[node.text])
if self.insert_mode == "append": modified = True
node.text = node.text + self.separator + trans if modified:
elif self.insert_mode == "prepend": zf_out.writestr(item, ET.tostring(root, encoding='utf-8', xml_declaration=True))
node.text = trans + self.separator + node.text
else: else:
node.text = trans zf_out.writestr(item, content)
content = ET.tostring(root, encoding='utf-8', xml_declaration=True)
elif item.filename.startswith("xl/worksheets/sheet") and item.filename.endswith(".xml"):
root = ET.fromstring(content)
modified = False
for cell in root.iter():
if self._tag_is(cell, "c") and cell.get('t') == 'inlineStr':
is_node = self._find_child(cell, "is")
if is_node is not None:
t_node = self._find_child(is_node, "t")
if t_node is not None and t_node.text in translation_map:
t_node.text = self._apply_insert_mode(t_node.text,
translation_map[t_node.text])
modified = True
if modified:
zf_out.writestr(item, ET.tostring(root, encoding='utf-8', xml_declaration=True))
else:
zf_out.writestr(item, content)
elif item.filename.startswith("xl/tables/table"): elif item.filename.startswith("xl/tables/table"):
root = ET.fromstring(content) root = ET.fromstring(content)
for col in root.findall('.//main:tableColumn', self.ns): modified = False
for col in root.iter():
if self._tag_is(col, "tableColumn"):
orig = col.get('name') orig = col.get('name')
if orig in translation_map: if orig in translation_map:
trans = translation_map[orig] col.set('name', self._apply_insert_mode(orig, translation_map[orig]))
if self.insert_mode == "append": modified = True
col.set('name', orig + self.separator + trans) if modified:
elif self.insert_mode == "prepend": zf_out.writestr(item, ET.tostring(root, encoding='utf-8', xml_declaration=True))
col.set('name', trans + self.separator + orig) else:
zf_out.writestr(item, content)
else: else:
col.set('name', trans)
content = ET.tostring(root, encoding='utf-8', xml_declaration=True)
zf_out.writestr(item, content) zf_out.writestr(item, content)
return output_zip_io.getvalue() return output_zip_io.getvalue()
except Exception as e: except Exception as e:
self.logger.error(f"XML重构失败: {e}") self.logger.error(f"XML重构失败: {e}", exc_info=True)
return original_content_bytes return original_content_bytes
# =========================================================================
# 主入口
# =========================================================================
def translate(self, document: Document) -> Self: def translate(self, document: Document) -> Self:
if self.translate_regions: if self.translate_regions:
original_texts = self._get_texts_xml_regions(document) original_texts = self._get_texts_xml_regions(document)