修复部分文本未翻译问题

2025-12-07 21:12:48 +08:00
parent 99c5d2c7c7
commit e9de5c9ab9
1 changed files with 220 additions and 136 deletions
--- a/docutranslate/translator/ai_translator/xlsx_translator.py
+++ b/docutranslate/translator/ai_translator/xlsx_translator.py
@@ -5,10 +5,9 @@ from dataclasses import dataclass
 from io import BytesIO
 from typing import Self, Literal, List, Optional, Dict, Tuple, Set
 import zipfile
-import re  # 引入正则用于解析简写
+import re
 import xml.etree.ElementTree as ET
 # 仅导入 openpyxl 的工具函数用于坐标计算，不加载 workbook 对象
 from openpyxl.utils.cell import coordinate_to_tuple, range_boundaries
 from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent
@@ -21,9 +20,6 @@ class XlsxTranslatorConfig(AiTranslatorConfig):
    insert_mode: Literal["replace", "append", "prepend"] = "replace"
    separator: str = "\n"
    # 指定翻译区域列表。
    # 示例: ["Sheet1!A1:B10", "C", "3"] (支持简写: C代表C列, 3代表第3行)
    # 如果不指定表名 (如 "C")，则应用于所有表。
    # 如果为 None 或空列表，则翻译整个文件中的所有文本。
    translate_regions: Optional[List[str]] = None
@@ -54,44 +50,81 @@ class XlsxTranslator(AiTranslator):
        self.separator = config.separator
        self.translate_regions = config.translate_regions
-        # 命名空间定义
+        # 我们虽然不依赖它查找，但写入新节点时最好还是带上标准命名空间
-        self.ns = {
+        self.NS_MAIN = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'
-            'main': 'http://schemas.openxmlformats.org/spreadsheetml/2006/main',
+        ET.register_namespace('', self.NS_MAIN)
            'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'
        }
        ET.register_namespace('', self.ns['main'])
    # =========================================================================
-    # 辅助方法：无需加载 Workbook 即可解析结构
+    # 核心辅助方法：忽略命名空间查找
    # =========================================================================
    def _tag_is(self, elem: ET.Element, tag_name: str) -> bool:
        """判断元素的标签名是否匹配（忽略命名空间）。"""
        return elem.tag.endswith(f"}}{tag_name}") or elem.tag == tag_name
    def _find_child(self, parent: ET.Element, tag_name: str) -> Optional[ET.Element]:
        """在直接子节点中查找指定标签（忽略命名空间）。"""
        for child in parent:
            if self._tag_is(child, tag_name):
                return child
        return None
    def _find_all_children(self, parent: ET.Element, tag_name: str) -> List[ET.Element]:
        """查找所有匹配的直接子节点（忽略命名空间）。"""
        return [child for child in parent if self._tag_is(child, tag_name)]
    def _get_child_text(self, parent: ET.Element, tag_name: str) -> Optional[str]:
        """获取子节点的文本内容（忽略命名空间）。"""
        child = self._find_child(parent, tag_name)
        return child.text if child is not None else None
    # =========================================================================
    # 辅助逻辑
    # =========================================================================
    def _get_shared_strings(self, zf: zipfile.ZipFile) -> List[str]:
        """解析共享字符串表，返回字符串列表。"""
        if "xl/sharedStrings.xml" not in zf.namelist():
            return []
        shared_strings = []
        with zf.open("xl/sharedStrings.xml") as f:
            context = ET.iterparse(f, events=("end",))
            for event, elem in context:
-                if elem.tag.endswith('}si'):  # shared item
+                # 匹配 <si>
-                    texts = [t.text for t in elem.findall('.//main:t', self.ns) if t.text]
+                if self._tag_is(elem, "si"):
                    # 查找所有 <t>
                    # 注意：xml.etree 的 findall 只能简单的路径查找，
                    # 既然我们要忽略命名空间，最好手动遍历子树，但 si 结构简单，
                    # 这里简化处理：直接遍历 iter 出来的 t 元素（如果在 si 内部）
                    pass
                    # 由于 iterparse 是扁平流，很难直接关联 si 和 t。
                    # 更稳妥的方式是：当 elem 是 si 时，遍历 elem 的 children
                    texts = []
                    for child in elem.iter():
                        if self._tag_is(child, "t") and child.text:
                            texts.append(child.text)
                    shared_strings.append("".join(texts))
                    elem.clear()
        return shared_strings
    def _get_sheet_mapping(self, zf: zipfile.ZipFile) -> Dict[str, str]:
-        """
+        """建立 SheetName -> ZipFilename 的映射（稳健版）"""
        获取 Sheet 名称到文件路径的映射。
        例如: {'Sheet1': 'xl/worksheets/sheet1.xml'}
        """
        sheet_name_to_rid = {}
        try:
            with zf.open("xl/workbook.xml") as f:
                root = ET.fromstring(f.read())
-                for sheet in root.findall(".//main:sheet", self.ns):
+                # 查找所有 sheet 标签
                # 需递归查找，因为 sheet 通常在 sheets 节点下
                for sheet in root.iter():
                    if self._tag_is(sheet, "sheet"):
                        name = sheet.get("name")
-                    rid = sheet.get(f"{{{self.ns['r']}}}id")
+                        # id 的属性名通常带有 r: 前缀，这很难忽略命名空间，
                        # 但 openpyxl 规范里 id 属性几乎总是依赖 relationships 命名空间
                        # 这里我们尝试遍历属性找到 key 包含 id 的
                        rid = None
                        for k, v in sheet.attrib.items():
                            if k.endswith("id"):  # 匹配 r:id
                                rid = v
                                break
                        if name and rid:
                            sheet_name_to_rid[name] = rid
        except Exception:
@@ -100,13 +133,16 @@ class XlsxTranslator(AiTranslator):
        rid_to_target = {}
        try:
            with zf.open("xl/_rels/workbook.xml.rels") as f:
-                tree = ET.parse(f)
+                root = ET.fromstring(f.read())
                root = tree.getroot()
                for child in root:
                    # Relationships 里的 tag 也是 Relationship
                    rid = child.get("Id")
                    target = child.get("Target")
                    if rid and target:
-                        if not target.startswith("/"):
+                        target = target.replace("\\", "/")
                        if target.startswith("/"):
                            target = target.lstrip("/")
                        else:
                            target = "xl/" + target
                        rid_to_target[rid] = target
        except Exception:
@@ -116,50 +152,32 @@ class XlsxTranslator(AiTranslator):
        for name, rid in sheet_name_to_rid.items():
            if rid in rid_to_target:
                mapping[name] = rid_to_target[rid]
        return mapping
    def _normalize_range(self, range_str: str) -> str:
        """
        将用户输入的简写转换为标准范围格式。
        "C" -> "C:C"
        "3" -> "3:3"
        "A1" -> "A1:A1" (openpyxl range_boundaries 实际上支持 A1，但这里统一处理更安全)
        """
        # 纯字母 (例如 "C", "AA") -> 整列
        if re.match(r"^[A-Za-z]+$", range_str):
            return f"{range_str}:{range_str}"
        # 纯数字 (例如 "3", "10") -> 整行
        if re.match(r"^\d+$", range_str):
            return f"{range_str}:{range_str}"
        return range_str
    def _parse_region_boundaries(self, sheet_mapping: Dict[str, str]) -> Dict[str, List[Tuple]]:
        """
        解析配置的 translate_regions。
        返回: { 'xl/worksheets/sheet1.xml': [(min_col, min_row, max_col, max_row), ...], ... }
        """
        if not self.translate_regions:
            return {}
-
+        region_map = {}
-        region_map = {}  # filename -> list of boundaries
+        global_regions = []
        global_regions = []  # list of boundaries for all sheets
        for region in self.translate_regions:
            sheet_name = None
            raw_range = region.strip()
            if "!" in raw_range:
                parts = raw_range.split("!", 1)
                sheet_name = parts[0].strip("'")
                range_part = self._normalize_range(parts[1])
            else:
                range_part = self._normalize_range(raw_range)
            try:
                # boundaries: (min_col, min_row, max_col, max_row)
                boundaries = range_boundaries(range_part)
                if sheet_name:
                    filename = sheet_mapping.get(sheet_name)
                    if filename:
@@ -171,18 +189,15 @@ class XlsxTranslator(AiTranslator):
            except Exception as e:
                self.logger.warning(f"无法解析区域 '{region}': {e}")
        # 将全局区域添加到所有已知 Sheet
        if global_regions:
-            all_files = set(sheet_mapping.values())
+            target_files = set(sheet_mapping.values())
-            for f in all_files:
+            for f in target_files:
                if f not in region_map:
                    region_map[f] = []
                region_map[f].extend(global_regions)
        return region_map
    def _is_in_boundaries(self, col: int, row: int, boundaries_list: List[Tuple]) -> bool:
        """检查坐标 (col, row) 是否在给定的边界列表中。"""
        for (min_col, min_row, max_col, max_row) in boundaries_list:
            if min_col is not None and col < min_col: continue
            if min_row is not None and row < min_row: continue
@@ -191,20 +206,30 @@ class XlsxTranslator(AiTranslator):
            return True
        return False
    def _apply_insert_mode(self, original: str, translated: str) -> str:
        if self.insert_mode == "append":
            return f"{original}{self.separator}{translated}"
        elif self.insert_mode == "prepend":
            return f"{translated}{self.separator}{original}"
        else:
            return translated
    # =========================================================================
-    # 高效 XML 区域提取与重构
+    # 区域处理 (使用 Helper 方法，完全解耦命名空间)
    # =========================================================================
    def _get_texts_xml_regions(self, document: Document) -> List[str]:
        """使用纯 XML 解析（结合 SharedStrings）提取指定区域文本。"""
        texts_to_translate = set()
        with zipfile.ZipFile(BytesIO(document.content), 'r') as zf:
            shared_strings = self._get_shared_strings(zf)
            if not shared_strings:
                return []
            sheet_mapping = self._get_sheet_mapping(zf)
            if not sheet_mapping:
                all_sheets = [n for n in zf.namelist() if n.startswith("xl/worksheets/sheet") and n.endswith(".xml")]
                for s in all_sheets:
                    sheet_mapping[f"Unknown_{s}"] = s
            boundaries_map = self._parse_region_boundaries(sheet_mapping)
            for filename, boundaries in boundaries_map.items():
@@ -214,79 +239,107 @@ class XlsxTranslator(AiTranslator):
                with zf.open(filename) as f:
                    context = ET.iterparse(f, events=("end",))
                    for event, elem in context:
-                        if elem.tag.endswith('}c'):  # Cell
+                        # 匹配 <c>
-                            r_attr = elem.get('r')  # e.g. "C5"
+                        if self._tag_is(elem, "c"):
-                            t_attr = elem.get('t')  # e.g. "s"
+                            r_attr = elem.get('r')
                            t_attr = elem.get('t')
-                            if r_attr and t_attr == 's':
+                            if r_attr:
                                try:
                                    # 【修正】coordinate_to_tuple 返回 (row, col)
                                    row, col = coordinate_to_tuple(r_attr)
                                    if self._is_in_boundaries(col, row, boundaries):
-                                        v_node = elem.find('main:v', self.ns)
+                                        text_found = None
-                                        if v_node is not None and v_node.text:
+
-                                            idx = int(v_node.text)
+                                        # Shared String
                                        if t_attr == 's':
                                            v_text = self._get_child_text(elem, "v")
                                            if v_text:
                                                idx = int(v_text)
                                                if 0 <= idx < len(shared_strings):
-                                                texts_to_translate.add(shared_strings[idx])
+                                                    text_found = shared_strings[idx]
                                        # Inline String
                                        elif t_attr == 'inlineStr':
                                            is_node = self._find_child(elem, "is")
                                            if is_node is not None:
                                                t_text = self._get_child_text(is_node, "t")
                                                if t_text:
                                                    text_found = t_text
                                        if text_found:
                                            texts_to_translate.add(text_found)
                                except Exception:
                                    pass
                            elem.clear()
                        elif self._tag_is(elem, "row"):
                            elem.clear()
        return list(texts_to_translate)
    def _rebuild_xml_regions(self, original_content_bytes: bytes, translation_map: dict) -> bytes:
        """使用纯 XML 重构，修正了坐标解包顺序。"""
        output_zip_io = BytesIO()
        with zipfile.ZipFile(BytesIO(original_content_bytes), 'r') as zf_in:
            with zipfile.ZipFile(output_zip_io, 'w', zipfile.ZIP_DEFLATED) as zf_out:
                shared_strings = self._get_shared_strings(zf_in)
                sheet_mapping = self._get_sheet_mapping(zf_in)
                if not sheet_mapping:
                    all_sheets = [n for n in zf_in.namelist() if
                                  n.startswith("xl/worksheets/sheet") and n.endswith(".xml")]
                    for s in all_sheets:
                        sheet_mapping[f"Unknown_{s}"] = s
                boundaries_map = self._parse_region_boundaries(sheet_mapping)
                for item in zf_in.infolist():
                    if item.filename in boundaries_map:
                        boundaries = boundaries_map[item.filename]
                        with zf_in.open(item.filename) as f:
                            tree = ET.parse(f)
                            root = tree.getroot()
                            cells_modified = False
-                            for cell in root.findall(".//main:c", self.ns):
+
                            # 查找所有 <c>
                            # 因为 findall 不支持复杂的 tag.endswith，这里我们遍历所有节点
                            # 对于 parse 加载的树，iter() 是高效的
                            for cell in root.iter():
                                if not self._tag_is(cell, "c"):
                                    continue
                                r_attr = cell.get('r')
                                t_attr = cell.get('t')
-
+                                if r_attr:
                                if r_attr and t_attr == 's':
                                    try:
                                        # 【修正】coordinate_to_tuple 返回 (row, col)
                                        row, col = coordinate_to_tuple(r_attr)
                                        if self._is_in_boundaries(col, row, boundaries):
-                                            v_node = cell.find('main:v', self.ns)
+                                            original_text = None
-                                            if v_node is not None and v_node.text:
+
-                                                idx = int(v_node.text)
+                                            if t_attr == 's':
                                                v_text = self._get_child_text(cell, "v")
                                                if v_text:
                                                    idx = int(v_text)
                                                    if 0 <= idx < len(shared_strings):
                                                        original_text = shared_strings[idx]
                                            elif t_attr == 'inlineStr':
                                                is_node = self._find_child(cell, "is")
                                                if is_node is not None:
                                                    original_text = self._get_child_text(is_node, "t")
-                                                    if original_text in translation_map:
+                                            if original_text and original_text in translation_map:
-                                                        translated_text = translation_map[original_text]
+                                                final_text = self._apply_insert_mode(original_text,
                                                                                     translation_map[original_text])
-                                                        final_text = translated_text
+                                                # 清空旧内容
-                                                        if self.insert_mode == "append":
+                                                for child in list(cell):
-                                                            final_text = original_text + self.separator + translated_text
+                                                    cell.remove(child)
                                                        elif self.insert_mode == "prepend":
                                                            final_text = translated_text + self.separator + original_text
-                                                        # 转换为 inlineStr
+                                                # 写入新内容 (这里必须使用带有命名空间的标签名，否则Excel不认)
                                                cell.set('t', 'inlineStr')
-                                                        cell.remove(v_node)
+                                                # 注意：写入时使用 self.NS_MAIN 是必须的，因为这是标准
-                                                        is_node = ET.Element(f"{{{self.ns['main']}}}is")
+                                                is_node = ET.Element(f"{{{self.NS_MAIN}}}is")
-                                                        t_node = ET.SubElement(is_node, f"{{{self.ns['main']}}}t")
+                                                t_node = ET.SubElement(is_node, f"{{{self.NS_MAIN}}}t")
                                                t_node.text = final_text
                                                cell.append(is_node)
                                                cells_modified = True
                                    except Exception:
                                        pass
@@ -298,11 +351,10 @@ class XlsxTranslator(AiTranslator):
                                zf_out.writestr(item, zf_in.read(item.filename))
                    else:
                        zf_out.writestr(item, zf_in.read(item.filename))
        return output_zip_io.getvalue()
    # =========================================================================
-    # 原有全文档逻辑 (针对全文档翻译保持极致速度)
+    # 全文档处理 (同样使用 Helper)
    # =========================================================================
    def _get_texts_xml_all(self, document: Document) -> List[str]:
@@ -311,20 +363,39 @@ class XlsxTranslator(AiTranslator):
            with zipfile.ZipFile(BytesIO(document.content), 'r') as zf:
                if "xl/sharedStrings.xml" in zf.namelist():
                    with zf.open("xl/sharedStrings.xml") as f:
-                        root = ET.fromstring(f.read())
+                        context = ET.iterparse(f, events=("end",))
-                        for node in root.findall('.//main:t', self.ns):
+                        for event, elem in context:
-                            if node.text and node.text.strip():
+                            if self._tag_is(elem, "t"):
-                                texts_to_translate.add(node.text)
+                                if elem.text and elem.text.strip():
                                    texts_to_translate.add(elem.text)
                                elem.clear()
                sheet_files = [n for n in zf.namelist() if n.startswith("xl/worksheets/sheet") and n.endswith(".xml")]
                for sheet_file in sheet_files:
                    with zf.open(sheet_file) as f:
                        context = ET.iterparse(f, events=("end",))
                        for event, elem in context:
                            if self._tag_is(elem, "c"):
                                if elem.get('t') == 'inlineStr':
                                    is_node = self._find_child(elem, "is")
                                    if is_node is not None:
                                        t_text = self._get_child_text(is_node, "t")
                                        if t_text and t_text.strip():
                                            texts_to_translate.add(t_text)
                                elem.clear()
                            elif self._tag_is(elem, "row"):
                                elem.clear()
                for item in zf.infolist():
                    if item.filename.startswith("xl/tables/table"):
                        with zf.open(item.filename) as f:
                            root = ET.fromstring(f.read())
-                            for col in root.findall('.//main:tableColumn', self.ns):
+                            for col in root.iter():
                                if self._tag_is(col, "tableColumn"):
                                    if col.get('name'):
                                        texts_to_translate.add(col.get('name'))
        except Exception as e:
-            self.logger.error(f"XML解析失败: {e}")
+            self.logger.error(f"XML解析失败: {e}", exc_info=True)
        return list(texts_to_translate)
    def _rebuild_xml_all(self, original_content_bytes: bytes, translation_map: dict) -> bytes:
@@ -337,41 +408,54 @@ class XlsxTranslator(AiTranslator):
                        if item.filename == "xl/sharedStrings.xml":
                            root = ET.fromstring(content)
-                            for node in root.findall('.//main:t', self.ns):
+                            modified = False
                            for node in root.iter():
                                if self._tag_is(node, "t"):
                                    if node.text in translation_map:
-                                    trans = translation_map[node.text]
+                                        node.text = self._apply_insert_mode(node.text, translation_map[node.text])
-                                    if self.insert_mode == "append":
+                                        modified = True
-                                        node.text = node.text + self.separator + trans
+                            if modified:
-                                    elif self.insert_mode == "prepend":
+                                zf_out.writestr(item, ET.tostring(root, encoding='utf-8', xml_declaration=True))
                                        node.text = trans + self.separator + node.text
                            else:
-                                        node.text = trans
+                                zf_out.writestr(item, content)
-                            content = ET.tostring(root, encoding='utf-8', xml_declaration=True)
+
                        elif item.filename.startswith("xl/worksheets/sheet") and item.filename.endswith(".xml"):
                            root = ET.fromstring(content)
                            modified = False
                            for cell in root.iter():
                                if self._tag_is(cell, "c") and cell.get('t') == 'inlineStr':
                                    is_node = self._find_child(cell, "is")
                                    if is_node is not None:
                                        t_node = self._find_child(is_node, "t")
                                        if t_node is not None and t_node.text in translation_map:
                                            t_node.text = self._apply_insert_mode(t_node.text,
                                                                                  translation_map[t_node.text])
                                            modified = True
                            if modified:
                                zf_out.writestr(item, ET.tostring(root, encoding='utf-8', xml_declaration=True))
                            else:
                                zf_out.writestr(item, content)
                        elif item.filename.startswith("xl/tables/table"):
                            root = ET.fromstring(content)
-                            for col in root.findall('.//main:tableColumn', self.ns):
+                            modified = False
                            for col in root.iter():
                                if self._tag_is(col, "tableColumn"):
                                    orig = col.get('name')
                                    if orig in translation_map:
-                                    trans = translation_map[orig]
+                                        col.set('name', self._apply_insert_mode(orig, translation_map[orig]))
-                                    if self.insert_mode == "append":
+                                        modified = True
-                                        col.set('name', orig + self.separator + trans)
+                            if modified:
-                                    elif self.insert_mode == "prepend":
+                                zf_out.writestr(item, ET.tostring(root, encoding='utf-8', xml_declaration=True))
-                                        col.set('name', trans + self.separator + orig)
+                            else:
                                zf_out.writestr(item, content)
                        else:
                                        col.set('name', trans)
                            content = ET.tostring(root, encoding='utf-8', xml_declaration=True)
                            zf_out.writestr(item, content)
            return output_zip_io.getvalue()
        except Exception as e:
-            self.logger.error(f"XML重构失败: {e}")
+            self.logger.error(f"XML重构失败: {e}", exc_info=True)
            return original_content_bytes
    # =========================================================================
    # 主入口
    # =========================================================================
    def translate(self, document: Document) -> Self:
        if self.translate_regions:
            original_texts = self._get_texts_xml_regions(document)