修复部分文本未翻译问题

2025-12-07 21:12:48 +08:00
parent 99c5d2c7c7
commit e9de5c9ab9
1 changed files with 220 additions and 136 deletions
--- a/docutranslate/translator/ai_translator/xlsx_translator.py
+++ b/docutranslate/translator/ai_translator/xlsx_translator.py
@@ -5,10 +5,9 @@ from dataclasses import dataclass
 from io import BytesIO
 from typing import Self, Literal, List, Optional, Dict, Tuple, Set
 import zipfile
-import re  # 引入正则用于解析简写
+import re
 import xml.etree.ElementTree as ET

-# 仅导入 openpyxl 的工具函数用于坐标计算，不加载 workbook 对象
 from openpyxl.utils.cell import coordinate_to_tuple, range_boundaries

 from docutranslate.agents.segments_agent import SegmentsTranslateAgentConfig, SegmentsTranslateAgent
@@ -21,9 +20,6 @@ class XlsxTranslatorConfig(AiTranslatorConfig):
    insert_mode: Literal["replace", "append", "prepend"] = "replace"
    separator: str = "\n"
    # 指定翻译区域列表。
-    # 示例: ["Sheet1!A1:B10", "C", "3"] (支持简写: C代表C列, 3代表第3行)
-    # 如果不指定表名 (如 "C")，则应用于所有表。
-    # 如果为 None 或空列表，则翻译整个文件中的所有文本。
    translate_regions: Optional[List[str]] = None


@@ -54,59 +50,99 @@ class XlsxTranslator(AiTranslator):
        self.separator = config.separator
        self.translate_regions = config.translate_regions

-        # 命名空间定义
-        self.ns = {
-            'main': 'http://schemas.openxmlformats.org/spreadsheetml/2006/main',
-            'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'
-        }
-        ET.register_namespace('', self.ns['main'])
+        # 我们虽然不依赖它查找，但写入新节点时最好还是带上标准命名空间
+        self.NS_MAIN = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'
+        ET.register_namespace('', self.NS_MAIN)

    # =========================================================================
-    # 辅助方法：无需加载 Workbook 即可解析结构
+    # 核心辅助方法：忽略命名空间查找
+    # =========================================================================
+
+    def _tag_is(self, elem: ET.Element, tag_name: str) -> bool:
+        """判断元素的标签名是否匹配（忽略命名空间）。"""
+        return elem.tag.endswith(f"}}{tag_name}") or elem.tag == tag_name
+
+    def _find_child(self, parent: ET.Element, tag_name: str) -> Optional[ET.Element]:
+        """在直接子节点中查找指定标签（忽略命名空间）。"""
+        for child in parent:
+            if self._tag_is(child, tag_name):
+                return child
+        return None
+
+    def _find_all_children(self, parent: ET.Element, tag_name: str) -> List[ET.Element]:
+        """查找所有匹配的直接子节点（忽略命名空间）。"""
+        return [child for child in parent if self._tag_is(child, tag_name)]
+
+    def _get_child_text(self, parent: ET.Element, tag_name: str) -> Optional[str]:
+        """获取子节点的文本内容（忽略命名空间）。"""
+        child = self._find_child(parent, tag_name)
+        return child.text if child is not None else None
+
+    # =========================================================================
+    # 辅助逻辑
    # =========================================================================

    def _get_shared_strings(self, zf: zipfile.ZipFile) -> List[str]:
-        """解析共享字符串表，返回字符串列表。"""
        if "xl/sharedStrings.xml" not in zf.namelist():
            return []
-
        shared_strings = []
        with zf.open("xl/sharedStrings.xml") as f:
            context = ET.iterparse(f, events=("end",))
            for event, elem in context:
-                if elem.tag.endswith('}si'):  # shared item
-                    texts = [t.text for t in elem.findall('.//main:t', self.ns) if t.text]
+                # 匹配 <si>
+                if self._tag_is(elem, "si"):
+                    # 查找所有 <t>
+                    # 注意：xml.etree 的 findall 只能简单的路径查找，
+                    # 既然我们要忽略命名空间，最好手动遍历子树，但 si 结构简单，
+                    # 这里简化处理：直接遍历 iter 出来的 t 元素（如果在 si 内部）
+                    pass
+                    # 由于 iterparse 是扁平流，很难直接关联 si 和 t。
+                    # 更稳妥的方式是：当 elem 是 si 时，遍历 elem 的 children
+                    texts = []
+                    for child in elem.iter():
+                        if self._tag_is(child, "t") and child.text:
+                            texts.append(child.text)
                    shared_strings.append("".join(texts))
                    elem.clear()
        return shared_strings

    def _get_sheet_mapping(self, zf: zipfile.ZipFile) -> Dict[str, str]:
-        """
-        获取 Sheet 名称到文件路径的映射。
-        例如: {'Sheet1': 'xl/worksheets/sheet1.xml'}
-        """
+        """建立 SheetName -> ZipFilename 的映射（稳健版）"""
        sheet_name_to_rid = {}
        try:
            with zf.open("xl/workbook.xml") as f:
                root = ET.fromstring(f.read())
-                for sheet in root.findall(".//main:sheet", self.ns):
-                    name = sheet.get("name")
-                    rid = sheet.get(f"{{{self.ns['r']}}}id")
-                    if name and rid:
-                        sheet_name_to_rid[name] = rid
+                # 查找所有 sheet 标签
+                # 需递归查找，因为 sheet 通常在 sheets 节点下
+                for sheet in root.iter():
+                    if self._tag_is(sheet, "sheet"):
+                        name = sheet.get("name")
+                        # id 的属性名通常带有 r: 前缀，这很难忽略命名空间，
+                        # 但 openpyxl 规范里 id 属性几乎总是依赖 relationships 命名空间
+                        # 这里我们尝试遍历属性找到 key 包含 id 的
+                        rid = None
+                        for k, v in sheet.attrib.items():
+                            if k.endswith("id"):  # 匹配 r:id
+                                rid = v
+                                break
+                        if name and rid:
+                            sheet_name_to_rid[name] = rid
        except Exception:
            return {}

        rid_to_target = {}
        try:
            with zf.open("xl/_rels/workbook.xml.rels") as f:
-                tree = ET.parse(f)
-                root = tree.getroot()
+                root = ET.fromstring(f.read())
                for child in root:
+                    # Relationships 里的 tag 也是 Relationship
                    rid = child.get("Id")
                    target = child.get("Target")
                    if rid and target:
-                        if not target.startswith("/"):
+                        target = target.replace("\\", "/")
+                        if target.startswith("/"):
+                            target = target.lstrip("/")
+                        else:
                            target = "xl/" + target
                        rid_to_target[rid] = target
        except Exception:
@@ -116,50 +152,32 @@ class XlsxTranslator(AiTranslator):
        for name, rid in sheet_name_to_rid.items():
            if rid in rid_to_target:
                mapping[name] = rid_to_target[rid]
-
        return mapping

    def _normalize_range(self, range_str: str) -> str:
-        """
-        将用户输入的简写转换为标准范围格式。
-        "C" -> "C:C"
-        "3" -> "3:3"
-        "A1" -> "A1:A1" (openpyxl range_boundaries 实际上支持 A1，但这里统一处理更安全)
-        """
-        # 纯字母 (例如 "C", "AA") -> 整列
        if re.match(r"^[A-Za-z]+$", range_str):
            return f"{range_str}:{range_str}"
-        # 纯数字 (例如 "3", "10") -> 整行
        if re.match(r"^\d+$", range_str):
            return f"{range_str}:{range_str}"
        return range_str

    def _parse_region_boundaries(self, sheet_mapping: Dict[str, str]) -> Dict[str, List[Tuple]]:
-        """
-        解析配置的 translate_regions。
-        返回: { 'xl/worksheets/sheet1.xml': [(min_col, min_row, max_col, max_row), ...], ... }
-        """
        if not self.translate_regions:
            return {}
-
-        region_map = {}  # filename -> list of boundaries
-        global_regions = []  # list of boundaries for all sheets
+        region_map = {}
+        global_regions = []

        for region in self.translate_regions:
            sheet_name = None
            raw_range = region.strip()
-
            if "!" in raw_range:
                parts = raw_range.split("!", 1)
                sheet_name = parts[0].strip("'")
                range_part = self._normalize_range(parts[1])
            else:
                range_part = self._normalize_range(raw_range)
-
            try:
-                # boundaries: (min_col, min_row, max_col, max_row)
                boundaries = range_boundaries(range_part)
-
                if sheet_name:
                    filename = sheet_mapping.get(sheet_name)
                    if filename:
@@ -171,18 +189,15 @@ class XlsxTranslator(AiTranslator):
            except Exception as e:
                self.logger.warning(f"无法解析区域 '{region}': {e}")

-        # 将全局区域添加到所有已知 Sheet
        if global_regions:
-            all_files = set(sheet_mapping.values())
-            for f in all_files:
+            target_files = set(sheet_mapping.values())
+            for f in target_files:
                if f not in region_map:
                    region_map[f] = []
                region_map[f].extend(global_regions)
-
        return region_map

    def _is_in_boundaries(self, col: int, row: int, boundaries_list: List[Tuple]) -> bool:
-        """检查坐标 (col, row) 是否在给定的边界列表中。"""
        for (min_col, min_row, max_col, max_row) in boundaries_list:
            if min_col is not None and col < min_col: continue
            if min_row is not None and row < min_row: continue
@@ -191,20 +206,30 @@ class XlsxTranslator(AiTranslator):
            return True
        return False

+    def _apply_insert_mode(self, original: str, translated: str) -> str:
+        if self.insert_mode == "append":
+            return f"{original}{self.separator}{translated}"
+        elif self.insert_mode == "prepend":
+            return f"{translated}{self.separator}{original}"
+        else:
+            return translated
+
    # =========================================================================
-    # 高效 XML 区域提取与重构
+    # 区域处理 (使用 Helper 方法，完全解耦命名空间)
    # =========================================================================

    def _get_texts_xml_regions(self, document: Document) -> List[str]:
-        """使用纯 XML 解析（结合 SharedStrings）提取指定区域文本。"""
        texts_to_translate = set()

        with zipfile.ZipFile(BytesIO(document.content), 'r') as zf:
            shared_strings = self._get_shared_strings(zf)
-            if not shared_strings:
-                return []
-
            sheet_mapping = self._get_sheet_mapping(zf)
+
+            if not sheet_mapping:
+                all_sheets = [n for n in zf.namelist() if n.startswith("xl/worksheets/sheet") and n.endswith(".xml")]
+                for s in all_sheets:
+                    sheet_mapping[f"Unknown_{s}"] = s
+
            boundaries_map = self._parse_region_boundaries(sheet_mapping)

            for filename, boundaries in boundaries_map.items():
@@ -214,80 +239,108 @@ class XlsxTranslator(AiTranslator):
                with zf.open(filename) as f:
                    context = ET.iterparse(f, events=("end",))
                    for event, elem in context:
-                        if elem.tag.endswith('}c'):  # Cell
-                            r_attr = elem.get('r')  # e.g. "C5"
-                            t_attr = elem.get('t')  # e.g. "s"
+                        # 匹配 <c>
+                        if self._tag_is(elem, "c"):
+                            r_attr = elem.get('r')
+                            t_attr = elem.get('t')

-                            if r_attr and t_attr == 's':
+                            if r_attr:
                                try:
-                                    # 【修正】coordinate_to_tuple 返回 (row, col)
                                    row, col = coordinate_to_tuple(r_attr)
                                    if self._is_in_boundaries(col, row, boundaries):
-                                        v_node = elem.find('main:v', self.ns)
-                                        if v_node is not None and v_node.text:
-                                            idx = int(v_node.text)
-                                            if 0 <= idx < len(shared_strings):
-                                                texts_to_translate.add(shared_strings[idx])
+                                        text_found = None
+
+                                        # Shared String
+                                        if t_attr == 's':
+                                            v_text = self._get_child_text(elem, "v")
+                                            if v_text:
+                                                idx = int(v_text)
+                                                if 0 <= idx < len(shared_strings):
+                                                    text_found = shared_strings[idx]
+
+                                        # Inline String
+                                        elif t_attr == 'inlineStr':
+                                            is_node = self._find_child(elem, "is")
+                                            if is_node is not None:
+                                                t_text = self._get_child_text(is_node, "t")
+                                                if t_text:
+                                                    text_found = t_text
+
+                                        if text_found:
+                                            texts_to_translate.add(text_found)
                                except Exception:
                                    pass
                            elem.clear()
+                        elif self._tag_is(elem, "row"):
+                            elem.clear()

        return list(texts_to_translate)

    def _rebuild_xml_regions(self, original_content_bytes: bytes, translation_map: dict) -> bytes:
-        """使用纯 XML 重构，修正了坐标解包顺序。"""
        output_zip_io = BytesIO()
-
        with zipfile.ZipFile(BytesIO(original_content_bytes), 'r') as zf_in:
            with zipfile.ZipFile(output_zip_io, 'w', zipfile.ZIP_DEFLATED) as zf_out:
-
                shared_strings = self._get_shared_strings(zf_in)
                sheet_mapping = self._get_sheet_mapping(zf_in)
+
+                if not sheet_mapping:
+                    all_sheets = [n for n in zf_in.namelist() if
+                                  n.startswith("xl/worksheets/sheet") and n.endswith(".xml")]
+                    for s in all_sheets:
+                        sheet_mapping[f"Unknown_{s}"] = s
+
                boundaries_map = self._parse_region_boundaries(sheet_mapping)

                for item in zf_in.infolist():
                    if item.filename in boundaries_map:
                        boundaries = boundaries_map[item.filename]
-
                        with zf_in.open(item.filename) as f:
                            tree = ET.parse(f)
                            root = tree.getroot()
-
                            cells_modified = False
-                            for cell in root.findall(".//main:c", self.ns):
+
+                            # 查找所有 <c>
+                            # 因为 findall 不支持复杂的 tag.endswith，这里我们遍历所有节点
+                            # 对于 parse 加载的树，iter() 是高效的
+                            for cell in root.iter():
+                                if not self._tag_is(cell, "c"):
+                                    continue
+
                                r_attr = cell.get('r')
                                t_attr = cell.get('t')
-
-                                if r_attr and t_attr == 's':
+                                if r_attr:
                                    try:
-                                        # 【修正】coordinate_to_tuple 返回 (row, col)
                                        row, col = coordinate_to_tuple(r_attr)
-
                                        if self._is_in_boundaries(col, row, boundaries):
-                                            v_node = cell.find('main:v', self.ns)
-                                            if v_node is not None and v_node.text:
-                                                idx = int(v_node.text)
-                                                if 0 <= idx < len(shared_strings):
-                                                    original_text = shared_strings[idx]
+                                            original_text = None

-                                                    if original_text in translation_map:
-                                                        translated_text = translation_map[original_text]
+                                            if t_attr == 's':
+                                                v_text = self._get_child_text(cell, "v")
+                                                if v_text:
+                                                    idx = int(v_text)
+                                                    if 0 <= idx < len(shared_strings):
+                                                        original_text = shared_strings[idx]
+                                            elif t_attr == 'inlineStr':
+                                                is_node = self._find_child(cell, "is")
+                                                if is_node is not None:
+                                                    original_text = self._get_child_text(is_node, "t")

-                                                        final_text = translated_text
-                                                        if self.insert_mode == "append":
-                                                            final_text = original_text + self.separator + translated_text
-                                                        elif self.insert_mode == "prepend":
-                                                            final_text = translated_text + self.separator + original_text
+                                            if original_text and original_text in translation_map:
+                                                final_text = self._apply_insert_mode(original_text,
+                                                                                     translation_map[original_text])

-                                                        # 转换为 inlineStr
-                                                        cell.set('t', 'inlineStr')
-                                                        cell.remove(v_node)
-                                                        is_node = ET.Element(f"{{{self.ns['main']}}}is")
-                                                        t_node = ET.SubElement(is_node, f"{{{self.ns['main']}}}t")
-                                                        t_node.text = final_text
-                                                        cell.append(is_node)
+                                                # 清空旧内容
+                                                for child in list(cell):
+                                                    cell.remove(child)

-                                                        cells_modified = True
+                                                # 写入新内容 (这里必须使用带有命名空间的标签名，否则Excel不认)
+                                                cell.set('t', 'inlineStr')
+                                                # 注意：写入时使用 self.NS_MAIN 是必须的，因为这是标准
+                                                is_node = ET.Element(f"{{{self.NS_MAIN}}}is")
+                                                t_node = ET.SubElement(is_node, f"{{{self.NS_MAIN}}}t")
+                                                t_node.text = final_text
+                                                cell.append(is_node)
+                                                cells_modified = True
                                    except Exception:
                                        pass

@@ -298,11 +351,10 @@ class XlsxTranslator(AiTranslator):
                                zf_out.writestr(item, zf_in.read(item.filename))
                    else:
                        zf_out.writestr(item, zf_in.read(item.filename))
-
        return output_zip_io.getvalue()

    # =========================================================================
-    # 原有全文档逻辑 (针对全文档翻译保持极致速度)
+    # 全文档处理 (同样使用 Helper)
    # =========================================================================

    def _get_texts_xml_all(self, document: Document) -> List[str]:
@@ -311,20 +363,39 @@ class XlsxTranslator(AiTranslator):
            with zipfile.ZipFile(BytesIO(document.content), 'r') as zf:
                if "xl/sharedStrings.xml" in zf.namelist():
                    with zf.open("xl/sharedStrings.xml") as f:
-                        root = ET.fromstring(f.read())
-                        for node in root.findall('.//main:t', self.ns):
-                            if node.text and node.text.strip():
-                                texts_to_translate.add(node.text)
+                        context = ET.iterparse(f, events=("end",))
+                        for event, elem in context:
+                            if self._tag_is(elem, "t"):
+                                if elem.text and elem.text.strip():
+                                    texts_to_translate.add(elem.text)
+                                elem.clear()
+
+                sheet_files = [n for n in zf.namelist() if n.startswith("xl/worksheets/sheet") and n.endswith(".xml")]
+                for sheet_file in sheet_files:
+                    with zf.open(sheet_file) as f:
+                        context = ET.iterparse(f, events=("end",))
+                        for event, elem in context:
+                            if self._tag_is(elem, "c"):
+                                if elem.get('t') == 'inlineStr':
+                                    is_node = self._find_child(elem, "is")
+                                    if is_node is not None:
+                                        t_text = self._get_child_text(is_node, "t")
+                                        if t_text and t_text.strip():
+                                            texts_to_translate.add(t_text)
+                                elem.clear()
+                            elif self._tag_is(elem, "row"):
+                                elem.clear()

                for item in zf.infolist():
                    if item.filename.startswith("xl/tables/table"):
                        with zf.open(item.filename) as f:
                            root = ET.fromstring(f.read())
-                            for col in root.findall('.//main:tableColumn', self.ns):
-                                if col.get('name'):
-                                    texts_to_translate.add(col.get('name'))
+                            for col in root.iter():
+                                if self._tag_is(col, "tableColumn"):
+                                    if col.get('name'):
+                                        texts_to_translate.add(col.get('name'))
        except Exception as e:
-            self.logger.error(f"XML解析失败: {e}")
+            self.logger.error(f"XML解析失败: {e}", exc_info=True)
        return list(texts_to_translate)

    def _rebuild_xml_all(self, original_content_bytes: bytes, translation_map: dict) -> bytes:
@@ -337,41 +408,54 @@ class XlsxTranslator(AiTranslator):

                        if item.filename == "xl/sharedStrings.xml":
                            root = ET.fromstring(content)
-                            for node in root.findall('.//main:t', self.ns):
-                                if node.text in translation_map:
-                                    trans = translation_map[node.text]
-                                    if self.insert_mode == "append":
-                                        node.text = node.text + self.separator + trans
-                                    elif self.insert_mode == "prepend":
-                                        node.text = trans + self.separator + node.text
-                                    else:
-                                        node.text = trans
-                            content = ET.tostring(root, encoding='utf-8', xml_declaration=True)
+                            modified = False
+                            for node in root.iter():
+                                if self._tag_is(node, "t"):
+                                    if node.text in translation_map:
+                                        node.text = self._apply_insert_mode(node.text, translation_map[node.text])
+                                        modified = True
+                            if modified:
+                                zf_out.writestr(item, ET.tostring(root, encoding='utf-8', xml_declaration=True))
+                            else:
+                                zf_out.writestr(item, content)
+
+                        elif item.filename.startswith("xl/worksheets/sheet") and item.filename.endswith(".xml"):
+                            root = ET.fromstring(content)
+                            modified = False
+                            for cell in root.iter():
+                                if self._tag_is(cell, "c") and cell.get('t') == 'inlineStr':
+                                    is_node = self._find_child(cell, "is")
+                                    if is_node is not None:
+                                        t_node = self._find_child(is_node, "t")
+                                        if t_node is not None and t_node.text in translation_map:
+                                            t_node.text = self._apply_insert_mode(t_node.text,
+                                                                                  translation_map[t_node.text])
+                                            modified = True
+                            if modified:
+                                zf_out.writestr(item, ET.tostring(root, encoding='utf-8', xml_declaration=True))
+                            else:
+                                zf_out.writestr(item, content)

                        elif item.filename.startswith("xl/tables/table"):
                            root = ET.fromstring(content)
-                            for col in root.findall('.//main:tableColumn', self.ns):
-                                orig = col.get('name')
-                                if orig in translation_map:
-                                    trans = translation_map[orig]
-                                    if self.insert_mode == "append":
-                                        col.set('name', orig + self.separator + trans)
-                                    elif self.insert_mode == "prepend":
-                                        col.set('name', trans + self.separator + orig)
-                                    else:
-                                        col.set('name', trans)
-                            content = ET.tostring(root, encoding='utf-8', xml_declaration=True)
-
-                        zf_out.writestr(item, content)
+                            modified = False
+                            for col in root.iter():
+                                if self._tag_is(col, "tableColumn"):
+                                    orig = col.get('name')
+                                    if orig in translation_map:
+                                        col.set('name', self._apply_insert_mode(orig, translation_map[orig]))
+                                        modified = True
+                            if modified:
+                                zf_out.writestr(item, ET.tostring(root, encoding='utf-8', xml_declaration=True))
+                            else:
+                                zf_out.writestr(item, content)
+                        else:
+                            zf_out.writestr(item, content)
            return output_zip_io.getvalue()
        except Exception as e:
-            self.logger.error(f"XML重构失败: {e}")
+            self.logger.error(f"XML重构失败: {e}", exc_info=True)
            return original_content_bytes

-    # =========================================================================
-    # 主入口
-    # =========================================================================
-
    def translate(self, document: Document) -> Self:
        if self.translate_regions:
            original_texts = self._get_texts_xml_regions(document)