# ZLD_POC/backend/app/mineru_parser.py

"""Parse MinerU structured JSON (layout.json / middle.json) into field records."""
from __future__ import annotations

import logging
import re
from dataclasses import dataclass, field

from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)


def _extract_table_text(html: str) -> str:
    """将表格 HTML 转为可供文本匹配的多行字符串。

    每行格式：单元格1｜单元格2｜单元格3
    同一行内的单元格用 ｜ 连接，行与行之间用换行分隔。
    """
    try:
        soup = BeautifulSoup(html, "html.parser")
        rows = []
        for tr in soup.find_all("tr"):
            cells = [td.get_text(strip=True) for td in tr.find_all(["td", "th"])]
            if any(cells):
                rows.append("｜".join(cells))
        return "\n".join(rows)
    except Exception:
        # 解析失败时退回正则粗提取
        return re.sub(r"<[^>]+>", " ", html).strip()

# 1 pt = 0.352778 mm
PT_TO_MM = 0.352778

# LaTeX inline-equation → Unicode mapping (only the symbols commonly seen in label files)
_LATEX_TO_UNICODE: dict[str, str] = {
    r"\times":   "×",
    r"\div":     "÷",
    r"\pm":      "±",
    r"\mp":      "∓",
    r"\cdot":    "·",
    r"\leq":     "≤",
    r"\geq":     "≥",
    r"\neq":     "≠",
    r"\approx":  "≈",
    r"\infty":   "∞",
    r"\circ":    "°",
    r"\degree":  "°",
    r"\alpha":   "α",
    r"\beta":    "β",
    r"\gamma":   "γ",
    r"\delta":   "δ",
    r"\mu":      "μ",
    r"\%":       "%",
}

# MinerU sometimes emits ^{\circ} as ^{circ} (the backslash is missing);
# a single regex matches both spellings (case-insensitively).
_SUPERSCRIPT_DEGREE_RE = re.compile(r"\^\{\\?circ\}", re.IGNORECASE)


@dataclass
class MineruDocument:
    """Result of parsing MinerU JSON: page dimensions plus the extracted field dicts."""

    page_width: float       # points
    page_height: float      # points
    fields: list[dict]      # list of field dicts ready for the API response


def _page_size(page: dict) -> tuple[float, float]:
    """Return (width, height) in points for a MinerU page entry."""
    # MinerU stores page size as [width, height] in `page_size`
    size = page.get("page_size") or page.get("page_size_pt") or []
    if isinstance(size, (list, tuple)) and len(size) >= 2:
        return float(size[0]), float(size[1])
    # Fallback: inspect block bboxes
    return 595.0, 842.0  # A4 default


def _latex_to_text(expr: str) -> str:
    """Convert a simple LaTeX expression into readable text.

    Steps, applied in order to the stripped input:

    1. ``^{circ}`` / ``^{\\circ}`` superscripts become ``°``.
    2. Any other ``^{...}`` or ``_{...}`` wrapper is removed, keeping its content.
    3. Every command listed in ``_LATEX_TO_UNICODE`` is replaced by its symbol.
    4. Remaining ``\\name`` commands lose their backslash and degrade to letters.
    """
    text = _SUPERSCRIPT_DEGREE_RE.sub("°", expr.strip())
    # Unwrap remaining superscripts/subscripts, keeping only the braces' content.
    text = re.sub(r"[\^_]\{([^}]*)\}", r"\1", text)
    for command, symbol in _LATEX_TO_UNICODE.items():
        text = text.replace(command, symbol)
    # Unknown commands (e.g. \foo): drop the backslash, keep the letters.
    return re.sub(r"\\([A-Za-z]+)", r"\1", text)


def _span_content(span: dict) -> str:
    """从 span 中提取可供匹配的文本内容。

    - type == "table"：解析 html 字段，转为行列文本
    - type == "inline_equation"：LaTeX → Unicode 文本
    - 其他类型：取 content 字段，并修复常见 LaTeX 上标残留（如 ^{circ}）
    """
    span_type = span.get("type") or ""
    if span_type == "table":
        html = span.get("html") or ""
        return _extract_table_text(html) if html else ""
    if span_type == "inline_equation":
        return _latex_to_text((span.get("content") or "").strip())
    # 普通文本 span：MinerU 有时在 content 中直接嵌入 LaTeX 上标（如 ^{circ}）
    raw = (span.get("content") or "").strip()
    return _SUPERSCRIPT_DEGREE_RE.sub("°", raw)


def _iter_lines(block: dict):
    """Yield (line, block) tuples for all lines in a block.

    Handles two MinerU structures:
    - Flat:  block → lines → spans   (text/title/etc.)
    - Nested: block → blocks → lines → spans  (table blocks)
    """
    lines = block.get("lines")
    if lines:
        for line in lines:
            yield line, block
    else:
        # Table blocks (and some other types) have a nested `blocks` layer
        for inner in block.get("blocks", []):
            for line in inner.get("lines", []):
                yield line, block


def _iter_line_fields(page: dict):
    """Yield one record per non-empty *line* across the whole page.

    Each yielded tuple is ``(merged_text, line, first_text_span, block)`` where:
    - ``merged_text``      – all span contents concatenated (LaTeX already converted)
    - ``line``             – the MinerU line dict (carries the authoritative bbox)
    - ``first_text_span``  – first span that has font metadata, or ``None``
    - ``block``            – the containing block (carries ``type``)

    Merging at the line level correctly handles footer / title blocks where a
    single printed sentence is split across many spans (e.g. text + inline_equation
    + text …).  Table blocks still produce one record per table because they have
    exactly one span (type="table") per line.
    """
    def _process_block_set(blocks_iter):
        for block in blocks_iter:
            for line, src_block in _iter_lines(block):
                spans = line.get("spans", [])
                if not spans:
                    continue

                parts: list[str] = []
                first_text_span: dict | None = None
                table_html: str | None = None
                for span in spans:
                    content = _span_content(span)
                    if content:
                        parts.append(content)
                        if span.get("type") == "table":
                            # 保留原始 HTML，前端可用于渲染含 colspan/rowspan 的复杂表格
                            table_html = span.get("html") or None
                        elif first_text_span is None:
                            first_text_span = span

                merged = "".join(parts)
                if merged:
                    yield merged, line, first_text_span, src_block, table_html

    yield from _process_block_set(page.get("para_blocks", []))
    yield from _process_block_set(page.get("blocks", []))


def _bbox(obj: dict) -> tuple[float, float, float, float]:
    """Return (x0, y0, x1, y1) from an object's bbox field."""
    bbox = obj.get("bbox") or [0, 0, 0, 0]
    if isinstance(bbox, (list, tuple)) and len(bbox) >= 4:
        return float(bbox[0]), float(bbox[1]), float(bbox[2]), float(bbox[3])
    return 0.0, 0.0, 0.0, 0.0


def parse_mineru_fields(data: dict) -> MineruDocument:
    """Convert raw MinerU structured JSON into a :class:`MineruDocument`.

    Parameters
    ----------
    data:
        The parsed JSON dict returned by :class:`~backend.app.mineru_client.MineruClient`.
        Must contain a ``pdf_info`` list with one entry per page.

    Returns
    -------
    MineruDocument
        Holds the first page's dimensions (in points) and a flat list of
        field dicts, one per non-empty line on any page.  When ``pdf_info``
        is missing or empty, an A4-sized document with no fields is returned.

    Notes
    -----
    The 1-based ``page`` of each field comes from the page's ``page_idx``.
    If a page entry has no ``page_idx``, its position in ``pdf_info`` is used
    instead, so that pages are not all reported as page 1.
    """
    pdf_info: list[dict] = data.get("pdf_info", [])
    if not pdf_info:
        logger.warning("MinerU JSON contains empty pdf_info")
        return MineruDocument(page_width=595.0, page_height=842.0, fields=[])

    # Use the first page's dimensions for the preview
    page_width, page_height = _page_size(pdf_info[0])

    fields: list[dict] = []
    for position, page in enumerate(pdf_info):
        raw_idx = page.get("page_idx")
        # Fall back to the list position when the page carries no page_idx.
        page_num = (position if raw_idx is None else int(raw_idx)) + 1

        for content, line, font_span, block, table_html in _iter_line_fields(page):
            # bbox comes from the line (covers all spans in one visual row)
            x0, y0, x1, y1 = _bbox(line)

            font_size_pt: float | None = None
            font_name: str | None = None
            if font_span is not None:
                raw_size = font_span.get("size") or font_span.get("font_size")
                if raw_size is not None:
                    try:
                        font_size_pt = float(raw_size)
                    except (TypeError, ValueError):
                        # Unparseable size: leave the font size unknown.
                        pass
                font_name = font_span.get("font") or font_span.get("font_name") or None

            # A zero/unknown size is reported as None rather than 0.
            font_height_mm: float | None = (
                round(font_size_pt * PT_TO_MM, 2) if font_size_pt else None
            )

            block_type = (block.get("type") or "text").strip() or "text"

            fields.append(
                {
                    "page": page_num,
                    "block_type": block_type,
                    "text": content,
                    "table_html": table_html,
                    "font_name": font_name,
                    "font_size_pt": round(font_size_pt, 2) if font_size_pt else None,
                    "font_height_mm": font_height_mm,
                    "x0_pt": round(x0, 2),
                    "top_pt": round(y0, 2),
                    "x1_pt": round(x1, 2),
                    "bottom_pt": round(y1, 2),
                }
            )

    logger.info(
        "MinerU parser extracted %d fields across %d page(s)",
        len(fields),
        len(pdf_info),
    )
    return MineruDocument(
        page_width=page_width,
        page_height=page_height,
        fields=fields,
    )


def _find_image_path(block: dict) -> str | None:
    """Return the image path stored on *block* or on a span nested inside it."""
    direct = block.get("img_path") or block.get("image_path") or block.get("path")
    if direct:
        return direct
    # MinerU's middle.json usually keeps the path on the image span nested
    # under a sub-block (block -> blocks -> lines -> spans), so search the
    # block's own lines and one level of nested blocks as well.
    line_groups = [block.get("lines") or []]
    line_groups.extend(inner.get("lines") or [] for inner in block.get("blocks") or [])
    for lines in line_groups:
        for line in lines:
            for span in line.get("spans") or []:
                nested = span.get("image_path") or span.get("img_path")
                if nested:
                    return nested
    return None


def parse_mineru_image_blocks(data: dict) -> list[dict]:
    """Extract every ``image``-typed block from MinerU structured JSON.

    Blocks are read from both ``para_blocks`` and ``blocks`` of every page in
    ``data["pdf_info"]``; a missing or null list is treated as empty.

    Returns
    -------
    list of dict, each containing:
        - page        : 1-based page number (``page_idx + 1``; the page's
                        position in ``pdf_info`` is used when ``page_idx``
                        is absent)
        - block_type  : always ``"image"``
        - img_path    : relative path recorded by MinerU, taken from the
                        block itself or from a span nested inside it
                        (``None`` when no path is found)
        - x0_pt, top_pt, x1_pt, bottom_pt : bounding box of the block, read
                        from its ``bbox`` field and rounded to 2 decimals
    """
    pdf_info: list[dict] = data.get("pdf_info") or []
    images: list[dict] = []

    for position, page in enumerate(pdf_info):
        raw_idx = page.get("page_idx")
        # Fall back to the list position when the page carries no page_idx.
        page_num = (position if raw_idx is None else int(raw_idx)) + 1

        for blocks_key in ("para_blocks", "blocks"):
            for block in page.get(blocks_key) or []:
                if (block.get("type") or "").strip().lower() != "image":
                    continue
                x0, y0, x1, y1 = _bbox(block)
                images.append(
                    {
                        "page": page_num,
                        "block_type": "image",
                        "img_path": _find_image_path(block),
                        "x0_pt": round(x0, 2),
                        "top_pt": round(y0, 2),
                        "x1_pt": round(x1, 2),
                        "bottom_pt": round(y1, 2),
                    }
                )

    logger.info("MinerU parser found %d image block(s)", len(images))
    return images