Initial commit: 包装审核 POC、Docker 与前后端

Made-with: Cursor
2026-04-15 17:18:49 +08:00
commit bbb4dd43b3
74 changed files with 297415 additions and 0 deletions
--- a/backend/app/mineru_parser.py
+++ b/backend/app/mineru_parser.py
@@ -0,0 +1,299 @@
+"""Parse MinerU structured JSON (layout.json / middle.json) into field records."""
+from __future__ import annotations
+
+import logging
+import re
+from dataclasses import dataclass, field
+
+from bs4 import BeautifulSoup
+
+logger = logging.getLogger(__name__)
+
+
+def _extract_table_text(html: str) -> str:
+    """Flatten table HTML into multi-line text suitable for text matching.
+
+    Each ``<tr>`` becomes one output line whose ``<td>``/``<th>`` texts are
+    joined with the full-width bar ``｜``; lines are separated by newlines.
+    Rows in which every cell is empty are left out.
+
+    If anything goes wrong while parsing, the function falls back to a crude
+    extraction that replaces every tag with a space and trims the result.
+    """
+    try:
+        document = BeautifulSoup(html, "html.parser")
+        rows_of_cells = (
+            [cell.get_text(strip=True) for cell in row.find_all(["td", "th"])]
+            for row in document.find_all("tr")
+        )
+        return "\n".join(
+            "｜".join(cells) for cells in rows_of_cells if any(cells)
+        )
+    except Exception:
+        # Best-effort fallback: strip the markup with a regular expression.
+        without_tags = re.sub(r"<[^>]+>", " ", html)
+        return without_tags.strip()
+
+# Conversion factor from typographic points to millimetres
+# (1 pt = 0.352778 mm).
+PT_TO_MM = 0.352778
+
+# LaTeX inline-equation command -> Unicode character.  Only the symbols that
+# commonly appear in label files are covered.
+_LATEX_TO_UNICODE: dict[str, str] = {
+    # Arithmetic operators
+    r"\times": "×",
+    r"\div": "÷",
+    r"\pm": "±",
+    r"\mp": "∓",
+    r"\cdot": "·",
+    # Relations
+    r"\leq": "≤",
+    r"\geq": "≥",
+    r"\neq": "≠",
+    r"\approx": "≈",
+    # Miscellaneous symbols
+    r"\infty": "∞",
+    r"\circ": "°",
+    r"\degree": "°",
+    # Greek letters
+    r"\alpha": "α",
+    r"\beta": "β",
+    r"\gamma": "γ",
+    r"\delta": "δ",
+    r"\mu": "μ",
+    # Escaped percent sign
+    r"\%": "%",
+}
+
+# MinerU sometimes emits ^{\circ} as ^{circ} (the backslash is missing).
+# A single case-insensitive pattern matches both spellings.
+_SUPERSCRIPT_DEGREE_RE = re.compile(r"\^\{\\?circ\}", flags=re.IGNORECASE)
+
+
+@dataclass
+class MineruDocument:
+    """Result of parsing one MinerU structured-JSON document.
+
+    As populated by ``parse_mineru_fields``: the page dimensions are those of
+    the first page, while ``fields`` collects records from every page.
+    """
+
+    page_width: float       # page width, in points
+    page_height: float      # page height, in points
+    fields: list[dict]      # flat list of field dicts ready for the API response
+
+
+def _page_size(page: dict) -> tuple[float, float]:
+    """Return ``(width, height)`` in points for a MinerU page entry.
+
+    The size is read from ``page_size`` (or ``page_size_pt`` when the former
+    is missing or empty) and is expected to be a ``[width, height]`` sequence.
+
+    The A4 portrait size (595 x 842 pt) is returned when no usable size is
+    present: the key is absent, the value is not a list/tuple, it has fewer
+    than two entries, or the entries cannot be converted to ``float``.
+    """
+    size = page.get("page_size") or page.get("page_size_pt") or []
+    if isinstance(size, (list, tuple)) and len(size) >= 2:
+        try:
+            return float(size[0]), float(size[1])
+        except (TypeError, ValueError):
+            # Malformed entries (e.g. null or text): use the default below
+            # instead of aborting the whole parse.
+            pass
+    return 595.0, 842.0  # A4 default
+
+
+# Matches one LaTeX control sequence: a backslash followed by a complete run
+# of letters, or the escaped percent sign.
+_LATEX_COMMAND_RE = re.compile(r"\\([A-Za-z]+|%)")
+
+# Alternative spellings of symbols already present in _LATEX_TO_UNICODE.
+# \le, \ge and \ne are the short TeX names of \leq, \geq and \neq; the
+# "slant" forms are typographic variants that are folded onto the plain sign.
+_LATEX_SYNONYMS: dict[str, str] = {
+    r"\le": "≤",
+    r"\ge": "≥",
+    r"\ne": "≠",
+    r"\leqslant": "≤",
+    r"\geqslant": "≥",
+}
+
+
+def _latex_to_text(expr: str) -> str:
+    """Convert a simple LaTeX expression into readable plain text.
+
+    Steps, in order:
+
+    1. ``^{circ}`` / ``^{\\circ}`` superscripts become ``°``.
+    2. Every control sequence is looked up *as a whole command name* in
+       ``_LATEX_TO_UNICODE`` (then in ``_LATEX_SYNONYMS``).  Known commands
+       are replaced by their Unicode symbol; unknown ones (e.g. ``\\foo``)
+       lose only the backslash and degrade to their letters.
+    3. Remaining ``^{...}`` / ``_{...}`` wrappers are removed, keeping the
+       content.
+
+    Matching whole command names matters: a plain substring replacement
+    would turn ``\\leqslant`` into ``≤slant`` or ``\\pmod`` into ``±od``.
+    Commands are substituted before the wrappers are removed so that a
+    command ending at a closing brace (``x_{\\alpha}y``) is not glued to the
+    text that follows it.
+    """
+
+    def _substitute(match):
+        command = match.group(0)
+        symbol = _LATEX_TO_UNICODE.get(command)
+        if symbol is None:
+            symbol = _LATEX_SYNONYMS.get(command)
+        # Unknown command: keep just the name, without the backslash.
+        return symbol if symbol is not None else match.group(1)
+
+    result = _SUPERSCRIPT_DEGREE_RE.sub("°", expr.strip())
+    result = _LATEX_COMMAND_RE.sub(_substitute, result)
+    # Other superscripts ^{...} / subscripts _{...}: drop the wrapper only.
+    result = re.sub(r"[\^_]\{([^}]*)\}", r"\1", result)
+    # Removing a wrapper can expose a new backslash+letters run (e.g. from
+    # ``\_{x}``); strip that backslash as well.
+    return re.sub(r"\\([A-Za-z]+)", r"\1", result)
+
+
+def _span_content(span: dict) -> str:
+    """Extract the text of a span in a form suitable for matching.
+
+    - ``type == "table"``: the ``html`` field is converted to row/column
+      text; an empty or missing ``html`` gives ``""``.
+    - ``type == "inline_equation"``: the stripped ``content`` is converted
+      from LaTeX to Unicode text.
+    - anything else: the stripped ``content`` is returned after repairing
+      LaTeX degree superscripts such as ``^{circ}``, which MinerU sometimes
+      embeds directly in ordinary text.
+    """
+    kind = span.get("type") or ""
+
+    if kind == "table":
+        markup = span.get("html") or ""
+        if not markup:
+            return ""
+        return _extract_table_text(markup)
+
+    text = (span.get("content") or "").strip()
+    if kind == "inline_equation":
+        return _latex_to_text(text)
+    return _SUPERSCRIPT_DEGREE_RE.sub("°", text)
+
+
+def _iter_lines(block: dict):
+    """Yield ``(line, block)`` tuples for all lines in a block.
+
+    Handles two MinerU structures:
+
+    - Flat:   block -> lines -> spans            (text/title/etc.)
+    - Nested: block -> blocks -> lines -> spans  (table blocks)
+
+    The second tuple element is always the *outer* block passed in, even for
+    lines found in a nested block.  The nested layer is only consulted when
+    the block has no (or empty) ``lines`` of its own.  Keys that are missing
+    or explicitly ``null`` are treated as empty.
+    """
+    lines = block.get("lines")
+    if lines:
+        for line in lines:
+            yield line, block
+        return
+
+    # Table blocks (and some other types) have a nested `blocks` layer.
+    for inner in block.get("blocks") or []:
+        for line in inner.get("lines") or []:
+            yield line, block
+
+
+def _iter_line_fields(page: dict):
+    """Yield one record per non-empty *line* across the whole page.
+
+    Blocks are read from ``page["para_blocks"]`` first and then from
+    ``page["blocks"]``; missing or ``null`` lists are treated as empty.
+    NOTE(review): a page carrying both keys contributes lines from both -
+    confirm that the two never describe the same content.
+
+    Each yielded tuple is
+    ``(merged_text, line, first_text_span, block, table_html)`` where:
+
+    - ``merged_text``     - all non-empty span contents concatenated without
+      a separator (already converted by ``_span_content``)
+    - ``line``            - the MinerU line dict (its bbox is what
+      ``parse_mineru_fields`` reports)
+    - ``first_text_span`` - first non-table span with non-empty content, or
+      ``None``; callers read font metadata from it
+    - ``block``           - the outer containing block (carries ``type``)
+    - ``table_html``      - raw ``html`` of the last table span in the line,
+      or ``None``; kept so a front end can render colspan/rowspan tables
+
+    Merging at the line level handles footer / title blocks where a single
+    printed sentence is split across many spans (e.g. text + inline_equation
+    + text ...).  Table blocks are presumably one ``type="table"`` span per
+    line, which gives one record per table - verify against real output.
+    """
+    for blocks_key in ("para_blocks", "blocks"):
+        for block in page.get(blocks_key) or []:
+            for line, src_block in _iter_lines(block):
+                parts: list[str] = []
+                first_text_span: dict | None = None
+                table_html: str | None = None
+
+                for span in line.get("spans") or []:
+                    content = _span_content(span)
+                    if not content:
+                        continue
+                    parts.append(content)
+                    if span.get("type") == "table":
+                        table_html = span.get("html") or None
+                    elif first_text_span is None:
+                        first_text_span = span
+
+                # `parts` only ever holds non-empty strings, so a non-empty
+                # list means a non-empty merged text.
+                if parts:
+                    yield (
+                        "".join(parts),
+                        line,
+                        first_text_span,
+                        src_block,
+                        table_html,
+                    )
+
+
+def _bbox(obj: dict) -> tuple[float, float, float, float]:
+    """Return ``(x0, y0, x1, y1)`` as floats from an object's ``bbox`` field.
+
+    A missing or empty ``bbox``, a value that is not a list/tuple, or one
+    with fewer than four entries yields ``(0.0, 0.0, 0.0, 0.0)``.  Entries
+    beyond the fourth are ignored.
+    """
+    raw = obj.get("bbox") or [0, 0, 0, 0]
+    if not isinstance(raw, (list, tuple)) or len(raw) < 4:
+        return 0.0, 0.0, 0.0, 0.0
+    x0, y0, x1, y1 = (float(value) for value in raw[:4])
+    return x0, y0, x1, y1
+
+
+def _font_metrics(span: dict | None) -> tuple[float | None, str | None]:
+    """Return ``(font_size_pt, font_name)`` read from *span*.
+
+    The size is taken from ``size`` or ``font_size`` and the name from
+    ``font`` or ``font_name``.  Either element is ``None`` when the span is
+    ``None``, the key is absent/empty, or the size is not numeric.
+    """
+    if span is None:
+        return None, None
+
+    font_size_pt: float | None = None
+    raw_size = span.get("size") or span.get("font_size")
+    if raw_size is not None:
+        try:
+            font_size_pt = float(raw_size)
+        except (TypeError, ValueError):
+            # Unparseable size: report the field without font metrics.
+            font_size_pt = None
+
+    font_name = span.get("font") or span.get("font_name") or None
+    return font_size_pt, font_name
+
+
+def parse_mineru_fields(data: dict) -> MineruDocument:
+    """Convert raw MinerU structured JSON into a :class:`MineruDocument`.
+
+    Parameters
+    ----------
+    data:
+        The parsed JSON dict returned by :class:`~backend.app.mineru_client.MineruClient`.
+        Expected to contain a ``pdf_info`` list with one entry per page.
+
+    Returns
+    -------
+    MineruDocument
+        Holds the first page's dimensions (used for the preview) and a flat
+        list of field dicts, one per non-empty line on any page.  When
+        ``pdf_info`` is missing or empty, a warning is logged and an A4-sized
+        document without fields is returned.
+
+    Notes
+    -----
+    The 1-based ``page`` number of a field comes from the page's
+    ``page_idx``.  When a page has no ``page_idx``, its position in
+    ``pdf_info`` is used instead, so such pages still get distinct numbers.
+    A font size of zero is reported as ``None``, like a missing one.
+    """
+    pdf_info: list[dict] = data.get("pdf_info") or []
+    if not pdf_info:
+        logger.warning("MinerU JSON contains empty pdf_info")
+        return MineruDocument(page_width=595.0, page_height=842.0, fields=[])
+
+    # Use the first page's dimensions for the preview.
+    page_width, page_height = _page_size(pdf_info[0])
+
+    fields: list[dict] = []
+    for position, page in enumerate(pdf_info):
+        raw_idx = page.get("page_idx")
+        page_idx = int(raw_idx) if raw_idx is not None else position
+        page_num = page_idx + 1
+
+        for content, line, font_span, block, table_html in _iter_line_fields(page):
+            # bbox comes from the line (covers all spans in one visual row)
+            x0, y0, x1, y1 = _bbox(line)
+            font_size_pt, font_name = _font_metrics(font_span)
+            block_type = (block.get("type") or "text").strip() or "text"
+
+            fields.append(
+                {
+                    "page": page_num,
+                    "block_type": block_type,
+                    "text": content,
+                    "table_html": table_html,
+                    "font_name": font_name,
+                    "font_size_pt": round(font_size_pt, 2) if font_size_pt else None,
+                    "font_height_mm": (
+                        round(font_size_pt * PT_TO_MM, 2) if font_size_pt else None
+                    ),
+                    "x0_pt": round(x0, 2),
+                    "top_pt": round(y0, 2),
+                    "x1_pt": round(x1, 2),
+                    "bottom_pt": round(y1, 2),
+                }
+            )
+
+    logger.info(
+        "MinerU parser extracted %d fields across %d page(s)",
+        len(fields),
+        len(pdf_info),
+    )
+    return MineruDocument(
+        page_width=page_width,
+        page_height=page_height,
+        fields=fields,
+    )
+
+
+def _nested_image_path(block: dict) -> str | None:
+    """Return the first image path found on a span inside *block*, if any.
+
+    NOTE(review): MinerU's middle.json appears to store the file name on the
+    span under ``image_path`` (block -> blocks -> lines -> spans) rather than
+    on the image block itself - confirm against real output.
+    """
+    for line, _owner in _iter_lines(block):
+        for span in line.get("spans") or []:
+            path = span.get("image_path") or span.get("img_path")
+            if path:
+                return path
+    return None
+
+
+def parse_mineru_image_blocks(data: dict) -> list[dict]:
+    """Extract every ``image``-typed block from MinerU structured JSON.
+
+    Blocks are read from each page's ``para_blocks`` and ``blocks`` lists;
+    a missing or ``null`` ``pdf_info`` or block list is treated as empty.
+    The block type is compared case-insensitively after trimming.
+
+    Returns
+    -------
+    list of dict, each containing:
+        - page        : page number (1-based), from the page's ``page_idx``
+                        or, when that is absent, its position in ``pdf_info``
+        - block_type  : "image"
+        - img_path    : relative path recorded by MinerU, taken from the
+                        block (``img_path`` / ``image_path`` / ``path``) or,
+                        failing that, from a span nested in the block;
+                        may be None
+        - x0_pt, top_pt, x1_pt, bottom_pt : block bounding box, rounded to
+                        two decimals (same coordinate system as text fields)
+    """
+    images: list[dict] = []
+
+    for position, page in enumerate(data.get("pdf_info") or []):
+        raw_idx = page.get("page_idx")
+        page_idx = int(raw_idx) if raw_idx is not None else position
+        page_num = page_idx + 1
+
+        for blocks_key in ("para_blocks", "blocks"):
+            for block in page.get(blocks_key) or []:
+                if (block.get("type") or "").strip().lower() != "image":
+                    continue
+                x0, y0, x1, y1 = _bbox(block)
+                # MinerU sometimes puts the image path in one of these
+                # block-level fields; otherwise look inside the block.
+                img_path = (
+                    block.get("img_path")
+                    or block.get("image_path")
+                    or block.get("path")
+                    or _nested_image_path(block)
+                )
+                images.append(
+                    {
+                        "page": page_num,
+                        "block_type": "image",
+                        "img_path": img_path,
+                        "x0_pt": round(x0, 2),
+                        "top_pt": round(y0, 2),
+                        "x1_pt": round(x1, 2),
+                        "bottom_pt": round(y1, 2),
+                    }
+                )
+
+    logger.info("MinerU parser found %d image block(s)", len(images))
+    return images