"""Parse MinerU structured JSON (layout.json / middle.json) into field records."""

from __future__ import annotations

import logging
import re
from dataclasses import dataclass, field

try:
    from bs4 import BeautifulSoup
except ImportError:  # pragma: no cover - only hit when bs4 is not installed
    # Without bs4 the module stays importable; table spans are then flattened
    # by the regex fallback in ``_extract_table_text``.
    BeautifulSoup = None  # type: ignore[assignment,misc]

logger = logging.getLogger(__name__)


def _extract_table_text(html: str) -> str:
    """Convert table HTML into a multi-line string usable for text matching.

    Every ``<tr>`` becomes one output line whose ``<td>``/``<th>`` cell texts
    are joined with ``|``; rows are separated by newlines. Rows in which every
    cell is empty are skipped.

    If BeautifulSoup is unavailable or raises while parsing, the function
    falls back to replacing every tag with a space and stripping the result.
    """
    if BeautifulSoup is not None:
        try:
            soup = BeautifulSoup(html, "html.parser")
            rows = []
            for tr in soup.find_all("tr"):
                cells = [td.get_text(strip=True) for td in tr.find_all(["td", "th"])]
                if any(cells):
                    rows.append("|".join(cells))
            return "\n".join(rows)
        except Exception:
            # Best effort by design: record why, then use the crude fallback.
            logger.debug(
                "Table HTML parsing failed; using regex fallback", exc_info=True
            )
    return re.sub(r"<[^>]+>", " ", html).strip()


# 1 pt = 0.352778 mm
PT_TO_MM = 0.352778

# Page size (width, height) in points used when a page carries no usable size.
_A4_SIZE_PT: tuple[float, float] = (595.0, 842.0)

# LaTeX inline-equation command -> Unicode (only symbols seen in label files).
_LATEX_TO_UNICODE: dict[str, str] = {
    r"\times": "×",
    r"\div": "÷",
    r"\pm": "±",
    r"\mp": "∓",
    r"\cdot": "·",
    r"\cdots": "⋯",
    r"\leq": "≤",
    r"\leqslant": "≤",
    r"\geq": "≥",
    r"\geqslant": "≥",
    r"\neq": "≠",
    r"\approx": "≈",
    r"\infty": "∞",
    r"\circ": "°",
    r"\degree": "°",
    r"\alpha": "α",
    r"\beta": "β",
    r"\gamma": "γ",
    r"\delta": "δ",
    r"\mu": "μ",
    r"\%": "%",
}

# Replacement is plain substring substitution, so longer commands must be
# applied first: otherwise ``\leqslant`` would be consumed by ``\leq`` and
# come out as "≤slant" (likewise ``\cdots`` -> "·s").
_LATEX_REPLACEMENTS: tuple[tuple[str, str], ...] = tuple(
    sorted(_LATEX_TO_UNICODE.items(), key=lambda item: len(item[0]), reverse=True)
)

# MinerU sometimes emits ^{\circ} as ^{circ} (backslash missing); this pattern
# matches both spellings.
_SUPERSCRIPT_DEGREE_RE = re.compile(r"\^\{\\?circ\}", re.IGNORECASE)


@dataclass
class MineruDocument:
    """Parsed MinerU result: preview page size plus the flat field list."""

    page_width: float  # points
    page_height: float  # points
    fields: list[dict]  # list of field dicts ready for the API response


def _page_size(page: dict) -> tuple[float, float]:
    """Return (width, height) in points for a MinerU page entry.

    Reads ``page_size`` (or ``page_size_pt``) expecting ``[width, height]``.
    When the entry is missing, too short, or not numeric, the A4 default
    ``(595.0, 842.0)`` is returned.
    """
    size = page.get("page_size") or page.get("page_size_pt") or []
    if isinstance(size, (list, tuple)) and len(size) >= 2:
        try:
            return float(size[0]), float(size[1])
        except (TypeError, ValueError):
            pass  # malformed numbers: use the default below
    return _A4_SIZE_PT


def _latex_to_text(expr: str) -> str:
    """Convert a simple LaTeX expression into readable text.

    Steps, in order:

    1. ``^{circ}`` / ``^{\\circ}`` become ``°``.
    2. Remaining ``^{...}`` / ``_{...}`` wrappers are dropped, keeping content.
    3. Known commands are replaced by their Unicode symbol (longest first).
    4. Any other ``\\command`` loses its backslash and degrades to its letters.
    """
    result = expr.strip()
    result = _SUPERSCRIPT_DEGREE_RE.sub("°", result)
    result = re.sub(r"[\^_]\{([^}]*)\}", r"\1", result)
    for latex, uni in _LATEX_REPLACEMENTS:
        result = result.replace(latex, uni)
    result = re.sub(r"\\([A-Za-z]+)", r"\1", result)
    return result


def _span_content(span: dict) -> str:
    """Extract the matchable text of a span.

    - ``type == "table"``: parse the ``html`` field into row/cell text.
    - ``type == "inline_equation"``: convert LaTeX to Unicode text.
    - anything else: take ``content`` and repair the LaTeX degree superscript
      (``^{circ}``) that MinerU sometimes leaves inside plain text.
    """
    span_type = span.get("type") or ""
    if span_type == "table":
        html = span.get("html") or ""
        return _extract_table_text(html) if html else ""
    if span_type == "inline_equation":
        return _latex_to_text((span.get("content") or "").strip())
    raw = (span.get("content") or "").strip()
    return _SUPERSCRIPT_DEGREE_RE.sub("°", raw)


def _iter_lines(block: dict):
    """Yield ``(line, block)`` tuples for all lines in a block.

    Handles two MinerU structures:

    - Flat:   block -> lines -> spans            (text/title/etc.)
    - Nested: block -> blocks -> lines -> spans  (table blocks)

    The yielded block is always the outer *block*, also for nested lines.
    Missing or null ``blocks`` / ``lines`` entries are treated as empty.
    """
    lines = block.get("lines")
    if lines:
        for line in lines:
            yield line, block
    else:
        # Table blocks (and some other types) have a nested `blocks` layer.
        for inner in block.get("blocks") or []:
            for line in inner.get("lines") or []:
                yield line, block


def _iter_line_fields(page: dict):
    """Yield one record per non-empty *line* across the whole page.

    Blocks under ``para_blocks`` are visited first, then those under
    ``blocks``. Each yielded tuple is
    ``(merged_text, line, first_text_span, block, table_html)`` where:

    - ``merged_text``     - all span contents concatenated (LaTeX converted)
    - ``line``            - the MinerU line dict (its bbox is used by callers)
    - ``first_text_span`` - first non-table span of the line (callers read
      font metadata from it), or ``None``
    - ``block``           - the containing block (carries ``type``)
    - ``table_html``      - raw HTML of a table span in the line, or ``None``

    Merging at line level keeps a printed sentence together when it is split
    across several spans (e.g. text + inline_equation + text). Lines without
    spans, or whose merged text is empty, are skipped.
    """
    for blocks_key in ("para_blocks", "blocks"):
        for block in page.get(blocks_key) or []:
            for line, src_block in _iter_lines(block):
                spans = line.get("spans") or []
                if not spans:
                    continue
                parts: list[str] = []
                first_text_span: dict | None = None
                table_html: str | None = None
                for span in spans:
                    content = _span_content(span)
                    if content:
                        parts.append(content)
                    if span.get("type") == "table":
                        # Keep the raw HTML so the front end can render
                        # complex tables (colspan / rowspan).
                        table_html = span.get("html") or None
                    elif first_text_span is None:
                        first_text_span = span
                merged = "".join(parts)
                if merged:
                    yield merged, line, first_text_span, src_block, table_html


def _bbox(obj: dict) -> tuple[float, float, float, float]:
    """Return (x0, y0, x1, y1) from an object's ``bbox`` field.

    A missing, too short, or non-numeric bbox yields ``(0.0, 0.0, 0.0, 0.0)``.
    """
    bbox = obj.get("bbox") or [0, 0, 0, 0]
    if isinstance(bbox, (list, tuple)) and len(bbox) >= 4:
        try:
            return float(bbox[0]), float(bbox[1]), float(bbox[2]), float(bbox[3])
        except (TypeError, ValueError):
            pass  # malformed coordinates: use the zero box below
    return 0.0, 0.0, 0.0, 0.0


def _page_number(page: dict, position: int) -> int:
    """Return the 1-based page number for *page*.

    Uses the page's ``page_idx`` when it is present and integer-like;
    otherwise falls back to *position*, the page's 0-based index in
    ``pdf_info``, so pages lacking ``page_idx`` are not all labelled page 1.
    """
    raw_idx = page.get("page_idx")
    if raw_idx is not None:
        try:
            return int(raw_idx) + 1
        except (TypeError, ValueError):
            pass
    return position + 1


def parse_mineru_fields(data: dict) -> MineruDocument:
    """Convert raw MinerU structured JSON into a :class:`MineruDocument`.

    Parameters
    ----------
    data:
        The parsed JSON dict returned by
        :class:`~backend.app.mineru_client.MineruClient`. Must contain a
        ``pdf_info`` list with one entry per page.

    Returns
    -------
    MineruDocument
        Holds the first page's dimensions and a flat list of field dicts, one
        per non-empty line. When ``pdf_info`` is missing or empty, an A4-sized
        document with no fields is returned and a warning is logged.
    """
    pdf_info: list[dict] = data.get("pdf_info") or []
    if not pdf_info:
        logger.warning("MinerU JSON contains empty pdf_info")
        return MineruDocument(page_width=595.0, page_height=842.0, fields=[])

    # Use the first page's dimensions for the preview.
    page_width, page_height = _page_size(pdf_info[0])

    fields: list[dict] = []
    for position, page in enumerate(pdf_info):
        page_num = _page_number(page, position)
        for content, line, font_span, _block, table_html in _iter_line_fields(page):
            # bbox comes from the line (covers all spans in one visual row)
            x0, y0, x1, y1 = _bbox(line)

            font_size_pt: float | None = None
            font_name: str | None = None
            if font_span is not None:
                raw_size = font_span.get("size") or font_span.get("font_size")
                if raw_size is not None:
                    try:
                        font_size_pt = float(raw_size)
                    except (TypeError, ValueError):
                        pass  # unparsable size: leave as None
                font_name = font_span.get("font") or font_span.get("font_name") or None

            # A size of 0 is treated like a missing size.
            font_height_mm: float | None = (
                round(font_size_pt * PT_TO_MM, 2) if font_size_pt else None
            )
            block_type = (_block.get("type") or "text").strip() or "text"

            fields.append(
                {
                    "page": page_num,
                    "block_type": block_type,
                    "text": content,
                    "table_html": table_html,
                    "font_name": font_name,
                    "font_size_pt": round(font_size_pt, 2) if font_size_pt else None,
                    "font_height_mm": font_height_mm,
                    "x0_pt": round(x0, 2),
                    "top_pt": round(y0, 2),
                    "x1_pt": round(x1, 2),
                    "bottom_pt": round(y1, 2),
                }
            )

    logger.info(
        "MinerU parser extracted %d fields across %d page(s)",
        len(fields),
        len(pdf_info),
    )
    return MineruDocument(
        page_width=page_width,
        page_height=page_height,
        fields=fields,
    )


def parse_mineru_image_blocks(data: dict) -> list[dict]:
    """Extract every ``image``-typed block from MinerU structured JSON.

    Both ``para_blocks`` and ``blocks`` of each page are scanned; the block
    ``type`` is compared case-insensitively after stripping whitespace.

    Returns
    -------
    list of dict
        Each item contains:

        - ``page``: page number (1-based)
        - ``block_type``: always ``"image"``
        - ``img_path``: first non-empty of the block's ``img_path``,
          ``image_path`` or ``path`` fields, else ``None``
        - ``x0_pt``, ``top_pt``, ``x1_pt``, ``bottom_pt``: the block's
          bounding box, rounded to two decimals
    """
    pdf_info: list[dict] = data.get("pdf_info") or []
    images: list[dict] = []
    for position, page in enumerate(pdf_info):
        page_num = _page_number(page, position)
        for blocks_key in ("para_blocks", "blocks"):
            for block in page.get(blocks_key) or []:
                if (block.get("type") or "").strip().lower() != "image":
                    continue
                x0, y0, x1, y1 = _bbox(block)
                # The image path may live under any of these keys.
                img_path = (
                    block.get("img_path")
                    or block.get("image_path")
                    or block.get("path")
                    or None
                )
                images.append(
                    {
                        "page": page_num,
                        "block_type": "image",
                        "img_path": img_path,
                        "x0_pt": round(x0, 2),
                        "top_pt": round(y0, 2),
                        "x1_pt": round(x1, 2),
                        "bottom_pt": round(y1, 2),
                    }
                )
    logger.info("MinerU parser found %d image block(s)", len(images))
    return images