Files
ZLD_POC/backend/app/mineru_parser.py
2026-04-15 17:18:49 +08:00

300 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Parse MinerU structured JSON (layout.json / middle.json) into field records."""
from __future__ import annotations
import logging
import re
from dataclasses import dataclass, field
from bs4 import BeautifulSoup
logger = logging.getLogger(__name__)
def _extract_table_text(html: str) -> str:
    """Flatten table HTML into a multi-line string suitable for text matching.

    Every ``<tr>`` becomes one output line built from the whitespace-stripped
    texts of its ``<td>``/``<th>`` cells, concatenated in document order.
    Rows whose cells are all empty are skipped, and the remaining rows are
    joined with newlines.

    If the HTML parser raises for any reason, the function falls back to a
    crude regex extraction: every tag is replaced by a space and the result
    is stripped.
    """
    try:
        document = BeautifulSoup(html, "html.parser")
        per_row_cells = (
            [cell.get_text(strip=True) for cell in row.find_all(["td", "th"])]
            for row in document.find_all("tr")
        )
        return "\n".join(
            "".join(cell_texts) for cell_texts in per_row_cells if any(cell_texts)
        )
    except Exception:
        # Deliberate best effort: a parser failure must not abort extraction.
        return re.sub(r"<[^>]+>", " ", html).strip()
# Conversion factor from points to millimetres (1 pt = 0.352778 mm).
PT_TO_MM = 0.352778
# LaTeX inline-equation → Unicode 映射(仅处理标签文件中常见的符号)
_LATEX_TO_UNICODE: dict[str, str] = {
r"\times": "×",
r"\div": "÷",
r"\pm": "±",
r"\mp": "",
r"\cdot": "·",
r"\leq": "",
r"\geq": "",
r"\neq": "",
r"\approx": "",
r"\infty": "",
r"\circ": "°",
r"\degree": "°",
r"\alpha": "α",
r"\beta": "β",
r"\gamma": "γ",
r"\delta": "δ",
r"\mu": "μ",
r"\%": "%",
}
# MinerU sometimes emits ^{\circ} as ^{circ} (the backslash is missing).
# A single regex matches both spellings.
_SUPERSCRIPT_DEGREE_RE = re.compile(r"\^\{\\?circ\}", re.IGNORECASE)
@dataclass
class MineruDocument:
    """Result of parsing MinerU JSON: page dimensions plus extracted fields."""

    # Page width, in points.
    page_width: float
    # Page height, in points.
    page_height: float
    # Field dicts, ready to be returned in the API response.
    fields: list[dict]
def _page_size(page: dict) -> tuple[float, float]:
"""Return (width, height) in points for a MinerU page entry."""
# MinerU stores page size as [width, height] in `page_size`
size = page.get("page_size") or page.get("page_size_pt") or []
if isinstance(size, (list, tuple)) and len(size) >= 2:
return float(size[0]), float(size[1])
# Fallback: inspect block bboxes
return 595.0, 842.0 # A4 default
def _latex_to_text(expr: str) -> str:
    r"""Convert a simple LaTeX expression into readable plain text.

    Steps, in order:

    1. Strip surrounding whitespace.
    2. Turn a degree superscript (``^{circ}`` or ``^{\circ}``) into ``°``.
    3. Unwrap any other ``^{...}`` / ``_{...}``, keeping only the content.
    4. Replace each alphabetic command (backslash followed by ASCII letters)
       with its entry in ``_LATEX_TO_UNICODE``; commands without an entry
       degrade to their bare name (``\foo`` becomes ``foo``).
    5. Replace the non-alphabetic entries of ``_LATEX_TO_UNICODE`` (such as
       ``\%``) by plain substring replacement.

    Commands are matched as whole tokens, so a longer command that merely
    starts with a known one (``\leqslant``, ``\pmod``) is no longer partially
    rewritten into a symbol followed by leftover letters.
    """
    text = expr.strip()
    # Degree superscripts first, so the generic unwrapping below does not
    # reduce them to a bare "circ".
    text = _SUPERSCRIPT_DEGREE_RE.sub("°", text)
    # Remaining superscripts / subscripts: drop the wrapper, keep the content.
    text = re.sub(r"[\^_]\{([^}]*)\}", r"\1", text)

    def _translate(match: re.Match[str]) -> str:
        # Whole-command lookup; an unknown command falls back to its name.
        return _LATEX_TO_UNICODE.get(match.group(0), match.group(1))

    text = re.sub(r"\\([A-Za-z]+)", _translate, text)

    # Escaped punctuation cannot be the prefix of a longer command name,
    # so a plain replace is safe for those entries.
    for latex, uni in _LATEX_TO_UNICODE.items():
        if not latex[1:].isalpha():
            text = text.replace(latex, uni)
    return text
def _span_content(span: dict) -> str:
    """Extract the matchable text of a span.

    - ``type == "table"``: the ``html`` field is converted to row text via
      ``_extract_table_text``; an absent or empty ``html`` gives ``""``.
    - ``type == "inline_equation"``: the stripped ``content`` is converted
      from LaTeX to text via ``_latex_to_text``.
    - any other type: the stripped ``content`` with leftover degree
      superscripts (e.g. ``^{circ}``) replaced by ``°``.
    """
    kind = span.get("type") or ""
    if kind == "table":
        markup = span.get("html") or ""
        if not markup:
            return ""
        return _extract_table_text(markup)

    text = (span.get("content") or "").strip()
    if kind == "inline_equation":
        return _latex_to_text(text)
    # Plain text spans may still carry an embedded LaTeX degree superscript.
    return _SUPERSCRIPT_DEGREE_RE.sub("°", text)
def _iter_lines(block: dict):
"""Yield (line, block) tuples for all lines in a block.
Handles two MinerU structures:
- Flat: block → lines → spans (text/title/etc.)
- Nested: block → blocks → lines → spans (table blocks)
"""
lines = block.get("lines")
if lines:
for line in lines:
yield line, block
else:
# Table blocks (and some other types) have a nested `blocks` layer
for inner in block.get("blocks", []):
for line in inner.get("lines", []):
yield line, block
def _iter_line_fields(page: dict):
    """Yield one record per non-empty *line* of a page.

    Blocks are read from ``page["para_blocks"]`` first and then from
    ``page["blocks"]``. Each yielded tuple is
    ``(merged_text, line, first_text_span, block, table_html)`` where:

    - ``merged_text``: the non-empty span contents of the line, concatenated
      (as returned by ``_span_content``)
    - ``line``: the line dict itself
    - ``first_text_span``: the first non-table span with non-empty content,
      or ``None`` if the line has none
    - ``block``: the outer block the line was reached from
    - ``table_html``: the ``html`` of the last table span with non-empty
      content, or ``None`` if the line has no such span

    Merging at line level keeps a sentence that is split across several
    spans (text + inline equation + text ...) together as a single record.
    Lines without spans, or whose spans all yield empty content, are skipped.
    """
    for blocks_key in ("para_blocks", "blocks"):
        for block in page.get(blocks_key, []):
            for line, owner in _iter_lines(block):
                spans = line.get("spans", [])
                if not spans:
                    continue

                pieces: list[str] = []
                font_span: dict | None = None
                html: str | None = None
                for span in spans:
                    text = _span_content(span)
                    if not text:
                        continue
                    pieces.append(text)
                    if span.get("type") == "table":
                        # Keep the raw HTML so a front end can render
                        # complex tables (colspan / rowspan) faithfully.
                        html = span.get("html") or None
                    elif font_span is None:
                        font_span = span

                # Every piece is non-empty, so a non-empty list means
                # non-empty merged text.
                if pieces:
                    yield "".join(pieces), line, font_span, owner, html
def _bbox(obj: dict) -> tuple[float, float, float, float]:
"""Return (x0, y0, x1, y1) from an object's bbox field."""
bbox = obj.get("bbox") or [0, 0, 0, 0]
if isinstance(bbox, (list, tuple)) and len(bbox) >= 4:
return float(bbox[0]), float(bbox[1]), float(bbox[2]), float(bbox[3])
return 0.0, 0.0, 0.0, 0.0
def _font_metadata(span: dict | None) -> tuple[float | None, str | None]:
    """Return ``(font_size_pt, font_name)`` read from *span*, ``None`` where unavailable."""
    if span is None:
        return None, None
    size_pt: float | None = None
    raw_size = span.get("size") or span.get("font_size")
    if raw_size is not None:
        try:
            size_pt = float(raw_size)
        except (TypeError, ValueError):
            # A non-numeric size is treated as "unknown" rather than fatal.
            size_pt = None
    name = span.get("font") or span.get("font_name") or None
    return size_pt, name


def parse_mineru_fields(data: dict) -> MineruDocument:
    """Convert raw MinerU structured JSON into a :class:`MineruDocument`.

    Parameters
    ----------
    data:
        The parsed JSON dict returned by
        :class:`~backend.app.mineru_client.MineruClient`.
        Must contain a ``pdf_info`` list with one entry per page.

    Returns
    -------
    MineruDocument
        Holds the first page's dimensions and a flat list of field dicts,
        one per non-empty line. When ``pdf_info`` is missing or empty, a
        warning is logged and an A4-sized document with no fields is returned.

    Notes
    -----
    The 1-based ``page`` of each field comes from the page's ``page_idx``.
    When a page has no ``page_idx`` (or it is ``None``), the page's position
    in ``pdf_info`` is used instead, so such pages are not all reported as
    page 1.
    """
    pdf_info: list[dict] = data.get("pdf_info", [])
    if not pdf_info:
        logger.warning("MinerU JSON contains empty pdf_info")
        return MineruDocument(page_width=595.0, page_height=842.0, fields=[])

    # The preview uses the first page's dimensions.
    page_width, page_height = _page_size(pdf_info[0])

    fields: list[dict] = []
    for position, page in enumerate(pdf_info):
        raw_idx = page.get("page_idx")
        page_num = (position if raw_idx is None else int(raw_idx)) + 1

        for content, line, font_span, block, table_html in _iter_line_fields(page):
            # The bbox comes from the line, not from an individual span.
            x0, y0, x1, y1 = _bbox(line)
            font_size_pt, font_name = _font_metadata(font_span)
            # A size of 0 is treated like a missing size.
            has_size = bool(font_size_pt)
            block_type = (block.get("type") or "text").strip() or "text"

            fields.append(
                {
                    "page": page_num,
                    "block_type": block_type,
                    "text": content,
                    "table_html": table_html,
                    "font_name": font_name,
                    "font_size_pt": round(font_size_pt, 2) if has_size else None,
                    "font_height_mm": (
                        round(font_size_pt * PT_TO_MM, 2) if has_size else None
                    ),
                    "x0_pt": round(x0, 2),
                    "top_pt": round(y0, 2),
                    "x1_pt": round(x1, 2),
                    "bottom_pt": round(y1, 2),
                }
            )

    logger.info(
        "MinerU parser extracted %d fields across %d page(s)",
        len(fields),
        len(pdf_info),
    )
    return MineruDocument(
        page_width=page_width,
        page_height=page_height,
        fields=fields,
    )
def parse_mineru_image_blocks(data: dict) -> list[dict]:
    """Extract every top-level ``image`` block from MinerU structured JSON.

    Both ``para_blocks`` and ``blocks`` of each page are scanned; a block is
    selected when its ``type``, stripped and lower-cased, equals ``"image"``.

    Parameters
    ----------
    data:
        Parsed MinerU JSON. A missing, ``None`` or empty ``pdf_info`` yields
        an empty result.

    Returns
    -------
    list of dict
        One entry per image block, containing:

        - ``page``: 1-based page number, taken from the page's ``page_idx``;
          when that is absent or ``None`` the page's position in ``pdf_info``
          is used instead
        - ``block_type``: always ``"image"``
        - ``img_path``: the first non-empty value among the block's
          ``img_path``, ``image_path`` and ``path`` keys, else ``None``
        - ``x0_pt``, ``top_pt``, ``x1_pt``, ``bottom_pt``: the block's
          bounding box, rounded to two decimals
    """
    images: list[dict] = []
    for position, page in enumerate(data.get("pdf_info") or []):
        raw_idx = page.get("page_idx")
        page_num = (position if raw_idx is None else int(raw_idx)) + 1

        for blocks_key in ("para_blocks", "blocks"):
            # ``or []`` also tolerates a key that is present but null.
            for block in page.get(blocks_key) or []:
                if (block.get("type") or "").strip().lower() != "image":
                    continue
                x0, y0, x1, y1 = _bbox(block)
                # The image path may be stored under any of these keys.
                img_path = (
                    block.get("img_path")
                    or block.get("image_path")
                    or block.get("path")
                    or None
                )
                images.append(
                    {
                        "page": page_num,
                        "block_type": "image",
                        "img_path": img_path,
                        "x0_pt": round(x0, 2),
                        "top_pt": round(y0, 2),
                        "x1_pt": round(x1, 2),
                        "bottom_pt": round(y1, 2),
                    }
                )

    logger.info("MinerU parser found %d image block(s)", len(images))
    return images