Initial commit: 包装审核 POC、Docker 与前后端
Made-with: Cursor
This commit is contained in:
299
backend/app/mineru_parser.py
Normal file
299
backend/app/mineru_parser.py
Normal file
@@ -0,0 +1,299 @@
|
||||
"""Parse MinerU structured JSON (layout.json / middle.json) into field records."""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _extract_table_text(html: str) -> str:
    """Flatten table HTML into multi-line text suitable for text matching.

    Each ``<tr>`` becomes one output line; the stripped text of its ``<td>`` /
    ``<th>`` cells is joined with ``|``. Rows whose cells are all empty are
    dropped, and the remaining rows are joined with newlines.

    Parameters
    ----------
    html:
        Raw table markup.

    Returns
    -------
    str
        The flattened table text. If parsing raises, the failure is logged and
        a crude regex fallback (every tag replaced by a space, result stripped)
        is returned instead.
    """
    try:
        soup = BeautifulSoup(html, "html.parser")
        rows = []
        for tr in soup.find_all("tr"):
            cells = [cell.get_text(strip=True) for cell in tr.find_all(["td", "th"])]
            if any(cells):
                rows.append("|".join(cells))
        return "\n".join(rows)
    except Exception:
        # Deliberate best-effort: keep going with a rough extraction, but leave
        # a trace so that parser failures are not completely invisible.
        logger.warning(
            "Table HTML parsing failed; falling back to regex tag stripping",
            exc_info=True,
        )
        return re.sub(r"<[^>]+>", " ", html).strip()
|
||||
|
||||
# 1 pt = 0.352778 mm
|
||||
PT_TO_MM = 0.352778
|
||||
|
||||
# LaTeX inline-equation → Unicode 映射(仅处理标签文件中常见的符号)
|
||||
_LATEX_TO_UNICODE: dict[str, str] = {
|
||||
r"\times": "×",
|
||||
r"\div": "÷",
|
||||
r"\pm": "±",
|
||||
r"\mp": "∓",
|
||||
r"\cdot": "·",
|
||||
r"\leq": "≤",
|
||||
r"\geq": "≥",
|
||||
r"\neq": "≠",
|
||||
r"\approx": "≈",
|
||||
r"\infty": "∞",
|
||||
r"\circ": "°",
|
||||
r"\degree": "°",
|
||||
r"\alpha": "α",
|
||||
r"\beta": "β",
|
||||
r"\gamma": "γ",
|
||||
r"\delta": "δ",
|
||||
r"\mu": "μ",
|
||||
r"\%": "%",
|
||||
}
|
||||
|
||||
# MinerU 有时将 ^{\circ} 输出为 ^{circ}(缺少反斜杠)
|
||||
# 用正则统一匹配两种写法
|
||||
_SUPERSCRIPT_DEGREE_RE = re.compile(r"\^\{\\?circ\}", re.IGNORECASE)
|
||||
|
||||
|
||||
@dataclass
class MineruDocument:
    """Container for a parsed MinerU document: a page size plus its fields."""

    # Page dimensions, in points.
    page_width: float
    page_height: float

    # Field dicts, ready to be placed in the API response.
    fields: list[dict]
|
||||
|
||||
|
||||
def _page_size(page: dict) -> tuple[float, float]:
|
||||
"""Return (width, height) in points for a MinerU page entry."""
|
||||
# MinerU stores page size as [width, height] in `page_size`
|
||||
size = page.get("page_size") or page.get("page_size_pt") or []
|
||||
if isinstance(size, (list, tuple)) and len(size) >= 2:
|
||||
return float(size[0]), float(size[1])
|
||||
# Fallback: inspect block bboxes
|
||||
return 595.0, 842.0 # A4 default
|
||||
|
||||
|
||||
def _latex_to_text(expr: str) -> str:
    """Convert a simple LaTeX expression into readable plain text.

    Steps, in order:

    1. ``^{circ}`` / ``^{\\circ}`` becomes the degree sign.
    2. Any other ``^{...}`` / ``_{...}`` wrapper is removed, keeping its content.
    3. Every command (a backslash followed by a run of ASCII letters, or ``\\%``)
       is looked up in ``_LATEX_TO_UNICODE``. Known commands are replaced by
       their Unicode character; unknown ones degrade to their bare name with
       the backslash removed.

    Commands are matched as whole names, so a longer command that merely starts
    with a known one (e.g. ``\\leqslant`` vs ``\\leq``) is not partially
    replaced.
    """
    result = _SUPERSCRIPT_DEGREE_RE.sub("°", expr.strip())
    result = re.sub(r"[\^_]\{([^}]*)\}", r"\1", result)

    def _replace_command(match):
        # group(0) is the full command including the backslash (the table key);
        # group(1) is the bare name used when the command is not in the table.
        return _LATEX_TO_UNICODE.get(match.group(0), match.group(1))

    return re.sub(r"\\([A-Za-z]+|%)", _replace_command, result)
|
||||
|
||||
|
||||
def _span_content(span: dict) -> str:
    """Extract the matchable text of a single span.

    - ``type == "table"``: the ``html`` field is flattened to row/column text
      (empty string when there is no HTML).
    - ``type == "inline_equation"``: the stripped ``content`` is converted from
      LaTeX to Unicode text.
    - any other type: the stripped ``content`` is returned with leftover LaTeX
      degree superscripts (such as ``^{circ}``) replaced by the degree sign.
    """
    kind = span.get("type") or ""

    if kind == "table":
        table_markup = span.get("html") or ""
        if not table_markup:
            return ""
        return _extract_table_text(table_markup)

    text = (span.get("content") or "").strip()
    if kind == "inline_equation":
        return _latex_to_text(text)

    # Plain text spans: MinerU sometimes embeds a LaTeX superscript directly in
    # the content, so normalise the degree notation here as well.
    return _SUPERSCRIPT_DEGREE_RE.sub("°", text)
|
||||
|
||||
|
||||
def _iter_lines(block: dict):
|
||||
"""Yield (line, block) tuples for all lines in a block.
|
||||
|
||||
Handles two MinerU structures:
|
||||
- Flat: block → lines → spans (text/title/etc.)
|
||||
- Nested: block → blocks → lines → spans (table blocks)
|
||||
"""
|
||||
lines = block.get("lines")
|
||||
if lines:
|
||||
for line in lines:
|
||||
yield line, block
|
||||
else:
|
||||
# Table blocks (and some other types) have a nested `blocks` layer
|
||||
for inner in block.get("blocks", []):
|
||||
for line in inner.get("lines", []):
|
||||
yield line, block
|
||||
|
||||
|
||||
def _iter_line_fields(page: dict):
    """Yield one record per non-empty line of the page.

    Blocks are taken first from ``page["para_blocks"]`` and then from
    ``page["blocks"]``. Each yielded tuple is
    ``(merged_text, line, first_text_span, block, table_html)``:

    - ``merged_text``: the non-empty span contents of the line, concatenated
      without a separator (LaTeX already converted).
    - ``line``: the MinerU line dict (its bbox covers the whole line).
    - ``first_text_span``: the first non-table span with non-empty content, or
      ``None`` if there is none.
    - ``block``: the outer block the line was reached from (carries ``type``).
    - ``table_html``: the raw ``html`` of a table span on the line, or ``None``.

    Merging per line keeps a sentence together when it is split across several
    spans (text + inline_equation + text ...).
    """
    for blocks_key in ("para_blocks", "blocks"):
        for outer in page.get(blocks_key, []):
            for line, owner in _iter_lines(outer):
                line_spans = line.get("spans", [])
                if not line_spans:
                    continue

                pieces = []
                font_span = None
                html = None
                for span in line_spans:
                    text = _span_content(span)
                    if not text:
                        continue
                    pieces.append(text)
                    if span.get("type") == "table":
                        # Keep the raw HTML so the frontend can render complex
                        # tables that use colspan/rowspan.
                        html = span.get("html") or None
                    elif font_span is None:
                        font_span = span

                # Every collected piece is non-empty, so a non-empty list means
                # a non-empty merged string.
                if pieces:
                    yield "".join(pieces), line, font_span, owner, html
|
||||
|
||||
|
||||
def _bbox(obj: dict) -> tuple[float, float, float, float]:
|
||||
"""Return (x0, y0, x1, y1) from an object's bbox field."""
|
||||
bbox = obj.get("bbox") or [0, 0, 0, 0]
|
||||
if isinstance(bbox, (list, tuple)) and len(bbox) >= 4:
|
||||
return float(bbox[0]), float(bbox[1]), float(bbox[2]), float(bbox[3])
|
||||
return 0.0, 0.0, 0.0, 0.0
|
||||
|
||||
|
||||
def parse_mineru_fields(data: dict) -> MineruDocument:
    """Convert raw MinerU structured JSON into a :class:`MineruDocument`.

    Parameters
    ----------
    data:
        The parsed MinerU JSON dict. It is expected to contain a ``pdf_info``
        list with one entry per page.

    Returns
    -------
    MineruDocument
        Page dimensions (taken from the first page) and a flat list of field
        dicts, one per non-empty line. When ``pdf_info`` is missing or empty a
        warning is logged and an empty A4-sized document is returned.

    Notes
    -----
    The 1-based ``page`` number of each field comes from the page's
    ``page_idx``; when a page carries no ``page_idx`` its position in
    ``pdf_info`` is used instead, so pages are never all collapsed onto page 1.
    """
    pdf_info: list[dict] = data.get("pdf_info") or []
    if not pdf_info:
        logger.warning("MinerU JSON contains empty pdf_info")
        return MineruDocument(page_width=595.0, page_height=842.0, fields=[])

    # The preview uses the first page's dimensions.
    page_width, page_height = _page_size(pdf_info[0])

    fields: list[dict] = []
    for position, page in enumerate(pdf_info):
        raw_idx = page.get("page_idx")
        page_num = (int(raw_idx) if raw_idx is not None else position) + 1

        for content, line, font_span, block, table_html in _iter_line_fields(page):
            # The bbox comes from the line, which covers all spans in one row.
            x0, y0, x1, y1 = _bbox(line)

            font_size_pt: float | None = None
            font_name: str | None = None
            if font_span is not None:
                raw_size = font_span.get("size") or font_span.get("font_size")
                if raw_size is not None:
                    try:
                        font_size_pt = float(raw_size)
                    except (TypeError, ValueError):
                        # Unparseable size: report the field without font size.
                        font_size_pt = None
                font_name = font_span.get("font") or font_span.get("font_name") or None

            # A zero size is treated the same as an unknown size.
            has_size = bool(font_size_pt)
            block_type = (block.get("type") or "text").strip() or "text"

            fields.append(
                {
                    "page": page_num,
                    "block_type": block_type,
                    "text": content,
                    "table_html": table_html,
                    "font_name": font_name,
                    "font_size_pt": round(font_size_pt, 2) if has_size else None,
                    "font_height_mm": (
                        round(font_size_pt * PT_TO_MM, 2) if has_size else None
                    ),
                    "x0_pt": round(x0, 2),
                    "top_pt": round(y0, 2),
                    "x1_pt": round(x1, 2),
                    "bottom_pt": round(y1, 2),
                }
            )

    logger.info(
        "MinerU parser extracted %d fields across %d page(s)",
        len(fields),
        len(pdf_info),
    )
    return MineruDocument(
        page_width=page_width,
        page_height=page_height,
        fields=fields,
    )
|
||||
|
||||
|
||||
def parse_mineru_image_blocks(data: dict) -> list[dict]:
    """Extract every ``image``-typed block from MinerU structured JSON.

    Blocks are read from both ``para_blocks`` and ``blocks`` of each page in
    ``data["pdf_info"]``; a missing or null ``pdf_info`` yields an empty list.

    Returns
    -------
    list of dict, each containing:
        - page       : 1-based page number (from ``page_idx``, or the page's
                       position in ``pdf_info`` when ``page_idx`` is absent)
        - block_type : always ``"image"``
        - img_path   : the path recorded by MinerU, or ``None`` if not found
        - x0_pt, top_pt, x1_pt, bottom_pt : the block's bounding box, rounded
                       to two decimals
    """
    pdf_info: list[dict] = data.get("pdf_info") or []
    images: list[dict] = []

    for position, page in enumerate(pdf_info):
        raw_idx = page.get("page_idx")
        page_num = (int(raw_idx) if raw_idx is not None else position) + 1

        for blocks_key in ("para_blocks", "blocks"):
            for block in page.get(blocks_key, []):
                if (block.get("type") or "").strip().lower() != "image":
                    continue

                x0, y0, x1, y1 = _bbox(block)

                # MinerU sometimes stores the image path directly on the block
                # under one of these keys.
                img_path = (
                    block.get("img_path")
                    or block.get("image_path")
                    or block.get("path")
                    or None
                )
                if img_path is None:
                    # Otherwise look for it on the spans of the block's lines
                    # (MinerU's documented layout keeps ``image_path`` on the
                    # nested image span).
                    img_path = next(
                        (
                            span.get("image_path")
                            for line, _owner in _iter_lines(block)
                            for span in line.get("spans") or []
                            if span.get("image_path")
                        ),
                        None,
                    )

                images.append(
                    {
                        "page": page_num,
                        "block_type": "image",
                        "img_path": img_path,
                        "x0_pt": round(x0, 2),
                        "top_pt": round(y0, 2),
                        "x1_pt": round(x1, 2),
                        "bottom_pt": round(y1, 2),
                    }
                )

    logger.info("MinerU parser found %d image block(s)", len(images))
    return images
|
||||
Reference in New Issue
Block a user