Initial commit: 包装审核 POC、Docker 与前后端

Made-with: Cursor
This commit is contained in:
2026-04-15 17:18:49 +08:00
commit bbb4dd43b3
74 changed files with 297415 additions and 0 deletions

View File

@@ -0,0 +1,299 @@
"""Parse MinerU structured JSON (layout.json / middle.json) into field records."""
from __future__ import annotations
import logging
import re
from dataclasses import dataclass, field
from bs4 import BeautifulSoup
logger = logging.getLogger(__name__)
def _extract_table_text(html: str) -> str:
    """Flatten table HTML into a multi-line string usable for text matching.

    Every ``<tr>`` that has at least one non-empty cell becomes one output
    line, built from the whitespace-stripped texts of its ``<td>``/``<th>``
    cells. Output lines are separated by a newline.

    NOTE(review): the cell texts of a row are concatenated with an empty
    string here. The earlier docstring suggested a visible separator between
    cells, so a separator character may have been lost - confirm.

    If anything in the HTML parsing path raises, the function falls back to a
    rough extraction that replaces every tag with a space.
    """
    try:
        document = BeautifulSoup(html, "html.parser")
        texts_per_row = (
            [cell.get_text(strip=True) for cell in row.find_all(["td", "th"])]
            for row in document.find_all("tr")
        )
        return "\n".join(
            "".join(texts) for texts in texts_per_row if any(texts)
        )
    except Exception:
        # Best-effort fallback when parsing fails: crude regex tag stripping
        return re.sub(r"<[^>]+>", " ", html).strip()
# Conversion factor from points to millimetres: 1 pt = 0.352778 mm
PT_TO_MM = 0.352778

# LaTeX inline-equation command -> Unicode character. Only the symbols that
# commonly appear in label files are covered. Every value must be a non-empty
# string: an empty value would make the replacement silently delete the symbol.
_LATEX_TO_UNICODE: dict[str, str] = {
    r"\times": "×",
    r"\div": "÷",
    r"\pm": "±",
    r"\mp": "∓",
    r"\cdot": "·",
    r"\leq": "≤",
    r"\geq": "≥",
    r"\neq": "≠",
    r"\approx": "≈",
    r"\infty": "∞",
    r"\circ": "°",
    r"\degree": "°",
    r"\alpha": "α",
    r"\beta": "β",
    r"\gamma": "γ",
    r"\delta": "δ",
    r"\mu": "μ",
    r"\%": "%",
}

# MinerU sometimes emits ^{\circ} as ^{circ} (backslash missing); this single
# case-insensitive pattern matches both spellings.
_SUPERSCRIPT_DEGREE_RE = re.compile(r"\^\{\\?circ\}", re.IGNORECASE)
@dataclass
class MineruDocument:
    """Result of parsing MinerU JSON: page dimensions plus field records.

    Attributes:
        page_width: Page width in points.
        page_height: Page height in points.
        fields: Field dicts ready for the API response.
    """

    page_width: float
    page_height: float
    fields: list[dict]
def _page_size(page: dict) -> tuple[float, float]:
    """Return ``(width, height)`` in points for a MinerU page entry.

    The size is read from ``page_size`` and, failing that, ``page_size_pt``;
    either is expected to be a list/tuple whose first two items are width and
    height. When no usable pair is found - key missing, too few items, or
    items that cannot be converted to ``float`` - the A4 portrait size
    (595 x 842 pt) is returned.
    """
    a4_portrait = (595.0, 842.0)
    size = page.get("page_size") or page.get("page_size_pt") or []
    if not isinstance(size, (list, tuple)) or len(size) < 2:
        return a4_portrait
    try:
        return float(size[0]), float(size[1])
    except (TypeError, ValueError):
        # Malformed entries such as [None, None] are handled like a missing size
        return a4_portrait
def _latex_to_text(expr: str) -> str:
    """Turn a simple LaTeX expression into readable text.

    Steps, in order:

    1. Strip surrounding whitespace.
    2. Rewrite a superscript degree (``^{circ}`` or ``^{\\circ}``) as ``°``.
    3. Unwrap every other ``^{...}`` / ``_{...}``, keeping only the content.
    4. Replace each command listed in ``_LATEX_TO_UNICODE`` with its mapped
       text, one plain substring replacement per entry.
    5. For any command left over (e.g. ``\\foo``), drop the backslash and keep
       the bare letters.
    """
    text = _SUPERSCRIPT_DEGREE_RE.sub("°", expr.strip())
    text = re.sub(r"[\^_]\{([^}]*)\}", r"\1", text)
    for command in _LATEX_TO_UNICODE:
        text = text.replace(command, _LATEX_TO_UNICODE[command])
    return re.sub(r"\\([A-Za-z]+)", r"\1", text)
def _span_content(span: dict) -> str:
    """Extract the matchable text of a single span.

    - ``type == "table"``: the ``html`` entry is converted to row text by
      ``_extract_table_text``; an absent or empty ``html`` gives ``""``.
    - ``type == "inline_equation"``: the stripped ``content`` is converted
      by ``_latex_to_text``.
    - any other type: the stripped ``content`` is returned with leftover
      superscript degrees (such as ``^{circ}``) rewritten as ``°``.
    """
    kind = span.get("type") or ""
    if kind == "table":
        markup = span.get("html") or ""
        if not markup:
            return ""
        return _extract_table_text(markup)

    text = (span.get("content") or "").strip()
    if kind == "inline_equation":
        return _latex_to_text(text)
    # Plain text spans: MinerU sometimes embeds a LaTeX superscript directly
    # in the content (e.g. ^{circ}), so normalise it here as well.
    return _SUPERSCRIPT_DEGREE_RE.sub("°", text)
def _iter_lines(block: dict):
    """Yield a ``(line, block)`` pair for every line found in *block*.

    Two layouts are handled:

    - Flat: ``block["lines"]`` is non-empty and its entries are yielded.
    - Nested: otherwise each entry of ``block["blocks"]`` is inspected and the
      entries of its ``lines`` are yielded (the layout used by table blocks).

    The second item of every pair is always the *block* that was passed in,
    never the inner block of the nested layout.
    """
    direct = block.get("lines")
    if direct:
        yield from ((entry, block) for entry in direct)
        return
    # No direct lines: descend one level into the nested `blocks` layer
    for child in block.get("blocks", []):
        yield from ((entry, block) for entry in child.get("lines", []))
def _iter_line_fields(page: dict):
    """Yield one record per non-empty *line* across the whole page.

    Each yielded tuple has five items,
    ``(merged_text, line, first_text_span, block, table_html)``:

    - ``merged_text``      contents of all spans of the line, each converted by
                           ``_span_content``, concatenated without a separator
    - ``line``             the MinerU line dict (callers read the bbox from it)
    - ``first_text_span``  first non-table span that contributed text, or
                           ``None`` (callers read font data from it)
    - ``block``            the block handed to ``_iter_lines`` (carries ``type``)
    - ``table_html``       raw ``html`` of a table span on the line, or ``None``

    Blocks are taken from ``page["para_blocks"]`` first and then from
    ``page["blocks"]``; lines without spans or without any text are skipped.

    Merging at the line level handles footer / title blocks where a single
    printed sentence is split across many spans (e.g. text + inline_equation
    + text ...). Table blocks presumably still produce one record per table
    because they carry one span (type="table") per line - verify against real
    MinerU output.
    """
    def _process_block_set(blocks_iter):
        # Walk one list of blocks and emit a record for each line with text.
        for block in blocks_iter:
            for line, src_block in _iter_lines(block):
                spans = line.get("spans", [])
                if not spans:
                    continue
                parts: list[str] = []
                first_text_span: dict | None = None
                table_html: str | None = None
                for span in spans:
                    content = _span_content(span)
                    # NOTE(review): indentation was lost in the reviewed copy;
                    # the table / first-span branch below is assumed to sit
                    # inside `if content:` - confirm against the original.
                    if content:
                        parts.append(content)
                        if span.get("type") == "table":
                            # Keep the raw HTML; the frontend can use it to render
                            # complex tables that contain colspan/rowspan
                            table_html = span.get("html") or None
                        elif first_text_span is None:
                            first_text_span = span
                merged = "".join(parts)
                if merged:
                    yield merged, line, first_text_span, src_block, table_html
    yield from _process_block_set(page.get("para_blocks", []))
    yield from _process_block_set(page.get("blocks", []))
def _bbox(obj: dict) -> tuple[float, float, float, float]:
    """Return ``(x0, y0, x1, y1)`` from an object's ``bbox`` entry.

    Only the first four items of ``bbox`` are used. The zero box
    ``(0.0, 0.0, 0.0, 0.0)`` is returned when ``bbox`` is missing or empty,
    is not a list/tuple, has fewer than four items, or contains a coordinate
    that cannot be converted to ``float``.
    """
    zero_box = (0.0, 0.0, 0.0, 0.0)
    bbox = obj.get("bbox")
    if not isinstance(bbox, (list, tuple)) or len(bbox) < 4:
        return zero_box
    try:
        x0, y0, x1, y1 = (float(value) for value in bbox[:4])
    except (TypeError, ValueError):
        # A malformed coordinate (e.g. None) is handled like a missing bbox
        return zero_box
    return x0, y0, x1, y1
def parse_mineru_fields(data: dict) -> MineruDocument:
    """Convert raw MinerU structured JSON into a :class:`MineruDocument`.

    Parameters
    ----------
    data:
        The parsed JSON dict returned by
        :class:`~backend.app.mineru_client.MineruClient`. Its ``pdf_info``
        entry is read as a list with one dict per page.

    Returns
    -------
    MineruDocument
        The first page's dimensions plus a flat list of field dicts, one per
        non-empty line. Each dict has the keys ``page``, ``block_type``,
        ``text``, ``table_html``, ``font_name``, ``font_size_pt``,
        ``font_height_mm``, ``x0_pt``, ``top_pt``, ``x1_pt`` and
        ``bottom_pt``. When ``pdf_info`` is missing or empty, a warning is
        logged and a document with A4 dimensions (595 x 842 pt) and no fields
        is returned.
    """
    pdf_info: list[dict] = data.get("pdf_info") or []
    if not pdf_info:
        logger.warning("MinerU JSON contains empty pdf_info")
        return MineruDocument(page_width=595.0, page_height=842.0, fields=[])

    # Only the first page's dimensions are reported (used for the preview)
    page_width, page_height = _page_size(pdf_info[0])

    fields: list[dict] = []
    for position, page in enumerate(pdf_info):
        # Prefer the page's own `page_idx`; when it is missing or unusable,
        # fall back to the position in `pdf_info` so that pages do not all
        # collapse onto page 1.
        try:
            page_idx = int(page.get("page_idx"))
        except (TypeError, ValueError):
            page_idx = position
        page_num = page_idx + 1

        for content, line, font_span, block, table_html in _iter_line_fields(page):
            # bbox comes from the line (covers all spans in one visual row)
            x0, y0, x1, y1 = _bbox(line)

            font_size_pt: float | None = None
            font_name: str | None = None
            if font_span is not None:
                raw_size = font_span.get("size") or font_span.get("font_size")
                if raw_size is not None:
                    try:
                        font_size_pt = float(raw_size)
                    except (TypeError, ValueError):
                        # Unparseable size: keep font_size_pt as None
                        pass
                font_name = font_span.get("font") or font_span.get("font_name") or None

            # A missing or zero size yields None for both derived values
            font_height_mm: float | None = (
                round(font_size_pt * PT_TO_MM, 2) if font_size_pt else None
            )
            block_type = (block.get("type") or "text").strip() or "text"

            fields.append(
                {
                    "page": page_num,
                    "block_type": block_type,
                    "text": content,
                    "table_html": table_html,
                    "font_name": font_name,
                    "font_size_pt": round(font_size_pt, 2) if font_size_pt else None,
                    "font_height_mm": font_height_mm,
                    "x0_pt": round(x0, 2),
                    "top_pt": round(y0, 2),
                    "x1_pt": round(x1, 2),
                    "bottom_pt": round(y1, 2),
                }
            )

    logger.info(
        "MinerU parser extracted %d fields across %d page(s)",
        len(fields),
        len(pdf_info),
    )
    return MineruDocument(
        page_width=page_width,
        page_height=page_height,
        fields=fields,
    )
def parse_mineru_image_blocks(data: dict) -> list[dict]:
    """Collect every top-level ``image`` block from MinerU structured JSON.

    For each page in ``data["pdf_info"]`` the lists under ``para_blocks`` and
    ``blocks`` are both scanned; a block counts as an image when its ``type``,
    stripped and lower-cased, equals ``"image"``. No de-duplication is done,
    so a block present under both keys is reported twice. A missing or null
    ``pdf_info``, ``para_blocks`` or ``blocks`` is treated as an empty list.

    Returns
    -------
    list of dict, each containing:
        - page       : page number (1-based), from ``page_idx`` or, when that
                       is missing or unusable, the page's position in
                       ``pdf_info``
        - block_type : always ``"image"``
        - img_path   : first non-empty value among the block's ``img_path``,
                       ``image_path`` and ``path`` entries, else ``None``
        - x0_pt, top_pt, x1_pt, bottom_pt : block bounding box, rounded to two
                       decimals (same coordinate system as the text fields)
    """
    pages: list[dict] = data.get("pdf_info") or []
    images: list[dict] = []
    for position, page in enumerate(pages):
        try:
            page_idx = int(page.get("page_idx"))
        except (TypeError, ValueError):
            page_idx = position
        page_num = page_idx + 1

        for blocks_key in ("para_blocks", "blocks"):
            for block in page.get(blocks_key) or []:
                if (block.get("type") or "").strip().lower() != "image":
                    continue
                x0, y0, x1, y1 = _bbox(block)
                # MinerU sometimes stores the image path under one of these keys.
                # NOTE(review): only block-level keys are read; if MinerU records
                # the path on a nested span instead, img_path stays None - confirm.
                img_path = (
                    block.get("img_path")
                    or block.get("image_path")
                    or block.get("path")
                    or None
                )
                images.append(
                    {
                        "page": page_num,
                        "block_type": "image",
                        "img_path": img_path,
                        "x0_pt": round(x0, 2),
                        "top_pt": round(y0, 2),
                        "x1_pt": round(x1, 2),
                        "bottom_pt": round(y1, 2),
                    }
                )
    logger.info("MinerU parser found %d image block(s)", len(images))
    return images