"""Tests for ``parse_mineru_fields`` against small hand-built MinerU payloads."""

from __future__ import annotations

from backend.app.mineru_parser import parse_mineru_fields


def test_parse_mineru_fields_extracts_text_and_bbox() -> None:
    """A single title block yields one field carrying its text and bbox.

    The expected field reports page 1 for ``page_idx`` 0, float bbox
    coordinates, and empty/``None`` font attributes.
    """
    payload = {
        "pdf_info": [
            {
                "page_idx": 0,
                "page_size": [2772, 1961],
                "para_blocks": [
                    {
                        "bbox": [704, 134, 2106, 229],
                        "type": "title",
                        "lines": [
                            {
                                "spans": [
                                    {
                                        "type": "text",
                                        "content": "食品名称:天问礼品粽",
                                        "bbox": [704, 134, 2106, 229],
                                    }
                                ]
                            }
                        ],
                    }
                ],
            }
        ]
    }

    parsed = parse_mineru_fields(payload)

    assert parsed.page_width == 2772
    assert parsed.page_height == 1961
    assert parsed.fields == [
        {
            "page": 1,
            "text": "食品名称:天问礼品粽",
            "font_name": "",
            "font_size_pt": None,
            "font_height_mm": None,
            "x0_pt": 704.0,
            "top_pt": 134.0,
            "x1_pt": 2106.0,
            "bottom_pt": 229.0,
        }
    ]


def test_parse_mineru_fields_turns_table_html_into_text() -> None:
    """A table span's HTML is flattened into space-separated cell text."""
    # NOTE(review): the markup of this literal was missing (only the cell
    # text survived, split over several physical lines, which does not even
    # parse). It is reconstructed here as a minimal 2x2 table that matches
    # the expected text below -- confirm against the original fixture.
    table_html = (
        "<table>"
        "<tr><td>品种</td><td>规格</td></tr>"
        "<tr><td>黑猪肉粽</td><td>130克×1</td></tr>"
        "</table>"
    )
    payload = {
        "pdf_info": [
            {
                "page_idx": 0,
                "page_size": [1000, 800],
                "para_blocks": [
                    {
                        "bbox": [10, 20, 300, 200],
                        "type": "table",
                        "lines": [
                            {
                                "spans": [
                                    {
                                        "type": "table",
                                        "html": table_html,
                                    }
                                ]
                            }
                        ],
                    }
                ],
            }
        ]
    }

    parsed = parse_mineru_fields(payload)

    assert parsed.fields[0]["text"] == "品种 规格 黑猪肉粽 130克×1"


def test_parse_mineru_fields_skips_empty_decorative_blocks() -> None:
    """Blocks with no content or only whitespace content produce no fields."""
    payload = {
        "pdf_info": [
            {
                "page_idx": 0,
                "page_size": [1000, 800],
                "para_blocks": [
                    # Image span without any text content.
                    {
                        "bbox": [1, 2, 3, 4],
                        "type": "image",
                        "lines": [{"spans": [{"type": "image"}]}],
                    },
                    # Text span whose content is a single space.
                    {
                        "bbox": [5, 6, 7, 8],
                        "type": "text",
                        "lines": [{"spans": [{"content": " "}]}],
                    },
                ],
            }
        ]
    }

    parsed = parse_mineru_fields(payload)

    assert parsed.fields == []