Files
ZLD_POC/tests/backend/test_mineru_parser.py
2026-04-15 17:18:49 +08:00

100 lines
3.0 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from __future__ import annotations
from backend.app.mineru_parser import parse_mineru_fields
def test_parse_mineru_fields_extracts_text_and_bbox() -> None:
    """Check that a title block is reported as one field with text and bbox.

    The payload holds a single page (``page_idx`` 0, ``page_size``
    2772 x 1961) with one title block that contains one text span. The
    result is expected to expose the page size, and to list exactly one
    field on page 1 whose text is the span content, whose font attributes
    are empty / ``None``, and whose ``x0_pt`` / ``top_pt`` / ``x1_pt`` /
    ``bottom_pt`` values equal the input bbox coordinates.
    """
    title_span = {
        "type": "text",
        "content": "食品名称:天问礼品粽",
        "bbox": [704, 134, 2106, 229],
    }
    title_block = {
        "bbox": [704, 134, 2106, 229],
        "type": "title",
        "lines": [{"spans": [title_span]}],
    }
    first_page = {
        "page_idx": 0,
        "page_size": [2772, 1961],
        "para_blocks": [title_block],
    }

    result = parse_mineru_fields({"pdf_info": [first_page]})

    expected_field = {
        "page": 1,
        "text": "食品名称:天问礼品粽",
        "font_name": "",
        "font_size_pt": None,
        "font_height_mm": None,
        "x0_pt": 704.0,
        "top_pt": 134.0,
        "x1_pt": 2106.0,
        "bottom_pt": 229.0,
    }
    assert result.page_width == 2772
    assert result.page_height == 1961
    assert result.fields == [expected_field]
def test_parse_mineru_fields_turns_table_html_into_text() -> None:
    """Check that a table span's HTML is reduced to space-separated cell text.

    The payload has one page with a single table block whose only span
    carries an ``html`` value (a 2 x 2 table) and no ``content`` key. The
    first reported field's text is expected to be the four cell values
    joined by single spaces, in row order.
    """
    table_span = {
        "type": "table",
        "html": "<table><tr><td>品种</td><td>规格</td></tr><tr><td>黑猪肉粽</td><td>130克×1</td></tr></table>",
    }
    table_block = {
        "bbox": [10, 20, 300, 200],
        "type": "table",
        "lines": [{"spans": [table_span]}],
    }
    first_page = {
        "page_idx": 0,
        "page_size": [1000, 800],
        "para_blocks": [table_block],
    }

    result = parse_mineru_fields({"pdf_info": [first_page]})

    first_field = result.fields[0]
    assert first_field["text"] == "品种 规格 黑猪肉粽 130克×1"
def test_parse_mineru_fields_skips_empty_decorative_blocks() -> None:
    """Check that blocks without usable text produce no fields.

    Two blocks are supplied: an image block whose span has neither
    ``content`` nor ``html``, and a text block whose span content is only
    whitespace. The resulting field list is expected to be empty.
    """
    image_block = {
        "bbox": [1, 2, 3, 4],
        "type": "image",
        "lines": [{"spans": [{"type": "image"}]}],
    }
    blank_text_block = {
        "bbox": [5, 6, 7, 8],
        "type": "text",
        "lines": [{"spans": [{"content": "   "}]}],
    }
    first_page = {
        "page_idx": 0,
        "page_size": [1000, 800],
        "para_blocks": [image_block, blank_text_block],
    }

    result = parse_mineru_fields({"pdf_info": [first_page]})

    assert result.fields == []