"""Tests for ``parse_mineru_fields`` from ``backend.app.mineru_parser``."""
from __future__ import annotations
|
||
|
||
from backend.app.mineru_parser import parse_mineru_fields
|
||
|
||
|
||
def test_parse_mineru_fields_extracts_text_and_bbox() -> None:
    """Check page size, text and bounding box for a single title block.

    The payload has one page (``page_idx`` 0, size 2772 x 1961) holding a
    single ``title`` block with one text span. The test expects the page
    dimensions to be reported and exactly one field, numbered as page 1,
    carrying the span text, empty font metadata, and the bbox as floats.
    """
    text_span = {
        "type": "text",
        "content": "食品名称:天问礼品粽",
        "bbox": [704, 134, 2106, 229],
    }
    title_block = {
        "bbox": [704, 134, 2106, 229],
        "type": "title",
        "lines": [{"spans": [text_span]}],
    }
    page = {
        "page_idx": 0,
        "page_size": [2772, 1961],
        "para_blocks": [title_block],
    }

    result = parse_mineru_fields({"pdf_info": [page]})

    expected_field = {
        "page": 1,
        "text": "食品名称:天问礼品粽",
        "font_name": "",
        "font_size_pt": None,
        "font_height_mm": None,
        "x0_pt": 704.0,
        "top_pt": 134.0,
        "x1_pt": 2106.0,
        "bottom_pt": 229.0,
    }
    assert result.page_width == 2772
    assert result.page_height == 1961
    assert result.fields == [expected_field]
|
||
|
||
|
||
def test_parse_mineru_fields_turns_table_html_into_text() -> None:
    """Check that a table span's HTML is reduced to space-separated cell text.

    The payload has one ``table`` block whose only span carries an ``html``
    value with a 2 x 2 table. The test expects the first parsed field's text
    to be the four cell values joined by single spaces, in document order.
    """
    table_html = "<table><tr><td>品种</td><td>规格</td></tr><tr><td>黑猪肉粽</td><td>130克×1</td></tr></table>"
    table_span = {"type": "table", "html": table_html}
    table_block = {
        "bbox": [10, 20, 300, 200],
        "type": "table",
        "lines": [{"spans": [table_span]}],
    }
    page = {
        "page_idx": 0,
        "page_size": [1000, 800],
        "para_blocks": [table_block],
    }

    result = parse_mineru_fields({"pdf_info": [page]})

    first_field = result.fields[0]
    assert first_field["text"] == "品种 规格 黑猪肉粽 130克×1"
|
||
|
||
|
||
def test_parse_mineru_fields_skips_empty_decorative_blocks() -> None:
    """Check that blocks without usable text produce no fields.

    The page contains an ``image`` block whose span has no content and a
    ``text`` block whose span content is only whitespace. The test expects
    the parsed field list to be empty.
    """
    image_block = {
        "bbox": [1, 2, 3, 4],
        "type": "image",
        "lines": [{"spans": [{"type": "image"}]}],
    }
    blank_text_block = {
        "bbox": [5, 6, 7, 8],
        "type": "text",
        "lines": [{"spans": [{"content": "   "}]}],
    }
    page = {
        "page_idx": 0,
        "page_size": [1000, 800],
        "para_blocks": [image_block, blank_text_block],
    }

    result = parse_mineru_fields({"pdf_info": [page]})

    assert result.fields == []
|