Files
ZLD_POC/backend/app/pipeline.py
2026-04-15 17:18:49 +08:00

508 lines
20 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Core processing pipeline: AI → PDF → PNG → Qwen crop → MinerU → validate."""
from __future__ import annotations
import logging
import os
import shutil
import subprocess
from pathlib import Path
from backend.app.barcode_detector import detect_barcodes
from backend.app.image_classifier import is_qr_code
from backend.app.mineru_client import MineruClient, MineruClientError
from backend.app.mineru_parser import parse_mineru_fields, parse_mineru_image_blocks
from backend.app.text_validation import validate_field_against_word
from backend.app.word_parser import extract_word_html, extract_word_text
logger = logging.getLogger(__name__)
# --------------------------------------------------------------------------- #
# Environment helpers #
# --------------------------------------------------------------------------- #
def _get_mineru_api_key() -> str:
    """Return the MinerU API key, or ``""`` when none is configured.

    Lookup order:

    1. The ``MINERU_API_KEY`` variable of the process environment.
    2. A ``MINERU_API_KEY=...`` line in a ``.env`` file located two, then
       three, directory levels above this module.

    In a ``.env`` file, blank lines, lines starting with ``#`` and lines
    without ``=`` are ignored.  Surrounding whitespace and surrounding
    single/double quotes are stripped from the value.  A candidate file that
    cannot be read or decoded is skipped with a warning instead of aborting
    the lookup.

    Returns
    -------
    str
        The API key, or an empty string if it was not found anywhere.
    """
    value = os.environ.get("MINERU_API_KEY", "").strip()
    if value:
        return value

    # Index the ancestor list defensively: when this module is installed at a
    # shallow path (e.g. /app/pipeline.py) ``parents[3]`` does not exist, and
    # indexing it eagerly would raise IndexError before any lookup happened.
    ancestors = Path(__file__).resolve().parents
    candidates = [
        ancestors[depth] / ".env"
        for depth in (2, 3)
        if depth < len(ancestors)
    ]

    for candidate in candidates:
        if not candidate.is_file():
            continue
        try:
            content = candidate.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError) as exc:
            logger.warning("Could not read %s (%s); skipping", candidate, exc)
            continue

        for raw in content.splitlines():
            line = raw.strip()
            if not line or line.startswith("#") or "=" not in line:
                continue
            key, val = line.split("=", 1)
            if key.strip() != "MINERU_API_KEY":
                continue
            cleaned = val.strip().strip('"').strip("'")
            if cleaned:
                logger.info("Loaded MINERU_API_KEY from %s", candidate)
                return cleaned

    return ""
# --------------------------------------------------------------------------- #
# AI → PDF conversion #
# --------------------------------------------------------------------------- #
def _ai_to_pdf(ai_path: Path, output_dir: Path) -> Path:
    """Convert an Adobe Illustrator file to PDF, keeping the original filename stem.

    Modern .ai files (CS and later) are internally PDF-based; they are
    re-written with pypdf.  Legacy EPS-based .ai files are converted with
    Ghostscript.  If pypdf is unavailable or fails on a PDF-based input, the
    input is copied to the destination unchanged.

    Parameters
    ----------
    ai_path:
        Uploaded ``.ai`` (or ``.pdf``) file.
    output_dir:
        Directory that receives ``<stem>.pdf``; created if missing.

    Returns
    -------
    Path
        Path of the resulting PDF inside ``output_dir``.

    Raises
    ------
    RuntimeError
        If a legacy file needs Ghostscript and it is not installed, or if
        Ghostscript exits with a non-zero status.
    subprocess.TimeoutExpired
        If Ghostscript runs longer than 120 seconds.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    pdf_path = output_dir / f"{ai_path.stem}.pdf"

    with ai_path.open("rb") as fh:
        header = fh.read(8)

    if header.startswith(b"%PDF-"):
        # The upload may already sit at the destination path (a .pdf stored
        # directly in output_dir).  Copying a file onto itself raises
        # shutil.SameFileError, so remember this case for the fallback.
        already_in_place = pdf_path.exists() and pdf_path.samefile(ai_path)
        # Write to a staging file first so that a failure half-way through
        # never leaves a truncated PDF at the destination (or destroys the
        # source when source and destination are the same file).
        staging_path = pdf_path.with_name(pdf_path.name + ".part")
        try:
            from pypdf import PdfReader, PdfWriter

            reader = PdfReader(str(ai_path))
            writer = PdfWriter()
            for page in reader.pages:
                writer.add_page(page)
            with staging_path.open("wb") as out:
                writer.write(out)
            staging_path.replace(pdf_path)
            logger.info("Converted PDF-based .ai via pypdf: %s", ai_path.name)
        except Exception as exc:
            # Deliberate best-effort: any pypdf problem (including the
            # package not being installed) degrades to a plain copy.
            logger.warning("pypdf failed (%s), falling back to direct copy", exc)
            if staging_path.exists():
                staging_path.unlink()
            if not already_in_place:
                shutil.copy2(ai_path, pdf_path)
    else:
        # Legacy EPS-based .ai → Ghostscript
        gs = (
            shutil.which("/opt/homebrew/bin/gs")
            or shutil.which("gs")
            or shutil.which("ghostscript")
        )
        if gs is None:
            raise RuntimeError(
                "Cannot convert legacy .ai file: Ghostscript is not installed. "
                "Run: brew install ghostscript"
            )
        # Ghostscript treats '%' in -sOutputFile as a page-number template;
        # a literal percent sign in the path must be doubled.
        gs_output = str(pdf_path).replace("%", "%%")
        result = subprocess.run(
            [
                gs, "-dNOPAUSE", "-dBATCH", "-dSAFER",
                "-sDEVICE=pdfwrite", f"-sOutputFile={gs_output}", str(ai_path),
            ],
            capture_output=True, text=True, timeout=120,
        )
        if result.returncode != 0:
            raise RuntimeError(
                f"Ghostscript failed (exit {result.returncode}):\n{result.stderr.strip()}"
            )
        logger.info("Converted legacy .ai via Ghostscript: %s", ai_path.name)

    return pdf_path
# --------------------------------------------------------------------------- #
# PDF → PNG rasterisation #
# --------------------------------------------------------------------------- #
def _pdf_to_png(pdf_path: Path, output_dir: Path, dpi: int = 150) -> Path:
    """Rasterise the first page of a PDF to a PNG.

    Tries, in order:

    1. Ghostscript (if an executable can be located)
    2. PyMuPDF (``pip install pymupdf``)

    A Ghostscript failure or timeout is logged and the PyMuPDF fallback is
    attempted.  The output is always named ``page1.png`` so that special
    characters in the source PDF name cannot affect the output path.

    Parameters
    ----------
    pdf_path:
        PDF file to rasterise.
    output_dir:
        Directory that receives ``page1.png``; created if missing.
    dpi:
        Rendering resolution in dots per inch.

    Returns
    -------
    Path
        Path of the generated PNG.

    Raises
    ------
    RuntimeError
        If Ghostscript did not produce the image and PyMuPDF is either not
        installed or fails as well.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    # Use a safe filename: special chars / spaces in the PDF stem can cause
    # Ghostscript to silently produce no output.
    png_path = output_dir / "page1.png"

    # ── 1. Ghostscript ────────────────────────────────────────────────────── #
    # Plain "gs" is included so that installs outside the two hard-coded
    # prefixes (e.g. /usr/bin/gs on Linux) are found, matching _ai_to_pdf.
    gs = (
        shutil.which("/opt/homebrew/bin/gs")
        or shutil.which("/usr/local/bin/gs")
        or shutil.which("gs")
        or shutil.which("ghostscript")
    )
    if gs:
        gs_timeout_s = 60
        # '%' in -sOutputFile is a page-number template for Ghostscript; a
        # literal percent sign in the directory path must be doubled.
        gs_output = str(png_path).replace("%", "%%")
        try:
            result = subprocess.run(
                [
                    gs, "-dNOPAUSE", "-dBATCH", "-dSAFER",
                    "-sDEVICE=png16m", f"-r{dpi}",
                    "-dFirstPage=1", "-dLastPage=1",
                    f"-sOutputFile={gs_output}", str(pdf_path),
                ],
                capture_output=True, text=True, timeout=gs_timeout_s,
            )
        except subprocess.TimeoutExpired:
            # Do not abort: the documented contract is to try PyMuPDF next.
            logger.warning(
                "Ghostscript rasterisation timed out after %d s", gs_timeout_s
            )
        else:
            if result.returncode == 0 and png_path.exists():
                w, h = _png_size(png_path)
                logger.info(
                    "Rasterised PDF → PNG via Ghostscript at %d DPI: %dx%d px (%d KB)",
                    dpi, w, h, png_path.stat().st_size // 1024,
                )
                return png_path
            logger.warning(
                "Ghostscript rasterisation failed (exit %d): %s",
                result.returncode, result.stderr[:300],
            )

    # ── 2. PyMuPDF fallback ───────────────────────────────────────────────── #
    # Import separately so that only a missing PyMuPDF produces the
    # "not available" message (not, say, a missing Pillow in _png_size).
    try:
        import fitz  # PyMuPDF
    except ImportError as exc:
        raise RuntimeError(
            "Cannot rasterise PDF to PNG: neither Ghostscript nor PyMuPDF is "
            "available. Run: pip install pymupdf OR brew install ghostscript"
        ) from exc

    try:
        doc = fitz.open(str(pdf_path))
        try:
            page = doc[0]
            zoom = dpi / 72.0  # PDF user space is 72 units per inch
            mat = fitz.Matrix(zoom, zoom)
            pix = page.get_pixmap(matrix=mat, alpha=False)
            pix.save(str(png_path))
        finally:
            # Release the document handle even when rendering fails.
            doc.close()
        w, h = _png_size(png_path)
        logger.info(
            "Rasterised PDF → PNG via PyMuPDF at %d DPI: %dx%d px (%d KB)",
            dpi, w, h, png_path.stat().st_size // 1024,
        )
        return png_path
    except Exception as exc:
        raise RuntimeError(f"Cannot rasterise PDF to PNG: {exc}") from exc
def _png_size(png_path: Path) -> tuple[int, int]:
    """Open an image file with Pillow and report its pixel dimensions.

    Returns
    -------
    tuple[int, int]
        ``(width, height)`` in pixels.
    """
    from PIL import Image

    with Image.open(png_path) as picture:
        dimensions = picture.size  # Pillow reports (width, height)
    return dimensions
# --------------------------------------------------------------------------- #
# Qwen VL region crop #
# --------------------------------------------------------------------------- #
def _crop_label_region(png_path: Path, output_dir: Path) -> Path:
    """Crop the page image to the label area reported by the region detector.

    The detector's regions are merged into a single box and the crop is saved
    as ``cropped_label.png`` inside ``output_dir``.

    The original ``png_path`` is returned unchanged when no API key is
    available, when region detection raises, when it reports no regions, or
    when the crop helper returns nothing.

    Parameters
    ----------
    png_path:
        Full-page PNG to analyse.
    output_dir:
        Directory that receives ``cropped_label.png`` (created on demand).

    Returns
    -------
    Path
        Path of the cropped PNG, or ``png_path`` on any fallback.
    """
    from backend.app.region_detector import (
        _get_api_key,
        crop_and_save,
        detect_regions,
        merge_regions,
    )

    dashscope_key = _get_api_key()
    if not dashscope_key:
        logger.info("DASHSCOPE_API_KEY not configured skipping AI crop, using full image")
        return png_path

    try:
        detected, _ = detect_regions(
            png_path, api_key=dashscope_key, api_max_side=1024
        )
    except Exception as exc:
        logger.warning("Qwen region detection failed (%s) using full image", exc)
        return png_path

    if not detected:
        logger.warning("No regions detected by Qwen using full image")
        return png_path

    label_box = merge_regions(detected)
    output_dir.mkdir(parents=True, exist_ok=True)
    target = output_dir / "cropped_label.png"

    # The crop helper chooses its own file names inside the scratch folder;
    # the single result is moved to a fixed, predictable name afterwards.
    saved = crop_and_save(png_path, [label_box], output_dir / "_tmp")
    if not saved:
        return png_path

    shutil.move(saved[0]["path"], str(target))
    width, height = _png_size(target)
    logger.info(
        "Qwen crop: bbox=(%d,%d)-(%d,%d) → %s (%dx%d px)",
        label_box.x1, label_box.y1, label_box.x2, label_box.y2,
        target.name, width, height,
    )
    return target
# --------------------------------------------------------------------------- #
# MinerU image-block QR processing #
# --------------------------------------------------------------------------- #
def _process_image_blocks(
    mineru_data: dict,
    source_image: Path,
    output_dir: Path,
) -> list[dict]:
    """Run the QR / barcode recognition flow on every MinerU image block.

    Flow
    ----
    1. Extract the image blocks (with their bounding boxes) from
       ``mineru_data``.
    2. Crop each bounding box out of ``source_image`` and save it as a PNG.
    3. Ask ``is_qr_code`` whether the crop is a QR code / barcode.
    4. Only when that check is positive, decode the crop with
       ``detect_barcodes``.
    5. Collect one result entry per image block.

    Parameters
    ----------
    mineru_data:
        Structured MinerU JSON, passed to ``parse_mineru_image_blocks``.
    source_image:
        Image the blocks are cropped from (the caller passes the PNG that
        was sent to MinerU).
    output_dir:
        Directory that receives the crop files; created if missing.

    Returns
    -------
    list of dict
        One entry per image block: the block's own keys plus

        - ``is_qr_code`` : bool, result of the semantic check
        - ``barcodes``   : list of decoded barcode dicts (empty when the
          check was negative or nothing could be decoded)
        - ``crop_path``  : str path of the saved crop, or ``None`` when the
          bounding box was empty after clamping to the image
    """
    from PIL import Image

    blocks = parse_mineru_image_blocks(mineru_data)
    if not blocks:
        return []

    output_dir.mkdir(parents=True, exist_ok=True)
    # Attributes copied, in this order, from every decoded barcode object.
    barcode_keys = ("format", "format_label", "text", "x0", "y0", "x1", "y1", "valid")
    processed: list[dict] = []

    with Image.open(source_image) as page_image:
        page_w, page_h = page_image.size

        for number, block in enumerate(blocks, start=1):
            # ── Crop ──────────────────────────────────────────────────────── #
            # Clamp the block's box to the image bounds.
            left = max(0, int(block["x0_pt"]))
            upper = max(0, int(block["top_pt"]))
            right = min(page_w, int(block["x1_pt"]))
            lower = min(page_h, int(block["bottom_pt"]))

            if right <= left or lower <= upper:
                logger.warning(
                    "_process_image_blocks: block %d 边界框无效 (%d,%d)-(%d,%d),跳过",
                    number, left, upper, right, lower,
                )
                processed.append(
                    {**block, "is_qr_code": False, "barcodes": [], "crop_path": None}
                )
                continue

            region = page_image.crop((left, upper, right, lower))
            crop_file = output_dir / f"block_{number:03d}_p{block['page']}.png"
            region.save(crop_file)
            logger.info(
                "_process_image_blocks: block %d saved crop %s (%dx%d px)",
                number, crop_file.name, right - left, lower - upper,
            )

            # ── Semantic QR check ─────────────────────────────────────────── #
            looks_like_code = is_qr_code(crop_file)

            # ── Barcode decoding (only after a positive semantic check) ───── #
            decoded: list[dict] = []
            if looks_like_code:
                logger.info(
                    "_process_image_blocks: block %d 被识别为二维码,启动条码解码",
                    number,
                )
                decoded = [
                    {key: getattr(hit, key) for key in barcode_keys}
                    for hit in detect_barcodes(crop_file)
                ]
                if decoded:
                    logger.info(
                        "_process_image_blocks: block %d 条码解码成功,共 %d",
                        number, len(decoded),
                    )
                else:
                    logger.warning(
                        "_process_image_blocks: block %d 语义判断为二维码,但 zxing 未能解码",
                        number,
                    )

            processed.append(
                {
                    **block,
                    "is_qr_code": looks_like_code,
                    "barcodes": decoded,
                    "crop_path": str(crop_file),
                }
            )

    return processed
# --------------------------------------------------------------------------- #
# Public API #
# --------------------------------------------------------------------------- #
def process_document(
    ai_path: Path,
    word_path: Path,
    output_dir: Path,
    job_id: str,
) -> dict:
    """Run the whole pipeline: AI → PDF → PNG → Qwen crop → MinerU → validate.

    Steps
    -----
    1.  AI / PDF upload → PDF (``_ai_to_pdf``)
    2.  PDF → PNG at 150 DPI (``_pdf_to_png``)
    3.  PNG → cropped label PNG (``_crop_label_region``; may return the
        full PNG unchanged)
    3b. Barcode scan of the cropped PNG; each hit is saved as a padded crop
        under ``barcode_crops/``
    4.  Cropped PNG → MinerU structured extraction
    5.  MinerU fields → validation against the Word reference text
    5b. MinerU image blocks → QR check and barcode decoding

    Parameters
    ----------
    ai_path:
        Uploaded artwork file.
    word_path:
        Word reference document the extracted fields are validated against.
    output_dir:
        Job working directory; every artefact is written below it.
    job_id:
        Identifier used to build the ``/api/files/{job_id}/...`` URLs.

    Returns
    -------
    dict
        Keys ``preview``, ``fields``, ``word_text``, ``word_html``,
        ``barcodes`` and ``image_blocks``.  ``preview.type`` is ``"png"``
        and ``pageWidthPt`` / ``pageHeightPt`` carry the pixel size of the
        cropped image.  ``fields`` is sorted by validation status (matched,
        unmatched, empty_or_garbled, anything else), then page, top and
        left position.

    Raises
    ------
    RuntimeError
        If ``MINERU_API_KEY`` is not configured (raised after steps 1-3b
        have already run).
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    # ── 1. AI → PDF ──────────────────────────────────────────────────────── #
    logger.info("Step 1/5 Converting AI to PDF: %s", ai_path.name)
    pdf_path = _ai_to_pdf(ai_path, output_dir)

    # ── 2. PDF → PNG ─────────────────────────────────────────────────────── #
    logger.info("Step 2/5 Rasterising PDF to PNG (150 DPI)")
    raster_png = _pdf_to_png(pdf_path, output_dir / "raster", dpi=150)

    # ── 3. Qwen VL crop ───────────────────────────────────────────────────── #
    logger.info("Step 3/5 AI region detection & crop")
    label_png = _crop_label_region(raster_png, output_dir / "crop")
    # Relative URL fragment understood by /api/files/{job_id}/{file_path}
    label_rel = label_png.relative_to(output_dir).as_posix()
    label_w, label_h = _png_size(label_png)

    # ── 3b. Barcode detection ─────────────────────────────────────────────── #
    logger.info("Step 3b Scanning for barcodes / QR codes")
    found_codes = detect_barcodes(label_png)

    # Save a padded crop of every barcode for display in the frontend.
    crops_dir = output_dir / "barcode_crops"
    crops_dir.mkdir(parents=True, exist_ok=True)
    from PIL import Image as _PILImage  # noqa: PLC0415

    with _PILImage.open(label_png) as label_img:
        max_x, max_y = label_img.size
        for position, code in enumerate(found_codes):
            margin = 12  # padding around the detected box, clamped to the image
            box = (
                max(0, code.x0 - margin),
                max(0, code.y0 - margin),
                min(max_x, code.x1 + margin),
                min(max_y, code.y1 + margin),
            )
            label_img.crop(box).save(crops_dir / f"barcode_{position}.png")

    barcode_keys = ("format", "format_label", "text", "x0", "y0", "x1", "y1", "valid")
    barcode_results = [
        {
            **{key: getattr(code, key) for key in barcode_keys},
            "crop_url": f"/api/files/{job_id}/barcode_crops/barcode_{position}.png",
        }
        for position, code in enumerate(found_codes)
    ]
    logger.info("Step 3b Found %d barcode(s)", len(barcode_results))

    # ── 4. MinerU parsing ────────────────────────────────────────────────── #
    logger.info("Step 4/5 Sending cropped PNG to MinerU: %s", label_png.name)
    mineru_api_key = _get_mineru_api_key()
    if not mineru_api_key:
        raise RuntimeError("MINERU_API_KEY is not configured")
    mineru_dir = output_dir / "mineru"
    mineru_data = MineruClient(api_key=mineru_api_key).parse_image(label_png, mineru_dir)

    # ── 5. Parse + validate ───────────────────────────────────────────────── #
    logger.info("Step 5/5 Parsing MinerU result and validating against Word")
    parsed = parse_mineru_fields(mineru_data)
    word_text = extract_word_text(word_path)
    word_html = extract_word_html(word_path)

    fields: list[dict] = []
    for number, field in enumerate(parsed.fields, start=1):
        verdict = validate_field_against_word(field["text"], word_text)
        entry = {"id": f"field-{number}", **field}
        entry["normalized_text"] = verdict.normalized_text
        entry["validation_status"] = verdict.status
        entry["validation_reason"] = verdict.reason
        entry["matched_excerpt"] = verdict.matched_excerpt
        fields.append(entry)

    status_rank = {"matched": 0, "unmatched": 1, "empty_or_garbled": 2}

    def _sort_key(item: dict) -> tuple:
        # Unknown statuses sort after the three known ones.
        return (
            status_rank.get(item["validation_status"], 9),
            item["page"],
            item["top_pt"],
            item["x0_pt"],
        )

    fields.sort(key=_sort_key)

    def _count(status: str) -> int:
        return sum(1 for item in fields if item["validation_status"] == status)

    logger.info(
        "Pipeline done: job_id=%s fields=%d matched=%d unmatched=%d garbled=%d",
        job_id,
        len(fields),
        _count("matched"),
        _count("unmatched"),
        _count("empty_or_garbled"),
    )

    # ── 5b. Image blocks: QR semantic check → barcode decode ─────────────── #
    image_block_results = _process_image_blocks(
        mineru_data=mineru_data,
        source_image=label_png,
        output_dir=output_dir / "image_blocks",
    )
    logger.info("Step 5b Processed %d image block(s) from MinerU", len(image_block_results))

    preview = {
        # type='png': frontend renders <img> + overlay (not PDF canvas)
        "type": "png",
        "url": f"/api/files/{job_id}/{label_rel}",
        # For PNG the "pt" fields carry pixel dimensions so overlay
        # scale factors remain 1:1 at 100% zoom.
        "pageWidthPt": label_w,
        "pageHeightPt": label_h,
    }
    return {
        "preview": preview,
        "fields": fields,
        "word_text": word_text,
        "word_html": word_html,
        "barcodes": barcode_results,
        "image_blocks": image_block_results,
    }