"""Core processing pipeline: AI → PDF → PNG → Qwen crop → MinerU → validate.""" from __future__ import annotations import logging import os import shutil import subprocess from pathlib import Path from backend.app.barcode_detector import detect_barcodes from backend.app.image_classifier import is_qr_code from backend.app.mineru_client import MineruClient, MineruClientError from backend.app.mineru_parser import parse_mineru_fields, parse_mineru_image_blocks from backend.app.text_validation import validate_field_against_word from backend.app.word_parser import extract_word_html, extract_word_text logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # # Environment helpers # # --------------------------------------------------------------------------- # def _get_mineru_api_key() -> str: """Read MINERU_API_KEY from the process environment or the project .env file.""" value = os.environ.get("MINERU_API_KEY", "").strip() if value: return value for candidate in ( Path(__file__).resolve().parents[2] / ".env", Path(__file__).resolve().parents[3] / ".env", ): if not candidate.exists(): continue for raw in candidate.read_text(encoding="utf-8").splitlines(): line = raw.strip() if not line or line.startswith("#") or "=" not in line: continue key, val = line.split("=", 1) if key.strip() == "MINERU_API_KEY": cleaned = val.strip().strip('"').strip("'") if cleaned: logger.info("Loaded MINERU_API_KEY from %s", candidate) return cleaned return "" # --------------------------------------------------------------------------- # # AI → PDF conversion # # --------------------------------------------------------------------------- # def _ai_to_pdf(ai_path: Path, output_dir: Path) -> Path: """Convert an Adobe Illustrator file to PDF, keeping the original filename stem. Modern .ai files (CS and later) are internally PDF-based; pypdf can copy them directly. Legacy EPS-based .ai files require Ghostscript. If the uploaded file is already a PDF it is copied as-is. """ output_dir.mkdir(parents=True, exist_ok=True) pdf_path = output_dir / f"{ai_path.stem}.pdf" with ai_path.open("rb") as fh: header = fh.read(8) if header.startswith(b"%PDF-"): # PDF-based .ai or an actual PDF – re-write with pypdf for cleanliness try: from pypdf import PdfReader, PdfWriter reader = PdfReader(str(ai_path)) writer = PdfWriter() for page in reader.pages: writer.add_page(page) with pdf_path.open("wb") as fh: writer.write(fh) logger.info("Converted PDF-based .ai via pypdf: %s", ai_path.name) except Exception as exc: logger.warning("pypdf failed (%s), falling back to direct copy", exc) shutil.copy2(ai_path, pdf_path) else: # Legacy EPS-based .ai → Ghostscript gs = shutil.which("/opt/homebrew/bin/gs") or shutil.which("gs") or shutil.which("ghostscript") if gs is None: raise RuntimeError( "Cannot convert legacy .ai file: Ghostscript is not installed. " "Run: brew install ghostscript" ) import subprocess result = subprocess.run( [gs, "-dNOPAUSE", "-dBATCH", "-dSAFER", "-sDEVICE=pdfwrite", f"-sOutputFile={pdf_path}", str(ai_path)], capture_output=True, text=True, timeout=120, ) if result.returncode != 0: raise RuntimeError( f"Ghostscript failed (exit {result.returncode}):\n{result.stderr.strip()}" ) logger.info("Converted legacy .ai via Ghostscript: %s", ai_path.name) return pdf_path # --------------------------------------------------------------------------- # # PDF → PNG rasterisation # # --------------------------------------------------------------------------- # def _pdf_to_png(pdf_path: Path, output_dir: Path, dpi: int = 150) -> Path: """Rasterise the first page of a PDF to a PNG. Tries, in order: 1. Ghostscript (if installed) 2. PyMuPDF (pip install pymupdf) Uses a safe output filename ``page1.png`` to avoid issues with special characters in the source PDF name. Returns the path of the generated PNG. """ output_dir.mkdir(parents=True, exist_ok=True) # Use a safe filename – special chars / spaces in the PDF stem can cause # Ghostscript to silently produce no output. png_path = output_dir / "page1.png" # ── 1. Ghostscript ────────────────────────────────────────────────────── # gs = ( shutil.which("/opt/homebrew/bin/gs") or shutil.which("/usr/local/bin/gs") or shutil.which("ghostscript") ) if gs: result = subprocess.run( [ gs, "-dNOPAUSE", "-dBATCH", "-dSAFER", "-sDEVICE=png16m", f"-r{dpi}", "-dFirstPage=1", "-dLastPage=1", f"-sOutputFile={png_path}", str(pdf_path), ], capture_output=True, text=True, timeout=60, ) if result.returncode == 0 and png_path.exists(): w, h = _png_size(png_path) logger.info( "Rasterised PDF → PNG via Ghostscript at %d DPI: %dx%d px (%d KB)", dpi, w, h, png_path.stat().st_size // 1024, ) return png_path logger.warning("Ghostscript rasterisation failed (exit %d): %s", result.returncode, result.stderr[:300]) # ── 2. PyMuPDF fallback ───────────────────────────────────────────────── # try: import fitz # PyMuPDF doc = fitz.open(str(pdf_path)) page = doc[0] zoom = dpi / 72.0 mat = fitz.Matrix(zoom, zoom) pix = page.get_pixmap(matrix=mat, alpha=False) pix.save(str(png_path)) doc.close() w, h = _png_size(png_path) logger.info( "Rasterised PDF → PNG via PyMuPDF at %d DPI: %dx%d px (%d KB)", dpi, w, h, png_path.stat().st_size // 1024, ) return png_path except ImportError: raise RuntimeError( "Cannot rasterise PDF to PNG: neither Ghostscript nor PyMuPDF is " "available. Run: pip install pymupdf OR brew install ghostscript" ) except Exception as exc: raise RuntimeError(f"Cannot rasterise PDF to PNG: {exc}") from exc def _png_size(png_path: Path) -> tuple[int, int]: """Return (width, height) in pixels of a PNG file.""" from PIL import Image with Image.open(png_path) as img: return img.size # (width, height) # --------------------------------------------------------------------------- # # Qwen VL region crop # # --------------------------------------------------------------------------- # def _crop_label_region(png_path: Path, output_dir: Path) -> Path: """Detect the main label area with Qwen VL and crop to it. If DASHSCOPE_API_KEY is missing or detection fails, returns the original PNG unchanged so the pipeline continues without interruption. """ from backend.app.region_detector import ( _get_api_key, crop_and_save, detect_regions, merge_regions, ) api_key = _get_api_key() if not api_key: logger.info("DASHSCOPE_API_KEY not configured – skipping AI crop, using full image") return png_path try: regions, _ = detect_regions(png_path, api_key=api_key, api_max_side=1024) except Exception as exc: logger.warning("Qwen region detection failed (%s) – using full image", exc) return png_path if not regions: logger.warning("No regions detected by Qwen – using full image") return png_path merged = merge_regions(regions) output_dir.mkdir(parents=True, exist_ok=True) cropped_png = output_dir / "cropped_label.png" # crop_and_save writes to numbered files; rename for predictability results = crop_and_save(png_path, [merged], output_dir / "_tmp") if not results: return png_path import shutil as _sh _sh.move(results[0]["path"], str(cropped_png)) w, h = _png_size(cropped_png) logger.info( "Qwen crop: bbox=(%d,%d)-(%d,%d) → %s (%dx%d px)", merged.x1, merged.y1, merged.x2, merged.y2, cropped_png.name, w, h, ) return cropped_png # --------------------------------------------------------------------------- # # MinerU image-block QR processing # # --------------------------------------------------------------------------- # def _process_image_blocks( mineru_data: dict, source_image: Path, output_dir: Path, ) -> list[dict]: """对 MinerU 解析出的每个 image 类型 block 执行二维码识别流程。 流程 ---- 1. 从 mineru_data 中提取所有 image block(含 bbox 坐标)。 2. 按 bbox 从 source_image(高清裁剪图)中裁出对应区域,保存为临时 PNG。 3. 调用 Qwen VL 判断裁出的图片是否为二维码/条形码。 4. 如果判断为"是",再调用 zxing 条码模块进行精确解码。 5. 返回每个 image block 的处理结果列表。 Parameters ---------- mineru_data: MinerU 结构化 JSON(包含 pdf_info)。 source_image: 用于裁剪的高清源图(即发送给 MinerU 的那张 PNG)。 output_dir: 裁剪图临时存放目录。 Returns ------- list of dict 每项对应一个 image block,包含: - page, block_type, x0_pt, top_pt, x1_pt, bottom_pt - is_qr_code : bool — 大模型语义判断结果 - barcodes : list — zxing 解码结果(is_qr_code=False 时为空列表) - crop_path : str — 裁剪图相对路径(调试用) """ from PIL import Image image_blocks = parse_mineru_image_blocks(mineru_data) if not image_blocks: return [] output_dir.mkdir(parents=True, exist_ok=True) results: list[dict] = [] with Image.open(source_image) as src_img: img_w, img_h = src_img.size for idx, block in enumerate(image_blocks, start=1): # ── 裁剪 ──────────────────────────────────────────────────────── # x0 = max(0, int(block["x0_pt"])) y0 = max(0, int(block["top_pt"])) x1 = min(img_w, int(block["x1_pt"])) y1 = min(img_h, int(block["bottom_pt"])) if x1 <= x0 or y1 <= y0: logger.warning( "_process_image_blocks: block %d 边界框无效 (%d,%d)-(%d,%d),跳过", idx, x0, y0, x1, y1, ) results.append({**block, "is_qr_code": False, "barcodes": [], "crop_path": None}) continue crop = src_img.crop((x0, y0, x1, y1)) crop_file = output_dir / f"block_{idx:03d}_p{block['page']}.png" crop.save(crop_file) logger.info( "_process_image_blocks: block %d saved crop %s (%dx%d px)", idx, crop_file.name, x1 - x0, y1 - y0, ) # ── Qwen VL 语义判断 ──────────────────────────────────────────── # qr_detected = is_qr_code(crop_file) # ── 条码解码(仅在语义判断为二维码时执行)────────────────────── # barcodes: list[dict] = [] if qr_detected: logger.info( "_process_image_blocks: block %d 被识别为二维码,启动条码解码", idx, ) raw_barcodes = detect_barcodes(crop_file) barcodes = [ { "format": b.format, "format_label": b.format_label, "text": b.text, "x0": b.x0, "y0": b.y0, "x1": b.x1, "y1": b.y1, "valid": b.valid, } for b in raw_barcodes ] if barcodes: logger.info( "_process_image_blocks: block %d 条码解码成功,共 %d 条", idx, len(barcodes), ) else: logger.warning( "_process_image_blocks: block %d 语义判断为二维码,但 zxing 未能解码", idx, ) results.append( { **block, "is_qr_code": qr_detected, "barcodes": barcodes, "crop_path": str(crop_file), } ) return results # --------------------------------------------------------------------------- # # Public API # # --------------------------------------------------------------------------- # def process_document( ai_path: Path, word_path: Path, output_dir: Path, job_id: str, ) -> dict: """Full pipeline: AI → PDF → PNG → Qwen crop → MinerU → validate. Steps ----- 1. AI / PDF file → clean PDF 2. PDF → high-res PNG (Ghostscript, 150 DPI) 3. PNG → Qwen VL detects main label area → cropped PNG (graceful fallback to full PNG when key is absent) 4. Cropped PNG → MinerU structured-JSON extraction 5. MinerU fields → validate against Word reference document Returns ------- dict ``{ preview: {...}, fields: [...] }`` matching the frontend ``ProcessResponse`` type. ``preview.type`` is ``"png"`` and ``pageWidthPt`` / ``pageHeightPt`` hold the cropped image dimensions in pixels (coord system is pixel-aligned for the PNG overlay). """ output_dir.mkdir(parents=True, exist_ok=True) # ── 1. AI → PDF ──────────────────────────────────────────────────────── # logger.info("Step 1/5 – Converting AI to PDF: %s", ai_path.name) pdf_path = _ai_to_pdf(ai_path, output_dir) # ── 2. PDF → PNG ─────────────────────────────────────────────────────── # logger.info("Step 2/5 – Rasterising PDF to PNG (150 DPI)") png_path = _pdf_to_png(pdf_path, output_dir / "raster", dpi=150) # ── 3. Qwen VL crop ───────────────────────────────────────────────────── # logger.info("Step 3/5 – AI region detection & crop") cropped_path = _crop_label_region(png_path, output_dir / "crop") # Relative URL fragment understood by /api/files/{job_id}/{file_path} cropped_rel = cropped_path.relative_to(output_dir).as_posix() img_w, img_h = _png_size(cropped_path) # ── 3b. Barcode detection ─────────────────────────────────────────────── # logger.info("Step 3b – Scanning for barcodes / QR codes") barcodes = detect_barcodes(cropped_path) # Crop each barcode region for frontend display barcode_crops_dir = output_dir / "barcode_crops" barcode_crops_dir.mkdir(parents=True, exist_ok=True) from PIL import Image as _PILImage # noqa: PLC0415 with _PILImage.open(cropped_path) as _src_img: _src_w, _src_h = _src_img.size for _bi, _b in enumerate(barcodes): _pad = 12 _cx0 = max(0, _b.x0 - _pad) _cy0 = max(0, _b.y0 - _pad) _cx1 = min(_src_w, _b.x1 + _pad) _cy1 = min(_src_h, _b.y1 + _pad) _crop = _src_img.crop((_cx0, _cy0, _cx1, _cy1)) _crop.save(barcode_crops_dir / f"barcode_{_bi}.png") barcode_results = [ { "format": b.format, "format_label": b.format_label, "text": b.text, "x0": b.x0, "y0": b.y0, "x1": b.x1, "y1": b.y1, "valid": b.valid, "crop_url": f"/api/files/{job_id}/barcode_crops/barcode_{i}.png", } for i, b in enumerate(barcodes) ] logger.info("Step 3b – Found %d barcode(s)", len(barcode_results)) # ── 4. MinerU parsing ────────────────────────────────────────────────── # logger.info("Step 4/5 – Sending cropped PNG to MinerU: %s", cropped_path.name) mineru_api_key = _get_mineru_api_key() if not mineru_api_key: raise RuntimeError("MINERU_API_KEY is not configured") mineru_dir = output_dir / "mineru" client = MineruClient(api_key=mineru_api_key) mineru_data = client.parse_image(cropped_path, mineru_dir) # ── 5. Parse + validate ───────────────────────────────────────────────── # logger.info("Step 5/5 – Parsing MinerU result and validating against Word") doc = parse_mineru_fields(mineru_data) word_text = extract_word_text(word_path) word_html = extract_word_html(word_path) fields: list[dict] = [] for idx, field in enumerate(doc.fields, start=1): validation = validate_field_against_word(field["text"], word_text) fields.append( { "id": f"field-{idx}", **field, "normalized_text": validation.normalized_text, "validation_status": validation.status, "validation_reason": validation.reason, "matched_excerpt": validation.matched_excerpt, } ) _STATUS_RANK = {"matched": 0, "unmatched": 1, "empty_or_garbled": 2} fields.sort(key=lambda f: ( _STATUS_RANK.get(f["validation_status"], 9), f["page"], f["top_pt"], f["x0_pt"], )) logger.info( "Pipeline done: job_id=%s fields=%d matched=%d unmatched=%d garbled=%d", job_id, len(fields), sum(1 for f in fields if f["validation_status"] == "matched"), sum(1 for f in fields if f["validation_status"] == "unmatched"), sum(1 for f in fields if f["validation_status"] == "empty_or_garbled"), ) # ── 5b. Image blocks: QR semantic check → barcode decode ─────────────── # image_block_results = _process_image_blocks( mineru_data=mineru_data, source_image=cropped_path, output_dir=output_dir / "image_blocks", ) logger.info("Step 5b – Processed %d image block(s) from MinerU", len(image_block_results)) return { "preview": { # type='png': frontend renders + overlay (not PDF canvas) "type": "png", "url": f"/api/files/{job_id}/{cropped_rel}", # For PNG the "pt" fields carry pixel dimensions so overlay # scale factors remain 1:1 at 100% zoom. "pageWidthPt": img_w, "pageHeightPt": img_h, }, "fields": fields, "word_text": word_text, "word_html": word_html, "barcodes": barcode_results, "image_blocks": image_block_results, }