Initial commit: 包装审核 POC、Docker 与前后端

Made-with: Cursor
2026-04-15 17:18:49 +08:00
commit bbb4dd43b3
74 changed files with 297415 additions and 0 deletions
--- a/backend/app/pipeline.py
+++ b/backend/app/pipeline.py
@@ -0,0 +1,507 @@
+"""Core processing pipeline: AI → PDF → PNG → Qwen crop → MinerU → validate."""
+from __future__ import annotations
+
+import logging
+import os
+import shutil
+import subprocess
+from pathlib import Path
+
+from backend.app.barcode_detector import detect_barcodes
+from backend.app.image_classifier import is_qr_code
+from backend.app.mineru_client import MineruClient, MineruClientError
+from backend.app.mineru_parser import parse_mineru_fields, parse_mineru_image_blocks
+from backend.app.text_validation import validate_field_against_word
+from backend.app.word_parser import extract_word_html, extract_word_text
+
+logger = logging.getLogger(__name__)
+
+# --------------------------------------------------------------------------- #
+# Environment helpers                                                           #
+# --------------------------------------------------------------------------- #
+
+def _get_mineru_api_key() -> str:
+    """Read MINERU_API_KEY from the process environment or the project .env file."""
+    value = os.environ.get("MINERU_API_KEY", "").strip()
+    if value:
+        return value
+
+    for candidate in (
+        Path(__file__).resolve().parents[2] / ".env",
+        Path(__file__).resolve().parents[3] / ".env",
+    ):
+        if not candidate.exists():
+            continue
+        for raw in candidate.read_text(encoding="utf-8").splitlines():
+            line = raw.strip()
+            if not line or line.startswith("#") or "=" not in line:
+                continue
+            key, val = line.split("=", 1)
+            if key.strip() == "MINERU_API_KEY":
+                cleaned = val.strip().strip('"').strip("'")
+                if cleaned:
+                    logger.info("Loaded MINERU_API_KEY from %s", candidate)
+                    return cleaned
+    return ""
+
+
+# --------------------------------------------------------------------------- #
+# AI → PDF conversion                                                           #
+# --------------------------------------------------------------------------- #
+
+def _ai_to_pdf(ai_path: Path, output_dir: Path) -> Path:
+    """Convert an Adobe Illustrator file to PDF, keeping the original filename stem.
+
+    Modern .ai files (CS and later) are internally PDF-based; pypdf can copy
+    them directly.  Legacy EPS-based .ai files require Ghostscript.
+    If the uploaded file is already a PDF it is copied as-is.
+    """
+    output_dir.mkdir(parents=True, exist_ok=True)
+    pdf_path = output_dir / f"{ai_path.stem}.pdf"
+
+    with ai_path.open("rb") as fh:
+        header = fh.read(8)
+
+    if header.startswith(b"%PDF-"):
+        # PDF-based .ai or an actual PDF – re-write with pypdf for cleanliness
+        try:
+            from pypdf import PdfReader, PdfWriter
+
+            reader = PdfReader(str(ai_path))
+            writer = PdfWriter()
+            for page in reader.pages:
+                writer.add_page(page)
+            with pdf_path.open("wb") as fh:
+                writer.write(fh)
+            logger.info("Converted PDF-based .ai via pypdf: %s", ai_path.name)
+        except Exception as exc:
+            logger.warning("pypdf failed (%s), falling back to direct copy", exc)
+            shutil.copy2(ai_path, pdf_path)
+    else:
+        # Legacy EPS-based .ai → Ghostscript
+        gs = shutil.which("/opt/homebrew/bin/gs") or shutil.which("gs") or shutil.which("ghostscript")
+        if gs is None:
+            raise RuntimeError(
+                "Cannot convert legacy .ai file: Ghostscript is not installed. "
+                "Run:  brew install ghostscript"
+            )
+        import subprocess
+
+        result = subprocess.run(
+            [gs, "-dNOPAUSE", "-dBATCH", "-dSAFER",
+             "-sDEVICE=pdfwrite", f"-sOutputFile={pdf_path}", str(ai_path)],
+            capture_output=True, text=True, timeout=120,
+        )
+        if result.returncode != 0:
+            raise RuntimeError(
+                f"Ghostscript failed (exit {result.returncode}):\n{result.stderr.strip()}"
+            )
+        logger.info("Converted legacy .ai via Ghostscript: %s", ai_path.name)
+
+    return pdf_path
+
+
+# --------------------------------------------------------------------------- #
+# PDF → PNG rasterisation                                                       #
+# --------------------------------------------------------------------------- #
+
+def _pdf_to_png(pdf_path: Path, output_dir: Path, dpi: int = 150) -> Path:
+    """Rasterise the first page of a PDF to a PNG.
+
+    Tries, in order:
+    1. Ghostscript (if installed)
+    2. PyMuPDF  (pip install pymupdf)
+
+    Uses a safe output filename ``page1.png`` to avoid issues with special
+    characters in the source PDF name.
+    Returns the path of the generated PNG.
+    """
+    output_dir.mkdir(parents=True, exist_ok=True)
+    # Use a safe filename – special chars / spaces in the PDF stem can cause
+    # Ghostscript to silently produce no output.
+    png_path = output_dir / "page1.png"
+
+    # ── 1. Ghostscript ────────────────────────────────────────────────────── #
+    gs = (
+        shutil.which("/opt/homebrew/bin/gs")
+        or shutil.which("/usr/local/bin/gs")
+        or shutil.which("ghostscript")
+    )
+    if gs:
+        result = subprocess.run(
+            [
+                gs, "-dNOPAUSE", "-dBATCH", "-dSAFER",
+                "-sDEVICE=png16m", f"-r{dpi}",
+                "-dFirstPage=1", "-dLastPage=1",
+                f"-sOutputFile={png_path}", str(pdf_path),
+            ],
+            capture_output=True, text=True, timeout=60,
+        )
+        if result.returncode == 0 and png_path.exists():
+            w, h = _png_size(png_path)
+            logger.info(
+                "Rasterised PDF → PNG via Ghostscript at %d DPI: %dx%d px (%d KB)",
+                dpi, w, h, png_path.stat().st_size // 1024,
+            )
+            return png_path
+        logger.warning("Ghostscript rasterisation failed (exit %d): %s",
+                       result.returncode, result.stderr[:300])
+
+    # ── 2. PyMuPDF fallback ───────────────────────────────────────────────── #
+    try:
+        import fitz  # PyMuPDF
+
+        doc = fitz.open(str(pdf_path))
+        page = doc[0]
+        zoom = dpi / 72.0
+        mat = fitz.Matrix(zoom, zoom)
+        pix = page.get_pixmap(matrix=mat, alpha=False)
+        pix.save(str(png_path))
+        doc.close()
+        w, h = _png_size(png_path)
+        logger.info(
+            "Rasterised PDF → PNG via PyMuPDF at %d DPI: %dx%d px (%d KB)",
+            dpi, w, h, png_path.stat().st_size // 1024,
+        )
+        return png_path
+    except ImportError:
+        raise RuntimeError(
+            "Cannot rasterise PDF to PNG: neither Ghostscript nor PyMuPDF is "
+            "available.  Run:  pip install pymupdf  OR  brew install ghostscript"
+        )
+    except Exception as exc:
+        raise RuntimeError(f"Cannot rasterise PDF to PNG: {exc}") from exc
+
+
+def _png_size(png_path: Path) -> tuple[int, int]:
+    """Return (width, height) in pixels of a PNG file."""
+    from PIL import Image
+    with Image.open(png_path) as img:
+        return img.size  # (width, height)
+
+
+# --------------------------------------------------------------------------- #
+# Qwen VL region crop                                                           #
+# --------------------------------------------------------------------------- #
+
+def _crop_label_region(png_path: Path, output_dir: Path) -> Path:
+    """Detect the main label area with Qwen VL and crop to it.
+
+    If DASHSCOPE_API_KEY is missing or detection fails, returns the original
+    PNG unchanged so the pipeline continues without interruption.
+    """
+    from backend.app.region_detector import (
+        _get_api_key,
+        crop_and_save,
+        detect_regions,
+        merge_regions,
+    )
+
+    api_key = _get_api_key()
+    if not api_key:
+        logger.info("DASHSCOPE_API_KEY not configured – skipping AI crop, using full image")
+        return png_path
+
+    try:
+        regions, _ = detect_regions(png_path, api_key=api_key, api_max_side=1024)
+    except Exception as exc:
+        logger.warning("Qwen region detection failed (%s) – using full image", exc)
+        return png_path
+
+    if not regions:
+        logger.warning("No regions detected by Qwen – using full image")
+        return png_path
+
+    merged = merge_regions(regions)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    cropped_png = output_dir / "cropped_label.png"
+
+    # crop_and_save writes to numbered files; rename for predictability
+    results = crop_and_save(png_path, [merged], output_dir / "_tmp")
+    if not results:
+        return png_path
+
+    import shutil as _sh
+    _sh.move(results[0]["path"], str(cropped_png))
+
+    w, h = _png_size(cropped_png)
+    logger.info(
+        "Qwen crop: bbox=(%d,%d)-(%d,%d) → %s (%dx%d px)",
+        merged.x1, merged.y1, merged.x2, merged.y2,
+        cropped_png.name, w, h,
+    )
+    return cropped_png
+
+
+# --------------------------------------------------------------------------- #
+# MinerU image-block QR processing                                              #
+# --------------------------------------------------------------------------- #
+
+def _process_image_blocks(
+    mineru_data: dict,
+    source_image: Path,
+    output_dir: Path,
+) -> list[dict]:
+    """对 MinerU 解析出的每个 image 类型 block 执行二维码识别流程。
+
+    流程
+    ----
+    1. 从 mineru_data 中提取所有 image block（含 bbox 坐标）。
+    2. 按 bbox 从 source_image（高清裁剪图）中裁出对应区域，保存为临时 PNG。
+    3. 调用 Qwen VL 判断裁出的图片是否为二维码/条形码。
+    4. 如果判断为"是"，再调用 zxing 条码模块进行精确解码。
+    5. 返回每个 image block 的处理结果列表。
+
+    Parameters
+    ----------
+    mineru_data:
+        MinerU 结构化 JSON（包含 pdf_info）。
+    source_image:
+        用于裁剪的高清源图（即发送给 MinerU 的那张 PNG）。
+    output_dir:
+        裁剪图临时存放目录。
+
+    Returns
+    -------
+    list of dict
+        每项对应一个 image block，包含：
+        - page, block_type, x0_pt, top_pt, x1_pt, bottom_pt
+        - is_qr_code     : bool  — 大模型语义判断结果
+        - barcodes       : list  — zxing 解码结果（is_qr_code=False 时为空列表）
+        - crop_path      : str   — 裁剪图相对路径（调试用）
+    """
+    from PIL import Image
+
+    image_blocks = parse_mineru_image_blocks(mineru_data)
+    if not image_blocks:
+        return []
+
+    output_dir.mkdir(parents=True, exist_ok=True)
+    results: list[dict] = []
+
+    with Image.open(source_image) as src_img:
+        img_w, img_h = src_img.size
+
+        for idx, block in enumerate(image_blocks, start=1):
+            # ── 裁剪 ──────────────────────────────────────────────────────── #
+            x0 = max(0, int(block["x0_pt"]))
+            y0 = max(0, int(block["top_pt"]))
+            x1 = min(img_w, int(block["x1_pt"]))
+            y1 = min(img_h, int(block["bottom_pt"]))
+
+            if x1 <= x0 or y1 <= y0:
+                logger.warning(
+                    "_process_image_blocks: block %d 边界框无效 (%d,%d)-(%d,%d)，跳过",
+                    idx, x0, y0, x1, y1,
+                )
+                results.append({**block, "is_qr_code": False, "barcodes": [], "crop_path": None})
+                continue
+
+            crop = src_img.crop((x0, y0, x1, y1))
+            crop_file = output_dir / f"block_{idx:03d}_p{block['page']}.png"
+            crop.save(crop_file)
+            logger.info(
+                "_process_image_blocks: block %d saved crop %s (%dx%d px)",
+                idx, crop_file.name, x1 - x0, y1 - y0,
+            )
+
+            # ── Qwen VL 语义判断 ──────────────────────────────────────────── #
+            qr_detected = is_qr_code(crop_file)
+
+            # ── 条码解码（仅在语义判断为二维码时执行）────────────────────── #
+            barcodes: list[dict] = []
+            if qr_detected:
+                logger.info(
+                    "_process_image_blocks: block %d 被识别为二维码，启动条码解码",
+                    idx,
+                )
+                raw_barcodes = detect_barcodes(crop_file)
+                barcodes = [
+                    {
+                        "format": b.format,
+                        "format_label": b.format_label,
+                        "text": b.text,
+                        "x0": b.x0,
+                        "y0": b.y0,
+                        "x1": b.x1,
+                        "y1": b.y1,
+                        "valid": b.valid,
+                    }
+                    for b in raw_barcodes
+                ]
+                if barcodes:
+                    logger.info(
+                        "_process_image_blocks: block %d 条码解码成功，共 %d 条",
+                        idx, len(barcodes),
+                    )
+                else:
+                    logger.warning(
+                        "_process_image_blocks: block %d 语义判断为二维码，但 zxing 未能解码",
+                        idx,
+                    )
+
+            results.append(
+                {
+                    **block,
+                    "is_qr_code": qr_detected,
+                    "barcodes": barcodes,
+                    "crop_path": str(crop_file),
+                }
+            )
+
+    return results
+
+
+# --------------------------------------------------------------------------- #
+# Public API                                                                    #
+# --------------------------------------------------------------------------- #
+
+def process_document(
+    ai_path: Path,
+    word_path: Path,
+    output_dir: Path,
+    job_id: str,
+) -> dict:
+    """Full pipeline: AI → PDF → PNG → Qwen crop → MinerU → validate.
+
+    Steps
+    -----
+    1. AI / PDF file  → clean PDF
+    2. PDF            → high-res PNG (Ghostscript, 150 DPI)
+    3. PNG            → Qwen VL detects main label area → cropped PNG
+                        (graceful fallback to full PNG when key is absent)
+    4. Cropped PNG    → MinerU structured-JSON extraction
+    5. MinerU fields  → validate against Word reference document
+
+    Returns
+    -------
+    dict
+        ``{ preview: {...}, fields: [...] }`` matching the frontend
+        ``ProcessResponse`` type.  ``preview.type`` is ``"png"`` and
+        ``pageWidthPt`` / ``pageHeightPt`` hold the cropped image dimensions
+        in pixels (coord system is pixel-aligned for the PNG overlay).
+    """
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    # ── 1. AI → PDF ──────────────────────────────────────────────────────── #
+    logger.info("Step 1/5 – Converting AI to PDF: %s", ai_path.name)
+    pdf_path = _ai_to_pdf(ai_path, output_dir)
+
+    # ── 2. PDF → PNG ─────────────────────────────────────────────────────── #
+    logger.info("Step 2/5 – Rasterising PDF to PNG (150 DPI)")
+    png_path = _pdf_to_png(pdf_path, output_dir / "raster", dpi=150)
+
+    # ── 3. Qwen VL crop ───────────────────────────────────────────────────── #
+    logger.info("Step 3/5 – AI region detection & crop")
+    cropped_path = _crop_label_region(png_path, output_dir / "crop")
+
+    # Relative URL fragment understood by /api/files/{job_id}/{file_path}
+    cropped_rel = cropped_path.relative_to(output_dir).as_posix()
+    img_w, img_h = _png_size(cropped_path)
+
+    # ── 3b. Barcode detection ─────────────────────────────────────────────── #
+    logger.info("Step 3b – Scanning for barcodes / QR codes")
+    barcodes = detect_barcodes(cropped_path)
+
+    # Crop each barcode region for frontend display
+    barcode_crops_dir = output_dir / "barcode_crops"
+    barcode_crops_dir.mkdir(parents=True, exist_ok=True)
+    from PIL import Image as _PILImage  # noqa: PLC0415
+    with _PILImage.open(cropped_path) as _src_img:
+        _src_w, _src_h = _src_img.size
+        for _bi, _b in enumerate(barcodes):
+            _pad = 12
+            _cx0 = max(0, _b.x0 - _pad)
+            _cy0 = max(0, _b.y0 - _pad)
+            _cx1 = min(_src_w, _b.x1 + _pad)
+            _cy1 = min(_src_h, _b.y1 + _pad)
+            _crop = _src_img.crop((_cx0, _cy0, _cx1, _cy1))
+            _crop.save(barcode_crops_dir / f"barcode_{_bi}.png")
+
+    barcode_results = [
+        {
+            "format": b.format,
+            "format_label": b.format_label,
+            "text": b.text,
+            "x0": b.x0,
+            "y0": b.y0,
+            "x1": b.x1,
+            "y1": b.y1,
+            "valid": b.valid,
+            "crop_url": f"/api/files/{job_id}/barcode_crops/barcode_{i}.png",
+        }
+        for i, b in enumerate(barcodes)
+    ]
+    logger.info("Step 3b – Found %d barcode(s)", len(barcode_results))
+
+    # ── 4. MinerU parsing ────────────────────────────────────────────────── #
+    logger.info("Step 4/5 – Sending cropped PNG to MinerU: %s", cropped_path.name)
+    mineru_api_key = _get_mineru_api_key()
+    if not mineru_api_key:
+        raise RuntimeError("MINERU_API_KEY is not configured")
+
+    mineru_dir = output_dir / "mineru"
+    client = MineruClient(api_key=mineru_api_key)
+    mineru_data = client.parse_image(cropped_path, mineru_dir)
+
+    # ── 5. Parse + validate ───────────────────────────────────────────────── #
+    logger.info("Step 5/5 – Parsing MinerU result and validating against Word")
+    doc = parse_mineru_fields(mineru_data)
+    word_text = extract_word_text(word_path)
+    word_html = extract_word_html(word_path)
+
+    fields: list[dict] = []
+    for idx, field in enumerate(doc.fields, start=1):
+        validation = validate_field_against_word(field["text"], word_text)
+        fields.append(
+            {
+                "id": f"field-{idx}",
+                **field,
+                "normalized_text": validation.normalized_text,
+                "validation_status": validation.status,
+                "validation_reason": validation.reason,
+                "matched_excerpt": validation.matched_excerpt,
+            }
+        )
+
+    _STATUS_RANK = {"matched": 0, "unmatched": 1, "empty_or_garbled": 2}
+    fields.sort(key=lambda f: (
+        _STATUS_RANK.get(f["validation_status"], 9),
+        f["page"],
+        f["top_pt"],
+        f["x0_pt"],
+    ))
+
+    logger.info(
+        "Pipeline done: job_id=%s fields=%d matched=%d unmatched=%d garbled=%d",
+        job_id,
+        len(fields),
+        sum(1 for f in fields if f["validation_status"] == "matched"),
+        sum(1 for f in fields if f["validation_status"] == "unmatched"),
+        sum(1 for f in fields if f["validation_status"] == "empty_or_garbled"),
+    )
+
+    # ── 5b. Image blocks: QR semantic check → barcode decode ─────────────── #
+    image_block_results = _process_image_blocks(
+        mineru_data=mineru_data,
+        source_image=cropped_path,
+        output_dir=output_dir / "image_blocks",
+    )
+    logger.info("Step 5b – Processed %d image block(s) from MinerU", len(image_block_results))
+
+    return {
+        "preview": {
+            # type='png': frontend renders <img> + overlay (not PDF canvas)
+            "type": "png",
+            "url": f"/api/files/{job_id}/{cropped_rel}",
+            # For PNG the "pt" fields carry pixel dimensions so overlay
+            # scale factors remain 1:1 at 100% zoom.
+            "pageWidthPt": img_w,
+            "pageHeightPt": img_h,
+        },
+        "fields": fields,
+        "word_text": word_text,
+        "word_html": word_html,
+        "barcodes": barcode_results,
+        "image_blocks": image_block_results,
+    }