# ZLD_POC/backend/app/image_classifier.py

"""用 Qwen VL 对图片内容做语义分类，判断是否为二维码/条码。

调用方式
--------
::

    from backend.app.image_classifier import is_qr_code

    result = is_qr_code(Path("crop.png"), api_key="sk-...")
    if result:
        # 再交给条码识别模块处理
        ...

设计原则
--------
* 只做"是/否"的单一判断，不解码内容（解码交给 barcode_detector）。
* 复用 region_detector 中已有的 API key / base_url 读取逻辑。
* 网络或模型调用失败时返回 False，保证 pipeline 可降级运行。
"""
from __future__ import annotations

import base64
import io
import logging
from pathlib import Path

logger = logging.getLogger(__name__)

# Lightweight 7B vision model: fast and inexpensive.
_DEFAULT_MODEL = "qwen2.5-vl-7b-instruct"

# Prompt sent alongside the image. The three lines are joined with newlines
# and ask the model for a bare yes/no answer and nothing else.
_CLASSIFY_PROMPT = "\n".join(
    (
        "请仔细观察这张图片。",
        "问题：图片中是否包含二维码（QR Code）或任何类型的条形码？",
        '请只回答"是"或"否"，不要输出其他任何内容。',
    )
)


def _encode_image(image_path: Path, max_side: int = 512) -> str:
    """Load an image, downscale it if needed, and return it as base64 PNG text.

    Images whose longer side is at most ``max_side`` keep their original
    size; larger ones are shrunk proportionally (LANCZOS resampling) so the
    longer side becomes ``max_side``, which reduces the payload sent to the
    model.

    Images carrying transparency are composited onto a white background
    before being reduced to RGB. A plain ``convert("RGB")`` simply drops the
    alpha channel, so fully transparent pixels take whatever RGB value they
    happen to store (commonly black), which can make a dark code on a
    transparent background disappear.

    Parameters
    ----------
    image_path:
        Path of the image file to encode.
    max_side:
        Maximum allowed length, in pixels, of the longer image side.

    Returns
    -------
    str
        Base64 text of the PNG-serialized RGB image.

    Raises
    ------
    Exception
        Whatever Pillow raises when the file cannot be opened or decoded;
        nothing is caught here.
    """
    # Imported inside the function so importing this module does not itself
    # require Pillow.
    from PIL import Image

    with Image.open(image_path) as src:
        # "transparency" in info covers palette / tRNS-style transparency.
        has_alpha = src.mode in ("RGBA", "LA") or "transparency" in src.info
        if has_alpha:
            rgba = src.convert("RGBA")
            img = Image.new("RGB", rgba.size, (255, 255, 255))
            # Use the alpha band as the paste mask -> white shows through
            # wherever the source was transparent.
            img.paste(rgba, mask=rgba.getchannel("A"))
        else:
            img = src.convert("RGB")

        w, h = img.size
        longest = max(w, h)
        if longest > max_side:
            scale = max_side / longest
            # Clamp to 1 px so extreme aspect ratios never round down to 0.
            new_size = (max(1, round(w * scale)), max(1, round(h * scale)))
            img = img.resize(new_size, Image.LANCZOS)

        buf = io.BytesIO()
        img.save(buf, format="PNG")

    return base64.b64encode(buf.getvalue()).decode()


# Characters a model may wrap its one-word answer in (quotes, brackets,
# markdown emphasis, whitespace). They are stripped from the start of the
# reply before it is matched against the affirmative prefixes.
_ANSWER_LEADING_NOISE = "\"'`*“”‘’「」『』（）()[]【】 \t\r\n"


def _is_affirmative(reply: str) -> bool:
    """Return True when a model reply starts with "是" or "yes" (any case).

    Leading quotes/brackets/markdown markers are ignored, so replies such as
    ``"是"`` or ``**Yes**`` are still recognised.
    """
    normalized = reply.strip().lstrip(_ANSWER_LEADING_NOISE).lower()
    return normalized.startswith(("是", "yes"))


def is_qr_code(
    image_path: Path,
    api_key: str | None = None,
    model: str = _DEFAULT_MODEL,
) -> bool:
    """Ask a Qwen VL model whether an image contains a QR code or barcode.

    The image is encoded with ``_encode_image`` and sent together with
    ``_CLASSIFY_PROMPT`` through the OpenAI-compatible chat completions API.
    Only a yes/no decision is made; nothing is decoded.

    Parameters
    ----------
    image_path:
        Path of the image to classify.
    api_key:
        API key to use. When falsy (``None`` or empty), the key returned by
        ``region_detector._get_api_key()`` is used instead (per the original
        notes this reads environment variables / a ``.env`` file -- confirm
        against that helper).
    model:
        Model name; defaults to ``_DEFAULT_MODEL`` (qwen2.5-vl-7b-instruct).

    Returns
    -------
    bool
        ``True`` when the model's reply is affirmative. ``False`` when the
        reply is not affirmative, when no API key is available, when the
        image cannot be encoded, or when the model call / response parsing
        fails (degraded mode, so the pipeline can keep running).
    """
    # Imported lazily so an unconfigured environment does not break importing
    # this module.
    from backend.app.region_detector import _get_api_key, _get_base_url
    from openai import OpenAI

    key = api_key or _get_api_key()
    if not key:
        logger.warning("image_classifier: DASHSCOPE_API_KEY 未配置，跳过 QR 语义判断")
        return False

    try:
        b64 = _encode_image(image_path)
    except Exception as exc:
        logger.warning("image_classifier: 图片编码失败 (%s)，跳过分类", exc)
        return False

    # Client construction and response unpacking live inside the same guard
    # as the request itself: a bad base URL, an empty ``choices`` list or a
    # malformed response must degrade to False rather than propagate.
    try:
        client = OpenAI(api_key=key, base_url=_get_base_url())
        response = client.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/png;base64,{b64}"},
                        },
                        {"type": "text", "text": _CLASSIFY_PROMPT},
                    ],
                }
            ],
            max_tokens=10,
            temperature=0.0,
        )
        raw = (response.choices[0].message.content or "").strip()
    except Exception as exc:
        logger.warning("image_classifier: Qwen VL 调用失败 (%s)，跳过分类", exc)
        return False

    logger.debug("image_classifier: 模型原始回复 = %r", raw)

    # Accept "是"/"否" as well as "Yes"/"No"-style replies.
    result = _is_affirmative(raw)
    logger.info(
        "image_classifier: %s → %s（原始回复：%r）",
        # Path(...) so a plain string path does not crash the log call.
        Path(image_path).name,
        "二维码/条码" if result else "非二维码",
        raw,
    )
    return result