画像の物体検出軽量化（ラズパイ５）

Yoloは性能的には十分すぎる感じですが、やはりリソースを相当に消費します

https://isehara-3lv.sakura.ne.jp/blog/2025/04/12/yoloとefficientnetの違いと用途/

で、目的を人検出に絞って軽量化を検討してみた

efficientdet_lite0.tfliteのモデルをTFliteで動かすのが現状の割とメジャーな選択だと思われるのでやってみた

<ターゲット>

・ラズパイ５：8GBメモリ/256GB SSD

<コード>

LLMとの対話で生成された最終的なコード、カメラ画像のデータ(RGB)の並べ替えが必要です、パッケージ競合でPython用の環境は新しく作成しています

import cv2
import numpy as np
import tflite_runtime.interpreter as tflite
from picamera2 import Picamera2

# EfficientDet Lite0 モデルロード
interpreter = tflite.Interpreter(model_path="efficientdet_lite0.tflite")
interpreter.allocate_tensors()
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# ラベル読み込み
with open("coco_labels_91.txt", "r") as f:
    labels = [line.strip() for line in f.readlines()]

def preprocess_image(image):
    resized = cv2.resize(image, (320, 320))
    resized = resized[:, :, [2, 1, 0]]  # RGBに
    return np.expand_dims(resized, axis=0).astype(np.uint8)

def postprocess_results(boxes, scores, classes, count, image_shape):
    detections = []
    for i in range(count):
        if scores[i] > 0.4:
            ymin, xmin, ymax, xmax = boxes[i]
            (left, right, top, bottom) = (
                int(xmin * image_shape[1]), int(xmax * image_shape[1]),
                int(ymin * image_shape[0]), int(ymax * image_shape[0])
            )
            detections.append({
                'box': (left, top, right, bottom),
                'class_id': int(classes[i]),
                'score': float(scores[i]),
                'label': labels[int(classes[i])] if int(classes[i]) < len(labels) else f"id:{int(classes[i])}"
            })
    return detections

def draw_detections(image, detections):
    for detection in detections:
        left, top, right, bottom = detection['box']
        label = f"{detection['label']} {detection['score']*100:.1f}%"
        cv2.rectangle(image, (left, top), (right, bottom), (0, 255, 0), 2)
        cv2.putText(image, label, (left, top - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1)

def main():
    # Picamera2 初期化
    picam2 = Picamera2()
    picam2.preview_configuration.main.size = (640, 480)
    picam2.preview_configuration.main.format = "RGB888"
    picam2.preview_configuration.align()
    picam2.configure("preview")
    picam2.start()

    while True:
        frame = picam2.capture_array()

        input_data = preprocess_image(frame)
        interpreter.set_tensor(input_details[0]['index'], input_data)
        interpreter.invoke()

        boxes = interpreter.get_tensor(output_details[0]['index'])[0]
        classes = interpreter.get_tensor(output_details[1]['index'])[0]
        scores = interpreter.get_tensor(output_details[2]['index'])[0]
        count = int(interpreter.get_tensor(output_details[3]['index'])[0])

        detections = postprocess_results(boxes, scores, classes, count, frame.shape)
        draw_detections(frame, detections)

        cv2.imshow("EfficientDet + PiCamera2", frame)
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break

    cv2.destroyAllWindows()

if __name__ == "__main__":
    main()

<実行結果>

タブレットに写した人物画像から人検出を行っていますが、それなりの確度で検出できてます、精度そのものはYoloよりは低い

ラズパイ５のリソースはCPUが100%程度で、メモリフットプリント的にも余裕はあります、CPU温度はファンは常時回転中で60℃程度、長時間だと全体がほんわりと暖かくなりますが、夏場も何とか耐えそう

ここのパーツはほぼ決まりだろうから、全体構成図を掲載、これからパーツを結合してアプリを作ります

admin