
YOLOv4 Model Deployment

Depending on the application scenario, there are several ways to deploy a YOLOv4 model:

  • OpenCV
  • ONNX
  • Triton Server
  • TensorRT

1. OpenCV

An original Darknet model can be loaded directly with OpenCV, whose DNN module applies a degree of hardware acceleration to YOLO. The loading code is as follows:

import numpy as np
import cv2
import time
import requests
from datetime import datetime


class ObjectRecognition(object):
    def __init__(self,
                 label_path,
                 config_path,
                 weights_path,
                 upload_url,
                 model_name='fire',
                 confidence_thre=0.5,
                 nms_thre=0.3,
                 alert_label=None):
        self.alert_label = alert_label if alert_label else []
        self.confidence_thre = confidence_thre
        self.nms_thre = nms_thre
        self.upload_url = upload_url.rstrip('/') + "/"
        self.model_name = model_name
        self.net = self.load_model(config_path, weights_path)

        # Load the class label file
        self.LABELS = open(label_path).read().strip().split("\n")
        self.nclass = len(self.LABELS)
        # Get the names of the YOLO output layers; np.array(...).flatten()
        # copes with both the old (Nx1) and new (1-D) return shapes of
        # getUnconnectedOutLayers() across OpenCV versions
        ln = self.net.getLayerNames()
        out_idx = np.array(self.net.getUnconnectedOutLayers()).flatten()
        self.ln = [ln[i - 1] for i in out_idx]

    def upload_image(self, img):
        file_name = "recognized_{}.jpg".format(int(time.time()))
        img_dir = "/".join([self.model_name, datetime.now().strftime("%Y%m")])
        data = cv2.imencode(".jpg", img)[1].tobytes()
        files = [
            ('file', (file_name, data, 'image/jpeg'))
        ]

        result = requests.post(self.upload_url + img_dir, files=files)
        if result.status_code == 200:
            path = img_dir + "/" + file_name
        else:
            path = ""
        return path

    def load_model(self, config_path, weights_path):
        # Load the model config and weight files
        net = cv2.dnn.readNetFromDarknet(config_path, weights_path)
        return net

    def detect(self, img):
        s = time.time()
        # Assign a random color to each class's bounding boxes
        np.random.seed(42)
        COLORS = np.random.randint(0, 255, size=(self.nclass, 3), dtype='uint8')
        # Get the image dimensions
        (H, W) = img.shape[:2]

        # Build a blob at the network input size, then run one YOLO forward
        # pass to obtain the bounding boxes and their probabilities
        blob = cv2.dnn.blobFromImage(img, 1 / 255.0, (416, 416), swapRB=True, crop=False)

        self.net.setInput(blob)

        layerOutputs = self.net.forward(self.ln)

        end = time.time()
        # Initialize the bounding boxes, confidences and class IDs
        boxes = []
        confidences = []
        classIDs = []
        # Iterate over the output layers (three in total)
        for output in layerOutputs:
            # Iterate over each detection
            for detection in output:
                # Extract the class ID and confidence
                scores = detection[5:]
                classID = np.argmax(scores)
                confidence = scores[classID]
                # Keep only boxes above the confidence threshold
                if confidence > self.confidence_thre:
                    # Scale the box back to the original image size; note that
                    # YOLO returns the box center plus its width and height
                    box = detection[0:4] * np.array([W, H, W, H])
                    (centerX, centerY, width, height) = box.astype("int")
                    # Compute the top-left corner of the box
                    x = int(centerX - (width / 2))
                    y = int(centerY - (height / 2))
                    # Record the box, confidence and class ID
                    boxes.append([x, y, int(width), int(height)])
                    confidences.append(float(confidence))
                    classIDs.append(classID)

        # Apply non-maximum suppression to drop weak, overlapping boxes
        idxs = cv2.dnn.NMSBoxes(boxes, confidences, self.confidence_thre, self.nms_thre)
        loc = []
        flag = False
        # Make sure at least one box survived NMS
        if len(idxs) > 0:
            # Iterate over the kept boxes
            for i in idxs.flatten():
                if classIDs[i] in self.alert_label:
                    flag = True

                # Extract the box coordinates
                (x, y) = (boxes[i][0], boxes[i][1])
                (w, h) = (boxes[i][2], boxes[i][3])
                # Draw the box, then the class label and confidence at its top-left
                color = [int(c) for c in COLORS[classIDs[i]]]
                cv2.rectangle(img, (x, y), (x + w, y + h), color, 2)
                text = '{}: {:.3f}'.format(self.LABELS[classIDs[i]], confidences[i])
                (text_w, text_h), baseline = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 2)
                cv2.rectangle(img, (x, y - text_h - baseline), (x + text_w, y), color, -1)
                cv2.putText(img, text, (x, y - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 2)
                loc.append([x, y, w, h])
            m = time.time()

            if flag:
                # Upload to the cloud file server
                img_path = self.upload_image(img)
            else:
                img_path = ""
            e = time.time()
            print("detect time", m - s, "upload time", e - m, "total time", e - s)
            return img_path, flag
        else:
            return "", flag
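
A minimal usage sketch for the class above; all file names below are placeholders, and the upload URL is the one used as a default later in this post:

# Hypothetical paths; substitute your own model files and upload server
detector = ObjectRecognition(
    label_path="fire.names",
    config_path="yolov4-fire.cfg",
    weights_path="yolov4-fire.weights",
    upload_url="http://192.168.0.15:8080/ai/upload/",
    alert_label=[0])          # class IDs that trigger an upload

img_path, alerted = detector.detect(cv2.imread("test.jpg"))
print(alerted, img_path)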

2. ONNX

For environments with a GPU, Darknet models can be uniformly converted to the ONNX format with a conversion tool.

The converted ONNX file can then be loaded and run for inference as follows:

import numpy as np
import cv2
import time
import onnxruntime
import requests
from datetime import datetime


class ObjectRecognition(object):
    def __init__(self,
                 label_path,
                 onnx_path,
                 upload_url='http://192.168.0.15:8080/ai/upload/',
                 model_name='fire',
                 confidence_thre=0.5,
                 nms_thre=0.3,
                 alert_label=None):
        self.alert_label = alert_label if alert_label else []

        # Load the class label file
        self.class_names = open(label_path).read().strip().split("\n")

        self.confidence_thre = confidence_thre
        self.nms_thre = nms_thre
        self.upload_url = upload_url.rstrip('/') + "/"
        self.model_name = model_name
        self.session = onnxruntime.InferenceSession(onnx_path)

    def upload_image(self, img):
        file_name = "recognized_{}.jpg".format(int(time.time()))
        img_dir = "/".join([self.model_name, datetime.now().strftime("%Y%m")])
        data = cv2.imencode(".jpg", img)[1].tobytes()
        files = [
            ('file', (file_name, data, 'image/jpeg'))
        ]

        result = requests.post(self.upload_url + img_dir, files=files)
        if result.status_code == 200:
            path = img_dir + "/" + file_name
        else:
            path = ""
        return path

    def detect(self, img):
        IN_IMAGE_H = self.session.get_inputs()[0].shape[2]
        IN_IMAGE_W = self.session.get_inputs()[0].shape[3]

        # Input: resize, BGR -> RGB, HWC -> NCHW, scale to [0, 1]
        resized = cv2.resize(img, (IN_IMAGE_W, IN_IMAGE_H), interpolation=cv2.INTER_LINEAR)
        img_in = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
        img_in = np.transpose(img_in, (2, 0, 1)).astype(np.float32)
        img_in = np.expand_dims(img_in, axis=0)
        img_in /= 255.0
        print("Shape of the network input: ", img_in.shape)

        # Compute
        input_name = self.session.get_inputs()[0].name

        outputs = self.session.run(None, {input_name: img_in})

        boxes = self.post_processing(0.4, 0.6, outputs)

        img_path, flag = self.plot_boxes(img, boxes[0])
        return img_path, flag

    def post_processing(self, conf_thresh, nms_thresh, output):
        # [batch, num, 1, 4]
        box_array = output[0]
        # [batch, num, num_classes]
        confs = output[1]

        if type(box_array).__name__ != 'ndarray':
            box_array = box_array.cpu().detach().numpy()
            confs = confs.cpu().detach().numpy()

        num_classes = confs.shape[2]

        # [batch, num, 4]
        box_array = box_array[:, :, 0]

        # [batch, num, num_classes] --> [batch, num]
        max_conf = np.max(confs, axis=2)
        max_id = np.argmax(confs, axis=2)

        bboxes_batch = []
        for i in range(box_array.shape[0]):
            # Drop detections below the confidence threshold
            argwhere = max_conf[i] > conf_thresh
            l_box_array = box_array[i, argwhere, :]
            l_max_conf = max_conf[i, argwhere]
            l_max_id = max_id[i, argwhere]

            bboxes = []
            # NMS per class
            for j in range(num_classes):
                cls_argwhere = l_max_id == j
                ll_box_array = l_box_array[cls_argwhere, :]
                ll_max_conf = l_max_conf[cls_argwhere]
                ll_max_id = l_max_id[cls_argwhere]

                keep = self.nms_cpu(ll_box_array, ll_max_conf, nms_thresh)

                if keep.size > 0:
                    ll_box_array = ll_box_array[keep, :]
                    ll_max_conf = ll_max_conf[keep]
                    ll_max_id = ll_max_id[keep]

                    for k in range(ll_box_array.shape[0]):
                        # [x1, y1, x2, y2, det_conf, cls_conf, cls_id]
                        bboxes.append(
                            [ll_box_array[k, 0], ll_box_array[k, 1], ll_box_array[k, 2], ll_box_array[k, 3],
                             ll_max_conf[k],
                             ll_max_conf[k], ll_max_id[k]])

            bboxes_batch.append(bboxes)

        return bboxes_batch

    def nms_cpu(self, boxes, confs, nms_thresh=0.5, min_mode=False):
        x1 = boxes[:, 0]
        y1 = boxes[:, 1]
        x2 = boxes[:, 2]
        y2 = boxes[:, 3]

        areas = (x2 - x1) * (y2 - y1)
        # Process boxes in descending order of confidence
        order = confs.argsort()[::-1]

        keep = []
        while order.size > 0:
            idx_self = order[0]
            idx_other = order[1:]

            keep.append(idx_self)

            # Intersection of the current box with all remaining boxes
            xx1 = np.maximum(x1[idx_self], x1[idx_other])
            yy1 = np.maximum(y1[idx_self], y1[idx_other])
            xx2 = np.minimum(x2[idx_self], x2[idx_other])
            yy2 = np.minimum(y2[idx_self], y2[idx_other])

            w = np.maximum(0.0, xx2 - xx1)
            h = np.maximum(0.0, yy2 - yy1)
            inter = w * h

            if min_mode:
                over = inter / np.minimum(areas[order[0]], areas[order[1:]])
            else:
                # IoU = intersection / union
                over = inter / (areas[order[0]] + areas[order[1:]] - inter)

            # Keep only boxes whose overlap is below the NMS threshold
            inds = np.where(over <= nms_thresh)[0]
            order = order[inds + 1]

        return np.array(keep)

    def plot_boxes(self, img, boxes):
        COLORS = np.random.randint(0, 255, size=(len(self.class_names), 3), dtype='uint8')

        width = img.shape[1]
        height = img.shape[0]
        flag = False
        for i in range(len(boxes)):
            box = boxes[i]
            # Coordinates are normalized; scale back to pixels
            x1 = int(box[0] * width)
            y1 = int(box[1] * height)
            x2 = int(box[2] * width)
            y2 = int(box[3] * height)

            if len(box) >= 7 and self.class_names:
                cls_conf = box[5]
                cls_id = box[6]

                if cls_id in self.alert_label:
                    flag = True

                color = [int(c) for c in COLORS[cls_id]]
                text = '{}: {:.3f}'.format(self.class_names[cls_id], cls_conf)
                img = cv2.putText(img, text, (x1, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 2)
                img = cv2.rectangle(img, (x1, y1), (x2, y2), color, 1)

        # cv2.imwrite('predictions_onnx.jpg', img)
        if flag:
            # Upload to the cloud file server
            img_path = self.upload_image(img)
        else:
            img_path = ""
        return img_path, flag
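
A minimal usage sketch, assuming a converted model and a label file (both names below are placeholders):

# Hypothetical file names; substitute your own converted model and labels
detector = ObjectRecognition(
    label_path="fire.names",
    onnx_path="yolov4_fire_1_3_416_416_static.onnx",
    alert_label=[0])          # class IDs that trigger an upload
img_path, alerted = detector.detect(cv2.imread("test.jpg"))
print(alerted, img_path)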

3. Triton Server

For the concrete Triton deployment procedure, see: NVIDIA Triton Server Deployment and Testing.
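
The linked post covers the server side. Purely as a client-side sketch using the official tritonclient package, inference might look like the following; the model name (yolov4) and the tensor names (input, boxes, confs) are assumptions that must match the deployed model's config.pbtxt:

import numpy as np
import tritonclient.http as httpclient

# Assumed server address; adjust to your deployment
client = httpclient.InferenceServerClient(url="192.168.0.15:8000")

# Preprocessed NCHW float32 image, same layout as in the ONNX example above
img_in = np.zeros((1, 3, 608, 608), dtype=np.float32)

inp = httpclient.InferInput("input", list(img_in.shape), "FP32")
inp.set_data_from_numpy(img_in)

result = client.infer(model_name="yolov4", inputs=[inp])
boxes = result.as_numpy("boxes")   # assumed output tensor name
confs = result.as_numpy("confs")   # assumed output tensor name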

4. TensorRT

TensorRT ships with a conversion tool, trtexec, which makes it easy to convert an ONNX model into a TensorRT engine:

/usr/src/tensorrt/bin/trtexec --onnx=/sdk/sunshine/trained_model/yolov4/onnx/helmet/yolov4_helmet_1_3_608_608_static.onnx --explicitBatch --saveEngine=yolov4_helmet_1_3_608_608_static.trt

To build a half-precision engine, simply append the --fp16 flag.
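
For example (same model as above; the FP16 engine file name is my own choice to keep the two engines apart):

/usr/src/tensorrt/bin/trtexec --onnx=/sdk/sunshine/trained_model/yolov4/onnx/helmet/yolov4_helmet_1_3_608_608_static.onnx --explicitBatch --fp16 --saveEngine=yolov4_helmet_1_3_608_608_fp16.trt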

The TensorRT engine can then be loaded and used for inference as follows:

import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import requests
import time
from datetime import datetime
import cv2
import numpy as np

TRT_LOGGER = trt.Logger()


class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()


class TRTModelPredict:
    def __init__(self, engine_path, shape=(608, 608)):
        shape = (1, 3, shape[0], shape[1])
        self.engine = self.get_engine(engine_path)
        self.context = self.engine.create_execution_context()

        self.buffers = self.allocate_buffers(self.engine, 1)
        self.context.set_binding_shape(0, shape)

    def allocate_buffers(self, engine, batch_size):
        inputs = []
        outputs = []
        bindings = []
        stream = cuda.Stream()
        for binding in engine:
            size = trt.volume(engine.get_binding_shape(binding)) * batch_size
            dims = engine.get_binding_shape(binding)

            # In case the batch dimension is -1 (dynamic)
            if dims[0] < 0:
                size *= -1

            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(device_mem))
            # Append to the appropriate list.
            if engine.binding_is_input(binding):
                inputs.append(HostDeviceMem(host_mem, device_mem))
            else:
                outputs.append(HostDeviceMem(host_mem, device_mem))
        return inputs, outputs, bindings, stream

    def get_engine(self, engine_path):
        # Deserialize a previously built engine instead of building one.
        print("Reading engine from file {}".format(engine_path))
        with open(engine_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
            return runtime.deserialize_cuda_engine(f.read())

    def do_inference(self, img_in):
        inputs, outputs, bindings, stream = self.buffers
        inputs[0].host = img_in
        for i in range(2):
            # Transfer input data to the GPU.
            [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
            # Run inference.
            self.context.execute_async(bindings=bindings, stream_handle=stream.handle)
            # Transfer predictions back from the GPU.
            [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
            # Synchronize the stream
            stream.synchronize()
        # Return only the host outputs.
        return [out.host for out in outputs]


class Hat(TRTModelPredict):
    def __init__(self, label_path, engine_path, shape=(608, 608), upload_url='http://192.168.0.15:8080/ai/upload/',
                 model_name='fire', alert_label=None):
        self.shape = shape
        self.alert_label = alert_label if alert_label else []
        self.class_names = open(label_path).read().strip().split("\n")
        self.model_name = model_name
        self.upload_url = upload_url.rstrip('/') + "/"
        super(Hat, self).__init__(engine_path, shape)

    def pre_process(self, image_src):
        # Resize, BGR -> RGB, HWC -> NCHW, scale to [0, 1]
        resized = cv2.resize(image_src, self.shape, interpolation=cv2.INTER_LINEAR)
        img_in = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
        img_in = np.transpose(img_in, (2, 0, 1)).astype(np.float32)
        img_in = np.expand_dims(img_in, axis=0)
        img_in /= 255.0
        img_in = np.ascontiguousarray(img_in)
        return img_in

    def post_processing(self, conf_thresh, nms_thresh, output):
        # [batch, num, 1, 4]
        box_array = output[0]
        # [batch, num, num_classes]
        confs = output[1]

        if type(box_array).__name__ != 'ndarray':
            box_array = box_array.cpu().detach().numpy()
            confs = confs.cpu().detach().numpy()

        num_classes = confs.shape[2]

        # [batch, num, 4]
        box_array = box_array[:, :, 0]

        # [batch, num, num_classes] --> [batch, num]
        max_conf = np.max(confs, axis=2)
        max_id = np.argmax(confs, axis=2)

        bboxes_batch = []
        for i in range(box_array.shape[0]):
            # Drop detections below the confidence threshold
            argwhere = max_conf[i] > conf_thresh
            l_box_array = box_array[i, argwhere, :]
            l_max_conf = max_conf[i, argwhere]
            l_max_id = max_id[i, argwhere]

            bboxes = []
            # NMS per class
            for j in range(num_classes):
                cls_argwhere = l_max_id == j
                ll_box_array = l_box_array[cls_argwhere, :]
                ll_max_conf = l_max_conf[cls_argwhere]
                ll_max_id = l_max_id[cls_argwhere]

                keep = self.nms_cpu(ll_box_array, ll_max_conf, nms_thresh)

                if keep.size > 0:
                    ll_box_array = ll_box_array[keep, :]
                    ll_max_conf = ll_max_conf[keep]
                    ll_max_id = ll_max_id[keep]

                    for k in range(ll_box_array.shape[0]):
                        # [x1, y1, x2, y2, det_conf, cls_conf, cls_id]
                        bboxes.append(
                            [ll_box_array[k, 0], ll_box_array[k, 1], ll_box_array[k, 2], ll_box_array[k, 3],
                             ll_max_conf[k],
                             ll_max_conf[k], ll_max_id[k]])

            bboxes_batch.append(bboxes)

        return bboxes_batch

    def nms_cpu(self, boxes, confs, nms_thresh=0.5, min_mode=False):
        x1 = boxes[:, 0]
        y1 = boxes[:, 1]
        x2 = boxes[:, 2]
        y2 = boxes[:, 3]

        areas = (x2 - x1) * (y2 - y1)
        # Process boxes in descending order of confidence
        order = confs.argsort()[::-1]

        keep = []
        while order.size > 0:
            idx_self = order[0]
            idx_other = order[1:]

            keep.append(idx_self)

            # Intersection of the current box with all remaining boxes
            xx1 = np.maximum(x1[idx_self], x1[idx_other])
            yy1 = np.maximum(y1[idx_self], y1[idx_other])
            xx2 = np.minimum(x2[idx_self], x2[idx_other])
            yy2 = np.minimum(y2[idx_self], y2[idx_other])

            w = np.maximum(0.0, xx2 - xx1)
            h = np.maximum(0.0, yy2 - yy1)
            inter = w * h

            if min_mode:
                over = inter / np.minimum(areas[order[0]], areas[order[1:]])
            else:
                # IoU = intersection / union
                over = inter / (areas[order[0]] + areas[order[1:]] - inter)

            # Keep only boxes whose overlap is below the NMS threshold
            inds = np.where(over <= nms_thresh)[0]
            order = order[inds + 1]

        return np.array(keep)

    def detect(self, img):
        img_in = self.pre_process(img)
        trt_outputs = self.do_inference(img_in)
        # Reshape the flat output buffers to [batch, num, 1, 4] and [batch, num, num_classes]
        trt_outputs[0] = trt_outputs[0].reshape(1, -1, 1, 4)
        trt_outputs[1] = trt_outputs[1].reshape(1, -1, len(self.class_names))

        boxes = self.post_processing(0.4, 0.6, trt_outputs)
        return self.plot_boxes(img, boxes[0])

    def plot_boxes(self, img, boxes):
        COLORS = np.random.randint(0, 255, size=(len(self.class_names), 3), dtype='uint8')

        width = img.shape[1]
        height = img.shape[0]
        flag = False
        for i in range(len(boxes)):
            box = boxes[i]
            # Coordinates are normalized; scale back to pixels
            x1 = int(box[0] * width)
            y1 = int(box[1] * height)
            x2 = int(box[2] * width)
            y2 = int(box[3] * height)

            if len(box) >= 7 and self.class_names:
                cls_conf = box[5]
                cls_id = box[6]

                if cls_id in self.alert_label:
                    flag = True

                color = [int(c) for c in COLORS[cls_id]]
                text = '{}: {:.3f}'.format(self.class_names[cls_id], cls_conf)
                img = cv2.putText(img, text, (x1, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 2)
                img = cv2.rectangle(img, (x1, y1), (x2, y2), color, 1)

        # cv2.imwrite('predictions_onnx.jpg', img)
        if flag:
            # Upload to the cloud file server
            img_path = self.upload_image(img)
        else:
            img_path = ""
        return img_path, flag

    def upload_image(self, img):
        file_name = "recognized_{}.jpg".format(int(time.time()))
        img_dir = "/".join([self.model_name, datetime.now().strftime("%Y%m")])
        data = cv2.imencode(".jpg", img)[1].tobytes()
        files = [
            ('file', (file_name, data, 'image/jpeg'))
        ]

        result = requests.post(self.upload_url + img_dir, files=files)
        if result.status_code == 200:
            path = img_dir + "/" + file_name
        else:
            path = ""
        return path
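
A minimal usage sketch for the engine built by the trtexec command above (the label file name and alert labels are illustrative):

# Engine file as produced by the trtexec command above; labels are hypothetical
detector = Hat(
    label_path="helmet.names",
    engine_path="yolov4_helmet_1_3_608_608_static.trt",
    shape=(608, 608),
    model_name="helmet",
    alert_label=[1])          # class IDs that trigger an upload
img_path, alerted = detector.detect(cv2.imread("test.jpg"))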

Performance Comparison

Using the helmet-detection model, the three inference approaches were benchmarked against each other; the results are as follows.

Model format | Inference time (s)  | Confidence | Notes
OpenCV       | 0.3789951801300049  | 0.533      |
ONNX         | 0.44134068489074707 | 0.425      |
TensorRT     | 0.12151765823364258 | 0.423      |

Surprisingly, OpenCV turned out to be even faster than ONNX. On closer inspection, the Jetson had onnxruntime installed rather than onnxruntime-gpu, so the preliminary suspicion is that the ONNX path never actually used GPU acceleration.
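
A quick way to verify which execution providers onnxruntime can actually use is something like the following check (the model file name is a placeholder):

import onnxruntime

# 'GPU' is reported only when the GPU build (onnxruntime-gpu) is installed
print(onnxruntime.get_device())
print(onnxruntime.get_available_providers())

# The providers a concrete session actually selected
session = onnxruntime.InferenceSession("yolov4_fire_1_3_416_416_static.onnx")
print(session.get_providers())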