
YOLOv4 Model Deployment

Depending on the application scenario, there are several ways to deploy a YOLOv4 model:

  • OpenCV
  • ONNX
  • Triton Server
  • TensorRT

1. OpenCV

An original Darknet model can be loaded directly with OpenCV, whose DNN module applies a degree of hardware acceleration to YOLO. The loading code is as follows:

import numpy as np
import cv2
import time
import requests
from datetime import datetime


class ObjectRecognition(object):
    def __init__(self,
                 label_path,
                 config_path,
                 weights_path,
                 upload_url,
                 model_name='fire',
                 confidence_thre=0.5,
                 nms_thre=0.3,
                 alert_label=None):
        self.alert_label = alert_label if alert_label else []
        self.confidence_thre = confidence_thre
        self.nms_thre = nms_thre
        self.upload_url = upload_url.rstrip('/') + "/"
        self.model_name = model_name
        self.net = self.load_model(config_path, weights_path)

        # Load the class label file
        self.LABELS = open(label_path).read().strip().split("\n")
        self.nclass = len(self.LABELS)
        # Get the names of the YOLO output layers; np.array(...).flatten()
        # copes with both the old (Nx1) and new (1-D) return shapes of
        # getUnconnectedOutLayers() across OpenCV versions
        ln = self.net.getLayerNames()
        out_idx = np.array(self.net.getUnconnectedOutLayers()).flatten()
        self.ln = [ln[i - 1] for i in out_idx]

    def upload_image(self, img):
        file_name = "recognized_{}.jpg".format(int(time.time()))
        img_dir = "/".join([self.model_name, datetime.now().strftime("%Y%m")])
        data = cv2.imencode(".jpg", img)[1].tobytes()
        files = [
            ('file', (file_name, data, 'image/jpeg'))
        ]

        result = requests.post(self.upload_url + img_dir, files=files)
        if result.status_code == 200:
            path = img_dir + "/" + file_name
        else:
            path = ""
        return path

    def load_model(self, config_path, weights_path):
        # Load the model config and weight files
        net = cv2.dnn.readNetFromDarknet(config_path, weights_path)
        return net

    def detect(self, img):
        s = time.time()
        # Assign a random color to each class's bounding boxes
        np.random.seed(42)
        COLORS = np.random.randint(0, 255, size=(self.nclass, 3), dtype='uint8')
        # Get the image dimensions
        (H, W) = img.shape[:2]

        # Build a blob at the network input size, then run one YOLO forward
        # pass to obtain the bounding boxes and their probabilities
        blob = cv2.dnn.blobFromImage(img, 1 / 255.0, (416, 416), swapRB=True, crop=False)

        self.net.setInput(blob)

        layerOutputs = self.net.forward(self.ln)

        end = time.time()
        # Initialize the bounding boxes, confidences and class IDs
        boxes = []
        confidences = []
        classIDs = []
        # Iterate over the output layers (three in total)
        for output in layerOutputs:
            # Iterate over each detection
            for detection in output:
                # Extract the class ID and confidence
                scores = detection[5:]
                classID = np.argmax(scores)
                confidence = scores[classID]
                # Keep only boxes above the confidence threshold
                if confidence > self.confidence_thre:
                    # Scale the box back to the original image size; note that
                    # YOLO returns the box center plus its width and height
                    box = detection[0:4] * np.array([W, H, W, H])
                    (centerX, centerY, width, height) = box.astype("int")
                    # Compute the top-left corner of the box
                    x = int(centerX - (width / 2))
                    y = int(centerY - (height / 2))
                    # Record the box, confidence and class ID
                    boxes.append([x, y, int(width), int(height)])
                    confidences.append(float(confidence))
                    classIDs.append(classID)

        # Apply non-maximum suppression to drop weak, overlapping boxes
        idxs = cv2.dnn.NMSBoxes(boxes, confidences, self.confidence_thre, self.nms_thre)
        loc = []
        flag = False
        # Make sure at least one box survived NMS
        if len(idxs) > 0:
            # Iterate over the kept boxes
            for i in idxs.flatten():
                if classIDs[i] in self.alert_label:
                    flag = True

                # Extract the box coordinates
                (x, y) = (boxes[i][0], boxes[i][1])
                (w, h) = (boxes[i][2], boxes[i][3])
                # Draw the box, then the class label and confidence at its top-left
                color = [int(c) for c in COLORS[classIDs[i]]]
                cv2.rectangle(img, (x, y), (x + w, y + h), color, 2)
                text = '{}: {:.3f}'.format(self.LABELS[classIDs[i]], confidences[i])
                (text_w, text_h), baseline = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 2)
                cv2.rectangle(img, (x, y - text_h - baseline), (x + text_w, y), color, -1)
                cv2.putText(img, text, (x, y - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 2)
                loc.append([x, y, w, h])
            m = time.time()

            if flag:
                # Upload to the cloud file server
                img_path = self.upload_image(img)
            else:
                img_path = ""
            e = time.time()
            print("detect time", m - s, "upload time", e - m, "total time", e - s)
            return img_path, flag
        else:
            return "", flag
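
A minimal usage sketch for the class above; all file names below are placeholders, and the upload URL is the one used as a default later in this post:

# Hypothetical paths; substitute your own model files and upload server
detector = ObjectRecognition(
    label_path="fire.names",
    config_path="yolov4-fire.cfg",
    weights_path="yolov4-fire.weights",
    upload_url="http://192.168.0.15:8080/ai/upload/",
    alert_label=[0])          # class IDs that trigger an upload

img_path, alerted = detector.detect(cv2.imread("test.jpg"))
print(alerted, img_path)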

2. ONNX

For environments with a GPU, Darknet models can be uniformly converted to the ONNX format with a conversion tool.

The converted ONNX file can then be loaded and run for inference as follows:

import numpy as np
import cv2
import time
import onnxruntime
import requests
from datetime import datetime


class ObjectRecognition(object):
    def __init__(self,
                 label_path,
                 onnx_path,
                 upload_url='http://192.168.0.15:8080/ai/upload/',
                 model_name='fire',
                 confidence_thre=0.5,
                 nms_thre=0.3,
                 alert_label=None):
        self.alert_label = alert_label if alert_label else []

        # Load the class label file
        self.class_names = open(label_path).read().strip().split("\n")

        self.confidence_thre = confidence_thre
        self.nms_thre = nms_thre
        self.upload_url = upload_url.rstrip('/') + "/"
        self.model_name = model_name
        self.session = onnxruntime.InferenceSession(onnx_path)

    def upload_image(self, img):
        file_name = "recognized_{}.jpg".format(int(time.time()))
        img_dir = "/".join([self.model_name, datetime.now().strftime("%Y%m")])
        data = cv2.imencode(".jpg", img)[1].tobytes()
        files = [
            ('file', (file_name, data, 'image/jpeg'))
        ]

        result = requests.post(self.upload_url + img_dir, files=files)
        if result.status_code == 200:
            path = img_dir + "/" + file_name
        else:
            path = ""
        return path

    def detect(self, img):
        IN_IMAGE_H = self.session.get_inputs()[0].shape[2]
        IN_IMAGE_W = self.session.get_inputs()[0].shape[3]

        # Input: resize, BGR -> RGB, HWC -> NCHW, scale to [0, 1]
        resized = cv2.resize(img, (IN_IMAGE_W, IN_IMAGE_H), interpolation=cv2.INTER_LINEAR)
        img_in = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
        img_in = np.transpose(img_in, (2, 0, 1)).astype(np.float32)
        img_in = np.expand_dims(img_in, axis=0)
        img_in /= 255.0
        print("Shape of the network input: ", img_in.shape)

        # Compute
        input_name = self.session.get_inputs()[0].name

        outputs = self.session.run(None, {input_name: img_in})

        boxes = self.post_processing(0.4, 0.6, outputs)

        img_path, flag = self.plot_boxes(img, boxes[0])
        return img_path, flag

    def post_processing(self, conf_thresh, nms_thresh, output):
        # [batch, num, 1, 4]
        box_array = output[0]
        # [batch, num, num_classes]
        confs = output[1]

        if type(box_array).__name__ != 'ndarray':
            box_array = box_array.cpu().detach().numpy()
            confs = confs.cpu().detach().numpy()

        num_classes = confs.shape[2]

        # [batch, num, 4]
        box_array = box_array[:, :, 0]

        # [batch, num, num_classes] --> [batch, num]
        max_conf = np.max(confs, axis=2)
        max_id = np.argmax(confs, axis=2)

        bboxes_batch = []
        for i in range(box_array.shape[0]):
            # Drop detections below the confidence threshold
            argwhere = max_conf[i] > conf_thresh
            l_box_array = box_array[i, argwhere, :]
            l_max_conf = max_conf[i, argwhere]
            l_max_id = max_id[i, argwhere]

            bboxes = []
            # NMS per class
            for j in range(num_classes):
                cls_argwhere = l_max_id == j
                ll_box_array = l_box_array[cls_argwhere, :]
                ll_max_conf = l_max_conf[cls_argwhere]
                ll_max_id = l_max_id[cls_argwhere]

                keep = self.nms_cpu(ll_box_array, ll_max_conf, nms_thresh)

                if keep.size > 0:
                    ll_box_array = ll_box_array[keep, :]
                    ll_max_conf = ll_max_conf[keep]
                    ll_max_id = ll_max_id[keep]

                    for k in range(ll_box_array.shape[0]):
                        # [x1, y1, x2, y2, det_conf, cls_conf, cls_id]
                        bboxes.append(
                            [ll_box_array[k, 0], ll_box_array[k, 1], ll_box_array[k, 2], ll_box_array[k, 3],
                             ll_max_conf[k],
                             ll_max_conf[k], ll_max_id[k]])

            bboxes_batch.append(bboxes)

        return bboxes_batch

    def nms_cpu(self, boxes, confs, nms_thresh=0.5, min_mode=False):
        x1 = boxes[:, 0]
        y1 = boxes[:, 1]
        x2 = boxes[:, 2]
        y2 = boxes[:, 3]

        areas = (x2 - x1) * (y2 - y1)
        # Process boxes in descending order of confidence
        order = confs.argsort()[::-1]

        keep = []
        while order.size > 0:
            idx_self = order[0]
            idx_other = order[1:]

            keep.append(idx_self)

            # Intersection of the current box with all remaining boxes
            xx1 = np.maximum(x1[idx_self], x1[idx_other])
            yy1 = np.maximum(y1[idx_self], y1[idx_other])
            xx2 = np.minimum(x2[idx_self], x2[idx_other])
            yy2 = np.minimum(y2[idx_self], y2[idx_other])

            w = np.maximum(0.0, xx2 - xx1)
            h = np.maximum(0.0, yy2 - yy1)
            inter = w * h

            if min_mode:
                over = inter / np.minimum(areas[order[0]], areas[order[1:]])
            else:
                # IoU = intersection / union
                over = inter / (areas[order[0]] + areas[order[1:]] - inter)

            # Keep only boxes whose overlap is below the NMS threshold
            inds = np.where(over <= nms_thresh)[0]
            order = order[inds + 1]

        return np.array(keep)

    def plot_boxes(self, img, boxes):
        COLORS = np.random.randint(0, 255, size=(len(self.class_names), 3), dtype='uint8')

        width = img.shape[1]
        height = img.shape[0]
        flag = False
        for i in range(len(boxes)):
            box = boxes[i]
            # Coordinates are normalized; scale back to pixels
            x1 = int(box[0] * width)
            y1 = int(box[1] * height)
            x2 = int(box[2] * width)
            y2 = int(box[3] * height)

            if len(box) >= 7 and self.class_names:
                cls_conf = box[5]
                cls_id = box[6]

                if cls_id in self.alert_label:
                    flag = True

                color = [int(c) for c in COLORS[cls_id]]
                text = '{}: {:.3f}'.format(self.class_names[cls_id], cls_conf)
                img = cv2.putText(img, text, (x1, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 2)
                img = cv2.rectangle(img, (x1, y1), (x2, y2), color, 1)

        # cv2.imwrite('predictions_onnx.jpg', img)
        if flag:
            # Upload to the cloud file server
            img_path = self.upload_image(img)
        else:
            img_path = ""
        return img_path, flag
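
A minimal usage sketch, assuming a converted model and a label file (both names below are placeholders):

# Hypothetical file names; substitute your own converted model and labels
detector = ObjectRecognition(
    label_path="fire.names",
    onnx_path="yolov4_fire_1_3_416_416_static.onnx",
    alert_label=[0])          # class IDs that trigger an upload
img_path, alerted = detector.detect(cv2.imread("test.jpg"))
print(alerted, img_path)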

3. Triton Server

For the concrete Triton deployment procedure, see: NVIDIA Triton Server Deployment and Testing.
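
The linked post covers the server side. Purely as a client-side sketch using the official tritonclient package, inference might look like the following; the model name (yolov4) and the tensor names (input, boxes, confs) are assumptions that must match the deployed model's config.pbtxt:

import numpy as np
import tritonclient.http as httpclient

# Assumed server address; adjust to your deployment
client = httpclient.InferenceServerClient(url="192.168.0.15:8000")

# Preprocessed NCHW float32 image, same layout as in the ONNX example above
img_in = np.zeros((1, 3, 608, 608), dtype=np.float32)

inp = httpclient.InferInput("input", list(img_in.shape), "FP32")
inp.set_data_from_numpy(img_in)

result = client.infer(model_name="yolov4", inputs=[inp])
boxes = result.as_numpy("boxes")   # assumed output tensor name
confs = result.as_numpy("confs")   # assumed output tensor name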

4. TensorRT

TensorRT ships with a conversion tool, trtexec, which makes it easy to convert an ONNX model into a TensorRT engine:

/usr/src/tensorrt/bin/trtexec --onnx=/sdk/sunshine/trained_model/yolov4/onnx/helmet/yolov4_helmet_1_3_608_608_static.onnx --explicitBatch --saveEngine=yolov4_helmet_1_3_608_608_static.trt

To build a half-precision engine, simply append the --fp16 flag.
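
For example (same model as above; the FP16 engine file name is my own choice to keep the two engines apart):

/usr/src/tensorrt/bin/trtexec --onnx=/sdk/sunshine/trained_model/yolov4/onnx/helmet/yolov4_helmet_1_3_608_608_static.onnx --explicitBatch --fp16 --saveEngine=yolov4_helmet_1_3_608_608_fp16.trt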

The TensorRT engine can then be loaded and used for inference as follows:

import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import requests
import time
from datetime import datetime
import cv2
import numpy as np

TRT_LOGGER = trt.Logger()


class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()


class TRTModelPredict:
    def __init__(self, engine_path, shape=(608, 608)):
        shape = (1, 3, shape[0], shape[1])
        self.engine = self.get_engine(engine_path)
        self.context = self.engine.create_execution_context()

        self.buffers = self.allocate_buffers(self.engine, 1)
        self.context.set_binding_shape(0, shape)

    def allocate_buffers(self, engine, batch_size):
        inputs = []
        outputs = []
        bindings = []
        stream = cuda.Stream()
        for binding in engine:
            size = trt.volume(engine.get_binding_shape(binding)) * batch_size
            dims = engine.get_binding_shape(binding)

            # In case the batch dimension is -1 (dynamic)
            if dims[0] < 0:
                size *= -1

            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(device_mem))
            # Append to the appropriate list.
            if engine.binding_is_input(binding):
                inputs.append(HostDeviceMem(host_mem, device_mem))
            else:
                outputs.append(HostDeviceMem(host_mem, device_mem))
        return inputs, outputs, bindings, stream

    def get_engine(self, engine_path):
        # Deserialize a previously built engine instead of building one.
        print("Reading engine from file {}".format(engine_path))
        with open(engine_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
            return runtime.deserialize_cuda_engine(f.read())

    def do_inference(self, img_in):
        inputs, outputs, bindings, stream = self.buffers
        inputs[0].host = img_in
        for i in range(2):
            # Transfer input data to the GPU.
            [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
            # Run inference.
            self.context.execute_async(bindings=bindings, stream_handle=stream.handle)
            # Transfer predictions back from the GPU.
            [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
            # Synchronize the stream
            stream.synchronize()
        # Return only the host outputs.
        return [out.host for out in outputs]


class Hat(TRTModelPredict):
    def __init__(self, label_path, engine_path, shape=(608, 608), upload_url='http://192.168.0.15:8080/ai/upload/',
                 model_name='fire', alert_label=None):
        self.shape = shape
        self.alert_label = alert_label if alert_label else []
        self.class_names = open(label_path).read().strip().split("\n")
        self.model_name = model_name
        self.upload_url = upload_url.rstrip('/') + "/"
        super(Hat, self).__init__(engine_path, shape)

    def pre_process(self, image_src):
        # Resize, BGR -> RGB, HWC -> NCHW, scale to [0, 1]
        resized = cv2.resize(image_src, self.shape, interpolation=cv2.INTER_LINEAR)
        img_in = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
        img_in = np.transpose(img_in, (2, 0, 1)).astype(np.float32)
        img_in = np.expand_dims(img_in, axis=0)
        img_in /= 255.0
        img_in = np.ascontiguousarray(img_in)
        return img_in

    def post_processing(self, conf_thresh, nms_thresh, output):
        # [batch, num, 1, 4]
        box_array = output[0]
        # [batch, num, num_classes]
        confs = output[1]

        if type(box_array).__name__ != 'ndarray':
            box_array = box_array.cpu().detach().numpy()
            confs = confs.cpu().detach().numpy()

        num_classes = confs.shape[2]

        # [batch, num, 4]
        box_array = box_array[:, :, 0]

        # [batch, num, num_classes] --> [batch, num]
        max_conf = np.max(confs, axis=2)
        max_id = np.argmax(confs, axis=2)

        bboxes_batch = []
        for i in range(box_array.shape[0]):
            # Drop detections below the confidence threshold
            argwhere = max_conf[i] > conf_thresh
            l_box_array = box_array[i, argwhere, :]
            l_max_conf = max_conf[i, argwhere]
            l_max_id = max_id[i, argwhere]

            bboxes = []
            # NMS per class
            for j in range(num_classes):
                cls_argwhere = l_max_id == j
                ll_box_array = l_box_array[cls_argwhere, :]
                ll_max_conf = l_max_conf[cls_argwhere]
                ll_max_id = l_max_id[cls_argwhere]

                keep = self.nms_cpu(ll_box_array, ll_max_conf, nms_thresh)

                if keep.size > 0:
                    ll_box_array = ll_box_array[keep, :]
                    ll_max_conf = ll_max_conf[keep]
                    ll_max_id = ll_max_id[keep]

                    for k in range(ll_box_array.shape[0]):
                        # [x1, y1, x2, y2, det_conf, cls_conf, cls_id]
                        bboxes.append(
                            [ll_box_array[k, 0], ll_box_array[k, 1], ll_box_array[k, 2], ll_box_array[k, 3],
                             ll_max_conf[k],
                             ll_max_conf[k], ll_max_id[k]])

            bboxes_batch.append(bboxes)

        return bboxes_batch

    def nms_cpu(self, boxes, confs, nms_thresh=0.5, min_mode=False):
        x1 = boxes[:, 0]
        y1 = boxes[:, 1]
        x2 = boxes[:, 2]
        y2 = boxes[:, 3]

        areas = (x2 - x1) * (y2 - y1)
        # Process boxes in descending order of confidence
        order = confs.argsort()[::-1]

        keep = []
        while order.size > 0:
            idx_self = order[0]
            idx_other = order[1:]

            keep.append(idx_self)

            # Intersection of the current box with all remaining boxes
            xx1 = np.maximum(x1[idx_self], x1[idx_other])
            yy1 = np.maximum(y1[idx_self], y1[idx_other])
            xx2 = np.minimum(x2[idx_self], x2[idx_other])
            yy2 = np.minimum(y2[idx_self], y2[idx_other])

            w = np.maximum(0.0, xx2 - xx1)
            h = np.maximum(0.0, yy2 - yy1)
            inter = w * h

            if min_mode:
                over = inter / np.minimum(areas[order[0]], areas[order[1:]])
            else:
                # IoU = intersection / union
                over = inter / (areas[order[0]] + areas[order[1:]] - inter)

            # Keep only boxes whose overlap is below the NMS threshold
            inds = np.where(over <= nms_thresh)[0]
            order = order[inds + 1]

        return np.array(keep)

    def detect(self, img):
        img_in = self.pre_process(img)
        trt_outputs = self.do_inference(img_in)
        # Reshape the flat output buffers to [batch, num, 1, 4] and [batch, num, num_classes]
        trt_outputs[0] = trt_outputs[0].reshape(1, -1, 1, 4)
        trt_outputs[1] = trt_outputs[1].reshape(1, -1, len(self.class_names))

        boxes = self.post_processing(0.4, 0.6, trt_outputs)
        return self.plot_boxes(img, boxes[0])

    def plot_boxes(self, img, boxes):
        COLORS = np.random.randint(0, 255, size=(len(self.class_names), 3), dtype='uint8')

        width = img.shape[1]
        height = img.shape[0]
        flag = False
        for i in range(len(boxes)):
            box = boxes[i]
            # Coordinates are normalized; scale back to pixels
            x1 = int(box[0] * width)
            y1 = int(box[1] * height)
            x2 = int(box[2] * width)
            y2 = int(box[3] * height)

            if len(box) >= 7 and self.class_names:
                cls_conf = box[5]
                cls_id = box[6]

                if cls_id in self.alert_label:
                    flag = True

                color = [int(c) for c in COLORS[cls_id]]
                text = '{}: {:.3f}'.format(self.class_names[cls_id], cls_conf)
                img = cv2.putText(img, text, (x1, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 2)
                img = cv2.rectangle(img, (x1, y1), (x2, y2), color, 1)

        # cv2.imwrite('predictions_onnx.jpg', img)
        if flag:
            # Upload to the cloud file server
            img_path = self.upload_image(img)
        else:
            img_path = ""
        return img_path, flag

    def upload_image(self, img):
        file_name = "recognized_{}.jpg".format(int(time.time()))
        img_dir = "/".join([self.model_name, datetime.now().strftime("%Y%m")])
        data = cv2.imencode(".jpg", img)[1].tobytes()
        files = [
            ('file', (file_name, data, 'image/jpeg'))
        ]

        result = requests.post(self.upload_url + img_dir, files=files)
        if result.status_code == 200:
            path = img_dir + "/" + file_name
        else:
            path = ""
        return path
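
A minimal usage sketch for the engine built by the trtexec command above (the label file name and alert labels are illustrative):

# Engine file as produced by the trtexec command above; labels are hypothetical
detector = Hat(
    label_path="helmet.names",
    engine_path="yolov4_helmet_1_3_608_608_static.trt",
    shape=(608, 608),
    model_name="helmet",
    alert_label=[1])          # class IDs that trigger an upload
img_path, alerted = detector.detect(cv2.imread("test.jpg"))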

Performance Comparison

Using the helmet-detection model, the three inference approaches were benchmarked against each other; the results are as follows.

Model format | Inference time (s)  | Confidence | Notes
OpenCV       | 0.3789951801300049  | 0.533      |
ONNX         | 0.44134068489074707 | 0.425      |
TensorRT     | 0.12151765823364258 | 0.423      |

Surprisingly, OpenCV turned out to be even faster than ONNX. On closer inspection, the Jetson had onnxruntime installed rather than onnxruntime-gpu, so the preliminary suspicion is that the ONNX path never actually used GPU acceleration.
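
A quick way to verify which execution providers onnxruntime can actually use is something like the following check (the model file name is a placeholder):

import onnxruntime

# 'GPU' is reported only when the GPU build (onnxruntime-gpu) is installed
print(onnxruntime.get_device())
print(onnxruntime.get_available_providers())

# The providers a concrete session actually selected
session = onnxruntime.InferenceSession("yolov4_fire_1_3_416_416_static.onnx")
print(session.get_providers())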