
pytorch2tensorrt: the full pipeline

Server environment:

NVIDIA Jetson AGX Xavier - Jetpack 4.5.1 [L4T 32.5.1]

  • torch ==1.9.0
  • onnx == 1.10.2
  • pycuda==2020.1
  • cv2==4.1.1
  • tensorrt==7.1.3.0
  • python 3.6.9
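
A quick sanity check of the installed versions on the device (a minimal sketch; every module here is from the list above):

import torch, onnx, cv2
import tensorrt as trt

print("torch    ", torch.__version__)
print("onnx     ", onnx.__version__)
print("tensorrt ", trt.__version__)
print("cv2      ", cv2.__version__)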

1. pytorch2onnx

import torch
from torchvision import models


def transform_to_onnx(model, shape, onnx_file_name, input_names=["input"], output_names=["boxes", "confs"]):
    batch_size = shape[0]
    dynamic = batch_size <= 0
    if dynamic:
        # Batch dimension is dynamic: trace with batch size 1 and mark
        # axis 0 of every input/output as variable.
        x = torch.randn((1, *shape[1:]), requires_grad=True)
        dynamic_axes = {name: {0: "batch_size"} for name in input_names + output_names}
        # Export the model
        print('Export the onnx model ...')
        torch.onnx.export(model,
                          x,
                          onnx_file_name,
                          export_params=True,
                          do_constant_folding=True,
                          input_names=input_names, output_names=output_names,
                          dynamic_axes=dynamic_axes)
        print('Onnx model exporting done')
        return onnx_file_name
    else:
        x = torch.randn(shape, requires_grad=True)
        # Export the model
        print('Export the onnx model ...')
        torch.onnx.export(model,
                          x,
                          onnx_file_name,
                          export_params=True,
                          do_constant_folding=True,
                          input_names=input_names, output_names=output_names,
                          dynamic_axes=None)
        print('Onnx model exporting done')
        return onnx_file_name

Test it with a ResNet model:

model = models.resnet50(pretrained=True)
transform_to_onnx(model,
                  shape=(1, 3, 224, 224),
                  onnx_file_name="resnet50.onnx",
                  input_names=["input"],
                  output_names=["output"])

resnet50.onnx is generated in the current directory, and the conversion is correct.
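
Before moving on, it is worth validating the exported graph with the onnx package already listed in the environment above; a minimal sketch:

import onnx

onnx_model = onnx.load("resnet50.onnx")
onnx.checker.check_model(onnx_model)  # raises if the graph is malformed
print(onnx.helper.printable_graph(onnx_model.graph))  # readable summary of the graph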

2. Visualizing the ONNX model

You can use the netron tool to visualize the ONNX model.

python -m pip install netron
netron -h
usage: netron [-h] [-v] [-b] [-p PORT] [--host HOST] [--log] [MODEL_FILE]

Viewer for neural network, deep learning and machine learning models.

positional arguments:
  MODEL_FILE            model file to serve

optional arguments:
  -h, --help            show this help message and exit
  -v, --version         print version
  -b, --browse          launch web browser
  -p PORT, --port PORT  port to serve
  --host HOST           host to serve
  --log                 log details to console

Start netron:

netron --host 0.0.0.0 --port 8080

Open the onnx file and you can inspect the model's parameters in detail.

[Screenshot: Netron view of resnet50.onnx]

3. onnx2trt

With the trtexec tool that ships with TensorRT, converting ONNX to a TRT engine is straightforward.

/usr/src/tensorrt/bin/trtexec --onnx=resnet50.onnx --explicitBatch --saveEngine=resnet50.trt --fp16

You can also symlink the binary so that it is available on your PATH:

ln -s /usr/src/tensorrt/bin/trtexec /usr/bin/trtexec
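
If the model was exported with a dynamic batch dimension (batch_size <= 0 in transform_to_onnx above), trtexec needs an explicit shape range instead of --explicitBatch alone. A sketch, assuming the input binding is named input as in the export code and the file name resnet50_dynamic.onnx is a placeholder:

trtexec --onnx=resnet50_dynamic.onnx \
        --minShapes=input:1x3x224x224 \
        --optShapes=input:4x3x224x224 \
        --maxShapes=input:8x3x224x224 \
        --saveEngine=resnet50_dynamic.trt \
        --fp16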

4. Testing

Build a generic TensorRT model-loading utility, as follows:

# author: sunshine
# datetime: 2021/12/9 2:39 PM

import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit  # important: initializes the CUDA context


TRT_LOGGER = trt.Logger()


class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()


class TRTModelPredict:
    def __init__(self, engine_path, shape=(608, 608)):
        shape = (1, 3, shape[0], shape[1])
        self.engine = self.get_engine(engine_path)
        self.context = self.engine.create_execution_context()

        self.buffers = self.allocate_buffers(self.engine, 1)
        self.context.set_binding_shape(0, shape)

    def allocate_buffers(self, engine, batch_size):
        inputs = []
        outputs = []
        bindings = []
        stream = cuda.Stream()
        for binding in engine:
            size = trt.volume(engine.get_binding_shape(binding)) * batch_size
            dims = engine.get_binding_shape(binding)

            # in case batch dimension is -1 (dynamic)
            if dims[0] < 0:
                size *= -1

            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(device_mem))
            # Append to the appropriate list.
            if engine.binding_is_input(binding):
                inputs.append(HostDeviceMem(host_mem, device_mem))
            else:
                outputs.append(HostDeviceMem(host_mem, device_mem))
        return inputs, outputs, bindings, stream

    def get_engine(self, engine_path):
        # If a serialized engine exists, use it instead of building an engine.
        print("Reading engine from file {}".format(engine_path))
        with open(engine_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
            return runtime.deserialize_cuda_engine(f.read())

    def do_inference(self, img_in):
        inputs, outputs, bindings, stream = self.buffers
        inputs[0].host = img_in
        # Run twice: the first pass warms up the engine, so only the
        # second reflects steady-state latency.
        for i in range(2):
            # Transfer input data to the GPU.
            [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
            # Run inference.
            self.context.execute_async(bindings=bindings, stream_handle=stream.handle)
            # Transfer predictions back from the GPU.
            [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
            # Synchronize the stream
            stream.synchronize()
        # Return only the host outputs.
        return [out.host for out in outputs]

Taking helmet detection as an example, you can subclass TRTModelPredict and implement the data pre- and post-processing to get a complete prediction pipeline, as sketched below.
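
As an illustration of that pattern (not the helmet-detection code from the repo), here is a minimal sketch of a subclass that runs the resnet50.trt engine built above; the preprocessing constants and class name are assumptions:

import cv2
import numpy as np


class ResNetPredict(TRTModelPredict):
    def __init__(self, engine_path):
        super().__init__(engine_path, shape=(224, 224))

    def preprocess(self, image_path):
        # BGR -> RGB, resize to the engine's input size, scale to [0, 1], HWC -> NCHW
        img = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
        img = cv2.resize(img, (224, 224)).astype(np.float32) / 255.0
        img = np.transpose(img, (2, 0, 1))[None]  # shape (1, 3, 224, 224)
        return np.ascontiguousarray(img)

    def predict(self, image_path):
        logits = self.do_inference(self.preprocess(image_path))[0]
        return int(np.argmax(logits))  # top-1 class index


# Usage (paths are placeholders):
# predictor = ResNetPredict("resnet50.trt")
# print(predictor.predict("test.jpg"))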

The result looks like this:

[Screenshot: helmet-detection results]

Full code: https://github.com/fushengwuyu/torch2tensorrt_demos