从零开始 TensorRT（7）C++ 篇：解析 ONNX

前言

学习资料：
B站视频配套代码 cookbook

示例

参考源码：cookbook → 04-BuildEngineByONNXParser → pyTorch-ONNX-TensorRT

源码

C++ 代码量较多，已上传 GitHub
OpenCV 安装：

apt install libopencv-dev

（1）按 Python 篇中的方式将 ResNet-18 转为 ONNX

python generate_onnx.py

（2）编译运行

mkdir build
cd build
cmake ..
make
cd ../bin
./demo
./demo --fp16
./demo --int8

解析

在 cookbook 中，createCalibrationAndInferenceData.py 将 MNIST 数据存储为 npz 文件，并在 C++ 部分直接读取 Numpy 文件中的数据用于推理和校正，避免了图片解码的相关代码。
本文示例依然是参考 cookbook，使用 ResNet 进行推理。将读取 Numpy 文件的部分改为读取本地图像，并利用 OpenCV 对图像进行预处理，Int8 模式中的校正器部分代码也有所改动。

（1）预处理

std::vector<float> loadImg(const std::string filename, int width, int height, int channel) {cv::Mat image = cv::imread(filename, cv::IMREAD_COLOR);if (image.empty()) {std::cerr << "Error: Unable to read image file." << std::endl;return std::vector<float>();}cv::cvtColor(image, image, cv::COLOR_BGR2RGB);cv::resize(image, image, cv::Size(width, height));image.convertTo(image, CV_32F, 1.0 / 255.0);cv::Scalar meanData(0.485, 0.456, 0.406);cv::Scalar stdData(0.229, 0.224, 0.225);cv::subtract(image, meanData, image);cv::divide(image, stdData, image);"上面图像读取、resize、归一化、标准化都是调用 OpenCV API 与 Python 代码大同小异""下面是对数组维度进行调整: (h,w,3)->(3,h,w)""这里先把图像拆分成三个通道, 依次将三通道中的数据放到data中""通常会直接对图像数据进行遍历放到data中, 效率应该更高"std::vector<cv::Mat> channels;cv::split(image, channels);std::vector<float> data(channel * height * width);int idx = 0;for (int c = 0; c < channel; ++c) {for (int h = 0; h < height; ++h) {for (int w = 0; w < width; ++w) {data[idx++] = channels[c].at<float>(h, w);}}}return data;
}

（2）校准器

"主要是构造函数和 getBatch 与 cookbook 有所不同"
"先看原版"
"这里与 Python 篇中的校准器有所不同"
"Python: 在所有校准数据中随机抽样 batchsize 个循环校正 nCalibration 次"
"C++: 在所有校准数据中依次获取 batchsize 个, 直到剩余数据不足一个 batch, nCalibration 参数并没有用到"
/**
 * Cookbook version of the calibrator constructor: calibration samples are
 * read from the "calibrationData" array of an .npz file.
 *
 * @param calibrationDataFile path of the .npz file
 * @param nCalibration        stored in the member of the same name; not used
 *                            to bound the number of batches here (nBatch is
 *                            derived from the data size instead)
 * @param dim                 shape of one calibration batch
 * @param cacheFile           stored in the member of the same name
 */
MyCalibrator::MyCalibrator(const std::string &calibrationDataFile, const int nCalibration, const Dims32 dim, const std::string &cacheFile):
    nCalibration(nCalibration), dim(dim), cacheFile(cacheFile), iBatch(0)
{
    // Make sure getBatch() reports "no data" on every early-return path.
    nBatch = 0;

    cnpy::npz_t    npzFile = cnpy::npz_load(calibrationDataFile);
    cnpy::NpyArray array   = npzFile[std::string("calibrationData")];
    // NOTE(review): pData is obtained from the local `array`; whether it is
    // still valid after this constructor returns depends on how cnpy owns the
    // underlying storage -- confirm, or keep the NpyArray alive as a member.
    pData                  = array.data<float>();
    if (pData == nullptr)
    {
        std::cout << "Failed getting calibration data!" << std::endl;
        return;
    }

    // nElement: number of elements in one batch, i.e. the product of all
    // entries of `dim`.
    nElement = 1;
    for (int i = 0; i < dim.nbDims; ++i)
    {
        nElement *= dim.d[i];
    }

    // bufferSize: size in bytes of one batch.
    bufferSize = sizeof(float) * nElement;

    // nBatch replaces nCalibration as the loop bound. The upstream snippet
    // computed it before bufferSize was assigned, which reads an unset value;
    // it has to come after the bufferSize computation. Also avoid dividing by
    // zero when `dim` describes an empty tensor.
    nBatch = (bufferSize > 0) ? array.num_bytes() / bufferSize : 0;

    cudaMalloc((void **)&bufferD, bufferSize);
    return;
}

/**
 * Copy the next batch of calibration data to the device buffer and hand it
 * out through bindings[0].
 *
 * @return true while a full batch is available, false once iBatch has
 *         reached nBatch
 */
bool MyCalibrator::getBatch(void *bindings[], char const *names[], int32_t nbBindings) noexcept
{
    if (iBatch >= nBatch)
    {
        return false;
    }

    // Batches are taken sequentially: batch k starts at element k * nElement.
    cudaMemcpy(bufferD, &pData[iBatch * nElement], bufferSize, cudaMemcpyHostToDevice);
    bindings[0] = bufferD;
    iBatch++;
    return true;
}

"本文示例将 calibrationDataDir 文件夹内的图像文件作为校准数据, 代替 cookbook 中的 Numpy 数据"
MyCalibrator::MyCalibrator(const std::string &calibrationDataDir, const int nCalibration, const Dims32 dim, const std::string &cacheFile):nCalibration(nCalibration), dim(dim), cacheFile(cacheFile), iBatch(0) {"range-based loop, 用于遍历容器或其他可迭代对象中元素的循环结构""与 Python 中的循环类似 for entry in os.listdir(dir)""const: 变量只读""auto: 自动推导类型""&: 引用, 避免拷贝""fs::directory_iterator: C++17中<filesystem>提供的功能"for (const auto& entry : fs::directory_iterator(calibrationDataDir)) {if (fs::is_regular_file(entry)) {files.push_back(entry.path().string());}}nBatch = files.size() / dim.d[0];nElement = 1;for (int i = 0; i < dim.nbDims; ++i) {nElement *= dim.d[i];}bufferSize = sizeof(float) * nElement;cudaMalloc((void **)&bufferD, bufferSize);return;
}bool MyCalibrator::getBatch(void* bindings[], char const* names[], int32_t nbBindings) noexcept {if (iBatch < nBatch) {for (int i = 0; i < dim.d[0]; ++i) {"逐个读取图像, 并把数据拷贝到 bufferD 中对应位置"std::vector<float> img = loadImg(files[iBatch*dim.d[0]+i], dim.d[3], dim.d[2], dim.d[1]);cudaMemcpy(&bufferD[i*img.size()], img.data(), img.size()*sizeof(float), cudaMemcpyHostToDevice);}bindings[0] = bufferD;iBatch++;return true;}else {return false;}
}

一个奇怪的 Bug

在 int8 模式下，最初将校正时的 BatchSize 设置为 1（`calibrationBatchSize {1};`），而常见输入的 BatchSize 设置为 4（`profile->setDimensions(inputTensor->getName(), OptProfileSelector::kOPT, Dims32 {4, {4, nChannel, nHeight, nWidth}});`），此时出现如下报错：

Succeeded parsing .onnx file!
Failed finding cache file!
ERROR: 1: [calibrator.cpp::add::793] Error Code 1: Cuda Runtime (an illegal memory access was encountered)
ERROR: 1: [executionContext.cpp::commonEmitDebugTensor::1855] Error Code 1: Cuda Runtime (an illegal memory access was encountered)
ERROR: 1: [resizingAllocator.cpp::deallocate::105] Error Code 1: Cuda Runtime (an illegal memory access was encountered)
...
ERROR: 1: [resizingAllocator.cpp::deallocate::105] Error Code 1: Cuda Runtime (an illegal memory access was encountered)
ERROR: 3: [engine.cpp::~Engine::298] Error Code 3: API Usage Error (Parameter check failed at: runtime/api/engine.cpp::~Engine::298, condition: mExecutionContextCounter.use_count() == 1. Destroying an engine object before destroying the IExecutionContext objects it created leads to undefined behavior.
)
ERROR: 1: [cudaDriverHelpers.cpp::operator()::94] Error Code 1: Cuda Driver (an illegal memory access was encountered)
ERROR: 1: [cudaResources.cpp::~ScopedCudaStream::47] Error Code 1: Cuda Runtime (an illegal memory access was encountered)
ERROR: 2: [calibrator.cpp::calibrateEngine::1181] Error Code 2: Internal Error (Assertion context->executeV2(&bindings[0]) failed. )
Failed building serialized engine!