pointnet C++推理部署--tensorrt框架

classification

在这里插入图片描述

如上图所示,由于直接export出的onnx文件有两个输出节点,不方便处理,所以编写脚本删除不需要的输出节点193:

# Remove the unwanted second output from the exported classifier so the
# model has a single output node, then save the result under a new name.
import onnx

onnx_model = onnx.load("cls.onnx")
graph = onnx_model.graph

# Print the graph inputs and outputs so the node being dropped can be verified.
for graph_input in graph.input:
    print('input', graph_input.name)
for graph_output in graph.output:
    print('output', graph_output.name)

# Only the first output is kept. The guard makes a re-run on an already
# pruned model a no-op instead of an IndexError.
if len(graph.output) > 1:
    graph.output.remove(graph.output[1])

onnx.save(onnx_model, 'cls_modified.onnx')

C++推理代码:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

#include <cuda_runtime.h>
#include <NvInfer.h>
#include <NvInferRuntime.h>
#include <NvOnnxParser.h>

// Uncomment to rebuild cls.engine from cls_modified.onnx before inference.
//#define BUILD_ENGINE

// Number of points the network consumes per cloud.
const int point_num = 1024;
// Number of scores read back from the network output buffer.
const int class_num = 10;

// Abort with file/line context when a CUDA runtime call fails; ignoring the
// return code lets one early failure poison every later call silently.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t cuda_err_ = (call);                                    \
        if (cuda_err_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(cuda_err_));                        \
            std::abort();                                                  \
        }                                                                  \
    } while (0)

// Centre an interleaved (x, y, z, x, y, z, ...) cloud on its centroid and
// scale it so the farthest point lies at distance 1 from the origin.
//
// points: modified in place. Every complete xyz triple is processed; a cloud
//         whose points all coincide is centred but not scaled (no division
//         by zero).
void pc_normalize(std::vector<float>& points)
{
    const size_t count = points.size() / 3;
    if (count == 0)
        return;

    // Accumulate in double so the mean does not lose precision on long sums.
    double sum_x = 0.0, sum_y = 0.0, sum_z = 0.0;
    for (size_t i = 0; i < count; ++i)
    {
        sum_x += points[3 * i];
        sum_y += points[3 * i + 1];
        sum_z += points[3 * i + 2];
    }
    const float mean_x = static_cast<float>(sum_x / count);
    const float mean_y = static_cast<float>(sum_y / count);
    const float mean_z = static_cast<float>(sum_z / count);

    // Subtract the centroid and track the largest squared norm in one pass;
    // a single sqrt at the end replaces two sqrt/pow evaluations per point.
    float max_sq = 0.0f;
    for (size_t i = 0; i < count; ++i)
    {
        float& px = points[3 * i];
        float& py = points[3 * i + 1];
        float& pz = points[3 * i + 2];
        px -= mean_x;
        py -= mean_y;
        pz -= mean_z;
        max_sq = std::max(max_sq, px * px + py * py + pz * pz);
    }

    const float m = std::sqrt(max_sq);
    if (!(m > 0.0f))
        return;

    for (size_t i = 0; i < 3 * count; ++i)
        points[i] /= m;
}

// TensorRT logger that prints messages of severity kINFO or more severe.
class TRTLogger : public nvinfer1::ILogger
{
public:
    virtual void log(Severity severity, nvinfer1::AsciiChar const* msg) noexcept override
    {
        // Never pass msg as the format string: a '%' in it would be
        // interpreted as a conversion specifier.
        if (severity <= Severity::kINFO)
            printf("%s\n", msg);
    }
} logger;

// Read a whole file as bytes. Returns an empty vector when the file cannot
// be opened, is empty, or cannot be read completely.
std::vector<unsigned char> load_file(const std::string& file)
{
    std::ifstream in(file, std::ios::in | std::ios::binary);
    if (!in.is_open())
        return {};

    in.seekg(0, std::ios::end);
    const std::streamoff length = in.tellg();  // -1 on failure

    std::vector<unsigned char> data;
    if (length > 0)
    {
        in.seekg(0, std::ios::beg);
        data.resize(static_cast<size_t>(length));
        in.read(reinterpret_cast<char*>(data.data()), length);
        if (!in)
            data.clear();
    }
    return data;
}

#ifdef BUILD_ENGINE
// Parse onnx_path, build a TensorRT engine and write it to engine_path.
// Returns true only when the serialized engine was written completely.
static bool build_engine(const char* onnx_path, const char* engine_path)
{
    nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(logger);
    if (builder == nullptr)
    {
        printf("Create builder failed.\n");
        return false;
    }
    nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();
    // Flag value 1 is what the original code passed to createNetworkV2.
    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(1U);
    nvonnxparser::IParser* parser = nvonnxparser::createParser(*network, logger);

    nvinfer1::ICudaEngine* engine = nullptr;
    bool ok = false;
    if (!parser->parseFromFile(onnx_path, 1))
    {
        printf("Failed to parser onnx\n");
    }
    else
    {
        // 4 GiB workspace; `1 << 32` overflows a 32-bit int, so shift a
        // 64-bit value instead.
        config->setMaxWorkspaceSize(1ULL << 32);
        engine = builder->buildEngineWithConfig(*network, *config);
        if (engine == nullptr)
        {
            printf("Build engine failed.\n");
        }
        else
        {
            nvinfer1::IHostMemory* model_data = engine->serialize();
            if (model_data != nullptr)
            {
                FILE* f = fopen(engine_path, "wb");
                if (f == nullptr)
                {
                    printf("Failed to open %s for writing.\n", engine_path);
                }
                else
                {
                    ok = fwrite(model_data->data(), 1, model_data->size(), f) == model_data->size();
                    fclose(f);
                }
                model_data->destroy();
            }
        }
    }

    // Release everything on every path, including the failure paths.
    if (engine != nullptr)
        engine->destroy();
    parser->destroy();
    network->destroy();
    config->destroy();
    builder->destroy();
    return ok;
}
#endif

// Run the classifier engine stored in cls.engine on one point cloud and
// print the index of the highest score.
//
// points: interleaved xyz values, already normalised; at least point_num
//         points are required and only the first point_num are used.
void classfier(std::vector<float> & points)
{
    if (points.size() < 3 * static_cast<size_t>(point_num))
    {
        printf("Expected %d points, got %d.\n", point_num, static_cast<int>(points.size() / 3));
        return;
    }

#ifdef BUILD_ENGINE
    if (!build_engine("cls_modified.onnx", "cls.engine"))
        return;
#endif

    const std::vector<unsigned char> engine_data = load_file("cls.engine");
    if (engine_data.empty())
    {
        printf("Failed to read cls.engine\n");
        return;
    }

    nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(logger);
    if (runtime == nullptr)
    {
        printf("Create runtime failed.\n");
        return;
    }
    nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(engine_data.data(), engine_data.size());
    if (engine == nullptr)
    {
        printf("Deserialize cuda engine failed.\n");
        runtime->destroy();
        return;
    }
    nvinfer1::IExecutionContext* execution_context = engine->createExecutionContext();
    if (execution_context == nullptr)
    {
        printf("Create execution context failed.\n");
        engine->destroy();
        runtime->destroy();
        return;
    }

    cudaStream_t stream = nullptr;
    CUDA_CHECK(cudaStreamCreate(&stream));

    const size_t input_numel = 1 * 3 * static_cast<size_t>(point_num);
    const size_t input_bytes = input_numel * sizeof(float);

    // Pinned staging buffer; the network input is planar (all x, all y,
    // all z), so transpose from the interleaved host layout.
    float* input_data_host = nullptr;
    CUDA_CHECK(cudaMallocHost(&input_data_host, input_bytes));
    for (size_t axis = 0; axis < 3; axis++)
    {
        for (size_t j = 0; j < static_cast<size_t>(point_num); j++)
        {
            input_data_host[point_num * axis + j] = points[3 * j + axis];
        }
    }

    float output_data_host[class_num] = {};
    float* input_data_device = nullptr;
    float* output_data_device = nullptr;
    CUDA_CHECK(cudaMalloc(&input_data_device, input_bytes));
    CUDA_CHECK(cudaMalloc(&output_data_device, sizeof(output_data_host)));

    CUDA_CHECK(cudaMemcpyAsync(input_data_device, input_data_host, input_bytes, cudaMemcpyHostToDevice, stream));

    // NOTE(review): assumes binding 0 is the input and binding 1 the output,
    // as the original code did - confirm against the engine's bindings.
    void* bindings[] = { input_data_device, output_data_device };
    const bool success = execution_context->enqueueV2(bindings, stream, nullptr);
    if (success)
    {
        CUDA_CHECK(cudaMemcpyAsync(output_data_host, output_data_device, sizeof(output_data_host), cudaMemcpyDeviceToHost, stream));
        CUDA_CHECK(cudaStreamSynchronize(stream));

        const int predict_label = static_cast<int>(
            std::max_element(output_data_host, output_data_host + class_num) - output_data_host);
        std::cout << "\npredict_label: " << predict_label << std::endl;
    }
    else
    {
        CUDA_CHECK(cudaStreamSynchronize(stream));
        printf("Inference enqueue failed.\n");
    }

    // Free the buffers the original version leaked.
    CUDA_CHECK(cudaFree(output_data_device));
    CUDA_CHECK(cudaFree(input_data_device));
    CUDA_CHECK(cudaFreeHost(input_data_host));
    CUDA_CHECK(cudaStreamDestroy(stream));
    execution_context->destroy();
    engine->destroy();
    runtime->destroy();
}

// Load point_num points from bed_0610.txt, normalise them on the CPU and
// classify them. Each record is read as six floats separated by single
// characters: x, y, z, nx, ny, nz; only x, y, z are used.
int main()
{
    std::ifstream infile("bed_0610.txt");
    if (!infile.is_open())
    {
        printf("Failed to open bed_0610.txt\n");
        return 1;
    }

    std::vector<float> points;
    points.reserve(3 * point_num);
    float x, y, z, nx, ny, nz;
    char ch;
    for (int i = 0; i < point_num; i++)
    {
        // Stop on a short or malformed file instead of pushing stale values.
        if (!(infile >> x >> ch >> y >> ch >> z >> ch >> nx >> ch >> ny >> ch >> nz))
        {
            printf("Failed to read point %d from bed_0610.txt\n", i);
            return 1;
        }
        points.push_back(x);
        points.push_back(y);
        points.push_back(z);
    }
    infile.close();

    pc_normalize(points);
    classfier(points);
    return 0;
}

其中推理引擎的构建也可以直接使用tensorrt的bin目录下的trtexec.exe。
LZ也实现了cuda版本的前处理代码,但似乎效率比cpu前处理还低。可能是数据量不够大吧(才10^3数量级),而且目前LZ的cuda水平也只是入门阶段…

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include <NvInfer.h>
#include <NvInferRuntime.h>
#include <NvOnnxParser.h>

// Uncomment to rebuild cls.engine from cls_modified.onnx before inference.
//#define BUILD_ENGINE

// Number of points the network consumes per cloud.
const int point_num = 1024;
// Threads per block for every preprocessing kernel (also sizes the static
// shared buffer in array_sum).
const int thread_num = 1024;
// Blocks per grid for the grid-stride preprocessing kernels.
const int block_num = 1;
// Number of scores read back from the network output buffer.
const int class_num = 10;

// Abort with file/line context when a CUDA runtime call fails; ignoring the
// return code lets one early failure poison every later call silently.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t cuda_err_ = (call);                                    \
        if (cuda_err_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(cuda_err_));                        \
            std::abort();                                                  \
        }                                                                  \
    } while (0)

// Add the sum of data[0..N) to *val.
//
// Launch: any 1-D grid; blockDim.x must not exceed thread_num (size of the
// static shared buffer). Each block reduces its partial sums in shared
// memory and issues one atomicAdd, so *val MUST be zeroed before the launch.
__global__ void array_sum(float* data, float* val, int N)
{
    __shared__ double share_dTemp[thread_num];

    const int nStep = gridDim.x * blockDim.x;
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // Per-thread partial sum, accumulated in double.
    double dTempSum = 0.0;
    for (int i = tid; i < N; i += nStep)
    {
        dTempSum += data[i];
    }
    share_dTemp[threadIdx.x] = dTempSum;
    __syncthreads();

    // Tree reduction that is also correct when blockDim.x is not a power of
    // two: each round folds the upper part onto the lower ceil(active / 2).
    for (unsigned int active = blockDim.x; active > 1;)
    {
        const unsigned int half = (active + 1) / 2;
        if (threadIdx.x + half < active)
        {
            share_dTemp[threadIdx.x] += share_dTemp[threadIdx.x + half];
        }
        __syncthreads();  // reached by every thread: `active` is block-uniform
        active = half;
    }

    if (0 == threadIdx.x)
    {
        atomicAdd(val, static_cast<float>(share_dTemp[0]));
    }
}

// data[i] -= val for i in [0, N). Any 1-D launch configuration is valid.
__global__ void array_sub(float* data, float val, int N)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const int nStep = blockDim.x * gridDim.x;
    for (int i = tid; i < N; i += nStep)
    {
        data[i] = data[i] - val;
    }
}

// out[i] = Euclidean norm of point i, where `in` is planar: x at in[i],
// y at in[i + N], z at in[i + 2 * N]. Any 1-D launch configuration is valid.
__global__ void array_L2(float* in, float* out, int N)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const int nStep = blockDim.x * gridDim.x;
    for (int i = tid; i < N; i += nStep)
    {
        const float x = in[i];
        const float y = in[i + N];
        const float z = in[i + 2 * N];
        // Single-precision math; pow/sqrt would promote to double.
        out[i] = sqrtf(x * x + y * y + z * z);
    }
}

// Store the maximum of mem[0..numbers) in mem[0].
//
// Launch: dynamic shared memory of blockDim.x * sizeof(float) bytes. Only
// block 0 does the reduction (its threads stride over all elements); any
// extra blocks exit immediately, so no block writes memory another block is
// still reading.
__global__ void array_max(float* mem, int numbers)
{
    extern __shared__ float tep[];

    // Block-uniform exit, taken before any barrier.
    if (blockIdx.x != 0 || numbers <= 0) return;

    const unsigned int tid = threadIdx.x;

    // Seeding with mem[0] keeps threads beyond `numbers` harmless.
    float best = mem[0];
    for (int i = tid; i < numbers; i += blockDim.x)
    {
        best = fmaxf(best, mem[i]);
    }
    tep[tid] = best;
    __syncthreads();  // all loads from mem are complete past this point

    for (unsigned int active = blockDim.x; active > 1;)
    {
        const unsigned int half = (active + 1) / 2;
        if (tid + half < active)
        {
            tep[tid] = fmaxf(tep[tid], tep[tid + half]);
        }
        __syncthreads();
        active = half;
    }

    if (tid == 0)
    {
        mem[0] = tep[0];
    }
}

// data[i] /= val for i in [0, N). Any 1-D launch configuration is valid.
__global__ void array_div(float* data, float val, int N)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const int nStep = blockDim.x * gridDim.x;
    for (int i = tid; i < N; i += nStep)
    {
        data[i] = data[i] / val;
    }
}

// Normalise a planar device point cloud in place: subtract the per-axis
// mean, then divide by the largest point norm.
//
// points: device pointer to 3 * point_num floats laid out as all x, all y,
//         all z.
// stream: stream on which all work is issued (defaults to the default
//         stream). The function synchronises this stream twice to read
//         intermediate results back to the host.
void pc_normalize_gpu(float* points, cudaStream_t stream = nullptr)
{
    // One scratch allocation: three per-axis sums followed by point_num norms.
    float* scratch = nullptr;
    CUDA_CHECK(cudaMalloc(&scratch, sizeof(float) * (3 + point_num)));
    float* sums = scratch;
    float* norms = scratch + 3;

    // array_sum accumulates with atomicAdd, so the targets must start at 0;
    // cudaMalloc does not zero memory.
    CUDA_CHECK(cudaMemsetAsync(sums, 0, 3 * sizeof(float), stream));
    for (int axis = 0; axis < 3; ++axis)
    {
        // <<<grid, block>>>: block_num blocks of thread_num threads.
        array_sum<<<block_num, thread_num, 0, stream>>>(points + axis * point_num, sums + axis, point_num);
        CUDA_CHECK(cudaGetLastError());
    }

    float host_sums[3] = { 0.0f, 0.0f, 0.0f };
    CUDA_CHECK(cudaMemcpyAsync(host_sums, sums, sizeof(host_sums), cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(cudaStreamSynchronize(stream));

    for (int axis = 0; axis < 3; ++axis)
    {
        array_sub<<<block_num, thread_num, 0, stream>>>(points + axis * point_num, host_sums[axis] / point_num, point_num);
        CUDA_CHECK(cudaGetLastError());
    }

    array_L2<<<block_num, thread_num, 0, stream>>>(points, norms, point_num);
    CUDA_CHECK(cudaGetLastError());

    // Single-block reduction over all point_num norms; result in norms[0].
    array_max<<<1, thread_num, thread_num * sizeof(float), stream>>>(norms, point_num);
    CUDA_CHECK(cudaGetLastError());

    float max_norm = 0.0f;
    CUDA_CHECK(cudaMemcpyAsync(&max_norm, norms, sizeof(float), cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(cudaStreamSynchronize(stream));

    // All three planes share the divisor, so one launch covers them. Skip
    // the division for a degenerate cloud to avoid dividing by zero.
    if (max_norm > 0.0f)
    {
        array_div<<<block_num, thread_num, 0, stream>>>(points, max_norm, 3 * point_num);
        CUDA_CHECK(cudaGetLastError());
    }

    // The scratch buffer is no longer referenced by any pending kernel.
    CUDA_CHECK(cudaFree(scratch));
}

// TensorRT logger that prints messages of severity kINFO or more severe.
class TRTLogger : public nvinfer1::ILogger
{
public:
    virtual void log(Severity severity, nvinfer1::AsciiChar const* msg) noexcept override
    {
        // Never pass msg as the format string: a '%' in it would be
        // interpreted as a conversion specifier.
        if (severity <= Severity::kINFO)
            printf("%s\n", msg);
    }
} logger;

// Read a whole file as bytes. Returns an empty vector when the file cannot
// be opened, is empty, or cannot be read completely.
std::vector<unsigned char> load_file(const std::string& file)
{
    std::ifstream in(file, std::ios::in | std::ios::binary);
    if (!in.is_open())
        return {};

    in.seekg(0, std::ios::end);
    const std::streamoff length = in.tellg();  // -1 on failure

    std::vector<unsigned char> data;
    if (length > 0)
    {
        in.seekg(0, std::ios::beg);
        data.resize(static_cast<size_t>(length));
        in.read(reinterpret_cast<char*>(data.data()), length);
        if (!in)
            data.clear();
    }
    return data;
}

#ifdef BUILD_ENGINE
// Parse onnx_path, build a TensorRT engine and write it to engine_path.
// Returns true only when the serialized engine was written completely.
static bool build_engine(const char* onnx_path, const char* engine_path)
{
    nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(logger);
    if (builder == nullptr)
    {
        printf("Create builder failed.\n");
        return false;
    }
    nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();
    // Flag value 1 is what the original code passed to createNetworkV2.
    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(1U);
    nvonnxparser::IParser* parser = nvonnxparser::createParser(*network, logger);

    nvinfer1::ICudaEngine* engine = nullptr;
    bool ok = false;
    if (!parser->parseFromFile(onnx_path, 1))
    {
        printf("Failed to parser onnx\n");
    }
    else
    {
        // 4 GiB workspace; `1 << 32` overflows a 32-bit int, so shift a
        // 64-bit value instead.
        config->setMaxWorkspaceSize(1ULL << 32);
        engine = builder->buildEngineWithConfig(*network, *config);
        if (engine == nullptr)
        {
            printf("Build engine failed.\n");
        }
        else
        {
            nvinfer1::IHostMemory* model_data = engine->serialize();
            if (model_data != nullptr)
            {
                FILE* f = fopen(engine_path, "wb");
                if (f == nullptr)
                {
                    printf("Failed to open %s for writing.\n", engine_path);
                }
                else
                {
                    ok = fwrite(model_data->data(), 1, model_data->size(), f) == model_data->size();
                    fclose(f);
                }
                model_data->destroy();
            }
        }
    }

    // Release everything on every path, including the failure paths.
    if (engine != nullptr)
        engine->destroy();
    parser->destroy();
    network->destroy();
    config->destroy();
    builder->destroy();
    return ok;
}
#endif

// Upload one point cloud, normalise it on the GPU, run the classifier
// engine stored in cls.engine and print the index of the highest score.
//
// points: interleaved xyz values (not yet normalised); at least point_num
//         points are required and only the first point_num are used.
void classfier(std::vector<float> & points)
{
    if (points.size() < 3 * static_cast<size_t>(point_num))
    {
        printf("Expected %d points, got %d.\n", point_num, static_cast<int>(points.size() / 3));
        return;
    }

#ifdef BUILD_ENGINE
    if (!build_engine("cls_modified.onnx", "cls.engine"))
        return;
#endif

    const std::vector<unsigned char> engine_data = load_file("cls.engine");
    if (engine_data.empty())
    {
        printf("Failed to read cls.engine\n");
        return;
    }

    nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(logger);
    if (runtime == nullptr)
    {
        printf("Create runtime failed.\n");
        return;
    }
    nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(engine_data.data(), engine_data.size());
    if (engine == nullptr)
    {
        printf("Deserialize cuda engine failed.\n");
        runtime->destroy();
        return;
    }
    nvinfer1::IExecutionContext* execution_context = engine->createExecutionContext();
    if (execution_context == nullptr)
    {
        printf("Create execution context failed.\n");
        engine->destroy();
        runtime->destroy();
        return;
    }

    cudaStream_t stream = nullptr;
    CUDA_CHECK(cudaStreamCreate(&stream));

    const size_t input_numel = 1 * 3 * static_cast<size_t>(point_num);
    const size_t input_bytes = input_numel * sizeof(float);

    // Pinned staging buffer; the kernels and the network input use a planar
    // layout (all x, all y, all z), so transpose from the interleaved one.
    float* input_data_host = nullptr;
    CUDA_CHECK(cudaMallocHost(&input_data_host, input_bytes));
    for (size_t axis = 0; axis < 3; axis++)
    {
        for (size_t j = 0; j < static_cast<size_t>(point_num); j++)
        {
            input_data_host[point_num * axis + j] = points[3 * j + axis];
        }
    }

    float output_data_host[class_num] = {};
    float* input_data_device = nullptr;
    float* output_data_device = nullptr;
    CUDA_CHECK(cudaMalloc(&input_data_device, input_bytes));
    CUDA_CHECK(cudaMalloc(&output_data_device, sizeof(output_data_host)));

    CUDA_CHECK(cudaMemcpyAsync(input_data_device, input_data_host, input_bytes, cudaMemcpyHostToDevice, stream));

    // Preprocess on the same stream as the upload and the inference so the
    // three steps are ordered without relying on default-stream semantics.
    pc_normalize_gpu(input_data_device, stream);

    // NOTE(review): assumes binding 0 is the input and binding 1 the output,
    // as the original code did - confirm against the engine's bindings.
    void* bindings[] = { input_data_device, output_data_device };
    const bool success = execution_context->enqueueV2(bindings, stream, nullptr);
    if (success)
    {
        CUDA_CHECK(cudaMemcpyAsync(output_data_host, output_data_device, sizeof(output_data_host), cudaMemcpyDeviceToHost, stream));
        CUDA_CHECK(cudaStreamSynchronize(stream));

        const int predict_label = static_cast<int>(
            std::max_element(output_data_host, output_data_host + class_num) - output_data_host);
        std::cout << "\npredict_label: " << predict_label << std::endl;
    }
    else
    {
        CUDA_CHECK(cudaStreamSynchronize(stream));
        printf("Inference enqueue failed.\n");
    }

    // Free the buffers the original version leaked.
    CUDA_CHECK(cudaFree(output_data_device));
    CUDA_CHECK(cudaFree(input_data_device));
    CUDA_CHECK(cudaFreeHost(input_data_host));
    CUDA_CHECK(cudaStreamDestroy(stream));
    execution_context->destroy();
    engine->destroy();
    runtime->destroy();
}

// Load point_num points from sofa_0020.txt and classify them; normalisation
// happens on the GPU inside classfier. Each record is read as six floats
// separated by single characters: x, y, z, nx, ny, nz; only x, y, z are used.
int main()
{
    std::ifstream infile("sofa_0020.txt");
    if (!infile.is_open())
    {
        printf("Failed to open sofa_0020.txt\n");
        return 1;
    }

    std::vector<float> points;
    points.reserve(3 * point_num);
    float x, y, z, nx, ny, nz;
    char ch;
    for (int i = 0; i < point_num; i++)
    {
        // Stop on a short or malformed file instead of pushing stale values.
        if (!(infile >> x >> ch >> y >> ch >> z >> ch >> nx >> ch >> ny >> ch >> nz))
        {
            printf("Failed to read point %d from sofa_0020.txt\n", i);
            return 1;
        }
        points.push_back(x);
        points.push_back(y);
        points.push_back(z);
    }
    infile.close();

    classfier(points);
    return 0;
}

本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如若转载,请注明出处:http://www.mzph.cn/news/44189.shtml

如若内容造成侵权/违法违规/事实不符,请联系多彩编程网进行投诉反馈email:809451989@qq.com,一经查实,立即删除!

相关文章

【C++】C++入门基础:引用详解

本篇继续分享关于C++入门的相关知识,有关命名空间、缺省参数和函数重载的部分欢迎阅读我的上一篇文章【C++】C++入门基础详解(1)_王笃笃的博客-CSDN博客 继续我们的学习 引用 在C语言中我们接触过指针,很多人都或多或少为他感到头痛过…

使用SSH隧道将Ubuntu云服务器Jupyter Notebook端口映射到本地

本文主要实现了在Ubuntu云服务器后台运行Jupyter Notebook,并使用SSH隧道将服务器端口映射到本地 1. 生成配置文件 运行以下命令生成Jupyter Notebook的配置文件: jupyter notebook --generate-config 这将在用户主目录下生成一个名为.jupyter的文件夹…

【傅里叶级数与傅里叶变换】数学推导——3、[Part4:傅里叶级数的复数形式] + [Part5:从傅里叶级数推导傅里叶变换] + 总结

文章内容来自DR_CAN关于傅里叶变换的视频,本篇文章提供了一些基础知识点,比如三角函数常用的导数、三角函数换算公式等。 文章全部链接: 基础知识点 Part1:三角函数系的正交性 Part2:T=2π的周期函数的傅里叶级数展开 P…

【Rust日报】2023-08-18 RustShip:一个新的 Rust 播客

探索 Rust 编译器基准测试套件 在最近关于 Rust 编译器 CI(持续集成)和基准测试基础设施的文章中,作者承诺写一篇关于运行时基准测试的博客文章,这是 Rust 编译器基准测试套件的新补充。然而,在这样做之前,…

回归预测 | MATLAB实现SSA-SVM麻雀搜索算法优化支持向量机多输入单输出回归预测(多指标,多图)

回归预测 | MATLAB实现SSA-SVM麻雀搜索算法优化支持向量机多输入单输出回归预测&#xff08;多指标&#xff0c;多图&#xff09; 目录 回归预测 | MATLAB实现SSA-SVM麻雀搜索算法优化支持向量机多输入单输出回归预测&#xff08;多指标&#xff0c;多图&#xff09;效果一览基…

aardio窗体缩放自动匹配批量生成plus实例

import win.ui; /*DSG{{*/ var winform win.form(text"窗体缩放批量生成plus";right759;bottom469;bgcolor15780518) winform.add( custom{cls"custom";text"自定义控件";left3;top6;right753;bottom460;ah1;aw1;bgcolor15780518;z1} ) /*}}*//…

UML基础模型

目录 1.抽象类2.接口3.继承4.实现接口5.关联关系6.聚合关系7.合成&#xff08;组合&#xff09;关系8.依赖关系 1.抽象类 矩形框代表一个类&#xff08;Class&#xff09;。 类图分为三层&#xff1a; 第一层显示类的名称&#xff0c;如果是抽象类&#xff0c;就用斜体显示&am…

操作系统的体系结构、内核、虚拟机

🐌个人主页: 🐌 叶落闲庭 💨我的专栏:💨 c语言 数据结构 javaweb 石可破也,而不可夺坚;丹可磨也,而不可夺赤。 操作系统结构 一、操作系统体系结构1.1操作系统的内核1.1.…

TiDB 多集群告警监控-中章-融合多集群 Grafana

作者&#xff1a; longzhuquan 原文来源&#xff1a; https://tidb.net/blog/ac730b0f 背景 随着公司XC改造步伐的前进&#xff0c;越来越多的业务选择 TiDB&#xff0c;由于各个业务之间需要物理隔离&#xff0c;避免不了的 TiDB 集群数量越来越多。虽然每套 TiDB 集群均有…

Gateway网关路由以及predicates用法(项目中使用场景)

1.Gatewaynacos整合微服务 服务注册在nacos上&#xff0c;通过Gateway路由网关配置统一路由访问 这里主要通过yml方式说明&#xff1a; route: config: #type:database nacos yml data-type: yml group: DEFAULT_GROUP data-id: jeecg-gateway-router 配置路由&#xff1a;…

宁德时代与陕汽签署十年战略合作协议,助力商用车电动化进程

据报道,宁德时代新能源科技股份有限公司与陕西汽车控股集团有限公司已经签署了一项为期十年的战略合作协议。双方的合作旨在推动商用车电池技术的发展,并面向商用车全领域应用。 这次战略合作具有重要意义,为宁德时代和陕汽启动了全面合作的序…

2021年3月全国计算机等级考试真题(C语言二级)

2021年3月全国计算机等级考试真题&#xff08;C语言二级&#xff09; 第1题 算法空间复杂度的度量方法是&#xff08;&#xff09; A. 算法程序的长度 B. 算法所处理的数据量 C. 执行算法所需要的工作单元 D. 执行算法所需要的存储空间 正确答案&#xff1a;D 第2题 下列叙…

【自创】关于前端js的“嵌套地狱”的遍历算法

欢迎大家关注我的CSDN账号 欢迎大家关注我的哔哩哔哩账号&#xff1a;卢淼儿的个人空间-卢淼儿个人主页-哔哩哔哩视频 此saas系统我会在9月2号之前&#xff0c;在csdn及哔哩哔哩上发布成套系列教学视频。敬请期待&#xff01;&#xff01;&#xff01; 首先看图 这是我们要解…

Spring Boot 知识集锦之Spring-Batch批处理组件详解

文章目录 0.前言1.参考文档2.基础介绍2.1. 核心组件 3.步骤3.1. 引入依赖3.2. 配置文件3.3. 核心源码 4.示例项目5.总结 0.前言 背景&#xff1a; 一直零散的使用着Spring Boot 的各种组件和特性&#xff0c;从未系统性的学习和总结&#xff0c;本次借着这个机会搞一波。共同学…

无涯教程-TensorFlow - TensorBoard可视化

TensorFlow包含一个可视化工具&#xff0c;称为TensorBoard&#xff0c;它用于分析数据流图&#xff0c;还用于了解机器学习模型。 TensorBoard的重要功能包括查看有关垂直对齐的任何图形的参数和详细信息的不同类型统计的视图。 深度神经网络包括多达36&#xff0c;000个节点…

HCIP——VLAN实验2

一.实验要求 1.PC1/3的接口均为access模式&#xff0c;且属于van2&#xff0c;在同一网段 2.PC2/4/5/6的IP地址在同一网段&#xff0c;与PC1/3不在同一网段 3.PC2可以访问4/5/6&#xff0c;PC4不能访问5/6&#xff0c;PC5不能访问PC6 4.所有PC通过DHCP获取ip地址&#xff0c;PC…

《合成孔径雷达成像算法与实现》Figure3.10

代码复现如下&#xff1a; clc clear close all% 参数设置 TBP 100; % 时间带宽积 T 7.2e-6; % 脉冲持续时间 t_0 1e-6; % 脉冲回波时延% 参数计算 B TBP/T; …

unity 之Transform组件(汇总)

文章目录 理论指导结合例子 理论指导 当在Unity中处理3D场景中的游戏对象时&#xff0c;Transform 组件是至关重要的组件之一。它管理了游戏对象的位置、旋转和缩放&#xff0c;并提供了许多方法来操纵和操作这些属性。以下是关于Transform 组件的详细介绍&#xff1a; 位置&a…

C++进阶 特殊类的设计

本篇博客介绍&#xff1a;介绍几种特殊的类 特殊类的设计 设计一个类不能被拷贝设计一个类 只能在堆上创建对象设计一个类 只能在栈上创造对象设计一个类不能被继承单例模式饿汉模式懒汉模式单例模式对象的释放问题 总结 设计一个类不能被拷贝 我们的拷贝只会发生在两个场景当…

利用POM完成脚本分离实现企业级自动化(POM设计模式+页面的框架封装+测试报告截图)

利用POM完成脚本分离实现企业级自动化&#xff08;POM设计模式页面的框架封装测试报告截图&#xff09; 项目-测试-手工测试 项目-测试-手工测试 1.了解需求&#xff1b; 2.编写测试用例&#xff08;开始&#xff09;——功能测试组会去做的事情 3.执行测试用例——发送测试报…