TensorRT Installation and Usage

Installation

Install CUDA and cuDNN beforehand, then log in to the NVIDIA developer site and download the TensorRT tarball that matches the CUDA version on your host.

Taking CUDA 10.2 as an example, pick the tar package built against CUDA 10.2, then install and test it with commands along these lines:

# Install the C++ package
cd /the/path/of/tensorrt/tar/gz/file
tar -zxvf TensorRT-8.2.5.1.linux.x86_64-gnu.cuda-10.2.cudnn8.2.tar.gz
export TENSORRT_DIR=$(pwd)/TensorRT-8.2.5.1
export LD_LIBRARY_PATH=$TENSORRT_DIR/lib:$LD_LIBRARY_PATH

# Install the Python package
pip install TensorRT-8.2.5.1/python/tensorrt-8.2.5.1-cp37-none-linux_x86_64.whl
python -c "import tensorrt;print(tensorrt.__version__)"  # prints 8.2.5.1 if the installation succeeded

Building a TRT Model

Building the Network by Hand

Using the Python API

import tensorrt as trt

verbose = True
IN_NAME = 'input'
OUT_NAME = 'output'
IN_H = 224
IN_W = 224
BATCH_SIZE = 1

EXPLICIT_BATCH = 1 << (int)(
    trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE) if verbose else trt.Logger()
with trt.Builder(TRT_LOGGER) as builder, builder.create_builder_config(
) as config, builder.create_network(EXPLICIT_BATCH) as network:
    # define network
    input_tensor = network.add_input(
        name=IN_NAME, dtype=trt.float32, shape=(BATCH_SIZE, 3, IN_H, IN_W))
    pool = network.add_pooling(
        input=input_tensor, type=trt.PoolingType.MAX, window_size=(2, 2))
    pool.stride = (2, 2)
    pool.get_output(0).name = OUT_NAME
    network.mark_output(pool.get_output(0))

    # serialize the model to engine file
    profile = builder.create_optimization_profile()
    profile.set_shape_input('input', *[[BATCH_SIZE, 3, IN_H, IN_W]] * 3)
    builder.max_batch_size = 1
    config.max_workspace_size = 1 << 30
    engine = builder.build_engine(network, config)
    with open('model_python_trt.engine', mode='wb') as f:
        f.write(bytearray(engine.serialize()))
        print("generating file done!")
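As a quick sanity check of the generated file, you can try loading it with the trtexec tool shipped in the tarball's bin/ directory (assuming it is on your PATH); it deserializes the engine and runs a benchmark pass on random inputs:

trtexec --loadEngine=model_python_trt.engine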

Using the C++ API

#include <cassert>
#include <fstream>
#include <iostream>

#include <NvInfer.h>
#include <../samples/common/logger.h>

using namespace nvinfer1;
using namespace sample;

const char* IN_NAME = "input";
const char* OUT_NAME = "output";
static const int IN_H = 224;
static const int IN_W = 224;
static const int BATCH_SIZE = 1;
static const int EXPLICIT_BATCH = 1 << (int)(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);

int main(int argc, char** argv)
{
    // Create builder
    Logger m_logger;
    IBuilder* builder = createInferBuilder(m_logger);
    IBuilderConfig* config = builder->createBuilderConfig();

    // Create model to populate the network
    INetworkDefinition* network = builder->createNetworkV2(EXPLICIT_BATCH);
    ITensor* input_tensor = network->addInput(IN_NAME, DataType::kFLOAT, Dims4{ BATCH_SIZE, 3, IN_H, IN_W });
    IPoolingLayer* pool = network->addPoolingNd(*input_tensor, PoolingType::kMAX, DimsHW{ 2, 2 });
    pool->setStrideNd(DimsHW{ 2, 2 });
    pool->getOutput(0)->setName(OUT_NAME);
    network->markOutput(*pool->getOutput(0));

    // Build engine
    IOptimizationProfile* profile = builder->createOptimizationProfile();
    profile->setDimensions(IN_NAME, OptProfileSelector::kMIN, Dims4(BATCH_SIZE, 3, IN_H, IN_W));
    profile->setDimensions(IN_NAME, OptProfileSelector::kOPT, Dims4(BATCH_SIZE, 3, IN_H, IN_W));
    profile->setDimensions(IN_NAME, OptProfileSelector::kMAX, Dims4(BATCH_SIZE, 3, IN_H, IN_W));
    config->setMaxWorkspaceSize(1 << 20);
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);

    // Serialize the model to engine file
    IHostMemory* modelStream{ nullptr };
    assert(engine != nullptr);
    modelStream = engine->serialize();

    std::ofstream p("model.engine", std::ios::binary);
    if (!p) {
        std::cerr << "could not open output file to save model" << std::endl;
        return -1;
    }
    p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
    std::cout << "generating file done!" << std::endl;

    // Release resources
    modelStream->destroy();
    network->destroy();
    engine->destroy();
    builder->destroy();
    config->destroy();
    return 0;
}
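To compile this example you need the TensorRT headers, the sample logger, and the CUDA include path. A possible compile line, assuming the source is saved as build_engine.cpp and CUDA lives under /usr/local/cuda (both are assumptions, adjust to your setup):

# assumes $TENSORRT_DIR from the installation step and a standard CUDA location
g++ build_engine.cpp $TENSORRT_DIR/samples/common/logger.cpp \
    -I$TENSORRT_DIR/include -I$TENSORRT_DIR/samples/common -I/usr/local/cuda/include \
    -L$TENSORRT_DIR/lib -L/usr/local/cuda/lib64 \
    -lnvinfer -lcudart -o build_engine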

Converting an ONNX Model

Using trtexec
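trtexec is the command-line tool shipped in the tarball's bin/ directory; it can convert an ONNX model into an engine without writing any code. A minimal invocation (the file names are placeholders):

trtexec --onnx=model.onnx --saveEngine=model.engine

Run trtexec --help for the full option list, such as precision and dynamic-shape settings.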

Using the Python API

import torch
import onnx
import tensorrt as trt

onnx_model = 'model.onnx'

class NaiveModel(torch.nn.Module):

    def __init__(self):
        super().__init__()
        self.pool = torch.nn.MaxPool2d(2, 2)

    def forward(self, x):
        return self.pool(x)

device = torch.device('cuda:0')

# generate ONNX model
torch.onnx.export(NaiveModel(), torch.randn(1, 3, 224, 224), onnx_model,
                  input_names=['input'], output_names=['output'], opset_version=11)
onnx_model = onnx.load(onnx_model)

# create builder and network
logger = trt.Logger(trt.Logger.ERROR)
builder = trt.Builder(logger)
EXPLICIT_BATCH = 1 << (int)(
    trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
network = builder.create_network(EXPLICIT_BATCH)

# parse onnx
parser = trt.OnnxParser(network, logger)

if not parser.parse(onnx_model.SerializeToString()):
    error_msgs = ''
    for error in range(parser.num_errors):
        error_msgs += f'{parser.get_error(error)}\n'
    raise RuntimeError(f'Failed to parse onnx, {error_msgs}')

config = builder.create_builder_config()
config.max_workspace_size = 1 << 20
profile = builder.create_optimization_profile()

profile.set_shape('input', [1, 3, 224, 224], [1, 3, 224, 224], [1, 3, 224, 224])
config.add_optimization_profile(profile)

# create engine
with torch.cuda.device(device):
    engine = builder.build_engine(network, config)

with open('model.engine', mode='wb') as f:
    f.write(bytearray(engine.serialize()))
print("generating file done!")
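The min/opt/max shapes passed to set_shape above are identical, so the resulting engine only accepts 1x3x224x224 inputs. If the model were exported with dynamic_axes, the same call could cover a real range; a sketch with purely illustrative shape values:

# hypothetical dynamic range: min / optimal / max input shapes
profile.set_shape('input', [1, 3, 224, 224], [1, 3, 512, 512], [1, 3, 1024, 1024])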

Using the C++ API

#include <cassert>
#include <fstream>
#include <iostream>

#include <NvInfer.h>
#include <NvOnnxParser.h>
#include <../samples/common/logger.h>

using namespace nvinfer1;
using namespace nvonnxparser;
using namespace sample;

int main(int argc, char** argv)
{
    // Create builder
    Logger m_logger;
    IBuilder* builder = createInferBuilder(m_logger);
    const auto explicitBatch = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    IBuilderConfig* config = builder->createBuilderConfig();

    // Create model to populate the network
    INetworkDefinition* network = builder->createNetworkV2(explicitBatch);

    // Parse ONNX file
    IParser* parser = nvonnxparser::createParser(*network, m_logger);
    bool parser_status = parser->parseFromFile("model.onnx", static_cast<int>(ILogger::Severity::kWARNING));

    // Get the name of network input
    Dims dim = network->getInput(0)->getDimensions();
    if (dim.d[0] == -1)  // -1 means it is a dynamic model
    {
        const char* name = network->getInput(0)->getName();
        IOptimizationProfile* profile = builder->createOptimizationProfile();
        profile->setDimensions(name, OptProfileSelector::kMIN, Dims4(1, dim.d[1], dim.d[2], dim.d[3]));
        profile->setDimensions(name, OptProfileSelector::kOPT, Dims4(1, dim.d[1], dim.d[2], dim.d[3]));
        profile->setDimensions(name, OptProfileSelector::kMAX, Dims4(1, dim.d[1], dim.d[2], dim.d[3]));
        config->addOptimizationProfile(profile);
    }

    // Build engine
    config->setMaxWorkspaceSize(1 << 20);
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);

    // Serialize the model to engine file
    IHostMemory* modelStream{ nullptr };
    assert(engine != nullptr);
    modelStream = engine->serialize();

    std::ofstream p("model.engine", std::ios::binary);
    if (!p) {
        std::cerr << "could not open output file to save model" << std::endl;
        return -1;
    }
    p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
    std::cout << "generate file success!" << std::endl;

    // Release resources
    modelStream->destroy();
    network->destroy();
    engine->destroy();
    builder->destroy();
    config->destroy();
    return 0;
}
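The compile line matches the one for the hand-built C++ example, with the ONNX parser library added (the source file name is again an assumption):

g++ onnx2engine.cpp $TENSORRT_DIR/samples/common/logger.cpp \
    -I$TENSORRT_DIR/include -I$TENSORRT_DIR/samples/common -I/usr/local/cuda/include \
    -L$TENSORRT_DIR/lib -L/usr/local/cuda/lib64 \
    -lnvinfer -lnvonnxparser -lcudart -o onnx2engine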

Model Inference

Using the Python API

# Feeds a 1x3x224x224 tensor in and gets a 1x3x112x112 tensor out
from typing import Union, Optional, Sequence, Dict, Any

import torch
import tensorrt as trt

class TRTWrapper(torch.nn.Module):

    def __init__(self, engine: Union[str, trt.ICudaEngine],
                 output_names: Optional[Sequence[str]] = None) -> None:
        super().__init__()
        self.engine = engine
        if isinstance(self.engine, str):
            with trt.Logger() as logger, trt.Runtime(logger) as runtime:
                with open(self.engine, mode='rb') as f:
                    engine_bytes = f.read()
                self.engine = runtime.deserialize_cuda_engine(engine_bytes)
        self.context = self.engine.create_execution_context()
        names = [_ for _ in self.engine]
        input_names = list(filter(self.engine.binding_is_input, names))
        self._input_names = input_names
        self._output_names = output_names

        if self._output_names is None:
            output_names = list(set(names) - set(input_names))
            self._output_names = output_names

    def forward(self, inputs: Dict[str, torch.Tensor]):
        assert self._input_names is not None
        assert self._output_names is not None
        bindings = [None] * (len(self._input_names) + len(self._output_names))
        profile_id = 0
        for input_name, input_tensor in inputs.items():
            # check if input shape is valid
            profile = self.engine.get_profile_shape(profile_id, input_name)
            assert input_tensor.dim() == len(
                profile[0]), 'Input dim is different from engine profile.'
            for s_min, s_input, s_max in zip(profile[0], input_tensor.shape,
                                             profile[2]):
                assert s_min <= s_input <= s_max, \
                    'Input shape should be between ' \
                    + f'{profile[0]} and {profile[2]}' \
                    + f' but get {tuple(input_tensor.shape)}.'
            idx = self.engine.get_binding_index(input_name)

            # All input tensors must be gpu variables
            assert 'cuda' in input_tensor.device.type
            input_tensor = input_tensor.contiguous()
            if input_tensor.dtype == torch.long:
                input_tensor = input_tensor.int()
            self.context.set_binding_shape(idx, tuple(input_tensor.shape))
            bindings[idx] = input_tensor.contiguous().data_ptr()

        # create output tensors
        outputs = {}
        for output_name in self._output_names:
            idx = self.engine.get_binding_index(output_name)
            dtype = torch.float32
            shape = tuple(self.context.get_binding_shape(idx))

            device = torch.device('cuda')
            output = torch.empty(size=shape, dtype=dtype, device=device)
            outputs[output_name] = output
            bindings[idx] = output.data_ptr()
        self.context.execute_async_v2(bindings,
                                      torch.cuda.current_stream().cuda_stream)
        return outputs

model = TRTWrapper('model.engine', ['output'])
output = model(dict(input=torch.randn(1, 3, 224, 224).cuda()))
print(output)

Using the C++ API

#include <cassert>
#include <fstream>
#include <iostream>

#include <NvInfer.h>
#include <cuda_runtime_api.h>
#include <../samples/common/logger.h>

#define CHECK(status) \
    do\
    {\
        auto ret = (status);\
        if (ret != 0)\
        {\
            std::cerr << "Cuda failure: " << ret << std::endl;\
            abort();\
        }\
    } while (0)

using namespace nvinfer1;
using namespace sample;

const char* IN_NAME = "input";
const char* OUT_NAME = "output";
static const int IN_H = 224;
static const int IN_W = 224;
static const int BATCH_SIZE = 1;
static const int EXPLICIT_BATCH = 1 << (int)(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);

void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
    const ICudaEngine& engine = context.getEngine();

    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly IEngine::getNbBindings() number of buffers.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine.getBindingIndex(IN_NAME);
    const int outputIndex = engine.getBindingIndex(OUT_NAME);

    // Create GPU buffers on device; 2x2 max-pooling halves H and W, so the output is 1/4 the input size
    CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * IN_H * IN_W * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex], batchSize * 3 * IN_H * IN_W / 4 * sizeof(float)));

    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * IN_H * IN_W * sizeof(float), cudaMemcpyHostToDevice, stream));
    context.enqueue(batchSize, buffers, stream, nullptr);
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * 3 * IN_H * IN_W / 4 * sizeof(float), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);

    // Release stream and buffers
    cudaStreamDestroy(stream);
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}

int main(int argc, char** argv)
{
    // Read the serialized engine from file
    char* trtModelStream{ nullptr };
    size_t size{ 0 };

    std::ifstream file("model.engine", std::ios::binary);
    if (file.good()) {
        file.seekg(0, file.end);
        size = file.tellg();
        file.seekg(0, file.beg);
        trtModelStream = new char[size];
        assert(trtModelStream);
        file.read(trtModelStream, size);
        file.close();
    }

    // Deserialize the engine and create an execution context
    Logger m_logger;
    IRuntime* runtime = createInferRuntime(m_logger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);
    delete[] trtModelStream;

    // generate input data
    float data[BATCH_SIZE * 3 * IN_H * IN_W];
    for (int i = 0; i < BATCH_SIZE * 3 * IN_H * IN_W; i++)
        data[i] = 1;

    // Run inference
    float prob[BATCH_SIZE * 3 * IN_H * IN_W / 4];
    doInference(*context, data, prob, BATCH_SIZE);

    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();
    return 0;
}
