Table of Contents
- 1 The problem of continuously growing memory
- 1.1 Copying data from the host to the GPU
- 1.1.1 htop: memory grows for a while, then stays stable
- 1.2 With an added GPU-to-host copy, memory stays stable
- 1.3 Conclusion
- 2 Host-GPU data copy approaches
- 2.1 Copying with cudaMemcpy
- 2.2 Asynchronous copies with cudaMemcpyAsync
- 2.3 Multi-threaded copying
- 2.3.1 Multi-threaded DDR (host-to-host) copy
- 2.3.2 Multi-threaded CPU-to-CUDA copy
- 2.3.3 Conclusion
1 The problem of continuously growing memory
Background: cudaMalloc is called once, and a while loop keeps calling cudaMemcpy to copy data into that buffer. Memory kept growing until it eventually brought the system down.
To verify my reasoning and locate the problem, I wrote the demo below.
1.1 Copying data from the host to the GPU
#include <cuda_runtime.h>
#include <chrono>
#include <thread>
#include <cstdlib>

#define BUFFER_SIZE (2 * 1024 * 1024) // 2MB buffer size

int main() {
    // Host memory allocation
    float* h_data;
    h_data = (float*)malloc(BUFFER_SIZE);

    // Initialize the host buffer with some data
    for (int i = 0; i < BUFFER_SIZE / sizeof(float); ++i) {
        h_data[i] = static_cast<float>(i);
    }

    // Device memory allocation
    float* d_data;
    cudaMalloc((void**)&d_data, BUFFER_SIZE);

    // Main loop
    while (true) {
        // Copy data from host to device
        cudaMemcpy(d_data, h_data, BUFFER_SIZE, cudaMemcpyHostToDevice);

        // Add a sleep to avoid consuming all CPU resources
        // Adjust the duration as needed
        std::this_thread::sleep_for(std::chrono::milliseconds(100));
    }

    // Cleanup
    cudaFree(d_data);
    free(h_data);
    return 0;
}
1.1.1 htop: memory grows for a while, then stays stable
Watching the process in htop, memory keeps climbing at first.
After the program has run for a while, htop shows the memory usage holding steady.
1.2 With an added GPU-to-host copy, memory stays stable
while (true) {
    // Copy data from host to device
    cudaMemcpy(d_data, h_data, BUFFER_SIZE, cudaMemcpyHostToDevice);

    // Add a sleep to avoid consuming all CPU resources
    // Adjust the duration as needed
    std::this_thread::sleep_for(std::chrono::milliseconds(100));

    // Copy data back from device to host
    cudaMemcpy(h_data, d_data, BUFFER_SIZE, cudaMemcpyDeviceToHost);
    std::this_thread::sleep_for(std::chrono::milliseconds(100));
}
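To confirm this conclusion without staring at htop, the loop can periodically log the process RSS together with the free device memory; if neither keeps growing, cudaMemcpy is not leaking. A minimal Linux-only sketch (the helper names are mine, not part of the demo):

#include <cuda_runtime.h>
#include <cstdio>
#include <cstring>

// Return the process resident set size in kB (Linux only), or -1 on failure.
static long host_rss_kb() {
    FILE *f = fopen("/proc/self/status", "r");
    if (!f) return -1;
    char line[256];
    long rss = -1;
    while (fgets(line, sizeof(line), f)) {
        if (strncmp(line, "VmRSS:", 6) == 0) {
            sscanf(line + 6, "%ld", &rss);
            break;
        }
    }
    fclose(f);
    return rss;
}

// Print host RSS and free/total device memory; call this periodically in the copy loop.
static void log_memory_usage(int iteration) {
    size_t free_b = 0, total_b = 0;
    cudaMemGetInfo(&free_b, &total_b);
    printf("iter %d: host RSS %ld kB, device free %zu / %zu MB\n",
           iteration, host_rss_kb(),
           free_b / (1024 * 1024), total_b / (1024 * 1024));
}

Calling log_memory_usage every few thousand iterations inside the while loop is enough to see the curve flatten.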
1.3 Conclusion
The analysis shows that calling cudaMemcpy in a while loop does not by itself make memory grow; using shared memory inside the kernel makes no difference either.
In my project, the real cause of the ever-growing memory was that the pcap thread, which captures raw data, enqueued frames faster than the pixel thread's CUDA processing could consume them, so the queue (and with it the process memory) kept growing. The immediate fix was to reduce the raw data rate; see Section 2 for follow-up optimization of the cudaMemcpy path itself.
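Besides throttling the source, the queue between the pcap thread and the pixel thread can be bounded so that it sheds load instead of growing without limit. A minimal sketch of that idea, with hypothetical RawFrame and BoundedQueue names rather than the actual code from my project:

#include <condition_variable>
#include <deque>
#include <mutex>
#include <utility>

struct RawFrame { /* raw pcap payload ... */ };

// Fixed-capacity queue: the producer drops the oldest frame when full,
// so memory stays bounded even if the CUDA consumer is slower.
class BoundedQueue {
public:
    explicit BoundedQueue(size_t cap) : cap_(cap) {}

    void push(RawFrame f) {
        std::lock_guard<std::mutex> lk(m_);
        if (q_.size() >= cap_) q_.pop_front();   // drop oldest instead of growing
        q_.push_back(std::move(f));
        cv_.notify_one();
    }

    RawFrame pop() {                              // blocks until a frame is available
        std::unique_lock<std::mutex> lk(m_);
        cv_.wait(lk, [this] { return !q_.empty(); });
        RawFrame f = std::move(q_.front());
        q_.pop_front();
        return f;
    }

private:
    size_t cap_;
    std::deque<RawFrame> q_;
    std::mutex m_;
    std::condition_variable cv_;
};

Whether to drop the oldest frame or block the producer depends on whether latency or completeness matters more for the application.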
2 Host-GPU data copy approaches
The demos below ignore the computation inside the kernel; their only purpose is to compare the time cost of the copies.
2.1 Copying with cudaMemcpy
#include <cuda_runtime.h>
#include <iostream>
#include <thread>
#include <chrono>
#include <cstdint>
#include <cstdlib>

#define BUFFER_SIZE (4 * 1024 * 1024) // 4MB (not actually used in this demo)

typedef struct DistCompenParam{
    // normalization parameters
    float a;
    float b;
    float c;
} DistCompenParam_T;

typedef struct PixelPointLight{
    struct {
        uint16_t peak;             // uint12_0
        uint8_t  gray;             // gray value computed after gray compensation
        uint8_t  ech_en;           // byte alignment
        float    fwhm_f;
        float    half_start_pos_f;
        float    dR;               // distance compensation value
        // intermediate variables used during computation
        float    x_peak;
        float    x_fwhm;
    } echo[2];
} PixelPointLight_T;

typedef struct PixelSlotLight{
    PixelPointLight_T point[192];
} PixelSlotLight_T;

// CUDA kernel function to add a value to each element of the array
__global__ void addValueKernel(PixelSlotLight_T* data, DistCompenParam_T *para, int size) {
    int idx = blockIdx.x; // * blockDim.x + threadIdx.x;
    int echoIdx = threadIdx.x;
    int Idy = threadIdx.y;
    __shared__ float y[6];
    if (idx < size) {
        data->point[idx].echo[echoIdx].gray = para->a + 10;
        data->point[idx].echo[echoIdx].fwhm_f = para->a + 20;
        y[Idy] = tanh(data->point[idx].echo[echoIdx].gray + data->point[idx].echo[echoIdx].fwhm_f);
        __syncthreads();
        data->point[idx].echo[echoIdx].peak = y[0] + y[1] + y[2] + y[3] + y[4] + y[5];
    }
}

extern "C" void process_algo_gpu(PixelSlotLight_T *pixel_devptr, DistCompenParam_T *para_devptr, int numPoints)
{
    // Call the CUDA kernel to add a value to each element
    dim3 blocksPerGrid(192, 1);
    dim3 threadsPerBlock(2, 6);
    addValueKernel<<<blocksPerGrid, threadsPerBlock>>>(pixel_devptr, para_devptr, 192);
}

int main() {
    PixelSlotLight_T host_data1;
    int total_time[1024] = {};
    int count = 0;

    // Host memory allocation
    PixelSlotLight_T* h_data;
    h_data = (PixelSlotLight_T *)malloc(sizeof(PixelSlotLight_T));
    DistCompenParam_T * h_para;
    h_para = (DistCompenParam_T *)malloc(sizeof(DistCompenParam_T));
    h_para->a = 3;
    h_para->b = 4;
    h_para->c = 5;

    // Initialize the host buffer with some data
    for (int i = 0; i < 192; ++i) {
        for (int j = 0; j < 2; j++) {
            h_data->point[i].echo[j].peak = 200;
            h_data->point[i].echo[j].gray = 100;
            h_data->point[i].echo[j].fwhm_f = 15;
        }
    }

    // Device memory allocation
    PixelSlotLight_T *pixel_devptr;
    cudaMalloc((void**)&pixel_devptr, sizeof(PixelSlotLight_T));
    DistCompenParam_T *para_devptr;
    cudaMalloc((void**)&para_devptr, sizeof(DistCompenParam_T));
    cudaMemcpy(para_devptr, h_para, sizeof(DistCompenParam_T), cudaMemcpyHostToDevice);

    // Main loop
    while (true) {
        auto start = std::chrono::high_resolution_clock::now();

        // Copy data from host to device
        cudaMemcpy(pixel_devptr, h_data, sizeof(PixelSlotLight_T), cudaMemcpyHostToDevice);

        process_algo_gpu(pixel_devptr, para_devptr, 192);
        // Ensure that the kernel execution has completed before moving on
        cudaDeviceSynchronize();

        // Copy the result back to the host
        cudaMemcpy(&host_data1, pixel_devptr, sizeof(PixelSlotLight_T), cudaMemcpyDeviceToHost);

        auto end = std::chrono::high_resolution_clock::now();
        total_time[count] = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
        count++;
        if (count == 1024) {
            count = 0;
            int sum = 0;
            for (int i = 0; i < 1024; i++) {
                sum += total_time[i];
            }
            std::cout << "time:" << sum/1024 << " us." << std::endl;
        }
    }

    // Cleanup
    cudaFree(pixel_devptr);
    cudaFree(para_devptr);
    free(h_data);
    free(h_para);
    return 0;
}
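Note that none of the CUDA calls in the listing above check their return values. A small checking macro (a common pattern; the name CHECK_CUDA is my own) makes failures visible instead of silently distorting the timings:

#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

// Wrap every CUDA runtime call, e.g. CHECK_CUDA(cudaMemcpy(...));
#define CHECK_CUDA(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                       \
                    cudaGetErrorString(err_), __FILE__, __LINE__);            \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)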
Execution time, averaged over batches of 1024 iterations:
time:204 us.
time:197 us.
time:222 us.
time:209 us.
time:198 us.
time:196 us.
time:209 us.
time:194 us.
time:189 us.
time:231 us.
time:215 us.
time:264 us.
time:242 us.
time:199 us.
time:235 us.
time:422 us.
2.2 Asynchronous copies with cudaMemcpyAsync
The includes, the type definitions (DistCompenParam_T, PixelPointLight_T, PixelSlotLight_T), the kernel addValueKernel, and process_algo_gpu are identical to Section 2.1; only main() changes, creating a stream and replacing cudaMemcpy with cudaMemcpyAsync:

int main() {
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    PixelSlotLight_T host_data1;
    int total_time[1024] = {};
    int count = 0;

    // Host memory allocation
    PixelSlotLight_T* h_data;
    h_data = (PixelSlotLight_T *)malloc(sizeof(PixelSlotLight_T));
    DistCompenParam_T * h_para;
    h_para = (DistCompenParam_T *)malloc(sizeof(DistCompenParam_T));
    h_para->a = 3;
    h_para->b = 4;
    h_para->c = 5;

    // Initialize the host buffer with some data
    for (int i = 0; i < 192; ++i) {
        for (int j = 0; j < 2; j++) {
            h_data->point[i].echo[j].peak = 200;
            h_data->point[i].echo[j].gray = 100;
            h_data->point[i].echo[j].fwhm_f = 15;
        }
    }

    // Device memory allocation
    PixelSlotLight_T *pixel_devptr;
    cudaMalloc((void**)&pixel_devptr, sizeof(PixelSlotLight_T));
    DistCompenParam_T *para_devptr;
    cudaMalloc((void**)&para_devptr, sizeof(DistCompenParam_T));
    cudaMemcpy(para_devptr, h_para, sizeof(DistCompenParam_T), cudaMemcpyHostToDevice);

    // Main loop
    while (true) {
        auto start = std::chrono::high_resolution_clock::now();

        // Copy data from host to device asynchronously
        cudaMemcpyAsync(pixel_devptr, h_data, sizeof(PixelSlotLight_T), cudaMemcpyHostToDevice, stream);
        // process_algo_gpu(pixel_devptr, para_devptr, 192);
        // Copy data back from device to host asynchronously
        cudaMemcpyAsync(&host_data1, pixel_devptr, sizeof(PixelSlotLight_T), cudaMemcpyDeviceToHost, stream);

        // Wait for both copies to finish
        cudaStreamSynchronize(stream);

        auto end = std::chrono::high_resolution_clock::now();
        total_time[count] = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
        count++;
        if (count == 1024) {
            count = 0;
            int sum = 0;
            for (int i = 0; i < 1024; i++) {
                sum += total_time[i];
            }
            std::cout << "time:" << sum/1024 << " us." << std::endl;
        }
    }

    // Cleanup
    cudaFree(pixel_devptr);
    cudaFree(para_devptr);
    cudaStreamDestroy(stream);
    free(h_data);
    free(h_para);
    return 0;
}
There is no order-of-magnitude improvement; see the pinned-memory sketch after the timings below.
time:203 us.
time:180 us.
time:194 us.
time:179 us.
time:170 us.
time:179 us.
time:184 us.
time:195 us.
time:175 us.
time:176 us.
time:204 us.
time:205 us.
time:176 us.
time:173 us.
time:171 us.
time:198 us.
time:183 us.
time:173 us.
time:184 us.
time:177 us.
time:174 us.
time:174 us.
time:250 us.
time:230 us.
time:272 us.
time:192 us.
time:203 us.
time:197 us.
time:189 us.
time:224 us.
time:223 us.
time:227 us.
time:230 us.
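One likely reason the asynchronous version barely helps is that h_data and host_data1 are ordinary pageable allocations: cudaMemcpyAsync from pageable memory is staged through an internal pinned buffer and behaves much like a synchronous copy. The usual fix is to allocate the host buffers with cudaMallocHost (pinned, page-locked memory). A self-contained sketch, separate from the listing above, that compares the two for the same 4 MB round trip (buffer names and sizes are my own choices):

#include <cuda_runtime.h>
#include <chrono>
#include <cstdlib>
#include <iostream>

// Compare pageable vs pinned host buffers for the same 4MB round trip.
int main() {
    const size_t bytes = 4 * 1024 * 1024;
    const int reps = 1024;

    void *d_buf = nullptr;
    cudaMalloc(&d_buf, bytes);

    void *h_pageable = malloc(bytes);
    void *h_pinned = nullptr;
    cudaMallocHost(&h_pinned, bytes);   // page-locked host memory

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    void *host_bufs[2] = {h_pageable, h_pinned};
    const char *labels[2] = {"pageable", "pinned"};

    for (int b = 0; b < 2; ++b) {
        auto t0 = std::chrono::high_resolution_clock::now();
        for (int i = 0; i < reps; ++i) {
            cudaMemcpyAsync(d_buf, host_bufs[b], bytes, cudaMemcpyHostToDevice, stream);
            cudaMemcpyAsync(host_bufs[b], d_buf, bytes, cudaMemcpyDeviceToHost, stream);
            cudaStreamSynchronize(stream);
        }
        auto t1 = std::chrono::high_resolution_clock::now();
        long us = std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count();
        std::cout << labels[b] << ": " << us / reps << " us per round trip." << std::endl;
    }

    cudaStreamDestroy(stream);
    cudaFreeHost(h_pinned);
    cudaFree(d_buf);
    free(h_pageable);
    return 0;
}

Pinned memory is a limited resource, so it is best reserved for buffers that are reused on every frame, as here.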
2.3 Multi-threaded copying
2.3.1 Multi-threaded DDR (host-to-host) copy
#include <cuda_runtime.h>
#include <iostream>
#include <thread>
#include <chrono>
#include <queue>
#include <semaphore.h>
#include <vector>
#include <cstdint>
#include <cstring>
#include <cstdio>

typedef struct DistCompenParam{
    // normalization parameters
    float a;
    float b;
    float c;
} DistCompenParam_T;

typedef struct PixelPointLight{
    struct {
        uint16_t peak;             // uint12_0
        uint8_t  gray;             // gray value computed after gray compensation
        uint8_t  ech_en;           // byte alignment
        float    fwhm_f;
        float    half_start_pos_f;
        float    dR;               // distance compensation value
        // intermediate variables used during computation
        float    x_peak;
        float    x_fwhm;
    } echo[2];
} PixelPointLight_T;

typedef struct PixelSlotLight{
    PixelPointLight_T point[192];
    PixelPointLight_T point1[192][600];  // enlarged so one copy is several MB
} PixelSlotLight_T;

// addValueKernel and process_algo_gpu are identical to Section 2.1 and are omitted here;
// they are not called in this demo's main loop.

// Device memory allocation
PixelSlotLight_T *pixel_devptr;

typedef struct {
    sem_t sem_p[4];
    sem_t sem_w[4];
    PixelSlotLight_T *host_ptr;
    PixelSlotLight_T *dev_ptr;
    uint8_t direct;   // copy direction: 0 = host->dev, 1 = dev->host
} Worker_T;

Worker_T worker_res;

/**
 * index: index of the segment this worker copies
 * len:   size of each segment in bytes
 */
void worker(int index, int len) {
    cudaError_t err;
    while (1) {
        sem_wait(&worker_res.sem_w[index]);   // wait until work is posted
        if (worker_res.direct == 0) {         // host --> dev
            // err = cudaMemcpy((uint8_t *)worker_res.dev_ptr + index*len, (uint8_t *)worker_res.host_ptr + index*len, len, cudaMemcpyHostToDevice);
            // For the DDR test this is a plain host-to-host memcpy of one segment.
            memcpy((uint8_t *)worker_res.dev_ptr + index*len, (uint8_t *)worker_res.host_ptr + index*len, len);
        } else if (worker_res.direct == 1) {  // dev --> host
            err = cudaMemcpy((uint8_t *)worker_res.host_ptr + index*len, (uint8_t *)worker_res.dev_ptr + index*len, len, cudaMemcpyDeviceToHost);
            if (err != cudaSuccess) {
                std::cerr << "cudaMemcpy failed with error: " << cudaGetErrorString(err) << std::endl;
            }
        }
        sem_post(&worker_res.sem_p[index]);   // signal that this segment is done
    }
}

inline void host_to_dev_memcpy(Worker_T *worker_res_)
{
    for (int i = 0; i < 4; i++) {
        worker_res_->direct = 0;
        sem_post(&worker_res_->sem_w[i]);
    }
    for (int i = 0; i < 4; i++) {
        sem_wait(&worker_res_->sem_p[i]);
    }
}

inline void dev_to_host_memcpy(Worker_T *worker_res_)
{
    for (int i = 0; i < 4; i++) {
        worker_res_->direct = 1;
        sem_post(&worker_res_->sem_w[i]);
    }
    for (int i = 0; i < 4; i++) {
        sem_wait(&worker_res_->sem_p[i]);
    }
}

int main() {
    // Initialize the semaphores before starting the workers
    for (int i = 0; i < 4; i++) {
        sem_init(&worker_res.sem_w[i], 0, 0);
        sem_init(&worker_res.sem_p[i], 0, 0);
    }

    std::vector<std::thread> threads;
    for (int i = 0; i < 4; ++i) {
        threads.emplace_back(worker, i, sizeof(PixelSlotLight_T)/4);
    }

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    PixelSlotLight_T host_data1;   // note: ~5.5 MB on the stack
    int total_time[4096] = {};
    int count = 0;

    // Host memory allocation
    PixelSlotLight_T* h_data;
    h_data = (PixelSlotLight_T *)malloc(sizeof(PixelSlotLight_T));
    PixelSlotLight_T* h_data1;
    h_data1 = (PixelSlotLight_T *)malloc(sizeof(PixelSlotLight_T));
    DistCompenParam_T * h_para;
    h_para = (DistCompenParam_T *)malloc(sizeof(DistCompenParam_T));
    h_para->a = 3;
    h_para->b = 4;
    h_para->c = 5;

    // Initialize the host buffer with some data
    for (int i = 0; i < 192; ++i) {
        for (int j = 0; j < 2; j++) {
            h_data->point[i].echo[j].peak = 200;
            h_data->point[i].echo[j].gray = 100;
            h_data->point[i].echo[j].fwhm_f = 15;
        }
    }

    cudaMalloc((void**)&pixel_devptr, sizeof(PixelSlotLight_T));
    DistCompenParam_T *para_devptr;
    cudaMalloc((void**)&para_devptr, sizeof(DistCompenParam_T));
    cudaMemcpy(para_devptr, h_para, sizeof(DistCompenParam_T), cudaMemcpyHostToDevice);

    printf("PixelSlotLight_T:%ld Byte.\n", sizeof(PixelSlotLight_T));

    // Main loop
    while (true) {
        auto start = std::chrono::high_resolution_clock::now();

        // Segmented copy by the four worker threads (host-to-host for this DDR test)
        worker_res.host_ptr = &host_data1;
        worker_res.dev_ptr = h_data1;
        worker_res.direct = 0;
        host_to_dev_memcpy(&worker_res);

        // cudaMemcpy(&host_data1, pixel_devptr, sizeof(PixelSlotLight_T), cudaMemcpyDeviceToHost);
        // cudaStreamSynchronize(stream);

        auto end = std::chrono::high_resolution_clock::now();
        total_time[count] = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
        count++;
        if (count == 4096) {
            count = 0;
            int sum = 0;
            for (int i = 0; i < 4096; i++) {
                sum += total_time[i];
            }
            std::cout << "time:" << sum/4096 << " us." << std::endl;
        }
    }

    // Cleanup
    cudaFree(pixel_devptr);
    cudaFree(para_devptr);
    cudaStreamDestroy(stream);
    free(h_data);
    free(h_data1);
    free(h_para);
    return 0;
}
./host7
PixelSlotLight_T:5538816 Byte.
time:879 us.
time:870 us.
time:888 us.
time:932 us.
time:924 us.
time:943 us.
time:1056 us.
time:1501 us.
time:1051 us.
time:1205 us.
time:734 us.
time:504 us.
time:550 us.
time:545 us.
time:669 us.
2.3.2 Multi-threaded CPU-to-CUDA copy
Four CPU threads copying data to the GPU:
PixelSlotLight_T:5538816 Byte.
time:2058 us.
time:2333 us.
time:2417 us.
time:2151 us.
time:1664 us.
time:1649 us.
A single CPU thread copying data to the GPU:
PixelSlotLight_T:5538816 Byte.
time:1649 us.
time:1675 us.
time:1667 us.
time:1631 us.
time:1484 us.
time:1281 us.
time:1256 us.
time:1256 us.
time:1302 us.
time:1586 us.
time:1444 us.
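For completeness, the GPU-side counterpart of the multi-threaded idea is to split one large transfer into chunks issued on several CUDA streams. With pageable host memory and a single PCIe link this rarely beats one large cudaMemcpy either, which matches the conclusion below; a minimal sketch (function and buffer names are mine):

#include <cuda_runtime.h>

// Split one host->device transfer of `bytes` into `nstreams` chunks,
// each issued asynchronously on its own stream, then wait for all of them.
void chunked_h2d_copy(void *dev, const void *host, size_t bytes, int nstreams) {
    cudaStream_t streams[8];
    if (nstreams < 1) nstreams = 1;
    if (nstreams > 8) nstreams = 8;
    size_t chunk = (bytes + nstreams - 1) / nstreams;

    for (int i = 0; i < nstreams; ++i) cudaStreamCreate(&streams[i]);

    for (int i = 0; i < nstreams; ++i) {
        size_t off = (size_t)i * chunk;
        if (off >= bytes) break;
        size_t len = (off + chunk <= bytes) ? chunk : (bytes - off);
        cudaMemcpyAsync((char*)dev + off, (const char*)host + off, len,
                        cudaMemcpyHostToDevice, streams[i]);
    }

    for (int i = 0; i < nstreams; ++i) {
        cudaStreamSynchronize(streams[i]);
        cudaStreamDestroy(streams[i]);
    }
}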
2.3.3 Conclusion
Analysis: the CPU and GPU exchange data over PCIe, whose total bandwidth is fixed, so splitting a copy across several CPU threads only adds contention and semaphore overhead and brings no advantage.
Under the same conditions, the practical way to improve efficiency is to choose a suitable amount of data per transfer: there is a critical point, and below it the per-copy time changes very little.
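To locate that critical point on a given platform, the per-copy time can simply be measured over a range of transfer sizes; a minimal sketch (the sizes listed are arbitrary examples):

#include <cuda_runtime.h>
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <cstring>

// Time a host->device cudaMemcpy for several transfer sizes to locate the
// size below which the per-copy latency barely changes.
int main() {
    const size_t sizes[] = {4 << 10, 64 << 10, 1 << 20, 4 << 20, 16 << 20};
    const int reps = 200;

    for (size_t bytes : sizes) {
        void *h = malloc(bytes);
        memset(h, 0, bytes);
        void *d = nullptr;
        cudaMalloc(&d, bytes);

        cudaMemcpy(d, h, bytes, cudaMemcpyHostToDevice);   // warm-up

        auto t0 = std::chrono::high_resolution_clock::now();
        for (int i = 0; i < reps; ++i)
            cudaMemcpy(d, h, bytes, cudaMemcpyHostToDevice);
        auto t1 = std::chrono::high_resolution_clock::now();

        long us = std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count();
        printf("%8zu KB: %ld us per copy\n", bytes / 1024, us / reps);

        cudaFree(d);
        free(h);
    }
    return 0;
}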