The models in this project have always consumed NV12 YUV data directly. The new model only accepts RGB input, which made it a good opportunity to implement the corresponding operators myself.
This post records the implementation of those operators: the NV12-to-RGB conversion, RGB crop/resize, the Norm/ToFloat step, and the layout conversion, among others.
The .cu file has to be compiled with nvcc, but its header can be included by external .cpp files. Also, these kernels do not take streams into account yet; that belongs to the later performance-tuning stage, and first things first. In practice, using a stream just means passing it inside the <<<>>> launch configuration and synchronizing that stream afterwards; it does not change the kernel logic itself.
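As a minimal sketch of what the stream version would look like (SomeKernel, d_in, d_out, width, and height are placeholders here, not the operators below):

// Hypothetical example: launching a kernel on a user-supplied stream.
// Only the launch/sync pattern matters; everything else stays the same.
cudaStream_t stream;
cudaStreamCreate(&stream);
const dim3 blockDim(32, 32, 1);
const dim3 gridDim((width + blockDim.x - 1) / blockDim.x,
                   (height + blockDim.y - 1) / blockDim.y, 1);
SomeKernel<<<gridDim, blockDim, 0, stream>>>(d_in, d_out, width, height);
cudaStreamSynchronize(stream);  // wait only for work queued on this stream
cudaStreamDestroy(stream);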
cuda_transformation.cu
The actual kernels are implemented here.
NV12toRGB
The pitfall here is BT.601 vs. BT.709 and full range vs. limited (video) range YUV. Getting this wrong makes the image look chromatically off; the problem I ran into was that red was barely visible, and the cause was a broken conversion formula.
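For reference, the two BT.601 variants in floating-point form (the integer kernel below approximates the full-range one; these are the commonly quoted coefficients, so double-check them against what your camera actually outputs):

Full range (Y in [0, 255], U/V centered at 128):
R = Y + 1.402 * (V - 128)
G = Y - 0.344 * (U - 128) - 0.714 * (V - 128)
B = Y + 1.772 * (U - 128)

Limited / video range (Y in [16, 235], U/V in [16, 240]):
R = 1.164 * (Y - 16) + 1.596 * (V - 128)
G = 1.164 * (Y - 16) - 0.391 * (U - 128) - 0.813 * (V - 128)
B = 1.164 * (Y - 16) + 2.018 * (U - 128)

Mixing the two, or applying BT.709 coefficients to BT.601 data, is exactly what produces the "red looks washed out" kind of symptom described above.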
__global__ void NV12toRGB(uint8_t *yuv, uint8_t *rgb, int width, int height) {
  const int nv_start = width * height;
  int i, j, nv_index = 0;
  uint8_t y, u, v;
  int r, g, b;

  j = blockIdx.x * blockDim.x + threadIdx.x;
  i = blockIdx.y * blockDim.y + threadIdx.y;
  if (i >= height || j >= width)
    return;

  nv_index = i / 2 * width + j - j % 2;
  int rgb_index = i * width + j;

  y = yuv[rgb_index];
  u = yuv[nv_start + nv_index];
  v = yuv[nv_start + nv_index + 1];

  r = y + (140 * (v - 128)) / 100;                          // r
  g = y - (34 * (u - 128)) / 100 - (71 * (v - 128)) / 100;  // g
  b = y + (177 * (u - 128)) / 100;                          // b

  if (r > 255) r = 255;
  if (g > 255) g = 255;
  if (b > 255) b = 255;
  if (r < 0) r = 0;
  if (g < 0) g = 0;
  if (b < 0) b = 0;

  // Note: written out in BGR order so the buffer can be dumped with OpenCV directly
  rgb[rgb_index * 3 + 0] = b;
  rgb[rgb_index * 3 + 1] = g;
  rgb[rgb_index * 3 + 2] = r;
}

int cudaNV12toRGB(uint8_t *input, uint8_t *output, size_t width, size_t height) {
  if (!input || !output)
    return cudaErrorInvalidDevicePointer;
  const dim3 blockDim(32, 32, 1);
  const dim3 gridDim((width + blockDim.x - 1) / blockDim.x,
                     (height + blockDim.y - 1) / blockDim.y, 1);
  NV12toRGB<<<gridDim, blockDim>>>(input, output, width, height);
  return cudaDeviceSynchronize();
}
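A minimal host-side usage sketch, mainly to spell out the buffer sizes (error checking omitted; host_nv12 and the d_* names are illustrative):

// NV12 holds a full-resolution Y plane plus a half-resolution interleaved UV plane.
size_t nv12_size = width * height * 3 / 2;  // bytes
size_t rgb_size  = width * height * 3;      // bytes, 3 x uint8_t per pixel
uint8_t *d_nv12, *d_rgb;
cudaMalloc(&d_nv12, nv12_size);
cudaMalloc(&d_rgb, rgb_size);
cudaMemcpy(d_nv12, host_nv12, nv12_size, cudaMemcpyHostToDevice);
cudaNV12toRGB(d_nv12, d_rgb, width, height);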
RGBBilinearResize
__global__ void RGBBilinearResize(uint8_t *input, uint8_t *output,
                                  int inputWidth, int inputHeight,
                                  int outputWidth, int outputHeight) {
  // Global thread index: one thread per output pixel
  int x = blockIdx.x * blockDim.x + threadIdx.x;
  int y = blockIdx.y * blockDim.y + threadIdx.y;
  if (x >= outputWidth || y >= outputHeight)
    return;

  // (gx, gy) is the floating-point position in the source image that
  // output pixel (x, y) should sample from
  float gx = ((float)x) / outputWidth * (inputWidth - 1);
  float gy = ((float)y) / outputHeight * (inputHeight - 1);

  // Integer part and fractional offsets
  int gxi = (int)gx;
  int gyi = (int)gy;
  float dx = gx - gxi;
  float dy = gy - gyi;

  // Read the four nearest source pixels
  uint8_t topLeft[3] = {input[(gyi * inputWidth + gxi) * 3 + 0],
                        input[(gyi * inputWidth + gxi) * 3 + 1],
                        input[(gyi * inputWidth + gxi) * 3 + 2]};
  uint8_t topRight[3] = {input[(gyi * inputWidth + gxi + 1) * 3 + 0],
                         input[(gyi * inputWidth + gxi + 1) * 3 + 1],
                         input[(gyi * inputWidth + gxi + 1) * 3 + 2]};
  uint8_t bottomLeft[3] = {input[((gyi + 1) * inputWidth + gxi) * 3 + 0],
                           input[((gyi + 1) * inputWidth + gxi) * 3 + 1],
                           input[((gyi + 1) * inputWidth + gxi) * 3 + 2]};
  uint8_t bottomRight[3] = {input[((gyi + 1) * inputWidth + gxi + 1) * 3 + 0],
                            input[((gyi + 1) * inputWidth + gxi + 1) * 3 + 1],
                            input[((gyi + 1) * inputWidth + gxi + 1) * 3 + 2]};

  // Bilinear interpolation per channel
  for (int i = 0; i < 3; i++) {
    float top = topLeft[i] * (1 - dx) + topRight[i] * dx;
    float bottom = bottomLeft[i] * (1 - dx) + bottomRight[i] * dx;
    output[(y * outputWidth + x) * 3 + i] =
        static_cast<uint8_t>(top * (1 - dy) + bottom * dy);
  }
}

int cudaRGBBilinearResize(uint8_t *input, uint8_t *output, size_t width,
                          size_t height, size_t resize_width,
                          size_t resize_height) {
  if (!input || !output)
    return cudaErrorInvalidDevicePointer;
  const dim3 blockDim(32, 32, 1);
  // The grid must cover the output image, not the input
  const dim3 gridDim((resize_width + blockDim.x - 1) / blockDim.x,
                     (resize_height + blockDim.y - 1) / blockDim.y, 1);
  RGBBilinearResize<<<gridDim, blockDim>>>(input, output, width, height,
                                           resize_width, resize_height);
  return cudaDeviceSynchronize();
}
RGBToFloat
This one deserves an extra note: because of the OpenCV dump used during debugging, the data stays in BGR order right up until it is handed to the model, and it is in this ToFloat step that it gets reordered into the RGB layout the model expects.
__global__ void RGBToFloat(uint8_t *input, float *output, int width, int height) {
  int x = blockIdx.x * blockDim.x + threadIdx.x;
  int y = blockIdx.y * blockDim.y + threadIdx.y;
  if (x >= width || y >= height)
    return;
  int idx = y * width + x;
  // input is BGR; output is RGB scaled to [0, 1]
  output[idx * 3 + 0] = input[idx * 3 + 2] / 255.0f; // R
  output[idx * 3 + 1] = input[idx * 3 + 1] / 255.0f; // G
  output[idx * 3 + 2] = input[idx * 3 + 0] / 255.0f; // B
}

int cudaRGBToFloat(uint8_t *input, float *output, int width, int height) {
  dim3 blockDim(16, 16);
  dim3 gridDim((width + blockDim.x - 1) / blockDim.x,
               (height + blockDim.y - 1) / blockDim.y);
  RGBToFloat<<<gridDim, blockDim>>>(input, output, width, height);
  return cudaDeviceSynchronize();
}
RGBNormalize
__global__ void RGBNormalize(float *image, int width, int height, float mean[], float std[]) {
  int x = blockIdx.x * blockDim.x + threadIdx.x;
  int y = blockIdx.y * blockDim.y + threadIdx.y;
  if (x >= width || y >= height) {
    return;
  }
  int idx = y * width + x;
  if (std[0] < 1e-6 || std[1] < 1e-6 || std[2] < 1e-6) {
    printf("Error: std values are too small for safe division.\n");
    return;
  }
  // The buffer is already in RGB order at this point (see RGBToFloat above)
  image[idx * 3 + 0] = (image[idx * 3 + 0] - mean[0]) / std[0]; // R
  image[idx * 3 + 1] = (image[idx * 3 + 1] - mean[1]) / std[1]; // G
  image[idx * 3 + 2] = (image[idx * 3 + 2] - mean[2]) / std[2]; // B
}

int cudaRGBNormalize(float *d_image, int width, int height, float mean[], float std[]) {
  dim3 blockDim(16, 16);
  dim3 gridDim((width + blockDim.x - 1) / blockDim.x,
               (height + blockDim.y - 1) / blockDim.y);
  RGBNormalize<<<gridDim, blockDim>>>(d_image, width, height, mean, std);
  cudaError_t cudaStatus = cudaDeviceSynchronize();
  if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(cudaStatus));
    return -1;
  }
  return 0;
}
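One gotcha: mean and std are dereferenced inside the kernel, so they must be device pointers. A minimal usage sketch (the ImageNet values match what the pipeline uses later; d_mean/d_std are illustrative names):

float mean[3] = {0.485f, 0.456f, 0.406f};  // RGB order, matching the RGBToFloat output
float stdv[3] = {0.229f, 0.224f, 0.225f};
float *d_mean, *d_std;
cudaMalloc(&d_mean, 3 * sizeof(float));
cudaMalloc(&d_std, 3 * sizeof(float));
cudaMemcpy(d_mean, mean, 3 * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_std, stdv, 3 * sizeof(float), cudaMemcpyHostToDevice);
cudaRGBNormalize(d_image, width, height, d_mean, d_std);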
HWC2CHW
template <typename T>
__global__ void HWC2CHW(const T* input, T* output, int height, int width) {
  int x = blockIdx.x * blockDim.x + threadIdx.x;
  int y = blockIdx.y * blockDim.y + threadIdx.y;
  if (x >= width || y >= height)
    return;
  int channelSize = width * height;
  int hwcIndex = y * width + x;
  int chwIndex;
  for (int c = 0; c < 3; ++c) {
    chwIndex = c * channelSize + y * width + x;
    output[chwIndex] = input[hwcIndex * 3 + c];
  }
}

template <typename T>
int cudaHWC2CHW(const T* input, T* output, int height, int width) {
  dim3 blockDim(16, 16);
  dim3 gridDim((width + blockDim.x - 1) / blockDim.x,
               (height + blockDim.y - 1) / blockDim.y);
  HWC2CHW<<<gridDim, blockDim>>>(input, output, height, width);
  return cudaDeviceSynchronize();
}
template int cudaHWC2CHW<float>(const float* input, float* output, int height, int width);
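Because the template is defined inside the .cu file but called from .cpp translation units, the explicit instantiation above is what gives the linker a concrete float version to resolve against. If another element type were ever needed, it would presumably get its own instantiation line, for example (hypothetical):

template int cudaHWC2CHW<uint8_t>(const uint8_t* input, uint8_t* output, int height, int width);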
cuda_transformation.h
void convertNV12toYUV444withActions_cuda(uint8_t *src_img, uint8_t *src_imgcuda,
                                         uint8_t *tmpImagecuda,
                                         ImageTransParam &trans_param,
                                         uint8_t *dst_imgcuda, uint8_t *dst_img,
                                         cudaStream_t stream);

void convertNV12toYUV444withActions_cuda1(uint8_t *src_imgcuda,
                                          ImageTransParam &trans_param,
                                          uint8_t *dst_imgcuda);

int cudaNV12toRGB(uint8_t* input, uint8_t* output, size_t width, size_t height);

int cudaRGBBilinearResize(uint8_t *input, uint8_t *output, size_t width,
                          size_t height, size_t resize_width,
                          size_t resize_height);

int cudaRGBToFloat(uint8_t *input, float *output, int width, int height);

int cudaRGBNormalize(float *d_image, int width, int height, float mean[], float std[]);

template <typename T>
int cudaHWC2CHW(const T* input, T* output, int height, int width);
image_transformation.h
This header wraps the transforms. The project is object-oriented and abstracts something like a Transformer class, but for clarity and easier debugging the code I provide here is procedural, and the CPU implementations of the operators are attached as well. In practice, the usual way to implement an image-processing operator is to write the CPU version first as a plain nested-loop implementation and then port it to a GPU kernel, since GPU code is less convenient to debug than CPU code, even with tools like cuda-gdb. As you can see, the CPU and GPU versions differ essentially only in how the loop is expressed, which is why a kernel can be thought of as a for_each_pixel_func (see the sketch below).
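To make the "only the loop changes" point concrete, a minimal sketch of the correspondence (for_each_pixel_func is just an illustrative name for the per-pixel body):

// CPU version: iterate over every pixel explicitly
for (int y = 0; y < height; ++y)
  for (int x = 0; x < width; ++x)
    for_each_pixel_func(x, y);

// GPU version: the grid replaces the loop; each thread handles one (x, y)
// __global__ void Kernel(...) {
//   int x = blockIdx.x * blockDim.x + threadIdx.x;
//   int y = blockIdx.y * blockDim.y + threadIdx.y;
//   if (x >= width || y >= height) return;
//   for_each_pixel_func(x, y);
// }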
void TransformNV12toRGB(uint8_t *input, uint8_t *output, int width, int height) {
  int ret = cudaNV12toRGB(input, output, width, height);
  if (ret != 0) {
    HSLOG_E << "cudaNV12toRGB FAILED";
  }
}

void CpuTransformNV12toRGB(uint8_t *yuv, uint8_t *rgb, int width, int height) {
  const int nv_start = width * height;
  uint32_t i, j, index = 0, rgb_index = 0;
  uint8_t y, u, v;
  int r, g, b, nv_index = 0;
  for (i = 0; i < height; i++) {
    for (j = 0; j < width; j++) {
      // nv_index = (rgb_index / 2 - width / 2 * ((i + 1) / 2)) * 2;
      nv_index = i / 2 * width + j - j % 2;
      y = yuv[rgb_index];
      u = yuv[nv_start + nv_index];
      v = yuv[nv_start + nv_index + 1];
      r = y + (140 * (v - 128)) / 100;                          // r
      g = y - (34 * (u - 128)) / 100 - (71 * (v - 128)) / 100;  // g
      b = y + (177 * (u - 128)) / 100;                          // b
      if (r > 255) r = 255;
      if (g > 255) g = 255;
      if (b > 255) b = 255;
      if (r < 0) r = 0;
      if (g < 0) g = 0;
      if (b < 0) b = 0;
      // index = rgb_index % width + (height - i - 1) * width;
      index = rgb_index % width + i * width;
      rgb[index * 3 + 0] = b;
      rgb[index * 3 + 1] = g;
      rgb[index * 3 + 2] = r;
      rgb_index++;
    }
  }
}

void TransformRGBResize(uint8_t *input, uint8_t *output, size_t width,
                        size_t height, size_t resize_width, size_t resize_height) {
  int ret = cudaRGBBilinearResize(input, output, width, height, resize_width, resize_height);
  if (ret != 0) {
    HSLOG_E << "cudaRGBBilinearResize FAILED: " << ret;
  }
}

void CPURGBBilinearResize(uint8_t *input, uint8_t *output,
                          int inputWidth, int inputHeight,
                          int outputWidth, int outputHeight) {
  for (int y = 0; y < outputHeight; y++) {
    for (int x = 0; x < outputWidth; x++) {
      // Floating-point sample position in the source image
      float gx = ((float)x) / outputWidth * (inputWidth - 1);
      float gy = ((float)y) / outputHeight * (inputHeight - 1);
      // Integer part and fractional offsets
      int gxi = (int)gx;
      int gyi = (int)gy;
      float dx = gx - gxi;
      float dy = gy - gyi;
      // Read the four nearest source pixels
      uint8_t topLeft[3] = {input[(gyi * inputWidth + gxi) * 3 + 0],
                            input[(gyi * inputWidth + gxi) * 3 + 1],
                            input[(gyi * inputWidth + gxi) * 3 + 2]};
      uint8_t topRight[3] = {input[(gyi * inputWidth + gxi + 1) * 3 + 0],
                             input[(gyi * inputWidth + gxi + 1) * 3 + 1],
                             input[(gyi * inputWidth + gxi + 1) * 3 + 2]};
      uint8_t bottomLeft[3] = {input[((gyi + 1) * inputWidth + gxi) * 3 + 0],
                               input[((gyi + 1) * inputWidth + gxi) * 3 + 1],
                               input[((gyi + 1) * inputWidth + gxi) * 3 + 2]};
      uint8_t bottomRight[3] = {input[((gyi + 1) * inputWidth + gxi + 1) * 3 + 0],
                                input[((gyi + 1) * inputWidth + gxi + 1) * 3 + 1],
                                input[((gyi + 1) * inputWidth + gxi + 1) * 3 + 2]};
      // Bilinear interpolation per channel
      for (int i = 0; i < 3; i++) {
        float top = topLeft[i] * (1 - dx) + topRight[i] * dx;
        float bottom = bottomLeft[i] * (1 - dx) + bottomRight[i] * dx;
        output[(y * outputWidth + x) * 3 + i] =
            static_cast<uint8_t>(top * (1 - dy) + bottom * dy);
      }
    }
  }
}

void TransfromConvertRGBToFloat(uint8_t *input, float *output, int width, int height) {
  int ret = cudaRGBToFloat(input, output, width, height);
  if (ret != 0) {
    HSLOG_E << "cudaRGBToFloat FAILED: " << ret;
  }
}

void TransfromRGBNormalize(float *input, int width, int height, float* mean, float* std) {
  int ret = cudaRGBNormalize(input, width, height, mean, std);
  if (ret != 0) {
    HSLOG_E << "cudaRGBNormalize FAILED: " << ret;
  }
}

template <typename T>
int TransfromHWC2CHW(const T* input, T* output, int height, int width) {
  int ret = cudaHWC2CHW<T>(input, output, height, width);
  if (ret != 0) {
    HSLOG_E << "cudaHWC2CHW FAILED: " << ret;
  }
  return ret;  // declared as int, so propagate the CUDA status
}
pre_process_module.cpp
Some dump operations are added here, plus helpers that print the first N bytes of a GPU buffer, which makes debugging much easier.
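PrintFirstNUint8Bytes / PrintFirstNFloatBytes used below are not shown in this post; a minimal sketch of what such a helper could look like (the fixed N = 16 and the exact formatting are assumptions):

// Hypothetical helper: copy the first n floats of a device buffer to the host
// and format them into a string for logging. Error handling omitted.
#include <sstream>
#include <string>
#include <vector>
std::string PrintFirstNFloatBytes(const float *d_buf, int n = 16) {
  std::vector<float> host(n);
  cudaMemcpy(host.data(), d_buf, n * sizeof(float), cudaMemcpyDeviceToHost);
  std::ostringstream oss;
  for (int i = 0; i < n; ++i) oss << host[i] << " ";
  return oss.str();
}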
void PreProcessModule::Transform21dImage(hobot::dataflow::spMsgResourceProc proc,
                                         const hobot::dataflow::MessageLists &msgs) {
  UNUSED(proc);
  auto &input_img_batch_msgs = msgs[0];
  std::shared_ptr<ImageBatchMsg<GPUImageMsg>> batch_image_msg =
      std::static_pointer_cast<ImageBatchMsg<GPUImageMsg>>(input_img_batch_msgs->at(0));
  for (int i = 0; i < batch_image_msg->batch_size_; ++i) {
    auto image_msg = batch_image_msg->batch_img_msg_[i];
    int height = image_msg->img_trans_param_.src_height;
    int width = image_msg->img_trans_param_.src_width;
    image_transformation_[i].TransformNV12toRGB(
        image_msg->cuda_nv12_, image_transformation_[i].cuda_image_out_, width, height);

    static int cnt = 0;
    if (true) {
      std::string input_file_path =
          "/home/yuxuan03.zhang/utils_code/lcc/query/" + std::to_string(cnt) + ".jpg";
      cv::Mat bgrImage = cv::imread(input_file_path);
      if (bgrImage.empty()) {
        std::cerr << "Error: Image cannot be loaded!" << std::endl;
      }
      size_t size = bgrImage.total() * bgrImage.elemSize();  // number of bytes to copy
      HSLOG_E << "height: " << height << " width: " << width << " size: " << size
              << " file: " << input_file_path;
      // Copy the data from the cv::Mat into GPU memory
      cudaMemcpy(image_transformation_[i].cuda_image_out_, bgrImage.ptr(), size,
                 cudaMemcpyHostToDevice);
      image_msg->SetDoneTimestamp(cnt);
      cnt++;
    }

    // int size = width * height * 3 / 2;
    // uint8_t* cpu_nv12 = new uint8_t[size];
    // cudaMemcpy(cpu_nv12, image_msg->cuda_nv12_, size, cudaMemcpyDeviceToHost);
    // cv::Mat nv12Img(height + height / 2, width, CV_8UC1, cpu_nv12);
    // cv::Mat bgrImg;
    // cv::cvtColor(nv12Img, bgrImg, cv::COLOR_YUV2BGR_NV12);
    // std::string file = std::to_string(image_msg->GetGenTimestamp()) + "_nv12.png";
    // cv::imwrite(file, bgrImg);
    // delete[] cpu_nv12;

    // int dataSize = width * height * 3;
    // uint8_t* cpu_rgb = new uint8_t[dataSize];
    // cudaMemcpy(cpu_rgb, image_transformation_[i].cuda_image_out_, dataSize, cudaMemcpyDeviceToHost);
    // cv::Mat rgb_img(height, width, CV_8UC3, cpu_rgb);
    // std::string file1 = std::to_string(image_msg->GetGenTimestamp()) + "_rgb.png";
    // cv::imwrite(file1, rgb_img);
    // delete[] cpu_rgb;

    image_transformation_[i].TransformRGBResize(
        image_transformation_[i].cuda_image_out_,
        image_transformation_[i].cuda_image_trans_buffer_, width, height, 910, 512);
    HSLOG_E << "Resize: "
            << PrintFirstNUint8Bytes((uint8_t*)image_transformation_[i].cuda_image_trans_buffer_);

    // uint8_t* cpu_rgb_resize = new uint8_t[910*512*3];
    // cudaMemcpy(cpu_rgb_resize, image_transformation_[i].cuda_image_trans_buffer_, 910*512*3, cudaMemcpyDeviceToHost);
    // cv::Mat rgb_resize_img(512, 910, CV_8UC3, cpu_rgb_resize);
    // std::string file2 = std::to_string(image_msg->GetGenTimestamp()) + "_rgb_resize.png";
    // cv::imwrite(file2, rgb_resize_img);
    // delete[] cpu_rgb_resize;

    image_transformation_[i].TransfromConvertRGBToFloat(
        image_transformation_[i].cuda_image_trans_buffer_,
        (float*)image_transformation_[i].cuda_image_out_, 910, 512);
    HSLOG_E << "BRGToRGBFloat: "
            << PrintFirstNFloatBytes((float*)image_transformation_[i].cuda_image_out_);

    std::vector<float> mean = {0.485, 0.456, 0.406};
    std::vector<float> std = {0.229, 0.224, 0.225};
    float* mean_gpu = (float*)image_transformation_[i].cuda_image_trans_buffer_;
    float* std_gpu = mean_gpu + 3;
    cudaMemcpy(mean_gpu, mean.data(), 3 * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(std_gpu, std.data(), 3 * sizeof(float), cudaMemcpyHostToDevice);
    image_transformation_[i].TransfromRGBNormalize(
        (float*)image_transformation_[i].cuda_image_out_, 910, 512, mean_gpu, std_gpu);
    HSLOG_E << "Norm: "
            << PrintFirstNFloatBytes((float*)image_transformation_[i].cuda_image_out_);

    image_transformation_[i].TransfromHWC2CHW(
        (float*)image_transformation_[i].cuda_image_out_,
        (float*)image_msg->cuda_yuv444_, 512, 910);
    HSLOG_E << "HWC2CHW: " << PrintFirstNFloatBytes((float*)image_msg->cuda_yuv444_);

    if (true) {
      float *cuda_image_out_ = (float*)image_msg->cuda_yuv444_;
      size_t dataSize = 3 * 512 * 910 * sizeof(float);
      float *hostData = new float[dataSize / sizeof(float)];
      cudaMemcpy(hostData, cuda_image_out_, dataSize, cudaMemcpyDeviceToHost);
      std::string input_file_path = "./dump_bin/" + std::to_string(cnt) + ".bin";
      std::ofstream outFile(input_file_path, std::ios::out | std::ios::binary);
      outFile.write(reinterpret_cast<char *>(hostData), dataSize);
      outFile.close();
      delete[] hostData;
    }
  }
  SEND_DATA(SLOT_OUT_BATCH_TRANS_IMAGE, batch_image_msg);
}