0. 思路
为了把思路讲清楚,本文采用 step by step 的方式逐步迭代,这样每一步的改动都比较容易理解。源代码改自 NVIDIA 官方的 vectorAdd 示例。
step 1, 单 cu 文件的可执行文件版本
源代码
main.cu(注意:下面的 Makefile 以 main.cu 作为源文件名,且 make clean 会删除所有 main_* 文件,因此源文件不要命名为 main_app.cu)
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include <cuda_runtime.h>

// Evaluates a CUDA runtime call; on failure prints the source location and the
// runtime's error string to stderr, then terminates the process.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Element-wise kernel: C[i] = A[i] * A[i] + B[i] * B[i] for every i < n.
// Each thread handles the single element at its flat 1D index
// (blockDim.x * blockIdx.x + threadIdx.x); threads with i >= n do nothing,
// so the launch may cover more threads than n.
// Not launched by this file's main(); only explicitly instantiated below.
template <typename T>
__global__ void vector_square_add(T *A, T *B, T *C, int n)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n)
    {
        C[i] = A[i] * A[i] + B[i] * B[i];
    }
}
// Explicit instantiation for float.
template __global__ void vector_square_add(float *A, float *B, float *C, int n);

// Element-wise kernel: C[i] = A[i] + B[i] + 0.0f for every i < n.
// Same one-thread-per-element 1D indexing and bounds guard as above.
template <typename T>
__global__ void vector_add_kernel(T *A, T *B, T *C, int n)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n)
    {
        C[i] = A[i] + B[i] + 0.0f;
    }
}
// Explicit instantiation for float.
template __global__ void vector_add_kernel(float *A, float *B, float *C, int n);

// Host wrapper that launches vector_add_kernel<T> over n elements.
//
// A, B, C are passed straight through to the kernel, so they must be pointers
// the device can dereference (main() passes cudaMalloc'ed buffers).
// Uses 256 threads per block and ceil(n / 256) blocks, prints the launch
// configuration, and aborts via CUDA_CHECK if the launch itself is rejected.
// Does nothing when n <= 0.
template <typename T>
void ic_vector_add(T *A, T *B, T *C, int n)
{
    if (n <= 0)
    {
        // A zero-block grid is an invalid launch configuration, and a negative
        // n would wrap around in the unsigned ceil-division below.
        return;
    }

    dim3 grid, block;
    block.x = 256;
    grid.x = (n + block.x - 1) / block.x;  // ceil-div so the tail is covered

    // grid.x / block.x are unsigned, hence %u.
    printf("CUDA kernel launch with %u blocks of %u threads\n", grid.x, block.x);
    vector_add_kernel<T><<<grid, block>>>(A, B, C, n);

    // Kernel launches return nothing; launch errors surface here.
    CUDA_CHECK(cudaGetLastError());
}
// Explicit instantiation for float.
template void ic_vector_add(float* A, float *B, float* C, int n);

// Fills two 50-element host vectors with 3 and 4, adds them on the GPU through
// ic_vector_add, prints the result, and verifies it against the host inputs.
// Returns 0 on success; exits with EXIT_FAILURE on any allocation, CUDA, or
// verification failure.
int main(void)
{
    int n = 50;
    size_t size = n * sizeof(float);

    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C = (float *)malloc(size);
    if (h_A == NULL || h_B == NULL || h_C == NULL)
    {
        fprintf(stderr, "Failed to allocate host vectors!\n");
        exit(EXIT_FAILURE);
    }

    for (int i = 0; i < n; ++i)
    {
        h_A[i] = 3; // rand() / (float)RAND_MAX;
        h_B[i] = 4; // rand() / (float)RAND_MAX;
    }

    float *d_A = NULL;
    float *d_B = NULL;
    float *d_C = NULL;
    CUDA_CHECK(cudaMalloc((void **)&d_A, size));
    CUDA_CHECK(cudaMalloc((void **)&d_B, size));
    CUDA_CHECK(cudaMalloc((void **)&d_C, size));

    CUDA_CHECK(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice));

    /* Alternative kept for reference: launch the kernel directly instead of
       going through ic_vector_add.
    int threadsPerBlock = 256;
    int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
    vector_add_kernel<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, n);
    */
    ic_vector_add(d_A, d_B, d_C, n);

    printf("Copy output data from the CUDA device to the host memory\n");
    // Blocking copy: it completes only after the kernel has finished, and it
    // reports any error raised while the kernel was executing.
    CUDA_CHECK(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost));

    for (int i = 0; i < n; ++i)
    {
        printf("%3.2f ", h_C[i]);
        // Only claim success below if every element matches the host sum.
        if (fabsf(h_A[i] + h_B[i] - h_C[i]) > 1e-5f)
        {
            fprintf(stderr, "Result verification failed at element %d!\n", i);
            exit(EXIT_FAILURE);
        }
    }
    printf("\nTest PASSED\n");

    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));

    free(h_A);
    free(h_B);
    free(h_C);

    printf("Done\n");
    return 0;
}
Makefile
# Builds the single-file CUDA example into the executable `main_`.
#
# The source file is expected to be named main.cu. It is copied to main_.cu
# before compiling so that the object file and the intermediates retained by
# `nvcc --keep` all start with `main_`; `make clean` can then remove every
# generated file with one glob without touching main.cu. Do not give the
# source a name that starts with `main_`, or `make clean` will delete it.
all: main_

# Working copy of the source.
main_.cu: main.cu
	cp main.cu main_.cu

# Compile only (-c); --keep retains nvcc's intermediate files for inspection.
main_.o: main_.cu
	nvcc $< -c --keep

# Link the object file into the final executable.
main_: main_.o
	nvcc $< -o $@

.PHONY: all clean
clean:
	-rm -f main_*
step2, 一个 API 函数的动态链接库 Makefile 版本
step3, 两个 API 函数的动态链接库 Makefile 版本
step4, 将Makefile 转换成 cmake 自定义版本