参考视频
宝藏up主!CUDA编程模型系列七(利用shared memory优化矩阵转置)_哔哩哔哩_bilibili
代码
#include <cmath>    // fabs
#include <cstdio>   // printf
#include <cstdlib>  // rand

// Edge length of the square thread block used for every kernel launch.
#define BLOCK_SIZE 32
// The input matrix has N rows and M columns; its transpose is M x N.
#define M 3000
#define N 1000

// Input matrix (N x M) in managed memory, so both host and device code can
// address it directly.
__managed__ int matrix[N][M];
// Transposed output (M x N) written by the GPU kernels.
__managed__ int gpu_matrix[M][N];
// Transposed output (M x N) computed on the host; used as the reference that
// the GPU result is validated against.
__managed__ int cpu_matrix[M][N];

// Prints a diagnostic and terminates the process when a CUDA runtime call
// did not return cudaSuccess.
//   status: return value of the CUDA call being checked.
//   what:   short label identifying the call in the error message.
static void check_cuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Straightforward transpose: each thread copies one element in[y][x] to
// out[x][y]. Expects a 2D launch whose grid covers at least M threads in x
// and N threads in y; out-of-range threads do nothing.
// Because x is derived from threadIdx.x, neighbouring threads write to
// different rows of `out`.
__global__ void gpu_matrix_transpose(int in[N][M], int out[M][N])
{
    int x = threadIdx.x + blockDim.x * blockIdx.x;
    int y = threadIdx.y + blockDim.y * blockIdx.y;

    if (x < M && y < N) {
        out[x][y] = in[y][x];
    }
}

// Tiled transpose that stages each block's tile in shared memory so that both
// the read from `in` and the write to `out` use threadIdx.x as the fastest
// varying (column) index.
// Launch requirements: blockDim must be (BLOCK_SIZE, BLOCK_SIZE), and the grid
// must cover at least M threads in x and N threads in y.
__global__ void gpu_shared_matrix_transpose(int in[N][M], int out[M][N])
{
    int x = threadIdx.x + blockDim.x * blockIdx.x;
    int y = threadIdx.y + blockDim.y * blockIdx.y;

    // The inner dimension is padded by one element because the tile is read
    // back column-wise below; the padding avoids shared-memory bank conflicts.
    __shared__ int ken[BLOCK_SIZE][BLOCK_SIZE + 1];

    if (x < M && y < N) {
        ken[threadIdx.y][threadIdx.x] = in[y][x];
    }
    // Every thread in the block reaches this barrier (it is outside the
    // bounds check) so the whole tile is written before anyone reads it.
    __syncthreads();

    // Coordinates in `out`: the block indices are swapped, the thread indices
    // are not, so threadIdx.x still walks along a row of `out`.
    int x1 = threadIdx.x + blockDim.y * blockIdx.y;
    int y1 = threadIdx.y + blockDim.x * blockIdx.x;
    // This guard selects exactly the tile entries that were written above.
    if (x1 < N && y1 < M) {
        out[y1][x1] = ken[threadIdx.x][threadIdx.y];
    }
}

// Host reference implementation: out[x][y] = in[y][x] for the full matrix.
void cpu_matrix_transpose(int in[N][M], int out[M][N])
{
    for (int y = 0; y < N; y++) {
        for (int x = 0; x < M; x++) {
            out[x][y] = in[y][x];
        }
    }
}

// Fills `matrix` with pseudo-random values, times repeated GPU transposes and
// one CPU transpose using CUDA events, compares the two results element by
// element, and prints pass/fail plus the timings (the GPU time is the
// per-launch average).
void transpose_test()
{
    for (int y = 0; y < N; y++) {
        for (int x = 0; x < M; x++) {
            matrix[y][x] = rand() % 1024;
        }
    }

    cudaEvent_t start, stop_gpu, stop_cpu;
    check_cuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    check_cuda(cudaEventCreate(&stop_gpu), "cudaEventCreate(stop_gpu)");
    check_cuda(cudaEventCreate(&stop_cpu), "cudaEventCreate(stop_cpu)");

    check_cuda(cudaEventRecord(start), "cudaEventRecord(start)");
    check_cuda(cudaEventSynchronize(start), "cudaEventSynchronize(start)");

    // Ceil-division so the grid covers matrices whose sizes are not multiples
    // of BLOCK_SIZE; the kernels bounds-check the excess threads.
    dim3 dimGrid((M + BLOCK_SIZE - 1) / BLOCK_SIZE, (N + BLOCK_SIZE - 1) / BLOCK_SIZE);
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);

    const int runs = 20;
    for (int i = 0; i < runs; i++) {
        gpu_matrix_transpose<<<dimGrid, dimBlock>>>(matrix, gpu_matrix);
        // gpu_shared_matrix_transpose<<<dimGrid, dimBlock>>>(matrix, gpu_matrix);
        // Launch-configuration errors are only reported through this call.
        check_cuda(cudaGetLastError(), "kernel launch");
    }
    check_cuda(cudaEventRecord(stop_gpu), "cudaEventRecord(stop_gpu)");
    check_cuda(cudaEventSynchronize(stop_gpu), "cudaEventSynchronize(stop_gpu)");
    // Surfaces asynchronous kernel faults and guarantees the device is idle
    // before the host touches the managed arrays.
    check_cuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

    cpu_matrix_transpose(matrix, cpu_matrix);
    check_cuda(cudaEventRecord(stop_cpu), "cudaEventRecord(stop_cpu)");
    check_cuda(cudaEventSynchronize(stop_cpu), "cudaEventSynchronize(stop_cpu)");

    float time_cpu, time_gpu;
    check_cuda(cudaEventElapsedTime(&time_gpu, start, stop_gpu), "cudaEventElapsedTime(gpu)");
    check_cuda(cudaEventElapsedTime(&time_cpu, stop_gpu, stop_cpu), "cudaEventElapsedTime(cpu)");

    // Integer data: compare exactly and stop scanning at the first mismatch.
    bool errors = false;
    for (int y = 0; y < M && !errors; y++) {
        for (int x = 0; x < N; x++) {
            if (cpu_matrix[y][x] != gpu_matrix[y][x]) {
                errors = true;
                break;
            }
        }
    }

    printf("Result: %s \n", errors ? "Errors" : "Pass");
    printf("CPU time: %.2f \n GPU time: %.2f \n", time_cpu, time_gpu / runs);

    check_cuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    check_cuda(cudaEventDestroy(stop_gpu), "cudaEventDestroy(stop_gpu)");
    check_cuda(cudaEventDestroy(stop_cpu), "cudaEventDestroy(stop_cpu)");
}