参考资料
CUDA编程模型系列六(利用shared memory和统一内存优化矩阵乘)_哔哩哔哩_bilibili
代码片段
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

/* Matrix dimensions: a is M x N, b is N x K, c is M x K (row-major, flat arrays). */
#define M 1000
#define N 500
#define K 800

/* Managed (unified) memory: CUDA takes care of allocation and release. */
__managed__ int a[M * N];
__managed__ int b[N * K];
__managed__ int c_gpu[M * K];
__managed__ int c_cpu[M * K];

/* Edge length of the square shared-memory tile and of the thread block. */
#define BLOCK_SIZE 16

/* Abort with file/line and the CUDA error string when a runtime call fails. */
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Tiled integer matrix multiply: c = a * b.
 *
 *   a : m x n, row-major
 *   b : n x k, row-major
 *   c : m x k, row-major (output)
 *
 * Launch requirements:
 *   - blockDim must be (BLOCK_SIZE, BLOCK_SIZE); the shared tiles are indexed
 *     directly with threadIdx.
 *   - the grid must cover k columns in x and m rows in y (ceil-div by BLOCK_SIZE).
 *   - uses 2 * BLOCK_SIZE * BLOCK_SIZE * sizeof(int) bytes of static shared memory.
 *
 * Each thread produces one element c[y][x]. Sizes need not be multiples of
 * BLOCK_SIZE: out-of-range tile entries are loaded as 0.
 */
__global__ void gpu_matrix(int* a, int* b, int* c, int m, int n, int k)
{
    /* Shared-memory tiles of a and b for the current step. */
    __shared__ int sub_a[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ int sub_b[BLOCK_SIZE][BLOCK_SIZE];

    /* Global column (x) and row (y) of the output element owned by this thread.
       The block offset must come from blockIdx, not threadIdx. */
    int x = threadIdx.x + blockDim.x * blockIdx.x;
    int y = threadIdx.y + blockDim.y * blockIdx.y;

    int tmp = 0;

    /* Ceil-div so the trailing partial tile is included when n is not a
       multiple of BLOCK_SIZE. */
    int steps = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;

    for (int step = 0; step < steps; step++)
    {
        /* Load the tile of a: row y (global), columns of the current step. */
        int a_col = step * BLOCK_SIZE + threadIdx.x;
        if (a_col < n && y < m)
        {
            sub_a[threadIdx.y][threadIdx.x] = a[y * n + a_col];
        }
        else
        {
            sub_a[threadIdx.y][threadIdx.x] = 0;
        }

        /* Load the tile of b: rows of the current step, column x (global). */
        int b_row = step * BLOCK_SIZE + threadIdx.y;
        if (x < k && b_row < n)
        {
            sub_b[threadIdx.y][threadIdx.x] = b[b_row * k + x];
        }
        else
        {
            sub_b[threadIdx.y][threadIdx.x] = 0;
        }

        /* Both tiles must be fully written before any thread reads them. */
        __syncthreads();

        for (int i = 0; i < BLOCK_SIZE; i++)
        {
            tmp += sub_a[threadIdx.y][i] * sub_b[i][threadIdx.x];
        }

        /* Do not overwrite the tiles until every thread has finished reading. */
        __syncthreads();
    }

    /* Threads outside the matrix still take part in the loads and barriers
       above, but must not store. */
    if (x < k && y < m)
    {
        c[y * k + x] = tmp;
    }
}

/*
 * Reference host implementation of c = a * b, used to validate the GPU result.
 *
 *   a : m x n, row-major
 *   b : n x k, row-major
 *   c : m x k, row-major (output)
 */
void cpu_matrix(int* a, int* b, int* c, int m, int n, int k)
{
    for (int y = 0; y < m; y++)
    {
        for (int x = 0; x < k; x++)
        {
            int tmp = 0;
            for (int z = 0; z < n; z++)
            {
                tmp += (a[y * n + z] * b[z * k + x]);
            }
            c[y * k + x] = tmp;
        }
    }
}

/*
 * Fills a and b, multiplies them on the GPU and on the CPU, and prints
 * "Pass" when both results agree element-wise, "Error" otherwise.
 */
int main()
{
    /* Initialise the inputs: every element holds its column index modulo 1024. */
    for (int y = 0; y < M; y++)
    {
        for (int x = 0; x < N; x++)
        {
            a[y * N + x] = x % 1024;
        }
    }
    for (int y = 0; y < N; y++)
    {
        for (int x = 0; x < K; x++)
        {
            b[y * K + x] = x % 1024;
        }
    }

    /* One thread per output element; ceil-div so partial tiles are covered. */
    unsigned int grid_x = (K + BLOCK_SIZE - 1) / BLOCK_SIZE;
    unsigned int grid_y = (M + BLOCK_SIZE - 1) / BLOCK_SIZE;
    dim3 dimGrid(grid_x, grid_y);
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);

    gpu_matrix<<<dimGrid, dimBlock>>>(a, b, c_gpu, M, N, K);
    /* Launch-configuration errors are only reported through cudaGetLastError. */
    CUDA_CHECK(cudaGetLastError());
    /* Wait for the kernel before the host touches managed memory again;
       this also surfaces errors raised while the kernel was running. */
    CUDA_CHECK(cudaDeviceSynchronize());

    cpu_matrix(a, b, c_cpu, M, N, K);

    /* Integer results must match exactly. */
    bool errors = false;
    for (int y = 0; y < M; y++)
    {
        for (int x = 0; x < K; x++)
        {
            if (c_cpu[y * K + x] != c_gpu[y * K + x])
            {
                errors = true;
            }
        }
    }

    printf("Result: %s \n", errors ? "Error" : "Pass");
    return 0;
}
小结
1. 我的代码可以在 Windows 平台上运行。
2. 建议还是跟着 UP 主自己动手敲一遍:看懂是一回事,自己写出来又是另一回事。