Cuda编程——使用share memory优化矩阵乘法

在上一篇文章：第一个Cuda程序，矩阵相乘代码，我们设计了一种并行的矩阵乘法程序，效果和使用CPU计算的一样，但时间有了很大的降低，然而，这只是最基本的一种方法，事实上我们完全可以让程序变得更快！

仔细看看，会发现我们使用的是global memory，而share memory的访问速度要远远大于global memory，所以我们将使用share memory优化矩阵乘法，让程序更快！

#include <stdio.h>#include<stdlib.h>#include <cuda.h>#include <cuda_runtime.h>#include <device_launch_parameters.h>#include <device_functions.h>
# define BLOCK_SIZE 8# define M 6# define N 8# define K 6__managed__ float a[M * N];__managed__ float b[N*K];__managed__ float c_gpu[M * K];__managed__ float c_cpu[M * K];__global__ void gpu_matrix(float* a, float* b, float* c, const int m, const int n, const int k){  __shared__ float sub_a[BLOCK_SIZE][BLOCK_SIZE];  __shared__ float sub_b[BLOCK_SIZE][BLOCK_SIZE];  int x = threadIdx.x + blockIdx.x * blockDim.x;  int y = threadIdx.y + blockIdx.y * blockDim.y;  float temp = 0.0;  int step, i;  for (step = 0; step <N / BLOCK_SIZE; step++)  {    if ((step * BLOCK_SIZE + threadIdx.x) >= N || y >= M)    {      sub_a[threadIdx.y][threadIdx.x] = 0.0;    }    else    {      sub_a[threadIdx.y][threadIdx.x] = a[y * N + (step * BLOCK_SIZE + threadIdx.x)];    }
    if ((step * BLOCK_SIZE + threadIdx.y) >= N || x >= K)    {      sub_b[threadIdx.y][threadIdx.x] = 0.0;
    }    else    {      sub_b[threadIdx.y][threadIdx.x] = b[(step * BLOCK_SIZE + threadIdx.y) * K + x];    }    __syncthreads();    for (i = 0; i < BLOCK_SIZE; i++)    {      temp = temp + sub_a[threadIdx.y][i] * sub_b[i][threadIdx.x];    }    __syncthreads();
    if (x < K && y < M)    {      c[y * K + x] = temp;    }  }}
void cpu_matrix(float* a, float* b, float* c, const int m, const int n, const int k){  int y, x, step;  float temp;  for (y = 0; y < M; y++)  {    for (x = 0; x < K; x++)    {      temp = 0;      for (step = 0; step < N; step++)      {        temp += a[y * N + step] * b[step * K + x];      }      c[y * K + x] = temp;    }  }}
int main(){  int x,y;  float item1;  //初始化矩阵  for (y = 0; y < M; y++)  {    for (x = 0; x < N; x++)    {      item1 = x + y;      a[y * N + x] = item1;
    }
  }  for (y = 0; y < N; y++)  {    for (x = 0; x < K; x++)    {      item1 = x + y;      b[y * K + x] = item1;
    }  }  printf("-----------------两个矩阵-#--------------\n");  for (y = 0; y < M; y++)  {
    for (x = 0; x < N; x++)    {      printf("%f ", a[y * N + x]);    }    printf("\n");  }  for (y = 0; y < N; y++)  {    for (x = 0; x < K; x++)    {      printf("%f  ", b[y * K + x]);    }    printf("\n");  }  unsigned int grid_rows = (M + BLOCK_SIZE - 1) / BLOCK_SIZE;  unsigned int grid_cols = (K + BLOCK_SIZE - 1) / BLOCK_SIZE;
  dim3 dimGrid(grid_rows, grid_cols);  dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
  gpu_matrix <<<dimGrid, dimBlock>>> (a, b, c_gpu, M, N, K);
  cudaDeviceSynchronize();
  cpu_matrix(a, b, c_cpu, M, N, K);  //  //打印cpu计算结果  printf("---------------cpu计算结果---------#------------------\n");
  for (y = 0; y < M; y++)  {    for (x = 0; x < K; x++)    {      printf("%f  ", c_cpu[y * K + x]);    }    printf("\n");
  }
  printf("------------------gpu计算结果------#------------------\n");  //打印GPU计算结果  for (y = 0; y < M; y++)  {    for (x = 0; x < K; x++)    {      printf("%f  ", c_gpu[y * K + x]);    }    printf("\n");  }  return 0;}

-----------------两个矩阵-#--------------0.000000 1.000000 2.000000 3.000000 4.000000 5.000000 6.000000 7.0000001.000000 2.000000 3.000000 4.000000 5.000000 6.000000 7.000000 8.0000002.000000 3.000000 4.000000 5.000000 6.000000 7.000000 8.000000 9.0000003.000000 4.000000 5.000000 6.000000 7.000000 8.000000 9.000000 10.0000004.000000 5.000000 6.000000 7.000000 8.000000 9.000000 10.000000 11.0000005.000000 6.000000 7.000000 8.000000 9.000000 10.000000 11.000000 12.0000000.000000  1.000000  2.000000  3.000000  4.000000  5.0000001.000000  2.000000  3.000000  4.000000  5.000000  6.0000002.000000  3.000000  4.000000  5.000000  6.000000  7.0000003.000000  4.000000  5.000000  6.000000  7.000000  8.0000004.000000  5.000000  6.000000  7.000000  8.000000  9.0000005.000000  6.000000  7.000000  8.000000  9.000000  10.0000006.000000  7.000000  8.000000  9.000000  10.000000  11.0000007.000000  8.000000  9.000000  10.000000  11.000000  12.000000---------------cpu计算结果---------#------------------140.000000  168.000000  196.000000  224.000000  252.000000  280.000000168.000000  204.000000  240.000000  276.000000  312.000000  348.000000196.000000  240.000000  284.000000  328.000000  372.000000  416.000000224.000000  276.000000  328.000000  380.000000  432.000000  484.000000252.000000  312.000000  372.000000  432.000000  492.000000  552.000000280.000000  348.000000  416.000000  484.000000  552.000000  620.000000------------------gpu计算结果------#------------------140.000000  168.000000  196.000000  224.000000  252.000000  280.000000168.000000  204.000000  240.000000  276.000000  312.000000  348.000000196.000000  240.000000  284.000000  328.000000  372.000000  416.000000224.000000  276.000000  328.000000  380.000000  432.000000  484.000000252.000000  312.000000  372.000000  432.000000  492.000000  552.000000280.000000  348.000000  416.000000  484.000000  552.000000  620.000000