共享内存(Shared Memory)
1.是一种低延迟、高带宽的片上内存
2.由同一个Block内的所有线程共享
3.生命周期与Block相同
4.访问速度比全局内存快约100倍
Block(线程块)
1.GPU把任务调度到SM上的基本单位,包含一组线程(线程在SM上实际以warp为单位执行)
2.多个Block组成Grid(网格)
3.Block内的线程可以通过共享内存通信
4.Block之间是独立执行的
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <cuda_runtime.h>

/*
 * CHECK(call): evaluate a CUDA runtime call and abort the program with a
 * diagnostic (file, line, error code, error string) if it did not return
 * cudaSuccess. Wrapped in do { } while (0) so that `CHECK(x);` behaves as a
 * single statement, e.g. inside an unbraced if/else.
 */
#define CHECK(call)                                                        \
    do {                                                                   \
        const cudaError_t error = (call);                                  \
        if (error != cudaSuccess) {                                        \
            printf("Error: %s: %d\n", __FILE__, __LINE__);                 \
            printf("code :%d reason :%s\n", error,                         \
                   cudaGetErrorString(error));                             \
            exit(1);                                                       \
        }                                                                  \
    } while (0)

/*
 * Shared-memory tile shape used by printThreadIndex: BM rows (y) by BN
 * columns (x). The kernel must be launched with blockDim == (BN, BM); main()
 * builds its launch configuration from these same constants so the two
 * cannot drift apart.
 */
const int BM = 2;
const int BN = 4;

/* Fill ip[0..size-1] with the values 0, 1, ..., size-1. */
void initialInt(int *ip, int size)
{
    for (int i = 0; i < size; i++) {
        ip[i] = i;
    }
}

/*
 * Print a row-major nx-by-ny integer matrix to stdout, one row per line.
 *   C  : pointer to nx * ny ints, row-major (element (ix, iy) at C[iy*nx + ix])
 *   nx : number of columns
 *   ny : number of rows
 */
void printMatrix(int *C, const int nx, const int ny)
{
    int *ic = C;
    printf("\n matrix : (%d, %d)\n", nx, ny);
    for (int iy = 0; iy < ny; iy++) {
        for (int ix = 0; ix < nx; ix++) {
            printf("%3d", ic[ix]);
        }
        ic += nx; /* advance to the next row */
        printf("\n");
    }
    printf("\n");
}

/*
 * Demo kernel: every thread copies its element of A into a per-block
 * shared-memory tile, then prints its indices together with smem[0][0],
 * i.e. the value stored by thread (0, 0) of the same block.
 *
 *   A      : device pointer to an nx * ny row-major int matrix
 *   nx, ny : matrix width (columns) and height (rows)
 *
 * Launch requirements: 2D grid of 2D blocks with blockDim == (BN, BM).
 * Uses BM * BN floats of static shared memory per block.
 */
__global__ void printThreadIndex(int *A, const int nx, const int ny)
{
    const int tx = threadIdx.x;
    const int ty = threadIdx.y;
    const int ix = threadIdx.x + blockIdx.x * blockDim.x;
    const int iy = threadIdx.y + blockIdx.y * blockDim.y;
    const unsigned int idx = iy * nx + ix;

    /* Guards: the grid may overhang the matrix, and the tile has a fixed
     * size, so neither global nor shared memory may be indexed blindly. */
    const bool inMatrix = (ix < nx) && (iy < ny);
    const bool inTile = (tx < BN) && (ty < BM);

    __shared__ float smem[BM][BN];
    if (inTile) {
        /* Out-of-matrix threads store 0 so the tile is fully initialized. */
        smem[ty][tx] = inMatrix ? float(A[idx]) : 0.0f;
    }

    /* Block-wide barrier: smem[0][0] is written by thread (0, 0) but read by
     * every thread below. It is placed outside any branch so that all
     * threads of the block reach it. */
    __syncthreads();

    if (!inMatrix || !inTile) {
        return;
    }

    /* Printing smem[ty][tx] instead would show each thread's own value. */
    printf("threadidx: (%d ,%d) blockidx:(%d ,%d) coordinate: (%d ,%d) global index: (%2d ival %2d), smem val (%f) \n",
           threadIdx.x, threadIdx.y,
           blockIdx.x, blockIdx.y,
           ix, iy,
           idx, A[idx],
           smem[0][0]);
}

/*
 * Host driver: builds an 8x6 int matrix holding 0..47, copies it to the
 * device and launches printThreadIndex with one (BN, BM) block per tile.
 * Every CUDA runtime call and the kernel launch are checked via CHECK.
 */
int main(int argc, char **argv)
{
    (void)argc; /* unused */
    printf("%s starting\n", argv[0]);

    int dev = 0;
    cudaDeviceProp deviceprop;
    CHECK(cudaGetDeviceProperties(&deviceprop, dev));
    printf("Using Device %d : %s\n", dev, deviceprop.name);
    CHECK(cudaSetDevice(dev));

    /* set matrix dimensions */
    int nx = 8;
    int ny = 6;
    int nxy = nx * ny;
    /* The buffers hold int elements, so size them with sizeof(int). */
    size_t nBytes = (size_t)nxy * sizeof(int);

    /* allocate and initialize host memory */
    int *h_A = (int *)malloc(nBytes);
    if (h_A == NULL) {
        fprintf(stderr, "host malloc of %zu bytes failed\n", nBytes);
        return EXIT_FAILURE;
    }
    initialInt(h_A, nxy);
    printMatrix(h_A, nx, ny);

    /* allocate device memory and upload the matrix */
    int *d_MatA = NULL;
    CHECK(cudaMalloc((void **)&d_MatA, nBytes));
    CHECK(cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice));

    /* Block shape must equal the kernel's shared-memory tile shape; the grid
     * is the ceiling division of the matrix by the block. */
    dim3 block(BN, BM);
    dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);
    printf("execution config grid (%d, %d), block (%d, %d)\n", grid.x, grid.y, block.x, block.y);

    printThreadIndex<<<grid, block>>>(d_MatA, nx, ny);
    CHECK(cudaGetLastError());      /* launch-configuration errors */
    CHECK(cudaDeviceSynchronize()); /* in-kernel errors; also flushes device printf */

    CHECK(cudaFree(d_MatA));
    free(h_A);
    CHECK(cudaDeviceReset());
    return 0;
}
示例代码中,block的大小是(4,2)(x方向4个线程、y方向2个线程),所以在核函数中声明2行4列的SMEM(即 smem[2][4],按 [ty][tx] 索引);每个线程只需要一次load操作,8个线程就会把各自的数据load进这块SMEM里。
在做printf的时候,因为SMEM对同一个block内的所有线程可见,所以访问 smem[0][0] 打印出来的都是block里第一个线程(threadIdx为(0,0))load进去的数据。注意:读取其他线程写入的共享内存之前,应先调用 __syncthreads() 做块内同步,否则存在数据竞争。
输出如下:
 matrix : (8, 6)
  0  1  2  3  4  5  6  7
  8  9 10 11 12 13 14 15
 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31
 32 33 34 35 36 37 38 39
 40 41 42 43 44 45 46 47

execution config grid (2, 3), block (4, 2)
threadidx: (0 ,0) blockidx:(0 ,1) coordinate: (0 ,2) global index: (16 ival 16), smem val (16.000000)
threadidx: (1 ,0) blockidx:(0 ,1) coordinate: (1 ,2) global index: (17 ival 17), smem val (16.000000)
threadidx: (2 ,0) blockidx:(0 ,1) coordinate: (2 ,2) global index: (18 ival 18), smem val (16.000000)
threadidx: (3 ,0) blockidx:(0 ,1) coordinate: (3 ,2) global index: (19 ival 19), smem val (16.000000)
threadidx: (0 ,1) blockidx:(0 ,1) coordinate: (0 ,3) global index: (24 ival 24), smem val (16.000000)
threadidx: (1 ,1) blockidx:(0 ,1) coordinate: (1 ,3) global index: (25 ival 25), smem val (16.000000)
threadidx: (2 ,1) blockidx:(0 ,1) coordinate: (2 ,3) global index: (26 ival 26), smem val (16.000000)
threadidx: (3 ,1) blockidx:(0 ,1) coordinate: (3 ,3) global index: (27 ival 27), smem val (16.000000)
threadidx: (0 ,0) blockidx:(1 ,1) coordinate: (4 ,2) global index: (20 ival 20), smem val (20.000000)
threadidx: (1 ,0) blockidx:(1 ,1) coordinate: (5 ,2) global index: (21 ival 21), smem val (20.000000)
threadidx: (2 ,0) blockidx:(1 ,1) coordinate: (6 ,2) global index: (22 ival 22), smem val (20.000000)
threadidx: (3 ,0) blockidx:(1 ,1) coordinate: (7 ,2) global index: (23 ival 23), smem val (20.000000)
threadidx: (0 ,1) blockidx:(1 ,1) coordinate: (4 ,3) global index: (28 ival 28), smem val (20.000000)
threadidx: (1 ,1) blockidx:(1 ,1) coordinate: (5 ,3) global index: (29 ival 29), smem val (20.000000)
threadidx: (2 ,1) blockidx:(1 ,1) coordinate: (6 ,3) global index: (30 ival 30), smem val (20.000000)
threadidx: (3 ,1) blockidx:(1 ,1) coordinate: (7 ,3) global index: (31 ival 31), smem val (20.000000)
threadidx: (0 ,0) blockidx:(1 ,0) coordinate: (4 ,0) global index: ( 4 ival 4), smem val (4.000000)
threadidx: (1 ,0) blockidx:(1 ,0) coordinate: (5 ,0) global index: ( 5 ival 5), smem val (4.000000)
threadidx: (2 ,0) blockidx:(1 ,0) coordinate: (6 ,0) global index: ( 6 ival 6), smem val (4.000000)
threadidx: (3 ,0) blockidx:(1 ,0) coordinate: (7 ,0) global index: ( 7 ival 7), smem val (4.000000)
threadidx: (0 ,1) blockidx:(1 ,0) coordinate: (4 ,1) global index: (12 ival 12), smem val (4.000000)
threadidx: (1 ,1) blockidx:(1 ,0) coordinate: (5 ,1) global index: (13 ival 13), smem val (4.000000)
threadidx: (2 ,1) blockidx:(1 ,0) coordinate: (6 ,1) global index: (14 ival 14), smem val (4.000000)
threadidx: (3 ,1) blockidx:(1 ,0) coordinate: (7 ,1) global index: (15 ival 15), smem val (4.000000)
threadidx: (0 ,0) blockidx:(0 ,2) coordinate: (0 ,4) global index: (32 ival 32), smem val (32.000000)
threadidx: (1 ,0) blockidx:(0 ,2) coordinate: (1 ,4) global index: (33 ival 33), smem val (32.000000)
threadidx: (2 ,0) blockidx:(0 ,2) coordinate: (2 ,4) global index: (34 ival 34), smem val (32.000000)
threadidx: (3 ,0) blockidx:(0 ,2) coordinate: (3 ,4) global index: (35 ival 35), smem val (32.000000)
threadidx: (0 ,1) blockidx:(0 ,2) coordinate: (0 ,5) global index: (40 ival 40), smem val (32.000000)
threadidx: (1 ,1) blockidx:(0 ,2) coordinate: (1 ,5) global index: (41 ival 41), smem val (32.000000)
threadidx: (2 ,1) blockidx:(0 ,2) coordinate: (2 ,5) global index: (42 ival 42), smem val (32.000000)
threadidx: (3 ,1) blockidx:(0 ,2) coordinate: (3 ,5) global index: (43 ival 43), smem val (32.000000)
threadidx: (0 ,0) blockidx:(0 ,0) coordinate: (0 ,0) global index: ( 0 ival 0), smem val (0.000000)
threadidx: (1 ,0) blockidx:(0 ,0) coordinate: (1 ,0) global index: ( 1 ival 1), smem val (0.000000)
threadidx: (2 ,0) blockidx:(0 ,0) coordinate: (2 ,0) global index: ( 2 ival 2), smem val (0.000000)
threadidx: (3 ,0) blockidx:(0 ,0) coordinate: (3 ,0) global index: ( 3 ival 3), smem val (0.000000)
threadidx: (0 ,1) blockidx:(0 ,0) coordinate: (0 ,1) global index: ( 8 ival 8), smem val (0.000000)
threadidx: (1 ,1) blockidx:(0 ,0) coordinate: (1 ,1) global index: ( 9 ival 9), smem val (0.000000)
threadidx: (2 ,1) blockidx:(0 ,0) coordinate: (2 ,1) global index: (10 ival 10), smem val (0.000000)
threadidx: (3 ,1) blockidx:(0 ,0) coordinate: (3 ,1) global index: (11 ival 11), smem val (0.000000)
threadidx: (0 ,0) blockidx:(1 ,2) coordinate: (4 ,4) global index: (36 ival 36), smem val (36.000000)
threadidx: (1 ,0) blockidx:(1 ,2) coordinate: (5 ,4) global index: (37 ival 37), smem val (36.000000)
threadidx: (2 ,0) blockidx:(1 ,2) coordinate: (6 ,4) global index: (38 ival 38), smem val (36.000000)
threadidx: (3 ,0) blockidx:(1 ,2) coordinate: (7 ,4) global index: (39 ival 39), smem val (36.000000)
threadidx: (0 ,1) blockidx:(1 ,2) coordinate: (4 ,5) global index: (44 ival 44), smem val (36.000000)
threadidx: (1 ,1) blockidx:(1 ,2) coordinate: (5 ,5) global index: (45 ival 45), smem val (36.000000)
threadidx: (2 ,1) blockidx:(1 ,2) coordinate: (6 ,5) global index: (46 ival 46), smem val (36.000000)
threadidx: (3 ,1) blockidx:(1 ,2) coordinate: (7 ,5) global index: (47 ival 47), smem val (36.000000)