3.1.1 将通用的function放到header文件里
./common/common.h
#define CHECK(call) \{\const cudaError_t error = call; \if (error != cudaSuccess)\{\printf("Error: %s: %d\n", __FILE__, __LINE__);\printf("code :%d reason :%s\n", error , cudaGetErrorString(error));\exit(1);\}\
}class Timer{cudaEvent_t _start;cudaEvent_t _stop;public:Timer(){cudaEventCreate(&_start);cudaEventCreate(&_stop);}void start(){cudaEventRecord(_start, 0);}void stop(){cudaEventRecord(_stop, 0);cudaEventSynchronize(_stop);}float elapsedms(){float out;cudaEventElapsedTime(&out, _start, _stop);return out;}~Timer(){cudaEventDestroy(_start);cudaEventDestroy(_stop);}
};
然后include
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <windows.h>
#include "../common/common.h"typedef unsigned long DWORD;__global__ void mathKernel1( float *C){int tid = blockIdx.x * blockDim.x + threadIdx.x;float a, b;a = b = 0.0f;if (tid % 2 == 0){a = 100.0f;}else{b = 200.0f;}C[tid] = a + b;
}__global__ void mathKernel2( float *C){int tid = blockIdx.x * blockDim.x + threadIdx.x;float a, b;a = b = 0.0f;if ((tid / warpSize) % 2 == 0){a = 100.0f;}else{b = 200.0f;}C[tid] = a + b;
}__global__ void mathKernel3( float *C){int tid = blockIdx.x * blockDim.x + threadIdx.x;float a, b;a = b = 0.0f;bool ipred = (tid % 2 == 0);if (ipred){a = 100.0f;}if (!ipred){b = 200.0f;}C[tid] = a + b;
}__global__ void warmingup( float *C){int tid = blockIdx.x * blockDim.x + threadIdx.x;float a, b;a = b = 0.0f;if ((tid / warpSize) % 2 == 0){a = 100.0f;}else{b = 200.0f;}C[tid] = a + b;
}int main(int argc , char **argv)
{printf("%s starting\n", argv[0]);int dev = 0;cudaDeviceProp deviceprop;CHECK(cudaGetDeviceProperties(&deviceprop,dev));printf("Using Device %d : %s\n", dev, deviceprop.name);int size = 64;int blocksize = 64;if (argc > 1) blocksize = atoi(argv[1]);if (argc > 2) size = atoi(argv[2]);printf("Data size %d\n", size);dim3 block(blocksize, 1);dim3 grid((size + block.x - 1)/block.x);printf("execution config: %d %d\n", block.x, grid.x);float *d_C;size_t nBytes = size * sizeof(float);cudaMalloc((float**) &d_C, nBytes);Timer timer;timer.start();cudaDeviceSynchronize();warmingup<<<grid,block>>>(d_C);cudaDeviceSynchronize();timer.stop();float elapsedTime = timer.elapsedms();printf("warmup execution time: %f ms\n", elapsedTime);// kernel 1Timer timer1;timer1.start();cudaDeviceSynchronize();mathKernel1<<<grid,block>>>(d_C);cudaDeviceSynchronize();timer1.stop();float elapsedTime1 = timer1.elapsedms();printf("kernel1 execution time: %f ms\n", elapsedTime1);// kernel 2Timer timer2;timer2.start();cudaDeviceSynchronize();mathKernel2<<<grid,block>>>(d_C);cudaDeviceSynchronize();timer2.stop();float elapsedTime2 = timer2.elapsedms();printf("kernel2 execution time: %f ms\n", elapsedTime2);// kernel 3Timer timer3;timer3.start();cudaDeviceSynchronize();mathKernel3<<<grid,block>>>(d_C);cudaDeviceSynchronize();timer3.stop();float elapsedTime3 = timer3.elapsedms();printf("kernel3 execution time: %f ms\n", elapsedTime3);cudaFree(d_C);cudaDeviceReset();return 0;
}
输出时间每次略有差异
Data size 64
execution config: 64 1
warmup execution time: 0.317440 ms
kernel1 execution time: 0.033792 ms
kernel2 execution time: 0.044992 ms
kernel3 execution time: 0.034816 ms
线程束定义:
线程束是SM中基本的执行单元。当一个线程块的网格被启动后,网格中的线程块分布在SM中。一旦线程块被调度到一个SM上,线程块中的线程会被进一步划分为线程束。一个线程束由32个连续的线程组成。
线程束分化:
一个线程束中的所有线程在同一周期中必须执行相同的指令,如果一个线程执行一条指令,那么线程束中的所有线程都必须执行该指令。对于kernel1,一半的线程束需要执行if语句块中的指令,而另一半需要执行else语句块中的指令。在同一线程束中的线程执行不同的指令,被称为线程束分化。
kernel1 和 kernel2的区别:
tid % 2 == 0是在单个线程束内进行操作,它会交替选择线程束中的线程。
(tid / warpSize) % 2 == 0是在线程束级别进行操作,它会选择所有偶数索引的线程束中的所有线程。
例如,假设warpSize是32:
对于tid % 2 == 0,满足条件的线程ID将是0, 2, 4, …, 30, 32, 34, …, 62等。
对于(tid / warpSize) % 2 == 0,满足条件的线程ID将是0-31, 64-95, 128-159等,即每个偶数索引的线程束中的所有32个线程。
通过NVIDIA Nsight Compute (NCU) 来查看和分析线程束分化
ncu --launch-skip=0 --launch-count=4 .\simple_divergence2.exe
--launch-skip=0表示不跳过任何内核启动、从第一次启动就开始分析,--launch-count=4表示共分析前四次内核启动(与上面命令中的参数一致,正好覆盖warmup和三个kernel)。
结果, 还不知道如何看: