OpenCL 并行求和

上周尝试用 OpenCL 求极大值，在网上查到的例子大多是求和，即所谓的 reduction（归约）算法，不过两者的思路是一样的。

CPP：

　　 int err = 0;unsigned long int nNumCount = 102400000;int nLocalSize = 256;int nGroupSize = 102400;int nGroup = nGroupSize / nLocalSize;int* pArray = new int[nNumCount];unsigned long int nReal = 0;int nStart = GetTickCount();for (int i=0;i<nNumCount;++i){pArray[i] = i*2;nReal += pArray[i];}cout<<GetTickCount() - nStart<<endl;cl_mem clmemArray = clCreateBuffer(context,CL_MEM_READ_WRITE,sizeof(int) * nNumCount,NULL,NULL);err = clEnqueueWriteBuffer(queue,clmemArray,CL_TRUE,0,sizeof(int)*nNumCount,pArray,0,0,0);cl_mem clmemRes  = clCreateBuffer(context,CL_MEM_READ_WRITE,sizeof(int) * nGroup,NULL,NULL);nStart = GetTickCount();err = clSetKernelArg(m_KerCalcRay,0,sizeof(cl_mem),&clmemArray);err = clSetKernelArg(m_KerCalcRay,1,sizeof(cl_mem),&clmemRes);err = clSetKernelArg(m_KerCalcRay,2,sizeof(int)*nLocalSize,0);err = clSetKernelArg(m_KerCalcRay,3,sizeof(int),&nNumCount);size_t localws[1] = {nLocalSize};size_t globalws[1] = {nGroupSize};err = clEnqueueNDRangeKernel(queue,m_KerCalcRay,1,NULL,globalws,localws,0,NULL,NULL);clFinish(queue);int* pRes = new int[nGroup];err = clEnqueueReadBuffer(queue,clmemRes,CL_TRUE,0,sizeof(int)*nGroup,pRes,0,0,0);clFinish(queue);unsigned long int nRes = 0;for(int i=0;i<nGroup;++i){nRes += pRes[i];}
　　assert（nRes == nReal）;

kernel:

/*
 * ReduceSum - compute one partial sum of `num` per work-group.
 *
 * num    : input array holding at least nCount ints.
 * res    : output array with one slot per work-group; res[group id] receives
 *          that group's partial sum.
 * pData  : __local scratch array with at least get_local_size(0) ints.
 * nCount : number of elements of `num` to sum; values <= 0 are treated as an
 *          empty input (every group then writes 0).
 *
 * Sums are accumulated in `int`, so a total outside the range of int is not
 * representable in the result. The local size may be any positive value; it
 * does not have to be a power of two.
 */
__kernel void ReduceSum(__global int* num, __global int* res, __local int* pData, int nCount)
{
    const size_t tid        = get_local_id(0);
    const size_t bid        = get_group_id(0);
    const size_t localSize  = get_local_size(0);
    const size_t globalSize = get_global_size(0);

    /* Clamp first: comparing an unsigned index against a negative int would
       turn the bound into a huge value and read far outside `num`. */
    const size_t count = (nCount > 0) ? (size_t)nCount : 0;

    /* Phase 1: each work-item sums the elements at its global id, then every
       globalSize-th element after that. */
    int nRes = 0;
    for (size_t gid = get_global_id(0); gid < count; gid += globalSize) {
        nRes += num[gid];
    }
    pData[tid] = nRes;
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Phase 2: reduce the work-group's values in local memory. Each round folds
       the upper part of the active range onto the lower part. Rounding `half`
       up keeps the middle element when the range has odd length, so no value
       is lost for local sizes that are not a power of two. */
    for (size_t active = localSize; active > 1; ) {
        const size_t half = (active + 1) >> 1;
        if (tid + half < active) {
            pData[tid] += pData[tid + half];
        }
        /* Reached by every work-item in the group on every round. */
        barrier(CLK_LOCAL_MEM_FENCE);
        active = half;
    }

    /* Work-item 0 publishes the group's partial sum. */
    if (tid == 0) {
        res[bid] = pData[0];
    }
}

Reduction 求和是这样一种方法：比如 8 个数依次存放在下标 0 到 7 中，第一轮把下标 0 和 4、1 和 5、2 和 6、3 和 7 两两相加，结果分别放到下标 0、1、2、3 中（然后用 barrier(CLK_LOCAL_MEM_FENCE) 同步一次）。第二轮把 0 和 2、1 和 3 相加，结果放到下标 0、1 中。如此往复，最终结果就放到下标 0 中啦。

另：我试过用循环展开来减少同步次数，不过效率提升微乎其微。

转载于:https://www.cnblogs.com/slean/p/3770094.html

本文来自互联网用户投稿，该文观点仅代表作者本人，不代表本站立场。本站仅提供信息存储空间服务，不拥有所有权，不承担相关法律责任。如若转载，请注明出处：http://www.mzph.cn/news/575415.shtml

如若内容造成侵权/违法违规/事实不符，请联系多彩编程网进行投诉反馈email:809451989@qq.com，一经查实，立即删除！