1、算法简述

　　实现矩阵相加：Cn = An + Bn。这个例子虽然很简单，但是由于矩阵元素之间相互独立，每个元素可以非常容易地进行并行计算，可以非常理想地在OpenCL中实现。

2. C/C++实现

[cpp] view plaincopyprint?

/*
* This confidential and proprietary software may be used only as
* authorised by a licensing agreement from ARM Limited
* (C) COPYRIGHT 2013 ARM Limited
* ALL RIGHTS RESERVED
* The entire notice above must be reproduced on all authorised
* copies and copies may only be made to the extent permitted
* by a licensing agreement from ARM Limited.
*/
#include <iostream>
using namespace std;
/**
 * \brief Element-wise addition of two integer arrays in plain C/C++.
 * \details Allocates two input arrays and one output array on the heap, fills both
 * inputs so that element k holds the value k, and stores the per-element sum in the
 * output array. No OpenCL is involved; everything runs on the CPU.
 * \return The exit code of the application (0 on normal completion).
 */
int main(void)
{
    /* [Setup memory] */
    /* How many elements each of the three arrays holds. */
    int arraySize = 1000000;
    /* Heap storage for the two operands and for the result. */
    int* inputA = new int[arraySize];
    int* inputB = new int[arraySize];
    int* output = new int[arraySize];
    /* [Setup memory] */

    /* Give both operands identical contents: element k holds the value k. */
    int fillIndex = 0;
    while (fillIndex < arraySize)
    {
        inputB[fillIndex] = fillIndex;
        inputA[fillIndex] = fillIndex;
        ++fillIndex;
    }

    /* [C/C++ Implementation] */
    /* Sequential sum: one element per loop iteration. */
    int sumIndex = 0;
    while (sumIndex < arraySize)
    {
        output[sumIndex] = inputA[sumIndex] + inputB[sumIndex];
        ++sumIndex;
    }
    /* [C/C++ Implementation] */

    /* Uncomment the following block to print results. */
    /*
    for (int i = 0; i < arraySize; i++)
    {
        cout << "i = " << i << ", output = " << output[i] << "\n";
    }
    */

    /* Give the heap storage back, in the order it was acquired. */
    delete[] inputA;
    delete[] inputB;
    delete[] output;

    return 0;
}

3 OpenCL基本实现

3.1 内核代码实现

　　内核代码的实现如下，其中指针的修饰符restrict是C99中的关键字，只能用于限定指针。该关键字用于告知编译器：所有修改该指针所指向内容的操作都是通过该指针进行的，即不存在其它进行修改操作的途径。这样做的好处是帮助编译器进行更好的代码优化，生成更有效率的汇编代码。

[cpp] view plaincopyprint?

/*
* This confidential and proprietary software may be used only as
* authorised by a licensing agreement from ARM Limited
* (C) COPYRIGHT 2013 ARM Limited
* ALL RIGHTS RESERVED
* The entire notice above must be reproduced on all authorised
* copies and copies may only be made to the extent permitted
* by a licensing agreement from ARM Limited.
*/
/**
 * \brief Element-wise integer addition kernel.
 * \param[in] inputA First input array.
 * \param[in] inputB Second input array.
 * \param[out] output Output array; element i receives inputA[i] + inputB[i].
 */
/* [OpenCL Implementation] */
__kernel void hello_world_opencl(__global int* restrict inputA,
                                 __global int* restrict inputB,
                                 __global int* restrict output)
{
    /*
     * Each kernel instance handles exactly one element, selected by its global ID.
     * With a global work size of n (set by clEnqueueNDRangeKernel), n instances run
     * and the ID covers the range [0, n - 1].
     */
    const int element = get_global_id(0);

    /* The same index addresses both operands and the result. */
    const int sum = inputA[element] + inputB[element];
    output[element] = sum;
}
/* [OpenCL Implementation] */

3.2 宿主机代码实现

　　内核代码中并没有循环语句，每个内核实例只计算一个矩阵元素的值。每一个实例通过get_global_id(0)获得一个独一无二的索引，所以需要运行的内核实例数目等同于矩阵元素个数。

[cpp] view plaincopyprint?

/*
 * Each instance of our OpenCL kernel operates on a single element of each array so the number of
 * instances needed is the number of elements in the array.
 * arraySize is a signed cl_int, so it is converted explicitly: an implicit signed-to-unsigned
 * conversion inside a braced initializer is a narrowing conversion, which C++11 rejects.
 */
size_t globalWorksize[1] = {static_cast<size_t>(arraySize)};
/* Enqueue the kernel over a 1-dimensional range; the local work size is left to the implementation (NULL). */
if (!checkSuccess(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, globalWorksize, NULL, 0, NULL, &event)))
{
    cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjects, numberOfMemoryObjects);
    cerr << "Failed enqueuing the kernel. " << __FILE__ << ":"<< __LINE__ << endl;
    return 1;
}

　　因为我们并没有设置内核间的依赖性，OpenCL设备可以用并行的方式自由地运行内核实例。现在并行化上的唯一限制是设备的容量。在前面的代码运行之前，需要建立OpenCL，下面分别介绍与建立OpenCL相关的各项内容。

　　因为现在的计算是在GPU而不是CPU上进行的，我们需要清楚所使用的数据位于何处。知道数据是在GPU内存空间还是CPU内存空间是非常重要的。在桌面系统中，GPU和CPU有它们各自的内存空间，二者被速率相对较低的总线分开，这意味着在GPU和CPU之间共享数据是一个代价高昂的操作。在大多数带Mali-T600系列GPU的嵌入式系统中，GPU和CPU共享同一块内存，因此可以以相对低的代价在GPU和CPU之间共享内存。

　　由于这些系统的差异，OpenCL支持多种分配和共享设备间内存的方式。下面是一种共享设备间内存的方式，目的是减少从一个设备到另一个设备的内存拷贝(在一个共享内存系统中)。

a. 要求OpenCL设备分配内存

　　在C/C++实现中，我们使用数组来分配内存。

[cpp] view plaincopyprint?

/* Element count shared by the two input arrays and the output array. */
int arraySize = 1000000;
/* Plain heap arrays: two operands and one result. */
int *inputA = new int[arraySize];
int *inputB = new int[arraySize];
int *output = new int[arraySize];

　　在OpenCL中，我们使用内存缓冲区。内存缓冲区其实是一定大小的内存块。为了分配缓冲区，我们如下做：

[cpp] view plaincopyprint?

/* Number of elements in the arrays of input and output data. */
cl_int arraySize = 1000000;
/* The buffers are the size of the arrays. */
size_t bufferSize = arraySize * sizeof(cl_int);
/*
* Ask the OpenCL implementation to allocate buffers for the data.
* We ask the OpenCL implemenation to allocate memory rather than allocating
* it on the CPU to avoid having to copy the data later.
* The read/write flags relate to accesses to the memory from within the kernel.
*/
bool createMemoryObjectsSuccess = true;
memoryObjects[0] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, bufferSize, NULL, &errorNumber);
createMemoryObjectsSuccess &= checkSuccess(errorNumber);
memoryObjects[1] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, bufferSize, NULL, &errorNumber);
createMemoryObjectsSuccess &= checkSuccess(errorNumber);
memoryObjects[2] = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, bufferSize, NULL, &errorNumber);
createMemoryObjectsSuccess &= checkSuccess(errorNumber);
if (!createMemoryObjectsSuccess)
{
cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjects, numberOfMemoryObjects);
cerr << "Failed to create OpenCL buffer. " << __FILE__ << ":"<< __LINE__ << endl;
return 1;
}

　　尽管这看上去更加复杂，但其实这里只有三个OpenCL API调用。唯一的区别是这里我们检查错误(这是一个好的做法)，而C++中并不用做。

b. 映射内存到局部指针

　　现在内存已分配，但是只有OpenCL实现知道它的位置。为了访问CPU上的内存，我们把它们映射到一个指针。

[cpp] view plaincopyprint?

/* Map the memory buffers created by the OpenCL implementation to pointers so we can access them on the CPU. */
bool mapMemoryObjectsSuccess = true;
cl_int* inputA = (cl_int*)clEnqueueMapBuffer(commandQueue, memoryObjects[0], CL_TRUE, CL_MAP_WRITE, 0, bufferSize, 0, NULL, NULL, &errorNumber);
mapMemoryObjectsSuccess &= checkSuccess(errorNumber);
cl_int* inputB = (cl_int*)clEnqueueMapBuffer(commandQueue, memoryObjects[1], CL_TRUE, CL_MAP_WRITE, 0, bufferSize, 0, NULL, NULL, &errorNumber);
mapMemoryObjectsSuccess &= checkSuccess(errorNumber);
if (!mapMemoryObjectsSuccess)
{
cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjects, numberOfMemoryObjects);
cerr << "Failed to map buffer. " << __FILE__ << ":"<< __LINE__ << endl;
return 1;
}

　　现在这些指针可以像普通的C/C++指针那样使用了。

c. 在CPU上初始化数据

　　因为我们已有了指向内存的指针，这一步与在CPU上一样。

[cpp] view plaincopyprint?

for (int i = 0; i < arraySize; i++)
{
inputA[i] = i;
inputB[i] = i;
}

d. 取消映射缓冲区

　　为了使OpenCL设备使用缓冲区，我们必须把它们在CPU上的映射取消。

[cpp] view plaincopyprint?

/*
* Unmap the memory objects as we have finished using them from the CPU side.
* We unmap the memory because otherwise:
* - reads and writes to that memory from inside a kernel on the OpenCL side are undefined.
* - the OpenCL implementation cannot free the memory when it is finished.
*/
if (!checkSuccess(clEnqueueUnmapMemObject(commandQueue, memoryObjects[0], inputA, 0, NULL, NULL)))
{
cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjects, numberOfMemoryObjects);
cerr << "Unmapping memory objects failed " << __FILE__ << ":"<< __LINE__ << endl;
return 1;
}
if (!checkSuccess(clEnqueueUnmapMemObject(commandQueue, memoryObjects[1], inputB, 0, NULL, NULL)))
{
cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjects, numberOfMemoryObjects);
cerr << "Unmapping memory objects failed " << __FILE__ << ":"<< __LINE__ << endl;
return 1;
}

e. 映射数据到内核

　　在我们调度内核运行之前，我们必须告诉内核哪些数据作为输入使用。这里，我们映射内存对象到OpenCL内核函数的参数中。

[cpp] view plaincopyprint?

bool setKernelArgumentsSuccess = true;
setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, 0, sizeof(cl_mem), &memoryObjects[0]));
setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, 1, sizeof(cl_mem), &memoryObjects[1]));
setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, 2, sizeof(cl_mem), &memoryObjects[2]));
if (!setKernelArgumentsSuccess)
{
cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjects, numberOfMemoryObjects);
cerr << "Failed setting OpenCL kernel arguments. " << __FILE__ << ":"<< __LINE__ << endl;
return 1;
}

f. 运行内核

　　内核代码见前面的3.1节，将内核加入命令队列运行的代码见3.2节开头，这里不再详述。

g. 获取运行结果

　　一旦计算结束，我们像映射输入缓冲区那样映射输出缓冲区。然后，我们就可以使用指针读取结果数据，然后取消缓冲区映射，就像前面那样。

　　基本实现的宿主机的完整代码如下：

[cpp] view plaincopyprint?

/*
* This confidential and proprietary software may be used only as
* authorised by a licensing agreement from ARM Limited
* (C) COPYRIGHT 2013 ARM Limited
* ALL RIGHTS RESERVED
* The entire notice above must be reproduced on all authorised
* copies and copies may only be made to the extent permitted
* by a licensing agreement from ARM Limited.
*/
#include "common.h"
#include "image.h"
#include <CL/cl.h>
#include <iostream>
using namespace std;
/**
* \brief Basic integer array addition implemented in OpenCL.
* \details A sample which shows how to add two integer arrays and store the result in a third array.
* The main calculation code is in an OpenCL kernel which is executed on a GPU device.
* \return The exit code of the application, non-zero if a problem occurred.
*/
int main(void)
{
cl_context context = 0;
cl_command_queue commandQueue = 0;
cl_program program = 0;
cl_device_id device = 0;
cl_kernel kernel = 0;
int numberOfMemoryObjects = 3;
cl_mem memoryObjects[3] = {0, 0, 0};
cl_int errorNumber;
if (!createContext(&context))
{
cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjects, numberOfMemoryObjects);
cerr << "Failed to create an OpenCL context. " << __FILE__ << ":"<< __LINE__ << endl;
return 1;
}
if (!createCommandQueue(context, &commandQueue, &device))
{
cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjects, numberOfMemoryObjects);
cerr << "Failed to create the OpenCL command queue. " << __FILE__ << ":"<< __LINE__ << endl;
return 1;
}
if (!createProgram(context, device, "assets/hello_world_opencl.cl", &program))
{
cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjects, numberOfMemoryObjects);
cerr << "Failed to create OpenCL program." << __FILE__ << ":"<< __LINE__ << endl;
return 1;
}
kernel = clCreateKernel(program, "hello_world_opencl", &errorNumber);
if (!checkSuccess(errorNumber))
{
cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjects, numberOfMemoryObjects);
cerr << "Failed to create OpenCL kernel. " << __FILE__ << ":"<< __LINE__ << endl;
return 1;
}
/* [Setup memory] */
/* Number of elements in the arrays of input and output data. */
cl_int arraySize = 1000000;
/* The buffers are the size of the arrays. */
size_t bufferSize = arraySize * sizeof(cl_int);
/*
* Ask the OpenCL implementation to allocate buffers for the data.
* We ask the OpenCL implemenation to allocate memory rather than allocating
* it on the CPU to avoid having to copy the data later.
* The read/write flags relate to accesses to the memory from within the kernel.
*/
bool createMemoryObjectsSuccess = true;
memoryObjects[0] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, bufferSize, NULL, &errorNumber);
createMemoryObjectsSuccess &= checkSuccess(errorNumber);
memoryObjects[1] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, bufferSize, NULL, &errorNumber);
createMemoryObjectsSuccess &= checkSuccess(errorNumber);
memoryObjects[2] = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, bufferSize, NULL, &errorNumber);
createMemoryObjectsSuccess &= checkSuccess(errorNumber);
if (!createMemoryObjectsSuccess)
{
cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjects, numberOfMemoryObjects);
cerr << "Failed to create OpenCL buffer. " << __FILE__ << ":"<< __LINE__ << endl;
return 1;
}
/* [Setup memory] */
/* [Map the buffers to pointers] */
/* Map the memory buffers created by the OpenCL implementation to pointers so we can access them on the CPU. */
bool mapMemoryObjectsSuccess = true;
cl_int* inputA = (cl_int*)clEnqueueMapBuffer(commandQueue, memoryObjects[0], CL_TRUE, CL_MAP_WRITE, 0, bufferSize, 0, NULL, NULL, &errorNumber);
mapMemoryObjectsSuccess &= checkSuccess(errorNumber);
cl_int* inputB = (cl_int*)clEnqueueMapBuffer(commandQueue, memoryObjects[1], CL_TRUE, CL_MAP_WRITE, 0, bufferSize, 0, NULL, NULL, &errorNumber);
mapMemoryObjectsSuccess &= checkSuccess(errorNumber);
if (!mapMemoryObjectsSuccess)
{
cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjects, numberOfMemoryObjects);
cerr << "Failed to map buffer. " << __FILE__ << ":"<< __LINE__ << endl;
return 1;
}
/* [Map the buffers to pointers] */
/* [Initialize the input data] */
for (int i = 0; i < arraySize; i++)
{
inputA[i] = i;
inputB[i] = i;
}
/* [Initialize the input data] */
/* [Un-map the buffers] */
/*
* Unmap the memory objects as we have finished using them from the CPU side.
* We unmap the memory because otherwise:
* - reads and writes to that memory from inside a kernel on the OpenCL side are undefined.
* - the OpenCL implementation cannot free the memory when it is finished.
*/
if (!checkSuccess(clEnqueueUnmapMemObject(commandQueue, memoryObjects[0], inputA, 0, NULL, NULL)))
{
cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjects, numberOfMemoryObjects);
cerr << "Unmapping memory objects failed " << __FILE__ << ":"<< __LINE__ << endl;
return 1;
}
if (!checkSuccess(clEnqueueUnmapMemObject(commandQueue, memoryObjects[1], inputB, 0, NULL, NULL)))
{
cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjects, numberOfMemoryObjects);
cerr << "Unmapping memory objects failed " << __FILE__ << ":"<< __LINE__ << endl;
return 1;
}
/* [Un-map the buffers] */
/* [Set the kernel arguments] */
bool setKernelArgumentsSuccess = true;
setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, 0, sizeof(cl_mem), &memoryObjects[0]));
setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, 1, sizeof(cl_mem), &memoryObjects[1]));
setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, 2, sizeof(cl_mem), &memoryObjects[2]));
if (!setKernelArgumentsSuccess)
{
cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjects, numberOfMemoryObjects);
cerr << "Failed setting OpenCL kernel arguments. " << __FILE__ << ":"<< __LINE__ << endl;
return 1;
}
/* [Set the kernel arguments] */
/* An event to associate with the Kernel. Allows us to retrieve profiling information later. */
cl_event event = 0;
/* [Global work size] */
/*
* Each instance of our OpenCL kernel operates on a single element of each array so the number of
* instances needed is the number of elements in the array.
*/
size_t globalWorksize[1] = {arraySize};
/* Enqueue the kernel */
if (!checkSuccess(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, globalWorksize, NULL, 0, NULL, &event)))
{
cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjects, numberOfMemoryObjects);
cerr << "Failed enqueuing the kernel. " << __FILE__ << ":"<< __LINE__ << endl;
return 1;
}
/* [Global work size] */
/* Wait for kernel execution completion. */
if (!checkSuccess(clFinish(commandQueue)))
{
cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjects, numberOfMemoryObjects);
cerr << "Failed waiting for kernel execution to finish. " << __FILE__ << ":"<< __LINE__ << endl;
return 1;
}
/* Print the profiling information for the event. */
printProfilingInfo(event);
/* Release the event object. */
if (!checkSuccess(clReleaseEvent(event)))
{
cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjects, numberOfMemoryObjects);
cerr << "Failed releasing the event object. " << __FILE__ << ":"<< __LINE__ << endl;
return 1;
}
/* Get a pointer to the output data. */
cl_int* output = (cl_int*)clEnqueueMapBuffer(commandQueue, memoryObjects[2], CL_TRUE, CL_MAP_READ, 0, bufferSize, 0, NULL, NULL, &errorNumber);
if (!checkSuccess(errorNumber))
{
cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjects, numberOfMemoryObjects);
cerr << "Failed to map buffer. " << __FILE__ << ":"<< __LINE__ << endl;
return 1;
}
/* [Output the results] */
/* Uncomment the following block to print results. */
/*
for (int i = 0; i < arraySize; i++)
{
cout << "i = " << i << ", output = " << output[i] << "\n";
}
*/
/* [Output the results] */
/* Unmap the memory object as we are finished using them from the CPU side. */
if (!checkSuccess(clEnqueueUnmapMemObject(commandQueue, memoryObjects[2], output, 0, NULL, NULL)))
{
cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjects, numberOfMemoryObjects);
cerr << "Unmapping memory objects failed " << __FILE__ << ":"<< __LINE__ << endl;
return 1;
}
/* Release OpenCL objects. */
cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjects, numberOfMemoryObjects);
}

4 向量化你的OpenCL代码

4.1 向量基础

　　OpenCL设备可以通告它们为不同数据类型的首选向量宽度，你可以使用这个信息来选择一个内核。结果是，相当于该内核为你正在运行的平台做了优化。例如，一个设备可能仅有标量整数的硬件支持，而另一个设备则有宽度为4的整数向量的硬件支持。可以写两个版本的内核，一个用于标量，一个用于向量，在运行时选择正确的版本。

　　这里是一个在特定设备上询问首选整数向量宽度的例子。

[cpp] view plaincopyprint?

/*
* Query the device to find out it's prefered integer vector width.
* Although we are only printing the value here, it can be used to select between
* different versions of a kernel.
*/
cl_uint integerVectorWidth;
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), &integerVectorWidth, NULL);
cout << "Prefered vector width for integers: " << integerVectorWidth << endl;

对于其它OpenCL数据类型也是一样的。

　　每一个Mali T600系列GPU核最少有两个128位宽度的ALU(算术逻辑单元)，它们具有矢量计算能力。ALU中的绝大多数操作(例如，浮点加，浮点乘，整数加，整数乘)，可以以128位向量数据操作(例如，char16, short8, int4, float4)。使用前面讲述的询问方法来为你的数据类型决定使用正确的向量大小。

　　当使用Mali T600系列GPU时，我们推荐在任何可能的地方使用向量。

4.2 向量化代码

　　首先，修改内核代码以支持向量运算。对于Mali T600系列GPU来说，一次向量加法(例如int4相加)所需的时间与一次标量整数加法是一样的。具体代码解读，见下面代码中的注释部分。

[cpp] view plaincopyprint?

/**
 * \brief Integer addition kernel that processes four elements per instance.
 * \param[in] inputA First input array.
 * \param[in] inputB Second input array.
 * \param[out] output Output array; receives the element-wise sums.
 */
__kernel void hello_world_vector(__global int* restrict inputA,
                                 __global int* restrict inputB,
                                 __global int* restrict output)
{
    /*
     * The global work size (n) is a quarter of the one used by the hello_world_opencl sample,
     * so the ID obtained here lies in the range [0, (n / 4) - 1].
     */
    const int vectorIndex = get_global_id(0);

    /*
     * vloadN(i, p) starts reading at p + i * N, so the offset follows implicitly from the
     * vector width: these loads fetch the 4 integers beginning at inputA + vectorIndex * 4
     * and inputB + vectorIndex * 4 respectively.
     */
    const int4 lhs = vload4(vectorIndex, inputA);
    const int4 rhs = vload4(vectorIndex, inputB);

    /* One vector addition yields four sums. */
    const int4 sum = lhs + rhs;

    /* Write the four results starting at output + vectorIndex * 4. */
    vstore4(sum, vectorIndex, output);
}

　　由于现在每个内核实例能够实现多个加法运算，所以必须减少内核实例的数量，在宿主机代码中的修改部分如下所示。

[cpp] view plaincopyprint?

/*
 * Each instance of our OpenCL kernel now operates on 4 elements of each array so the number of
 * instances needed is the number of elements in the array divided by 4.
 * NOTE: the integer division discards any remainder, so arraySize must be a multiple of 4
 * (it is here: 1000000); otherwise the last (arraySize % 4) elements would not be processed.
 * The quotient is a signed cl_int, so it is converted explicitly: an implicit signed-to-unsigned
 * conversion inside a braced initializer is a narrowing conversion, which C++11 rejects.
 */
size_t globalWorksize[1] = {static_cast<size_t>(arraySize / 4)};
/* Enqueue the kernel */
if (!checkSuccess(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, globalWorksize, NULL, 0, NULL, &event)))
{
    cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjects, numberOfMemoryObjects);
    cerr << "Failed enqueuing the kernel. " << __FILE__ << ":"<< __LINE__ << endl;
    return 1;
}