I opened the samples that ship with AMD-APP-SDK 3.0, expecting them to run out of the box, but they didn't. Take the very first one, AdvancedConvolution: it depends on the AMD SDK utility headers and libraries, which were nowhere to be found. I eventually tracked them down.
I then reworked the project along the lines of the sample into the layout below, which I find easier to read:
1. The advancedNonSeparableConvolution kernel:
main.cpp:
#include <CL/cl.h>
#include <iostream>
#include <fstream>
#include <string>
#include <iterator>
#include <cstdio>
#include <cstring>
#include <cstdlib>
#include <alloca.h>
#include "SDKBitMap.hpp"
#include "FilterCoeff.h"
#include "oclUtils.h"
#include "shrQATest.h"
using namespace std;
using namespace streamsdk;
int main()
{
// host-side data setup...
cl_uint filterSize=3, filterType=0,useLDSPass1=0;
cl_uint filterRadius,height,width,paddedHeight,paddedWidth;
cl_uchar4 *inputImage2D,*paddedInputImage2D,*nonSepOutputImage2D,*sepOutputImage2D,*nonSepVerificationOutput,*sepVerificationOutput;
cl_float *mask,*rowFilter,*colFilter;
size_t localThreads[2],globalThreads[2];
// Check whether isLds is zero or one
if(useLDSPass1 != 0 && useLDSPass1 != 1)
{
std::cout << "isLds should be either 0 or 1" << std::endl;
return -1;
}
// initialization of mask
if(filterSize != 3 && filterSize != 5)
{
std::cout << "Filter Size should be either 3 or 5" << std::endl;
return -1;
}
if (filterType !=0 && filterType != 1 && filterType !=2)
{
std::cout << "Filter Type can only be 0, 1 or 2 for Sobel, Box and Gaussian filters respectively." << std::endl;
return -1;
}
switch (filterType)
{
case 0: /* Sobel Filter */
if(filterSize == 3)
{
mask = SOBEL_FILTER_3x3;
rowFilter = SOBEL_FILTER_3x3_pass1;
colFilter = SOBEL_FILTER_3x3_pass2;
}
else
{
mask = SOBEL_FILTER_5x5;
rowFilter = SOBEL_FILTER_5x5_pass1;
colFilter = SOBEL_FILTER_5x5_pass2;
}
break;
case 1: /* Box Filter */
if(filterSize == 3)
{
mask = BOX_FILTER_3x3;
rowFilter = BOX_FILTER_3x3_pass1;
colFilter = BOX_FILTER_3x3_pass2;
}
else
{
mask = BOX_FILTER_5x5;
rowFilter = BOX_FILTER_5x5_pass1;
colFilter = BOX_FILTER_5x5_pass2;
}
break;
case 2: /* Gaussian Filter */
if(filterSize == 3)
{
mask = GAUSSIAN_FILTER_3x3;
rowFilter = GAUSSIAN_FILTER_3x3_pass1;
colFilter = GAUSSIAN_FILTER_3x3_pass2;
}
else
{
mask = GAUSSIAN_FILTER_5x5;
rowFilter = GAUSSIAN_FILTER_5x5_pass1;
colFilter = GAUSSIAN_FILTER_5x5_pass2;
}
break;
}
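// (Note, my own reading: each *_pass1 / *_pass2 pair in FilterCoeff.h is presumably the
//  1D row/column factorization of the corresponding 2D mask; the 2D mask feeds the
//  non-separable kernel, the two 1D vectors feed the separable kernel further below.)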
// load input bitmap image
SDKBitMap inputBitmap;
char imgName[]={"/home/jumper/OpenCL_projects/AMD-Sample-AdvancedConvolution/AdvancedConvolution_Input.bmp"};
inputBitmap.load(imgName);
// error if image did not load
if(!inputBitmap.isLoaded())
{
std::cout << "Failed to load input image!";
return SDK_FAILURE;
}
// get width and height of input image
height = inputBitmap.getHeight();
width = inputBitmap.getWidth();
// allocate memory for input image data to host
inputImage2D = (cl_uchar4*)malloc(width * height * sizeof(cl_uchar4));
CHECK_ALLOCATION(inputImage2D,"Failed to allocate memory! (inputImage2D)");
// get the pointer to pixel data
uchar4 *pixelData = inputBitmap.getPixels();
if(pixelData == NULL)
{
std::cout << "Failed to read pixel Data!";
return SDK_FAILURE;
}
// Copy pixel data into inputImageData2D
cl_uint pixelSize=sizeof(uchar4);
memcpy(inputImage2D, pixelData, width * height * pixelSize);
// allocate and initalize memory for padded input image data to host
filterRadius = filterSize - 1;
paddedHeight = height + filterRadius;
paddedWidth = width + filterRadius;
paddedInputImage2D = (cl_uchar4*)malloc(paddedWidth * paddedHeight * sizeof(cl_uchar4));
CHECK_ALLOCATION(paddedInputImage2D,"Failed to allocate memory! (paddedInputImage2D)");
memset(paddedInputImage2D, 0, paddedHeight*paddedWidth*sizeof(cl_uchar4));
for(cl_uint i = filterRadius; i < height + filterRadius; i++)
{
for(cl_uint j = filterRadius; j < width + filterRadius; j++)
{
paddedInputImage2D[i * paddedWidth + j] = inputImage2D[(i - filterRadius) * width + (j - filterRadius)];
}
}
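// (The image now sits at an offset of filterRadius pixels from the top-left of the padded
//  buffer; the rest of the padded buffer, i.e. the top and left border, stays zero from the
//  memset above.)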
// allocate memory for output image data for Non-Separable Filter to host
nonSepOutputImage2D = (cl_uchar4*)malloc(width * height * sizeof(cl_uchar4));
CHECK_ALLOCATION(nonSepOutputImage2D,"Failed to allocate memory! (nonSepOutputImage2D)");
memset(nonSepOutputImage2D, 0, width * height * pixelSize);
// allocate memory for output image data for Separable Filter to host
sepOutputImage2D = (cl_uchar4*)malloc(width * height * sizeof(cl_uchar4));
CHECK_ALLOCATION(sepOutputImage2D,"Failed to allocate memory! (sepOutputImage2D)");
memset(sepOutputImage2D, 0, width * height * pixelSize);
// allocate memory for verification output
nonSepVerificationOutput = (cl_uchar4*)malloc(width * height * pixelSize);
CHECK_ALLOCATION(nonSepVerificationOutput,"Failed to allocate memory! (verificationOutput)");
sepVerificationOutput = (cl_uchar4*)malloc(width * height * pixelSize);
CHECK_ALLOCATION(sepVerificationOutput,"Failed to allocate memory! (verificationOutput)");
memset(nonSepVerificationOutput, 0, width * height * pixelSize);
memset(sepVerificationOutput, 0, width * height * pixelSize);
size_t blockSizeX=16,blockSizeY=16;
localThreads[0] = blockSizeX;
localThreads[1] = blockSizeY;
// set global work-group size, padding work-items do not need to be considered
globalThreads[0] = (width + localThreads[0] - 1) / localThreads[0]; // round up to a whole number of work-groups
globalThreads[0] *= localThreads[0];
globalThreads[1] = (height + localThreads[1] - 1) / localThreads[1];
globalThreads[1] *= localThreads[1];
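// (With width = height = 512 and 16x16 work-groups this rounds to exactly 512; for an image
//  size that is not a multiple of 16, say 500, it would round up to 512, which is why the
//  kernel begins with an out-of-bounds guard on col/row.)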
//set up OpenCL...
cl_uint platformNum;
cl_int status;
status=clGetPlatformIDs(0,NULL,&platformNum);
if(status!=CL_SUCCESS){
printf("cannot get platforms number.\n");
return -1;
}
cl_platform_id* platforms;
platforms=(cl_platform_id*)alloca(sizeof(cl_platform_id)*platformNum);
status=clGetPlatformIDs(platformNum,platforms,NULL);
if(status!=CL_SUCCESS){
printf("cannot get platforms addresses.\n");
return -1;
}
cl_platform_id platformInUse=platforms[0];
cl_device_id device;
status=clGetDeviceIDs(platformInUse,CL_DEVICE_TYPE_DEFAULT,1,&device,NULL);
cl_context context=clCreateContext(NULL,1,&device,NULL,NULL,&status);
cl_command_queue queue=clCreateCommandQueue(context,device,CL_QUEUE_PROFILING_ENABLE, &status);
std::ifstream srcFile("/home/jumper/OpenCL_projects/AMD-Sample-AdvancedConvolution/convolution.cl");
std::string srcProg(std::istreambuf_iterator<char>(srcFile),(std::istreambuf_iterator<char>()));
const char * src = srcProg.c_str();
size_t length = srcProg.length();
cl_program program=clCreateProgramWithSource(context,1,&src,&length,&status);
status=clBuildProgram(program,1,&device,NULL,NULL,NULL);
if (status != CL_SUCCESS)
{
cout<<"error:clBuildProgram()..."<<endl;
shrLogEx(LOGBOTH | ERRORMSG, status, STDERROR);
oclLogBuildInfo(program, oclGetFirstDev(context));
oclLogPtx(program, oclGetFirstDev(context), "oclfluore.ptx");
return(EXIT_FAILURE);
}
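// (Alternative, my own sketch: the build log can also be fetched portably with
//  clGetProgramBuildInfo instead of the NVIDIA-SDK helpers used above, e.g.
//      size_t logSize = 0;
//      clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
//      std::vector<char> log(logSize);
//      clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, logSize, log.data(), NULL);
//      std::cout << log.data() << std::endl;
//  which needs <vector> but no extra SDK.)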
cl_kernel nonSeparablekernel = clCreateKernel(program, "advancedNonSeparableConvolution", &status);
CHECK_OPENCL_ERROR(status, "clCreateKernel failed (advancedNonSeparableConvolution).");
///Prepare needed buffers...
// create the 5 device buffers
cl_mem inputBuffer = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,pixelSize * paddedWidth * paddedHeight,paddedInputImage2D,&status);
cl_mem outputBuffer = clCreateBuffer(context,CL_MEM_WRITE_ONLY,pixelSize * width * height,NULL,&status);
cl_mem maskBuffer = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,sizeof(cl_float ) * filterSize * filterSize,mask,&status);
cl_mem rowFilterBuffer = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,sizeof(cl_float ) * filterSize,rowFilter,&status);
cl_mem colFilterBuffer = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,sizeof(cl_float ) * filterSize,colFilter,&status);
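// (Note: CL_MEM_USE_HOST_PTR lets the runtime keep using the host allocations as backing
//  storage, so paddedInputImage2D, mask, rowFilter and colFilter must remain valid until the
//  corresponding buffers are released.)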
///launch the non-Separate kernel...
cl_event event0;
// Set appropriate arguments to the kernel
status = clSetKernelArg(nonSeparablekernel, 0,sizeof(cl_mem),(void *)&inputBuffer);
CHECK_OPENCL_ERROR( status, "clSetKernelArg failed. (inputBuffer)");
status = clSetKernelArg(nonSeparablekernel, 1, sizeof(cl_mem),(void *)&maskBuffer);
CHECK_OPENCL_ERROR( status, "clSetKernelArg failed. (maskBuffer)");
status = clSetKernelArg(nonSeparablekernel, 2,sizeof(cl_mem),(void *)&outputBuffer);
CHECK_OPENCL_ERROR( status, "clSetKernelArg failed. (outputBuffer)");
status = clSetKernelArg(nonSeparablekernel,3,sizeof(cl_uint),(void *)&width);
CHECK_OPENCL_ERROR( status, "clSetKernelArg failed. (width)");
status = clSetKernelArg(nonSeparablekernel,4, sizeof(cl_uint),(void *)&height);
CHECK_OPENCL_ERROR( status, "clSetKernelArg failed. (height)");
status = clSetKernelArg(nonSeparablekernel,5,sizeof(cl_uint),(void *)&paddedWidth);
CHECK_OPENCL_ERROR( status, "clSetKernelArg failed. (paddedWidth)");
// Enqueue a kernel run call.
status = clEnqueueNDRangeKernel(queue,nonSeparablekernel,2,NULL,globalThreads,localThreads,0, NULL,&event0);
CHECK_OPENCL_ERROR( status, "clEnqueueNDRangeKernel failed.");
status = clFlush(queue);
CHECK_OPENCL_ERROR(status,"clFlush() failed");
status = clWaitForEvents(1,&event0);
CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(events[0]) Failed");
clReleaseEvent(event0);
status = clEnqueueReadBuffer(queue,outputBuffer,CL_TRUE,0,width * height * pixelSize,nonSepOutputImage2D,0,NULL,NULL);
CHECK_OPENCL_ERROR( status, "clEnqueueReadBuffer(nonSepOutputImage2D) failed.");
memcpy(pixelData, nonSepOutputImage2D, width * height * pixelSize);
// write the output bmp file
if(!inputBitmap.write("NonSeparableOutputImage.bmp"))
{
std::cout << "Failed to write output image!";
return SDK_FAILURE;
}
// clean up all OpenCL resources...
if (nonSeparablekernel != NULL)
{
status = clReleaseKernel(nonSeparablekernel);
CHECK_OPENCL_ERROR(status, "clReleaseKernel failed.(nonSeparablekernel)");
}
if (program)
{
status = clReleaseProgram(program);
CHECK_OPENCL_ERROR(status, "clReleaseProgram failed.(program)");
}
if (inputBuffer)
{
status = clReleaseMemObject(inputBuffer);
CHECK_OPENCL_ERROR(status, "clReleaseMemObject failed.(inputBuffer)");
}
if (outputBuffer)
{
status = clReleaseMemObject(outputBuffer);
CHECK_OPENCL_ERROR(status, "clReleaseMemObject failed.(outputBuffer)");
}
if (maskBuffer)
{
status = clReleaseMemObject(maskBuffer);
CHECK_OPENCL_ERROR(status, "clReleaseMemObject failed.(maskBuffer)");
}
if (rowFilterBuffer)
{
status = clReleaseMemObject(rowFilterBuffer);
CHECK_OPENCL_ERROR(status, "clReleaseMemObject failed.(rowFilterBuffer)");
}
if(colFilterBuffer)
{
status = clReleaseMemObject(colFilterBuffer);
CHECK_OPENCL_ERROR(status, "clReleaseMemObject failed.(colFilterBuffer)");
}
if (queue)
{
status = clReleaseCommandQueue(queue);
CHECK_OPENCL_ERROR(status, "clReleaseCommandQueue failed.(commandQueue)");
}
if (context)
{
status = clReleaseContext(context);
CHECK_OPENCL_ERROR(status, "clReleaseContext failed.(context)");
}
// release program resources (input memory etc.)
FREE(inputImage2D);
FREE(paddedInputImage2D);
FREE(nonSepOutputImage2D);
FREE(sepOutputImage2D);
FREE(nonSepVerificationOutput);
FREE(sepVerificationOutput);
return 0;
}
The corresponding kernel (convolution.cl):
#define FILTERSIZE 3
__kernel void advancedNonSeparableConvolution(
__global uchar4 *input,
__global float *mask,
__global uchar4 *output,
uint nWidth,
uint nHeight,
uint nExWidth)
{
int col = get_global_id(0);
int row = get_global_id(1);
if (col >= nWidth || row >= nHeight) return;
int lid_x = get_local_id(0);
int lid_y = get_local_id(1);
int start_col, start_row;
int cnt = 0;
#if 1
//#USE_LDS == 1
__local uchar4 local_input[(16 + 3 - 1) * (16 + 3 - 1)];
int tile_xres = (16 + 3 - 1);
int tile_yres = (16 + 3 - 1);
start_col = get_group_id(0) * 16; //Image is padded
start_row = get_group_id(1) * 16;
int lid = lid_y * 16 + lid_x;
int gx, gy;
do {
gy = lid / tile_xres;
gx = lid - gy * tile_xres;
local_input[lid] = input[(start_row + gy) * nExWidth + (start_col + gx)];
lid += (16 * 16);
} while (lid < (tile_xres * tile_yres));
barrier(CLK_LOCAL_MEM_FENCE);
start_col = lid_x;
start_row = lid_y;
#else
start_col = col;
start_row = row;
#endif
float4 sum = (float4)0.0f;
int m = 0, n = 0;
#pragma unroll 16
for (int j = start_row; j < start_row + FILTERSIZE; j++,m++)
{
n = 0;
for (int i = start_col; i < start_col + FILTERSIZE; i++,n++)
{
//#if USE_LDS == 1
#if 1
{
sum = mad(convert_float4(local_input[j * tile_xres + i]), (float4)mask[m * FILTERSIZE + n], sum);//a*b+c
}
#else
{
sum = mad(convert_float4(input[(j)*nExWidth + (i)]), (float4)mask[m * FILTERSIZE + n], sum);
}
#endif
}
}
output[row * nWidth + col] = convert_uchar4_sat(sum);
}
The example works like this: the input is a 512x512 color image convolved with a 3x3 window. The host pads the image borders first; the global size is 512x512 and the local size is 16x16. The kernel's design boils down to three steps: (1) each work-group caches the 18x18 image tile at its position into its LDS, i.e. the local array local_input[]; (2) each work-item in the group convolves the mask window centered on its own pixel and accumulates a sum, which becomes that pixel's final value; (3) each work-item writes its result into the output image.
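A small arithmetic check on step (1): the tile holds tile_xres * tile_yres = 18 * 18 = 324 uchar4 values, but a work-group has only 16 * 16 = 256 work-items, so the do/while load loop runs twice per group: every work-item loads one element on the first pass, and work-items 0..67 load the remaining elements (lid 256..323) on the second pass, before everyone meets at the barrier.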
That really is all the kernel does. The clever parts: padding the image borders up front; using the SDKBitMap class, which naturally leads to uchar4/float4 types that save space and read pixels quickly; using LDS as a software cache for speed; giving each work-item exactly one output pixel to convolve; and the smart use of #pragma unroll 16.
If I had designed this myself I probably would not have covered all of these angles. These are techniques I need to learn (a long road ahead for performance tuning).
Kernel time on the device, as reported by CodeXL: 0.9670

2. The advancedSeparableConvolution kernel:
The main.cpp is almost identical to the one above, so I won't repeat it; the few host-side lines that do change are sketched below. What really matters is the idea embedded in the kernel.
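For reference, a minimal sketch (my own, inferred from the kernel signature below and the buffers already created in main.cpp) of the host-side lines that change for the separable version; error checks omitted:
cl_kernel separableKernel = clCreateKernel(program, "advancedSeparableConvolution", &status);
status = clSetKernelArg(separableKernel, 0, sizeof(cl_mem), (void *)&inputBuffer);
status = clSetKernelArg(separableKernel, 1, sizeof(cl_mem), (void *)&rowFilterBuffer);
status = clSetKernelArg(separableKernel, 2, sizeof(cl_mem), (void *)&colFilterBuffer);
status = clSetKernelArg(separableKernel, 3, sizeof(cl_mem), (void *)&outputBuffer);
status = clSetKernelArg(separableKernel, 4, sizeof(cl_uint), (void *)&width);
status = clSetKernelArg(separableKernel, 5, sizeof(cl_uint), (void *)&height);
status = clSetKernelArg(separableKernel, 6, sizeof(cl_uint), (void *)&paddedWidth);
status = clEnqueueNDRangeKernel(queue, separableKernel, 2, NULL, globalThreads, localThreads, 0, NULL, &event0);
status = clEnqueueReadBuffer(queue, outputBuffer, CL_TRUE, 0, width * height * pixelSize, sepOutputImage2D, 0, NULL, NULL);
Everything else (platform, context, program build, padding) stays the same. The kernel itself: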
#define FILTERSIZE 3
__kernel void advancedSeparableConvolution(
__global uchar4 *input,
__global float *row_filter,
__global float *col_filter,
__global uchar4 *output,
uint nWidth,
uint nHeight,
uint nExWidth)
{
__local float4 local_output[16 * (16 + FILTERSIZE - 1)];
int col = get_global_id(0);
int row = get_global_id(1);
if (col >= nWidth || row >= nHeight) return;
int lid_x = get_local_id(0);
int lid_y = get_local_id(1);
int start_col, start_row;
int cnt = 0;
/***************************************************************************************
* If using LDS, get the data to local memory. Else, get the global memory indices ready
***************************************************************************************/
//#if USE_LDS == 1
#if 1
__local uchar4 local_input[(16 + FILTERSIZE - 1) * (16 + FILTERSIZE - 1)];
int tile_xres = (16 + FILTERSIZE - 1);
int tile_yres = (16 + FILTERSIZE - 1);
start_col = get_group_id(0) * 16; //Image is padded
start_row = get_group_id(1) * 16;
int lid = lid_y * 16 + lid_x;
int gx, gy;
/*********************************************************************
* Read input from global buffer and put in local buffer
* Read 256 global memory locations at a time (256 WI).
* Continue in a loop till all pixels in the tile are read.
**********************************************************************/
do {
gy = lid / tile_xres;
gx = lid - gy * tile_xres;
local_input[lid] = input[(start_row + gy) * nExWidth + (start_col + gx)];
lid += (16 * 16);
} while (lid < (tile_xres * tile_yres));
barrier(CLK_LOCAL_MEM_FENCE);
start_col = lid_x;
#else
/************************************************************************
* Non - LDS implementation
* Read pixels directly from global memory
************************************************************************/
start_col = col;
#endif
/***********************************************************************************
* Row-wise convolution - Inputs will be read from local or global memory
************************************************************************************/
float4 sum = (float4)0.0f;
cnt = 0;
#pragma unroll FILTERSIZE
for (int i = start_col; i < start_col + FILTERSIZE; i++)
{
//#if USE_LDS == 1
#if 1
sum = mad(convert_float4(local_input[lid_y * tile_xres + i]), (float4)row_filter[cnt++], sum);
#else
sum = mad(convert_float4(input[row * nExWidth + i]), (float4)row_filter[cnt++], sum);
#endif
}
/***********************************************************************************
* Output is stored in local memory
************************************************************************************/
local_output[lid_y * 16 + lid_x] = sum;
/***************************************************************************************
* Row-wise convolution of pixels in the remaining rows
***************************************************************************************/
if (lid_y < FILTERSIZE - 1)
{
cnt = 0;
sum = (float4)0.0f;
#pragma unroll FILTERSIZE
for (int i = start_col; i < start_col + FILTERSIZE; i++)
{
//#if USE_LDS == 1
#if 1
sum = mad(convert_float4(local_input[(lid_y + 16) * tile_xres + i]), (float4)row_filter[cnt++], sum);
#else
sum = mad(convert_float4(input[(row + 16) * nExWidth + i]), (float4)row_filter[cnt++], sum);
#endif
}
/***********************************************************************************
* Again the output is stored in local memory
************************************************************************************/
local_output[(lid_y + 16) * 16 + lid_x] = sum;
}
/***********************************************************************************
* Wait for all the local WIs to finish row-wise convolution.
************************************************************************************/
barrier(CLK_LOCAL_MEM_FENCE);
/************************************************************************************
* Column-wise convolution - Input is the output of row-wise convolution
* Inputs are always read from local memory.
* The output is written to global memory.
***********************************************************************************/
start_row = lid_y;
sum = (float4)0.0f;
cnt = 0;
#pragma unroll FILTERSIZE
for (int i = start_row; i < start_row + FILTERSIZE; i++)
{
sum = mad(local_output[i * 16 + lid_x], (float4)col_filter[cnt++], sum);
}
/* Save Output */
sum = (sum < 0.0f) ? 0.0f : sum;
output[row * nWidth + col] = convert_uchar4_sat_rte(sum);
}
The idea of this kernel partly overlaps with the first one and partly differs (here the convolution is done with two 1D filters, one horizontal and one vertical, whereas the first kernel convolves with a 2D window): (1) As before, each work-group caches the 18x18 image tile at its position into LDS, the local array local_input[]. From step (2) on, things differ: (2) each work-item convolves the horizontal 1D filter centered on its own pixel and accumulates a sum as that pixel's row-wise result (note the sly use of cnt++ inside the MAD instead of deriving the index from the for loop; a beginner like me would not have thought of it). Because the local size is 16x16, this pass covers only 16 of the 18 tile rows, producing a 16x16 block and leaving the last two rows untouched; after the full row-wise pass the 18x18 tile becomes 18 rows by 16 columns, which is exactly why local_output is declared with 16 x 18 elements. (3) The first two rows of work-items (lid_y < FILTERSIZE - 1) then handle the two remaining tile rows, i.e. 2x16 items process a 2x18 strip into 2x16 results; at that point the row-wise convolution is done. (4) Finally, a column-wise convolution over local_output produces the 16x16 output block, again using cnt++ and #pragma unroll (FILTERSIZE = 3) rather than folding everything into one loop. Really smart!
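Why bother with the separable form at all: if a 2D mask can be written as the outer product of a column vector and a row vector (presumably what the *_pass1/_pass2 coefficient pairs encode), a KxK convolution per pixel drops from K*K multiply-adds to 2*K, i.e. from 9 to 6 for the 3x3 case and from 25 to 10 for 5x5, at the cost of the intermediate local_output buffer and one extra barrier.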
This kernel is even neater than the first one. The clever parts: padding the image borders up front; using the SDKBitMap class, which naturally leads to uchar4/float4 types that save space and read pixels quickly; using LDS as a software cache; having each work-item first do the row-wise convolution for one pixel and then the column-wise convolution; and the smart use of #pragma unroll together with a per-item cnt++ counter.
My impression: the AMD OpenCL SDK samples really are better than the NVIDIA OpenCL SDK samples, and better suited for learning how to actually tune a kernel for performance.
Happy to have figured it out.