Platform(平台):主機加上OpenCL架構管理下的若干裝置構成了這個平台,通過這個平台,應用程式可以與裝置共享資源并在裝置上執行kernel。
平台通過cl_platform_id來表現,可以使用下面的代碼來初始化平台
// NOTE(review): oclGetPlatformID is not a core OpenCL entry point; it looks like a helper
// from the NVIDIA SDK utilities (the standard call is clGetPlatformIDs) — confirm.
cl_int oclGetPlatformID (cl_platform_id *platforms); // Out: pointer to the platform object that receives the result
Device(裝置):通過cl_device_id來表現
// Obtains the devices available on a platform.
cl_int clGetDeviceIDs (cl_platform_id platform,
cl_device_type device_type, // Bitfield identifying the type. For the GPU we use CL_DEVICE_TYPE_GPU
cl_uint num_entries, // Capacity of the devices array (how many IDs may be written); 1 in the example below
cl_device_id *devices, // Out: array that receives the device IDs
cl_uint *num_devices // Out: number of devices matching device_type (the example below passes NULL)
);
Context(上下文):定義了整個OpenCL環境,包括OpenCL kernel、裝置、記憶體管理、指令隊列等。上下文使用cl_context來表現
// Creates an OpenCL context for the given devices.
cl_context clCreateContext(const cl_context_properties *properties, // Zero-terminated list of property name/value pairs, or NULL/0 (see specification)
cl_uint num_devices, // Number of entries in devices
const cl_device_id *devices, // Pointer to the devices object
void (*pfn_notify)(const char *errinfo, const void *private_info, size_t cb, void *user_data), // Optional error-report callback; NULL in the example below
void *user_data, // Passed through to pfn_notify; NULL in the example below
cl_int *errcode_ret // Out: error code result
);
Command-Queue(指令隊列):就像它的名字一樣,它是一個存儲需要在裝置上執行的OpenCL指令的隊列。
指令隊列建立在一個上下文中的指定裝置上,多個指令隊列允許應用程式在不需要同步的情況下執行多條無關聯的指令
// Creates a command queue on one device of a context.
// NOTE(review): OpenCL 2.0 deprecates this call in favour of clCreateCommandQueueWithProperties — confirm target version.
cl_command_queue clCreateCommandQueue (
cl_context context, // Context the queue is created in
cl_device_id device, // Device the queue is created on
cl_command_queue_properties properties, // Bitfield of queue properties; 0 in the example below
cl_int *errcode_ret // Out: error code result
);
下面是一個簡單的示例
cl_int error = 0; // Used to handle error codes
cl_platform_id platform;
cl_context context;
cl_command_queue queue;
cl_device_id device;
// Platform
error = oclGetPlatformID(&platform);
if (error != CL_SUCCESS) {
cout << "Error getting platform id: " << errorMessage(error) << endl;
exit(error);
}
// Device
error = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
if (err != CL_SUCCESS) {
cout << "Error getting device ids: " << errorMessage(error) << endl;
exit(error);
}
// Context
context = clCreateContext(0, 1, &device, NULL, NULL, &error);
if (error != CL_SUCCESS) {
cout << "Error creating context: " << errorMessage(error) << endl;
exit(error);
}
// Command-queue
queue = clCreateCommandQueue(context, device, 0, &error);
if (error != CL_SUCCESS) {
cout << "Error creating command queue: " << errorMessage(error) << endl;
exit(error);
}
在裝置上配置記憶體,我們需要使用cl_mem類型
// Creates a buffer object in the given context.
cl_mem clCreateBuffer (cl_context context, // The context where the memory will be allocated
cl_mem_flags flags, // Bitfield of CL_MEM_* options (listed below)
size_t size, // The size in bytes
void *host_ptr, // Host data used together with the *_HOST_PTR flags; NULL when none is supplied
cl_int *errcode_ret // Out: error code result
);
flags是位元欄位(bitfield),可用按位或(|)組合多個選項,選項如下:
CL_MEM_READ_WRITE
CL_MEM_WRITE_ONLY
CL_MEM_READ_ONLY
CL_MEM_USE_HOST_PTR
CL_MEM_ALLOC_HOST_PTR
CL_MEM_COPY_HOST_PTR – 從 host_ptr處拷貝資料
下面是一個簡單的記憶體配置示例,配置3塊記憶體,兩份分别存a,b向量,一份存相加後的結果向量,這裡的變量名後的_d表示記憶體是配置在裝置上的
const int mem_size = sizeof(float)*size;
// Allocates a buffer of size mem_size and copies mem_size bytes from src_a_h
cl_mem src_a_d = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size, src_a_h, &error);
cl_mem src_b_d = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size, src_b_h, &error);
cl_mem res_d = clCreateBuffer(context, CL_MEM_WRITE_ONLY, mem_size, NULL, &error);
Kernel:kernel本質上是一個我們可以從主機上調用的,運作在裝置上的函數.所有運作在裝置上的代碼,包括kernel和kernel調用的其他的函數,
都是在運作的時候編譯的。這涉及到下一個概念,Program。
Program:OpenCL Program由kernel函數、其他函數和聲明組成。它通過cl_program表示。當建立一個program時,你必須指定它是由哪些檔案組成的,然後編譯它。
建立program
// Creates a program object from source text.
cl_program clCreateProgramWithSource (
cl_context context, // Context the program is created in
cl_uint count, // Number of files (source strings)
const char **strings, // Array of strings, each string holding one file
const size_t *lengths, // Array giving the length of each file
cl_int *errcode_ret // Out: returned error code
);
編譯program
// Compiles (builds) a program.
cl_int clBuildProgram (cl_program program,
cl_uint num_devices, // Number of devices
const cl_device_id *device_list, // List of device IDs
const char *options, // Build options
void (*pfn_notify)(cl_program, void *user_data), // Optional callback; NULL in the example below
void *user_data // Passed through to pfn_notify; NULL in the example below
);
檢視編譯log,必須使用下面的函數
// Queries build information (e.g. the build log) for a program on one device.
cl_int clGetProgramBuildInfo (cl_program program,
cl_device_id device,
cl_program_build_info param_name, // The parameter we want to know about
size_t param_value_size, // Size of the parameter value buffer
void *param_value, // Out: the result
size_t *param_value_size_ret // Out: size of the result; the example below uses it to size the log buffer
);
提取program的入口點,使用cl_kernel
// Extracts a kernel (an entry point) from a program.
cl_kernel clCreateKernel (
cl_program program, // The program where the kernel is
const char *kernel_name, // The name of the kernel, i.e. the name of the kernel function as it's declared in the code
cl_int *errcode_ret // Out: error code result
);
下面是一個簡單的示例
// Creates the program
// Uses NVIDIA helper functions to get the code string and it's size (in bytes)
size_t src_size = 0;
const char* path = shrFindFilePath("vector_add_gpu.cl", NULL);
const char* source = oclLoadProgSource(path, "", &src_size);
//建立program
cl_program program = clCreateProgramWithSource(context, 1, &source, &src_size, &error);
assert(error == CL_SUCCESS);
//編譯program
error = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
assert(error == CL_SUCCESS);
//顯示log
char* build_log;
size_t log_size;
// 第一次調用獲得恰當的日志大小
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
build_log = new char[log_size+1];
// 第二次調用獲得日志的内容
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL);
build_log[log_size] = '\0';
cout << build_log << endl;
delete[] build_log;
//提取kernel
//下面的 "vector_add_gpu"對應的就是cl檔案裡面的函數 vector_add_gpu(...)函數
cl_kernel vector_add_kernel = clCreateKernel(program, "vector_add_gpu", &error);
assert(error == CL_SUCCESS);
運作kernel
一旦我們的kernel建立好,我們就可以運作它。
首先,我們必須設定kernel的參數
// Sets the value of one kernel argument; call it once per argument.
cl_int clSetKernelArg (
cl_kernel kernel, // Which kernel
cl_uint arg_index, // Which argument: 0, 1, 2, 3 select the 1st, 2nd, 3rd, 4th argument
size_t arg_size, // Size of the next argument (not of the value pointed by it!)
const void *arg_value // Value
);
每個參數都需要調用一次這個函數。
當所有參數設定完畢,我們就可以調用這個kernel
// Enqueues a kernel for execution over an index range of work-items.
cl_int clEnqueueNDRangeKernel(
cl_command_queue command_queue,
cl_kernel kernel,
cl_uint work_dim, // Choose if we are using 1D, 2D or 3D work-items and work-groups
const size_t *global_work_offset, // NULL in the example below
const size_t *global_work_size, // The total number of work-items (must have work_dim dimensions)
const size_t *local_work_size, // The number of work-items per work-group (must have work_dim dimensions)
cl_uint num_events_in_wait_list, // Number of entries in event_wait_list; 0 in the example below
const cl_event *event_wait_list, // NULL in the example below
cl_event *event); // NULL in the example below
// Enqueuing parameters
// Note that we inform the size of the cl_mem object, not the size of the memory pointed by it
error = clSetKernelArg(vector_add_k, 0, sizeof(cl_mem), &src_a_d);
error |= clSetKernelArg(vector_add_k, 1, sizeof(cl_mem), &src_b_d);
error |= clSetKernelArg(vector_add_k, 2, sizeof(cl_mem), &res_d);
error |= clSetKernelArg(vector_add_k, 3, sizeof(size_t), &size);
assert(error == CL_SUCCESS);
// Launching kernel
const size_t local_ws = 512; // Number of work-items per work-group
// shrRoundUp returns the smallest multiple of local_ws bigger than size
const size_t global_ws = shrRoundUp(local_ws, size); // Total number of work-items
error = clEnqueueNDRangeKernel(queue, vector_add_k, 1, NULL, &global_ws, &local_ws, 0, NULL, NULL);
assert(error == CL_SUCCESS);
讀取結果
讀取結果非常簡單。與之前講到的寫入記憶體(裝置記憶體)的操作相似,現在我們需要存入隊列一個讀取緩沖區的操作
// Enqueues a command that reads from a buffer object into host memory.
cl_int clEnqueueReadBuffer (
cl_command_queue command_queue,
cl_mem buffer, // Which buffer to read from
cl_bool blocking_read, // Whether the read blocks
size_t offset, // Offset from the start of the buffer
size_t cb, // Number of bytes to read
void *ptr, // Pointer to host memory
cl_uint num_events_in_wait_list, // 0 in the example below
const cl_event *event_wait_list, // NULL in the example below
cl_event *event); // NULL in the example below
使用示例
float* check = new float[size];
clEnqueueReadBuffer(queue, res_d, CL_TRUE, 0, mem_size, check, 0, NULL, NULL);
清理記憶體
使用clCreate申請的對象(緩沖區、kernel、program、隊列、上下文)必須使用對應的clRelease釋放
// Host-side arrays
delete[] src_a_h;
delete[] src_b_h;
delete[] res_h;
delete[] check;

// Device-side objects: everything obtained from a clCreate* call is released,
// with the context last because the other objects were created in it.
clReleaseMemObject(src_a_d);
clReleaseMemObject(src_b_d);
clReleaseMemObject(res_d);
clReleaseKernel(vector_add_k);
clReleaseProgram(program); // was never released in the original
clReleaseCommandQueue(queue);
clReleaseContext(context);