本文主要針對與我一樣的CUDA初學者,記錄經過Visual Studio編譯器對CUDA代碼反覆試驗與調整後得到的以下成果。
目前本文主要使用一維數組對CUDA的grid與block的組合進行嘗試,已實作CUDA一維數組的計算,將其記錄如下。
而我也將會在下篇繼續使用多維數組進行CUDA計算。
本文可以幫助大家盡快上手CUDA實作簡單代碼編寫,了解如何利用CUDA 線程實作每個元素處理(若為圖檔則是像素處理)。
本文結構:1.基本原理;2.一維數組代碼;3.實作結果展示。
1.基本原理
原理引用部落格為https://blog.csdn.net/tiao_god/article/details/107181883
CUDA會把線程分為Block和Grid:

為了方便介紹,這裡我們用的都是二維的結構,其實Block和Grid也可以設定成三維的。
blockDim.x:Block的x方向的次元,這裡是5,即每行5個線程。
blockDim.y:Block的y方向的次元,這裡是3,即每列3個線程。
blockIdx.x:Block在x方向的位置,圖中放大的Block是2,即為第2個。
blockIdx.y:Block在y方向的位置,圖中放大的Block是2,即為第2個。
注意blockIdx中的Idx是表示index的縮寫,而不是表示x方向的ID。
在CUDA裡計算線程索引一般都是:
const int X = blockIdx.x * blockDim.x + threadIdx.x;
const int Y = blockIdx.y * blockDim.y + threadIdx.y;
對應圖中放大的區域的Thread(3,1):
計算式:X = 2*5+3 = 13,Y = 2*3+1 = 7
2.一維數組代碼
// 1D element-wise vector add: c = a + b, one element per thread.
// Expected launch: a single block whose thread count equals the array
// length (the caller uses grid(1,1,1), block(1,length,1)).
// No bounds check, so the launch must supply exactly one thread per element.
__global__ void one_vector_add(float* a_device, float* b_device, float* c_device) {
    // Conventional row-major flattening of a 2D thread index (x fastest).
    // BUGFIX: the original computed threadIdx.y + blockDim.x*threadIdx.x,
    // which swaps x and y; with block(1,N,1) both formulas coincide, but
    // this is the correct general form.
    int tid = threadIdx.x + blockDim.x * threadIdx.y;
    c_device[tid] = a_device[tid] + b_device[tid];
}
// 1D array addition demo: builds two length-4 float arrays on the host,
// adds them on the GPU via one_vector_add, and prints a, b and the result.
// Demonstrates the full host-side workflow: allocate, copy in, launch,
// copy out, free.
void OneDim_add() {
    const int length = 4;                  // array length (4 elements; the original comment wrongly said 16)
    float a[length], b[length], c[length]; // host arrays
    // Initialize host inputs: a = 1..length, b = 10*a.
    for (int i = 0; i < length; i++) {
        a[i] = i + 1;
        b[i] = a[i] * 10;
    }
    // Print the initialized inputs.
    cout << "a=" << '\t';
    for (int i = 0; i < length; i++) { cout << a[i] << '\t'; }
    cout << '\n' << "b=" << '\t';
    for (int i = 0; i < length; i++) { cout << b[i] << '\t'; }
    // Declare device pointers and allocate device memory.
    float *a_device, *b_device, *c_device; // device arrays
    cudaSetDevice(0); // select CUDA device 0
    cudaMalloc((void**)&a_device, length * sizeof(float));
    cudaMalloc((void**)&b_device, length * sizeof(float));
    cudaMalloc((void**)&c_device, length * sizeof(float));
    // Copy host inputs to the device.
    cudaMemcpy(a_device, a, length * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(b_device, b, length * sizeof(float), cudaMemcpyHostToDevice);
    cout << '\n' << "length:" << length << "\t分配大小:" << length * sizeof(float) << endl;
    // One block with `length` threads laid out along y, matching the index
    // computation inside one_vector_add.
    dim3 grid(1, 1, 1), block(1, length, 1);
    // Launch the kernel.
    one_vector_add <<<grid, block >>> (a_device, b_device, c_device);
    // Blocking device-to-host copy; also synchronizes with the kernel.
    cudaMemcpy(c, c_device, length * sizeof(float), cudaMemcpyDeviceToHost);
    cout << '\n' << "c=" << '\t';
    for (int i = 0; i < length; i++) { cout << c[i] << '\t'; } // print GPU result
    // Release device memory (the original leaked these allocations).
    cudaFree(a_device);
    cudaFree(b_device);
    cudaFree(c_device);
}
// block-thread 1D-3D: c = a + b with a single block of 3D threads.
// Flattens the 3D thread index row-major (x fastest); also valid for a
// 2D block, where threadIdx.z is always 0.
__global__ void TwoBlock1Thread2Way1(float *c, const float *a, const float *b)
{
    int planeSize = blockDim.x * blockDim.y;                            // threads per z-slice
    int idx = threadIdx.z * planeSize + threadIdx.y * blockDim.x + threadIdx.x;
    c[idx] = b[idx] + a[idx];
}
// block-thread 2D-2D: global index from a 2D grid of 2D blocks.
// The block index formula also works for a 1D grid (blockIdx.y == 0).
__global__ void TwoBlock2Thread3Way1(float *c, const float *a, const float *b)
{
    int localId = threadIdx.y * blockDim.x + threadIdx.x; // index within the block
    int blockId = blockIdx.y * gridDim.x + blockIdx.x;    // index of the block in the grid
    int threadsPerBlock = blockDim.x * blockDim.y;
    int idx = blockId * threadsPerBlock + localId;
    c[idx] = b[idx] + a[idx];
}
// 2D-as-1D addition demo: treats a row x col matrix as a flat 1D array,
// adds two such matrices on the GPU, and prints a, b and the result as
// row x col tables.
void TwoDimway1_add() {
    // Flatten the 2D matrix into a 1D array and distribute it over blocks.
    const int row = 4;
    const int col = 6;
    const int length = row * col;          // total element count: 24 (the original comment wrongly said 16)
    float a[length], b[length], c[length]; // host arrays
    // Initialize host inputs: a = 1..length, b = 100*a.
    for (int i = 0; i < length; i++) {
        a[i] = i + 1;
        b[i] = a[i] * 100;
    }
    // Print operand a.
    // BUGFIX: the original reused `i` for the inner loop (shadowing the
    // outer index), so every row printed elements 0..col-1; use a separate
    // `j` and index the flat array as i*col + j.
    cout << "變量a=" << endl;
    for (int i = 0; i < row; i++) {
        for (int j = 0; j < col; j++) {
            cout << '\t' << a[i * col + j];
        }
        cout << endl;
    }
    // Print operand b (same index fix as above).
    cout << "變量b=" << endl;
    for (int i = 0; i < row; i++) {
        for (int j = 0; j < col; j++) {
            cout << '\t' << b[i * col + j];
        }
        cout << endl;
    }
    // Declare device pointers and allocate device memory.
    float *a_device, *b_device, *c_device; // device arrays
    cudaSetDevice(0); // select CUDA device 0
    cudaMalloc((void**)&a_device, length * sizeof(float));
    cudaMalloc((void**)&b_device, length * sizeof(float));
    cudaMalloc((void**)&c_device, length * sizeof(float));
    // Copy host inputs to the device.
    cudaMemcpy(a_device, a, length * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(b_device, b, length * sizeof(float), cudaMemcpyHostToDevice);
    // Grid/block configuration. Alternative configurations kept for reference:
    //dim3 grid(1, 1, 1), block(row, col/2, col / 2); // 3D block
    //TwoBlock1Thread2Way1 <<<grid, block >>> (c_device, a_device, b_device);
    //dim3 grid(1, 1, 1), block(row, col , 1); // 2D block
    //TwoBlock1Thread2Way1 << <grid, block >> > (c_device, a_device, b_device);
    dim3 grid(2, 1, 1), block(row/2, col, 1); // 2 blocks x (row/2 * col) threads = length
    TwoBlock2Thread3Way1 << <grid, block >> > (c_device, a_device, b_device); // launch kernel
    // Blocking device-to-host copy; also synchronizes with the kernel.
    cudaMemcpy(c, c_device, length * sizeof(float), cudaMemcpyDeviceToHost);
    // Print the result (same index fix as above).
    cout << "計算結果 c=" << endl;
    for (int i = 0; i < row; i++) {
        for (int j = 0; j < col; j++) {
            cout << '\t' << c[i * col + j];
        }
        cout << endl;
    }
    // Release device memory (the original leaked these allocations).
    cudaFree(a_device);
    cudaFree(b_device);
    cudaFree(c_device);
}
// Entry point: run the 1D demo, then the flattened-2D demo.
int main() {
    cout << "\n一維數組cuda計算結果:\n" << endl;
    OneDim_add();
    cout << "\n\n二維數組一維方式cuda計算結果:\n\n" << endl;
    TwoDimway1_add();
    return 0;
}
其它kernel實作形式代碼展示:
// thread 1D: c = b - a with a single block, threads indexed along x only.
__global__ void Thread1(int *c, const int *a, const int *b)
{
    int idx = threadIdx.x;
    c[idx] = b[idx] - a[idx];
}
// thread 2D: c = b - a; flatten a 2D thread index (x fastest) in one block.
__global__ void Thread2(int *c, const int *a, const int *b)
{
    int idx = threadIdx.y * blockDim.x + threadIdx.x;
    c[idx] = b[idx] - a[idx];
}
// thread 3D: c = b - a; flatten a 3D thread index in one block.
__global__ void Thread3(int *c, const int *a, const int *b)
{
    int planeSize = blockDim.x * blockDim.y;            // threads per z-slice
    int idx = threadIdx.z * planeSize + threadIdx.y * blockDim.x + threadIdx.x;
    c[idx] = b[idx] - a[idx];
}
// block 1D: c = b - a with one thread per block, blocks indexed along x.
__global__ void Block1(int *c, const int *a, const int *b)
{
    int idx = blockIdx.x;
    c[idx] = b[idx] - a[idx];
}
// block 2D: c = b - a; flatten a 2D block index (one thread per block).
__global__ void Block2(int *c, const int *a, const int *b)
{
    int idx = blockIdx.y * gridDim.x + blockIdx.x;
    c[idx] = b[idx] - a[idx];
}
// block 3D: c = b - a; flatten a 3D block index (one thread per block).
__global__ void Block3(int *c, const int *a, const int *b)
{
    int gridPlane = gridDim.x * gridDim.y;              // blocks per z-slice
    int idx = blockIdx.z * gridPlane + blockIdx.y * gridDim.x + blockIdx.x;
    c[idx] = b[idx] - a[idx];
}
// block-thread 1D-1D: c = b - a; the canonical flat global index.
__global__ void Block1Thread1(float *c, const float *a, const float *b)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    c[idx] = b[idx] - a[idx];
}
// block-thread 1D-2D: c = b - a; 1D grid of 2D blocks.
__global__ void Block1Thread2(float *c, const float *a, const float *b)
{
    int localId = threadIdx.y * blockDim.x + threadIdx.x; // index within the block
    int threadsPerBlock = blockDim.x * blockDim.y;
    int idx = blockIdx.x * threadsPerBlock + localId;
    c[idx] = b[idx] - a[idx];
}
// block-thread 1D-3D: c = b - a; 1D grid of 3D blocks.
__global__ void Block1Thread3(int *c, const int *a, const int *b)
{
    int planeSize = blockDim.x * blockDim.y;
    int localId = threadIdx.z * planeSize + threadIdx.y * blockDim.x + threadIdx.x;
    int threadsPerBlock = planeSize * blockDim.z;
    int idx = blockIdx.x * threadsPerBlock + localId;
    c[idx] = b[idx] - a[idx];
}
// block-thread 2D-1D: c = b - a; 2D grid of 1D blocks.
__global__ void Block2Thread1(int *c, const int *a, const int *b)
{
    int blockId = blockIdx.y * gridDim.x + blockIdx.x; // flattened 2D block index
    int idx = blockId * blockDim.x + threadIdx.x;
    c[idx] = b[idx] - a[idx];
}
// block-thread 3D-1D: c = b - a; 3D grid of 1D blocks.
__global__ void Block3Thread1(int *c, const int *a, const int *b)
{
    int gridPlane = gridDim.x * gridDim.y;
    int blockId = blockIdx.z * gridPlane + blockIdx.y * gridDim.x + blockIdx.x;
    int idx = blockId * blockDim.x + threadIdx.x;
    c[idx] = b[idx] - a[idx];
}
// block-thread 2D-2D: c = b - a; 2D grid of 2D blocks.
__global__ void Block2Thread2(int *c, const int *a, const int *b)
{
    int localId = threadIdx.y * blockDim.x + threadIdx.x; // index within the block
    int blockId = blockIdx.y * gridDim.x + blockIdx.x;    // flattened 2D block index
    int threadsPerBlock = blockDim.x * blockDim.y;
    int idx = blockId * threadsPerBlock + localId;
    c[idx] = b[idx] - a[idx];
}
// block-thread 2D-3D: c = b - a; 2D grid of 3D blocks.
__global__ void Block2Thread3(int *c, const int *a, const int *b)
{
    int planeSize = blockDim.x * blockDim.y;
    int localId = threadIdx.z * planeSize + threadIdx.y * blockDim.x + threadIdx.x;
    int blockId = blockIdx.y * gridDim.x + blockIdx.x;    // flattened 2D block index
    int threadsPerBlock = planeSize * blockDim.z;
    int idx = blockId * threadsPerBlock + localId;
    c[idx] = b[idx] - a[idx];
}
// block-thread 3D-2D: c = b - a; 3D grid of 2D blocks.
__global__ void Block3Thread2(int *c, const int *a, const int *b)
{
    int localId = threadIdx.y * blockDim.x + threadIdx.x; // index within the block
    int gridPlane = gridDim.x * gridDim.y;
    int blockId = blockIdx.z * gridPlane + blockIdx.y * gridDim.x + blockIdx.x;
    int threadsPerBlock = blockDim.x * blockDim.y;
    int idx = blockId * threadsPerBlock + localId;
    c[idx] = b[idx] - a[idx];
}
// block-thread 3D-3D: c = b - a; 3D grid of 3D blocks (the general case).
__global__ void Block3Thread3(int *c, const int *a, const int *b)
{
    int planeSize = blockDim.x * blockDim.y;
    int localId = threadIdx.z * planeSize + threadIdx.y * blockDim.x + threadIdx.x;
    int gridPlane = gridDim.x * gridDim.y;
    int blockId = blockIdx.z * gridPlane + blockIdx.y * gridDim.x + blockIdx.x;
    int threadsPerBlock = planeSize * blockDim.z;
    int idx = blockId * threadsPerBlock + localId;
    c[idx] = b[idx] - a[idx];
}
3.實作結果展示
采用加法運算,将其變量a與變量b位置元素一一對應相加結果:
處理算法通用的輔助的code,如讀取txt檔案,讀取xml檔案,将xml檔案轉換成txt檔案,讀取json檔案等