【CUDA開發】CUDA開發瑣碎知識

## 一維矩陣的加

//實作一個一維1*16的小矩陣的加法。

//矩陣大小:1*16

//分配一個block，共有16個線程并發執行。

#include <stdio.h>

#include <stdlib.h>

#include <time.h>

#include <cuda_runtime.h>

#include <cutil.h>

#define vec_size 16

//kernel function

// Element-wise vector addition kernel: d_c[i] = d_a[i] + d_b[i].
// Launch layout: one 1-D block; each thread handles the element at its own threadIdx.x.
// d_a, d_b and d_c are indexed up to vec_size-1, so each must hold at least vec_size floats.
__global__ void vecadd(float* d_a,float* d_b,float* d_c)
{
    // threadIdx is the built-in thread index; CUDA identifiers are case-sensitive.
    int index=threadIdx.x;

    // Guard so a launch with more than vec_size threads cannot access out of bounds.
    if (index<vec_size)
    {
        d_c[index]=d_a[index]+d_b[index];
    }
}

// Prints a diagnostic and exits when a CUDA runtime call reports an error.
static void vecadd_check(cudaError_t err,const char* what)
{
    if (err!=cudaSuccess)
    {
        fprintf(stderr,"%s failed: %s\n",what,cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

// Host driver: adds two 1 x vec_size vectors on the GPU and prints the result.
// Returns 0 on success; exits with a failure status if an allocation or CUDA call fails.
int main()
{
    // Size in bytes of each vector.
    size_t size=vec_size*sizeof(float);

    // Allocate host memory.
    float* h_a=(float*)malloc(size);
    float* h_b=(float*)malloc(size);
    float* h_c=(float*)malloc(size);
    if (h_a==NULL || h_b==NULL || h_c==NULL)
    {
        fprintf(stderr,"host allocation failed\n");
        free(h_a);
        free(h_b);
        free(h_c);
        return EXIT_FAILURE;
    }

    // Initialise the inputs (float literals avoid a double-to-float conversion).
    for (int i=0;i<vec_size;++i)
    {
        h_a[i]=1.0f;
        h_b[i]=2.0f;
    }

    // Copy the input vectors from host memory to the device.
    float* d_a=NULL;
    vecadd_check(cudaMalloc((void**)&d_a,size),"cudaMalloc(d_a)");
    vecadd_check(cudaMemcpy(d_a,h_a,size,cudaMemcpyHostToDevice),"cudaMemcpy(d_a)");

    float* d_b=NULL;
    vecadd_check(cudaMalloc((void**)&d_b,size),"cudaMalloc(d_b)");
    vecadd_check(cudaMemcpy(d_b,h_b,size,cudaMemcpyHostToDevice),"cudaMemcpy(d_b)");

    // Allocate device memory for the result.
    float* d_c=NULL;
    vecadd_check(cudaMalloc((void**)&d_c,size),"cudaMalloc(d_c)");

    // One block of vec_size threads, one thread per element.
    dim3 dimblock(vec_size);
    vecadd<<<1,dimblock>>>(d_a,d_b,d_c);
    // A kernel launch returns no status itself; launch errors are read back here.
    vecadd_check(cudaGetLastError(),"vecadd launch");

    // Copy the result back to host memory.
    vecadd_check(cudaMemcpy(h_c,d_c,size,cudaMemcpyDeviceToHost),"cudaMemcpy(h_c)");

    // Print the result, tab-separated.
    for (int j=0;j<vec_size;++j)
    {
        printf("%f\t",h_c[j]);
    }
    printf("\n");

    // Release device and host memory.
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    free(h_a);
    free(h_b);
    free(h_c);

    return 0;
}

## cudamallocpitch()的使用

名稱：cudaMallocPitch – 在 GPU 上分配存儲器

概要：cudaError_t cudaMallocPitch( void** devPtr, size_t* pitch, size_t widthInBytes, size_t height )

說明：向裝置分配至少 widthInBytes*height 位元組的線性存儲器，并以 *devPtr 的形式傳回指向所分配存儲器的指針。該函數可能會對所分配的存儲器進行填充（padding），以確保在位址從一行移到下一行時，每一行的起始指針依然滿足對齊要求。cudaMallocPitch() 以 *pitch 的形式傳回間距，即所分配存儲器每一行的寬度，以位元組為單位。間距是存儲器分配的一個獨立參數，用于在 2D 數組内計算位址。給定一個 T 類型數組元素的行 Row 和列 Column，可按如下方法計算位址：

T* pElement = (T*)((char*)BaseAddress + Row * pitch) + Column;

對于 2D 數組的分配，建議程式員考慮使用 cudaMallocPitch() 來執行帶間距的分配。由于硬體中存在間距對齊限制，如果應用程式將在裝置存儲器的不同區域之間執行 2D 存儲器複制（無論是線性存儲器還是 CUDA 數組），這種方法將非常有用。

例子：為emudebug

《CUDA 程式設計指南》中原例子給出的 pitch 類型為 int，與 cudaMallocPitch() 要求的 size_t* 參數類型不比對，實際使用時應將 pitch 宣告為 size_t。

/************************************************************************/

/* this is a example of the cuda program.

/************************************************************************/

/* mykernel */

// Debug kernel: walks a pitched 2-D float allocation row by row and prints every element.
// The loops cover the whole array, so each launched thread repeats the full traversal;
// the accompanying example launches it as <<<1,1>>>.
//   devptr : base pointer of the pitched allocation
//   height : number of rows
//   width  : number of float elements per row
//   pitch  : distance between consecutive rows, in bytes
__global__ void mykernel(float* devptr,int height,int width,int pitch)
{
    for(int r=0;r<height;++r)
    {
        // Rows are pitch BYTES apart, so the offset is applied on a char* before
        // converting back to float*.
        float* row=(float*)((char*)devptr+(size_t)r*pitch);

        for (int c=0;c<width;++c)
        {
            float element=row[c];

            // Device-side printf, used here for debugging.
            printf("%f\n",element);
        }
    }
}

/* main cuda */

// Host driver for the pitched-allocation example: allocates a width x height float
// array with cudaMallocPitch, lets mykernel print it, then prints the pitch.
int main(int argc, char* argv[])
{
    size_t width=10;
    size_t height=10;

    float* decptr=NULL;

    // pitch must be size_t: cudaMallocPitch takes a size_t* for it.
    size_t pitch=0;

    cudaError_t err=cudaMallocPitch((void**)&decptr,&pitch,width*sizeof(float),height);
    if (err!=cudaSuccess)
    {
        fprintf(stderr,"cudaMallocPitch failed: %s\n",cudaGetErrorString(err));
        return 1;
    }

    // Zero the allocation so the kernel does not print uninitialised memory.
    err=cudaMemset2D(decptr,pitch,0,width*sizeof(float),height);
    if (err!=cudaSuccess)
    {
        fprintf(stderr,"cudaMemset2D failed: %s\n",cudaGetErrorString(err));
        cudaFree(decptr);
        return 1;
    }

    mykernel<<<1,1>>>(decptr,(int)height,(int)width,(int)pitch);

    // Wait for the kernel to finish before freeing the memory it reads; this also
    // surfaces launch and execution errors.
    err=cudaDeviceSynchronize();
    if (err!=cudaSuccess)
    {
        fprintf(stderr,"mykernel failed: %s\n",cudaGetErrorString(err));
        cudaFree(decptr);
        return 1;
    }

    cudaFree(decptr);

    // %zu is the printf conversion for size_t.
    printf("%zu\n",pitch);

    //CUT_EXIT(argc, argv);

    return 0;
}

## cudamallocarray()的使用

名稱: cudaMemcpyToArray – 在主機和裝置間複制資料

概要: cudaError_t cudaMemcpyToArray(struct cudaArray* dstArray, size_t dstX, size_t dstY, const void* src, size_t count, enum cudaMemcpyKind kind)

cudaError_t cudaMemcpyToArrayAsync(struct cudaArray* dstArray, size_t dstX, size_t dstY, const void* src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream)

說明：從 src 指向的存儲器區域將 count 個位元組複制到 CUDA 數組 dstArray 中，目標區域的左上角從 (dstX, dstY) 開始；kind 是 cudaMemcpyHostToHost、cudaMemcpyHostToDevice、cudaMemcpyDeviceToHost 或 cudaMemcpyDeviceToDevice 之一，用于指定複制的方向。

cudaMemcpyToArrayAsync() 是異步的，可選擇傳入非零流參數，將其關聯到一個流。它僅對分頁鎖定（page-locked）的主機存儲器有效，如果傳入指向可分頁存儲器的指針，將傳回一個錯誤。

傳回值相關傳回值：

cudaSuccess

cudaErrorInvalidValue

cudaErrorInvalidDevicePointer

cudaErrorInvalidMemcpyDirection

注意，如果之前是異步啟動，該函數可能傳回錯誤碼。

注：

《CUDA 程式設計指導》在 cudaMallocArray() 的例子中對 cudaMemcpyToArray() 的使用，個人覺得有錯誤。

enum cudaMemcpyKind kind 應該是 cudaMemcpyHostToHost、cudaMemcpyHostToDevice、cudaMemcpyDeviceToHost 或 cudaMemcpyDeviceToDevice 之一。

在指導中使用的是 cudaMemcpyToArray(cuArray, 0, 0, h_data, &channelDesc)，其中 channelDesc 為 cudaChannelFormatDesc 類型，不是 cudaMemcpyKind。

/*********************************************************************/

/* this is a example of the cuda program.*/

// Example: upload a host 2-D int array into a CUDA array.
const int width=10;
const int height=10;

// Initialise the host data. Row-major layout: the first index is the row (y) and
// the second is the column (x), matching the (width, height) order of cudaMallocArray.
int h_array[height][width];
for (int i=0;i<height;i++)
{
    for (int j=0;j<width;++j)
    {
        h_array[i][j]=j+i*64;
    }
}

// The channel descriptor gives the component count and data type of the CUDA array:
// here one 32-bit signed integer per element, matching the int elements of h_array.
cudaChannelFormatDesc channeldesc = cudaCreateChannelDesc(32,0,0,0,cudaChannelFormatKindSigned);

cudaArray* cuarray;
cudaMallocArray(&cuarray,&channeldesc,width,height);

size_t sizemem=width*height*sizeof(int);
size_t potx=0;
size_t poty=0;

// The source is host memory and the destination is the CUDA array on the device,
// so the direction must be host-to-device.
// NOTE: cudaMemcpyToArray is deprecated in recent CUDA toolkits; cudaMemcpy2DToArray
// is the documented replacement.
cudaMemcpyToArray(cuarray,potx,poty,h_array,sizemem,cudaMemcpyHostToDevice);

cudaFreeArray(cuarray);

## cuda統計時間

在cuda中統計運算時間，大緻有三種方法：

<1>使用cutil.h中的函數

// Timing with the cutil timer helpers.
unsigned int timer=0;

// Create the timer.
cutCreateTimer(&timer);

// Start timing.
cutStartTimer(timer);

// ... code section being timed ...

// Kernel launches are asynchronous: wait for the GPU to finish so the host-side
// timer covers the kernel execution and not just the launch.
cudaDeviceSynchronize();

// Stop timing.
cutStopTimer(timer);

// Read the time accumulated between start and stop.
float elapsed=cutGetTimerValue(timer);

// Delete the timer.
cutDeleteTimer(timer);

不清楚這種方式的計時精度。

<2>time.h中的clock函數

// Timing with clock() from <time.h>.
clock_t start, finish;

float costtime;

start = clock();

// ... code section being timed ...

finish = clock();

// Difference between the two samples, converted from clock ticks to seconds.
// CLOCKS_PER_SEC is an upper-case macro defined by <time.h>.
costtime = (float)(finish - start) / CLOCKS_PER_SEC;

時鐘計時單元的長度為1毫秒，那麼計時的精度也為1毫秒。

<3>事件event

// Timing with CUDA events.
cudaEvent_t start,stop;

// Elapsed time in milliseconds, filled in by cudaEventElapsedTime.
float costtime=0.0f;

cudaEventCreate(&start);
cudaEventCreate(&stop);

// Record the start event on stream 0.
cudaEventRecord(start,0);

// ... code section being timed ...

// Record the stop event on stream 0.
cudaEventRecord(stop,0);

// cudaEventRecord is asynchronous: wait until the stop event has actually been
// reached before reading the elapsed time.
cudaEventSynchronize(stop);

cudaEventElapsedTime(&costtime,start,stop);

// Release the event objects.
cudaEventDestroy(start);
cudaEventDestroy(stop);

cudaError_t cudaEventCreate( cudaEvent_t* event )---建立事件對象；

cudaError_t cudaEventRecord( cudaEvent_t event, cudaStream_t stream )---記錄事件；

cudaError_t cudaEventElapsedTime( float* time, cudaEvent_t start, cudaEvent_t end )---計算兩次事件之間相差的時間；

cudaError_t cudaEventDestroy( cudaEvent_t event )---銷毀事件對象。

cudaEventElapsedTime() 計算兩次事件之間相差的時間（以毫秒為單位，精度約為0.5微秒）。如果尚未記錄其中任何一個事件，此函數將傳回 cudaErrorInvalidValue。如果記錄其中任何一個事件時使用了非零流，則結果不确定。

## cuda代碼常用編寫技巧

1. 聲明 __shared__ 變量或數組:

// Shared-memory array with a fixed size of 256 floats.
__shared__ float sh_farr[ 256];

// Shared-memory scalar.
__shared__ int a;

2.結構體指針成員的配置設定裝置記憶體:

// Example aggregate with two pointer members; initmem() fills them with cudaMalloc.
typedef struct teacher_t
{
    int a;

    // Pointer to an unsigned int buffer.
    unsigned int *g_mem1;

    // Pointer to a float buffer.
    float *g_mem2;
}teacher;

// Allocates device memory for the pointer members of a teacher struct.
//   t        : struct whose g_mem1 / g_mem2 members receive the device pointers
//   mat_size : number of elements to allocate for each buffer
void initmem( teacher& t, const unsigned int mat_size)
{
    // size_t keeps the byte counts from wrapping for large mat_size.
    size_t mat_size_ui = sizeof(int) * (size_t) mat_size;
    size_t mat_size_f = sizeof(float) * (size_t) mat_size;

    CUDA_SAFE_CALL( cudaMalloc((void**)&t.g_mem1, mat_size_ui) );

    // The float buffer belongs in g_mem2; writing g_mem1 again would leak the first
    // allocation and leave g_mem2 unset.
    CUDA_SAFE_CALL( cudaMalloc((void**)&t.g_mem2, mat_size_f) );

    // ... (the original example continues here)
}

3.計時:

// Timing a kernel with the cutil timer, with every call wrapped in a checking macro.
unsigned int timer = 0;

CUT_SAFE_CALL( cutCreateTimer( &timer));

CUT_SAFE_CALL( cutStartTimer( timer));

// ... launch the kernel(s) here

// Kernel launches are asynchronous: wait for completion so the host timer covers
// the kernel execution.
CUDA_SAFE_CALL( cudaDeviceSynchronize());

CUT_SAFE_CALL( cutStopTimer( timer));

printf( "total time: %f ms\n", cutGetTimerValue( timer) );

CUT_SAFE_CALL( cutDeleteTimer( timer));

4. 擷取輸入指令行中包含的檔案名:

////////////////////////////////////////////////////////////////////////////////
//! Check if a particular filename has to be used for the file where the result
//! is stored
//! @param argc      number of command line arguments (from main(argc, argv))
//! @param argv      pointers to command line arguments (from main(argc, argv))
//! @param filename  filename of result file, updated if the user specified a
//!                  "filename-result" argument; the new string is allocated
//!                  with malloc()
////////////////////////////////////////////////////////////////////////////////
void
getresultfilename( int argc, char** argv, char*& filename)
{
    char* temp = NULL;

    cutGetCmdLineArgumentstr( argc, (const char**) argv, "filename-result", &temp);

    if( NULL != temp)
    {
        // +1 leaves room for the terminating NUL that strcpy writes.
        char* copy = (char*) malloc( sizeof(char) * (strlen( temp) + 1));

        // Only replace the caller's filename when the allocation succeeded.
        if( NULL != copy)
        {
            strcpy( copy, temp);
            filename = copy;
        }

        cutFree( temp);
    }

    printf( "result filename: '%s'\n", filename);
}

類似的:

////////////////////////////////////////////////////////////////////////////////
//! Check if a specific precision of the eigenvalue has to be obtained
//! @param argc       number of command line arguments (from main(argc, argv))
//! @param argv       pointers to command line arguments (from main(argc, argv))
//! @param precision  precision to use, updated if a positive "precision" value
//!                   is specified on the command line
////////////////////////////////////////////////////////////////////////////////
void
getprecision( int argc, char** argv, float& precision)
{
    // Stays negative when no "precision" argument is present.
    float temp = -1.0f;

    cutGetCmdLineArgumentf( argc, (const char**) argv, "precision", &temp);

    if( temp > 0.0f)
    {
        precision = temp;
    }

    printf( "precision: %f\n", precision);
}

5. host 調用完 kernel 函數後需要進行同步，而在 __device__ 或 __global__ 函數内隻需要在必要的地方調用 __syncthreads() 即可:

// Block the host until all previously issued device work has finished.
// cudaDeviceSynchronize() replaces the deprecated cudaThreadSynchronize().
CUDA_SAFE_CALL( cudaDeviceSynchronize());

【CUDA開發】CUDA開發瑣碎知識

繼續閱讀

UVA 590 Always on the run

FZU 1978 Repair the brackets

UVA 10344- 23 out of 5

ZOJ 3935 2016

POJ 2115 C Looooops

HDU 5381 The sum of gcd

ZOJ 1104 Leaps Tall Buildings

ZOJ 3700 Ever Dream

HDU 2821 Pusher

ZOJ 1199 Point of Intersection

UVA 1401 Remember the Word

UVA 620 Cellular Structure

ZOJ 2748 Free Kick

CSU 1567 Reverse Rot

UVA 519 Puzzle (II)

數組和類數組