關鍵是線程配置設定方式,将一個大矩陣分成一片一片的矩陣,用線程的ny,nx來分别表示矩陣的行和列,但由于一般矩陣都是用一個一維數組進行存儲,是以最終對應到全局記憶體的index還得利用行和列轉成一維的index。
#include <stdio.h>
const int N = 128; // matrix' dim
const int TILE_DIM = 32; // block size
const int SIZE = sizeof(int) * N * N; // memory
// Element-wise matrix copy: db[i] = da[i] for an N x N matrix stored
// row-major in a flat device array.
//
// Expected launch: a 2D grid of 2D blocks covering at least N x N threads.
// Each thread copies exactly one element; threads that fall outside the
// matrix exit via the bounds check, so N need not be a multiple of the
// block size.
void __global__ copy(int *da, int *db, const int N);
void __global__ copy(int *da, int *db, const int N)
{
// Map the 2D thread coordinates to a matrix row (ny) and column (nx).
// Using blockDim instead of a hard-coded tile constant keeps the kernel
// correct for ANY block shape chosen at launch time; the original
// silently required blockDim == TILE_DIM.
int ny = blockIdx.y * blockDim.y + threadIdx.y;
int nx = blockIdx.x * blockDim.x + threadIdx.x;
// Row-major flattening: consecutive nx within a warp touch consecutive
// addresses, so global loads/stores are coalesced.
int index = ny*N + nx;
if(ny<N && nx<N)
{
db[index] = da[index];
}
}
// Abort with a diagnostic if a CUDA runtime call fails. Without this,
// a failed cudaMalloc/cudaMemcpy or a bad launch silently yields garbage.
#define CHECK_CUDA(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_));                            \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)

// Host driver: fill an N x N matrix with a known constant, copy it on the
// GPU with the `copy` kernel, verify the result, and print one element.
// Returns 0 on success, EXIT_FAILURE on any allocation or CUDA error.
int main(int argc, char *argv[])
{
// Host buffers, initialized to a known constant so the copy is checkable.
int *ha = (int *)malloc(SIZE);
int *hb = (int *)malloc(SIZE);
if (ha == NULL || hb == NULL)
{
fprintf(stderr, "host allocation failed\n");
return EXIT_FAILURE;
}
for(int i=0; i< N*N; ++i)
{
ha[i] = 100;
}
// Device memory.
int *da, *db;
CHECK_CUDA(cudaMalloc((void **)&da, SIZE));
CHECK_CUDA(cudaMalloc((void **)&db, SIZE));
CHECK_CUDA(cudaMemcpy(da, ha, SIZE, cudaMemcpyHostToDevice));
// Launch one thread per matrix element; ceil-divide so N need not be
// a multiple of TILE_DIM (the kernel bounds-checks the tail).
const dim3 block_size(TILE_DIM, TILE_DIM);
const int grid_size_x = (N +TILE_DIM -1) / TILE_DIM;
const int grid_size_y = grid_size_x;
const dim3 grid_size(grid_size_x, grid_size_y);
copy<<<grid_size,block_size>>>(da,db,N);
CHECK_CUDA(cudaGetLastError()); // catches bad launch configuration
// Blocking device-to-host copy; also synchronizes with the kernel,
// so hb is safe to read afterwards.
CHECK_CUDA(cudaMemcpy(hb,db,SIZE,cudaMemcpyDeviceToHost));
// Verify every element, not just one spot check.
for(int i=0; i< N*N; ++i)
{
if (hb[i] != 100)
{
fprintf(stderr, "mismatch at index %d: got %d\n", i, hb[i]);
return EXIT_FAILURE;
}
}
printf("%d\n",hb[100]);
// Release host and device resources.
free(ha);
free(hb);
CHECK_CUDA(cudaFree(da));
CHECK_CUDA(cudaFree(db));
return 0;
}