前面已经写过了利用Fortran实现任意维度矩阵乘的CUDA实现,详见:https://blog.csdn.net/xll_bit/article/details/117551476?spm=1001.2014.3001.5501
今天更新一个使用c实现的任意维度矩阵乘的CUDA实现,分别包含了简单实现版本和利用tile技术实现的share memory 版本,详细代码如下:
#include <cmath>
#include <cstdlib>
#include <iostream>
#include <random>
#include <time.h>
using namespace std;
#define TILE_SIZE 16
typedef float my_type;
// Compare a reference matrix `s` against a computed matrix `p`
// (both M x K, row-major, flattened). Returns true when every pair of
// elements agrees within the tolerance; otherwise prints the first
// mismatch (index, both values, difference) and returns false.
bool comp(my_type *s, my_type *p, int M, int K){
    // Absolute tolerance of 1e-1: CPU and GPU accumulate in different
    // orders, so bit-exact float equality is not expected.
    const my_type tol = (my_type)1e-1;
    for(int m = 0; m < M * K; m ++){
        // std::fabs: explicit float overload instead of relying on a
        // transitively-included abs() that may resolve to abs(int)
        my_type diff = std::fabs(s[m] - p[m]);
        if( diff > tol){
            // trailing space after "at" keeps the index readable
            cout << "Test failed at " << m <<' '<<s[m]<<' '<<p[m]<<' '<< diff <<endl;
            return false;
        }
    }
    cout << "Test passed" << endl;
    return true;
}
// Dump two M x K row-major matrices for visual comparison: for each
// row index, print the row of `s` on one line, then the corresponding
// row of `p` on the next.
void print(my_type *s, my_type *p, int M, int K){
    for(int row = 0; row < M; ++row){
        const my_type *s_row = s + row * K;
        const my_type *p_row = p + row * K;
        for(int col = 0; col < K; ++col){
            cout << s_row[col] << ' ';
        }
        cout << endl;
        for(int col = 0; col < K; ++col){
            cout << p_row[col] << ' ';
        }
        cout << endl;
    }
}
// Naive GEMM: C = A * B with A (M x N), B (N x K), C (M x K), all
// row-major in global memory. One thread computes one element of C.
// Expects a 2D launch whose x dimension covers the K columns and
// y dimension covers the M rows; out-of-range threads exit via the
// guard. `const __restrict__` lets the compiler route A and B through
// the read-only data cache.
__global__ void kernel_global(const my_type * __restrict__ a,
                              const my_type * __restrict__ b,
                              my_type * __restrict__ c,
                              const int M, const int N, const int K){
    int x = blockIdx.x * blockDim.x + threadIdx.x;  // column of C / B
    int y = blockIdx.y * blockDim.y + threadIdx.y;  // row of C / A
    if(x < K && y < M){
        my_type tmp = 0;
        for(int n = 0; n < N; n++){
            // b[n*K + x] is coalesced: adjacent threads (adjacent x)
            // touch adjacent addresses; a[y*N + n] broadcasts per row
            tmp += a[y * N + n] * b[n * K + x];
        }
        c[y * K + x] = tmp;
    }
}
// Tiled GEMM: C = A * B (same shapes/layout as kernel_global) using
// TILE_SIZE x TILE_SIZE shared-memory tiles to reuse each global load
// TILE_SIZE times. Requires blockDim == (TILE_SIZE, TILE_SIZE).
// Threads outside C still run the tile loop (they load zeros) so that
// every thread reaches the barriers.
__global__ void kernel_shared(const my_type * __restrict__ a,
                              const my_type * __restrict__ b,
                              my_type * __restrict__ c,
                              const int M, const int N, const int K){
    __shared__ my_type s_a[TILE_SIZE][TILE_SIZE];
    __shared__ my_type s_b[TILE_SIZE][TILE_SIZE];
    int x = blockIdx.x * blockDim.x + threadIdx.x;  // column of C / B
    int y = blockIdx.y * blockDim.y + threadIdx.y;  // row of C / A
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    my_type tmp = 0;
    // slide a TILE_SIZE-wide window along the shared dimension N
    for(int n = 0; n < N; n += TILE_SIZE){
        // zero first so partial edge tiles contribute nothing to tmp
        s_a[ty][tx] = 0;
        s_b[ty][tx] = 0;
        if( y < M && n + tx < N){
            s_a[ty][tx] = a[ y * N + n + tx];
        }
        if (x < K && n + ty < N){
            s_b[ty][tx] = b[(n + ty) * K + x];
        }
        // loop bounds are uniform across the block, so every thread
        // reaches both barriers — no divergent __syncthreads()
        __syncthreads();
        #pragma unroll
        for(int i = 0; i < TILE_SIZE; i ++){
            tmp += s_a[ty][i] * s_b[i][tx];
        }
        __syncthreads();  // don't refill tiles while they are being read
    }
    if(x < K && y < M){
        c[y * K + x] = tmp;
    }
}
// Benchmark driver: fills A (M x N) and B (N x K) with random values,
// times the naive and tiled kernels (average of NUM launches, in ms)
// plus a CPU reference, verifies the tiled kernel's output against the
// CPU result, and reports speedups.
int main(){
    const int M = 2000, N = 1000, K = 3000, NUM = 100;
    my_type *h_a, *h_b, *p_c, *s_c;   // host buffers
    my_type *d_a, *d_b, *d_c;         // device buffers
    h_a = new my_type[M*N];
    h_b = new my_type[N*K];
    p_c = new my_type[M*K];           // result copied back from the GPU
    s_c = new my_type[M*K];           // CPU reference result
    // initialize inputs with random values in [0, 10]
    srand((unsigned)time(NULL));
    for(unsigned int i = 0; i < M*N; i++){
        h_a[i] = (my_type)(rand())/((my_type)RAND_MAX/10);
    }
    for(unsigned int i = 0; i < N*K; i++){
        h_b[i] = (my_type)(rand())/((my_type)RAND_MAX/10);
    }
    cudaEvent_t start, stop;
    float elapsedTime_global, elapsedTime_shared, elapsedTime_cpu;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaMalloc(&d_a, M*N*sizeof(my_type));
    cudaMalloc(&d_b, N*K*sizeof(my_type));
    cudaMalloc(&d_c, M*K*sizeof(my_type));
    cudaMemcpy(d_a, h_a, M*N*sizeof(my_type), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, N*K*sizeof(my_type), cudaMemcpyHostToDevice);
    dim3 block(TILE_SIZE, TILE_SIZE);
    // ceil-div so partial edge tiles are still covered by a block
    dim3 grid((K-1)/TILE_SIZE + 1, (M-1)/TILE_SIZE + 1);
    // warm up BOTH kernels so neither timed loop pays one-time
    // startup/JIT costs (the original only warmed up kernel_global)
    for(int i = 0; i < NUM; i++)
        kernel_global<<<grid,block>>>(d_a, d_b, d_c, M, N, K);
    kernel_shared<<<grid,block>>>(d_a, d_b, d_c, M, N, K);
    // kernel launches fail silently: surface bad-config errors here
    cudaError_t err = cudaGetLastError();
    if(err != cudaSuccess){
        cout << "CUDA launch error: " << cudaGetErrorString(err) << endl;
        return 1;
    }
    cudaEventRecord(start, 0);
    for(int i = 0; i < NUM; i++)
        kernel_global<<<grid,block>>>(d_a, d_b, d_c, M, N, K);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime_global, start, stop);
    elapsedTime_global /= NUM;    // average ms per launch
    cudaEventRecord(start, 0);
    for(int i = 0; i < NUM; i++)
        kernel_shared<<<grid,block>>>(d_a, d_b, d_c, M, N, K);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime_shared, start, stop);
    elapsedTime_shared /= NUM;    // average ms per launch
    // d_c now holds the shared-kernel result; that is what is verified
    cudaMemcpy(p_c, d_c, M*K*sizeof(my_type), cudaMemcpyDeviceToHost);
    clock_t c_start, c_stop;
    c_start = clock();
    for(int m = 0; m < M; m++){
        for(int k = 0; k < K; k++){
            my_type tmp = 0;
            for(int n = 0; n < N; n++){
                tmp += h_a[m * N + n] * h_b[n * K + k];
            }
            s_c[m * K + k] = tmp;
        }
    }
    c_stop = clock();
    // clock() ticks at CLOCKS_PER_SEC (typically 1e6, NOT 1000); the
    // original integer division by 1000 reported wrong units. Convert
    // to milliseconds to match the cudaEvent timings above.
    elapsedTime_cpu = (float)(c_stop - c_start) * 1000.0f / CLOCKS_PER_SEC;
    if(!comp(s_c, p_c, M, K))
        print(s_c, p_c, M, K);
    cout<<"cpu time: "<<elapsedTime_cpu<<endl;
    cout<<"gpu global: "<<elapsedTime_global<<endl;
    cout<<"gpu shared: "<<elapsedTime_shared<<endl;
    cout<<"speedup-global: "<< elapsedTime_cpu / elapsedTime_global << endl;
    cout<<"speedup-shared: "<< elapsedTime_cpu / elapsedTime_shared << endl;
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    // buffers were allocated with new[]: free() on them is undefined
    // behavior — delete[] is required
    delete[] h_a;
    delete[] h_b;
    delete[] s_c;
    delete[] p_c;
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return 0;
}
如下是加上 -O2编译选项后运行出的结果: