
[CUDA-C/C++] Matrix Multiplication for Arbitrary Dimensions

I have already covered a Fortran/CUDA implementation of matrix multiplication for arbitrary dimensions; see: https://blog.csdn.net/xll_bit/article/details/117551476?spm=1001.2014.3001.5501

Today's update is a C/CUDA implementation of matrix multiplication for arbitrary dimensions. It includes a simple (naive, global-memory) version and a shared-memory version based on tiling; the launch configuration is sketched right below, and the full code follows after that.
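For orientation (this only restates what the listing below does): A is M x N, B is N x K, and the output C is M x K, with one thread per element of C. The grid is sized by ceiling division so arbitrary dimensions are covered, and out-of-range threads are masked by the bounds checks inside the kernels:

dim3 block(TILE_SIZE, TILE_SIZE);
// equivalent to the (K-1)/TILE_SIZE + 1 form used in main() below
dim3 grid((K + TILE_SIZE - 1) / TILE_SIZE,   // x covers the K columns of C
          (M + TILE_SIZE - 1) / TILE_SIZE);  // y covers the M rows of C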

#include <iostream>
#include <cstdlib>   // rand, srand
#include <cmath>     // fabs
#include <ctime>     // time, clock
using namespace std;

#define TILE_SIZE 16
typedef float my_type;

// element-wise comparison of the CPU reference (s) against the GPU result (p)
bool comp(my_type *s, my_type *p, int M, int K){
    for(int m = 0; m < M * K; m ++){
        if( fabs(s[m] - p[m]) > 1e-1){
            cout << "Test failed at " << m <<' '<<s[m]<<' '<<p[m]<<' '<< fabs(s[m] - p[m])<<endl;
            return false;
        }
    }
    cout << "Test passed" << endl;
    return true;
}
// print the two result matrices row by row (used only when the check fails)
void print(my_type *s, my_type *p, int M, int K){
    for(int m = 0; m < M ; m ++){
        for(int k = 0; k< K; k++){
            cout<<s[m*K + k]<<' ';
        }
        cout<<endl;
        for(int k = 0; k< K; k++){
            cout<<p[m*K + k]<<' ';
        }
        cout<<endl;
    }
    return;
}

// naive kernel: each thread computes one element of C, reading its row of A
// and column of B directly from global memory
__global__ void kernel_global(my_type *a, my_type *b, my_type *c,
                              const int M, const int N, const int K){
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if(x < K && y < M){
        my_type tmp = 0;
        for(int n = 0; n < N; n++){
            tmp += a[y * N + n] * b[n * K + x];
        }
        c[y * K + x] = tmp;
    }
}
// tiled kernel: stage TILE_SIZE x TILE_SIZE blocks of A and B in shared memory,
// zero-padding partial tiles at the matrix edges
__global__ void kernel_shared(my_type *a, my_type *b, my_type *c,
                              const int M, const int N, const int K){
    __shared__ my_type s_a[TILE_SIZE][TILE_SIZE];
    __shared__ my_type s_b[TILE_SIZE][TILE_SIZE];
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    my_type tmp = 0;
    for(int n = 0; n < N; n += TILE_SIZE){
        s_a[ty][tx] = 0;
        s_b[ty][tx] = 0;
        if( y < M && n + tx < N){
            s_a[ty][tx] = a[ y * N + n + tx];
        }
        if (x < K && n + ty < N){
            s_b[ty][tx] = b[(n + ty) * K + x];
        }
        __syncthreads();
        for(int i = 0; i < TILE_SIZE; i ++){
            tmp += s_a[ty][i] * s_b[i][tx];
        }
        __syncthreads();
    }
    if(x < K && y < M){
        c[y * K + x] = tmp;
    }
}

int main(){
    const int M = 2000, N = 1000, K = 3000, NUM = 100;
    my_type *h_a,*h_b,*p_c,*d_a,*d_b,*d_c, *s_c;
    h_a = new my_type[M*N];
    h_b = new my_type[N*K];
    p_c = new my_type[M*K];
    s_c = new my_type[M*K];
    // initialize host matrices with random values in [0, 10)
    srand((unsigned)time(NULL));
    for(unsigned int i=0; i < M*N; i++){
        h_a[i] = (my_type)(rand())/((my_type)RAND_MAX/10);
    }
    for(unsigned int i=0; i < N*K; i++){
        h_b[i] = (my_type)(rand())/((my_type)RAND_MAX/10);
    }
    cudaEvent_t start, stop;
    float elapsedTime_global, elapsedTime_shared, elapsedTime_cpu;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaMalloc(&d_a,M*N*sizeof(my_type));
    cudaMalloc(&d_b,N*K*sizeof(my_type));
    cudaMalloc(&d_c,M*K*sizeof(my_type));
    cudaMemcpy(d_a,h_a,M*N*sizeof(my_type),cudaMemcpyHostToDevice);
    cudaMemcpy(d_b,h_b,N*K*sizeof(my_type),cudaMemcpyHostToDevice);

    dim3 block(TILE_SIZE, TILE_SIZE);
    dim3 grid((K-1)/TILE_SIZE + 1, (M-1)/TILE_SIZE + 1);

    // warm-up launches (not timed)
    for(int i=0; i < NUM; i ++)
        kernel_global<<<grid,block>>>(d_a, d_b, d_c, M, N, K);
    // time NUM launches of the global-memory kernel
    cudaEventRecord(start,0);
    for(int i=0; i < NUM; i ++)
        kernel_global<<<grid,block>>>(d_a, d_b, d_c, M, N, K);
    cudaEventRecord(stop,0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime_global,start,stop);
    elapsedTime_global /= NUM;

    // time NUM launches of the shared-memory (tiled) kernel
    cudaEventRecord(start,0);
    for(int i=0; i < NUM; i ++)
        kernel_shared<<<grid,block>>>(d_a, d_b, d_c, M, N, K);
    cudaEventRecord(stop,0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime_shared,start,stop);
    elapsedTime_shared /= NUM;
    // copy back the tiled kernel's result for verification
    cudaMemcpy(p_c,d_c,M*K*sizeof(my_type),cudaMemcpyDeviceToHost);

    clock_t c_start, c_stop;

    // CPU reference implementation (timed once)
    c_start = clock();
    for(int m = 0; m < M; m ++){
        for( int k = 0; k < K; k ++){
            my_type tmp = 0;
            for( int n = 0; n < N; n++){
                tmp += h_a[m * N + n] * h_b[n * K + k];
            }
            s_c[m * K + k] = tmp;
        }
    }
    c_stop = clock();
    // convert to milliseconds so it is comparable with cudaEventElapsedTime
    elapsedTime_cpu = 1000.0f * (c_stop - c_start) / (float)CLOCKS_PER_SEC;
    if( !comp(s_c,p_c,M, K))
        print(s_c,p_c,M, K);
    cout<<"cpu time (ms):    "<<elapsedTime_cpu<<endl;
    cout<<"gpu global (ms):  "<<elapsedTime_global<<endl;
    cout<<"gpu shared (ms):  "<<elapsedTime_shared<<endl;
    cout<<"speedup-global:  "<< elapsedTime_cpu / elapsedTime_global << endl;
    cout<<"speedup-shared:  "<< elapsedTime_cpu / elapsedTime_shared << endl;


    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    delete[] h_a;   // allocated with new[], so release with delete[] (not free)
    delete[] h_b;
    delete[] s_c;
    delete[] p_c;
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);


    return 0;
}
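
To build and run, a command along these lines should work (the file name matmul.cu is only an assumed example; add an -arch flag for your GPU if needed):

nvcc -O2 -o matmul matmul.cu
./matmul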
           

Below is the output after compiling with the -O2 option:
