前面已经写过了利用Fortran实现任意维度矩阵乘的CUDA实现,详见:https://blog.csdn.net/xll_bit/article/details/117551476?spm=1001.2014.3001.5501
今天更新一个使用c实现的任意维度矩阵乘的CUDA实现,分别包含了简单实现版本和利用tile技术实现的share memory 版本,详细代码如下:
#include <cmath>
#include <cstdlib>
#include <iostream>
#include <random>
#include <time.h>
using namespace std;
#define TILE_SIZE 16
typedef float my_type;
// Compare a reference matrix `s` against a computed matrix `p`
// (both M x K, row-major, flattened). Returns true when every pair of
// elements agrees within the tolerance; otherwise prints the first
// mismatch (index, both values, difference) and returns false.
bool comp(my_type *s, my_type *p, int M, int K){
    // Absolute tolerance of 1e-1: CPU and GPU accumulate in different
    // orders, so bit-exact float equality is not expected.
    const my_type tol = (my_type)1e-1;
    for(int m = 0; m < M * K; m ++){
        // std::fabs: explicit float overload instead of relying on a
        // transitively-included abs() that may resolve to abs(int)
        my_type diff = std::fabs(s[m] - p[m]);
        if( diff > tol){
            // trailing space after "at" keeps the index readable
            cout << "Test failed at " << m <<' '<<s[m]<<' '<<p[m]<<' '<< diff <<endl;
            return false;
        }
    }
    cout << "Test passed" << endl;
    return true;
}
// Dump two M x K row-major matrices for visual comparison: for each
// row index, print the row of `s` on one line, then the corresponding
// row of `p` on the next.
void print(my_type *s, my_type *p, int M, int K){
    for(int row = 0; row < M; ++row){
        const my_type *s_row = s + row * K;
        const my_type *p_row = p + row * K;
        for(int col = 0; col < K; ++col){
            cout << s_row[col] << ' ';
        }
        cout << endl;
        for(int col = 0; col < K; ++col){
            cout << p_row[col] << ' ';
        }
        cout << endl;
    }
}
// Naive GEMM: C = A * B with A (M x N), B (N x K), C (M x K), all
// row-major in global memory. One thread computes one element of C.
// Expects a 2D launch whose x dimension covers the K columns and
// y dimension covers the M rows; out-of-range threads exit via the
// guard. `const __restrict__` lets the compiler route A and B through
// the read-only data cache.
__global__ void kernel_global(const my_type * __restrict__ a,
                              const my_type * __restrict__ b,
                              my_type * __restrict__ c,
                              const int M, const int N, const int K){
    int x = blockIdx.x * blockDim.x + threadIdx.x;  // column of C / B
    int y = blockIdx.y * blockDim.y + threadIdx.y;  // row of C / A
    if(x < K && y < M){
        my_type tmp = 0;
        for(int n = 0; n < N; n++){
            // b[n*K + x] is coalesced: adjacent threads (adjacent x)
            // touch adjacent addresses; a[y*N + n] broadcasts per row
            tmp += a[y * N + n] * b[n * K + x];
        }
        c[y * K + x] = tmp;
    }
}
// Tiled GEMM: C = A * B (same shapes/layout as kernel_global) using
// TILE_SIZE x TILE_SIZE shared-memory tiles to reuse each global load
// TILE_SIZE times. Requires blockDim == (TILE_SIZE, TILE_SIZE).
// Threads outside C still run the tile loop (they load zeros) so that
// every thread reaches the barriers.
__global__ void kernel_shared(const my_type * __restrict__ a,
                              const my_type * __restrict__ b,
                              my_type * __restrict__ c,
                              const int M, const int N, const int K){
    __shared__ my_type s_a[TILE_SIZE][TILE_SIZE];
    __shared__ my_type s_b[TILE_SIZE][TILE_SIZE];
    int x = blockIdx.x * blockDim.x + threadIdx.x;  // column of C / B
    int y = blockIdx.y * blockDim.y + threadIdx.y;  // row of C / A
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    my_type tmp = 0;
    // slide a TILE_SIZE-wide window along the shared dimension N
    for(int n = 0; n < N; n += TILE_SIZE){
        // zero first so partial edge tiles contribute nothing to tmp
        s_a[ty][tx] = 0;
        s_b[ty][tx] = 0;
        if( y < M && n + tx < N){
            s_a[ty][tx] = a[ y * N + n + tx];
        }
        if (x < K && n + ty < N){
            s_b[ty][tx] = b[(n + ty) * K + x];
        }
        // loop bounds are uniform across the block, so every thread
        // reaches both barriers — no divergent __syncthreads()
        __syncthreads();
        #pragma unroll
        for(int i = 0; i < TILE_SIZE; i ++){
            tmp += s_a[ty][i] * s_b[i][tx];
        }
        __syncthreads();  // don't refill tiles while they are being read
    }
    if(x < K && y < M){
        c[y * K + x] = tmp;
    }
}
// Benchmark driver: fills A (M x N) and B (N x K) with random values,
// times the naive and tiled kernels (average of NUM launches, in ms)
// plus a CPU reference, verifies the tiled kernel's output against the
// CPU result, and reports speedups.
int main(){
    const int M = 2000, N = 1000, K = 3000, NUM = 100;
    my_type *h_a, *h_b, *p_c, *s_c;   // host buffers
    my_type *d_a, *d_b, *d_c;         // device buffers
    h_a = new my_type[M*N];
    h_b = new my_type[N*K];
    p_c = new my_type[M*K];           // result copied back from the GPU
    s_c = new my_type[M*K];           // CPU reference result
    // initialize inputs with random values in [0, 10]
    srand((unsigned)time(NULL));
    for(unsigned int i = 0; i < M*N; i++){
        h_a[i] = (my_type)(rand())/((my_type)RAND_MAX/10);
    }
    for(unsigned int i = 0; i < N*K; i++){
        h_b[i] = (my_type)(rand())/((my_type)RAND_MAX/10);
    }
    cudaEvent_t start, stop;
    float elapsedTime_global, elapsedTime_shared, elapsedTime_cpu;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaMalloc(&d_a, M*N*sizeof(my_type));
    cudaMalloc(&d_b, N*K*sizeof(my_type));
    cudaMalloc(&d_c, M*K*sizeof(my_type));
    cudaMemcpy(d_a, h_a, M*N*sizeof(my_type), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, N*K*sizeof(my_type), cudaMemcpyHostToDevice);
    dim3 block(TILE_SIZE, TILE_SIZE);
    // ceil-div so partial edge tiles are still covered by a block
    dim3 grid((K-1)/TILE_SIZE + 1, (M-1)/TILE_SIZE + 1);
    // warm up BOTH kernels so neither timed loop pays one-time
    // startup/JIT costs (the original only warmed up kernel_global)
    for(int i = 0; i < NUM; i++)
        kernel_global<<<grid,block>>>(d_a, d_b, d_c, M, N, K);
    kernel_shared<<<grid,block>>>(d_a, d_b, d_c, M, N, K);
    // kernel launches fail silently: surface bad-config errors here
    cudaError_t err = cudaGetLastError();
    if(err != cudaSuccess){
        cout << "CUDA launch error: " << cudaGetErrorString(err) << endl;
        return 1;
    }
    cudaEventRecord(start, 0);
    for(int i = 0; i < NUM; i++)
        kernel_global<<<grid,block>>>(d_a, d_b, d_c, M, N, K);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime_global, start, stop);
    elapsedTime_global /= NUM;    // average ms per launch
    cudaEventRecord(start, 0);
    for(int i = 0; i < NUM; i++)
        kernel_shared<<<grid,block>>>(d_a, d_b, d_c, M, N, K);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime_shared, start, stop);
    elapsedTime_shared /= NUM;    // average ms per launch
    // d_c now holds the shared-kernel result; that is what is verified
    cudaMemcpy(p_c, d_c, M*K*sizeof(my_type), cudaMemcpyDeviceToHost);
    clock_t c_start, c_stop;
    c_start = clock();
    for(int m = 0; m < M; m++){
        for(int k = 0; k < K; k++){
            my_type tmp = 0;
            for(int n = 0; n < N; n++){
                tmp += h_a[m * N + n] * h_b[n * K + k];
            }
            s_c[m * K + k] = tmp;
        }
    }
    c_stop = clock();
    // clock() ticks at CLOCKS_PER_SEC (typically 1e6, NOT 1000); the
    // original integer division by 1000 reported wrong units. Convert
    // to milliseconds to match the cudaEvent timings above.
    elapsedTime_cpu = (float)(c_stop - c_start) * 1000.0f / CLOCKS_PER_SEC;
    if(!comp(s_c, p_c, M, K))
        print(s_c, p_c, M, K);
    cout<<"cpu time: "<<elapsedTime_cpu<<endl;
    cout<<"gpu global: "<<elapsedTime_global<<endl;
    cout<<"gpu shared: "<<elapsedTime_shared<<endl;
    cout<<"speedup-global: "<< elapsedTime_cpu / elapsedTime_global << endl;
    cout<<"speedup-shared: "<< elapsedTime_cpu / elapsedTime_shared << endl;
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    // buffers were allocated with new[]: free() on them is undefined
    // behavior — delete[] is required
    delete[] h_a;
    delete[] h_b;
    delete[] s_c;
    delete[] p_c;
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return 0;
}
如下是加上 -O2编译选项后运行出的结果: