
CUDA Practice (2): Grayscale Histogram

Writing the Code

First, the RGB-to-grayscale program from the previous exercise is copied over and used to generate the grayscale image.

Four implementations were written: cpu, gpu_wrong_naive, gpu_naive, and gpu_usesharemem.

CPU version

The CPU version is very simple:

void getGrayHistincpu(unsigned char * const grayData, 
                    unsigned int * const hist,
                    uint imgheight,
                    uint imgwidth)
{
    for(int i = 0; i < imgheight; i++)
    {
        for (int j = 0; j < imgwidth; j++)
        {
            hist[grayData[i*imgwidth+j]]++;
        }
    }
}
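
For example, it can be called on an OpenCV grayscale image like this (a minimal usage sketch; grayImage is assumed to come from rgb2gray, as in the full source further below):

unsigned int hist[256] = {0};                     // one bin per gray level
getGrayHistincpu(grayImage.data, hist,            // Mat::data is the raw 8-bit pixel buffer
                 grayImage.rows, grayImage.cols);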
           

GPU version 1: direct port (gpu_wrong_naive)

__global__ void getGrayHistincuda_wrong_naive(unsigned char * const grayData, 
                                unsigned int * const hist,
                                uint imgheight,
                                uint imgwidth)  // races on hist: the counts change from run to run
{
    const unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x;
    const unsigned int idy = blockDim.y * blockIdx.y + threadIdx.y;    

    if(idx < imgwidth && idy < imgheight)
    {
        const unsigned long pid = imgwidth * idy + idx;
        const unsigned char value = grayData[pid];
        hist[value]++;
    }
}
           

This code is wrong: many threads read, modify, and write the same global-memory counters at the same time, so increments are lost and the counts come out differently on every run.
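
One quick way to catch this kind of race (a hedged host-side sketch, not part of the original code) is to check that the bins of the resulting histogram sum to the number of pixels; the racy kernel usually loses increments and fails this check:

// Hypothetical sanity check: a correct histogram's bins must sum to the pixel count.
static bool histSumMatches(const unsigned int hist[256],
                           unsigned int imgheight, unsigned int imgwidth)
{
    unsigned long long total = 0;
    for (int i = 0; i < 256; i++) total += hist[i];
    return total == (unsigned long long)imgheight * imgwidth;
}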

GPU version 2: atomic operations (gpu_naive)

__global__ void getGrayHistincuda_naive(unsigned char * const grayData, 
                                            unsigned int * const hist,
                                            uint imgheight,
                                            uint imgwidth)  // atomic adds keep the counts correct
{
    const unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x;
    const unsigned int idy = blockDim.y * blockIdx.y + threadIdx.y;    

    if(idx < imgwidth && idy < imgheight)
    {
        const unsigned long pid = imgwidth * idy + idx;
        const unsigned char value = grayData[pid];
        atomicAdd(&(hist[value]), 1);
    }
}
           

This version gives correct results, but it is not particularly fast: a large number of threads (one pixel per thread in this setup) compete for the same global-memory counters.
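
"One pixel per thread" here refers to the launch configuration used later in main: 32x32 thread blocks and a grid sized by ceiling division, roughly:

// Launch configuration from the full source below: one thread per pixel.
dim3 threadsPerBlock(32, 32);
dim3 blocksPerGrid((imgwidth  + threadsPerBlock.x - 1) / threadsPerBlock.x,
                   (imgheight + threadsPerBlock.y - 1) / threadsPerBlock.y);
getGrayHistincuda_naive<<<blocksPerGrid, threadsPerBlock>>>(gpuGrayData, gpuGrayHist, imgheight, imgwidth);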

GPU version 3: shared memory (gpu_usesharemem)

__global__ void getGrayHistincuda_usesharemem(unsigned char * const grayData, 
                                            unsigned int * const hist,
                                            uint imgheight,
                                            uint imgwidth)  // use shared memory to speed up the accumulation
{
    __shared__ unsigned int hist_shared[256];   // shared memory is private to each thread block
    const unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x;
    const unsigned int idy = blockDim.y * blockIdx.y + threadIdx.y;  
    const unsigned char inner_idx = threadIdx.y * blockDim.x + threadIdx.x;

    hist_shared[inner_idx%256] = 0;   // zero the bins: the 32x32 block has 1024 threads, more than 256 bins, so every entry of hist_shared gets cleared (inner_idx is an unsigned char, so it already wraps mod 256)

    __syncthreads();    // wait until every thread in the block has finished zeroing

    if(idx < imgwidth && idy < imgheight)
    {
        const unsigned long pid = imgwidth * idy + idx;
        const unsigned char value = grayData[pid];
        atomicAdd(&(hist_shared[value]), 1);
    }

    __syncthreads();

    if(threadIdx.y < 8) // each block merges its private histogram into global memory (8 rows of 32 threads = exactly 256 threads, one per bin)
    {
        atomicAdd(&(hist[inner_idx]), hist_shared[inner_idx]);
    }

}
           

This version is faster than the previous one, for two reasons:

  • It uses shared memory, which is much faster than global memory.
  • Shared memory is private to each thread block, so blocks do not interfere with one another while accumulating; the final merge into global memory also causes far less contention (rough numbers below).
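
To put rough numbers on the second point: assuming, say, a 1920x1080 test image (the post does not state the actual image size), gpu_naive issues about 2.07 million atomic adds on global memory, while the shared-memory kernel replaces them with the same number of much cheaper shared-memory atomics plus only 256 global atomics per block for the merge, i.e. 60 x 34 x 256 ≈ 0.52 million global atomics.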

Testing

Correctness

Using the CPU code as the reference, only gpu_wrong_naive produces wrong results, because it does not perform the addition atomically.
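
A bin-by-bin comparison against the CPU result is enough to make this check automatic (a hedged helper sketch, not in the original source):

// Hypothetical helper: returns true if two 256-bin histograms are identical.
static bool histEqual(const unsigned int a[256], const unsigned int b[256])
{
    for (int i = 0; i < 256; i++)
        if (a[i] != b[i]) return false;
    return true;
}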

Speed

Method             Time (s)
cpu                0.00069200
gpu_wrong_naive    0.00013200
gpu_naive          0.00021600
gpu_usesharemem    0.00011300

As the table shows, using shared memory does indeed provide a speedup.
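
Note that these times come from clock() around the kernel launch plus cudaDeviceSynchronize(), which also includes some host-side overhead. A more precise alternative (a sketch, not what was used for the table above) is to time the kernel with CUDA events:

// Timing the shared-memory kernel with CUDA events (elapsed time is reported in milliseconds).
cudaEvent_t evStart, evStop;
cudaEventCreate(&evStart);
cudaEventCreate(&evStop);

cudaEventRecord(evStart);
getGrayHistincuda_usesharemem<<<blocksPerGrid, threadsPerBlock>>>(gpuGrayData, gpuGrayHist,
                                                                  imgheight, imgwidth);
cudaEventRecord(evStop);
cudaEventSynchronize(evStop);

float ms = 0.0f;
cudaEventElapsedTime(&ms, evStart, evStop);
printf("kernel time %.5f ms\n", ms);

cudaEventDestroy(evStart);
cudaEventDestroy(evStop);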

Full source

#include <iostream>
#include <cstdio>
#include <time.h>
#include <cuda_runtime.h>
#include "opencv2/highgui.hpp"
#include "opencv2/opencv.hpp"
using namespace cv;
using namespace std;

Mat rgb2gray(Mat& srcImage);

void getGrayHistincpu(unsigned char * const grayData, 
                    unsigned int * const hist,
                    uint imgheight,
                    uint imgwidth)
{
    for(int i = 0; i < imgheight; i++)
    {
        for (int j = 0; j < imgwidth; j++)
        {
            hist[grayData[i*imgwidth+j]]++;
        }
    }
}

__global__ void getGrayHistincuda_wrong_naive(unsigned char * const grayData, 
                                unsigned int * const hist,
                                uint imgheight,
                                uint imgwidth)  // races on hist: the counts change from run to run
{
    const unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x;
    const unsigned int idy = blockDim.y * blockIdx.y + threadIdx.y;    

    if(idx < imgwidth && idy < imgheight)
    {
        const unsigned long pid = imgwidth * idy + idx;
        const unsigned char value = grayData[pid];
        hist[value]++;
    }
}

__global__ void getGrayHistincuda_naive(unsigned char * const grayData, 
                                            unsigned int * const hist,
                                            uint imgheight,
                                            uint imgwidth)  // atomic adds keep the counts correct
{
    const unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x;
    const unsigned int idy = blockDim.y * blockIdx.y + threadIdx.y;    

    if(idx < imgwidth && idy < imgheight)
    {
        const unsigned long pid = imgwidth * idy + idx;
        const unsigned char value = grayData[pid];
        atomicAdd(&(hist[value]), 1);
    }
}

__global__ void getGrayHistincuda_usesharemem(unsigned char * const grayData, 
                                            unsigned int * const hist,
                                            uint imgheight,
                                            uint imgwidth)  // use shared memory to speed up the accumulation
{
    __shared__ unsigned int hist_shared[256];   // shared memory is private to each thread block
    const unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x;
    const unsigned int idy = blockDim.y * blockIdx.y + threadIdx.y;  
    const unsigned char inner_idx = threadIdx.y * blockDim.x + threadIdx.x;

    hist_shared[inner_idx%256] = 0;   // zero the bins: the 32x32 block has 1024 threads, more than 256 bins, so every entry of hist_shared gets cleared (inner_idx is an unsigned char, so it already wraps mod 256)

    __syncthreads();    // wait until every thread in the block has finished zeroing

    if(idx < imgwidth && idy < imgheight)
    {
        const unsigned long pid = imgwidth * idy + idx;
        const unsigned char value = grayData[pid];
        atomicAdd(&(hist_shared[value]), 1);
    }

    __syncthreads();

    if(threadIdx.y < 8) // each block merges its private histogram into global memory (8 rows of 32 threads = exactly 256 threads, one per bin)
    {
        atomicAdd(&(hist[inner_idx]), hist_shared[inner_idx]);
    }

}

#define PHASE 3

int main(void)
{
    Mat srcImage = imread("./test.jpg");
    Mat grayImage = rgb2gray(srcImage);

    const uint imgheight = grayImage.rows;
    const uint imgwidth = grayImage.cols;

    dim3 threadsPerBlock(32, 32);
    dim3 blocksPerGrid((imgwidth + threadsPerBlock.x - 1)/threadsPerBlock.x, 
                        (imgheight + threadsPerBlock.y - 1)/threadsPerBlock.y);

    unsigned char *gpuGrayData;
    unsigned int *gpuGrayHist;

    unsigned int grayHist[256] = {0};

    cudaMalloc((void**)&gpuGrayData, imgwidth*imgheight*sizeof(unsigned char));
    cudaMalloc((void**)&gpuGrayHist, 256*(sizeof(unsigned int)));

    cudaMemcpy(gpuGrayData, grayImage.data, imgwidth*imgheight*sizeof(unsigned char), cudaMemcpyHostToDevice);

    clock_t start, end;

#if PHASE == 0

    start = clock();
    getGrayHistincpu(grayImage.data, grayHist, imgheight, imgwidth);
    end = clock();
    for(int i = 0; i < 256; i++) printf("(%d, %d)\n", i, grayHist[i]);
    printf("cuda_wrong_naive run time %.8f\n", (float)(end-start)/CLOCKS_PER_SEC);


#elif PHASE == 1

    memset(grayHist, 0, 256*sizeof(unsigned int));
    cudaMemcpy(gpuGrayHist, grayHist, 256*sizeof(unsigned int), cudaMemcpyHostToDevice);   // zero the histogram in GPU memory
    start = clock();
    getGrayHistincuda_wrong_naive<<<blocksPerGrid, threadsPerBlock>>>(gpuGrayData, gpuGrayHist,
                                                                        imgheight, imgwidth);
    cudaDeviceSynchronize();
    end = clock();
    cudaMemcpy(grayHist, gpuGrayHist, 256*sizeof(unsigned int), cudaMemcpyDeviceToHost);
    for(int i = 0; i < 256; i++) printf("(%d, %d)\n", i, grayHist[i]);
    printf("cuda_wrong_naive run time %.8f\n", (float)(end-start)/CLOCKS_PER_SEC);

#elif PHASE == 2

    memset(grayHist, 0, 256*sizeof(unsigned int));
    cudaMemcpy(gpuGrayHist, grayHist, 256*sizeof(unsigned int), cudaMemcpyHostToDevice);   // zero the histogram in GPU memory
    start = clock();
    getGrayHistincuda_naive<<<blocksPerGrid, threadsPerBlock>>>(gpuGrayData, gpuGrayHist, 
                                                                        imgheight, imgwidth);
    cudaDeviceSynchronize();
    end = clock();
    cudaMemcpy(grayHist, gpuGrayHist, 256*sizeof(unsigned int), cudaMemcpyDeviceToHost);
    for(int i = 0; i < 256; i++) printf("(%d, %d)\n", i, grayHist[i]);
    printf("cuda_naive run time %.8f\n", (float)(end-start)/CLOCKS_PER_SEC);

#elif PHASE == 3

    memset(grayHist, 0, 256*sizeof(unsigned int));
    cudaMemcpy(gpuGrayHist, grayHist, 256*sizeof(unsigned int), cudaMemcpyHostToDevice);   // zero the histogram in GPU memory
    start = clock();
    getGrayHistincuda_usesharemem<<<blocksPerGrid, threadsPerBlock>>>(gpuGrayData, gpuGrayHist, 
                                                                        imgheight, imgwidth);
    cudaDeviceSynchronize();
    end = clock();
    cudaMemcpy(grayHist, gpuGrayHist, 256*sizeof(unsigned int), cudaMemcpyDeviceToHost);
    for(int i = 0; i < 256; i++) printf("(%d, %d)\n", i, grayHist[i]);
    printf("cuda_naive run time %.8f\n", (float)(end-start)/CLOCKS_PER_SEC);
    
#endif

    cudaFree(gpuGrayData);
    cudaFree(gpuGrayHist);

    return 0;

}
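
The listing declares rgb2gray but does not define it; the definition comes from the previous post in this series. To build this file on its own, a minimal stand-in (not the CUDA kernel from part one) could simply defer to OpenCV:

// Hypothetical stand-in for rgb2gray from the previous exercise; the original used a CUDA kernel.
Mat rgb2gray(Mat& srcImage)
{
    Mat grayImage;
    cvtColor(srcImage, grayImage, COLOR_BGR2GRAY);
    return grayImage;
}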