CUDA 記憶體拷貝速率問題
關於 CPU 與 GPU 之間的記憶體拷貝速率:一個 9M 的圖檔,CPU->GPU 耗時 9ms,GPU->CPU 耗時 11ms。項目要求處理的是大圖,需要對每一個像素點做處理;以 400M 的圖檔為例,CPU 與 GPU 之間的拷貝各耗時 200 多 ms。顯示卡的頻寬為 192GB/s,(我認為)正常情況下傳輸速率應該很快,請問有什麼需要修改的地方嗎?(我是剛剛接觸 GPU 的萌新。)下圖為 GPU 型號:
// Runs kernel_Binaryzation over a width x height image on the GPU.
//
// Parameters:
//   range_host - colour range passed to the kernel by value (copied on the host).
//   src_host   - host input buffer; read as width * height uchar4 elements.
//   dst_host   - host output buffer; width * height uchar4 elements are written
//                back into it after the kernel has run.
//   width      - image width in pixels.
//   height     - image height in pixels.
//
// Behaviour:
//   * Both working buffers live in device memory (cudaMalloc). The previous
//     version allocated the destination with cudaMallocHost and released it
//     with cudaFree, so the kernel wrote into pinned host memory and that
//     allocation was never released.
//   * The byte count is computed in size_t so large images do not overflow int.
//   * Every CUDA call is checked; the first failure skips the remaining steps
//     and falls through to cleanup. The function returns void, so a failure is
//     not reported to the caller, and dst_host may then be left unmodified or
//     only partially written.
//   * Null pointers or non-positive dimensions make the function return
//     without touching the GPU.
//
// Performance note: host<->device copies are limited by the host link and by
// whether the host buffers are page-locked, not by the GPU's own memory
// bandwidth. If the caller owns src_host/dst_host, allocating them with
// cudaMallocHost/cudaHostAlloc (or registering them) is the usual way to speed
// up these copies.
void Binaryzation_GPU(CppColorRange* range_host, byte* src_host, byte* dst_host, int width, int height)
{
    if (range_host == nullptr || src_host == nullptr || dst_host == nullptr ||
        width <= 0 || height <= 0)
    {
        return;
    }

    // Widen before multiplying: width * height * 4 overflows int for big images.
    const size_t memSize = static_cast<size_t>(width) * static_cast<size_t>(height) * sizeof(uchar4);

    // Kernel arguments are passed by value, so a host-side copy is sufficient.
    CppColorRange range_device = *range_host;

    uchar4* src_device = nullptr;
    uchar4* dst_device = nullptr;

    cudaError_t status = cudaMalloc((void**)&src_device, memSize);
    if (status == cudaSuccess)
    {
        status = cudaMalloc((void**)&dst_device, memSize);
    }

    if (status == cudaSuccess)
    {
        status = cudaMemcpy(src_device, src_host, memSize, cudaMemcpyHostToDevice);
    }

    // NOTE(review): the destination is still pre-loaded with dst_host, as the
    // original did, because the kernel is not visible here and may leave some
    // pixels unwritten. If kernel_Binaryzation writes every pixel, drop this
    // copy to save one full host-to-device transfer.
    if (status == cudaSuccess)
    {
        status = cudaMemcpy(dst_device, dst_host, memSize, cudaMemcpyHostToDevice);
    }

    if (status == cudaSuccess)
    {
        // Ceil-divide so the grid covers images whose sides are not multiples of 32.
        dim3 threadsPerBlock(32, 32);
        dim3 blocksPerGrid((width + threadsPerBlock.x - 1) / threadsPerBlock.x,
                           (height + threadsPerBlock.y - 1) / threadsPerBlock.y);

        // No explicit synchronize is needed before the launch: the cudaMemcpy
        // calls above are blocking.
        kernel_Binaryzation<<<blocksPerGrid, threadsPerBlock>>>(range_device, src_device, dst_device, width, height);

        // A launch does not return a status; invalid configurations surface here.
        status = cudaGetLastError();
    }

    if (status == cudaSuccess)
    {
        // Blocking copy on the default stream: waits for the kernel to finish,
        // and its return value also reports errors raised during kernel execution.
        status = cudaMemcpy(dst_host, dst_device, memSize, cudaMemcpyDeviceToHost);
    }

    // cudaFree accepts a null pointer, so cleanup is safe after an early failure.
    cudaFree(src_device);
    cudaFree(dst_device);
}
