本文透過比較使用malloc配置的一般記憶體與使用cudaHostAlloc配置的頁鎖定記憶體,說明使用頁鎖定記憶體可以提高主機與裝置之間的資料傳輸效率,並指出哪些場合適合使用頁鎖定記憶體。
malloc配置的是標準的可分頁(pageable)主機記憶體,作業系統在排程記憶體時可能會將這種記憶體分頁或交換到磁碟上,需要時再調回記憶體,這樣就會增加運算時間。此外,從可分頁記憶體複製資料到裝置時,CUDA驅動程式必須先把資料複製到一塊臨時的頁鎖定緩衝區,再由DMA傳輸到GPU,因此多了一次複製。而cudaHostAlloc配置的是頁鎖定(page-locked)主機記憶體,作業系統不會對這塊記憶體進行分頁,也不會將其交換到磁碟上,確保該記憶體始終駐留在實體記憶體中。
下面透過100M個int資料(約400MB)在主機和裝置之間的傳輸來說明二者的差異。程式碼如下:
/********************************************************************
* PageLockedMem.cu
* Compare the performance of general mem and page locked mem.
*********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cutil_inline.h>
/* Number of int elements in each test buffer (100 * 1024 * 1024).
 * Parenthesized so the macro expands as a single value inside larger
 * expressions such as divisions or casts. */
#define _SIZE (100*1024*1024)
/************************************************************************/
/* Init CUDA */
/************************************************************************/
/* Initializes CUDA before the benchmarks run; main() runs no tests when this
 * returns false.
 * NOTE(review): the body was elided ("......") in the original listing and
 * must be supplied before this file will compile. */
bool InitCUDA(void)
{
......
}
/**
 * Times ten cudaMemcpy transfers between a malloc'ed (pageable) host buffer
 * and a cudaMalloc'ed device buffer, using CUDA events on stream 0.
 *
 * size: number of int elements in each buffer.
 * dir:  true copies host -> device, false copies device -> host.
 *
 * Returns the time in milliseconds reported by cudaEventElapsedTime for all
 * ten copies together. Terminates the process if the host buffer cannot be
 * allocated.
 */
float cudaMallocTest(int size, bool dir)
{
    const size_t bytes = size * sizeof(int);
    cudaEvent_t start, stop;
    float elapsedTime;
    int *a, *dev_a;

    a = (int*)malloc(bytes);
    if (!a)
    {
        /* Continuing would hand a NULL host pointer to cudaMemcpy, so stop here. */
        printf("Mem error!\n");
        exit(EXIT_FAILURE);
    }
    cutilSafeCall(cudaMalloc((void**)&dev_a, bytes));

    cutilSafeCall(cudaEventCreate(&start));
    cutilSafeCall(cudaEventCreate(&stop));

    cutilSafeCall(cudaEventRecord(start, 0));
    for (int i = 0; i < 10; i++)
    {
        if (dir)
        {
            cutilSafeCall(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice));
        }
        else
        {
            cutilSafeCall(cudaMemcpy(a, dev_a, bytes, cudaMemcpyDeviceToHost));
        }
    }
    cutilSafeCall(cudaEventRecord(stop, 0));
    /* Wait for the stop event before reading the elapsed time; the return
     * code is checked like every other CUDA call in this function. */
    cutilSafeCall(cudaEventSynchronize(stop));
    cutilSafeCall(cudaEventElapsedTime(&elapsedTime, start, stop));

    free(a);
    cutilSafeCall(cudaFree(dev_a));
    cutilSafeCall(cudaEventDestroy(start));
    cutilSafeCall(cudaEventDestroy(stop));
    return elapsedTime;
}
/**
 * Times ten cudaMemcpy transfers between a cudaHostAlloc'ed (page-locked)
 * host buffer and a cudaMalloc'ed device buffer, using CUDA events on
 * stream 0.
 *
 * size: number of int elements in each buffer.
 * dir:  true copies host -> device, false copies device -> host.
 *
 * Returns the time in milliseconds reported by cudaEventElapsedTime for all
 * ten copies together.
 */
float cudaHostAllocTest(int size, bool dir)
{
    const size_t bytes = size * sizeof(int);
    cudaEvent_t start, stop;
    float elapsedTime;
    int *a, *dev_a;

    cutilSafeCall(cudaHostAlloc((void**)&a, bytes, cudaHostAllocDefault));
    cutilSafeCall(cudaMalloc((void**)&dev_a, bytes));

    cutilSafeCall(cudaEventCreate(&start));
    cutilSafeCall(cudaEventCreate(&stop));

    cutilSafeCall(cudaEventRecord(start, 0));
    for (int i = 0; i < 10; i++)
    {
        if (dir)
        {
            cutilSafeCall(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice));
        }
        else
        {
            cutilSafeCall(cudaMemcpy(a, dev_a, bytes, cudaMemcpyDeviceToHost));
        }
    }
    cutilSafeCall(cudaEventRecord(stop, 0));
    /* Wait for the stop event before reading the elapsed time; the return
     * code is checked like every other CUDA call in this function. */
    cutilSafeCall(cudaEventSynchronize(stop));
    cutilSafeCall(cudaEventElapsedTime(&elapsedTime, start, stop));

    /* Page-locked host memory is released with cudaFreeHost, not free(). */
    cutilSafeCall(cudaFreeHost(a));
    cutilSafeCall(cudaFree(dev_a));
    cutilSafeCall(cudaEventDestroy(start));
    cutilSafeCall(cudaEventDestroy(stop));
    return elapsedTime;
}
/**
 * Runs the pageable and page-locked copy benchmarks in both directions and
 * prints the elapsed time and the resulting throughput for each.
 *
 * Returns 0 on success, EXIT_FAILURE if InitCUDA() reports failure.
 */
int main(int argc, char* argv[])
{
    if (!InitCUDA()) {
        /* Report the initialization failure through the exit status. */
        return EXIT_FAILURE;
    }

    float elapsedTime;
    /* Each test performs 10 copies of _SIZE ints, so this is the total amount
     * of data moved per test, in MB. */
    float MB = 10.0f * (_SIZE) * sizeof(int) / 1024 / 1024;

    /* Pageable host memory, host -> device. */
    elapsedTime = cudaMallocTest(_SIZE, true);
    printf("Time using cudaMalloc: %3.1f ms\n", elapsedTime);
    printf("\tMB/s during copy up: %3.1f \n", MB/(elapsedTime/1000));

    /* Pageable host memory, device -> host (dir must be false). */
    elapsedTime = cudaMallocTest(_SIZE, false);
    printf("Time using cudaMalloc: %3.1f ms\n", elapsedTime);
    printf("\tMB/s during copy down: %3.1f \n", MB/(elapsedTime/1000));

    /* Page-locked host memory, host -> device. */
    elapsedTime = cudaHostAllocTest(_SIZE, true);
    printf("Time using cudaHostAlloc: %3.1f ms\n", elapsedTime);
    printf("\tMB/s during copy up: %3.1f \n", MB/(elapsedTime/1000));

    /* Page-locked host memory, device -> host (dir must be false). */
    elapsedTime = cudaHostAllocTest(_SIZE, false);
    printf("Time using cudaHostAlloc: %3.1f ms\n", elapsedTime);
    printf("\tMB/s during copy down: %3.1f \n", MB/(elapsedTime/1000));

    return 0;
}
可以看出,使用頁鎖定記憶體後,傳輸時間縮短為原來的約二分之一。
![](https://img.laitimes.com/img/9ZDMuAjOiMmIsIjOiQnIsICdzFWRoRXdvN1LclHdpZXYyd2LcBzNvwVZ2x2bzNXak9CX90TQNNkRrFlQKBTSvwFbslmZvwFMwQzLcVmepNHdu9mZvwFVywUNMZTY18CX052bm9CX9kEWadnVzMmZs5mY1lTbhZXUYpVd1kmYr50MZV3YyI2cKJDT29GRjBjUIF2LcRHelR3LcJzLctmch1mclRXY39jN0AjMycDM2EDOxMDM0EDMy8CX0Vmbu4GZzNmLn9Gbi1yZtl2Lc9CX6MHc0RHaiojIsJye.jpg)
但是並不是所有的場合都適合使用頁鎖定記憶體。因為使用頁鎖定記憶體(固定記憶體)時,將失去虛擬記憶體的所有功能,即必須為每一塊頁鎖定記憶體配置實體記憶體,與使用普通記憶體相比,系統會更快耗盡記憶體。因此要根據需要進行選擇。