天天看點

Atomic Functions

考慮兩個線程都要向同一個全局或者共享資料 x 寫入的情況。

Atomic Functions

如果 x 的初始值是 10,那麼 x 的最終結果是多少?答案是無法確定的,主要是因為兩個線程對 x 的存取存在衝突。

Atomic Functions 用於在執行讀-修改-寫操作時避免與其它線程衝突:計算時會將目標位址鎖定,直到計算結束。

atomic operations:

int atomicAdd(int* address, int val);

int atomicSub(int* address, int val);

int atomicExch(int* address, int val);

int atomicMin(int* address, int val);

int atomicMax(int* address, int val);

unsigned int atomicInc(unsigned int* address, unsigned int val);

unsigned int atomicDec(unsigned int* address, unsigned int val);

int atomicCAS(int* address, int compare, int val); //compare and swap

int atomicAnd(int* address, int val);

int atomicOr(int* address, int val);

int atomicXor(int* address, int val);

測試例子:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "device_functions.h"
#include <cstdio>
#include <cstdlib>
#include <iostream>

using namespace std;

// Histogram counters stored in device memory: gpu_hist[v] holds the number of
// input elements equal to v. There are 10 bins, indexed 0..9.
__device__ int gpu_hist[10];

/**
 * Resets every bin of the device histogram `gpu_hist` to zero.
 *
 * Each thread clears the bin whose index equals its flat 1-D global thread
 * index. Launch with at least as many threads as there are bins; threads whose
 * index falls beyond the last bin do nothing.
 */
__global__ void init()
{
	// Derive the bin count from the array itself so the guard always matches it.
	const int binCount = sizeof(gpu_hist) / sizeof(gpu_hist[0]);
	const int tid = blockIdx.x * blockDim.x + threadIdx.x;

	// Guard: a launch with more threads than bins must not write past the array.
	if (tid < binCount)
	{
		gpu_hist[tid] = 0;
	}
}

/**
 * Accumulates a histogram of the values in `a` into the device array `gpu_hist`.
 *
 * Uses a grid-stride loop: each thread starts at its flat 1-D global index and
 * advances by the total number of launched threads, so any 1-D launch
 * configuration processes all `n` elements.
 *
 * @param a  Device pointer to `n` input values; each value selects a bin.
 * @param n  Number of elements in `a`.
 *
 * `gpu_hist` must be zeroed before the launch. Values that do not correspond
 * to a bin are ignored.
 */
__global__ void gpu_histogram(int *a, int n)
{
	const int binCount = sizeof(gpu_hist) / sizeof(gpu_hist[0]);
	const int numberThreads = blockDim.x * gridDim.x;

	for (int tid = blockIdx.x * blockDim.x + threadIdx.x; tid < n; tid += numberThreads)
	{
		const int bin = a[tid];

		// Skip values that have no bin instead of writing outside gpu_hist.
		if (bin >= 0 && bin < binCount)
		{
			// Several threads can target the same bin at the same time. A plain
			// `++` is a non-atomic read-modify-write and loses updates, so the
			// increment has to be atomic.
			atomicAdd(&gpu_hist[bin], 1);
		}
	}
}

/**
 * Terminates the program with a diagnostic if a CUDA runtime call failed.
 *
 * @param status  Result returned by the CUDA runtime call.
 * @param what    Short description of the call, used in the error message.
 */
static void checkCuda(cudaError_t status, const char *what)
{
	if (status != cudaSuccess)
	{
		fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
		exit(EXIT_FAILURE);
	}
}

/**
 * Builds a histogram of 32 pseudo-random digits (0..9) on the GPU and prints it.
 *
 * Steps: generate and print the input on the host, copy it to the device,
 * zero the device histogram, run the histogram kernel, copy the bins back
 * and print one line per bin.
 *
 * @return 0 on success; the process exits with EXIT_FAILURE on any allocation
 *         or CUDA error.
 */
int main()
{
	const int N = 32;
	const int BINS = 10; // must match the number of entries in gpu_hist
	const size_t size = N * sizeof(int);
	int hist[BINS];

	int *a = (int *)malloc(size);
	if (a == NULL)
	{
		fprintf(stderr, "Failed to allocate %lu bytes of host memory\n", (unsigned long)size);
		return EXIT_FAILURE;
	}

	// Fixed seed so every run produces the same input sequence.
	srand(1);
	for (int i = 0; i < N; ++i)
	{
		a[i] = rand() % BINS;
		printf("%d ", a[i]);
	}
	printf("\n");

	int *dev_a = NULL;
	checkCuda(cudaMalloc((void **)&dev_a, size), "cudaMalloc");
	checkCuda(cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice), "cudaMemcpy (host to device)");

	// Kernel launches return no status; launch errors are reported via cudaGetLastError().
	init<<<1, BINS>>>();
	checkCuda(cudaGetLastError(), "init launch");
	gpu_histogram<<<1, 32>>>(dev_a, N);
	checkCuda(cudaGetLastError(), "gpu_histogram launch");

	// cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is its replacement
	// and also surfaces errors raised while the kernels were executing.
	checkCuda(cudaDeviceSynchronize(), "kernel execution");

	checkCuda(cudaMemcpyFromSymbol(hist, gpu_hist, sizeof(hist)), "cudaMemcpyFromSymbol");

	printf("Histogram as computed on GPU\n");
	for (int i = 0; i < BINS; ++i)
	{
		printf("Number of %d s = %d\n", i, hist[i]);
	}

	free(a);
	checkCuda(cudaFree(dev_a), "cudaFree");
	return 0;
}
           

繼續閱讀