考慮兩個線程都要往同一個全局或者共享資料中寫入的情況,例如兩個線程都執行 x = x + 1。

如果 x 的初始值是 10,那麼 x 的最終結果是多少?答案是無法確定的,主要是因為存在訪問衝突(多個線程同時讀寫同一位址)。
Atomic Functions(原子函數)用於在執行讀-修改-寫操作時避免與其它線程衝突:計算時會將該位址鎖定,直到計算結束。
atomic operations:
int atomicAdd(int* address, int val);
int atomicSub(int* address, int val);
int atomicExch(int* address, int val);
int atomicMin(int* address, int val);
int atomicMax(int* address, int val);
unsigned int atomicInc(unsigned int* address, unsigned int val);
unsigned int atomicDec(unsigned int* address, unsigned int val);
int atomicCAS(int* address, int compare, int val); //compare and swap
int atomicAnd(int* address, int val);
int atomicOr(int* address, int val);
int atomicXor(int* address, int val);
測試例子:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "device_functions.h"
#include <cstdio>
#include <cstdlib>
#include <iostream>
using namespace std;
// Histogram bins stored in device memory: gpu_hist[v] holds the number of
// input elements equal to v. Cleared by init() and filled by gpu_histogram().
__device__ int gpu_hist[10];
/**
 * Zeroes the device-side histogram gpu_hist.
 *
 * Each thread clears the bin whose index equals its flat 1-D global thread
 * id. Launch with at least as many threads as gpu_hist has bins
 * (e.g. <<<1, 10>>>); surplus threads do nothing.
 */
__global__ void init()
{
    // Derive the bin count from the array itself so the guard stays correct
    // if the declaration of gpu_hist is ever resized.
    const int binCount = sizeof(gpu_hist) / sizeof(gpu_hist[0]);
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard so an over-sized launch cannot write past the end of gpu_hist.
    if (tid < binCount)
    {
        gpu_hist[tid] = 0;
    }
}
/**
 * Accumulates a histogram of a[0..n-1] into the device array gpu_hist.
 *
 * Every thread starts at its flat 1-D global thread id and advances by the
 * total number of threads in the grid, so any 1-D launch configuration
 * visits all n elements.
 *
 * gpu_hist is only incremented here, never cleared; zero it before launching
 * (the init kernel does this).
 *
 * @param a  device pointer to n ints; each value selects the bin to bump
 * @param n  number of elements in a
 */
__global__ void gpu_histogram(int *a, int n)
{
    const int binCount = sizeof(gpu_hist) / sizeof(gpu_hist[0]);
    const int numberThreads = blockDim.x * gridDim.x;

    for (int tid = blockIdx.x * blockDim.x + threadIdx.x; tid < n; tid += numberThreads)
    {
        const int bin = a[tid];

        // Ignore values that have no bin instead of writing out of bounds.
        if (bin < 0 || bin >= binCount)
        {
            continue;
        }

        // Several threads can target the same bin at the same time. A plain
        // gpu_hist[bin]++ is an unsynchronized read-modify-write, so
        // concurrent increments overwrite each other and the counts come out
        // too low and vary between runs. atomicAdd performs the increment as
        // one indivisible operation.
        atomicAdd(&gpu_hist[bin], 1);
    }
}
// Prints a diagnostic and terminates the program if a CUDA runtime call
// did not return cudaSuccess. `what` names the failing step.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/**
 * Host driver: fills an array with N pseudo-random values in [0, 9], copies
 * it to the device, builds the histogram with the init and gpu_histogram
 * kernels, copies the bins back and prints them.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if host allocation or any
 * CUDA call fails.
 */
int main()
{
    const int N = 32;
    int *a = NULL;
    int *dev_a = NULL;
    int hist[10];
    const int size = N * sizeof(int);

    a = static_cast<int *>(malloc(size));
    if (a == NULL)
    {
        fprintf(stderr, "host allocation of %d bytes failed\n", size);
        return EXIT_FAILURE;
    }

    // Fixed seed so every run histograms the same input.
    srand(1);
    for (int i = 0; i < N; ++i)
    {
        a[i] = rand() % 10;
        printf("%d ", a[i]);
    }
    printf("\n");

    checkCuda(cudaMalloc((void **)&dev_a, size), "cudaMalloc");
    checkCuda(cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice), "cudaMemcpy (host to device)");

    // Kernel launches return nothing; launch-configuration errors are only
    // visible through cudaGetLastError().
    init<<<1, 10>>>();
    checkCuda(cudaGetLastError(), "init launch");

    gpu_histogram<<<1, 32>>>(dev_a, N);
    checkCuda(cudaGetLastError(), "gpu_histogram launch");

    // cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is its
    // replacement and also reports faults raised while the kernels ran.
    checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

    checkCuda(cudaMemcpyFromSymbol(hist, gpu_hist, sizeof(hist)), "cudaMemcpyFromSymbol");

    printf("Histogram as computed on GPU\n");
    for (int i = 0; i < 10; ++i)
    {
        printf("Number of %d s = %d\n", i, hist[i]);
    }

    free(a);
    checkCuda(cudaFree(dev_a), "cudaFree");
    return 0;
}