加速了两倍
Makefile
CUDACXX=nvcc
CUDACXXFLAGS=-arch=sm_70 -O3
CXXFLAGS=-march=native -fopenmp
NSYS=nsys profile
NSYSFLAGS=--stats=true --force-overwrite=true
all: streams
streams: streams.cu
$(CUDACXX) $(CUDACXXFLAGS) -Xcompiler="$(CXXFLAGS)" streams.cu -o streams
streams_solution: streams_solution.cu
$(CUDACXX) $(CUDACXXFLAGS) -Xcompiler="$(CXXFLAGS)" streams_solution.cu -o streams_solution
profile: streams
$(NSYS) $(NSYSFLAGS) -o streams-report ./streams
profile_solution: streams_solution
$(NSYS) $(NSYSFLAGS) -o streams-solution-report ./streams_solution
clean:
rm -f streams streams_solution *.qdrep *.sqlite
streams.cu 没有进行多流加速使用默认流
#include <cstdint>
#include <iostream>
#include "helpers.cuh"
#include "encryption.cuh"
void encrypt_cpu(uint64_t * data, uint64_t num_entries,
uint64_t num_iters, bool parallel=true) {
#pragma omp parallel for if (parallel)
for (uint64_t entry = 0; entry < num_entries; entry++)
data[entry] = permute64(entry, num_iters);
}
__global__
void decrypt_gpu(uint64_t * data, uint64_t num_entries,
uint64_t num_iters) {
const uint64_t thrdID = blockIdx.x*blockDim.x+threadIdx.x;
const uint64_t stride = blockDim.x*gridDim.x;
for (uint64_t entry = thrdID; entry < num_entries; entry += stride)
data[entry] = unpermute64(data[entry], num_iters);
}
bool check_result_cpu(uint64_t * data, uint64_t num_entries,
bool parallel=true) {
uint64_t counter = 0;
#pragma omp parallel for reduction(+: counter) if (parallel)
for (uint64_t entry = 0; entry < num_entries; entry++)
counter += data[entry] == entry;
return counter == num_entries;
}
int main (int argc, char * argv[]) {
Timer timer;
Timer overall;
const uint64_t num_entries = 1UL << 26;
const uint64_t num_iters = 1UL << 10;
const bool openmp = true;
// Define the number of streams.
const uint64_t num_streams = 32;
// Use round-up division to calculate chunk size.
const uint64_t chunk_size = sdiv(num_entries, num_streams);
timer.start();
uint64_t * data_cpu, * data_gpu;
cudaMallocHost(&data_cpu, sizeof(uint64_t)*num_entries);
cudaMalloc (&data_gpu, sizeof(uint64_t)*num_entries);
timer.stop("allocate memory");
check_last_error();
timer.start();
encrypt_cpu(data_cpu, num_entries, num_iters, openmp);
timer.stop("encrypt data on CPU");
timer.start();
// Create array for storing streams.
cudaStream_t streams[num_streams];
// Create number of streams and store in array.
for (uint64_t stream = 0; stream < num_streams; stream++)
cudaStreamCreate(&streams[stream]);
timer.stop("create streams");
check_last_error();
overall.start();
timer.start();
// For each stream...
for (uint64_t stream = 0; stream < num_streams; stream++) {
// ...calculate index into global data (`lower`) and size of data for it to process (`width`).
const uint64_t lower = chunk_size*stream;
const uint64_t upper = min(lower+chunk_size, num_entries);
const uint64_t width = upper-lower;
// ...copy stream's chunk to device.
cudaMemcpyAsync(data_gpu+lower, data_cpu+lower,
sizeof(uint64_t)*width, cudaMemcpyHostToDevice,
streams[stream]);
// ...compute stream's chunk.
decrypt_gpu<<<80*32, 64, 0, streams[stream]>>>
(data_gpu+lower, width, num_iters);
// ...copy stream's chunk to host.
cudaMemcpyAsync(data_cpu+lower, data_gpu+lower,
sizeof(uint64_t)*width, cudaMemcpyDeviceToHost,
streams[stream]);
}
for (uint64_t stream = 0; stream < num_streams; stream++)
// Synchronize streams before checking results on host.
cudaStreamSynchronize(streams[stream]);
// Note modification of timer instance use.
timer.stop("asynchronous H2D->kernel->D2H");
overall.stop("total time on GPU");
check_last_error();
timer.start();
const bool success = check_result_cpu(data_cpu, num_entries, openmp);
std::cout << "STATUS: test "
<< ( success ? "passed" : "failed")
<< std::endl;
timer.stop("checking result on CPU");
timer.start();
for (uint64_t stream = 0; stream < num_streams; stream++)
// Destroy streams.
cudaStreamDestroy(streams[stream]);
timer.stop("destroy streams");
check_last_error();
timer.start();
cudaFreeHost(data_cpu);
cudaFree (data_gpu);
timer.stop("free memory");
check_last_error();
}
使用了多流加速streams_solution.cu
#include <cstdint>
#include <iostream>
#include "helpers.cuh"
#include "encryption.cuh"
void encrypt_cpu(uint64_t * data, uint64_t num_entries,
uint64_t num_iters, bool parallel=true) {
#pragma omp parallel for if (parallel)
for (uint64_t entry = 0; entry < num_entries; entry++)
data[entry] = permute64(entry, num_iters);
}
__global__
void decrypt_gpu(uint64_t * data, uint64_t num_entries,
uint64_t num_iters) {
const uint64_t thrdID = blockIdx.x*blockDim.x+threadIdx.x;
const uint64_t stride = blockDim.x*gridDim.x;
for (uint64_t entry = thrdID; entry < num_entries; entry += stride)
data[entry] = unpermute64(data[entry], num_iters);
}
bool check_result_cpu(uint64_t * data, uint64_t num_entries,
bool parallel=true) {
uint64_t counter = 0;
#pragma omp parallel for reduction(+: counter) if (parallel)
for (uint64_t entry = 0; entry < num_entries; entry++)
counter += data[entry] == entry;
return counter == num_entries;
}
int main (int argc, char * argv[]) {
Timer timer;
Timer overall;
const uint64_t num_entries = 1UL << 26;
const uint64_t num_iters = 1UL << 10;
const bool openmp = true;
// Define the number of streams.
const uint64_t num_streams = 32;
// Use round-up division to calculate chunk size.
const uint64_t chunk_size = sdiv(num_entries, num_streams);
timer.start();
uint64_t * data_cpu, * data_gpu;
cudaMallocHost(&data_cpu, sizeof(uint64_t)*num_entries);
cudaMalloc (&data_gpu, sizeof(uint64_t)*num_entries);
timer.stop("allocate memory");
check_last_error();
timer.start();
encrypt_cpu(data_cpu, num_entries, num_iters, openmp);
timer.stop("encrypt data on CPU");
timer.start();
// Create array for storing streams.
cudaStream_t streams[num_streams];
// Create number of streams and store in array.
for (uint64_t stream = 0; stream < num_streams; stream++)
cudaStreamCreate(&streams[stream]);
timer.stop("create streams");
check_last_error();
overall.start();
timer.start();
// For each stream...
for (uint64_t stream = 0; stream < num_streams; stream++) {
// ...calculate index into global data (`lower`) and size of data for it to process (`width`).
const uint64_t lower = chunk_size*stream;
const uint64_t upper = min(lower+chunk_size, num_entries);
const uint64_t width = upper-lower;
// ...copy stream's chunk to device.
cudaMemcpyAsync(data_gpu+lower, data_cpu+lower,
sizeof(uint64_t)*width, cudaMemcpyHostToDevice,
streams[stream]);
// ...compute stream's chunk.
decrypt_gpu<<<80*32, 64, 0, streams[stream]>>>
(data_gpu+lower, width, num_iters);
// ...copy stream's chunk to host.
cudaMemcpyAsync(data_cpu+lower, data_gpu+lower,
sizeof(uint64_t)*width, cudaMemcpyDeviceToHost,
streams[stream]);
}
for (uint64_t stream = 0; stream < num_streams; stream++)
// Synchronize streams before checking results on host.
cudaStreamSynchronize(streams[stream]);
// Note modification of timer instance use.
timer.stop("asynchronous H2D->kernel->D2H");
overall.stop("total time on GPU");
check_last_error();
timer.start();
const bool success = check_result_cpu(data_cpu, num_entries, openmp);
std::cout << "STATUS: test "
<< ( success ? "passed" : "failed")
<< std::endl;
timer.stop("checking result on CPU");
timer.start();
for (uint64_t stream = 0; stream < num_streams; stream++)
// Destroy streams.
cudaStreamDestroy(streams[stream]);
timer.stop("destroy streams");
check_last_error();
timer.start();
cudaFreeHost(data_cpu);
cudaFree (data_gpu);
timer.stop("free memory");
check_last_error();
}
中间的准备文件为helpers.cuh和encryption.cuh
helpers.cuh
#pragma once
#include <iostream>
#include <cstdint>
#include <string>
uint64_t sdiv (uint64_t a, uint64_t b) {
return (a+b-1)/b;
}
void check_last_error ( ) {
cudaError_t err;
if ((err = cudaGetLastError()) != cudaSuccess) {
std::cout << "CUDA error: " << cudaGetErrorString(err) << " : "
<< __FILE__ << ", line " << __LINE__ << std::endl;
exit(1);
}
}
class Timer {
float time;
const uint64_t gpu;
cudaEvent_t ying, yang;
public:
Timer (uint64_t gpu=0) : gpu(gpu) {
cudaSetDevice(gpu);
cudaEventCreate(&ying);
cudaEventCreate(&yang);
}
~Timer ( ) {
cudaSetDevice(gpu);
cudaEventDestroy(ying);
cudaEventDestroy(yang);
}
void start ( ) {
cudaSetDevice(gpu);
cudaEventRecord(ying, 0);
}
void stop (std::string label) {
cudaSetDevice(gpu);
cudaEventRecord(yang, 0);
cudaEventSynchronize(yang);
cudaEventElapsedTime(&time, ying, yang);
std::cout << "TIMING: " << time << " ms (" << label << ")" << std::endl;
}
encryption.cuh
#pragma once
#include <cstdint>
#include <assert.h>
__host__ __device__ __forceinline__
uint32_t hash (uint32_t x) {
x ^= x >> 16;
x *= 0x85ebca6b;
x ^= x >> 13;
x *= 0xc2b2ae35;
x ^= x >> 16;
return x;
}
__host__ __device__ __forceinline__
uint64_t permute64(uint64_t x, uint64_t num_iters) {
constexpr uint64_t mask = (1UL << 32)-1;
for (uint64_t iter = 0; iter < num_iters; iter++) {
const uint64_t upper = x >> 32;
const uint64_t lower = x & mask;
const uint64_t mixer = hash(upper);
x = upper + ((lower^mixer&mask) << 32);
}
return x;
}
__host__ __device__ __forceinline__
uint64_t unpermute64(uint64_t x, uint64_t num_iters) {
constexpr uint64_t mask = (1UL << 32)-1;
for (uint64_t iter = 0; iter < num_iters; iter++) {
const uint64_t upper = x & mask;
const uint64_t lower = x >> 32;
const uint64_t mixer = hash(upper);
x = (upper << 32) + (lower^mixer&mask);
}
return x;
}