cuda多gpu c++加速9 应用数据复制与计算的重叠

加速了两倍

Makefile

CUDACXX=nvcc
CUDACXXFLAGS=-arch=sm_70 -O3
CXXFLAGS=-march=native -fopenmp
NSYS=nsys profile
NSYSFLAGS=--stats=true --force-overwrite=true

all: streams

streams: streams.cu
	$(CUDACXX) $(CUDACXXFLAGS) -Xcompiler="$(CXXFLAGS)" streams.cu -o streams

streams_solution: streams_solution.cu
	$(CUDACXX) $(CUDACXXFLAGS) -Xcompiler="$(CXXFLAGS)" streams_solution.cu -o streams_solution

profile: streams
	$(NSYS) $(NSYSFLAGS) -o streams-report ./streams

profile_solution: streams_solution
	$(NSYS) $(NSYSFLAGS) -o streams-solution-report ./streams_solution

clean:
	rm -f streams streams_solution *.qdrep *.sqlite

streams.cu 没有进行多流加速使用默认流

#include <cstdint>
#include <iostream>
#include "helpers.cuh"
#include "encryption.cuh"

void encrypt_cpu(uint64_t * data, uint64_t num_entries, 
                 uint64_t num_iters, bool parallel=true) {

    #pragma omp parallel for if (parallel)
    for (uint64_t entry = 0; entry < num_entries; entry++)
        data[entry] = permute64(entry, num_iters);
}

__global__ 
void decrypt_gpu(uint64_t * data, uint64_t num_entries, 
                 uint64_t num_iters) {

    const uint64_t thrdID = blockIdx.x*blockDim.x+threadIdx.x;
    const uint64_t stride = blockDim.x*gridDim.x;

    for (uint64_t entry = thrdID; entry < num_entries; entry += stride)
        data[entry] = unpermute64(data[entry], num_iters);
}

bool check_result_cpu(uint64_t * data, uint64_t num_entries,
                      bool parallel=true) {

    uint64_t counter = 0;

    #pragma omp parallel for reduction(+: counter) if (parallel)
    for (uint64_t entry = 0; entry < num_entries; entry++)
        counter += data[entry] == entry;

    return counter == num_entries;
}

int main (int argc, char * argv[]) {

    Timer timer;
    Timer overall;

    const uint64_t num_entries = 1UL << 26;
    const uint64_t num_iters = 1UL << 10;
    const bool openmp = true;

    // Define the number of streams.
    const uint64_t num_streams = 32;
    
    // Use round-up division to calculate chunk size.
    const uint64_t chunk_size = sdiv(num_entries, num_streams);

    timer.start();
    uint64_t * data_cpu, * data_gpu;
    cudaMallocHost(&data_cpu, sizeof(uint64_t)*num_entries);
    cudaMalloc    (&data_gpu, sizeof(uint64_t)*num_entries);
    timer.stop("allocate memory");
    check_last_error();

    timer.start();
    encrypt_cpu(data_cpu, num_entries, num_iters, openmp);
    timer.stop("encrypt data on CPU");

    timer.start();
    
    // Create array for storing streams.
    cudaStream_t streams[num_streams];
    
    // Create number of streams and store in array.
    for (uint64_t stream = 0; stream < num_streams; stream++)
        cudaStreamCreate(&streams[stream]);
    timer.stop("create streams");
    check_last_error();

    overall.start();
    timer.start();
    
    // For each stream...
    for (uint64_t stream = 0; stream < num_streams; stream++) {
        
        // ...calculate index into global data (`lower`) and size of data for it to process (`width`).
        const uint64_t lower = chunk_size*stream;
        const uint64_t upper = min(lower+chunk_size, num_entries);
        const uint64_t width = upper-lower;

        // ...copy stream's chunk to device.
        cudaMemcpyAsync(data_gpu+lower, data_cpu+lower, 
               sizeof(uint64_t)*width, cudaMemcpyHostToDevice, 
               streams[stream]);

        // ...compute stream's chunk.
        decrypt_gpu<<<80*32, 64, 0, streams[stream]>>>
            (data_gpu+lower, width, num_iters);

        // ...copy stream's chunk to host.
        cudaMemcpyAsync(data_cpu+lower, data_gpu+lower, 
               sizeof(uint64_t)*width, cudaMemcpyDeviceToHost, 
               streams[stream]);
    }

    for (uint64_t stream = 0; stream < num_streams; stream++)
	// Synchronize streams before checking results on host.
        cudaStreamSynchronize(streams[stream]);    
    
    // Note modification of timer instance use.
    timer.stop("asynchronous H2D->kernel->D2H");
    overall.stop("total time on GPU");
    check_last_error();
    
    timer.start();
    const bool success = check_result_cpu(data_cpu, num_entries, openmp);
    std::cout << "STATUS: test " 
              << ( success ? "passed" : "failed")
              << std::endl;
    timer.stop("checking result on CPU");

    timer.start();
    for (uint64_t stream = 0; stream < num_streams; stream++)
        // Destroy streams.
        cudaStreamDestroy(streams[stream]);    
    timer.stop("destroy streams");
    check_last_error();

    timer.start();
    cudaFreeHost(data_cpu);
    cudaFree    (data_gpu);
    timer.stop("free memory");
    check_last_error();
}

使用了多流加速streams_solution.cu

#include <cstdint>
#include <iostream>
#include "helpers.cuh"
#include "encryption.cuh"

void encrypt_cpu(uint64_t * data, uint64_t num_entries, 
                 uint64_t num_iters, bool parallel=true) {

    #pragma omp parallel for if (parallel)
    for (uint64_t entry = 0; entry < num_entries; entry++)
        data[entry] = permute64(entry, num_iters);
}

__global__ 
void decrypt_gpu(uint64_t * data, uint64_t num_entries, 
                 uint64_t num_iters) {

    const uint64_t thrdID = blockIdx.x*blockDim.x+threadIdx.x;
    const uint64_t stride = blockDim.x*gridDim.x;

    for (uint64_t entry = thrdID; entry < num_entries; entry += stride)
        data[entry] = unpermute64(data[entry], num_iters);
}

bool check_result_cpu(uint64_t * data, uint64_t num_entries,
                      bool parallel=true) {

    uint64_t counter = 0;

    #pragma omp parallel for reduction(+: counter) if (parallel)
    for (uint64_t entry = 0; entry < num_entries; entry++)
        counter += data[entry] == entry;

    return counter == num_entries;
}

int main (int argc, char * argv[]) {

    Timer timer;
    Timer overall;

    const uint64_t num_entries = 1UL << 26;
    const uint64_t num_iters = 1UL << 10;
    const bool openmp = true;

    // Define the number of streams.
    const uint64_t num_streams = 32;
    
    // Use round-up division to calculate chunk size.
    const uint64_t chunk_size = sdiv(num_entries, num_streams);

    timer.start();
    uint64_t * data_cpu, * data_gpu;
    cudaMallocHost(&data_cpu, sizeof(uint64_t)*num_entries);
    cudaMalloc    (&data_gpu, sizeof(uint64_t)*num_entries);
    timer.stop("allocate memory");
    check_last_error();

    timer.start();
    encrypt_cpu(data_cpu, num_entries, num_iters, openmp);
    timer.stop("encrypt data on CPU");

    timer.start();
    
    // Create array for storing streams.
    cudaStream_t streams[num_streams];
    
    // Create number of streams and store in array.
    for (uint64_t stream = 0; stream < num_streams; stream++)
        cudaStreamCreate(&streams[stream]);
    timer.stop("create streams");
    check_last_error();

    overall.start();
    timer.start();
    
    // For each stream...
    for (uint64_t stream = 0; stream < num_streams; stream++) {
        
        // ...calculate index into global data (`lower`) and size of data for it to process (`width`).
        const uint64_t lower = chunk_size*stream;
        const uint64_t upper = min(lower+chunk_size, num_entries);
        const uint64_t width = upper-lower;

        // ...copy stream's chunk to device.
        cudaMemcpyAsync(data_gpu+lower, data_cpu+lower, 
               sizeof(uint64_t)*width, cudaMemcpyHostToDevice, 
               streams[stream]);

        // ...compute stream's chunk.
        decrypt_gpu<<<80*32, 64, 0, streams[stream]>>>
            (data_gpu+lower, width, num_iters);

        // ...copy stream's chunk to host.
        cudaMemcpyAsync(data_cpu+lower, data_gpu+lower, 
               sizeof(uint64_t)*width, cudaMemcpyDeviceToHost, 
               streams[stream]);
    }

    for (uint64_t stream = 0; stream < num_streams; stream++)
	// Synchronize streams before checking results on host.
        cudaStreamSynchronize(streams[stream]);    
    
    // Note modification of timer instance use.
    timer.stop("asynchronous H2D->kernel->D2H");
    overall.stop("total time on GPU");
    check_last_error();
    
    timer.start();
    const bool success = check_result_cpu(data_cpu, num_entries, openmp);
    std::cout << "STATUS: test " 
              << ( success ? "passed" : "failed")
              << std::endl;
    timer.stop("checking result on CPU");

    timer.start();
    for (uint64_t stream = 0; stream < num_streams; stream++)
        // Destroy streams.
        cudaStreamDestroy(streams[stream]);    
    timer.stop("destroy streams");
    check_last_error();

    timer.start();
    cudaFreeHost(data_cpu);
    cudaFree    (data_gpu);
    timer.stop("free memory");
    check_last_error();
}

中间的准备文件为helpers.cuh和encryption.cuh

helpers.cuh

#pragma once

#include <iostream>
#include <cstdint>
#include <string>

uint64_t sdiv (uint64_t a, uint64_t b) {
    return (a+b-1)/b;
}

void check_last_error ( ) {

    cudaError_t err;
    if ((err = cudaGetLastError()) != cudaSuccess) {
        std::cout << "CUDA error: " << cudaGetErrorString(err) << " : "
                  << __FILE__ << ", line " << __LINE__ << std::endl;
            exit(1);
    }
}

class Timer {

    float time;
    const uint64_t gpu;
    cudaEvent_t ying, yang;

public:

    Timer (uint64_t gpu=0) : gpu(gpu) {
        cudaSetDevice(gpu);
        cudaEventCreate(&ying);
        cudaEventCreate(&yang);
    }

    ~Timer ( ) {
        cudaSetDevice(gpu);
        cudaEventDestroy(ying);
        cudaEventDestroy(yang);
    }

    void start ( ) {
        cudaSetDevice(gpu);
        cudaEventRecord(ying, 0);
    }

    void stop (std::string label) {
        cudaSetDevice(gpu);
        cudaEventRecord(yang, 0);
        cudaEventSynchronize(yang);
        cudaEventElapsedTime(&time, ying, yang);
        std::cout << "TIMING: " << time << " ms (" << label << ")" << std::endl;
    }

encryption.cuh

#pragma once

#include <cstdint>
#include <assert.h>

__host__ __device__ __forceinline__
uint32_t hash (uint32_t x) {

        x ^= x >> 16;
        x *= 0x85ebca6b;
        x ^= x >> 13;
        x *= 0xc2b2ae35;
        x ^= x >> 16;

        return x;
}

__host__ __device__ __forceinline__
uint64_t permute64(uint64_t x, uint64_t num_iters) {

    constexpr uint64_t mask = (1UL << 32)-1;

    for (uint64_t iter = 0; iter < num_iters; iter++) {
        const uint64_t upper = x >> 32;
        const uint64_t lower = x & mask;
        const uint64_t mixer = hash(upper);

        x = upper + ((lower^mixer&mask) << 32);
    }

    return x;
}

__host__ __device__ __forceinline__
uint64_t unpermute64(uint64_t x, uint64_t num_iters) {

    constexpr uint64_t mask = (1UL << 32)-1;
    
    for (uint64_t iter = 0; iter < num_iters; iter++) {
        const uint64_t upper = x & mask;
        const uint64_t lower = x >> 32;
        const uint64_t mixer = hash(upper);

        x = (upper << 32) + (lower^mixer&mask);
    }

    return x;
}

cuda多gpu c++加速9 应用数据复制与计算的重叠

继续阅读

硬盘固件是什么

usb3.1和3.0的区别，usb3.1有什么优点

PCI-E与SATA SSD如何选

义务维修结束了！！！

3分钟读懂AI教父Hinton。大家好，这期主要介绍AI教父Hinton主要回答三个问题。Hinon是如何做到今天它如何

高性能AI服务器集群内部通信800G模块是一种用于高性能人工智能服务器集群内部通信的模块，具有高速数据传输、低延迟、高可

＜＜程序是怎么跑起来的＞＞学习笔记总结一<<程序是怎么跑起来的>>学习笔记总结

意大利的网络和通信技术的历史一、前言意大利在网络和通信技术领域有着丰富的历史和贡献。下面是意大利网络和通信技术的一些

了解！报考计算机硬件维护师证书有用吗？证书报考流程、时间是？

软考 - 02 计算机组成与结构

Linux系统基础命令（四）计算机硬件组成部分1

PCI总线的配置

CPU性能衡量参数-主频，MIPS，CPI，时钟周期，机器周期，指令周期 CPU性能衡量参数-主频，MIPS，CPI，时钟周期，机器周期，指令周期

CPU设计实战第4章实践任务三前言一、实验内容

计算机组成笔记2——指令系统体系结构(MIPS)2、指令系统体系结构

CPU设计实战第6章实践任务前言一、实验内容