cuda多gpu编程11 多gpu进行运算

2023-09-26 21:39:00

在这里插入图片描述
使用四块 GPU 相比只用一块 GPU，也只快了大约一倍。
处理效果如下：所有操作都在默认流上执行，没有使用非默认流让各 GPU 的数据拷贝与计算相互重叠。

在这里插入图片描述

修改之前的代码（单 GPU 版本）：

#include <cstdint>
#include <iostream>
#include "helpers.cuh"
#include "encryption.cuh"

// Single-GPU baseline: encrypt on the CPU, decrypt on one GPU, verify on the CPU.
// Timer, check_last_error, permute64 and unpermute64 are not defined here;
// presumably they come from helpers.cuh / encryption.cuh (not shown).

// Writes permute64(entry, num_iters) into data[entry] for every entry in
// [0, num_entries). The loop is run with OpenMP when `parallel` is true.
void encrypt_cpu(uint64_t * data, uint64_t num_entries,
                 uint64_t num_iters, bool parallel=true) {

    #pragma omp parallel for if (parallel)
    for (uint64_t entry = 0; entry < num_entries; entry++)
        data[entry] = permute64(entry, num_iters);
}

// Replaces each data[entry] with unpermute64(data[entry], num_iters).
// Uses a grid-stride loop, so any 1D launch configuration covers all
// num_entries elements.
__global__
void decrypt_gpu(uint64_t * data, uint64_t num_entries,
                 uint64_t num_iters) {

    const uint64_t thrdID = blockIdx.x*blockDim.x+threadIdx.x;
    const uint64_t stride = blockDim.x*gridDim.x;

    for (uint64_t entry = thrdID; entry < num_entries; entry += stride)
        data[entry] = unpermute64(data[entry], num_iters);
}

// Returns true iff data[entry] == entry for every entry in [0, num_entries).
// The count of matching entries is reduced with OpenMP when `parallel` is true.
bool check_result_cpu(uint64_t * data, uint64_t num_entries,
                      bool parallel=true) {

    uint64_t counter = 0;

    #pragma omp parallel for reduction(+: counter) if (parallel)
    for (uint64_t entry = 0; entry < num_entries; entry++)
        counter += data[entry] == entry;

    return counter == num_entries;
}

int main (int argc, char * argv[]) {

    Timer timer;
    Timer overall;

    const uint64_t num_entries = 1UL << 26;
    const uint64_t num_iters = 1UL << 10;
    const bool openmp = true;

    // Pinned host buffer plus one device buffer holding the whole data set.
    timer.start();
    uint64_t * data_cpu, * data_gpu;
    cudaMallocHost(&data_cpu, sizeof(uint64_t)*num_entries);
    cudaMalloc    (&data_gpu, sizeof(uint64_t)*num_entries);
    timer.stop("allocate memory");
    check_last_error();

    timer.start();
    encrypt_cpu(data_cpu, num_entries, num_iters, openmp);
    timer.stop("encrypt data on CPU");

    // `overall` spans host-to-device copy, kernel, and device-to-host copy.
    overall.start();
    timer.start();
    cudaMemcpy(data_gpu, data_cpu,
               sizeof(uint64_t)*num_entries, cudaMemcpyHostToDevice);
    timer.stop("copy data from CPU to GPU");
    check_last_error();

    // NOTE(review): the kernel launch is asynchronous; whether this timing
    // covers kernel execution depends on how Timer::stop synchronizes
    // (Timer is defined outside this file) - confirm.
    timer.start();
    decrypt_gpu<<<80*32, 64>>>(data_gpu, num_entries, num_iters);
    timer.stop("decrypt data on GPU");
    check_last_error();

    timer.start();
    cudaMemcpy(data_cpu, data_gpu,
               sizeof(uint64_t)*num_entries, cudaMemcpyDeviceToHost);
    timer.stop("copy data from GPU to CPU");
    overall.stop("total time on GPU");
    check_last_error();

    timer.start();
    const bool success = check_result_cpu(data_cpu, num_entries, openmp);
    std::cout << "STATUS: test "
              << ( success ? "passed" : "failed")
              << std::endl;
    timer.stop("checking result on CPU");

    timer.start();
    cudaFreeHost(data_cpu);
    cudaFree    (data_gpu);
    timer.stop("free memory");
    check_last_error();
}

修改之后的结果文件 mgpu_solution.cu（多 GPU 版本）：

#include <cstdint>
#include <iostream>
#include "helpers.cuh"
#include "encryption.cuh"

// Multi-GPU version: encrypt on the CPU, split the data into one contiguous
// chunk per GPU, decrypt each chunk on its own GPU, verify on the CPU.
// Timer, check_last_error, sdiv, permute64 and unpermute64 are not defined
// here; presumably they come from helpers.cuh / encryption.cuh (not shown).

// Writes permute64(entry, num_iters) into data[entry] for every entry in
// [0, num_entries). The loop is run with OpenMP when `parallel` is true.
void encrypt_cpu(uint64_t * data, uint64_t num_entries,
                 uint64_t num_iters, bool parallel=true) {

    #pragma omp parallel for if (parallel)
    for (uint64_t entry = 0; entry < num_entries; entry++)
        data[entry] = permute64(entry, num_iters);
}

// Replaces each data[entry] with unpermute64(data[entry], num_iters).
// Uses a grid-stride loop, so any 1D launch configuration covers all
// num_entries elements.
__global__
void decrypt_gpu(uint64_t * data, uint64_t num_entries,
                 uint64_t num_iters) {

    const uint64_t thrdID = blockIdx.x*blockDim.x+threadIdx.x;
    const uint64_t stride = blockDim.x*gridDim.x;

    for (uint64_t entry = thrdID; entry < num_entries; entry += stride)
        data[entry] = unpermute64(data[entry], num_iters);
}

// Returns true iff data[entry] == entry for every entry in [0, num_entries).
// The count of matching entries is reduced with OpenMP when `parallel` is true.
bool check_result_cpu(uint64_t * data, uint64_t num_entries,
                      bool parallel=true) {

    uint64_t counter = 0;

    #pragma omp parallel for reduction(+: counter) if (parallel)
    for (uint64_t entry = 0; entry < num_entries; entry++)
        counter += data[entry] == entry;

    return counter == num_entries;
}

// Computes the slice [lower, lower+width) of the host array owned by `gpu`.
// `lower` is clamped to num_entries so that `width` becomes 0 (instead of
// wrapping around) when there are fewer chunks of data than GPUs.
static void chunk_range(uint64_t gpu, uint64_t chunk_size,
                        uint64_t num_entries,
                        uint64_t & lower, uint64_t & width) {

    lower = min(chunk_size*gpu, num_entries);
    const uint64_t upper = min(lower+chunk_size, num_entries);
    width = upper-lower;
}

int main (int argc, char * argv[]) {

    Timer timer;
    Timer overall;

    const uint64_t num_entries = 1UL << 26;
    const uint64_t num_iters = 1UL << 10;
    const bool openmp = true;

    // Set number of available GPUs.
    const uint64_t num_gpus = 4;

    // Get chunk size using round up division.
    const uint64_t chunk_size = sdiv(num_entries, num_gpus);

    timer.start();
    // Use array of pointers for multiple GPU memory.
    uint64_t * data_cpu, * data_gpu[num_gpus];
    cudaMallocHost(&data_cpu, sizeof(uint64_t)*num_entries);

    // For each GPU...
    for (uint64_t gpu = 0; gpu < num_gpus; gpu++) {

        // ...set GPU as active...
        cudaSetDevice(gpu);

        // ...get width of this GPU's data chunk...
        uint64_t lower, width;
        chunk_range(gpu, chunk_size, num_entries, lower, width);

        // ...allocate data for this GPU.
        cudaMalloc(&data_gpu[gpu], sizeof(uint64_t)*width);
    }
    timer.stop("allocate memory");
    check_last_error();

    timer.start();
    encrypt_cpu(data_cpu, num_entries, num_iters, openmp);
    timer.stop("encrypt data on CPU");

    // `overall` spans host-to-device copies, kernels, and device-to-host copies.
    overall.start();
    timer.start();
    // For each GPU...
    for (uint64_t gpu = 0; gpu < num_gpus; gpu++) {

        cudaSetDevice(gpu);

        uint64_t lower, width;
        chunk_range(gpu, chunk_size, num_entries, lower, width);

        // ...copy correct chunk of data to active GPU.
        cudaMemcpy(data_gpu[gpu], data_cpu+lower,
                   sizeof(uint64_t)*width, cudaMemcpyHostToDevice);
    }
    timer.stop("copy data from CPU to GPU");
    check_last_error();

    // NOTE(review): kernel launches are asynchronous; whether this timing
    // covers kernel execution on every GPU depends on how Timer::stop
    // synchronizes (Timer is defined outside this file) - confirm.
    timer.start();
    // For each GPU...
    for (uint64_t gpu = 0; gpu < num_gpus; gpu++) {

        cudaSetDevice(gpu);

        uint64_t lower, width;
        chunk_range(gpu, chunk_size, num_entries, lower, width);

        // ...decrypt its chunk of data.
        decrypt_gpu<<<80*32, 64>>>(data_gpu[gpu], width, num_iters);
    }
    timer.stop("decrypt data on the GPU");
    check_last_error();

    timer.start();
    // For each GPU...
    for (uint64_t gpu = 0; gpu < num_gpus; gpu++) {

        cudaSetDevice(gpu);

        uint64_t lower, width;
        chunk_range(gpu, chunk_size, num_entries, lower, width);

        // ...copy its chunk of data back to the host.
        cudaMemcpy(data_cpu+lower, data_gpu[gpu],
                   sizeof(uint64_t)*width, cudaMemcpyDeviceToHost);
    }
    timer.stop("copy data from GPU to CPU");
    overall.stop("total time on GPU");
    check_last_error();

    timer.start();
    const bool success = check_result_cpu(data_cpu, num_entries, openmp);
    std::cout << "STATUS: test "
              << ( success ? "passed" : "failed")
              << std::endl;
    timer.stop("checking result on CPU");

    timer.start();
    cudaFreeHost(data_cpu);
    // Each device buffer is freed with its owning GPU set as active.
    for (uint64_t gpu = 0; gpu < num_gpus; gpu++) {
        cudaSetDevice(gpu);
        cudaFree(data_gpu[gpu]);
    }
    timer.stop("free memory");
    check_last_error();
}

本文来自互联网用户投稿，文章观点仅代表作者本人，不代表本站立场，不承担相关法律责任。如若转载，请注明出处。 如若内容造成侵权/违法违规/事实不符，请点击【内容举报】进行投诉反馈！

标签：技术

Duilib中list控件支持ctrl和shif多行选中的实现

[ICML2015]Batch Normalization:Accelerating Deep Network Training by Reducing Internal Covariate Shif

win10系统微软输入法于eclipse ctrl+shif+f冲突间接处理办法

Codeforces Round #259 (Div. 2) B. Little Pony and Sort by Shif

读LDD3，内存映射与DMA--PAGE_SHIF…

VMware虚拟机安装XP【要先分区，再设置BOOT 启动CD，shif+上移】

更换iBus五笔的左与右Shif

sublime ctrl+shif+f 没用解决办法

idea 对 ctrl + z 的撤销是 ctrl + shif + z

计算机最早的设计师应用于,计算机应用基础选择题doc.doc

win10自带截图神器：Win+Shift+S

Python基础之文件目录操作

python简述目录_Python基础之文件目录操作(示例代码)

tp5 如何做数据采集

任务2-7(服务器字体+阿里巴巴矢量库)

html标签（1)：h1~h6,p,br,pre,hr

TI 电量计介绍与芯片选型指南

几款TI电源芯片简介

TI DSP芯片C2000系列读取FLASH数据

德州仪器(Ti)平台嵌入式开发基础

TI三相电机智能栅极驱动芯片特点分类

省选模拟（12.08） T3 圈圈圈圈圈圈圈圈

Hadoop生态圈技术栈（上）

大数据开发基础入门与项目实战（三）Hadoop核心及生态圈技术栈之6.Impala交互式查询

小猿圈之Linux下Mysql 操作命令

大数据Hadoop生态圈常用面试题

大数据开发基础入门与项目实战（三）Hadoop核心及生态圈技术栈之4.Hive DDL、DQL和数据操作

备战Noip2018模拟赛11（B组）T3 Monogatari 物语

【智能优化算法-圆圈搜索算法】基于圆圈搜索算法Circle Search Algorithm求解单目标优化问题附matlab代码

NYOJ 78 圈水池

递归问题跑道汽车绕圈问题 Python实现

Hadoop生态圈（三）：MapReduce

cuda多gpu编程11 多gpu进行运算

相关文章