
Multi-GPU Dot Product Example

A dot-product example computed across two GPU devices: the input vectors are split in half, each half is reduced on its own GPU by a dedicated host thread, and the two partial sums are added on the CPU. The full project is packaged for download.

/*
 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation.
 * Any use, reproduction, disclosure, or distribution of this software
 * and related documentation without an express license agreement from
 * NVIDIA Corporation is strictly prohibited.
 *
 * Please refer to the applicable NVIDIA end user license agreement (EULA)
 * associated with this source code for terms and conditions that govern
 * your use of this NVIDIA software.
 *
 */

#include "../common/book.h"
#include "cuda.h"
#include "device_launch_parameters.h"
#include "device_functions.h"
#include "cuda_runtime.h"

#define imin(a,b) (a<b?a:b)

#define     N    (33*1024*1024)
const int threadsPerBlock = 256;
const int blocksPerGrid =
    imin(32, (N / 2 + threadsPerBlock - 1) / threadsPerBlock);


__global__ void dot(int size, float *a, float *b, float *c) {
    __shared__ float cache[threadsPerBlock];
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int cacheIndex = threadIdx.x;

    // grid-stride loop: each thread accumulates a partial sum
    float   temp = 0;
    while (tid < size) {
        temp += a[tid] * b[tid];
        tid += blockDim.x * gridDim.x;
    }

    // set the cache values
    cache[cacheIndex] = temp;

    // synchronize threads in this block
    __syncthreads();

    // reduce within the block
    int i = blockDim.x / 2;
    while (i != 0) {
        if (cacheIndex < i)
            cache[cacheIndex] += cache[cacheIndex + i];
        __syncthreads();
        i /= 2;
    }

    if (cacheIndex == 0)
        c[blockIdx.x] = cache[0];
}


struct DataStruct {
    int     deviceID;
    int     size;
    float   *a;
    float   *b;
    float   returnValue;
};

// Windows thread entry point; the POSIX signature is left commented out
unsigned WINAPI routine(void *pvoidData)
//void* routine(void *pvoidData)
{
    DataStruct  *data = (DataStruct*)pvoidData;
    HANDLE_ERROR(cudaSetDevice(data->deviceID));

    int     size = data->size;
    float   *a, *b, c, *partial_c;
    float   *dev_a, *dev_b, *dev_partial_c;

    // allocate memory on the CPU side
    a = data->a;
    b = data->b;
    partial_c = (float*)malloc(blocksPerGrid * sizeof(float));

    // allocate the memory on the GPU
    HANDLE_ERROR(cudaMalloc((void**)&dev_a,
        size * sizeof(float)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_b,
        size * sizeof(float)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_partial_c,
        blocksPerGrid * sizeof(float)));

    // copy the arrays 'a' and 'b' to the GPU
    HANDLE_ERROR(cudaMemcpy(dev_a, a, size * sizeof(float),
        cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy(dev_b, b, size * sizeof(float),
        cudaMemcpyHostToDevice));

    // launch the dot-product kernel on this thread's device
    dot<<<blocksPerGrid, threadsPerBlock>>>(size, dev_a, dev_b,
        dev_partial_c);

    // copy the array 'c' back from the GPU to the CPU
    HANDLE_ERROR(cudaMemcpy(partial_c, dev_partial_c,
        blocksPerGrid * sizeof(float),
        cudaMemcpyDeviceToHost));

    // finish up on the CPU side
    c = 0;
    for (int i = 0; i < blocksPerGrid; i++) {
        c += partial_c[i];
    }

    HANDLE_ERROR(cudaFree(dev_a));
    HANDLE_ERROR(cudaFree(dev_b));
    HANDLE_ERROR(cudaFree(dev_partial_c));

    // free memory on the CPU side
    free(partial_c);

    data->returnValue = c;
    return 0;
}


int main(void) {
    int deviceCount;
    HANDLE_ERROR(cudaGetDeviceCount(&deviceCount));
    // require at least two devices
    if (deviceCount < 2) {
        printf("We need at least two compute 1.0 or greater "
            "devices, but only found %d\n", deviceCount);
        return 0;
    }

    float   *a = (float*)malloc(sizeof(float) * N);
    HANDLE_NULL(a);
    float   *b = (float*)malloc(sizeof(float) * N);
    HANDLE_NULL(b);

    // fill in the host memory with data
    for (int i = 0; i < N; i++) {
        a[i] = i;
        b[i] = i * 2;
    }

    /*
    Prepare for the two worker threads:
    each DataStruct covers half of the data set
    */
    DataStruct  data[2];
    data[0].deviceID = 0;
    data[0].size = N / 2;
    data[0].a = a;
    data[0].b = b;

    data[1].deviceID = 1;
    data[1].size = N / 2;
    data[1].a = a + N / 2;
    data[1].b = b + N / 2;

    // device 0's half runs on a new thread; device 1's half on this thread
    CUTThread   thread = start_thread(routine, &(data[0]));
    routine(&(data[1]));
    end_thread(thread);

    // free memory on the CPU side
    free(a);
    free(b);

    printf("Value calculated:  %f\n",
        data[0].returnValue + data[1].returnValue);

    return 0;
}
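The thread entry point above uses the Windows-specific unsigned WINAPI signature together with the CUTThread, start_thread, and end_thread helpers from the book's book.h. Below is a minimal sketch of the same two-device launch written with POSIX threads, assuming the DataStruct definition from the listing; the names routine_posix and launch_two_devices are illustrative only and are not part of the original project.

/* Hedged sketch: a POSIX-threads variant of the two-device launch.
   Reuses DataStruct from the listing above; routine_posix and
   launch_two_devices are illustrative names, not part of book.h. */
#include <pthread.h>

static void *routine_posix(void *pvoidData) {
    DataStruct *data = (DataStruct *)pvoidData;
    cudaSetDevice(data->deviceID);      /* bind this host thread to its GPU */
    /* ... same allocation, copies, dot<<<...>>> launch, and reduction
       as in routine() above, ending with data->returnValue = c; ... */
    return NULL;
}

static int launch_two_devices(DataStruct data[2]) {
    pthread_t thread;
    /* device 0's half runs on a worker thread ...                      */
    if (pthread_create(&thread, NULL, routine_posix, &data[0]) != 0)
        return -1;
    /* ... while the calling thread processes device 1's half           */
    routine_posix(&data[1]);
    pthread_join(thread, NULL);
    return 0;
}

Note also that with a[i] = i and b[i] = 2*i the exact dot product is 2 * sum(i^2) = (N-1)*N*(2N-1)/3, which for N = 33*1024*1024 is on the order of 10^22; a single-precision accumulation cannot represent that exactly, so the printed value will show noticeable rounding error.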

 
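For a sanity check of the multi-GPU result, the sketch below computes a serial CPU reference, accumulating in double precision to limit rounding error. The helper name check_dot is hypothetical and does not appear in the original listing.

/* Hedged sketch: serial CPU reference for the multi-GPU dot product.
   check_dot is a hypothetical helper, not part of the original example. */
static double check_dot(const float *a, const float *b, int n) {
    double sum = 0.0;
    for (int i = 0; i < n; i++)
        sum += (double)a[i] * (double)b[i];  /* promote each term to double */
    return sum;
}

/* Example use in main, before free(a)/free(b):
   printf("CPU reference:     %f\n", check_dot(a, b, N));                  */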
