首页 > 代码库 > 零内存拷贝和普通拷贝对比

零内存拷贝和普通拷贝对比

下载链接

  /*
   * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
   *
   * NVIDIA Corporation and its licensors retain all intellectual property and
   * proprietary rights in and to this software and related documentation.
   * Any use, reproduction, disclosure, or distribution of this software
   * and related documentation without an express license agreement from
   * NVIDIA Corporation is strictly prohibited.
   *
   * Please refer to the applicable NVIDIA end user license agreement (EULA)
   * associated with this source code for terms and conditions that govern
   * your use of this NVIDIA software.
   *
   */


#include <stdio.h>
#include <stdlib.h>

#include "../common/book.h"
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "device_functions.h"

// Arguments are fully parenthesized so that expressions passed to the macro
// cannot be re-associated by operator precedence.
#define imin(a,b) ((a)<(b)?(a):(b))

// Number of elements in each input vector.
const int N = 33 * 1024 * 1024;
// Threads per block; also the size of the shared-memory cache in dot().
// Must be a power of two because of the halving reduction in dot().
const int threadsPerBlock = 256;
// Number of blocks launched: enough to cover N, capped at 32.
const int blocksPerGrid =
    imin(32, (N + threadsPerBlock - 1) / threadsPerBlock);


/*
 * Computes per-block partial sums of the dot product of a and b.
 *
 *   size  number of elements in a and b
 *   a, b  input vectors (must be dereferenceable from device code)
 *   c     output; c[blockIdx.x] receives the partial sum of that block,
 *         so it needs at least gridDim.x elements
 *
 * Launch requirements: 1D grid and 1D blocks, blockDim.x a power of two and
 * not larger than threadsPerBlock (the shared cache has threadsPerBlock
 * entries and is indexed by threadIdx.x).
 */
__global__ void dot(int size, float *a, float *b, float *c) {
    __shared__ float cache[threadsPerBlock];
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int cacheIndex = threadIdx.x;

    // Grid-stride loop: each thread accumulates every
    // (blockDim.x * gridDim.x)-th product, starting at its global index.
    float temp = 0.0f;
    while (tid < size) {
        temp += a[tid] * b[tid];
        tid += blockDim.x * gridDim.x;
    }

    // set the cache values
    cache[cacheIndex] = temp;

    // synchronize threads in this block so every cache entry is written
    // before any thread reads a neighbour's entry
    __syncthreads();

    // for reductions, threadsPerBlock must be a power of 2
    // because of the following code
    int i = blockDim.x / 2;
    while (i != 0) {
        if (cacheIndex < i)
            cache[cacheIndex] += cache[cacheIndex + i];
        // The barrier sits outside the if so all threads in the block reach it.
        __syncthreads();
        i /= 2;
    }

    // Thread 0 of each block publishes that block's partial sum.
    if (cacheIndex == 0)
        c[blockIdx.x] = cache[0];
}


/*
 * Runs the dot product using malloc'ed host buffers and cudaMalloc'ed device
 * buffers, copying inputs to the device and partial sums back with cudaMemcpy.
 *
 *   size  number of elements in each input vector
 *
 * Returns the time reported by cudaEventElapsedTime for the two
 * host-to-device copies, the kernel, and the device-to-host copy.
 * Prints the computed dot product to stdout. Exits the process if a host
 * allocation fails.
 */
float malloc_test(int size) {
    cudaEvent_t     start, stop;
    float           *a, *b, c, *partial_c;
    float           *dev_a, *dev_b, *dev_partial_c;
    float           elapsedTime;

    HANDLE_ERROR(cudaEventCreate(&start));
    HANDLE_ERROR(cudaEventCreate(&stop));

    // allocate memory on the CPU side
    a = (float*)malloc(size * sizeof(float));
    b = (float*)malloc(size * sizeof(float));
    partial_c = (float*)malloc(blocksPerGrid * sizeof(float));
    // The input buffers are large; fail loudly instead of writing through
    // a null pointer in the fill loop below.
    if (a == NULL || b == NULL || partial_c == NULL) {
        fprintf(stderr, "Host memory allocation failed in %s at line %d\n",
            __FILE__, __LINE__);
        exit(EXIT_FAILURE);
    }

    // allocate the memory on the GPU
    HANDLE_ERROR(cudaMalloc((void**)&dev_a,
        size * sizeof(float)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_b,
        size * sizeof(float)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_partial_c,
        blocksPerGrid * sizeof(float)));

    // fill in the host memory with data
    for (int i = 0; i < size; i++) {
        a[i] = (float)i;
        b[i] = (float)(i * 2);
    }

    HANDLE_ERROR(cudaEventRecord(start, 0));
    // copy the arrays 'a' and 'b' to the GPU
    HANDLE_ERROR(cudaMemcpy(dev_a, a, size * sizeof(float),
        cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy(dev_b, b, size * sizeof(float),
        cudaMemcpyHostToDevice));

    dot<<<blocksPerGrid, threadsPerBlock>>>(size, dev_a, dev_b,
        dev_partial_c);
    // A kernel launch returns no status; pick up launch errors explicitly.
    HANDLE_ERROR(cudaGetLastError());

    // copy the array 'c' back from the GPU to the CPU
    HANDLE_ERROR(cudaMemcpy(partial_c, dev_partial_c,
        blocksPerGrid * sizeof(float),
        cudaMemcpyDeviceToHost));

    HANDLE_ERROR(cudaEventRecord(stop, 0));
    HANDLE_ERROR(cudaEventSynchronize(stop));
    HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime,
        start, stop));

    // finish up on the CPU side: add the per-block partial sums
    c = 0;
    for (int i = 0; i < blocksPerGrid; i++) {
        c += partial_c[i];
    }

    HANDLE_ERROR(cudaFree(dev_a));
    HANDLE_ERROR(cudaFree(dev_b));
    HANDLE_ERROR(cudaFree(dev_partial_c));

    // free memory on the CPU side
    free(a);
    free(b);
    free(partial_c);

    // free events
    HANDLE_ERROR(cudaEventDestroy(start));
    HANDLE_ERROR(cudaEventDestroy(stop));

    printf("Value calculated:  %f\n", c);

    return elapsedTime;
}


/*
 * Runs the dot product on host buffers allocated with cudaHostAlloc and the
 * cudaHostAllocMapped flag; the kernel receives the device pointers obtained
 * from cudaHostGetDevicePointer, so no cudaMemcpy is issued.
 *
 *   size  number of elements in each input vector
 *
 * Returns the time reported by cudaEventElapsedTime for the kernel launch
 * plus the device synchronization that follows it.
 * Prints the computed dot product to stdout.
 */
float cuda_host_alloc_test(int size) {
    cudaEvent_t     start, stop;
    float           *a, *b, c, *partial_c;
    float           *dev_a, *dev_b, *dev_partial_c;
    float           elapsedTime;

    HANDLE_ERROR(cudaEventCreate(&start));
    HANDLE_ERROR(cudaEventCreate(&stop));

    // allocate the memory on the CPU
    // The inputs are only written by the host here, so they additionally
    // request cudaHostAllocWriteCombined; partial_c is read back by the host
    // and is allocated without that flag.
    HANDLE_ERROR(cudaHostAlloc((void**)&a,
        size * sizeof(float),
        cudaHostAllocWriteCombined |
        cudaHostAllocMapped));
    HANDLE_ERROR(cudaHostAlloc((void**)&b,
        size * sizeof(float),
        cudaHostAllocWriteCombined |
        cudaHostAllocMapped));
    HANDLE_ERROR(cudaHostAlloc((void**)&partial_c,
        blocksPerGrid * sizeof(float),
        cudaHostAllocMapped));

    // find out the GPU pointers
    HANDLE_ERROR(cudaHostGetDevicePointer(&dev_a, a, 0));
    HANDLE_ERROR(cudaHostGetDevicePointer(&dev_b, b, 0));
    HANDLE_ERROR(cudaHostGetDevicePointer(&dev_partial_c,
        partial_c, 0));

    // fill in the host memory with data
    for (int i = 0; i < size; i++) {
        a[i] = (float)i;
        b[i] = (float)(i * 2);
    }

    HANDLE_ERROR(cudaEventRecord(start, 0));

    dot<<<blocksPerGrid, threadsPerBlock>>>(size, dev_a, dev_b,
        dev_partial_c);
    // A kernel launch returns no status; pick up launch errors explicitly.
    HANDLE_ERROR(cudaGetLastError());

    // Wait for the kernel to finish before the host reads partial_c.
    // cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize.
    HANDLE_ERROR(cudaDeviceSynchronize());
    HANDLE_ERROR(cudaEventRecord(stop, 0));
    HANDLE_ERROR(cudaEventSynchronize(stop));
    HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime,
        start, stop));

    // finish up on the CPU side: add the per-block partial sums
    c = 0;
    for (int i = 0; i < blocksPerGrid; i++) {
        c += partial_c[i];
    }

    HANDLE_ERROR(cudaFreeHost(a));
    HANDLE_ERROR(cudaFreeHost(b));
    HANDLE_ERROR(cudaFreeHost(partial_c));

    // free events
    HANDLE_ERROR(cudaEventDestroy(start));
    HANDLE_ERROR(cudaEventDestroy(stop));

    printf("Value calculated:  %f\n", c);

    return elapsedTime;
}


/*
 * Checks that the current device can map host memory, enables host-memory
 * mapping, then runs both variants on N elements and prints the time each
 * one reported.
 */
int main(void) {
    cudaDeviceProp  prop;
    int whichDevice;
    HANDLE_ERROR(cudaGetDevice(&whichDevice));
    HANDLE_ERROR(cudaGetDeviceProperties(&prop, whichDevice));
    if (prop.canMapHostMemory != 1) {
        printf("Device can not map memory.\n");
        return 0;
    }

    float           elapsedTime;

    HANDLE_ERROR(cudaSetDeviceFlags(cudaDeviceMapHost));

    // try it with malloc
    elapsedTime = malloc_test(N);
    printf("Time using cudaMalloc:  %3.1f ms\n",
        elapsedTime);

    // now try it with cudaHostAlloc
    elapsedTime = cuda_host_alloc_test(N);
    printf("Time using cudaHostAlloc:  %3.1f ms\n",
        elapsedTime);

    return 0;
}

 

零内存拷贝和普通拷贝对比