首页 > 代码库 > 零内存拷贝和普通拷贝对比
零内存拷贝和普通拷贝对比
下载链接
1 /* 2 * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. 3 * 4 * NVIDIA Corporation and its licensors retain all intellectual property and 5 * proprietary rights in and to this software and related documentation. 6 * Any use, reproduction, disclosure, or distribution of this software 7 * and related documentation without an express license agreement from 8 * NVIDIA Corporation is strictly prohibited. 9 * 10 * Please refer to the applicable NVIDIA end user license agreement (EULA) 11 * associated with this source code for terms and conditions that govern 12 * your use of this NVIDIA software. 13 * 14 */ 15 16 17 #include "../common/book.h" 18 #include "cuda.h" 19 #include "cuda_runtime.h" 20 #include "device_launch_parameters.h" 21 #include "device_functions.h" 22 #define imin(a,b) (a<b?a:b) 23 24 const int N = 33 * 1024 * 1024; 25 const int threadsPerBlock = 256; 26 const int blocksPerGrid = 27 imin(32, (N + threadsPerBlock - 1) / threadsPerBlock); 28 29 30 __global__ void dot(int size, float *a, float *b, float *c) { 31 __shared__ float cache[threadsPerBlock]; 32 int tid = threadIdx.x + blockIdx.x * blockDim.x; 33 int cacheIndex = threadIdx.x; 34 35 float temp = 0; 36 while (tid < size) { 37 temp += a[tid] * b[tid]; 38 tid += blockDim.x * gridDim.x; 39 } 40 41 // set the cache values 42 cache[cacheIndex] = temp; 43 44 // synchronize threads in this block 45 __syncthreads(); 46 47 // for reductions, threadsPerBlock must be a power of 2 48 // because of the following code 49 int i = blockDim.x / 2; 50 while (i != 0) { 51 if (cacheIndex < i) 52 cache[cacheIndex] += cache[cacheIndex + i]; 53 __syncthreads(); 54 i /= 2; 55 } 56 57 if (cacheIndex == 0) 58 c[blockIdx.x] = cache[0]; 59 } 60 61 62 float malloc_test(int size) { 63 cudaEvent_t start, stop; 64 float *a, *b, c, *partial_c; 65 float *dev_a, *dev_b, *dev_partial_c; 66 float elapsedTime; 67 68 HANDLE_ERROR(cudaEventCreate(&start)); 69 HANDLE_ERROR(cudaEventCreate(&stop)); 70 71 // allocate memory on the CPU side 72 a = (float*)malloc(size*sizeof(float)); 73 b = (float*)malloc(size*sizeof(float)); 74 partial_c = (float*)malloc(blocksPerGrid*sizeof(float)); 75 76 // allocate the memory on the GPU 77 HANDLE_ERROR(cudaMalloc((void**)&dev_a, 78 size*sizeof(float))); 79 HANDLE_ERROR(cudaMalloc((void**)&dev_b, 80 size*sizeof(float))); 81 HANDLE_ERROR(cudaMalloc((void**)&dev_partial_c, 82 blocksPerGrid*sizeof(float))); 83 84 // fill in the host memory with data 85 for (int i = 0; i<size; i++) { 86 a[i] = i; 87 b[i] = i * 2; 88 } 89 90 HANDLE_ERROR(cudaEventRecord(start, 0)); 91 // copy the arrays ‘a‘ and ‘b‘ to the GPU 92 HANDLE_ERROR(cudaMemcpy(dev_a, a, size*sizeof(float), 93 cudaMemcpyHostToDevice)); 94 HANDLE_ERROR(cudaMemcpy(dev_b, b, size*sizeof(float), 95 cudaMemcpyHostToDevice)); 96 97 dot << <blocksPerGrid, threadsPerBlock >> >(size, dev_a, dev_b, 98 dev_partial_c); 99 // copy the array ‘c‘ back from the GPU to the CPU100 HANDLE_ERROR(cudaMemcpy(partial_c, dev_partial_c,101 blocksPerGrid*sizeof(float),102 cudaMemcpyDeviceToHost));103 104 HANDLE_ERROR(cudaEventRecord(stop, 0));105 HANDLE_ERROR(cudaEventSynchronize(stop));106 HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime,107 start, stop));108 109 // finish up on the CPU side110 c = 0;111 for (int i = 0; i<blocksPerGrid; i++) {112 c += partial_c[i];113 }114 115 HANDLE_ERROR(cudaFree(dev_a));116 HANDLE_ERROR(cudaFree(dev_b));117 HANDLE_ERROR(cudaFree(dev_partial_c));118 119 // free memory on the CPU side120 free(a);121 free(b);122 free(partial_c);123 124 // free events125 HANDLE_ERROR(cudaEventDestroy(start));126 HANDLE_ERROR(cudaEventDestroy(stop));127 128 printf("Value calculated: %f\n", c);129 130 return elapsedTime;131 }132 133 134 float cuda_host_alloc_test(int size) {135 cudaEvent_t start, stop;136 float *a, *b, c, *partial_c;137 float *dev_a, *dev_b, *dev_partial_c;138 float elapsedTime;139 140 HANDLE_ERROR(cudaEventCreate(&start));141 HANDLE_ERROR(cudaEventCreate(&stop));142 143 // allocate the memory on the CPU144 HANDLE_ERROR(cudaHostAlloc((void**)&a,145 size*sizeof(float),146 cudaHostAllocWriteCombined |147 cudaHostAllocMapped));148 HANDLE_ERROR(cudaHostAlloc((void**)&b,149 size*sizeof(float),150 cudaHostAllocWriteCombined |151 cudaHostAllocMapped));152 HANDLE_ERROR(cudaHostAlloc((void**)&partial_c,153 blocksPerGrid*sizeof(float),154 cudaHostAllocMapped));155 156 // find out the GPU pointers157 HANDLE_ERROR(cudaHostGetDevicePointer(&dev_a, a, 0));158 HANDLE_ERROR(cudaHostGetDevicePointer(&dev_b, b, 0));159 HANDLE_ERROR(cudaHostGetDevicePointer(&dev_partial_c,160 partial_c, 0));161 162 // fill in the host memory with data163 for (int i = 0; i<size; i++) {164 a[i] = i;165 b[i] = i * 2;166 }167 168 HANDLE_ERROR(cudaEventRecord(start, 0));169 170 dot << <blocksPerGrid, threadsPerBlock >> >(size, dev_a, dev_b,171 dev_partial_c);172 173 HANDLE_ERROR(cudaThreadSynchronize());174 HANDLE_ERROR(cudaEventRecord(stop, 0));175 HANDLE_ERROR(cudaEventSynchronize(stop));176 HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime,177 start, stop));178 179 // finish up on the CPU side180 c = 0;181 for (int i = 0; i<blocksPerGrid; i++) {182 c += partial_c[i];183 }184 185 HANDLE_ERROR(cudaFreeHost(a));186 HANDLE_ERROR(cudaFreeHost(b));187 HANDLE_ERROR(cudaFreeHost(partial_c));188 189 // free events190 HANDLE_ERROR(cudaEventDestroy(start));191 HANDLE_ERROR(cudaEventDestroy(stop));192 193 printf("Value calculated: %f\n", c);194 195 return elapsedTime;196 }197 198 199 int main(void) {200 cudaDeviceProp prop;201 int whichDevice;202 HANDLE_ERROR(cudaGetDevice(&whichDevice));203 HANDLE_ERROR(cudaGetDeviceProperties(&prop, whichDevice));204 if (prop.canMapHostMemory != 1) {205 printf("Device can not map memory.\n");206 return 0;207 }208 209 float elapsedTime;210 211 HANDLE_ERROR(cudaSetDeviceFlags(cudaDeviceMapHost));212 213 // try it with malloc214 elapsedTime = malloc_test(N);215 printf("Time using cudaMalloc: %3.1f ms\n",216 elapsedTime);217 218 // now try it with cudaHostAlloc219 elapsedTime = cuda_host_alloc_test(N);220 printf("Time using cudaHostAlloc: %3.1f ms\n",221 elapsedTime);222 }
零内存拷贝和普通拷贝对比
声明:以上内容来自用户投稿及互联网公开渠道收集整理发布,本网站不拥有所有权,未作人工编辑处理,也不承担相关法律责任,若内容有误或涉及侵权可进行投诉: 投诉/举报 工作人员会在5个工作日内联系你,一经查实,本站将立刻删除涉嫌侵权内容。