首页 > 代码库 > 可移动固定内存测试
可移动固定内存测试
可移动固定内存测试,项目打包下载
/*
 * Portable pinned (zero-copy) memory test: a dot product of two N-element
 * vectors split across two GPUs. Both GPUs read the same host allocations,
 * which are mapped into each device's address space; one host thread drives
 * each device.
 */
#include "../common/book.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "device_functions.h"

// Fully parenthesized so that expression arguments cannot change precedence.
#define imin(a, b) ((a) < (b) ? (a) : (b))

#define N (33 * 1024 * 1024)

const int threadsPerBlock = 256;
// Each device processes about half of the vector; the grid is capped at 32 blocks.
const int blocksPerGrid =
    imin(32, (N / 2 + threadsPerBlock - 1) / threadsPerBlock);


/*
 * Computes per-block partial sums of the dot product a . b.
 *
 * Launch layout: 1D grid, 1D blocks. blockDim.x must not exceed
 * threadsPerBlock (the size of the shared cache) and must be a power of two
 * for the tree reduction below to cover every cache slot.
 *
 * size : number of elements of a and b to process
 * a, b : device-accessible input vectors with at least `size` elements
 * c    : output array with at least gridDim.x elements; c[blockIdx.x]
 *        receives the partial sum produced by that block
 */
__global__ void dot(int size, float *a, float *b, float *c) {
    __shared__ float cache[threadsPerBlock];
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int cacheIndex = threadIdx.x;

    // Each thread accumulates elements tid, tid + stride, tid + 2*stride, ...
    float temp = 0;
    while (tid < size) {
        temp += a[tid] * b[tid];
        tid += blockDim.x * gridDim.x;
    }

    // set the cache values
    cache[cacheIndex] = temp;

    // synchronize the threads of this block before reading other threads' slots
    __syncthreads();

    // for reductions, threadsPerBlock must be a power of 2
    // because of the following code
    int i = blockDim.x / 2;
    while (i != 0) {
        if (cacheIndex < i)
            cache[cacheIndex] += cache[cacheIndex + i];
        // The barrier sits outside the branch so every thread reaches it.
        __syncthreads();
        i /= 2;
    }

    if (cacheIndex == 0)
        c[blockIdx.x] = cache[0];
}


/* Per-thread work description: which device to use and which slice to reduce. */
struct DataStruct {
    int     deviceID;     // CUDA device that processes this slice
    int     size;         // number of elements in the slice
    int     offset;       // index of the first element of the slice in a and b
    float   *a;           // host pointer to the full first vector (mapped, pinned)
    float   *b;           // host pointer to the full second vector (mapped, pinned)
    float   returnValue;  // output: dot product of the slice
};

/*
 * Thread entry point: computes the dot product of one slice on one device.
 *
 * pvoidData : pointer to a DataStruct describing the slice; on return its
 *             returnValue field holds the slice's dot product.
 * Returns 0.
 */
unsigned WINAPI routine(void *pvoidData)
//void* routine(void *pvoidData)
{
    DataStruct *data = (DataStruct*)pvoidData;
    // main() already performed this setup for device 0, so it is only
    // repeated here for non-zero devices.
    if (data->deviceID != 0) {
        HANDLE_ERROR(cudaSetDevice(data->deviceID));
        // Tell the runtime that this device will use mapped (zero-copy) host
        // memory. Portability does not need to be requested again: it was
        // specified when the buffers were allocated in main().
        HANDLE_ERROR(cudaSetDeviceFlags(cudaDeviceMapHost));
    }

    int     size = data->size;
    float   *a, *b, c, *partial_c;
    float   *dev_a, *dev_b, *dev_partial_c;

    // the input vectors were already allocated on the CPU side by the caller
    a = data->a;
    b = data->b;
    partial_c = (float*)malloc(blocksPerGrid * sizeof(float));
    if (partial_c == NULL) {
        printf("Host memory failed in %s at line %d\n", __FILE__, __LINE__);
        exit(EXIT_FAILURE);
    }

    // get device pointers for the mapped host buffers and allocate the
    // partial-sum buffer on the GPU
    HANDLE_ERROR(cudaHostGetDevicePointer(&dev_a, a, 0));
    HANDLE_ERROR(cudaHostGetDevicePointer(&dev_b, b, 0));
    HANDLE_ERROR(cudaMalloc((void**)&dev_partial_c,
                            blocksPerGrid * sizeof(float)));

    // offset 'a' and 'b' to where this GPU gets its data
    dev_a += data->offset;
    dev_b += data->offset;

    dot<<<blocksPerGrid, threadsPerBlock>>>(size, dev_a, dev_b,
                                            dev_partial_c);
    // A kernel launch returns no status itself; pick up launch errors here.
    HANDLE_ERROR(cudaGetLastError());

    // copy the partial sums back from the GPU to the CPU
    HANDLE_ERROR(cudaMemcpy(partial_c, dev_partial_c,
                            blocksPerGrid * sizeof(float),
                            cudaMemcpyDeviceToHost));

    // finish up on the CPU side
    c = 0;
    for (int i = 0; i < blocksPerGrid; i++) {
        c += partial_c[i];
    }

    HANDLE_ERROR(cudaFree(dev_partial_c));

    // free memory on the CPU side
    free(partial_c);

    data->returnValue = c;
    return 0;
}


/*
 * Allocates two portable, mapped, write-combined host vectors, fills them,
 * runs routine() once per device (device 1 on a helper thread, device 0 on
 * the calling thread) and prints the sum of the two partial dot products.
 */
int main(void) {
    int deviceCount;
    HANDLE_ERROR(cudaGetDeviceCount(&deviceCount));
    if (deviceCount < 2) {
        printf("We need at least two compute 1.0 or greater "
               "devices, but only found %d\n", deviceCount);
        return 0;
    }

    // both devices must be able to map host memory
    cudaDeviceProp prop;
    for (int i = 0; i < 2; i++) {
        HANDLE_ERROR(cudaGetDeviceProperties(&prop, i));
        if (prop.canMapHostMemory != 1) {
            printf("Device %d can not map memory.\n", i);
            return 0;
        }
    }

    float *a, *b;
    HANDLE_ERROR(cudaSetDevice(0));
    HANDLE_ERROR(cudaSetDeviceFlags(cudaDeviceMapHost));
    /*
     * With device 0 selected, the buffers are allocated with
     * cudaHostAllocPortable; otherwise only device 0 would treat these
     * allocations as pinned memory. Portability is requested only here.
     */
    HANDLE_ERROR(cudaHostAlloc((void**)&a, N * sizeof(float),
                               cudaHostAllocWriteCombined |
                               cudaHostAllocPortable |
                               cudaHostAllocMapped));
    HANDLE_ERROR(cudaHostAlloc((void**)&b, N * sizeof(float),
                               cudaHostAllocWriteCombined |
                               cudaHostAllocPortable |
                               cudaHostAllocMapped));

    // fill in the host memory with data
    for (int i = 0; i < N; i++) {
        a[i] = i;
        b[i] = i * 2;
    }

    // prepare for multithread: device 0 takes the first half, device 1 the rest
    DataStruct  data[2];
    data[0].deviceID = 0;
    data[0].offset = 0;
    data[0].size = N / 2;
    data[0].a = a;
    data[0].b = b;

    data[1].deviceID = 1;
    data[1].offset = N / 2;
    // N - N / 2 equals N / 2 for even N and keeps the last element if N is odd.
    data[1].size = N - N / 2;
    data[1].a = a;
    data[1].b = b;

    CUTThread   thread = start_thread(routine, &(data[1]));
    routine(&(data[0]));
    end_thread(thread);


    // free memory on the CPU side
    HANDLE_ERROR(cudaFreeHost(a));
    HANDLE_ERROR(cudaFreeHost(b));

    printf("Value calculated:  %f\n",
           data[0].returnValue + data[1].returnValue);

    return 0;
}
可移动固定内存测试
声明:以上内容来自用户投稿及互联网公开渠道收集整理发布,本网站不拥有所有权,未作人工编辑处理,也不承担相关法律责任,若内容有误或涉及侵权可进行投诉: 投诉/举报 工作人员会在5个工作日内联系你,一经查实,本站将立刻删除涉嫌侵权内容。