首页 > 代码库 > 可移动固定内存测试

可移动固定内存测试

可移动固定内存测试,项目打包下载

  // Portable pinned-memory test: a dot product split across two GPUs that both
// read the same mapped (zero-copy) host buffers.
//
// The host buffers are allocated once with cudaHostAllocPortable |
// cudaHostAllocMapped, so each device can obtain its own device pointer to
// them. One host thread per device launches the `dot` kernel on its share of
// the data; the partial results are summed on the CPU.
#include "../common/book.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "device_functions.h"
#include <stdio.h>
#include <stdlib.h>

// Minimum of two values. Arguments are fully parenthesised so the macro
// expands correctly when it is handed expressions. Arguments are evaluated
// more than once, so do not pass expressions with side effects.
#define imin(a,b) ((a) < (b) ? (a) : (b))

// Total number of elements in each input vector.
#define     N    (33*1024*1024)

// Launch configuration. threadsPerBlock must be a power of two because of the
// halving reduction in `dot`.
const int threadsPerBlock = 256;
const int blocksPerGrid =
    imin(32, (N / 2 + threadsPerBlock - 1) / threadsPerBlock);


// Computes per-block partial sums of the dot product a . b.
//
//   size  number of elements of `a` and `b` to process
//   a, b  input vectors (device-accessible pointers)
//   c     output array with one float per block; c[blockIdx.x] receives the
//         sum of the products handled by that block
//
// Launch requirements: 1D grid and 1D block, with blockDim.x equal to
// threadsPerBlock (the shared cache is sized by that constant) and a power of
// two (required by the reduction loop below).
__global__ void dot(int size, float *a, float *b, float *c) {
    __shared__ float cache[threadsPerBlock];
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int cacheIndex = threadIdx.x;

    // Each thread accumulates the products of the elements it owns, stepping
    // by the total number of threads in the grid.
    float   temp = 0.0f;
    while (tid < size) {
        temp += a[tid] * b[tid];
        tid += blockDim.x * gridDim.x;
    }

    // Publish this thread's partial sum to the block-wide cache.
    cache[cacheIndex] = temp;

    // Synchronise the threads of the block so every cache entry is written
    // before any thread reads a neighbour's entry.
    __syncthreads();

    // For reductions, threadsPerBlock must be a power of 2 because of the
    // following code: each pass folds the upper half of the cache onto the
    // lower half. The barrier sits outside the `if` so all threads reach it.
    int i = blockDim.x / 2;
    while (i != 0) {
        if (cacheIndex < i)
            cache[cacheIndex] += cache[cacheIndex + i];
        __syncthreads();
        i /= 2;
    }

    // Thread 0 of each block writes the block's total.
    if (cacheIndex == 0)
        c[blockIdx.x] = cache[0];
}


// Per-thread work description and result slot.
struct DataStruct {
    int     deviceID;      // CUDA device this worker runs on
    int     size;          // number of elements this worker processes
    int     offset;        // index of the worker's first element in a and b
    float   *a;            // host pointer to the full first input vector
    float   *b;            // host pointer to the full second input vector
    float   returnValue;   // written by routine(): this worker's partial dot product
};

// Worker entry point: computes the dot product of one slice of the inputs on
// the device named in the DataStruct passed via `pvoidData`, and stores the
// result in its `returnValue` field. Always returns 0.
//
// NOTE(review): the WINAPI signature is Windows-specific; the alternative
// signature is kept below for reference.
unsigned WINAPI routine(void *pvoidData)
//void* routine(void *pvoidData)
{
    DataStruct  *data = (DataStruct*)pvoidData;

    // main() already performed this setup for device 0, so it is only done
    // here for the other device(s).
    if (data->deviceID != 0) {
        HANDLE_ERROR(cudaSetDevice(data->deviceID));
        // Tell the runtime that mapped (zero-copy) host memory will be used
        // on this device. Portability does not need to be requested again:
        // the buffers were allocated as portable in main().
        HANDLE_ERROR(cudaSetDeviceFlags(cudaDeviceMapHost));
    }

    int     size = data->size;
    float   *a, *b, c, *partial_c;
    float   *dev_a, *dev_b, *dev_partial_c;

    // Host-side buffers: the inputs come from the caller, the per-block
    // results are allocated here.
    a = data->a;
    b = data->b;
    partial_c = (float*)malloc(blocksPerGrid*sizeof(float));
    if (partial_c == NULL) {
        // Do not hand a null pointer to cudaMemcpy below.
        printf("Host memory allocation failed in %s at line %d\n",
            __FILE__, __LINE__);
        exit(EXIT_FAILURE);
    }

    // Obtain this device's pointers to the mapped host inputs and allocate
    // the per-block output on the GPU.
    HANDLE_ERROR(cudaHostGetDevicePointer(&dev_a, a, 0));
    HANDLE_ERROR(cudaHostGetDevicePointer(&dev_b, b, 0));
    HANDLE_ERROR(cudaMalloc((void**)&dev_partial_c,
        blocksPerGrid*sizeof(float)));

    // Offset 'a' and 'b' to where this GPU gets its data.
    dev_a += data->offset;
    dev_b += data->offset;

    dot<<<blocksPerGrid, threadsPerBlock>>>(size, dev_a, dev_b,
        dev_partial_c);
    // A kernel launch returns no status itself; fetch launch errors here.
    HANDLE_ERROR(cudaGetLastError());

    // Copy the per-block sums back from the GPU to the CPU.
    HANDLE_ERROR(cudaMemcpy(partial_c, dev_partial_c,
        blocksPerGrid*sizeof(float),
        cudaMemcpyDeviceToHost));

    // Finish the reduction on the CPU side.
    c = 0;
    for (int i = 0; i<blocksPerGrid; i++) {
        c += partial_c[i];
    }

    HANDLE_ERROR(cudaFree(dev_partial_c));

    // Free memory on the CPU side.
    free(partial_c);

    data->returnValue = c;
    return 0;
}


// Allocates the portable mapped inputs, runs one worker per GPU (device 1 on
// a second thread, device 0 on the calling thread) and prints the combined
// dot product. Returns 0 in all cases, including when fewer than two
// suitable devices are found.
int main(void) {
    int deviceCount;
    HANDLE_ERROR(cudaGetDeviceCount(&deviceCount));
    if (deviceCount < 2) {
        printf("We need at least two compute 1.0 or greater "
            "devices, but only found %d\n", deviceCount);
        return 0;
    }

    // Both devices used below must be able to map host memory.
    cudaDeviceProp  prop;
    for (int i = 0; i<2; i++) {
        HANDLE_ERROR(cudaGetDeviceProperties(&prop, i));
        if (prop.canMapHostMemory != 1) {
            printf("Device %d can not map memory.\n", i);
            return 0;
        }
    }

    float *a, *b;
    HANDLE_ERROR(cudaSetDevice(0));
    HANDLE_ERROR(cudaSetDeviceFlags(cudaDeviceMapHost));
    /*
    With device 0 selected, the buffers are allocated with
    cudaHostAllocPortable; otherwise only device 0 would treat these
    allocations as pinned memory. The portable flag is therefore only
    requested here, while device 0 is current.
    */
    HANDLE_ERROR(cudaHostAlloc((void**)&a, N*sizeof(float),
        cudaHostAllocWriteCombined |
        cudaHostAllocPortable |
        cudaHostAllocMapped));
    HANDLE_ERROR(cudaHostAlloc((void**)&b, N*sizeof(float),
        cudaHostAllocWriteCombined |
        cudaHostAllocPortable |
        cudaHostAllocMapped));

    // Fill in the host memory with data.
    for (int i = 0; i<N; i++) {
        a[i] = i;
        b[i] = i * 2;
    }

    // Prepare for multithreading: split the vectors into two slices. The
    // second slice takes whatever the first leaves, so no element is dropped
    // if N is ever made odd (for the current N both slices are N / 2).
    const int firstHalf = N / 2;

    DataStruct  data[2];
    data[0].deviceID = 0;
    data[0].offset = 0;
    data[0].size = firstHalf;
    data[0].a = a;
    data[0].b = b;

    data[1].deviceID = 1;
    data[1].offset = firstHalf;
    data[1].size = N - firstHalf;
    data[1].a = a;
    data[1].b = b;

    // Device 1 runs on a new thread while device 0 runs on this one; wait
    // for the new thread before reading its result.
    CUTThread   thread = start_thread(routine, &(data[1]));
    routine(&(data[0]));
    end_thread(thread);


    // Free memory on the CPU side.
    HANDLE_ERROR(cudaFreeHost(a));
    HANDLE_ERROR(cudaFreeHost(b));

    printf("Value calculated:  %f\n",
        data[0].returnValue + data[1].returnValue);

    return 0;
}

 

可移动固定内存测试