首页 > 代码库 > cuda流测试=basic_single_stream

cuda流测试=basic_single_stream

cuda流测试

  /*
   * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
   *
   * NVIDIA Corporation and its licensors retain all intellectual property and
   * proprietary rights in and to this software and related documentation.
   * Any use, reproduction, disclosure, or distribution of this software
   * and related documentation without an express license agreement from
   * NVIDIA Corporation is strictly prohibited.
   *
   * Please refer to the applicable NVIDIA end user license agreement (EULA)
   * associated with this source code for terms and conditions that govern
   * your use of this NVIDIA software.
   *
   */

#include <stdio.h>
#include <stdlib.h>

#include "../common/book.h"
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

// Number of ints processed per chunk (one kernel launch handles one chunk).
#define N   (1024*1024)
// Total number of ints in each host buffer: 20 chunks of N.
#define FULL_DATA_SIZE   (N*20)


/*
 * For every element idx < N, averages three values taken from `a` and three
 * taken from `b`, then stores the mean of those two averages in c[idx].
 *
 * Launch layout: 1D grid, 1D blocks; main() launches N/256 blocks of 256
 * threads so that each thread handles exactly one element. No shared memory.
 *
 * a, b : device input buffers holding at least N ints
 * c    : device output buffer holding at least N ints
 *
 * The float result is implicitly truncated when stored into the int buffer.
 */
__global__ void kernel(int *a, int *b, int *c) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N) {
        // The two "neighbour" indices are wrapped modulo 256, so they always
        // fall in [0, 255] regardless of how large idx is.
        // NOTE(review): the original comment described these as "the two
        // values after idx"; confirm that wrapping at 256 (rather than at N
        // or at the block boundary) is really intended.
        int idx1 = (idx + 1) % 256;
        int idx2 = (idx + 2) % 256;
        float   as = (a[idx] + a[idx1] + a[idx2]) / 3.0f;
        float   bs = (b[idx] + b[idx1] + b[idx2]) / 3.0f;
        c[idx] = (as + bs) / 2;
    }
}


/*
 * Processes FULL_DATA_SIZE ints in chunks of N using a single CUDA stream:
 * for each chunk, the inputs are copied host->device, the kernel is launched,
 * and the result is copied device->host, all enqueued asynchronously on the
 * same stream. The elapsed time between the `start` and `stop` events is
 * printed in milliseconds.
 *
 * Returns 0 in all cases; CUDA API failures are routed through HANDLE_ERROR.
 */
int main(void) {
    cudaDeviceProp  prop;
    int whichDevice;
    HANDLE_ERROR(cudaGetDevice(&whichDevice));
    HANDLE_ERROR(cudaGetDeviceProperties(&prop, whichDevice));
    if (!prop.deviceOverlap) {
        // Nothing has been allocated yet, so a plain return is safe here.
        printf("Device will not handle overlaps, so no speed up from streams\n");
        return 0;
    }

    cudaEvent_t     start, stop;
    float           elapsedTime;

    cudaStream_t    stream;
    int *host_a, *host_b, *host_c;
    int *dev_a, *dev_b, *dev_c;

    // create the timing events
    HANDLE_ERROR(cudaEventCreate(&start));
    HANDLE_ERROR(cudaEventCreate(&stop));

    // create the stream all copies and kernel launches are enqueued on
    HANDLE_ERROR(cudaStreamCreate(&stream));

    // allocate one chunk (N ints) per buffer on the GPU
    HANDLE_ERROR(cudaMalloc((void**)&dev_a,
        N * sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_b,
        N * sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_c,
        N * sizeof(int)));

    // allocate page-locked (pinned) host memory for the full data set;
    // the buffers are used as source/destination of cudaMemcpyAsync below
    HANDLE_ERROR(cudaHostAlloc((void**)&host_a,
        FULL_DATA_SIZE * sizeof(int),
        cudaHostAllocDefault));
    HANDLE_ERROR(cudaHostAlloc((void**)&host_b,
        FULL_DATA_SIZE * sizeof(int),
        cudaHostAllocDefault));
    HANDLE_ERROR(cudaHostAlloc((void**)&host_c,
        FULL_DATA_SIZE * sizeof(int),
        cudaHostAllocDefault));

    // fill both inputs with pseudo-random values (rand() is left unseeded)
    for (int i = 0; i < FULL_DATA_SIZE; i++) {
        host_a[i] = rand();
        host_b[i] = rand();
    }

    // one thread per element of a chunk; ceil-div keeps the grid large enough
    // should N ever stop being a multiple of the block size
    const int threadsPerBlock = 256;
    const int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    HANDLE_ERROR(cudaEventRecord(start, 0));
    // now loop over full data, in bite-sized chunks
    for (int i = 0; i < FULL_DATA_SIZE; i += N) {
        // asynchronously copy this chunk of both inputs to the device
        HANDLE_ERROR(cudaMemcpyAsync(dev_a, host_a + i,
            N * sizeof(int),
            cudaMemcpyHostToDevice,
            stream));
        HANDLE_ERROR(cudaMemcpyAsync(dev_b, host_b + i,
            N * sizeof(int),
            cudaMemcpyHostToDevice,
            stream));

        kernel<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(dev_a, dev_b, dev_c);
        // a launch does not return a status itself; pick up launch errors
        // (e.g. an invalid configuration) without synchronizing
        HANDLE_ERROR(cudaGetLastError());

        // asynchronously copy the computed chunk back to the host
        HANDLE_ERROR(cudaMemcpyAsync(host_c + i, dev_c,
            N * sizeof(int),
            cudaMemcpyDeviceToHost,
            stream));

    }
    // block the host until everything enqueued on the stream has finished
    HANDLE_ERROR(cudaStreamSynchronize(stream));

    HANDLE_ERROR(cudaEventRecord(stop, 0));

    HANDLE_ERROR(cudaEventSynchronize(stop));
    HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime,
        start, stop));
    printf("Time taken:  %3.1f ms\n", elapsedTime);

    // cleanup the streams, events and memory
    HANDLE_ERROR(cudaFreeHost(host_a));
    HANDLE_ERROR(cudaFreeHost(host_b));
    HANDLE_ERROR(cudaFreeHost(host_c));
    HANDLE_ERROR(cudaFree(dev_a));
    HANDLE_ERROR(cudaFree(dev_b));
    HANDLE_ERROR(cudaFree(dev_c));
    HANDLE_ERROR(cudaStreamDestroy(stream));
    HANDLE_ERROR(cudaEventDestroy(start));
    HANDLE_ERROR(cudaEventDestroy(stop));

    return 0;
}

项目打包下载

cuda流测试=basic_single_stream