首页 > 代码库 > 利用两个流进行操作演示

利用两个流进行操作演示

项目下载链接

  /*
   * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
   *
   * NVIDIA Corporation and its licensors retain all intellectual property and
   * proprietary rights in and to this software and related documentation.
   * Any use, reproduction, disclosure, or distribution of this software
   * and related documentation without an express license agreement from
   * NVIDIA Corporation is strictly prohibited.
   *
   * Please refer to the applicable NVIDIA end user license agreement (EULA)
   * associated with this source code for terms and conditions that govern
   * your use of this NVIDIA software.
   *
   */


#include "../common/book.h"
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define N   (1024*1024)
#define FULL_DATA_SIZE   (N*20)

// main() consumes the host buffers two chunks of N elements at a time (one
// chunk per stream); reject sizes that would make the last pass overrun them.
#if (FULL_DATA_SIZE % (N * 2)) != 0
#error "FULL_DATA_SIZE must be a multiple of 2*N"
#endif


/*
 * For every idx in [0, N): averages a[idx] with two other elements of a,
 * does the same for b, and stores the mean of those two averages (truncated
 * to int) in c[idx].
 *
 * Launch layout: 1D grid of 1D blocks; threads whose flat index is >= N do
 * nothing, so the grid may be rounded up.  No shared memory is used.
 * a, b and c must each hold at least N ints.
 *
 * NOTE(review): the two extra indices are reduced modulo 256, so they always
 * select elements 0..255 of the chunk rather than neighbours of idx.  Kept
 * exactly as in the original sample so results do not change -- confirm
 * whether this is intended.
 * NOTE(review): the three-element sums are computed in int before the
 * division; with large inputs they could overflow -- verify against the
 * range of rand() on the target platform.
 */
__global__ void kernel(int *a, int *b, int *c) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N) {
        int idx1 = (idx + 1) % 256;
        int idx2 = (idx + 2) % 256;
        float   as = (a[idx] + a[idx1] + a[idx2]) / 3.0f;
        float   bs = (b[idx] + b[idx1] + b[idx2]) / 3.0f;
        c[idx] = (as + bs) / 2;
    }
}


/*
 * Processes FULL_DATA_SIZE ints in chunks of N using two CUDA streams, each
 * with its own set of device buffers, and prints the elapsed time measured
 * with CUDA events.  Returns 0 both on success and when the device reports
 * that it cannot overlap copies with kernel execution.
 */
int main(void) {
    cudaDeviceProp  prop;
    int whichDevice;
    HANDLE_ERROR(cudaGetDevice(&whichDevice));
    HANDLE_ERROR(cudaGetDeviceProperties(&prop, whichDevice));
    if (!prop.deviceOverlap) {
        printf("Device will not handle overlaps, so no speed up from streams\n");
        return 0;
    }

    // launch configuration: ceil-div so a change of N cannot silently drop
    // the tail (the kernel guards idx < N)
    const int threadsPerBlock = 256;
    const int blocksPerChunk = (N + threadsPerBlock - 1) / threadsPerBlock;

    cudaEvent_t     start, stop;
    float           elapsedTime;

    cudaStream_t    stream0, stream1;
    int *host_a, *host_b, *host_c;
    int *dev_a0, *dev_b0, *dev_c0;
    int *dev_a1, *dev_b1, *dev_c1;

    // create the timing events
    HANDLE_ERROR(cudaEventCreate(&start));
    HANDLE_ERROR(cudaEventCreate(&stop));

    // create the two streams
    HANDLE_ERROR(cudaStreamCreate(&stream0));
    HANDLE_ERROR(cudaStreamCreate(&stream1));

    // allocate one chunk-sized set of device buffers per stream
    HANDLE_ERROR(cudaMalloc((void**)&dev_a0,
        N * sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_b0,
        N * sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_c0,
        N * sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_a1,
        N * sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_b1,
        N * sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_c1,
        N * sizeof(int)));

    // allocate page-locked host memory (used by the async copies below)
    HANDLE_ERROR(cudaHostAlloc((void**)&host_a,
        FULL_DATA_SIZE * sizeof(int),
        cudaHostAllocDefault));
    HANDLE_ERROR(cudaHostAlloc((void**)&host_b,
        FULL_DATA_SIZE * sizeof(int),
        cudaHostAllocDefault));
    HANDLE_ERROR(cudaHostAlloc((void**)&host_c,
        FULL_DATA_SIZE * sizeof(int),
        cudaHostAllocDefault));

    for (int i = 0; i < FULL_DATA_SIZE; i++) {
        host_a[i] = rand();
        host_b[i] = rand();
    }

    HANDLE_ERROR(cudaEventRecord(start, 0));
    // now loop over full data, in bite-sized chunks: each pass hands chunk
    // [i, i+N) to stream0 and chunk [i+N, i+2N) to stream1
    for (int i = 0; i < FULL_DATA_SIZE; i += N * 2) {
        // enqueue copies of a in stream0 and stream1
        HANDLE_ERROR(cudaMemcpyAsync(dev_a0, host_a + i,
            N * sizeof(int),
            cudaMemcpyHostToDevice,
            stream0));
        HANDLE_ERROR(cudaMemcpyAsync(dev_a1, host_a + i + N,
            N * sizeof(int),
            cudaMemcpyHostToDevice,
            stream1));
        // enqueue copies of b in stream0 and stream1
        HANDLE_ERROR(cudaMemcpyAsync(dev_b0, host_b + i,
            N * sizeof(int),
            cudaMemcpyHostToDevice,
            stream0));
        HANDLE_ERROR(cudaMemcpyAsync(dev_b1, host_b + i + N,
            N * sizeof(int),
            cudaMemcpyHostToDevice,
            stream1));

        // enqueue kernels in stream0 and stream1; a launch does not return
        // a status, so query it explicitly after each one
        kernel<<<blocksPerChunk, threadsPerBlock, 0, stream0>>>(dev_a0, dev_b0, dev_c0);
        HANDLE_ERROR(cudaGetLastError());
        kernel<<<blocksPerChunk, threadsPerBlock, 0, stream1>>>(dev_a1, dev_b1, dev_c1);
        HANDLE_ERROR(cudaGetLastError());

        // enqueue copies of the results from the device back to the
        // page-locked host buffer
        HANDLE_ERROR(cudaMemcpyAsync(host_c + i, dev_c0,
            N * sizeof(int),
            cudaMemcpyDeviceToHost,
            stream0));
        HANDLE_ERROR(cudaMemcpyAsync(host_c + i + N, dev_c1,
            N * sizeof(int),
            cudaMemcpyDeviceToHost,
            stream1));
    }
    // wait until all work queued in both streams has completed
    HANDLE_ERROR(cudaStreamSynchronize(stream0));
    HANDLE_ERROR(cudaStreamSynchronize(stream1));

    HANDLE_ERROR(cudaEventRecord(stop, 0));

    HANDLE_ERROR(cudaEventSynchronize(stop));
    HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime,
        start, stop));
    printf("Time taken:  %3.1f ms\n", elapsedTime);

    // cleanup the events, streams and memory
    HANDLE_ERROR(cudaEventDestroy(start));
    HANDLE_ERROR(cudaEventDestroy(stop));
    HANDLE_ERROR(cudaFreeHost(host_a));
    HANDLE_ERROR(cudaFreeHost(host_b));
    HANDLE_ERROR(cudaFreeHost(host_c));
    HANDLE_ERROR(cudaFree(dev_a0));
    HANDLE_ERROR(cudaFree(dev_b0));
    HANDLE_ERROR(cudaFree(dev_c0));
    HANDLE_ERROR(cudaFree(dev_a1));
    HANDLE_ERROR(cudaFree(dev_b1));
    HANDLE_ERROR(cudaFree(dev_c1));
    HANDLE_ERROR(cudaStreamDestroy(stream0));
    HANDLE_ERROR(cudaStreamDestroy(stream1));

    return 0;
}

 

利用两个流进行操作演示