首页 > 代码库 > basic_double_stream_incorrect

basic_double_stream_incorrect

不合理的代码：本例在循环的每次迭代中，先把 stream0 的「复制→核函数→复制」整条流水线全部排队，再排 stream1 的（深度优先排队）。这种顺序使两条流的复制操作互相串行，复制与计算几乎无法重叠，因此是用于对比的低效（incorrect）版本。

  1 /*  2 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.  3 *  4 * NVIDIA Corporation and its licensors retain all intellectual property and  5 * proprietary rights in and to this software and related documentation.  6 * Any use, reproduction, disclosure, or distribution of this software  7 * and related documentation without an express license agreement from  8 * NVIDIA Corporation is strictly prohibited.  9 * 10 * Please refer to the applicable NVIDIA end user license agreement (EULA) 11 * associated with this source code for terms and conditions that govern 12 * your use of this NVIDIA software. 13 * 14 */ 15  16  17 #include "../common/book.h" 18 #include "cuda.h" 19 #include "cuda_runtime.h" 20 #include "device_launch_parameters.h" 21 #define N   (1024*1024) 22 #define FULL_DATA_SIZE   (N*20) 23  24  25 __global__ void kernel(int *a, int *b, int *c) { 26     int idx = threadIdx.x + blockIdx.x * blockDim.x; 27     if (idx < N) { 28         int idx1 = (idx + 1) % 256; 29         int idx2 = (idx + 2) % 256; 30         float   as = (a[idx] + a[idx1] + a[idx2]) / 3.0f; 31         float   bs = (b[idx] + b[idx1] + b[idx2]) / 3.0f; 32         c[idx] = (as + bs) / 2; 33     } 34 } 35  36  37 int main(void) { 38     cudaDeviceProp  prop; 39     int whichDevice; 40     HANDLE_ERROR(cudaGetDevice(&whichDevice)); 41     HANDLE_ERROR(cudaGetDeviceProperties(&prop, whichDevice)); 42     if (!prop.deviceOverlap) { 43         printf("Device will not handle overlaps, so no speed up from streams\n"); 44         return 0; 45     } 46  47     cudaEvent_t     start, stop; 48     float           elapsedTime; 49  50     cudaStream_t    stream0, stream1; 51     int *host_a, *host_b, *host_c; 52     int *dev_a0, *dev_b0, *dev_c0; 53     int *dev_a1, *dev_b1, *dev_c1; 54  55     // start the timers 56     HANDLE_ERROR(cudaEventCreate(&start)); 57     HANDLE_ERROR(cudaEventCreate(&stop)); 58  59     // initialize the streams 60     
HANDLE_ERROR(cudaStreamCreate(&stream0)); 61     HANDLE_ERROR(cudaStreamCreate(&stream1)); 62  63     // allocate the memory on the GPU 64     HANDLE_ERROR(cudaMalloc((void**)&dev_a0, 65         N * sizeof(int))); 66     HANDLE_ERROR(cudaMalloc((void**)&dev_b0, 67         N * sizeof(int))); 68     HANDLE_ERROR(cudaMalloc((void**)&dev_c0, 69         N * sizeof(int))); 70     HANDLE_ERROR(cudaMalloc((void**)&dev_a1, 71         N * sizeof(int))); 72     HANDLE_ERROR(cudaMalloc((void**)&dev_b1, 73         N * sizeof(int))); 74     HANDLE_ERROR(cudaMalloc((void**)&dev_c1, 75         N * sizeof(int))); 76  77     // allocate host locked memory, used to stream 78     HANDLE_ERROR(cudaHostAlloc((void**)&host_a, 79         FULL_DATA_SIZE * sizeof(int), 80         cudaHostAllocDefault)); 81     HANDLE_ERROR(cudaHostAlloc((void**)&host_b, 82         FULL_DATA_SIZE * sizeof(int), 83         cudaHostAllocDefault)); 84     HANDLE_ERROR(cudaHostAlloc((void**)&host_c, 85         FULL_DATA_SIZE * sizeof(int), 86         cudaHostAllocDefault)); 87  88     for (int i = 0; i<FULL_DATA_SIZE; i++) { 89         host_a[i] = rand(); 90         host_b[i] = rand(); 91     } 92  93     HANDLE_ERROR(cudaEventRecord(start, 0)); 94     // now loop over full data, in bite-sized chunks 95     for (int i = 0; i<FULL_DATA_SIZE; i += N * 2) { 96         // copy the locked memory to the device, async 97         HANDLE_ERROR(cudaMemcpyAsync(dev_a0, host_a + i, 98             N * sizeof(int), 99             cudaMemcpyHostToDevice,100             stream0));101         HANDLE_ERROR(cudaMemcpyAsync(dev_b0, host_b + i,102             N * sizeof(int),103             cudaMemcpyHostToDevice,104             stream0));105 106         kernel << <N / 256, 256, 0, stream0 >> >(dev_a0, dev_b0, dev_c0);107 108         // copy the data from device to locked memory109         HANDLE_ERROR(cudaMemcpyAsync(host_c + i, dev_c0,110             N * sizeof(int),111             cudaMemcpyDeviceToHost,112             
stream0));113 114 115         // copy the locked memory to the device, async116         HANDLE_ERROR(cudaMemcpyAsync(dev_a1, host_a + i + N,117             N * sizeof(int),118             cudaMemcpyHostToDevice,119             stream1));120         HANDLE_ERROR(cudaMemcpyAsync(dev_b1, host_b + i + N,121             N * sizeof(int),122             cudaMemcpyHostToDevice,123             stream1));124 125         kernel << <N / 256, 256, 0, stream1 >> >(dev_a1, dev_b1, dev_c1);126 127         // copy the data from device to locked memory128         HANDLE_ERROR(cudaMemcpyAsync(host_c + i + N, dev_c1,129             N * sizeof(int),130             cudaMemcpyDeviceToHost,131             stream1));132     }133     HANDLE_ERROR(cudaStreamSynchronize(stream0));134     HANDLE_ERROR(cudaStreamSynchronize(stream1));135 136     HANDLE_ERROR(cudaEventRecord(stop, 0));137 138     HANDLE_ERROR(cudaEventSynchronize(stop));139     HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime,140         start, stop));141     printf("Time taken:  %3.1f ms\n", elapsedTime);142 143     // cleanup the streams and memory144     HANDLE_ERROR(cudaFreeHost(host_a));145     HANDLE_ERROR(cudaFreeHost(host_b));146     HANDLE_ERROR(cudaFreeHost(host_c));147     HANDLE_ERROR(cudaFree(dev_a0));148     HANDLE_ERROR(cudaFree(dev_b0));149     HANDLE_ERROR(cudaFree(dev_c0));150     HANDLE_ERROR(cudaFree(dev_a1));151     HANDLE_ERROR(cudaFree(dev_b1));152     HANDLE_ERROR(cudaFree(dev_c1));153     HANDLE_ERROR(cudaStreamDestroy(stream0));154     HANDLE_ERROR(cudaStreamDestroy(stream1));155 156     return 0;157 }

代码下载

basic_double_stream_incorrect