首页 > 代码库 > basic_double_stream_incorrect
basic_double_stream_incorrect
不合理的代码
1 /* 2 * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. 3 * 4 * NVIDIA Corporation and its licensors retain all intellectual property and 5 * proprietary rights in and to this software and related documentation. 6 * Any use, reproduction, disclosure, or distribution of this software 7 * and related documentation without an express license agreement from 8 * NVIDIA Corporation is strictly prohibited. 9 * 10 * Please refer to the applicable NVIDIA end user license agreement (EULA) 11 * associated with this source code for terms and conditions that govern 12 * your use of this NVIDIA software. 13 * 14 */ 15 16 17 #include "../common/book.h" 18 #include "cuda.h" 19 #include "cuda_runtime.h" 20 #include "device_launch_parameters.h" 21 #define N (1024*1024) 22 #define FULL_DATA_SIZE (N*20) 23 24 25 __global__ void kernel(int *a, int *b, int *c) { 26 int idx = threadIdx.x + blockIdx.x * blockDim.x; 27 if (idx < N) { 28 int idx1 = (idx + 1) % 256; 29 int idx2 = (idx + 2) % 256; 30 float as = (a[idx] + a[idx1] + a[idx2]) / 3.0f; 31 float bs = (b[idx] + b[idx1] + b[idx2]) / 3.0f; 32 c[idx] = (as + bs) / 2; 33 } 34 } 35 36 37 int main(void) { 38 cudaDeviceProp prop; 39 int whichDevice; 40 HANDLE_ERROR(cudaGetDevice(&whichDevice)); 41 HANDLE_ERROR(cudaGetDeviceProperties(&prop, whichDevice)); 42 if (!prop.deviceOverlap) { 43 printf("Device will not handle overlaps, so no speed up from streams\n"); 44 return 0; 45 } 46 47 cudaEvent_t start, stop; 48 float elapsedTime; 49 50 cudaStream_t stream0, stream1; 51 int *host_a, *host_b, *host_c; 52 int *dev_a0, *dev_b0, *dev_c0; 53 int *dev_a1, *dev_b1, *dev_c1; 54 55 // start the timers 56 HANDLE_ERROR(cudaEventCreate(&start)); 57 HANDLE_ERROR(cudaEventCreate(&stop)); 58 59 // initialize the streams 60 HANDLE_ERROR(cudaStreamCreate(&stream0)); 61 HANDLE_ERROR(cudaStreamCreate(&stream1)); 62 63 // allocate the memory on the GPU 64 HANDLE_ERROR(cudaMalloc((void**)&dev_a0, 65 N * sizeof(int))); 66 HANDLE_ERROR(cudaMalloc((void**)&dev_b0, 67 N * sizeof(int))); 68 HANDLE_ERROR(cudaMalloc((void**)&dev_c0, 69 N * sizeof(int))); 70 HANDLE_ERROR(cudaMalloc((void**)&dev_a1, 71 N * sizeof(int))); 72 HANDLE_ERROR(cudaMalloc((void**)&dev_b1, 73 N * sizeof(int))); 74 HANDLE_ERROR(cudaMalloc((void**)&dev_c1, 75 N * sizeof(int))); 76 77 // allocate host locked memory, used to stream 78 HANDLE_ERROR(cudaHostAlloc((void**)&host_a, 79 FULL_DATA_SIZE * sizeof(int), 80 cudaHostAllocDefault)); 81 HANDLE_ERROR(cudaHostAlloc((void**)&host_b, 82 FULL_DATA_SIZE * sizeof(int), 83 cudaHostAllocDefault)); 84 HANDLE_ERROR(cudaHostAlloc((void**)&host_c, 85 FULL_DATA_SIZE * sizeof(int), 86 cudaHostAllocDefault)); 87 88 for (int i = 0; i<FULL_DATA_SIZE; i++) { 89 host_a[i] = rand(); 90 host_b[i] = rand(); 91 } 92 93 HANDLE_ERROR(cudaEventRecord(start, 0)); 94 // now loop over full data, in bite-sized chunks 95 for (int i = 0; i<FULL_DATA_SIZE; i += N * 2) { 96 // copy the locked memory to the device, async 97 HANDLE_ERROR(cudaMemcpyAsync(dev_a0, host_a + i, 98 N * sizeof(int), 99 cudaMemcpyHostToDevice,100 stream0));101 HANDLE_ERROR(cudaMemcpyAsync(dev_b0, host_b + i,102 N * sizeof(int),103 cudaMemcpyHostToDevice,104 stream0));105 106 kernel << <N / 256, 256, 0, stream0 >> >(dev_a0, dev_b0, dev_c0);107 108 // copy the data from device to locked memory109 HANDLE_ERROR(cudaMemcpyAsync(host_c + i, dev_c0,110 N * sizeof(int),111 cudaMemcpyDeviceToHost,112 stream0));113 114 115 // copy the locked memory to the device, async116 HANDLE_ERROR(cudaMemcpyAsync(dev_a1, host_a + i + N,117 N * sizeof(int),118 cudaMemcpyHostToDevice,119 stream1));120 HANDLE_ERROR(cudaMemcpyAsync(dev_b1, host_b + i + N,121 N * sizeof(int),122 cudaMemcpyHostToDevice,123 stream1));124 125 kernel << <N / 256, 256, 0, stream1 >> >(dev_a1, dev_b1, dev_c1);126 127 // copy the data from device to locked memory128 HANDLE_ERROR(cudaMemcpyAsync(host_c + i + N, dev_c1,129 N * sizeof(int),130 cudaMemcpyDeviceToHost,131 stream1));132 }133 HANDLE_ERROR(cudaStreamSynchronize(stream0));134 HANDLE_ERROR(cudaStreamSynchronize(stream1));135 136 HANDLE_ERROR(cudaEventRecord(stop, 0));137 138 HANDLE_ERROR(cudaEventSynchronize(stop));139 HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime,140 start, stop));141 printf("Time taken: %3.1f ms\n", elapsedTime);142 143 // cleanup the streams and memory144 HANDLE_ERROR(cudaFreeHost(host_a));145 HANDLE_ERROR(cudaFreeHost(host_b));146 HANDLE_ERROR(cudaFreeHost(host_c));147 HANDLE_ERROR(cudaFree(dev_a0));148 HANDLE_ERROR(cudaFree(dev_b0));149 HANDLE_ERROR(cudaFree(dev_c0));150 HANDLE_ERROR(cudaFree(dev_a1));151 HANDLE_ERROR(cudaFree(dev_b1));152 HANDLE_ERROR(cudaFree(dev_c1));153 HANDLE_ERROR(cudaStreamDestroy(stream0));154 HANDLE_ERROR(cudaStreamDestroy(stream1));155 156 return 0;157 }
代码下载
basic_double_stream_incorrect
声明:以上内容来自用户投稿及互联网公开渠道收集整理发布,本网站不拥有所有权,未作人工编辑处理,也不承担相关法律责任,若内容有误或涉及侵权可进行投诉: 投诉/举报 工作人员会在5个工作日内联系你,一经查实,本站将立刻删除涉嫌侵权内容。