A demonstration of performing operations with two streams
Project download link
/*
 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation.
 * Any use, reproduction, disclosure, or distribution of this software
 * and related documentation without an express license agreement from
 * NVIDIA Corporation is strictly prohibited.
 *
 * Please refer to the applicable NVIDIA end user license agreement (EULA)
 * associated with this source code for terms and conditions that govern
 * your use of this NVIDIA software.
 *
 */


#include "../common/book.h"
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#define N (1024*1024)
#define FULL_DATA_SIZE (N*20)


__global__ void kernel(int *a, int *b, int *c) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N) {
        int idx1 = (idx + 1) % 256;
        int idx2 = (idx + 2) % 256;
        float as = (a[idx] + a[idx1] + a[idx2]) / 3.0f;
        float bs = (b[idx] + b[idx1] + b[idx2]) / 3.0f;
        c[idx] = (as + bs) / 2;
    }
}


int main(void) {
    cudaDeviceProp prop;
    int whichDevice;
    HANDLE_ERROR(cudaGetDevice(&whichDevice));
    HANDLE_ERROR(cudaGetDeviceProperties(&prop, whichDevice));
    if (!prop.deviceOverlap) {
        printf("Device will not handle overlaps, so no speed up from streams\n");
        return 0;
    }

    cudaEvent_t start, stop;
    float elapsedTime;

    cudaStream_t stream0, stream1;
    int *host_a, *host_b, *host_c;
    int *dev_a0, *dev_b0, *dev_c0;
    int *dev_a1, *dev_b1, *dev_c1;

    // start the timers
    HANDLE_ERROR(cudaEventCreate(&start));
    HANDLE_ERROR(cudaEventCreate(&stop));

    // initialize the two streams
    HANDLE_ERROR(cudaStreamCreate(&stream0));
    HANDLE_ERROR(cudaStreamCreate(&stream1));

    // allocate the memory on the GPU
    HANDLE_ERROR(cudaMalloc((void**)&dev_a0, N * sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_b0, N * sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_c0, N * sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_a1, N * sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_b1, N * sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_c1, N * sizeof(int)));

    // allocate page-locked (pinned) memory on the host
    HANDLE_ERROR(cudaHostAlloc((void**)&host_a,
                               FULL_DATA_SIZE * sizeof(int),
                               cudaHostAllocDefault));
    HANDLE_ERROR(cudaHostAlloc((void**)&host_b,
                               FULL_DATA_SIZE * sizeof(int),
                               cudaHostAllocDefault));
    HANDLE_ERROR(cudaHostAlloc((void**)&host_c,
                               FULL_DATA_SIZE * sizeof(int),
                               cudaHostAllocDefault));

    for (int i = 0; i < FULL_DATA_SIZE; i++) {
        host_a[i] = rand();
        host_b[i] = rand();
    }

    HANDLE_ERROR(cudaEventRecord(start, 0));
    // now loop over full data, in bite-sized chunks
    for (int i = 0; i < FULL_DATA_SIZE; i += N * 2) {
        // enqueue copies of a in stream0 and stream1
        HANDLE_ERROR(cudaMemcpyAsync(dev_a0, host_a + i,
                                     N * sizeof(int),
                                     cudaMemcpyHostToDevice,
                                     stream0));
        HANDLE_ERROR(cudaMemcpyAsync(dev_a1, host_a + i + N,
                                     N * sizeof(int),
                                     cudaMemcpyHostToDevice,
                                     stream1));
        // enqueue copies of b in stream0 and stream1
        HANDLE_ERROR(cudaMemcpyAsync(dev_b0, host_b + i,
                                     N * sizeof(int),
                                     cudaMemcpyHostToDevice,
                                     stream0));
        HANDLE_ERROR(cudaMemcpyAsync(dev_b1, host_b + i + N,
                                     N * sizeof(int),
                                     cudaMemcpyHostToDevice,
                                     stream1));

        // enqueue kernels in stream0 and stream1
        kernel<<<N / 256, 256, 0, stream0>>>(dev_a0, dev_b0, dev_c0);
        kernel<<<N / 256, 256, 0, stream1>>>(dev_a1, dev_b1, dev_c1);

        // copy the results from the device back to the page-locked host memory
        HANDLE_ERROR(cudaMemcpyAsync(host_c + i, dev_c0,
                                     N * sizeof(int),
                                     cudaMemcpyDeviceToHost,
                                     stream0));
        HANDLE_ERROR(cudaMemcpyAsync(host_c + i + N, dev_c1,
                                     N * sizeof(int),
                                     cudaMemcpyDeviceToHost,
                                     stream1));
    }
    // wait until both streams have drained all of their queued work
    HANDLE_ERROR(cudaStreamSynchronize(stream0));
    HANDLE_ERROR(cudaStreamSynchronize(stream1));

    HANDLE_ERROR(cudaEventRecord(stop, 0));

    HANDLE_ERROR(cudaEventSynchronize(stop));
    HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime, start, stop));
    printf("Time taken: %3.1f ms\n", elapsedTime);

    // cleanup the streams and memory
    HANDLE_ERROR(cudaFreeHost(host_a));
    HANDLE_ERROR(cudaFreeHost(host_b));
    HANDLE_ERROR(cudaFreeHost(host_c));
    HANDLE_ERROR(cudaFree(dev_a0));
    HANDLE_ERROR(cudaFree(dev_b0));
    HANDLE_ERROR(cudaFree(dev_c0));
    HANDLE_ERROR(cudaFree(dev_a1));
    HANDLE_ERROR(cudaFree(dev_b1));
    HANDLE_ERROR(cudaFree(dev_c1));
    HANDLE_ERROR(cudaStreamDestroy(stream0));
    HANDLE_ERROR(cudaStreamDestroy(stream1));

    return 0;
}
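The HANDLE_ERROR macro comes from the book's ../common/book.h header, which is not included in this post. If you do not have that file, a minimal stand-in along the following lines should be enough to compile the listing; this is an illustrative equivalent written for this page, not the book's exact definition.

// Minimal stand-in for HANDLE_ERROR from the book's common/book.h (an assumption,
// not the original macro): check a CUDA runtime call and abort on failure.
#include <stdio.h>
#include <stdlib.h>
#include "cuda_runtime.h"

static void handleError(cudaError_t err, const char *file, int line) {
    if (err != cudaSuccess) {
        // Report the CUDA error string with the failing location, then exit.
        printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
        exit(EXIT_FAILURE);
    }
}
#define HANDLE_ERROR(err) (handleError((err), __FILE__, __LINE__))

With either the book's header or the stand-in above in place, the demo can be built with nvcc; the file and output names below are just examples, not part of the original project:

nvcc stream_demo.cu -o stream_demo
./stream_demo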