首页 > 代码库 > cuda多线程间通信

cuda多线程间通信

  1 #include "cuda_runtime.h"  2 #include "device_launch_parameters.h"  3   4 #include <stdio.h>  5 #include <time.h>  6 #include <stdlib.h>  7   8 #define MAX 120  9 #define MIN 0 10 cudaError_t addWithCuda(int *c, const int *a, size_t size); 11  12 __global__ void addKernel(int *c, const int *a) { 13     int i = threadIdx.x; 14     extern __shared__ int smem[]; 15     smem[i] = a[i]; 16     __syncthreads(); 17     if (i == 0)  // 0号线程做平方和 18             { 19         c[0] = 0; 20         for (int d = 0; d < 5; d++) { 21             c[0] += smem[d] * smem[d]; 22         } 23     } 24     if (i == 1)  //1号线程做累加 25             { 26         c[1] = 0; 27         for (int d = 0; d < 5; d++) { 28             c[1] += smem[d]; 29         } 30     } 31     if (i == 2)  //2号线程做累乘 32             { 33         c[2] = 1; 34         for (int d = 0; d < 5; d++) { 35             c[2] = smem[d]; 36         } 37  38     } 39  40     if (i == 3)  //3号线程做异或 41                 { 42             c[3] = 0; 43             for (int d = 0; d < 5; d++) { 44                 c[3] ^= smem[d]; 45             } 46  47         } 48 } 49  50 int main() { 51     const int arraySize = 5; 52     srand((unsigned) time(NULL)); 53     const int a[arraySize] = { rand() % (MAX + 1 - MIN) + MIN, rand() 54             % (MAX + 1 - MIN) + MIN, rand() % (MAX + 1 - MIN) + MIN, rand() 55             % (MAX + 1 - MIN) + MIN, rand() % (MAX + 1 - MIN) + MIN }; 56     int c[arraySize] = { 0 }; 57     // Add vectors in parallel. 58     cudaError_t cudaStatus = addWithCuda(c, a, arraySize); 59     if (cudaStatus != cudaSuccess) { 60         fprintf(stderr, "addWithCuda failed!"); 61         return 1; 62     } 63     printf( 64             "\t%d+%d+%d+%d+%d = %d\n\t%d^2+%d^2+%d^2+%d^2+%d^2 = %d\n\t%d*%d*%d*%d*%d = %d\n\t%d^%d^%d^%d^%d = %d\n\n\n\n\n", 65             a[0], a[1], a[2], a[3], a[4], c[1], a[0], a[1], a[2], a[3], a[4], 66             c[0], a[0], a[1], a[2], a[3], a[4], c[2],a[0], a[1], a[2], a[3], a[4], c[3]); 67     // cudaThreadExit must be called before exiting in order for profiling and 68     // tracing tools such as Nsight and Visual Profiler to show complete traces. 69     cudaStatus = cudaThreadExit(); 70     if (cudaStatus != cudaSuccess) { 71         fprintf(stderr, "cudaThreadExit failed!"); 72         return 1; 73     } 74     return 0; 75 } 76  77 // Helper function for using CUDA to add vectors in parallel. 78 cudaError_t addWithCuda(int *c, const int *a, size_t size) { 79     int *dev_a = 0; 80     int *dev_c = 0; 81     cudaError_t cudaStatus; 82  83     // Choose which GPU to run on, change this on a multi-GPU system. 84     cudaStatus = cudaSetDevice(0); 85     if (cudaStatus != cudaSuccess) { 86         fprintf(stderr, 87                 "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?"); 88         goto Error; 89     } 90  91     // Allocate GPU buffers for three vectors (two input, one output)    . 92     cudaStatus = cudaMalloc((void**) &dev_c, size * sizeof(int)); 93     if (cudaStatus != cudaSuccess) { 94         fprintf(stderr, "cudaMalloc failed!"); 95         goto Error; 96     } 97  98     cudaStatus = cudaMalloc((void**) &dev_a, size * sizeof(int)); 99     if (cudaStatus != cudaSuccess) {100         fprintf(stderr, "cudaMalloc failed!");101         goto Error;102     }103     // Copy input vectors from host memory to GPU buffers.104     cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int),105             cudaMemcpyHostToDevice);106     if (cudaStatus != cudaSuccess) {107         fprintf(stderr, "cudaMemcpy failed!");108         goto Error;109     }110     // Launch a kernel on the GPU with one thread for each element.111     addKernel<<<1, size, size * sizeof(int), 0>>>(dev_c, dev_a);112 113     // cudaThreadSynchronize waits for the kernel to finish, and returns114     // any errors encountered during the launch.115     cudaStatus = cudaThreadSynchronize();116     if (cudaStatus != cudaSuccess) {117         fprintf(stderr,118                 "cudaThreadSynchronize returned error code %d after launching addKernel!\n",119                 cudaStatus);120         goto Error;121     }122 123     // Copy output vector from GPU buffer to host memory.124     cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int),125             cudaMemcpyDeviceToHost);126     if (cudaStatus != cudaSuccess) {127         fprintf(stderr, "cudaMemcpy failed!");128         goto Error;129     }130 131     Error: cudaFree(dev_c);132     cudaFree(dev_a);133     return cudaStatus;134 }

 

22+103+61+63+17 = 266
22^2+103^2+61^2+63^2+17^2 = 19072
22*103*61*63*17 = 17
22^103^61^63^17 = 98

 

cuda多线程间通信