CUDA inter-thread communication

The example below launches a single block of five threads. Each thread copies its input element into dynamically allocated shared memory, __syncthreads() guarantees every write is visible to the whole block, and threads 0-3 then compute the sum of squares, the sum, the product, and the XOR of the array.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <time.h>
#include <stdlib.h>

#define MAX 120
#define MIN 0

cudaError_t addWithCuda(int *c, const int *a, size_t size);

__global__ void addKernel(int *c, const int *a) {
    int i = threadIdx.x;
    extern __shared__ int smem[];
    smem[i] = a[i];          // each thread publishes its element to shared memory
    __syncthreads();         // make every element visible to the whole block
    if (i == 0)              // thread 0 computes the sum of squares
    {
        c[0] = 0;
        for (int d = 0; d < 5; d++) {
            c[0] += smem[d] * smem[d];
        }
    }
    if (i == 1)              // thread 1 computes the sum
    {
        c[1] = 0;
        for (int d = 0; d < 5; d++) {
            c[1] += smem[d];
        }
    }
    if (i == 2)              // thread 2 computes the product
    {
        c[2] = 1;
        for (int d = 0; d < 5; d++) {
            c[2] *= smem[d]; // the original wrote "c[2] = smem[d]", which only kept the last element
        }
    }
    if (i == 3)              // thread 3 computes the XOR
    {
        c[3] = 0;
        for (int d = 0; d < 5; d++) {
            c[3] ^= smem[d];
        }
    }
}

int main() {
    const int arraySize = 5;
    srand((unsigned) time(NULL));
    const int a[arraySize] = { rand() % (MAX + 1 - MIN) + MIN,
            rand() % (MAX + 1 - MIN) + MIN, rand() % (MAX + 1 - MIN) + MIN,
            rand() % (MAX + 1 - MIN) + MIN, rand() % (MAX + 1 - MIN) + MIN };
    int c[arraySize] = { 0 };

    // Run the four reductions in parallel on the GPU.
    cudaError_t cudaStatus = addWithCuda(c, a, arraySize);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }

    printf("\t%d+%d+%d+%d+%d = %d\n\t%d^2+%d^2+%d^2+%d^2+%d^2 = %d\n\t%d*%d*%d*%d*%d = %d\n\t%d^%d^%d^%d^%d = %d\n\n",
            a[0], a[1], a[2], a[3], a[4], c[1],
            a[0], a[1], a[2], a[3], a[4], c[0],
            a[0], a[1], a[2], a[3], a[4], c[2],
            a[0], a[1], a[2], a[3], a[4], c[3]);

    // cudaThreadExit must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaThreadExit();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaThreadExit failed!");
        return 1;
    }
    return 0;
}

// Helper function that copies the input to the GPU, launches addKernel,
// and copies the four results back to the host.
cudaError_t addWithCuda(int *c, const int *a, size_t size) {
    int *dev_a = 0;
    int *dev_c = 0;
    cudaError_t cudaStatus;

    // Choose which GPU to run on; change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for the input and output vectors.
    cudaStatus = cudaMalloc((void**) &dev_c, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**) &dev_a, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy the input vector from host memory to the GPU buffer.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch the kernel with one thread per element and
    // size * sizeof(int) bytes of dynamic shared memory.
    addKernel<<<1, size, size * sizeof(int), 0>>>(dev_c, dev_a);

    // cudaThreadSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaThreadSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr,
                "cudaThreadSynchronize returned error code %d after launching addKernel!\n",
                cudaStatus);
        goto Error;
    }

    // Copy the output vector from the GPU buffer back to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    cudaFree(dev_c);
    cudaFree(dev_a);
    return cudaStatus;
}
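In the kernel above, the threads communicate only in one direction: every thread writes one element to shared memory, and four designated threads then read the whole array serially. A more common way for the threads of a block to cooperate is to split one result across all of them, synchronizing between steps. The sketch below is not from the original post; it is a minimal example, assuming a power-of-two block size and omitting error checking for brevity, of computing the same block-wide sum with a shared-memory tree reduction (the kernel name reduceSumKernel is made up for illustration).

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>

__global__ void reduceSumKernel(int *out, const int *in) {
    extern __shared__ int smem[];
    int i = threadIdx.x;
    smem[i] = in[i];                  // every thread loads one element
    __syncthreads();                  // make all loads visible to the block

    // Tree reduction: in each step the lower half adds in the upper half.
    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        if (i < stride) {
            smem[i] += smem[i + stride];
        }
        __syncthreads();              // wait before the next, smaller step
    }
    if (i == 0) {
        out[0] = smem[0];             // thread 0 publishes the block sum
    }
}

int main() {
    const int n = 8;                  // power-of-two block size for this sketch
    int h_in[n] = { 1, 2, 3, 4, 5, 6, 7, 8 };
    int h_out = 0;
    int *d_in = 0, *d_out = 0;

    cudaMalloc((void**) &d_in, n * sizeof(int));
    cudaMalloc((void**) &d_out, sizeof(int));
    cudaMemcpy(d_in, h_in, n * sizeof(int), cudaMemcpyHostToDevice);

    // One block of n threads, n * sizeof(int) bytes of dynamic shared memory.
    reduceSumKernel<<<1, n, n * sizeof(int)>>>(d_out, d_in);
    cudaDeviceSynchronize();

    cudaMemcpy(&h_out, d_out, sizeof(int), cudaMemcpyDeviceToHost);
    printf("sum = %d\n", h_out);      // expected: 36

    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}

The reduction needs a __syncthreads() after every step because threads read values that other threads wrote in the previous step; that is the same shared-memory hand-off the original kernel relies on, just applied repeatedly.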
Sample output from one run (the inputs are random each time):

	22+103+61+63+17 = 266
	22^2+103^2+61^2+63^2+17^2 = 19072
	22*103*61*63*17 = 17
	22^103^61^63^17 = 98

The third line comes from the original, unfixed code: thread 2 assigned instead of multiplying, so only the last element (17) survived.
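As a quick host-side check of those numbers (this snippet is a sketch added here, not part of the original post), the inputs 22, 103, 61, 63, 17 reproduce the sum, sum of squares, and XOR shown above, and give the product the corrected kernel would return.

#include <stdio.h>

int main(void) {
    const int a[5] = { 22, 103, 61, 63, 17 };   // inputs from the sample run above
    int sum = 0, squares = 0, xorv = 0;
    long long product = 1;
    for (int d = 0; d < 5; d++) {
        sum += a[d];
        squares += a[d] * a[d];
        product *= a[d];
        xorv ^= a[d];
    }
    // Prints: sum=266 squares=19072 product=148040046 xor=98
    printf("sum=%d squares=%d product=%lld xor=%d\n", sum, squares, product, xorv);
    return 0;
}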