首页 > 代码库 > cuda并行计算的几种模式
cuda并行计算的几种模式
// Element-wise vector addition demonstrating three CUDA parallelization
// modes: one element per block, one element per thread, and multi-stream
// chunked execution. Timing uses CUDA events (clock() around an async
// kernel launch measures nothing).
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <time.h>
#include <stdlib.h>

#define MAX 120
#define MIN 0

// Number of concurrent streams used by addWithCudaStream.
#define NUM_STREAMS 5

cudaError_t addWithCudaStream(int *c, const int *a, const int *b, size_t size,
        float *etime);
cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size,
        float *etime, int type);

// One element per block: launched as <<<n, 1>>>, so blockIdx.x is the index.
__global__ void addKernel(int *c, const int *a, const int *b) {
    int i = blockIdx.x;
    c[i] = a[i] + b[i];
}

// One element per thread: launched as <<<1, n>>>, so threadIdx.x is the
// index (requires n <= the device's max threads per block).
__global__ void addKernelThread(int *c, const int *a, const int *b) {
    int i = threadIdx.x;
    c[i] = a[i] + b[i];
}

// Print the last five elements of a, b and c as "{..} + {..} = {..}".
static void printTail(const int *a, const int *b, const int *c, int n) {
    printf("{%d,%d,%d,%d,%d} + {%d,%d,%d,%d,%d} = {%d,%d,%d,%d,%d}\n",
            a[n - 1], a[n - 2], a[n - 3], a[n - 4], a[n - 5],
            b[n - 1], b[n - 2], b[n - 3], b[n - 4], b[n - 5],
            c[n - 1], c[n - 2], c[n - 3], c[n - 4], c[n - 5]);
}

int main() {
    const int arraySize = 800;
    srand((unsigned) time(NULL));
    int a[arraySize];
    int b[arraySize];
    int c[arraySize] = { 0 };

    // Fill the inputs with random values in [MIN, MAX].
    for (int i = 0; i < arraySize; i++) {
        a[i] = rand() % (MAX + 1 - MIN) + MIN;
        b[i] = rand() % (MAX + 1 - MIN) + MIN;
    }

    cudaError_t cudaStatus;
    int num = 0;
    cudaDeviceProp prop;
    cudaStatus = cudaGetDeviceCount(&num);
    for (int i = 0; i < num; i++) {
        cudaGetDeviceProperties(&prop, i); // query only; results unused
    }

    float time;
    // Stream mode is run twice (first and last) to compare a cold run
    // against a warm one.
    cudaStatus = addWithCudaStream(c, a, b, arraySize, &time);
    printf("Elapsed time of stream is : %f \n", time);
    printTail(a, b, c, arraySize);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCudaStream failed!");
        return 1;
    }

    cudaStatus = addWithCuda(c, a, b, arraySize, &time, 0);
    printf("Elapsed time of Block is : %f \n", time);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }
    printTail(a, b, c, arraySize);

    cudaStatus = addWithCuda(c, a, b, arraySize, &time, 1);
    printf("Elapsed time of thread is : %f \n", time);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }
    printTail(a, b, c, arraySize);

    cudaStatus = addWithCudaStream(c, a, b, arraySize, &time);
    printf("Elapsed time of stream is : %f \n", time);
    printTail(a, b, c, arraySize);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCudaStream failed!");
        return 1;
    }

    // cudaDeviceReset must be called before exiting in order for profiling
    // and tracing tools such as Nsight and Visual Profiler to show complete
    // traces. (Replaces the deprecated cudaThreadExit.)
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }
    return 0;
}

// Adds a[0..size) and b[0..size) into c using NUM_STREAMS concurrent
// streams, each covering one contiguous chunk of the array. *etime
// receives the kernel wall time in seconds.
//
// BUG FIX: the original launched exactly five single-element kernels, so
// only c[0..4] were ever computed and the rest of dev_c was copied back
// uninitialized. That is why the first stream run printed garbage while
// later runs "worked": cudaMalloc tends to hand back the same freshly
// freed memory, still holding the previous full result.
cudaError_t addWithCudaStream(int *c, const int *a, const int *b, size_t size,
        float *etime) {
    int *dev_a = NULL;
    int *dev_b = NULL;
    int *dev_c = NULL;
    cudaStream_t stream[NUM_STREAMS];
    int streamsCreated = 0;      // only destroy streams that were created
    cudaEvent_t evStart = NULL;
    cudaEvent_t evStop = NULL;
    float ms = 0.0f;
    size_t chunk;
    cudaError_t cudaStatus;

    // Choose which GPU to run on; change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr,
                "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for the two inputs and the output.
    cudaStatus = cudaMalloc((void**) &dev_c, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**) &dev_a, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**) &dev_b, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy the input vectors host -> device.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int),
            cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int),
            cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    for (int i = 0; i < NUM_STREAMS; i++) {
        cudaStatus = cudaStreamCreate(&stream[i]);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaStreamCreate failed!");
            goto Error;
        }
        streamsCreated++;
    }
    cudaStatus = cudaEventCreate(&evStart);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaEventCreate failed!");
        goto Error;
    }
    cudaStatus = cudaEventCreate(&evStop);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaEventCreate failed!");
        goto Error;
    }

    // Each stream handles one ceil(size / NUM_STREAMS) chunk; together the
    // chunks cover every element, unlike the original 5-element version.
    chunk = (size + NUM_STREAMS - 1) / NUM_STREAMS;
    cudaEventRecord(evStart, 0);
    for (int i = 0; i < NUM_STREAMS; i++) {
        size_t offset = (size_t) i * chunk;
        if (offset >= size)
            break;                             // size < NUM_STREAMS case
        size_t n = (size - offset < chunk) ? (size - offset) : chunk;
        addKernel<<<(unsigned) n, 1, 0, stream[i]>>>(dev_c + offset,
                dev_a + offset, dev_b + offset);
    }
    cudaStatus = cudaGetLastError();           // catch launch-config errors
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n",
                cudaGetErrorString(cudaStatus));
        goto Error;
    }
    cudaEventRecord(evStop, 0);
    // Synchronizing on the stop event also waits for every kernel above.
    cudaStatus = cudaEventSynchronize(evStop);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr,
                "cudaEventSynchronize returned error code %d after launching addKernel!\n",
                cudaStatus);
        goto Error;
    }
    cudaEventElapsedTime(&ms, evStart, evStop); // milliseconds
    *etime = ms / 1000.0f;                      // report seconds, as before

    // Copy the output vector device -> host.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int),
            cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    if (evStart)
        cudaEventDestroy(evStart);
    if (evStop)
        cudaEventDestroy(evStop);
    for (int i = 0; i < streamsCreated; i++) {
        cudaStreamDestroy(stream[i]);
    }
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);
    return cudaStatus;
}

// Adds a and b into c with a single kernel launch; *etime receives the
// kernel wall time in seconds.
//   type == 0 -> one element per block  (addKernel       <<<size, 1>>>)
//   type != 0 -> one element per thread (addKernelThread <<<1, size>>>;
//                requires size <= the device's max threads per block)
//
// BUG FIX: the original took clock() immediately after the asynchronous
// launch, so it timed the launch call, not the kernel; CUDA events fix
// that. Deprecated cudaThreadSynchronize replaced as well.
cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size,
        float *etime, int type) {
    int *dev_a = NULL;
    int *dev_b = NULL;
    int *dev_c = NULL;
    cudaEvent_t evStart = NULL;
    cudaEvent_t evStop = NULL;
    float ms = 0.0f;
    cudaError_t cudaStatus;

    // Choose which GPU to run on; change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr,
                "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for the two inputs and the output.
    cudaStatus = cudaMalloc((void**) &dev_c, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**) &dev_a, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**) &dev_b, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy the input vectors host -> device.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int),
            cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int),
            cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaEventCreate(&evStart);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaEventCreate failed!");
        goto Error;
    }
    cudaStatus = cudaEventCreate(&evStop);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaEventCreate failed!");
        goto Error;
    }

    cudaEventRecord(evStart, 0);
    if (type == 0) {
        addKernel<<<(unsigned) size, 1>>>(dev_c, dev_a, dev_b);
    } else {
        addKernelThread<<<1, (unsigned) size>>>(dev_c, dev_a, dev_b);
    }
    cudaStatus = cudaGetLastError();           // catch launch-config errors
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "kernel launch failed: %s\n",
                cudaGetErrorString(cudaStatus));
        goto Error;
    }
    cudaEventRecord(evStop, 0);
    // Synchronizing on the stop event also waits for the kernel itself.
    cudaStatus = cudaEventSynchronize(evStop);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr,
                "cudaEventSynchronize returned error code %d after launching addKernel!\n",
                cudaStatus);
        goto Error;
    }
    cudaEventElapsedTime(&ms, evStart, evStop); // milliseconds
    *etime = ms / 1000.0f;                      // report seconds, as before

    // Copy the output vector device -> host.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int),
            cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    if (evStart)
        cudaEventDestroy(evStart);
    if (evStop)
        cudaEventDestroy(evStop);
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);
    return cudaStatus;
}
如上文的实现程序,使用了 thread 并行、block 并行、stream 并行三种方式,用这三种方法进行了五次计算,发现 stream 第一次计算时结果会出错,而调用的子程序没有变化。原因在于:stream 版本只启动了 5 个单元素 kernel,只计算了 c[0..4],dev_c 其余元素从未被写入,第一次拷回主机的就是未初始化的显存内容;后面几次"正确",是因为 cudaMalloc 往往复用刚释放的同一段显存,里面残留着上一次完整计算(block/thread 版本)的结果。正确做法是让各个 stream 分块覆盖整个数组。
Elasped time of stream is : 0.000006
{47,86,67,35,16} + {114,39,110,20,101} = {158,123,92,107,127}
Elasped time of Block is : 0.000006
{47,86,67,35,16} + {114,39,110,20,101} = {161,125,177,55,117}
Elasped time of stream is : 0.000008
{47,86,67,35,16} + {114,39,110,20,101} = {161,125,177,55,117}
Elasped time of thread is : 0.000004
{47,86,67,35,16} + {114,39,110,20,101} = {161,125,177,55,117}
Elasped time of stream is : 0.000007
{47,86,67,35,16} + {114,39,110,20,101} = {161,125,177,55,117}
cuda并行计算的几种模式
声明:以上内容来自用户投稿及互联网公开渠道收集整理发布,本网站不拥有所有权,未作人工编辑处理,也不承担相关法律责任,若内容有误或涉及侵权可进行投诉: 投诉/举报 工作人员会在5个工作日内联系你,一经查实,本站将立刻删除涉嫌侵权内容。