首页 > 代码库 > cuda并行计算的几种模式

cuda并行计算的几种模式

技术分享
  1 #include "cuda_runtime.h"  2 #include "device_launch_parameters.h"  3 #include <stdio.h>  4 #include <time.h>  5 #include <stdlib.h>  6   7 #define MAX 120  8 #define MIN 0  9  10 cudaError_t addWithCudaStream(int *c, const int *a, const int *b, size_t size, 11         float* etime); 12 cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size, 13         float* etime, int type); 14 __global__ void addKernel(int *c, const int *a, const int *b) { 15     int i = blockIdx.x; 16     c[i] = a[i] + b[i]; 17 } 18  19 __global__ void addKernelThread(int *c, const int *a, const int *b) { 20     int i = threadIdx.x; 21     c[i] = a[i] + b[i]; 22 } 23 int main() { 24     const int arraySize = 800; 25     srand((unsigned) time(NULL)); 26     int a[arraySize] = { 1, 2, 3, 4, 5 }; 27     int b[arraySize] = { 10, 20, 30, 40, 50 }; 28  29     for (int i = 0; i < arraySize; i++) { 30         a[i] = rand() % (MAX + 1 - MIN) + MIN; 31         b[i] = rand() % (MAX + 1 - MIN) + MIN; 32     } 33     int c[arraySize] = { 0 }; 34     // Add vectors in parallel. 
35     cudaError_t cudaStatus; 36     int num = 0; 37     cudaDeviceProp prop; 38     cudaStatus = cudaGetDeviceCount(&num); 39     for (int i = 0; i < num; i++) { 40         cudaGetDeviceProperties(&prop, i); 41     } 42  43     float time; 44     cudaStatus = addWithCudaStream(c, a, b, arraySize, &time); 45     printf("Elasped time of stream is : %f \n", time); 46     printf("{%d,%d,%d,%d,%d} + {%d,%d,%d,%d,%d} = {%d,%d,%d,%d,%d}\n", 47             a[arraySize - 1 - 0], a[arraySize - 1 - 1], a[arraySize - 1 - 2], 48             a[arraySize - 1 - 3], a[arraySize - 1 - 4], b[arraySize - 1 - 0], 49             b[arraySize - 1 - 1], b[arraySize - 1 - 2], b[arraySize - 1 - 3], 50             b[arraySize - 1 - 4], c[arraySize - 1 - 0], c[arraySize - 1 - 1], 51             c[arraySize - 1 - 2], c[arraySize - 1 - 3], c[arraySize - 1 - 4]); 52     if (cudaStatus != cudaSuccess) { 53         fprintf(stderr, "addWithCudaStream failed!"); 54         return 1; 55     } 56     cudaStatus = addWithCuda(c, a, b, arraySize, &time, 0); 57     printf("Elasped time of Block is : %f \n", time); 58     if (cudaStatus != cudaSuccess) { 59         fprintf(stderr, "addWithCudaStream failed!"); 60         return 1; 61     } 62     printf("{%d,%d,%d,%d,%d} + {%d,%d,%d,%d,%d} = {%d,%d,%d,%d,%d}\n", 63             a[arraySize - 1 - 0], a[arraySize - 1 - 1], a[arraySize - 1 - 2], 64             a[arraySize - 1 - 3], a[arraySize - 1 - 4], b[arraySize - 1 - 0], 65             b[arraySize - 1 - 1], b[arraySize - 1 - 2], b[arraySize - 1 - 3], 66             b[arraySize - 1 - 4], c[arraySize - 1 - 0], c[arraySize - 1 - 1], 67             c[arraySize - 1 - 2], c[arraySize - 1 - 3], c[arraySize - 1 - 4]); 68  69     cudaStatus = addWithCuda(c, a, b, arraySize, &time, 1); 70     printf("Elasped time of thread is : %f \n", time); 71     if (cudaStatus != cudaSuccess) { 72         fprintf(stderr, "addWithCudaStream failed!"); 73         return 1; 74     } 75     printf("{%d,%d,%d,%d,%d} + 
{%d,%d,%d,%d,%d} = {%d,%d,%d,%d,%d}\n", 76             a[arraySize - 1 - 0], a[arraySize - 1 - 1], a[arraySize - 1 - 2], 77             a[arraySize - 1 - 3], a[arraySize - 1 - 4], b[arraySize - 1 - 0], 78             b[arraySize - 1 - 1], b[arraySize - 1 - 2], b[arraySize - 1 - 3], 79             b[arraySize - 1 - 4], c[arraySize - 1 - 0], c[arraySize - 1 - 1], 80             c[arraySize - 1 - 2], c[arraySize - 1 - 3], c[arraySize - 1 - 4]); 81  82     cudaStatus = addWithCudaStream(c, a, b, arraySize, &time); 83     printf("Elasped time of stream is : %f \n", time); 84     printf("{%d,%d,%d,%d,%d} + {%d,%d,%d,%d,%d} = {%d,%d,%d,%d,%d}\n", 85             a[arraySize - 1 - 0], a[arraySize - 1 - 1], a[arraySize - 1 - 2], 86             a[arraySize - 1 - 3], a[arraySize - 1 - 4], b[arraySize - 1 - 0], 87             b[arraySize - 1 - 1], b[arraySize - 1 - 2], b[arraySize - 1 - 3], 88             b[arraySize - 1 - 4], c[arraySize - 1 - 0], c[arraySize - 1 - 1], 89             c[arraySize - 1 - 2], c[arraySize - 1 - 3], c[arraySize - 1 - 4]); 90     if (cudaStatus != cudaSuccess) { 91         fprintf(stderr, "addWithCudaStream failed!"); 92         return 1; 93     } 94     // cudaThreadExit must be called before exiting in order for profiling and 95     // tracing tools such as Nsight and Visual Profiler to show complete traces. 
96     cudaStatus = cudaThreadExit(); 97     if (cudaStatus != cudaSuccess) { 98         fprintf(stderr, "cudaThreadExit failed!"); 99         return 1;100     }101     return 0;102 }103 // Helper function for using CUDA to add vectors in parallel.104 cudaError_t addWithCudaStream(int *c, const int *a, const int *b, size_t size,105         float* etime) {106     int *dev_a = 0;107     int *dev_b = 0;108     int *dev_c = 0;109     clock_t start, stop;110     float time;111     cudaError_t cudaStatus;112 113     // Choose which GPU to run on, change this on a multi-GPU system.114     cudaStatus = cudaSetDevice(0);115     if (cudaStatus != cudaSuccess) {116         fprintf(stderr,117                 "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");118         goto Error;119     }120     // Allocate GPU buffers for three vectors (two input, one output)    .121     cudaStatus = cudaMalloc((void**) &dev_c, size * sizeof(int));122     if (cudaStatus != cudaSuccess) {123         fprintf(stderr, "cudaMalloc failed!");124         goto Error;125     }126     cudaStatus = cudaMalloc((void**) &dev_a, size * sizeof(int));127     if (cudaStatus != cudaSuccess) {128         fprintf(stderr, "cudaMalloc failed!");129         goto Error;130     }131     cudaStatus = cudaMalloc((void**) &dev_b, size * sizeof(int));132     if (cudaStatus != cudaSuccess) {133         fprintf(stderr, "cudaMalloc failed!");134         goto Error;135     }136     // Copy input vectors from host memory to GPU buffers.137     cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int),138             cudaMemcpyHostToDevice);139     if (cudaStatus != cudaSuccess) {140         fprintf(stderr, "cudaMemcpy failed!");141         goto Error;142     }143     cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int),144             cudaMemcpyHostToDevice);145     if (cudaStatus != cudaSuccess) {146         fprintf(stderr, "cudaMemcpy failed!");147         goto Error;148     }149     cudaStream_t stream[5];150 
    for (int i = 0; i < 5; i++) {151         cudaStreamCreate(&stream[i]);   //创建流152     }153     // Launch a kernel on the GPU with one thread for each element.154     for (int i = 0; i < 5; i++) {155         addKernel<<<1, 1, 0, stream[i]>>>(dev_c + i, dev_a + i, dev_b + i); //执行流156     }157     start = clock();158     cudaDeviceSynchronize();159     stop = clock();160     time = (float) (stop - start) / CLOCKS_PER_SEC;161     *etime = time;162     // cudaThreadSynchronize waits for the kernel to finish, and returns163     // any errors encountered during the launch.164     cudaStatus = cudaThreadSynchronize();165     if (cudaStatus != cudaSuccess) {166         fprintf(stderr,167                 "cudaThreadSynchronize returned error code %d after launching addKernel!\n",168                 cudaStatus);169         goto Error;170     }171     // Copy output vector from GPU buffer to host memory.172     cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int),173             cudaMemcpyDeviceToHost);174     if (cudaStatus != cudaSuccess) {175         fprintf(stderr, "cudaMemcpy failed!");176         goto Error;177     }178     Error: for (int i = 0; i < 5; i++) {179         cudaStreamDestroy(stream[i]);   //销毁流180     }181     cudaFree(dev_c);182     cudaFree(dev_a);183     cudaFree(dev_b);184     return cudaStatus;185 }186 cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size,187         float * etime, int type) {188     int *dev_a = 0;189     int *dev_b = 0;190     int *dev_c = 0;191     clock_t start, stop;192     float time;193     cudaError_t cudaStatus;194 195     // Choose which GPU to run on, change this on a multi-GPU system.196     cudaStatus = cudaSetDevice(0);197     if (cudaStatus != cudaSuccess) {198         fprintf(stderr,199                 "cudaSetDevice failed!  
Do you have a CUDA-capable GPU installed?");200         goto Error;201     }202     // Allocate GPU buffers for three vectors (two input, one output)    .203     cudaStatus = cudaMalloc((void**) &dev_c, size * sizeof(int));204     if (cudaStatus != cudaSuccess) {205         fprintf(stderr, "cudaMalloc failed!");206         goto Error;207     }208     cudaStatus = cudaMalloc((void**) &dev_a, size * sizeof(int));209     if (cudaStatus != cudaSuccess) {210         fprintf(stderr, "cudaMalloc failed!");211         goto Error;212     }213     cudaStatus = cudaMalloc((void**) &dev_b, size * sizeof(int));214     if (cudaStatus != cudaSuccess) {215         fprintf(stderr, "cudaMalloc failed!");216         goto Error;217     }218     // Copy input vectors from host memory to GPU buffers.219     cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int),220             cudaMemcpyHostToDevice);221     if (cudaStatus != cudaSuccess) {222         fprintf(stderr, "cudaMemcpy failed!");223         goto Error;224     }225     cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int),226             cudaMemcpyHostToDevice);227     if (cudaStatus != cudaSuccess) {228         fprintf(stderr, "cudaMemcpy failed!");229         goto Error;230     }231 232     if (type == 0) {233         start = clock();234         addKernel<<<size, 1>>>(dev_c, dev_a, dev_b);235     } else {236         start = clock();237         addKernelThread<<<1, size>>>(dev_c, dev_a, dev_b);238     }239     stop = clock();240     time = (float) (stop - start) / CLOCKS_PER_SEC;241     *etime = time;242     // cudaThreadSynchronize waits for the kernel to finish, and returns243     // any errors encountered during the launch.244     cudaStatus = cudaThreadSynchronize();245     if (cudaStatus != cudaSuccess) {246         fprintf(stderr,247                 "cudaThreadSynchronize returned error code %d after launching addKernel!\n",248                 cudaStatus);249         goto Error;250     }251     // Copy output vector from 
GPU buffer to host memory.252     cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int),253             cudaMemcpyDeviceToHost);254     if (cudaStatus != cudaSuccess) {255         fprintf(stderr, "cudaMemcpy failed!");256         goto Error;257     }258     Error: cudaFree(dev_c);259     cudaFree(dev_a);260     cudaFree(dev_b);261     return cudaStatus;262 }
View Code

如上文的实现程序,使用了thread并行、block并行、stream并行三种方式,用三种方法进行了五次计算,发现stream第一次计算时会出错,而调用的子程序没有变化。原因在于:stream 版本只对数组的前 5 个元素启动了核函数(addKernel<<<1,1,0,stream[i]>>> 仅处理偏移 i 处的 1 个元素,i<5),而打印的是数组末尾 5 个元素——第一次运行时这些位置的显存从未被写入,打印的是未初始化的数据;之后的运行中,重新分配到的显存残留了前一次 block/thread 计算的结果,所以"看起来"正确了。

Elasped time of stream is : 0.000006
{47,86,67,35,16} + {114,39,110,20,101} = {158,123,92,107,127}
Elasped time of Block is : 0.000006
{47,86,67,35,16} + {114,39,110,20,101} = {161,125,177,55,117}
Elasped time of stream is : 0.000008
{47,86,67,35,16} + {114,39,110,20,101} = {161,125,177,55,117}
Elasped time of thread is : 0.000004
{47,86,67,35,16} + {114,39,110,20,101} = {161,125,177,55,117}
Elasped time of stream is : 0.000007
{47,86,67,35,16} + {114,39,110,20,101} = {161,125,177,55,117}

cuda并行计算的几种模式