首页 > 代码库 > GPU和CPU耗时统计方法

GPU和CPU耗时统计方法

GPU端耗时统计

 // GPU-side timing with CUDA events.
 // The timed region opens before the device allocations and closes after the
 // device-to-host copy, so the reported figure covers allocation, both
 // transfers and the kernel itself.
 // h_idata, mem_size, num_threads and testKernel are supplied by the
 // surrounding program and are not shown in this snippet.
 cudaEvent_t start, stop;
 checkCudaErrors(cudaEventCreate(&start));
 checkCudaErrors(cudaEventCreate(&stop));
 // Wait for previously issued device work so it is not attributed to this
 // measurement.
 checkCudaErrors(cudaDeviceSynchronize());

 float gpu_time = 0.0f;
 // Record the start event in stream 0.
 checkCudaErrors(cudaEventRecord(start, 0));

 // Allocate device memory for the input.
 float *d_idata;
 checkCudaErrors(cudaMalloc((void **) &d_idata, mem_size));

 // Copy the host input to device memory.
 checkCudaErrors(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));

 // Allocate device memory for the result.
 float *d_odata;
 checkCudaErrors(cudaMalloc((void **) &d_odata, mem_size));

 // Set up the execution configuration: one block of num_threads threads.
 dim3 grid(1, 1, 1);
 dim3 threads(num_threads, 1, 1);

 // Launch the kernel. Launch arguments: grid is the grid dimension, threads
 // is the block dimension, and mem_size is the number of bytes of dynamic
 // shared memory requested per block.
 testKernel<<< grid, threads, mem_size >>>(d_idata, d_odata);

 // Check that the kernel launch succeeded.
 getLastCudaError("Kernel execution failed");

 // Allocate host memory for the result.
 float *h_odata = (float *) malloc(mem_size);
 if (h_odata == NULL)
 {
     fprintf(stderr, "Failed to allocate host memory for the result\n");
     exit(EXIT_FAILURE);
 }

 // Copy the result from the device back to the host.
 // NOTE(review): copies sizeof(float) * num_threads bytes while the buffers
 // are mem_size bytes; presumably the two are equal -- confirm.
 checkCudaErrors(cudaMemcpy(h_odata, d_odata, sizeof(float) * num_threads,
                            cudaMemcpyDeviceToHost));

 // Record the stop event in stream 0.
 checkCudaErrors(cudaEventRecord(stop, 0));

 // Poll the stop event, counting how many host iterations pass before it
 // completes. Any status other than "not ready" ends the loop and is then
 // checked, so a real error is reported instead of being taken as completion.
 unsigned long int counter = 0;
 cudaError_t stop_status = cudaEventQuery(stop);
 while (stop_status == cudaErrorNotReady)
 {
     counter++;
     stop_status = cudaEventQuery(stop);
 }
 checkCudaErrors(stop_status);

 checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));
 printf("GPU执行耗时: %.2f (ms)\n", gpu_time);
 printf("CPU executed %lu iterations while waiting for GPU to finish\n", counter);

 // The events are only needed for this measurement; release them.
 checkCudaErrors(cudaEventDestroy(start));
 checkCudaErrors(cudaEventDestroy(stop));

 // d_idata, d_odata and h_odata are still allocated at this point; release
 // them with cudaFree / free once the results are no longer needed.

CPU端耗时统计

 // CPU-side timing with the StopWatchInterface timer helpers.
 // h_idata, mem_size, num_threads and computeGold are supplied by the
 // surrounding program and are not shown in this snippet.
 StopWatchInterface *timer = 0;
 sdkCreateTimer(&timer);
 sdkResetTimer(&timer);

 sdkStartTimer(&timer);
 // Compute the reference solution on the host. The allocation of the
 // reference buffer happens inside the timed region.
 float *reference = (float *) malloc(mem_size);
 if (reference == NULL)
 {
     fprintf(stderr, "Failed to allocate host memory for the reference result\n");
     exit(EXIT_FAILURE);
 }
 computeGold(reference, h_idata, num_threads);
 sdkStopTimer(&timer);
 printf("串行耗时:%f (ms)\n", sdkGetTimerValue(&timer));

 // The timer is only needed for this measurement; release it.
 sdkDeleteTimer(&timer);

 // reference is still allocated at this point; free it once it is no longer
 // needed.

 

GPU和CPU耗时统计方法