首页 > 代码库 > GPU和CPU耗时统计方法
GPU和CPU耗时统计方法
GPU端耗时统计
// GPU-side timing: bracket the device work (allocation, host-to-device copy,
// kernel launch, device-to-host copy) with two CUDA events recorded on
// stream 0, then read the elapsed time between the two events.
cudaEvent_t start, stop;
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
// Wait for previously issued device work so it is not counted in the interval.
checkCudaErrors(cudaDeviceSynchronize());

float gpu_time = 0.0f;
// Record the start event on stream 0; check the return code like every other
// runtime call so an earlier failure is not silently carried into the timing.
checkCudaErrors(cudaEventRecord(start, 0));

// Allocate device memory for the input.
float *d_idata;
checkCudaErrors(cudaMalloc((void **) &d_idata, mem_size));

// Copy the host input to the device.
checkCudaErrors(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));

// Allocate device memory for the result.
float *d_odata;
checkCudaErrors(cudaMalloc((void **) &d_odata, mem_size));

// Set up the execution configuration: one block of num_threads threads.
dim3 grid(1, 1, 1);
dim3 threads(num_threads, 1, 1);

// Launch the kernel. Launch arguments: grid dimensions, block dimensions, and
// the number of bytes of dynamic shared memory requested per block (mem_size).
testKernel<<< grid, threads, mem_size >>>(d_idata, d_odata);

// Check whether the kernel launch reported an error.
getLastCudaError("Kernel execution failed");

// Allocate host memory for the result; stop if the allocation failed rather
// than handing a null pointer to cudaMemcpy.
float *h_odata = (float *) malloc(mem_size);
if (h_odata == NULL)
{
    fprintf(stderr, "Failed to allocate host memory for the result\n");
    exit(EXIT_FAILURE);
}

// Copy the result from the device back to the host.
checkCudaErrors(cudaMemcpy(h_odata, d_odata, sizeof(float) * num_threads,
                           cudaMemcpyDeviceToHost));

checkCudaErrors(cudaEventRecord(stop, 0));

// Poll until the stop event has completed, counting how many iterations the
// CPU spins while waiting. Any status other than cudaErrorNotReady ends the
// loop; a real error is then reported by the checked call below.
unsigned long int counter = 0;
while (cudaEventQuery(stop) == cudaErrorNotReady)
{
    counter++;
}

checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));
printf("GPU执行耗时: %.2f (ms)\n", gpu_time);
printf("CPU executed %lu iterations while waiting for GPU to finish\n", counter);
CPU端耗时统计
1 StopWatchInterface *timer = 0; 2 sdkCreateTimer(&timer); 3 sdkResetTimer(&timer); 4 5 sdkStartTimer(&timer); 6 //计算参考方案 7 float *reference = (float *) malloc(mem_size); 8 computeGold(reference, h_idata, num_threads); 9 sdkStopTimer(&timer);10 printf("串行耗时:%f (ms)\n", sdkGetTimerValue(&timer));
GPU和CPU耗时统计方法
声明:以上内容来自用户投稿及互联网公开渠道收集整理发布,本网站不拥有所有权,未作人工编辑处理,也不承担相关法律责任。若内容有误或涉及侵权,可进行投诉(投诉/举报),工作人员会在5个工作日内联系你;一经查实,本站将立刻删除涉嫌侵权内容。