
4.2 Reduction Optimization

First, here is the code for the Heterogeneous Parallel Programming class lab "Reduction":

myReduction.c

// MP Reduction
// Given a list (lst) of length n
// Output its sum = lst[0] + lst[1] + ... + lst[n-1];
#include <wb.h>

#define BLOCK_SIZE 512 //@@ You can change this

#define wbCheck(stmt) do {                                                    \
        cudaError_t err = stmt;                                               \
        if (err != cudaSuccess) {                                             \
            wbLog(ERROR, "Failed to run stmt ", #stmt);                       \
            wbLog(ERROR, "Got CUDA error ...  ", cudaGetErrorString(err));    \
            return -1;                                                        \
        }                                                                     \
    } while(0)

__global__ void reduction(float *g_idata, float *g_odata, unsigned int n)
{
    __shared__ float sdata[BLOCK_SIZE];

    // load shared mem
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

    sdata[tid] = (i < n) ? g_idata[i] : 0;

    __syncthreads();

    // do reduction in shared mem, the stride is halved each step
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
    {
        //__syncthreads();
        if (tid < s)
        {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }

    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

__global__ void total(float * input, float * output, int len) {
    //@@ Load a segment of the input vector into shared memory
    __shared__ float partialSum[2 * BLOCK_SIZE]; // 2 * blockDim.x is not allowed here: the size must be a compile-time constant
    unsigned int t = threadIdx.x;
    unsigned int start = 2 * blockIdx.x * blockDim.x;

    if (start + t < len)
        partialSum[t] = input[start + t];
    else
        partialSum[t] = 0;

    if (start + blockDim.x + t < len)
        partialSum[blockDim.x + t] = input[start + blockDim.x + t];
    else
        partialSum[blockDim.x + t] = 0;

    //@@ Traverse the reduction tree
    for (unsigned int stride = blockDim.x; stride >= 1; stride >>= 1) {
        __syncthreads();
        if (t < stride)
            partialSum[t] += partialSum[t + stride];
    }

    //@@ Write the computed sum of the block to the output vector at the
    //@@ correct index
    if (t == 0)
        output[blockIdx.x] = partialSum[0];
}

int main(int argc, char ** argv) {
    int ii;
    wbArg_t args;
    float * hostInput;  // The input 1D list
    float * hostOutput; // The output list
    float * deviceInput;
    float * deviceOutput;
    int numInputElements;  // number of elements in the input list
    int numOutputElements; // number of elements in the output list

    args = wbArg_read(argc, argv);

    wbTime_start(Generic, "Importing data and creating memory on host");
    hostInput = (float *) wbImport(wbArg_getInputFile(args, 0), &numInputElements);

    numOutputElements = numInputElements / (BLOCK_SIZE);
    if (numInputElements % (BLOCK_SIZE)) {
        numOutputElements++;
    }

    // This is the output size for the `total` kernel instead:
    /* numOutputElements = numInputElements / (BLOCK_SIZE << 1);
    if (numInputElements % (BLOCK_SIZE << 1)) {
        numOutputElements++;
    } */

    hostOutput = (float *) malloc(numOutputElements * sizeof(float));

    wbTime_stop(Generic, "Importing data and creating memory on host");

    wbLog(TRACE, "The number of input elements in the input is ", numInputElements);
    wbLog(TRACE, "The number of output elements in the input is ", numOutputElements);

    wbTime_start(GPU, "Allocating GPU memory.");
    //@@ Allocate GPU memory here
    cudaMalloc((void **) &deviceInput, numInputElements * sizeof(float));
    cudaMalloc((void **) &deviceOutput, numOutputElements * sizeof(float));
    wbTime_stop(GPU, "Allocating GPU memory.");

    wbTime_start(GPU, "Copying input memory to the GPU.");
    //@@ Copy memory to the GPU here
    cudaMemcpy(deviceInput,
               hostInput,
               numInputElements * sizeof(float),
               cudaMemcpyHostToDevice);
    wbTime_stop(GPU, "Copying input memory to the GPU.");

    //@@ Initialize the grid and block dimensions here
    dim3 dimGrid(numOutputElements, 1, 1);
    dim3 dimBlock(BLOCK_SIZE, 1, 1);

    wbTime_start(Compute, "Performing CUDA computation");
    //@@ Launch the GPU Kernel here
    reduction<<<dimGrid, dimBlock>>>(deviceInput, deviceOutput, numInputElements);
    //total<<<dimGrid, dimBlock>>>(deviceInput, deviceOutput, numInputElements);
    cudaDeviceSynchronize();
    wbTime_stop(Compute, "Performing CUDA computation");

    wbTime_start(Copy, "Copying output memory to the CPU");
    //@@ Copy the GPU memory back to the CPU here
    cudaMemcpy(hostOutput, deviceOutput, sizeof(float) * numOutputElements, cudaMemcpyDeviceToHost);
    wbTime_stop(Copy, "Copying output memory to the CPU");

    /********************************************************************
     * Reduce output vector on the host
     * NOTE: One could also perform the reduction of the output vector
     * recursively and support any size input. For simplicity, we do not
     * require that for this lab.
     ********************************************************************/
    for (ii = 1; ii < numOutputElements; ii++) {
        hostOutput[0] += hostOutput[ii];
    }

    wbTime_start(GPU, "Freeing GPU Memory");
    //@@ Free the GPU memory here
    cudaFree(deviceInput);
    cudaFree(deviceOutput);
    wbTime_stop(GPU, "Freeing GPU Memory");

    wbSolution(args, hostOutput, 1);

    free(hostInput);
    free(hostOutput);

    return 0;
}
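The file above defines two kernels. `reduction` loads one element per thread and then does a tree reduction in shared memory, halving the stride each step; `total` has each thread load and add two elements up front, so one block covers 2 * BLOCK_SIZE inputs and only half as many blocks (and output partial sums) are needed. If main() is switched over to `total`, the output-element count and the launch must match, as in the commented-out block in main(). A minimal host-side sketch of the lines that change, reusing the identifiers from the code above:

// Sketch (assumption: main() launches `total` instead of `reduction`).
// Each block of `total` sums 2 * BLOCK_SIZE input elements, so the number
// of partial sums written to deviceOutput is halved.
numOutputElements = numInputElements / (BLOCK_SIZE << 1);
if (numInputElements % (BLOCK_SIZE << 1)) {
    numOutputElements++;
}
dim3 dimGrid(numOutputElements, 1, 1);
dim3 dimBlock(BLOCK_SIZE, 1, 1);
total<<<dimGrid, dimBlock>>>(deviceInput, deviceOutput, numInputElements);

Everything else, including the host-side loop that adds up the per-block partial sums, stays the same.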

 

Reduction optimization:
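A further step beyond the two kernels above is to stop calling __syncthreads() once only a single warp of threads is still active, and to finish the last 64 partial sums inside that warp with warp shuffles. The sketch below is not part of the lab code; it assumes BLOCK_SIZE as defined above (512, and in any case at least 64) and CUDA 9 or later for __shfl_down_sync:

// Sketch: same per-block tree reduction as `reduction`, but the last 64
// values are folded by a single warp with warp shuffles, so the final
// steps need no shared-memory traffic and no __syncthreads().
__global__ void reduction_warp_shuffle(float *g_idata, float *g_odata,
                                       unsigned int n) {
    __shared__ float sdata[BLOCK_SIZE];

    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    sdata[tid] = (i < n) ? g_idata[i] : 0;
    __syncthreads();

    // Tree reduction in shared memory, but stop once 64 values remain.
    for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1) {
        if (tid < s)
            sdata[tid] += sdata[tid + s];
        __syncthreads();
    }

    // Threads 0..31 (one warp) fold the remaining 64 values.
    if (tid < 32) {
        float val = sdata[tid] + sdata[tid + 32];
        for (int offset = 16; offset > 0; offset >>= 1)
            val += __shfl_down_sync(0xffffffffu, val, offset);
        if (tid == 0) g_odata[blockIdx.x] = val;
    }
}

The host side is unchanged: the kernel still writes one partial sum per block, which main() then adds up on the CPU.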

 
