Home > Code Library > GPU openEXR image (RGBA) -> gray image

GPU openEXR image(RGBA) -> gray image

<1> Basic

(figure: tech-share illustration)
#include <stdio.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#define NUM 15
// Element-wise square: dout[i] = din[i] * din[i].
// One thread handles one element; the launch below uses a single block of
// exactly NUM threads, so only threadIdx.x is needed and no bounds guard
// is required for that configuration.
__global__ void square(float *dout, float *din)
{
    const int i = threadIdx.x;
    const float v = din[i];
    dout[i] = v * v;
}


// Squares NUM floats on the GPU (1 block, NUM threads) and prints the result.
// Returns 0 on success, 1 on any CUDA setup/allocation/copy failure.
int main(int argc, char **argv)
{
    const int bytes = sizeof(float) * NUM;
    float host_in[NUM];
    float host_out[NUM];

    // Fill the input with 0, 1, ..., NUM-1.
    for (int i = 0; i < NUM; i++) {
        host_in[i] = float(i);
    }

    cudaError_t cudaStatus;

    // Choose which GPU to run on; change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?\n");
        return 1;  // BUG FIX: bare `return;` is invalid in a function returning int
    }

    // Allocate device buffers; check each allocation instead of ignoring the status.
    float *device_in = 0;
    float *device_out = 0;
    cudaStatus = cudaMalloc((void**)&device_in, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc (device_in) failed: %s\n", cudaGetErrorString(cudaStatus));
        return 1;
    }
    cudaStatus = cudaMalloc((void**)&device_out, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc (device_out) failed: %s\n", cudaGetErrorString(cudaStatus));
        cudaFree(device_in);
        return 1;
    }

    // Copy the input host -> device.
    cudaStatus = cudaMemcpy(device_in, host_in, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy (host->device) failed: %s\n", cudaGetErrorString(cudaStatus));
        cudaFree(device_in);
        cudaFree(device_out);
        return 1;
    }

    // GPU kernel: 1 block, NUM threads — one thread per element.
    square<<<1, NUM>>>(device_out, device_in);

    // Kernel launches do not return a status; query it explicitly so a bad
    // launch configuration is not silently ignored.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "square kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
    }

    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        // BUG FIX: the original message referred to "addKernel", which does not exist here.
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching square!\n", cudaStatus);
    }

    // Copy the result device -> host (blocking, so host_out is valid afterwards).
    cudaStatus = cudaMemcpy(host_out, device_out, bytes, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
    }

    // Free GPU memory.
    cudaFree(device_in);
    cudaFree(device_out);

    for (int i = 0; i < NUM; i++) {
        fprintf(stdout, "%f \n", host_out[i]);
    }

    getchar();

    return 0;
}
View Code

 

 

<2> Simple calculation:
Plain average: I = (R+G+B)/3

Weighted (luma) sum: I = R*0.299f + G*0.587f + 0.114f*B

CPU:

 

// Serial implementation for running on CPU using a single thread.
// Converts each RGBA pixel to greyscale via the weighted sum
// I = 0.299*R + 0.587*G + 0.114*B (alpha ignored; the float result is
// truncated on assignment to unsigned char).
void rgbaToGreyscaleCpu(const uchar4* const rgbaImage, unsigned char *const greyImage,
        const size_t numRows, const size_t numCols)
{
    const size_t numPixels = numRows * numCols;
    for (size_t i = 0; i < numPixels; ++i) {
        const uchar4 px = rgbaImage[i];
        greyImage[i] = .299f * px.x + .587f * px.y + .114f * px.z;
    }
}

 

 

 

GPU:

// CUDA kernel which is run in parallel by many GPU threads, one per pixel.
// Expects a 1D launch; each thread maps its block/thread coordinates to a
// flat pixel index and writes the weighted greyscale value.
__global__
void rgbaToGreyscaleCudaKernel(const uchar4* const rgbaImage,
        unsigned char* const greyImage,
        const int numRows, const int numCols)
{
    // Flat 1D index across the whole image.
    const long idx = threadIdx.x + blockDim.x * blockIdx.x;

    // Guard: the final block may extend past the last pixel when the pixel
    // count is not a multiple of the block size.
    if (idx < numRows * numCols) {
        const uchar4 px = rgbaImage[idx];
        greyImage[idx] = .299f * px.x + .587f * px.y + .114f * px.z;
    }
}
 
// Parallel implementation for running on GPU using multiple threads.
// Launches one thread per pixel in 1D blocks of 256 threads.
// h_rgbaImage is unused here; it is kept in the signature for interface
// compatibility with existing callers.
void rgbaToGreyscaleCuda(const uchar4 * const h_rgbaImage, uchar4 * const d_rgbaImage,
        unsigned char* const d_greyImage, const size_t numRows, const size_t numCols)
{
    const int blockThreadSize = 256;
    // ceil(numPixels / blockThreadSize) via 1 + (n-1)/b.
    const int numberOfBlocks = 1 + ((numRows*numCols - 1) / blockThreadSize);
    const dim3 blockSize(blockThreadSize, 1, 1);
    const dim3 gridSize(numberOfBlocks, 1, 1);
    rgbaToGreyscaleCudaKernel<<<gridSize, blockSize>>>(d_rgbaImage, d_greyImage, numRows, numCols);

    // BUG FIX: kernel launches do not return a status, so a bad launch
    // configuration was previously dropped silently; surface it here.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "rgbaToGreyscaleCudaKernel launch failed: %s\n", cudaGetErrorString(err));
    }
}

 

GPU openEXR image(RGBA) -> gray image