首页 > 代码库 > 全局内存性能测试

全局内存性能测试



对一个含 10 万（100,000）个 double 元素的数组做 1 万（10,000）次重复赋值，

分别用多线程（kernel：每个线程写一个元素）和单线程（kernel2：以 <<<1,1>>> 启动，由一个线程循环写完所有元素）实现。

结果发现，多线程版本（输出中的 1）约耗时 50 ms，单线程版本（输出中的 2）约耗时 117630 ms，两者相差约 2350 倍：

1begin
1 main Time to generate:  50.0 ms
2begin
2 main Time to generate:  117630.0 ms


#include <stdio.h>
#include <stdlib.h>

// Launch-configuration limits used by main() when sizing the 1-D grid.

// Upper bound main() applies to the number of blocks in the grid.
#define MAX_BLOCKS_PER_GRID 65535
// The two BLOCK_ROWS/COLS limits are not referenced in the visible code.
#define MAX_BLOCK_ROWS 255
#define MAX_BLOCK_COLS 255

// Block size main() starts from (reduced to the array length for tiny arrays).
#define MAX_THREADS_PER_BLOCK 1024
// Not referenced in the visible code; 32 * 32 equals MAX_THREADS_PER_BLOCK.
#define MAX_THREAD_ROWS 32
// NOTE(review): the lowercase 'h' in "ThREAD" looks like a typo for
// MAX_THREAD_COLS. Left unchanged because renaming a macro could break users
// outside this view; it is not referenced in the visible code.
#define MAX_ThREAD_COLS 32

// Evaluate a CUDA runtime call and abort with file/line and the runtime's
// error string if it did not return cudaSuccess. Without this, a failed
// allocation or launch would go unnoticed and the benchmark would happily
// print a meaningless timing.
#define CUDA_CHECK(call)                                                  \
	do {                                                                  \
		cudaError_t cuda_check_err_ = (call);                             \
		if (cuda_check_err_ != cudaSuccess) {                             \
			fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
			        cudaGetErrorString(cuda_check_err_));                 \
			exit(EXIT_FAILURE);                                           \
		}                                                                 \
	} while (0)

// Multi-threaded fill: sets dev_array[0 .. array_size) to 1.
//
// Launch layout: any 1-D grid of 1-D blocks. Each thread starts at its flat
// global index and advances by the total number of launched threads, so the
// result is correct whether the grid is larger than the array (extra threads
// simply do nothing) or smaller (each thread handles several elements).
// Uses no shared memory.
//
// dev_array  - device pointer to at least array_size doubles.
// array_size - number of elements to write; values <= 0 write nothing.
__global__ void kernel(double* dev_array, int array_size)
{
	// Index in size_t so that neither blockIdx.x * blockDim.x nor the
	// running index can overflow a 32-bit int for large arrays/grids.
	const size_t count  = array_size > 0 ? (size_t)array_size : 0;
	const size_t stride = (size_t)gridDim.x * blockDim.x;

	for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
	     idx < count;
	     idx += stride)
	{
		dev_array[idx] = 1;
		// Variant from the original experiment (every thread hits one address):
		//dev_array[0] = 1;
	}
}

// Single-threaded baseline: one thread walks the whole array and sets every
// element to 1.
//
// Meant to be launched as <<<1,1>>>. Every launched thread executes the
// complete loop, so launching more threads only repeats the same writes.
//
// dev_array  - device pointer to at least array_size doubles.
// array_size - number of elements to write.
__global__ void kernel2(double* dev_array,int array_size)
{
	for(int i=0;i<array_size;i++)
	{
		dev_array[i] = 1;
		// Variant from the original experiment (always the same address):
		//dev_array[0] = 1;
	}
}


// Benchmark driver: fills a 100000-element double array on the device 10000
// times, first with the multi-threaded kernel and then with the
// single-threaded kernel2, and prints the elapsed time of each run in ms.
//
// Timing uses CUDA events, which are recorded on the GPU's own timeline.
// Host-side clock() reports CPU time of the process and therefore does not
// reliably reflect how long the device was busy.
int main()
{
	const int rounds     = 10000;
	const int array_size = 100000;
	float     elapsedTime = 0.0f;

/*
	Host (CPU) baseline kept from the original experiment; disabled.
	Needs <time.h> for clock() if re-enabled.

	double* array = new double[array_size];
	printf("0begin\n");
	clock_t cpu_start = clock();
	for(int r=0;r<rounds;r++)
	{
		for(int i=0;i<array_size;i++)
		{
			array[i] = 1;
		}
	}
	clock_t cpu_stop = clock();
	elapsedTime = (float)(cpu_stop - cpu_start) /
	              (float)CLOCKS_PER_SEC * 1000.0f;
	printf( "0 main Time to generate:  %3.1f ms\n", elapsedTime );
	delete []array;
*/

	double* dev_array = NULL;
	CUDA_CHECK(cudaMalloc(&dev_array, array_size * sizeof(double)));

	cudaEvent_t start, stop;
	CUDA_CHECK(cudaEventCreate(&start));
	CUDA_CHECK(cudaEventCreate(&stop));

	// One thread per element where possible. The block count is rounded up
	// and then capped; kernel() strides over the array, so both the rounded-up
	// tail and a capped grid are handled correctly.
	int threads = MAX_THREADS_PER_BLOCK;
	if (threads > array_size) threads = array_size;
	int blocks = (array_size + threads - 1) / threads;
	if (blocks > MAX_BLOCKS_PER_GRID) blocks = MAX_BLOCKS_PER_GRID;

	// ---- Test 1: multi-threaded fill ----
	printf("1begin\n");
	CUDA_CHECK(cudaEventRecord(start));
	for (int r = 0; r < rounds; r++)
	{
		kernel<<<blocks, threads>>>(dev_array, array_size);
	}
	// Launch-configuration errors are only reported through the last-error
	// state; execution errors surface at the synchronizing call below.
	CUDA_CHECK(cudaGetLastError());
	CUDA_CHECK(cudaEventRecord(stop));
	CUDA_CHECK(cudaEventSynchronize(stop));
	CUDA_CHECK(cudaEventElapsedTime(&elapsedTime, start, stop));
	printf( "1 main Time to generate:  %3.1f ms\n", elapsedTime );

	// ---- Test 2: single-threaded fill ----
	printf("2begin\n");
	CUDA_CHECK(cudaEventRecord(start));
	for (int r = 0; r < rounds; r++)
	{
		kernel2<<<1,1>>>(dev_array, array_size);
	}
	CUDA_CHECK(cudaGetLastError());
	CUDA_CHECK(cudaEventRecord(stop));
	CUDA_CHECK(cudaEventSynchronize(stop));
	CUDA_CHECK(cudaEventElapsedTime(&elapsedTime, start, stop));
	printf( "2 main Time to generate:  %3.1f ms\n", elapsedTime );

	CUDA_CHECK(cudaEventDestroy(start));
	CUDA_CHECK(cudaEventDestroy(stop));
	CUDA_CHECK(cudaFree(dev_array));
	return 0;
}