首页 > 代码库 > 20140822_BP神经网络

20140822_BP神经网络

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cublas_v2.h"
#include "math.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cassert>
#include <time.h>
#ifndef nullptr
#define nullptr 0
#endif

#define BLOCK_NUM 6
#define THREAD_NUM 256


//#define SAMPLE_NUM 300
#define LAYERS 3
#define Num_In 30
#define Num_Hide 50
#define Num_Out 5

#define ALFA 0.85
#define Beta 0
#define ACCURACY = 0.005; //网络要求精度
#define ITER_MAX = 500; //最大循环次数

#define CUDA_CALL(x) {const cudaError_t a=(x); if(a!= cudaSuccess) {printf("\nCUDA ERROR:%s(err_num = %d)\n",cudaGetErrorString(a),a); cudaDeviceReset(); assert(0);}}


void Init(float* weight0, const int w0_Num, float* weight1, const int w1_Num, float* weight2, const int w2_Num)
{
assert(weight0!=nullptr);
assert(weight1!=nullptr);
assert(weight2!=nullptr);

srand( (unsigned)time(NULL));

for(size_t i = 0; i<w0_Num; ++i)
{
weight0[i] = (float)(rand()%1000)/1000.0f;
}

for(size_t i = 0; i<w1_Num; ++i)
{
weight1[i] = (float)(rand()%1000)/1000.0f;
}

for(size_t i = 0; i<w2_Num; ++i)
{
weight2[i] = (float)(rand()%1000)/1000.0f;
}
}

int main()
{
float* Samples;
int Samples_Num;
float* Targets;
/*
read Data
*/
float* weight0 = (float*) malloc(sizeof(float)*Num_In*Num_In);
float* weight1 = (float*) malloc(sizeof(float)*Num_In*Num_Hide);
float* weight2 = (float*) malloc(sizeof(float)*Num_Hide*Num_Out);

float* bias0 = (float*)malloc(sizeof(float)*Num_In);
float* bias1 = (float*)malloc(sizeof(float)*Num_Hide);
float* bias2 = (float*)malloc(sizeof(float)*Num_Out);

memset( bias0,0,sizeof(float)*Num_In);
memset( bias1,0,sizeof(float)*Num_Hide);
memset( bias2,0,sizeof(float)*Num_Out);

Init(weight0, Num_In*Num_In , weight1, Num_In*Num_Hide, weight2, Num_Hide*Num_Out);

 

free(weight0);
free(weight1);
free(weight2);

free(bias0);
free(bias1);
free(bias2);

return 0;
}

 

void BP_Train( float* Samples, int Samples_Num, float* Targets,
float* weight0,float* weight1, float* weight2,
float* bias0,float* bias1,float* bias2)
{
float mse = 0xFFFFFFFF;

float* gpuSamples = nullptr;
float* gpuTargets = nullptr;
CUDA_CALL(cudaMalloc((void**)&gpuSamples,sizeof(float)*Samples_Num*Num_In));
CUDA_CALL(cudaMalloc((void**)&gpuTargets,sizeof(float)*Samples_Num));

float* gpuWeight0 = nullptr;
float* gpuWeight1 = nullptr;
float* gpuWeight2 = nullptr;
CUDA_CALL(cudaMalloc((void**)&gpuWeight0,sizeof(float)*Num_In*Num_In));
CUDA_CALL(cudaMalloc((void**)&gpuWeight1,sizeof(float)*Num_In*Num_Hide));
CUDA_CALL(cudaMalloc((void**)&gpuWeight2,sizeof(float)*Num_Hide*Num_Out));

float* gpuBias0 = nullptr;
float* gpuBias1 = nullptr;
float* gpuBias2 = nullptr;
CUDA_CALL(cudaMalloc((void**)&gpuBias0,sizeof(float)*Num_In));
CUDA_CALL(cudaMalloc((void**)&gpuBias1,sizeof(float)*Num_Hide));
CUDA_CALL(cudaMalloc((void**)&gpuBias2,sizeof(float)*Num_Out));

float* gpuErr0 = nullptr;
float* gpuErr1 = nullptr;
float* gpuErr2 = nullptr;
CUDA_CALL(cudaMalloc((void**)&gpuErr0,sizeof(float)*Num_In));
CUDA_CALL(cudaMalloc((void**)&gpuErr1,sizeof(float)*Num_Hide));
CUDA_CALL(cudaMalloc((void**)&gpuErr2,sizeof(float)*Num_Out));

float* O0 = nullptr;
float* O1 = nullptr;
float* O2 = nullptr;
CUDA_CALL(cudaMalloc((void**)&O0,sizeof(float)*Num_In));
CUDA_CALL(cudaMalloc((void**)&O1,sizeof(float)*Num_Hide));
CUDA_CALL(cudaMalloc((void**)&O2,sizeof(float)*Num_Out));

CUDA_CALL(cudaMemcpy(gpuWeight0, weight0, sizeof(float)*Num_In*Num_In, cudaMemcpyHostToDevice));
CUDA_CALL(cudaMemcpy(gpuWeight1, weight1, sizeof(float)*Num_In*Num_Hide, cudaMemcpyHostToDevice));
CUDA_CALL(cudaMemcpy(gpuWeight2, weight2, sizeof(float)*Num_Hide*Num_Out, cudaMemcpyHostToDevice));

CUDA_CALL(cudaMemcpy(gpuBias0,bias0,sizeof(float)*Num_In,cudaMemcpyHostToDevice));
CUDA_CALL(cudaMemcpy(gpuBias1,bias1,sizeof(float)*Num_Hide,cudaMemcpyHostToDevice));
CUDA_CALL(cudaMemcpy(gpuBias2,bias2,sizeof(float)*Num_Out,cudaMemcpyHostToDevice));

 

 

cudaFree(gpuSamples);
cudaFree(gpuTargets);

cudaFree(gpuWeight0);
cudaFree(gpuWeight1);
cudaFree(gpuWeight2);

cudaFree(gpuBias0);
cudaFree(gpuBias1);
cudaFree(gpuBias2);

cudaFree(gpuErr0);
cudaFree(gpuErr1);
cudaFree(gpuErr2);

cudaFree(O0);
cudaFree(O1);
cudaFree(O2);
}

__global__ void SIGMOD(float* IO, float*Bias, const int NodeNum)
{
int idx = threadIdx.x;
if(idx < NodeNum) // NodeNum<THREAD_NUM 时不用考虑多块
{
IO[idx] = 1/(1+exp(-IO[idx]-Bias[idx]));
}
}


//前向传播一个样本pSamples,计算结果O0,O1,O2
void FeedForward( float* O0,float* O1,float* O2, float* pSamples,
float* weight0, float* weight1, float* weight2,
float* bias0, float* bias1, float* bias2)
{
cublasHandle_t handle;
cublasStatus_t ret;
ret = cublasCreate(&handle);
if (ret != CUBLAS_STATUS_SUCCESS){printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__);exit(EXIT_FAILURE);}

const float alpha = 1.0f;
const float beta = 0.0f;

//输入层到第一层 + bias0 得到结果经过激活函数得到O0
ret = cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_T, Num_In, 1, Num_In, &alpha, weight0, Num_In, pSamples, 1, &beta, O0, Num_In);
//ret = cublasSaxpy(handle, Num_In,&alpha,bias0,1,O0,1);
SIGMOD<<<1,THREAD_NUM>>>(O0,bias0,Num_In);

//第一层到第二层 + bias1 得到结果经过激活函数得到O1
ret = cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, Num_Hide, 1, Num_In, &alpha, weight1, Num_In, O0, Num_In, &beta, O1, Num_Hide);
//ret = cublasSaxpy(handle, Num_Hide,&alpha,bias1,1,O1,1);
SIGMOD<<<1,THREAD_NUM>>>(O1,bias1,Num_Hide);

//第二层到第三层 + bias2 得到结果经过激活函数得到O2
ret = cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, Num_Out, 1, Num_Hide, &alpha, weight2, Num_Hide, O1, Num_Hide, &beta, O2, Num_Out);
//ret = cublasSaxpy(handle, Num_Out,&alpha,bias2,1,O2,1);
SIGMOD<<<1,THREAD_NUM>>>(O2,bias2,Num_Out);

ret = cublasDestroy(handle);
if (ret != CUBLAS_STATUS_SUCCESS){printf("cublasDestroy returned error code %d, line(%d)\n", ret, __LINE__);exit(EXIT_FAILURE);}

}

//反向传播一个样本pSamples,调整weight0,weight1,weight2,bias0,bias1,bias2
__global__ void BackPropagation( float* O0,float* O1,float* O2, float* Targets,
float* weight0, float* weight1, float* weight2,
float* bias0, float* bias1, float* bias2,
float* err0,float* err1, float* err2)
{
int idx = blockIdx.x + threadIdx.x;

//修改最后一层的误差项和偏差项
if(idx<Num_Out)
{
err2[idx] = (Targets[idx]- O2[idx])*O2[idx]*(1-O2[idx]);
bias2[idx] += ALFA*err2[idx];
}
//__syncthreads(); Num_Out<32 不用同步

//修改最后一层的权值
if(idx< Num_Out*Num_Hide)
{
int r = (int) (idx/Num_Hide);
weight2[idx] += ALFA*err2[r]*O1[idx - r*Num_Hide];
}
__syncthreads();

//修改隐藏层的误差项和偏差项
if(idx<Num_Hide)
{
err1[idx] = 0;
for(int i =0 ; i< Num_Out; i++)
{
err1[idx] += err2[i]*weight2[i*Num_Hide+idx];
}

bias1[idx] += ALFA*err1[idx];
err1[idx] *= O1[idx]*(1-O1[idx]);

}
__syncthreads();

//修改隐藏层的权值
for(int i = 0; i< Num_Hide*Num_In; ++i)
{
int r = (int)(idx/Num_In);
weight1[idx] += ALFA*err1[r]*O1[idx - r*Num_In];
}
__syncthreads();

//修改输入层的误差项和偏差项
if(idx<Num_In)
{
err0[idx] = 0;
for(int i=0; i<Num_Hide; ++i)
{
err0[idx] += err1[i]*weight1[i*Num_In+ idx];
}
bias0[idx] += ALFA*err0[idx];
err0[idx] *= O0[idx]*(1-O0[idx]);
}
//__syncthreads(); Num_In<32 不用同步

for(int i =0; i<Num_In*Num_In; ++i)
{
int r = (int)(idx/Num_In);
weight0[idx] += ALFA*err0[r]*O0[idx - r*Num_In] ;
}
__syncthreads();
}

__global__ void Add_MSE(float* mse, float* O, float * Targets)
{
int idx = threadIdx.x;
if(idx<Num_Out)
{
atomicAdd(mse, sqrt(O[idx]-Targets[idx]));
}
}

//用训练好的模型weight0,weight1,weight2,bias0,bias1,bias2,计算网络对于所有样本Samples的均方误差
float MSE( float* Samples,int Samples_Num, float* Targets,
float* weight0, float* weight1, float* weight2,
float* bias0, float* bias1, float* bias2)
{
float mse = 0.0;

float Output[Num_Out];
float* O0 = nullptr;
float* O1 = nullptr;
float* O2 = nullptr;
CUDA_CALL(cudaMalloc((void**)&O0,sizeof(float)*Num_In));
CUDA_CALL(cudaMalloc((void**)&O1,sizeof(float)*Num_Hide));
CUDA_CALL(cudaMalloc((void**)&O2,sizeof(float)*Num_Out));

cublasHandle_t handle;
cublasStatus_t ret;
ret = cublasCreate(&handle);
if (ret != CUBLAS_STATUS_SUCCESS){printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__);exit(EXIT_FAILURE);}

const float alpha = 1.0f;
const float beta = 0.0f;

for(int i = 0; i<Samples_Num; ++i)
{
ret = cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_T, Num_In, 1, Num_In, &alpha, weight0, Num_In, Samples+ i*Num_In, 1, &beta, O0, Num_In);
SIGMOD<<<1,THREAD_NUM>>>(O0,bias0,Num_In);
ret = cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, Num_Hide, 1, Num_In, &alpha, weight1, Num_In, O0, Num_In, &beta, O1, Num_Hide);
SIGMOD<<<1,THREAD_NUM>>>(O1,bias1,Num_Hide);
ret = cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, Num_Out, 1, Num_Hide, &alpha, weight2, Num_Hide, O1, Num_Hide, &beta, O2, Num_Out);
SIGMOD<<<1,THREAD_NUM>>>(O2,bias2,Num_Out);

}


ret = cublasDestroy(handle);
if (ret != CUBLAS_STATUS_SUCCESS){printf("cublasDestroy returned error code %d, line(%d)\n", ret, __LINE__);exit(EXIT_FAILURE);}

cudaFree(O0);
cudaFree(O1);
cudaFree(O2);
}


/*
cudaEvent_t start,stop;
CUDA_CALL( cudaEventCreate(&start));
CUDA_CALL( cudaEventCreate(&stop));
CUDA_CALL(cudaEventRecord(start, NULL));
*/

20140822_BP神经网络