
A Simple, Direct CUDA Port

I did a straightforward CUDA port of the MNIST recognition program from the previous post, and the result was poor: one epoch went from roughly 5 seconds to 50 seconds.

That is understandable: I implemented each operation (mul, add, and so on) as a standalone device function, and because each one knows nothing about its calling context, every one of them has to end with __syncthreads(). A minimal sketch of the difference this makes is shown right below.
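To illustrate the point, here is a small sketch, not taken from the program below (the helper names and raw-pointer signatures are hypothetical): when each operation is its own device function, every call must end in a barrier, while a fused loop in which one thread owns one output row only needs a single barrier at the end.

// Illustrative only -- these helpers are not part of the program below.
// Standalone ops: each must end with __syncthreads(), because it cannot
// know whether the caller will immediately read values written by other threads.
__device__ void mul_add(const float *w, const float *x, const float *b,
                        float *z, int rows, int cols)
{
    for (int i = threadIdx.x; i < rows; i += blockDim.x) {
        float acc = 0.0f;
        for (int j = 0; j < cols; j++) acc += w[i * cols + j] * x[j];
        z[i] = acc + b[i];
    }
    __syncthreads();   // barrier #1
}

__device__ void relu(const float *z, float *a, int n)
{
    for (int i = threadIdx.x; i < n; i += blockDim.x)
        a[i] = z[i] >= 0.0f ? z[i] : 0.0f;
    __syncthreads();   // barrier #2
}

// Fused version: each thread handles row i from start to finish, so the
// intermediate barrier disappears; one barrier at the very end is enough.
__device__ void layer_forward_fused(const float *w, const float *x, const float *b,
                                    float *a, int rows, int cols)
{
    for (int i = threadIdx.x; i < rows; i += blockDim.x) {
        float acc = b[i];
        for (int j = 0; j < cols; j++) acc += w[i * cols + j] * x[j];
        a[i] = acc >= 0.0f ? acc : 0.0f;   // mul + add + ReLU in one pass
    }
    __syncthreads();
}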

Still, there were some takeaways: this was my first CUDA program, and in the end it does produce essentially correct results.

 

// Correction: the 5-second vs. 50-second comparison was wrong, because the two networks had different structures. After changing the earlier network to match the one used in the CUDA program, the numbers became 30 s vs. 50 s; the CUDA version is still slower.

 

#include <iostream>
#include <cstdlib>
#include <cassert>
#include <string>
#include <cstring>
#include <fstream>
#include <vector>
#include <memory>
#include <cuda_runtime.h>
#include <math_functions.h>
#include <cmath>
#include <ctime>
using namespace std;

void CheckCudaReturnCode(cudaError_t code, const char *fileName, int lineNo)
{
    if(code == cudaSuccess) return;
    cerr << "Cuda call failed at " << fileName << ":" << lineNo
        << " " << cudaGetErrorString(code) << endl;
    exit(-1);
}

#define CK(x) CheckCudaReturnCode((x), __FILE__, __LINE__)

// For simplicity, only a single block is used
#define BSIZE (blockDim.x)
#define TIDX (threadIdx.x)

bool InitCUDA()
{
    int count;
    cudaGetDeviceCount(&count);
    if(count == 0) {
        cerr << "There is no cuda device" << endl;
        return false;
    }
    cout << "Total " << count << " cuda devices" << endl;

    int i;
    for(i = 0;i < count;i++) {
        cudaDeviceProp prop;
        if(cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if(prop.major >= 1) {
                break;
            }
        }
    }

    if(i == count) {
        cerr << "There is no device supporting CUDA 1.x" << endl;
        return false;
    }

    cudaSetDevice(i);
    return true;
}

// http://www.cnblogs.com/yeahgis/archive/2012/07/13/2590485.html
// Gaussian random number with mean 0 and variance 1
double gaussrand()
{
    static double V1, V2, S;
    static int phase = 0;
    double X;

    if ( phase == 0 ) {
        do {
            double U1 = (double)rand() / RAND_MAX;
            double U2 = (double)rand() / RAND_MAX;

            V1 = 2 * U1 - 1;
            V2 = 2 * U2 - 1;
            S = V1 * V1 + V2 * V2;
        } while(S >= 1 || S == 0);

        X = V1 * sqrt(-2 * log(S) / S);
    } else
        X = V2 * sqrt(-2 * log(S) / S);

    phase = 1 - phase;

    return X;
}

#define ALIGN_FLOAT(x) (((x) + 3) & (~3))

template<size_t ROW, size_t COL>
struct Matrix
{
    const static int row = ROW;
    const static int col = COL;
    float data[ROW][ALIGN_FLOAT(COL)];
    __device__ __host__ inline float* operator[](size_t x)
    {
        assert(x < ROW);
        return data[x];
    }
};

template<size_t SIZE>
struct Vector
{
    const static int size = SIZE;
    float data[SIZE];
    __device__ __host__ inline float &operator[](size_t x)
    {
        assert(x < SIZE);
        return data[x];
    }
};

template<size_t SIZE>
ostream& operator<<(ostream& out, Vector<SIZE>& v)
{
    out << "[(" << v.size << ") ";
    for(int i = 0;i < v.size;i++) {
        if(i > 0) out << ",";
        out << v[i];
    }
    out << "]";
    return out;
}

// Matrix-vector multiplication: out = w * x
template<size_t ROW, size_t COL>
__device__ inline void mul(Matrix<ROW,COL> &w, Vector<COL> &x, Vector<ROW> &out)
{
    /* serial reference:
    for(int i = 0;i < w.row;i++) {
        out[i] = 0;
        for(int j = 0;j < w.col;j++) {
            out[i] += w[i][j] * x[j];
        }
    }
    */
    for(int i = TIDX;i < w.row;i += BSIZE) {
        out[i] = 0;
        for(int j = 0;j < w.col;j++) {
            out[i] += w[i][j] * x[j];
        }
    }
    __syncthreads();
}

// Element-wise vector product
template<size_t SIZE>
__device__ inline void dot(Vector<SIZE> &x, Vector<SIZE> &y, Vector<SIZE> &out)
{
    /* serial reference:
    for(int i = 0;i < x.size;i++) {
        out[i] = x[i] * y[i];
    }
    */
    for(int i = TIDX;i < x.size;i += BSIZE) {
        out[i] = x[i] * y[i];
    }
    __syncthreads();
}

// Multiply the transpose of w by v: out = w^T * v
template<size_t ROW, size_t COL>
__device__ inline void t_and_mul(Matrix<ROW, COL> &w, Vector<ROW> &v, Vector<COL> &out)
{
    /* serial reference:
    for(int i = 0;i < w.col;i++) {
        out[i] = 0;
        for(int j = 0;j < w.row;j++) {
            out[i] += w[j][i] * v[j];
        }
    }
    */
    // stride by BSIZE so each output column is handled by exactly one thread
    for(int i = TIDX;i < w.col;i += BSIZE) {
        out[i] = 0;
        for(int j = 0;j < w.row;j++) {
            out[i] += w[j][i] * v[j];
        }
    }
    __syncthreads();
}

template<size_t SIZE>
__device__ inline void add(Vector<SIZE> &x, Vector<SIZE> &y, Vector<SIZE> &out)
{
    /* serial reference:
    for(int i = 0;i < x.size;i++) {
        out[i] = x[i] + y[i];
    }
    */
    for(int i = TIDX;i < x.size;i += BSIZE) {
        out[i] = x[i] + y[i];
    }
    __syncthreads();
}

template<size_t SIZE>
__device__ inline void sub(Vector<SIZE> &x, Vector<SIZE> &y, Vector<SIZE> &out)
{
    /* serial reference:
    for(int i = 0;i < x.size;i++) {
        out[i] = x[i] - y[i];
    }
    */
    for(int i = TIDX;i < x.size;i += BSIZE) {
        out[i] = x[i] - y[i];
    }
    __syncthreads();
}

template<size_t SIZE>
__device__ inline void mul(float x, Vector<SIZE> &y, Vector<SIZE> &out)
{
    /* serial reference:
    for(int i = 0;i < y.size;i++) {
        out[i] = x * y[i];
    }
    */
    for(int i = TIDX;i < y.size;i += BSIZE) {
        out[i] = x * y[i];
    }
    __syncthreads();
}

template<size_t SIZE>
__device__ inline void mul(Vector<SIZE> &x, float y, Vector<SIZE> &out)
{
    mul(y, x, out);
}

template<size_t SIZE>
__device__ inline void copy(Vector<SIZE> &x, Vector<SIZE> &out)
{
    /* serial reference:
    for(int i = 0;i < x.size;i++) {
        out[i] = x[i];
    }
    */
    for(int i = TIDX;i < x.size;i += BSIZE) {
        out[i] = x[i];
    }
    __syncthreads();
}

// Sum the first `size` elements of the shared-memory buffer into shared[0]
__device__ inline float sum_of_shared(int size)
{
    extern __shared__ float shared[];

    __syncthreads();

    if(TIDX == 0) {
        for(int i = 1;i < size;i++) {
            shared[0] += shared[i];
        }
    }
    __syncthreads();

    float ret = shared[0];

    __syncthreads();

    return ret;
}

// Reduce the first `size` elements of the shared-memory buffer to their maximum
__device__ inline float max_of_shared(int size)
{
    extern __shared__ float shared[];

    __syncthreads();

    if(TIDX == 0) {
        for(int i = 1;i < size;i++) {
            if(shared[0] < shared[i]) {
                shared[0] = shared[i];
            }
        }
    }
    __syncthreads();

    float ret = shared[0];

    __syncthreads();

    return ret;
}


template<size_t SIZE>
__device__ inline float max(Vector<SIZE>& x)
{
    assert(x.size > 0);

    extern __shared__ float shared[];

    if(TIDX < x.size) {
        shared[TIDX] = x[TIDX];
    }

    for(int i = TIDX + BSIZE;i < x.size;i += BSIZE) {
        if(shared[TIDX] < x[i]) {
            shared[TIDX] = x[i];
        }
    }

    return max_of_shared(min(BSIZE, x.size));
}

template<size_t SIZE>
__device__ inline float sum(Vector<SIZE>& x)
{
    assert(x.size > 0);

    extern __shared__ float shared[];

    if(TIDX < x.size) {
        shared[TIDX] = 0;
    }

    for(int i = TIDX;i < x.size;i += BSIZE) {
        shared[TIDX] += x[i];
    }

    return sum_of_shared(min(BSIZE, x.size));
}

// out = x + xs * y
template<size_t SIZE>
__device__ inline void add_with_xs(Vector<SIZE> &x, float xs, Vector<SIZE> &y, Vector<SIZE> &out)
{
    /* serial reference:
    for(int i = 0;i < x.size;i++) {
        out[i] = x[i] + xs * y[i];
    }
    */
    for(int i = TIDX;i < x.size;i += BSIZE) {
        out[i] = x[i] + xs * y[i];
    }

    __syncthreads();
}

template<size_t SIZE>
struct Softmax
{
    __device__ static inline float calc(Vector<SIZE>& x, Vector<SIZE>& y)
    {
        // - \sum y_j * log( exp(x_j) / \sum exp(x_k) )
        /*
          log( exp(x_j) / \sum exp(x_k) )
        = x_j - log \sum exp(x_k)
        = x_j - (max + log \sum exp(x_k - max))
        */

        float maxX = max(x);

        /* serial reference:
        float xSum = 0;
        for(int i = 0;i < x.size;i++) {
            xSum += expf(x[i] - maxX);
        }
        */

        extern __shared__ float shared[];
        if(TIDX < x.size) {
            shared[TIDX] = 0;
        }

        for(int i = TIDX;i < x.size;i += BSIZE) {
            shared[TIDX] += expf(x[i] - maxX);
        }

        float xSum = sum_of_shared(min(BSIZE, x.size));

        /* serial reference:
        float ret = 0;
        for(int i = 0;i < x.size;i++) {
            ret += y[i] * (x[i] - (maxX + logf(xSum)));
        }
        */
        if(TIDX < x.size) {
            shared[TIDX] = 0;
        }
        for(int i = TIDX;i < x.size;i += BSIZE) {
            shared[TIDX] += y[i] * (x[i] - (maxX + logf(xSum)));
        }

        float ret = sum_of_shared(min(BSIZE, x.size));

        return -ret;
    }


    static inline float calc_host(Vector<SIZE>& x, Vector<SIZE>& y)
    {
        // - \sum y_j * log( exp(x_j) / \sum exp(x_k) )
        /*
          log( exp(x_j) / \sum exp(x_k) )
        = x_j - log \sum exp(x_k)
        = x_j - (max + log \sum exp(x_k - max))
        */

        float maxX = x[0];
        for(int i = 1;i < x.size;i++) {
            if(x[i] > maxX) {
                maxX = x[i];
            }
        }

        float xSum = 0;
        for(int i = 0;i < x.size;i++) {
            xSum += expf(x[i] - maxX);
        }

        float ret = 0;
        for(int i = 0;i < x.size;i++) {
            ret += y[i] * (x[i] - (maxX + logf(xSum)));
        }

        return -ret;
    }

    __device__ static inline void propagate_delta(Vector<SIZE> &x, Vector<SIZE> &y, Vector<SIZE> &out)
    {
        /*
          - d \sum y_j * log( exp(x_j) / \sum exp(x_k) )
        = - d \sum y_j * x_j - d \sum y_j log (\sum exp(x_k) )
        = - y_i + \sum (y_j * exp(x_i) / \sum exp(x_k))
        = - y_i + exp(x_i) (\sum y_j) / (\sum exp(x_k))
        */

        float maxX = max(x);

        // -y + exp(x) * sum_of_y / sum_of_exp(x)

        /* serial reference:
        float sumOfY = 0;
        float sumOfX = 0;

        for(int i = 0;i < x.size;i++) {
            out[i] = expf(x[i] - maxX);
            sumOfY += y[i];
            sumOfX += out[i];
        }
        */

        for(int i = TIDX;i < x.size;i += BSIZE) {
            out[i] = expf(x[i] - maxX);
        }

        float sumOfY = sum(y);
        float sumOfX = sum(out);

        float t = sumOfY/sumOfX;

        mul(t, out, out);
        sub(out, y, out);
    }
};

template<size_t SIZE>
struct Relu
{
    __device__ static inline void forward(Vector<SIZE> &x, Vector<SIZE> &out)
    {
        /* serial reference:
        for(int i = 0;i < x.size;i++) {
            out[i] = x[i] >= 0 ? x[i] : 0;
        }
        */
        for(int i = TIDX;i < x.size;i += BSIZE) {
            out[i] = x[i] >= 0 ? x[i] : 0;
        }
        __syncthreads();
    }

    __device__ static inline void derive(Vector<SIZE> &x, Vector<SIZE> &out)
    {
        /* serial reference:
        for(int i = 0;i < x.size;i++) {
            out[i] = x[i] >= 0 ? 1 : 0;
        }
        */
        for(int i = TIDX;i < x.size;i += BSIZE) {
            out[i] = x[i] >= 0 ? 1 : 0;
        }
        __syncthreads();
    }

    __device__ static inline void derive_and_dot_into(Vector<SIZE> &x, Vector<SIZE> &out)
    {
        // out = dot(dx, out)
        /* serial reference:
        for(int i = 0;i < x.size;i++) {
            out[i] = out[i] * (x[i] >= 0 ? 1 : 0);
        }
        */
        for(int i = TIDX;i < x.size;i += BSIZE) {
            out[i] = out[i] * (x[i] >= 0 ? 1 : 0);
        }
        __syncthreads();
    }
};

// One layer of the neural network
// 1. The input is not counted as a layer
// 2. The layer's w matrix maps the previous layer to this layer, which differs slightly from NG's definition
// 3. The layer's b likewise maps the previous layer to this layer, which differs slightly from NG's definition
template <size_t IN_SIZE, size_t OUT_SIZE, typename Activator=Relu<OUT_SIZE> >
struct Layer
{
    typedef Activator ActivatorType;

    // Number of outputs of the previous layer, not counting the bias
    const static int inSize = IN_SIZE;
    // Number of outputs of this layer
    const static int outSize = OUT_SIZE;

    Matrix<OUT_SIZE, IN_SIZE> w;
    Vector<OUT_SIZE> b;
    // Activations saved by the most recent forward pass
    Vector<OUT_SIZE> a;
    Vector<OUT_SIZE> z;
    // Delta values saved by the most recent backward pass
    Vector<OUT_SIZE> delta;

    void initWeights()
    {
        for(int i = 0;i < b.size;i++) {
            b[i] = float(gaussrand() * 0.01);
        }
        for(int i = 0;i < w.row;i++) {
            for(int j = 0;j < w.col;j++) {
                w[i][j] = float(gaussrand() * 0.01);
            }
        }
    }

    Layer()
    {
        initWeights();
    }

    __device__ inline void calc(Vector<IN_SIZE> &in)
    {
        // w * in + b
        mul(w, in, z);
        add(z, b, z);
        Activator::forward(z, a);
    }

    __device__ inline void propagate_delta(Vector<IN_SIZE> &out)
    {
        t_and_mul(w, delta, out);
    }

    // alpha is the learning rate
    // prevA is the previous layer's output
    __device__ inline void update_parameters(float alpha, Vector <IN_SIZE> &prevA)
    {
        // b = b - alpha * delta
        add_with_xs(b, -alpha, delta, b);

        /* serial reference:
        for(int i = 0;i < w.row;i++) {
            for(int j = 0;j < w.col;j++) {
                w[i][j] = w[i][j] - alpha * prevA[j] * delta[i];
            }
        }
        */
        for(int i = TIDX;i < w.row;i += BSIZE) {
            for(int j = 0;j < w.col;j++) {
                w[i][j] = w[i][j] - alpha * prevA[j] * delta[i];
            }
        }
        __syncthreads();
    }
};

// model
#define INPUT_SIZE (28 * 28)
#define OUTPUT_SIZE 10

typedef Layer<INPUT_SIZE, 100, Relu<100> > L1_TYPE;

typedef Layer<100, OUTPUT_SIZE, Relu<OUTPUT_SIZE> > L2_TYPE;

#define LN_TYPE L2_TYPE
#define LN L2

__global__ void forward(
    L1_TYPE &L1, L2_TYPE &L2,
    Vector<INPUT_SIZE> &input, Vector<OUTPUT_SIZE> &output)
{
    L1.calc(input);

    L2.calc(L1.a);

    copy(L2.a, output);
}

template<typename CostFun>
__global__ void backward(
    L1_TYPE &L1, L2_TYPE &L2,
    Vector<INPUT_SIZE> &input, Vector<OUTPUT_SIZE> &y, float alpha)
{
    // Last layer
    CostFun::propagate_delta(LN.a, y, LN.delta);
    LN_TYPE::ActivatorType::derive_and_dot_into(LN.a, LN.delta);

    // Remaining layers
    L2.propagate_delta(L1.delta);
    L1_TYPE::ActivatorType::derive_and_dot_into(L1.a, L1.delta);

    // Update all w and b
    L1.update_parameters(alpha, input);
    L2.update_parameters(alpha, L1.a);
}

// Read a big-endian (MSB-first) integer
int MsbInt(char buf[], int len=4)
{
    int base = 1;
    int ret = 0;
    for(int i = len - 1;i >= 0;i--) {
        ret += (unsigned char)buf[i] * base;
        base *= 256;
    }
    return ret;
}

vector<int> ReadMnistLabels(string fileName)
{
    vector<int> ret;
    ifstream ifs(fileName.c_str(), ios::binary);
    char buf[1000];

    // MAGIC
    ifs.read(buf, 4);
    int magic = MsbInt(buf);
    if(magic != 0x00000801) {
        cerr << "incorrect label file magic number" << endl;
    }

    // num of images
    ifs.read(buf, 4);
    int nImages = MsbInt(buf);

    while(nImages--) {
        ret.push_back(ifs.get());
    }

    return ret;
}

Vector<INPUT_SIZE> * ReadMnistData(string fileName)
{
    ifstream ifs(fileName.c_str(), ios::binary);
    char buf[1000];

    // MAGIC
    ifs.read(buf, 4);
    int magic = MsbInt(buf);
    if(magic != 0x00000803) {
        cerr << "incorrect data file magic number" << endl;
    }

    // num of images
    ifs.read(buf, 4);
    int nImages = MsbInt(buf);
    Vector<INPUT_SIZE> * ret = new Vector<INPUT_SIZE>[nImages];

    int row, col;
    ifs.read(buf, 4);
    row = MsbInt(buf);
    ifs.read(buf, 4);
    col = MsbInt(buf);
    if(row * col != INPUT_SIZE) {
        cerr << "incorrect image size" << endl;
    }

    for(int k = 0;k < nImages;k++) {
        Vector<INPUT_SIZE> &image = ret[k];
        for(int i = 0;i < row * col;i++) {
            image[i] = ifs.get() / 256.0; // normalize to [0, 1)
        }
    }

    return ret;
}

// Convert integer labels to one-hot vectors
Vector<OUTPUT_SIZE>* translateLabels(vector<int> &labels, int k=10)
{
    int n = labels.size();
    Vector<OUTPUT_SIZE> * ret = new Vector<OUTPUT_SIZE>[n];

    for(int i = 0;i < labels.size();i++) {
        Vector<OUTPUT_SIZE> &tmp = ret[i];
        memset(&tmp, 0, sizeof(tmp));
        assert(labels[i] >= 0 && labels[i] < k);
        tmp[labels[i]] = 1;
    }
    return ret;
}

int getMaxIdx(Vector<OUTPUT_SIZE>& x)
{
    int maxIdx = 0;
    float maxV = x[0];
    for(int i = 0;i < x.size;i++) {
        if(x[i] > maxV) {
            maxV = x[i];
            maxIdx = i;
        }
    }
    return maxIdx;
}

template <typename T>
void CUDA_ALLOC_AND_COPY(T *&to, T *from, size_t size)
{
    CK(cudaMalloc((void**)&to, size));
    CK(cudaMemcpy(to, from, size, cudaMemcpyHostToDevice));
}

int main()
{
    srand(1000);

    if(!InitCUDA()) {
        return -1;
    }

    L1_TYPE *tmpL1 = new L1_TYPE(), *L1;
    CUDA_ALLOC_AND_COPY(L1, tmpL1, sizeof(*L1));
    delete tmpL1;

    L2_TYPE *tmpL2 = new L2_TYPE(), *L2;
    CUDA_ALLOC_AND_COPY(L2, tmpL2, sizeof(*L2));
    delete tmpL2;

    cout << "Loading data" << endl;

    // Load the data
    vector<int> trainLabels = ReadMnistLabels("mnist/train-labels-idx1-ubyte");
    int nTrain = trainLabels.size();
    Vector<OUTPUT_SIZE>* trainLabels2 = translateLabels(trainLabels);
    Vector<OUTPUT_SIZE>* trainLabels2OnGpu;
    CUDA_ALLOC_AND_COPY(trainLabels2OnGpu, trainLabels2, sizeof(trainLabels2[0]) * nTrain);

    Vector<INPUT_SIZE>* trainData = ReadMnistData("mnist/train-images-idx3-ubyte");
    Vector<INPUT_SIZE>* trainDataOnGpu;
    CUDA_ALLOC_AND_COPY(trainDataOnGpu, trainData, sizeof(trainData[0]) * nTrain);

    vector<int> testLabels = ReadMnistLabels("mnist/t10k-labels-idx1-ubyte");
    int nTest = testLabels.size();

    Vector<INPUT_SIZE>* testData = ReadMnistData("mnist/t10k-images-idx3-ubyte");
    Vector<INPUT_SIZE>* testDataOnGpu;
    CUDA_ALLOC_AND_COPY(testDataOnGpu, testData, sizeof(testData[0]) * nTest);

    int M = nTrain;
    int T = nTest;

    typedef Softmax<OUTPUT_SIZE> CostFun;

    // Start training
    cout << "Start training" << endl;
    clock_t fullStartedAt = clock();

    Vector<OUTPUT_SIZE> *outputOnCuda;
    CK(cudaMalloc((void**)&outputOnCuda, sizeof(*outputOnCuda)));

    Vector<OUTPUT_SIZE> output;

    // For simplicity, only a single block is supported,
    // but multiple threads are allowed
    #define N_BLOCK 1
    #define N_THREAD 1024
    #define SHARED_SIZE (sizeof(float) * N_THREAD)

    for(int step = 0;step < 100000;step++) {
        clock_t step_1 = clock();

        float avgError = 0;

        for(int i = 0;i < M;i++) {
            Vector<INPUT_SIZE> &x = trainDataOnGpu[i];
            Vector<OUTPUT_SIZE> &y = trainLabels2OnGpu[i];

            forward<<<N_BLOCK, N_THREAD, SHARED_SIZE>>>(*L1, *L2, x, *outputOnCuda);

            CK(cudaMemcpy(&output, outputOnCuda, sizeof(output), cudaMemcpyDeviceToHost));

            Vector<OUTPUT_SIZE> &hostY = trainLabels2[i];
            float error = CostFun::calc_host(output, hostY);
            avgError += error;
            //cout << output << " " << hostY << endl;

            backward<CostFun><<<N_BLOCK, N_THREAD, SHARED_SIZE>>>(*L1, *L2, x, y, 0.001);
        }
        avgError /= M;

        clock_t step_2 = clock();

        cout << "step=" << step << " time_cost=" << (step_2 - step_1)*1.0/CLOCKS_PER_SEC << " avgErr=" << avgError << " ";

        // validate
        int nTotal = 0;
        int nGood = 0;
        for(int i = 0;i < M;i++) {
            Vector<INPUT_SIZE> &x = trainDataOnGpu[i];

            forward<<<N_BLOCK, N_THREAD, SHARED_SIZE>>>(*L1, *L2, x, *outputOnCuda);

            CK(cudaMemcpy(&output, outputOnCuda, sizeof(output), cudaMemcpyDeviceToHost));

            int maxIdx = getMaxIdx(output);
            if(maxIdx == trainLabels[i]) {
                nGood++;
            }
            nTotal++;
        }
        cout << "train_accuracy " << nGood << "/" << nTotal << "=" << nGood*1.0/nTotal << " ";
        bool doBreak = false;
        if(nGood * 1.0 / nTotal > 0.95) {
            doBreak = true;
        }

        // check
        nTotal = 0;
        nGood = 0;
        for(int i = 0;i < T;i++) {
            Vector<INPUT_SIZE> &x = testDataOnGpu[i];

            forward<<<N_BLOCK, N_THREAD, SHARED_SIZE>>>(*L1, *L2, x, *outputOnCuda);

            CK(cudaMemcpy(&output, outputOnCuda, sizeof(output), cudaMemcpyDeviceToHost));

            int maxIdx = getMaxIdx(output);
            if(maxIdx == testLabels[i]) {
                nGood++;
            }
            nTotal++;
        }
        cout << "test_accuracy " << nGood << "/" << nTotal << "=" << nGood*1.0/nTotal << " ";

        clock_t step_3 = clock();
        cout << "total_time=" << (step_3-step_1)*1.0/CLOCKS_PER_SEC << endl;
        if(doBreak) {
            break;
        }
    }

    clock_t fullEndedAt = clock();
    cout << "Total cost " << (fullEndedAt - fullStartedAt)/CLOCKS_PER_SEC << " seconds" << endl;

    return 0;
}
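For reference, the math that the comments inside Softmax::calc and Softmax::propagate_delta sketch, written out in LaTeX (standard softmax cross-entropy with the max subtracted for numerical stability):

L(x, y) = -\sum_j y_j \log \frac{e^{x_j}}{\sum_k e^{x_k}}
        = -\sum_j y_j \left( x_j - m - \log \sum_k e^{x_k - m} \right), \qquad m = \max_k x_k

\frac{\partial L}{\partial x_i} = \frac{e^{x_i}}{\sum_k e^{x_k}} \Big( \sum_j y_j \Big) - y_i

For one-hot labels (\sum_j y_j = 1) the gradient reduces to \mathrm{softmax}(x)_i - y_i, which is exactly what propagate_delta computes: scale the exponentials by sumOfY/sumOfX, then subtract y.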

 
