
MNIST on CUDA, sped up

My earlier CUDA version was, surprisingly, slower than the CPU.

After quite a few attempts, I took away two lessons:

1. CUDA kernels don't need to be huge monoliths. Once a kernel grows large, the block and thread counts become hard to tune (my earlier version never even used more than one block), and data management gets very complicated. Making the kernels fine-grained doesn't seem to cost much either; a standalone sketch of this style follows the list.

2. Training has to be done in batches. The most important effect is that batching greatly increases the available data parallelism.
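To make lesson 1 concrete, here is a minimal, self-contained sketch of the fine-grained style: one thread per output element, a block count derived from the problem size. It is not part of the program below, and the kernel and variable names (k_add_bias and so on) are made up for illustration.

// Fine-grained kernel sketch: each thread owns exactly one (batch, column)
// output element; the launch size scales with batch * col.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void k_add_bias(const float *bias, float *out, size_t col, size_t batch)
{
    size_t gidx = blockDim.x * blockIdx.x + threadIdx.x;
    size_t b = gidx / col;
    size_t j = gidx % col;
    if(b >= batch) return;
    out[b * col + j] += bias[j];
}

int main()
{
    const size_t batch = 256, col = 100, n = batch * col;
    float *bias, *out;
    cudaMallocManaged(&bias, col * sizeof(float));
    cudaMallocManaged(&out, n * sizeof(float));
    for(size_t j = 0; j < col; j++) bias[j] = 1.0f;
    for(size_t i = 0; i < n; i++) out[i] = 0.0f;

    // Fix a thread count per block (a multiple of the 32-thread warp size),
    // then let the block count grow with the amount of data parallelism.
    size_t threads = 256;
    size_t blocks = (n + threads - 1) / threads;
    k_add_bias<<<blocks, threads>>>(bias, out, col, batch);
    cudaDeviceSynchronize();

    printf("out[0]=%f out[n-1]=%f\n", out[0], out[n - 1]); // both should print 1.0
    cudaFree(bias);
    cudaFree(out);
    return 0;
}

Because each thread handles a single output element, a larger batch directly becomes a larger grid, which is exactly the data parallelism that lesson 2 is about.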

With the code below, the GPU reaches close to a 20x speedup over the CPU, and the code is still not complicated.

#include <iostream>
#include <cstdlib>
#include <cassert>
#include <string>
#include <cstring>
#include <fstream>
#include <vector>
#include <memory>
#include <cmath>
#include <ctime>
using namespace std;

#define CPU 0
#define GPU 1

#ifndef GPU
#define __host__
#define __device__
#define expf exp
#define logf log
#endif

#ifdef GPU
#include <cuda_runtime.h>
#include <math_functions.h>
void CheckCudaReturnCode(cudaError_t code, const char *fileName, int lineNo)
{
    if(code == cudaSuccess) return;
    cerr << "Cuda call failed at " << fileName << ":" << lineNo
        << " " << cudaGetErrorString(code) << endl;
    exit(-1);
}
#define CK(x) CheckCudaReturnCode((x), __FILE__, __LINE__)

inline size_t get_thread_cnt(size_t size)
{
    size = min(size, 1024UL);

    // round up to a multiple of 32 (the warp size)
    size += 31;
    size &= ~31;
    return size;
}

inline size_t ceil(size_t x, size_t y)
{
    return (x + y - 1) / y;
}
#endif

// http://www.cnblogs.com/yeahgis/archive/2012/07/13/2590485.html
// Gaussian random number with mean 0 and variance 1 (Marsaglia polar method)
double gaussrand()
{
    static double V1, V2, S;
    static int phase = 0;
    double X;

    if ( phase == 0 ) {
        do {
            double U1 = (double)rand() / RAND_MAX;
            double U2 = (double)rand() / RAND_MAX;

            V1 = 2 * U1 - 1;
            V2 = 2 * U2 - 1;
            S = V1 * V1 + V2 * V2;
        } while(S >= 1 || S == 0);

        X = V1 * sqrt(-2 * log(S) / S);
    } else
        X = V2 * sqrt(-2 * log(S) / S);

    phase = 1 - phase;

    return X;
}

float gaussrand_f()
{
    return float(gaussrand() * 0.01);
}

template<size_t IS_GPU>
struct Matrix {};

template<>
struct Matrix<CPU>
{
    size_t row, col, size;
    float *data;

    Matrix(size_t _row, size_t _col) :
        row(_row), col(_col), size(_row * _col)
        {
            assert(row > 0 && col > 0);
            data = (float *)malloc(size * sizeof(float));
            assert(data);
            memset(data, 0, row * col * sizeof(float));
        }

    Matrix(size_t _row, size_t _col, float *_data) :
        row(_row), col(_col), size(_row * _col), data(_data)
        {
            assert(row > 0 && col > 0 && data);
        }

    // view of rows [from, to); shares the underlying storage
    Matrix splice(size_t from, size_t to)
    {
        assert(from < to && to <= row);
        size_t offset = from * col;

        Matrix<CPU> ret(to - from, col, data + offset);
        return ret;
    }

    string shape() const {
        char buf[100];
        sprintf(buf, "%d x %d", (int)row, (int)col);
        return buf;
    }

    void free()
    {
        assert(data);
        ::free(data);
        data = NULL;
    }

    void init(float v)
    {
        for(int i = 0;i < row;i++) {
            for(int j = 0;j < col;j++) {
                (*this)[i][j] = v;
            }
        }
    }

    void init_gauss()
    {
        for(int i = 0;i < row;i++) {
            for(int j = 0;j < col;j++) {
                (*this)[i][j] = gaussrand_f();
            }
        }
    }

    float *operator[](size_t i) {
        assert(i < row);
        return data + i * col;
    }

    const float *operator[] (size_t i) const {
        assert(i < row);
        return data + i * col;
    }

    void assert_eq(const Matrix& rhs)
    {
        assert(row == rhs.row);
        assert(col == rhs.col);
        for(int i = 0;i < row;i++) {
            for(int j = 0;j < col;j++) {
                float x = (*this)[i][j];
                float y = rhs[i][j];
                float d = x - y;
                if(d < 0) d = -d;
                if(d > 1E-7) {
                    cerr << "[" << i << "," << j << "] delta=" << d << " x=" << x << " y=" << y << endl;
                    exit(-1);
                }
            }
        }
    }

    size_t max_idx() const {
        assert(row == 1);
        size_t ret = 0;
        for(int i = 1;i < col;i++) {
            if((*this)[0][i] > (*this)[0][ret]) {
                ret = i;
            }
        }
        return ret;
    }

    float sum() const {
        float ret = 0;
        for(int i = 0;i < row;i++) {
            for(int j = 0;j < col;j++) {
                ret += (*this)[i][j];
            }
        }
        return ret;
    }

    // out = in * this^T, batched: shapes are (out x in), (b x in) -> (b x out)
    void mul_to(const Matrix& in, Matrix& out) const {
        // out x in
        // b x in
        // b x out
        assert(row == out.col && col == in.col && in.row == out.row);
        for(int b = 0;b < in.row;b++) {
            for(int i = 0;i < row;i++) {
                out[b][i] = 0;
                for(int j = 0;j < col;j++) {
                    out[b][i] += (*this)[i][j] * in[b][j];
                }
            }
        }
    }

    Matrix t() const {
        Matrix ret(col, row);
        for(int i = 0;i < row;i++) {
            for(int j = 0;j < col;j++) {
                ret[j][i] = (*this)[i][j];
            }
        }
        return ret;
    }

    // same as mul_to but with this matrix transposed
    void t_and_mul_to(const Matrix& in, Matrix& out) const {
        // out x in
        // b x out
        // b x in
        assert(row == in.col && col == out.col && in.row == out.row);
        for(int b = 0;b < in.row;b++) {
            for(int j = 0;j < col;j++) {
                out[b][j] = 0;
                for(int i = 0;i < row;i++) {
                    out[b][j] += (*this)[i][j] * in[b][i];
                }
            }
        }
    }

    // broadcast this 1-row matrix over the batch and add
    void cast_add_to(Matrix& out) const {
        // 1 x out
        // b x out
        assert(row == 1 && col == out.col);
        for(int b = 0;b < out.row;b++) {
            for(int j = 0;j < col;j++) {
                out[b][j] += (*this)[0][j];
            }
        }
    }

    // bias gradient step: this += f * sum_over_batch(delta)
    void grad(float f, const Matrix& delta) {
        // 1 x out
        // b x out
        assert(row == 1 && col == delta.col);
        for(int j = 0;j < col;j++) {
            float sum = 0;
            for(int b = 0;b < delta.row;b++) {
                sum += delta[b][j];
            }
            sum *= f;
            (*this)[0][j] += sum;
        }
    }

    // weight gradient step: this += f * delta^T * prev_a
    void grad(float f, const Matrix& delta, const Matrix& prev_a) {
        // out x in
        // b x out
        // b x in
        assert(row == delta.col && col == prev_a.col);
        for(int i = 0;i < row;i++) {
            for(int j = 0;j < col;j++) {
                float sum = 0;
                for(int b = 0;b < delta.row;b++) {
                    sum += delta[b][i] * prev_a[b][j];
                }
                sum *= f;
                (*this)[i][j] += sum;
            }
        }
    }

    Matrix to_cpu() const {
        Matrix ret(row, col);
        memcpy(ret.data, data, size * sizeof(float));
        return ret;
    }

#ifdef GPU
    Matrix<GPU> to_gpu() const;
#endif
};

ostream& operator<<(ostream& out, const Matrix<CPU>& v)
{
    out << "[(" << v.row << " x " << v.col << ") " << endl;
    for(int i = 0;i < v.row;i++) {
        out << "\t";
        for(int j = 0;j < v.col;j++) {
            if(j > 0) out << ",";
            out << v[i][j];
        }
        out << endl;
    }
    out << "]";
    return out;
}

#ifdef GPU
// a single device float that frees itself; used to fetch kernel results
class SmartGPUMem
{
private:
    float *m_gpu;

public:
    SmartGPUMem()
    {
        CK(cudaMalloc((void **)&m_gpu, sizeof *m_gpu));
        assert(m_gpu);
    }

    float *gpu() { return m_gpu; }
    float cpu()
    {
        float t;
        CK(cudaMemcpy(&t, m_gpu, sizeof t, cudaMemcpyDeviceToHost));
        return t;
    }

    ~SmartGPUMem()
    {
        CK(cudaFree(m_gpu));
    }
};

__global__ void k_sum(const float *data, size_t row, size_t col, float *out) {
    float t = 0;
    for(int i = 0;i < row;i++) {
        for(int j = 0;j < col;j++) {
            t += data[i * col + j];
        }
    }
    *out = t;
}

__global__ void k_max_idx(const float *data, size_t size, float *out) {
    int t = 0;
    for(int i = 1;i < size;i++) {
        if(data[i] > data[t]) {
            t = i;
        }
    }
    *out = t;
}

// one thread per (batch, output-row) element
__global__ void k_mul_to(const float *w, const float *in, float *out, size_t row, size_t col, size_t batch_size)
{
    size_t gidx = blockDim.x * blockIdx.x + threadIdx.x;
    int b = gidx / row;
    int i = gidx % row;

    if(b >= batch_size) return;

    float t = 0;
    for(int j = 0;j < col;j++) {
        t += w[i * col + j] * in[b * col + j];
    }
    out[b * row + i] = t;
}

__global__ void k_t_and_mul_to(const float *w, const float *in, float *out, size_t row, size_t col, size_t batch_size)
{
    size_t gidx = blockDim.x * blockIdx.x + threadIdx.x;
    int b = gidx / col;
    int j = gidx % col;

    if(b >= batch_size) return;

    float t = 0;
    for(int i = 0;i < row;i++) {
        t += w[i * col + j] * in[b * row + i];
    }
    out[b * col + j] = t;
}

__global__ void k_cast_add_to(const float *bx, float *out, size_t col, size_t batch_size)
{
    size_t gidx = blockDim.x * blockIdx.x + threadIdx.x;
    int b = gidx / col;
    int j = gidx % col;

    if(b >= batch_size) return;
    out[b * col + j] += bx[j];
}

__global__ void k_grad(float *b, float f, const float *delta, size_t col, size_t batch_size)
{
    size_t j = blockDim.x * blockIdx.x + threadIdx.x;
    if(j >= col) return;

    float sum = 0;
    for(int k = 0;k < batch_size;k++) {
        sum += delta[k * col + j];
    }
    sum *= f;
    b[j] += sum;
}

__global__ void k_grad(float *w, float f, const float *delta, const float *prev_a, size_t row, size_t col, size_t batch_size)
{
    size_t gidx = blockDim.x * blockIdx.x + threadIdx.x;
    size_t i = gidx / col;
    size_t j = gidx % col;
    if(i >= row) return;

    float sum = 0;
    for(int b = 0;b < batch_size;b++) {
        sum += delta[b * row + i] * prev_a[b * col + j];
    }
    sum *= f;
    w[i * col + j] += sum;
}

template<>
struct Matrix<GPU>
{
    size_t row, col, size;
    float *data;

    Matrix(size_t _row, size_t _col) :
        row(_row), col(_col), size(_row * _col)
        {
            assert(row > 0 && col > 0);
            CK(cudaMalloc((void **)&data, size * sizeof(float)));
            assert(data);

            float *tmp = (float*)malloc(size * sizeof(float));
            assert(tmp);
            memset(tmp, 0, size * sizeof(float));
            CK(cudaMemcpy(data, tmp, size * sizeof(float), cudaMemcpyHostToDevice));
            ::free(tmp);
        }

    Matrix(size_t _row, size_t _col, float *_data) :
        row(_row), col(_col), size(_row * _col), data(_data)
        {
            assert(row > 0 && col > 0 && data);
        }

    Matrix splice(size_t from, size_t to)
    {
        assert(from < to && to <= row);
        size_t offset = from * col;

        Matrix ret(to - from, col, data + offset);
        return ret;
    }

    string shape() const {
        char buf[100];
        sprintf(buf, "%d x %d", (int)row, (int)col);
        return buf;
    }

    void free()
    {
        assert(data);
        CK(cudaFree(data));
        data = NULL;
    }

    void init(float v)
    {
        float *tmp = (float*)malloc(size * sizeof(float));
        assert(tmp);
        for(int i = 0;i < size;i++) {
            tmp[i] = v;
        }
        CK(cudaMemcpy(data, tmp, size * sizeof(float), cudaMemcpyHostToDevice));
        ::free(tmp);
    }

    void init_gauss()
    {
        float *tmp = (float*)malloc(size * sizeof(float));
        assert(tmp);
        for(int i = 0;i < size;i++) {
            tmp[i] = gaussrand_f();
        }
        CK(cudaMemcpy(data, tmp, size * sizeof(float), cudaMemcpyHostToDevice));
        ::free(tmp);
    }

    __device__ float *operator[](size_t i) {
        assert(i < row);
        return data + i * col;
    }

    __device__ const float *operator[] (size_t i) const {
        assert(i < row);
        return data + i * col;
    }

    size_t max_idx() const {
        assert(row == 1);
        SmartGPUMem mem;
        k_max_idx<<<1,1>>>(data, col, mem.gpu());
        return mem.cpu();
    }

    float sum() const {
        SmartGPUMem mem;
        k_sum<<<1,1>>>(data, row, col, mem.gpu());
        return mem.cpu();
    }

    void mul_to(const Matrix& in, Matrix& out) const {
        // out x in
        // b x in
        // b x out
        assert(row == out.col && col == in.col && in.row == out.row);
        size_t thread_cnt = get_thread_cnt(in.row * row);
        size_t block_cnt = ceil(in.row * row, thread_cnt);

        k_mul_to<<<block_cnt, thread_cnt>>>(data, in.data, out.data, row, col, in.row);
        // (reference implementation: the loop in Matrix<CPU>::mul_to)
    }

    void t_and_mul_to(const Matrix& in, Matrix& out) const {
        // out x in
        // b x out
        // b x in
        assert(row == in.col && col == out.col && in.row == out.row);
        size_t thread_cnt = get_thread_cnt(in.row * col);
        size_t block_cnt = ceil(in.row * col, thread_cnt);

        k_t_and_mul_to<<<block_cnt, thread_cnt>>>(data, in.data, out.data, row, col, in.row);
        // (reference implementation: the loop in Matrix<CPU>::t_and_mul_to)
    }

    void cast_add_to(Matrix& out) const {
        // 1 x out
        // b x out
        assert(row == 1 && col == out.col);
        size_t thread_cnt = get_thread_cnt(out.row * col);
        size_t block_cnt = ceil(out.row * col, thread_cnt);

        k_cast_add_to<<<block_cnt, thread_cnt>>>(data, out.data, col, out.row);
        // (reference implementation: the loop in Matrix<CPU>::cast_add_to)
    }

    void grad(float f, const Matrix& delta) {
        // 1 x out
        // b x out
        assert(row == 1 && col == delta.col);
        size_t thread_cnt = get_thread_cnt(col);
        size_t block_cnt = ceil(col, thread_cnt);

        k_grad<<<block_cnt, thread_cnt>>>(data, f, delta.data, col, delta.row);
        // (reference implementation: the loop in Matrix<CPU>::grad)
    }

    void grad(float f, const Matrix& delta, const Matrix& prev_a) {
        // out x in
        // b x out
        // b x in
        assert(row == delta.col && col == prev_a.col);
        size_t thread_cnt = get_thread_cnt(row * col);
        size_t block_cnt = ceil(row * col, thread_cnt);

        k_grad<<<block_cnt, thread_cnt>>>(data, f, delta.data, prev_a.data, row, col, delta.row);
        // (reference implementation: the two-argument Matrix<CPU>::grad)
    }

    Matrix<CPU> to_cpu() const {
        Matrix<CPU> ret(row, col);
        CK(cudaMemcpy(ret.data, data, size * sizeof(float), cudaMemcpyDeviceToHost));
        return ret;
    }
};

Matrix<GPU> Matrix<CPU>::to_gpu() const {
    Matrix<GPU> ret(row, col);
    CK(cudaMemcpy(ret.data, data, size * sizeof(float), cudaMemcpyHostToDevice));
    return ret;
}
#endif

template<size_t IS_GPU>
struct Softmax{};

// cross-entropy of softmax(x) against target distribution y
__host__ __device__ float softmax_calc(const float *x, const float *y, size_t size)
{
    /*
      log( exp(x_j) / \sum exp(x_k) )
    = x_j - log \sum exp(x_k)
    = x_j - (max + log \sum exp(x_k - max))
    */
    float maxX = x[0];
    for(int i = 0;i < size;i++) {
        if(x[i] > maxX) {
            maxX = x[i];
        }
    }

    float xSum = 0;
    for(int i = 0;i < size;i++) {
        xSum += expf(x[i] - maxX);
    }

    float ret = 0;
    for(int i = 0;i < size;i++) {
        ret += y[i] * (x[i] - (maxX + logf(xSum)));
    }

    return -ret;
}

__host__ __device__ void softmax_propagate_delta(const float *x, const float *y, float *z, size_t size, float f)
{
    /*
      - d \sum y_j * log( exp(x_j) / \sum exp(x_k) )
    = - d \sum y_j * x_j - d \sum y_j log (\sum exp(x_k) )
    = - y_i + \sum (y_j * exp(x_i) / \sum exp(x_k))
    = - y_i + exp(x_i) (\sum y_j) / (\sum exp(x_k))
    */

    float maxX = x[0];
    for(int i = 0;i < size;i++) {
        if(x[i] > maxX) {
            maxX = x[i];
        }
    }

    // y - exp(x) sum_of_y / sum_of_exp(x)
    float sumOfY = 0;
    float sumOfX = 0;
    for(int i = 0;i < size;i++) {
        z[i] = expf(x[i] - maxX);
        sumOfY += y[i];
        sumOfX += z[i];
    }

    float yx = sumOfY/sumOfX;
    for(int i = 0;i < size;i++) {
        z[i] = yx * z[i] - y[i];
    }

    for(int i = 0;i < size;i++) {
        z[i] *= f;
    }
}

template<>
struct Softmax<CPU>
{
    // - \sum y_j * log( exp(x_j) / \sum exp(x_k) )
    void calc(const Matrix<CPU> &x, const Matrix<CPU> &y, Matrix<CPU>& out) const
    {
        assert(x.row == y.row && x.col == y.col);
        assert(out.row == 1 && out.col == x.row);
        for(int i = 0;i < x.row;i++) {
            out[0][i] = softmax_calc(x[i], y[i], x.col);
        }
    }

    void propagate_delta(const Matrix<CPU> &x, const Matrix<CPU> &y, Matrix<CPU>& z) const
    {
        assert(x.row == y.row && x.col == y.col);
        assert(x.row == z.row && x.col == z.col);

        for(int i = 0;i < x.row;i++) {
            softmax_propagate_delta(x[i], y[i], z[i], x.col, 1.0/x.row);
        }
    }
};

#ifdef GPU
// one thread per batch row
__global__ void k_calc(const float *x, const float *y, float *out, size_t row, size_t col)
{
    size_t gidx = blockDim.x * blockIdx.x + threadIdx.x;
    if(gidx >= row) return;
    size_t i = gidx;

    out[i] = softmax_calc(x + i * col, y + i * col, col);
}

__global__ void k_propagate_delta(const float *x, const float *y, float *z, size_t row, size_t col, float f)
{
    size_t gidx = blockDim.x * blockIdx.x + threadIdx.x;
    if(gidx >= row) return;
    size_t i = gidx;

    softmax_propagate_delta(x + col * i, y + col * i, z + col * i, col, f);
}

template<>
struct Softmax<GPU>
{
    void calc(const Matrix<GPU> &x, const Matrix<GPU> &y, Matrix<GPU>& out) const
    {
        assert(x.row == y.row && x.col == y.col);
        assert(out.row == 1 && out.col == x.row);

        size_t thread_cnt = get_thread_cnt(x.row);
        size_t block_cnt = ceil(x.row, thread_cnt);
        k_calc<<<block_cnt, thread_cnt>>>(x.data, y.data, out.data, x.row, x.col);
        // (reference implementation: Softmax<CPU>::calc)
    }

    void propagate_delta(const Matrix<GPU> &x, const Matrix<GPU> &y, Matrix<GPU>& z) const
    {
        assert(x.row == y.row && x.col == y.col);
        assert(x.row == z.row && x.col == z.col);

        size_t thread_cnt = get_thread_cnt(x.row);
        size_t block_cnt = ceil(x.row, thread_cnt);
        k_propagate_delta<<<block_cnt, thread_cnt>>>(x.data, y.data, z.data, x.row, x.col, 1.0/x.row);
        // (reference implementation: Softmax<CPU>::propagate_delta)
    }
};
#endif

template<size_t IS_GPU>
struct Relu{};

template<>
struct Relu<CPU>
{
    void forward(const Matrix<CPU>& x, Matrix<CPU> &y) const
    {
        assert(x.row == y.row && x.col == y.col);
        for(int i = 0;i < x.row;i++) {
            for(int j = 0;j < x.col;j++) {
                float t = x[i][j];
                y[i][j] = t > 0 ? t : 0;
            }
        }
    }

    // multiply y elementwise by relu'(x)
    void derive_and_dot_into(const Matrix<CPU>& x, Matrix<CPU> &y) const
    {
        assert(x.row == y.row && x.col == y.col);
        for(int i = 0;i < x.row;i++) {
            for(int j = 0;j < x.col;j++) {
                float t = x[i][j];
                y[i][j] *= t > 0 ? 1 : 0;
            }
        }
    }
};

#ifdef GPU
__global__ void k_forward(const float *x, float *y, size_t row, size_t col)
{
    size_t gidx = blockDim.x * blockIdx.x + threadIdx.x;
    int i = gidx / col;
    int j = gidx % col;
    if(i >= row) return;

    float t = x[i * col + j];
    y[i * col + j] = t > 0 ? t : 0;
}

__global__ void k_derive_and_dot_into(const float *x, float *y, size_t row, size_t col)
{
    size_t gidx = blockDim.x * blockIdx.x + threadIdx.x;
    int i = gidx / col;
    int j = gidx % col;
    if(i >= row) return;

    float t = x[i * col + j];
    y[i * col + j] *= t > 0 ? 1 : 0;
}

template<>
struct Relu<GPU>
{
    void forward(const Matrix<GPU>& x, Matrix<GPU> &y) const
    {
        assert(x.row == y.row && x.col == y.col);
        size_t thread_cnt = get_thread_cnt(x.row * x.col);
        size_t block_cnt = ceil(x.row * x.col, thread_cnt);
        k_forward<<<block_cnt, thread_cnt>>>(x.data, y.data, x.row, x.col);
        // (reference implementation: Relu<CPU>::forward)
    }

    void derive_and_dot_into(const Matrix<GPU>& x, Matrix<GPU> &y) const
    {
        assert(x.row == y.row && x.col == y.col);
        size_t thread_cnt = get_thread_cnt(x.row * x.col);
        size_t block_cnt = ceil(x.row * x.col, thread_cnt);
        k_derive_and_dot_into<<<block_cnt, thread_cnt>>>(x.data, y.data, x.row, x.col);
        // (reference implementation: Relu<CPU>::derive_and_dot_into)
    }
};
#endif

template<size_t IS_GPU>
struct Layer
{
    size_t in_size, out_size, batch_size;
    Matrix<IS_GPU> w, b;
    Relu<IS_GPU> relu;

    // z = w * in + b
    // a = relu(z)
    Matrix<IS_GPU> z, a, delta;

    Layer(size_t _in_size, size_t _out_size, size_t _batch_size=1) :
        in_size(_in_size), out_size(_out_size), batch_size(_batch_size),
        w(_out_size, _in_size), b(1, _out_size),
        z(_batch_size, _out_size), a(_batch_size, _out_size),
        delta(_batch_size, _out_size)
    {
        assert(in_size > 0);
        assert(out_size > 0);
        assert(batch_size > 0);
        w.init_gauss();
        b.init_gauss();
    }

    void reset_batch_size(size_t _batch_size)
    {
        assert(_batch_size > 0);
        if(_batch_size == batch_size) return;
        batch_size = _batch_size;

        z.free();
        a.free();
        delta.free();
        z = Matrix<IS_GPU>(batch_size, out_size);
        a = Matrix<IS_GPU>(batch_size, out_size);
        delta = Matrix<IS_GPU>(batch_size, out_size);
    }

    void calc(const Matrix<IS_GPU>& in)
    {
        assert(in.row == batch_size);
        assert(in.col == in_size);

        // out x in
        // batch x in
        // batch x out
        w.mul_to(in, z);

        // 1 x out
        // batch x out
        b.cast_add_to(z);

        // batch x out
        // batch x out
        relu.forward(z, a);
    }

    void propagate_delta(Matrix<IS_GPU>& out)
    {
        assert(out.row == batch_size);
        assert(out.col == in_size);

        // out x in
        // batch x out
        // batch x in
        w.t_and_mul_to(delta, out);
    }

    void update_parameters(float alpha, const Matrix<IS_GPU> &prev_a)
    {
        assert(prev_a.row == batch_size);
        assert(prev_a.col == in_size);

        // 1 x out
        // 1 x out
        b.grad(-alpha, delta);

        // out x in
        // batch x out
        // batch x in
        w.grad(-alpha, delta, prev_a);
    }
};

// big-endian 32-bit int, as used by the MNIST file headers
int MsbInt(char buf[])
{
    int base = 1;
    int ret = 0;
    for(int i = 3;i >= 0;i--) {
        ret += (unsigned char)buf[i] * base;
        base *= 256;
    }
    return ret;
}

vector<int> ReadMnistLabels(string fileName)
{
    vector<int> ret;
    ifstream ifs(fileName.c_str(), ios::binary);
    char buf[1000];

    // MAGIC
    ifs.read(buf, 4);
    int magic = MsbInt(buf);
    assert(magic == 0x00000801);

    // num of images
    ifs.read(buf, 4);
    int nImages = MsbInt(buf);

    while(nImages--) {
        ret.push_back(ifs.get());
    }

    return ret;
}

Matrix<CPU> ReadMnistData(const string& fileName)
{
    ifstream ifs(fileName.c_str(), ios::binary);
    char buf[1000];

    // MAGIC
    ifs.read(buf, 4);
    int magic = MsbInt(buf);
    assert(magic == 0x00000803);

    // num of images
    ifs.read(buf, 4);
    int nImages = MsbInt(buf);

    int row, col;
    ifs.read(buf, 4);
    row = MsbInt(buf);
    ifs.read(buf, 4);
    col = MsbInt(buf);
    assert(row == 28 && col == 28);

    size_t size = row * col;

    Matrix<CPU> ret(nImages, size);

    for(int k = 0;k < nImages;k++) {
        for(int i = 0;i < row * col;i++) {
            ret[k][i] = ifs.get() / 256.0; // normalize to [0, 1)
        }
    }

    return ret;
}

// one-hot encode the labels
Matrix<CPU> translateLabels(const vector<int>& labels, int k=10)
{
    Matrix<CPU> ret(labels.size(), k);

    for(int i = 0;i < labels.size();i++) {
        assert(labels[i] >= 0 && labels[i] < k);
        ret[i][labels[i]] = 1;
    }
    return ret;
}

template<size_t IS_GPU>
void reset_model_batch_size(vector<Layer<IS_GPU> >& model, size_t batch_size)
{
    for(int i = 0;i < model.size();i++) {
        model[i].reset_batch_size(batch_size);
    }
}

// runs the forward pass; returns per-example correctness, sets the average loss
template<size_t IS_GPU>
vector<bool> forward(
    vector<Layer<IS_GPU> >& model,
    const Matrix<IS_GPU>& in,
    const Matrix<IS_GPU>& label,
    const vector<int>& raw_label,
    const Softmax<IS_GPU>& s,
    float& error)
{
    size_t batch_size = in.row;
    assert(model[0].batch_size == batch_size);
    assert(label.row == batch_size);
    assert(raw_label.size() == batch_size);

    size_t nLayer = model.size();

    for(int j = 0;j < nLayer;j++) {
        Layer<IS_GPU> &layer = model[j];
        if(j == 0) {
            layer.calc(in);
        } else {
            layer.calc(model[j-1].a);
        }
    }

    Layer<IS_GPU> &lastLayer = model[nLayer - 1];
    Matrix<IS_GPU> error2(1, batch_size);
    s.calc(lastLayer.a, label, error2);
    error = error2.sum() / batch_size;
    error2.free();

    Matrix<CPU> cpu_a = lastLayer.a.to_cpu();
    vector<bool> ret(batch_size);
    for(int i = 0;i < batch_size;i++) {
        ret[i] = cpu_a.splice(i, i + 1).max_idx() == raw_label[i];
    }
    cpu_a.free();
    return ret;
}

// one epoch of mini-batch SGD, then evaluation on train and test sets
template<size_t IS_GPU>
void run(Matrix<IS_GPU> &train_data,
        Matrix<IS_GPU> &train_label,
        vector<int> &raw_train_label,

        Matrix<IS_GPU> &test_data,
        Matrix<IS_GPU> &test_label,
        vector<int> &raw_test_label,

        vector<Layer<IS_GPU> >& model,
        size_t batch_size
    )
{
    clock_t start = clock();

    size_t M = train_data.row;
    size_t T = test_data.row;

    Softmax<IS_GPU> s;
    float avg_error = 0;
    float error;
    for(int i = 0;i < M;i += batch_size) {
        size_t this_batch_size = std::min(batch_size, M - i);
        reset_model_batch_size(model, this_batch_size);
        Matrix<IS_GPU> this_batch_train_data = train_data.splice(i, i + this_batch_size);
        Matrix<IS_GPU> this_batch_train_label = train_label.splice(i, i + this_batch_size);
        vector<int> this_batch_raw_train_label(raw_train_label.begin() + i, raw_train_label.begin() + i + this_batch_size);

        /*
        cout << this_batch_train_data.shape() << endl;
        cout << this_batch_train_label.shape() << endl;
        cout << this_batch_raw_train_label.size() << endl;
        */

        forward<IS_GPU>(
            model,
            this_batch_train_data,
            this_batch_train_label,
            this_batch_raw_train_label,
            s,
            error
            );
        avg_error += error * this_batch_size;

        // backward
        for(int j = model.size() - 1;j >= 0;j--) {
            Layer<IS_GPU> &layer = model[j];
            if(j == model.size() - 1) {
                s.propagate_delta(layer.a, this_batch_train_label, layer.delta);
            } else {
                model[j + 1].propagate_delta(layer.delta);
            }
            layer.relu.derive_and_dot_into(layer.a, layer.delta);
        }

        for(int j = 0;j < model.size();j++) {
            model[j].update_parameters(0.001, j == 0 ? this_batch_train_data : model[j-1].a);
        }
    }
    avg_error /= M;

    clock_t mid = clock();
    cout << "\ttime=" << ((mid-start)*1.0/CLOCKS_PER_SEC) << " error=" << avg_error << endl;

    // evaluation
    reset_model_batch_size(model, M);
    vector<bool> is_good = forward(model,
        train_data.splice(0, M),
        train_label.splice(0, M),
        vector<int>(raw_train_label.begin(), raw_train_label.begin() + M),
        s, error);
    size_t total = 0, good = 0;
    for(int i = 0;i < M;i++) {
        if(is_good[i]) good++;
        total++;
    }
    cout << "\ttrain_accuracy=" << (good*1.0/total) << " ";

    total = good = 0;
    reset_model_batch_size(model, T);
    is_good = forward(model,
        test_data.splice(0, T),
        test_label.splice(0, T),
        vector<int>(raw_test_label.begin(), raw_test_label.begin() + T),
        s, error);
    for(int i = 0;i < T;i++) {
        if(is_good[i]) good++;
        total++;
    }
    cout << "test_accuracy=" << (good*1.0/total) << " ";

    clock_t end = clock();
    cout << "total_time=" << (end-start)*1.0/CLOCKS_PER_SEC << endl;
}

void test()
{
    /*
    0 1
    1 2
    2 3
    */
    Matrix<CPU> a(3, 2);
    for(int i = 0;i < a.row;i++) {
        for(int j = 0;j < a.col;j++) {
            a[i][j] = i + j;
        }
    }
    // Matrix splice(size_t from, size_t to)
    Matrix<CPU> t = a.splice(1, 2);
    assert(t.row == 1 && t.col == 2 && t[0][0] == 1);

    // string shape() const
    assert(a.shape() == "3 x 2");

    // void free()
    t = Matrix<CPU>(1, 1);
    t.free();
    assert(!t.data);

    // void init(float v)
    t = Matrix<CPU>(2, 2);
    t.init(2);
    assert(t[1][1] == 2);

    // void init_gauss()
    t = Matrix<CPU>(2, 2);
    assert(t[1][1] == 0);
    t.init_gauss();
    assert(t[1][1] != 0);

    // void assert_eq(const Matrix& rhs)
    t = Matrix<CPU>(2, 2);
    Matrix<CPU> t2(2, 2);
    t.assert_eq(t2);

    // size_t max_idx() const
    assert(a.splice(0, 1).max_idx() == 1);

    // float sum() const
    assert(a.splice(1, 2).sum() == 3);

    // void mul_to(const Matrix& in, Matrix& out) const
    /*
    0 1
    1 2
    2 3

     x

    1 2
    3 4

     =

    2 5 8
    4 11 18
    */
    Matrix<CPU> b(2, 2);
    b[0][0] = 1; b[0][1] = 2;
    b[1][0] = 3; b[1][1] = 4;
    Matrix<CPU> out(2, 3);
    a.mul_to(b, out);
    assert(out[0][0] == 2 && out[0][1] == 5 && out[0][2] == 8);
    assert(out[1][0] == 4 && out[1][1] == 11 && out[1][2] == 18);

    // void t_and_mul_to(const Matrix& in, Matrix& out) const
    /*
    0 1
    1 2 -> 0 1 2
    2 3    1 2 3

     x

    1 2 3
    4 5 6

     =

    8 14
    17 32
    */
    b = Matrix<CPU>(2, 3);
    b[0][0] = 1; b[0][1] = 2; b[0][2] = 3;
    b[1][0] = 4; b[1][1] = 5; b[1][2] = 6;
    out = Matrix<CPU>(2, 2);
    a.t_and_mul_to(b, out);
    assert(out[0][0] == 8 && out[0][1] == 14);
    assert(out[1][0] == 17 && out[1][1] == 32);

    // void cast_add_to(Matrix& out) const
    /*
    1 2
    +
    1 2
    3 4
    =
    2 4
    4 6
    */
    b = Matrix<CPU>(1, 2);
    b[0][0] = 1; b[0][1] = 2;
    out = Matrix<CPU>(2, 2);
    out[0][0] = 1; out[0][1] = 2;
    out[1][0] = 3; out[1][1] = 4;
    b.cast_add_to(out);
    assert(out[0][0] == 2 && out[0][1] == 4);
    assert(out[1][0] == 4 && out[1][1] == 6);

    // void grad(float f, const Matrix& delta)
    /*
    1 2
    grad with f = -0.5
    1 2
    3 4
    =
    -1 -1
    */
    b = Matrix<CPU>(1, 2);
    b[0][0] = 1; b[0][1] = 2;
    out = Matrix<CPU>(2, 2);
    out[0][0] = 1; out[0][1] = 2;
    out[1][0] = 3; out[1][1] = 4;
    b.grad(-0.5, out);
    assert(b[0][0] == -1 && b[0][1] == -1);

    // void grad(float f, const Matrix& delta, const Matrix& prev_a)
    /*
    0 1
    1 2
    2 3

    grad with f = -0.5

    delta=
    1 2 3 -> 1
    4 5 6    2
             3

    prev_a=
    7 8 -> 7
    9 10   8

    =

    [0][1] = 1 - 0.5 * (1 * 8 + 4 * 10) = 1 - 0.5 * 48 = 1 - 24 = -23
    */
    Matrix<CPU> w(3, 2);
    w[0][0] = 0; w[0][1] = 1;
    w[1][0] = 1; w[1][1] = 2;
    w[2][0] = 2; w[2][1] = 3;

    Matrix<CPU> delta(2, 3);
    delta[0][0] = 1; delta[0][1] = 2; delta[0][2] = 3;
    delta[1][0] = 4; delta[1][1] = 5; delta[1][2] = 6;

    Matrix<CPU> prev_a(2, 2);
    prev_a[0][0] = 7; prev_a[0][1] = 8;
    prev_a[1][0] = 9; prev_a[1][1] = 10;

    w.grad(-0.5, delta, prev_a);
    assert(w[0][1] == -23);

    // Matrix to_cpu() const
    t = a.to_cpu();
    t.assert_eq(a);
}

int main()
{
    cout << "Testing" << endl;
    test();
    //return 0;

    cout << "Loading data" << endl;
    // load the MNIST files
    vector<int> raw_train_label = ReadMnistLabels("mnist/train-labels-idx1-ubyte");
    //assert(raw_train_label.size() == 60000);
    Matrix<CPU> cpu_train_data = ReadMnistData("mnist/train-images-idx3-ubyte");
    //assert(cpu_train_data.row == 60000 && cpu_train_data.col == 28 * 28);
    Matrix<CPU> cpu_train_label = translateLabels(raw_train_label);
    //assert(cpu_train_label.row == 60000 && cpu_train_label.col == 10);
#ifdef GPU
    Matrix<GPU> gpu_train_data(cpu_train_data.to_gpu());
    Matrix<GPU> gpu_train_label(cpu_train_label.to_gpu());
#endif

    vector<int> raw_test_label = ReadMnistLabels("mnist/t10k-labels-idx1-ubyte");
    //assert(raw_test_label.size() == 10000);
    Matrix<CPU> cpu_test_data = ReadMnistData("mnist/t10k-images-idx3-ubyte");
    //assert(cpu_test_data.row == 10000 && cpu_test_data.col == 28 * 28);
    Matrix<CPU> cpu_test_label = translateLabels(raw_test_label);
    //assert(cpu_test_label.row == 10000 && cpu_test_label.col == 10);
#ifdef GPU
    Matrix<GPU> gpu_test_data(cpu_test_data.to_gpu());
    Matrix<GPU> gpu_test_label(cpu_test_label.to_gpu());
#endif

    size_t n_input = cpu_train_data.col;
    size_t n_output = 10;
    size_t n_mid = 100;
    size_t batch_size = 256;

    cout << "Now run" << endl;
    srand(1000);
    vector<Layer<CPU> > cpu_model;
    cpu_model.push_back(Layer<CPU>(n_input, n_mid));
    cpu_model.push_back(Layer<CPU>(n_mid, n_output));
#ifdef GPU
    srand(1000);
    vector<Layer<GPU> > gpu_model;
    gpu_model.push_back(Layer<GPU>(n_input, n_mid));
    gpu_model.push_back(Layer<GPU>(n_mid, n_output));
#endif

    for(int i = 0;i < 5;i++) {
        cout << "cpu-" << (i+1) << endl;
        run<CPU>(cpu_train_data, cpu_train_label, raw_train_label, cpu_test_data, cpu_test_label, raw_test_label, cpu_model, batch_size);
#ifdef GPU
        cout << "gpu-" << (i+1) << endl;
        run<GPU>(gpu_train_data, gpu_train_label, raw_train_label, gpu_test_data, gpu_test_label, raw_test_label, gpu_model, batch_size);
#endif
    }

    cout << "Done" << endl;
    return 0;
}
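A build note, as an assumption rather than something from the original post: I'd expect this to compile with something like nvcc -O2 -o mnist mnist.cu (file name and flags are mine), with the four MNIST files unpacked under mnist/ exactly as named in ReadMnistLabels and ReadMnistData. As far as I can tell, removing the "#define GPU 1" line should also let the CPU-only path build with a plain C++ compiler, since all GPU code is guarded by #ifdef GPU.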

 
