Running MNIST on CUDA, accelerated
The CUDA version I wrote earlier was, surprisingly, even slower than the CPU.
After quite a few more attempts, I came away with two lessons:
1. A CUDA kernel does not need to be huge. When I wrote one giant kernel, the block and thread counts became hard to tune (the earlier version never even made use of blocks), and managing the data inside it got very complicated. Splitting the work into fine-grained kernels does not seem to cost much.
2. Training has to be done in batches; the most important effect is that batching greatly increases the available data parallelism. (A minimal sketch of this kernel-plus-launch style follows this list.)
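Here is a minimal sketch of what the two lessons above look like in practice: a fine-grained kernel that handles a single (sample, column) element, launched over the whole batch with the block count derived from the total element count. This is illustrative only; the names k_add_bias and launch_add_bias are made up for this sketch and are not the ones used in the full listing further down.

// One thread per (sample, column) element of a batch_size x col matrix.
#include <cuda_runtime.h>
#include <cstddef>

__global__ void k_add_bias(const float *bias, float *out, size_t col, size_t batch_size)
{
    size_t gidx = blockDim.x * blockIdx.x + threadIdx.x;
    size_t b = gidx / col;   // which sample in the batch
    size_t j = gidx % col;   // which output column
    if (b >= batch_size) return;
    out[b * col + j] += bias[j];
}

// Host-side launch: cap the threads per block at 1024 and derive the block
// count from the total element count, so a larger batch simply uses more blocks.
void launch_add_bias(const float *d_bias, float *d_out, size_t col, size_t batch_size)
{
    size_t total = batch_size * col;
    size_t threads = total < 1024 ? total : 1024;
    size_t blocks = (total + threads - 1) / threads;
    k_add_bias<<<blocks, threads>>>(d_bias, d_out, col, batch_size);
}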
With the code below, the GPU version reaches close to a 20x speedup over the CPU version, and the code is not complicated.
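The listing is a single translation unit. Assuming it is saved as mnist.cu and the MNIST files are unpacked into a mnist/ subdirectory next to the binary (the paths hard-coded in main), a build along the following lines should work; the file name and flags here are my assumption, not part of the original post:

nvcc -O2 -o mnist mnist.cu
./mnist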
#include <iostream>
#include <cstdlib>
#include <cassert>
#include <string>
#include <cstring>
#include <fstream>
#include <vector>
#include <memory>
#include <cstdlib>
#include <cmath>
#include <ctime>
using namespace std;

#define CPU 0
#define GPU 1

#ifndef GPU
#define __host__
#define __device__
#define expf exp
#define logf log
#endif

#ifdef GPU
#include <cuda_runtime.h>
#include <math_functions.h>
void CheckCudaReturnCode(cudaError_t code, const char *fileName, int lineNo)
{
    if(code == cudaSuccess) return;
    cerr << "Cuda call failed at " << fileName << ":" << lineNo
        << " " << cudaGetErrorString(code) << endl;
    exit(-1);
}
#define CK(x) CheckCudaReturnCode((x), __FILE__, __LINE__)


inline size_t get_thread_cnt(size_t size)
{
    size = min(size, 1024UL);

    // round up to a multiple of 32
    size += 31;
    size &= ~31;
    return size;
}

inline size_t ceil(size_t x, size_t y)
{
    return (x + y - 1) / y;
}
#endif

// http://www.cnblogs.com/yeahgis/archive/2012/07/13/2590485.html
// Gaussian random numbers, mean 0, variance 1
double gaussrand()
{
    static double V1, V2, S;
    static int phase = 0;
    double X;

    if ( phase == 0 ) {
        do {
            double U1 = (double)rand() / RAND_MAX;
            double U2 = (double)rand() / RAND_MAX;

            V1 = 2 * U1 - 1;
            V2 = 2 * U2 - 1;
            S = V1 * V1 + V2 * V2;
        } while(S >= 1 || S == 0);

        X = V1 * sqrt(-2 * log(S) / S);
    } else
        X = V2 * sqrt(-2 * log(S) / S);

    phase = 1 - phase;

    return X;
}

float gaussrand_f()
{
    return float(gaussrand() * 0.01);
}

template<size_t IS_GPU>
struct Matrix {};

template<>
struct Matrix<CPU>
{
    size_t row, col, size;
    float *data;

    Matrix(size_t _row, size_t _col) :
        row(_row), col(_col), size(_row * _col)
    {
        assert(row > 0 && col > 0);
        data = (float *)malloc(size * sizeof(float));
        assert(data);
        memset(data, 0, row * col * sizeof(float));
    }

    Matrix(size_t _row, size_t _col, float *_data) :
        row(_row), col(_col), size(_row * _col), data(_data)
    {
        assert(row > 0 && col > 0 && data);
    }

    Matrix splice(size_t from, size_t to)
    {
        assert(from < to && to <= row);
        size_t offset = from * col;

        Matrix<CPU> ret(to - from, col, data + offset);
        return ret;
    }

    string shape() const {
        char buf[100];
        sprintf(buf, "%d x %d", (int)row, (int)col);
        return buf;
    }

    void free()
    {
        assert(data);
        ::free(data);
        data = NULL;
    }

    void init(float v)
    {
        for(int i = 0;i < row;i++) {
            for(int j = 0;j < col;j++) {
                (*this)[i][j] = v;
            }
        }
    }

    void init_gauss()
    {
        for(int i = 0;i < row;i++) {
            for(int j = 0;j < col;j++) {
                (*this)[i][j] = gaussrand_f();
            }
        }
    }

    float *operator[](size_t i) {
        assert(i < row);
        return data + i * col;
    }

    const float *operator[] (size_t i) const {
        assert(i < row);
        return data + i * col;
    }

    void assert_eq(const Matrix& rhs)
    {
        assert(row == rhs.row);
        assert(col == rhs.col);
        for(int i = 0;i < row;i++) {
            for(int j = 0;j < col;j++) {
                float x = (*this)[i][j];
                float y = rhs[i][j];
                float d = x - y;
                if(d < 0) d = -d;
                if(d > 1E-7) {
                    cerr << "[" << i << "," << j << "] delta=" << d << " x=" << x << " y=" << y << endl;
                    exit(-1);
                }
            }
        }
    }

    size_t max_idx() const {
        assert(row == 1);
        size_t ret = 0;
        for(int i = 1;i < col;i++) {
            if((*this)[0][i] > (*this)[0][ret]) {
                ret = i;
            }
        }
        return ret;
    }

    float sum() const {
        float ret = 0;
        for(int i = 0;i < row;i++) {
            for(int j = 0;j < col;j++) {
                ret += (*this)[i][j];
            }
        }
        return ret;
    }

    void mul_to(const Matrix& in, Matrix& out) const {
        // out x in
        // b x in
        // b x out
        assert(row == out.col && col == in.col && in.row == out.row);
        for(int b = 0;b < in.row;b++) {
            for(int i = 0;i < row;i++) {
                out[b][i] = 0;
                for(int j = 0;j < col;j++) {
                    out[b][i] += (*this)[i][j] * in[b][j];
                }
            }
        }
    }

    Matrix t() const {
        Matrix ret(col, row);
        for(int i = 0;i < row;i++) {
            for(int j = 0;j < col;j++) {
                ret[j][i] = (*this)[i][j];
            }
        }
        return ret;
    }

    void t_and_mul_to(const Matrix& in, Matrix& out) const {
        // out x in
        // b x out
        // b x in
        assert(row == in.col && col == out.col && in.row == out.row);
        for(int b = 0;b < in.row;b++) {
            for(int j = 0;j < col;j++) {
                out[b][j] = 0;
                for(int i = 0;i < row;i++) {
                    out[b][j] += (*this)[i][j] * in[b][i];
                }
            }
        }
    }

    void cast_add_to(Matrix& out) const {
        // 1 x out
        // b x out
        assert(row == 1 && col == out.col);
        for(int b = 0;b < out.row;b++) {
            for(int j = 0;j < col;j++) {
                out[b][j] += (*this)[0][j];
            }
        }
    }

    void grad(float f, Matrix& delta) {
        // 1 x out
        // b x out
        assert(row == 1 && col == delta.col);
        for(int j = 0;j < col;j++) {
            float sum = 0;
            for(int b = 0;b < delta.row;b++) {
                sum += delta[b][j];
            }
            sum *= f;
            (*this)[0][j] += sum;
        }
    }

    void grad(float f, const Matrix& delta, const Matrix& prev_a) {
        // out x in
        // b x out
        // b x in
        assert(row == delta.col && col == prev_a.col);
        for(int i = 0;i < row;i++) {
            for(int j = 0;j < col;j++) {
                float sum = 0;
                for(int b = 0;b < delta.row;b++) {
                    sum += delta[b][i] * prev_a[b][j];
                }
                sum *= f;
                (*this)[i][j] += sum;
            }
        }
    }

    Matrix to_cpu() const {
        Matrix ret(row, col);
        memcpy(ret.data, data, size * sizeof(float));
        return ret;
    }

#ifdef GPU
    Matrix<GPU> to_gpu() const;
#endif
};

ostream& operator<<(ostream& out, const Matrix<CPU>& v)
{
    out << "[(" << v.row << " x " << v.col << ") " << endl;
    for(int i = 0;i < v.row;i++) {
        out << "\t";
        for(int j = 0;j < v.col;j++) {
            if(j > 0) out << ",";
            out << v[i][j];
        }
        out << endl;
    }
    out << "]";
    return out;
}

#ifdef GPU
class SmartGPUMem
{
private:
    float *m_gpu;

public:
    SmartGPUMem()
    {
        CK(cudaMalloc((void **)&m_gpu, sizeof *m_gpu));
        assert(m_gpu);
    }

    float *gpu() { return m_gpu; }
    float cpu()
    {
        float t;
        CK(cudaMemcpy(&t, m_gpu, sizeof t, cudaMemcpyDeviceToHost));
        return t;
    }

    ~SmartGPUMem()
    {
        CK(cudaFree(m_gpu));
    }
};

__global__ void k_sum(const float *data, size_t row, size_t col, float *out) {
    float t = 0;
    for(int i = 0;i < row;i++) {
        for(int j = 0;j < col;j++) {
            t += data[i * col + j];
        }
    }
    *out = t;
}

__global__ void k_max_idx(const float *data, size_t size, float *out) {
    int t = 0;
    for(int i = 1;i < size;i++) {
        if(data[i] > data[t]) {
            t = i;
        }
    }
    *out = t;
}

__global__ void k_mul_to(const float * w, const float *in, float *out, size_t row, size_t col, size_t batch_size)
{
    size_t gidx = blockDim.x * blockIdx.x + threadIdx.x;
    int b = gidx / row;
    int i = gidx % row;

    if(b >= batch_size) return;

    float t = 0;
    for(int j = 0;j < col;j++) {
        t += w[i * col + j] * in[b * col + j];
    }
    out[b * row + i] = t;
}

__global__ void k_t_and_mul_to(const float *w, const float *in, float *out, size_t row, size_t col, size_t batch_size)
{
    size_t gidx = blockDim.x * blockIdx.x + threadIdx.x;
    int b = gidx / col;
    int j = gidx % col;

    if(b >= batch_size) return;

    float t = 0;
    for(int i = 0;i < row;i++) {
        t += w[i * col + j] * in[b * row + i];
    }
    out[b * col + j] = t;
}

__global__ void k_cast_add_to(const float* bx, float *out, size_t col, size_t batch_size)
{
    size_t gidx = blockDim.x * blockIdx.x + threadIdx.x;
    int b = gidx / col;
    int j = gidx % col;

    if(b >= batch_size) return;
    out[b * col + j] += bx[j];
}

__global__ void k_grad(float *b, float f, const float *delta, size_t col, size_t batch_size)
{
    size_t j = blockDim.x * blockIdx.x + threadIdx.x;
    if(j >= col) return;

    float sum = 0;
    for(int b = 0;b < batch_size;b++) {
        sum += delta[b * col + j];
    }
    sum *= f;
    b[j] += sum;
}

__global__ void k_grad(float *w, float f, const float *delta, const float *prev_a, size_t row, size_t col, size_t batch_size)
{
    size_t gidx = blockDim.x * blockIdx.x + threadIdx.x;
    size_t i = gidx / col;
    size_t j = gidx % col;
    if(i >= row) return;

    float sum = 0;
    for(int b = 0;b < batch_size;b++) {
        sum += delta[b * row + i] * prev_a[b * col + j];
    }
    sum *= f;
    w[i * col + j] += sum;
}

template<>
struct Matrix<GPU>
{
    size_t row, col, size;
    float *data;

    Matrix(size_t _row, size_t _col) :
        row(_row), col(_col), size(_row * _col)
    {
        assert(row > 0 && col > 0);
        CK(cudaMalloc((void **)&data, size * sizeof(float)));
        assert(data);

        float *tmp = (float*)malloc(size * sizeof(float));
        assert(tmp);
        memset(tmp, 0, size * sizeof(float));
        CK(cudaMemcpy(data, tmp, size * sizeof(float), cudaMemcpyHostToDevice));
        ::free(tmp);
    }

    Matrix(size_t _row, size_t _col, float *_data) :
        row(_row), col(_col), size(_row * _col), data(_data)
    {
        assert(row > 0 && col > 0 && data);
    }

    Matrix splice(size_t from, size_t to)
    {
        assert(from < to && to <= row);
        size_t offset = from * col;

        Matrix ret(to - from, col, data + offset);
        return ret;
    }

    string shape() const {
        char buf[100];
        sprintf(buf, "%d x %d", (int)row, (int)col);
        return buf;
    }

    void free()
    {
        assert(data);
        CK(cudaFree(data));
        data = NULL;
    }

    void init(float v)
    {
        float *tmp = (float*)malloc(size * sizeof(float));
        assert(tmp);
        for(int i = 0;i < size;i++) {
            tmp[i] = v;
        }
        CK(cudaMemcpy(data, tmp, size * sizeof(float), cudaMemcpyHostToDevice));
        ::free(tmp);
    }

    void init_gauss()
    {
        float *tmp = (float*)malloc(size * sizeof(float));
        assert(tmp);
        for(int i = 0;i < size;i++) {
            tmp[i] = gaussrand_f();
        }
        CK(cudaMemcpy(data, tmp, size * sizeof(float), cudaMemcpyHostToDevice));
        ::free(tmp);
    }

    __device__ float *operator[](size_t i) {
        assert(i < row);
        return data + i * col;
    }

    __device__ const float *operator[] (size_t i) const {
        assert(i < row);
        return data + i * col;
    }

    size_t max_idx() const {
        assert(row == 1);
        SmartGPUMem mem;
        k_max_idx<<<1,1>>>(data, col, mem.gpu());
        return mem.cpu();
    }

    float sum() const {
        SmartGPUMem mem;
        k_sum<<<1,1>>>(data, row, col, mem.gpu());
        return mem.cpu();
    }

    void mul_to(const Matrix& in, Matrix& out) const {
        // out x in
        // b x in
        // b x out
        assert(row == out.col && col == in.col && in.row == out.row);
        size_t thread_cnt = get_thread_cnt(in.row * row);
        size_t block_cnt = ceil(in.row * row, thread_cnt);

        k_mul_to<<<block_cnt, thread_cnt>>>(data, in.data, out.data, row, col, in.row);

        /*
        for(int b = 0;b < in.row;b++) {
            for(int i = 0;i < row;i++) {
                out[b][i] = 0;
                for(int j = 0;j < col;j++) {
                    out[b][i] += (*this)[i][j] * in[b][j];
                }
            }
        }
        */
    }


    void t_and_mul_to(const Matrix& in, Matrix& out) const {
        // out x in
        // b x out
        // b x in
        assert(row == in.col && col == out.col && in.row == out.row);
        size_t thread_cnt = get_thread_cnt(in.row * col);
        size_t block_cnt = ceil(in.row * col, thread_cnt);

        k_t_and_mul_to<<<block_cnt, thread_cnt>>>(data, in.data, out.data, row, col, in.row);
        /*
        for(int b = 0;b < in.row;b++) {
            for(int j = 0;j < col;j++) {
                out[b][j] = 0;
                for(int i = 0;i < row;i++) {
                    out[b][j] += (*this)[i][j] * in[b][i];
                }
            }
        }
        */
    }

    void cast_add_to(Matrix& out) const {
        // 1 x out
        // b x out
        assert(row == 1 && col == out.col);
        size_t thread_cnt = get_thread_cnt(out.row * col);
        size_t block_cnt = ceil(out.row * col, thread_cnt);

        k_cast_add_to<<<block_cnt, thread_cnt>>>(data, out.data, col, out.row);
        /*
        for(int b = 0;b < out.row;b++) {
            for(int j = 0;j < col;j++) {
                out[b][j] += (*this)[0][j];
            }
        }
        */
    }

    void grad(float f, const Matrix& delta) {
        // 1 x out
        // b x out
        assert(row == 1 && col == delta.col);
        size_t thread_cnt = get_thread_cnt(col);
        size_t block_cnt = ceil(col, thread_cnt);

        k_grad<<<block_cnt, thread_cnt>>>(data, f, delta.data, col, delta.row);
        /*
        for(int j = 0;j < col;j++) {
            float sum = 0;
            for(int b = 0;b < delta.row;b++) {
                sum += delta[b][j];
            }
            sum *= f;
            (*this)[0][j] += sum;
        }
        */
    }

    void grad(float f, const Matrix& delta, const Matrix& prev_a) {
        // out x in
        // b x out
        // b x in
        assert(row == delta.col && col == prev_a.col);
        size_t thread_cnt = get_thread_cnt(row * col);
        size_t block_cnt = ceil(row * col, thread_cnt);

        k_grad<<<block_cnt, thread_cnt>>>(data, f, delta.data, prev_a.data, row, col, delta.row);
        /*
        for(int i = 0;i < row;i++) {
            for(int j = 0;j < col;j++) {
                float sum = 0;
                for(int b = 0;b < delta.row;b++) {
                    sum += delta[b][i] * prev_a[b][j];
                }
                sum *= f;
                (*this)[i][j] += sum;
            }
        }
        */
    }

    Matrix<CPU> to_cpu() const {
        Matrix<CPU> ret(row, col);
        CK(cudaMemcpy(ret.data, data, size * sizeof(float), cudaMemcpyDeviceToHost));
        return ret;
    }
};

Matrix<GPU> Matrix<CPU>::to_gpu() const {
    Matrix<GPU> ret(row, col);
    CK(cudaMemcpy(ret.data, data, size * sizeof(float), cudaMemcpyHostToDevice));
    return ret;
}
#endif


template<size_t IS_GPU>
struct Softmax{};

__host__ __device__ float softmax_calc(const float *x, const float *y, size_t size)
{
    /*
       log( exp(x_j) / \sum exp(x_k) )
       = x_j - log \sum exp(x_k)
       = x_j - (max + log \sum exp(x_k - max))
    */
    float maxX = x[0];
    for(int i = 0;i < size;i++) {
        if(x[i] > maxX) {
            maxX = x[i];
        }
    }

    float xSum = 0;
    for(int i = 0;i < size;i++) {
        xSum += expf(x[i] - maxX);
    }

    float ret = 0;
    for(int i = 0;i < size;i++) {
        ret += y[i] * (x[i] - (maxX + logf(xSum)));
    }

    return -ret;
}

__host__ __device__ void softmax_propagate_delta(const float *x, const float *y, float *z, size_t size, float f)
{
    /*
       - d \sum y_j * log( exp(x_j) / \sum exp(x_k) )
       = - d \sum y_j * x_j - d \sum y_j log (\sum exp(x_k) )
       = - y_i + \sum (y_j * exp(x_i) / \sum exp(x_k))
       = - y_i + exp(x_i) (\sum y_j) / (\sum exp(x_k))
    */

    float maxX = x[0];
    for(int i = 0;i < size;i++) {
        if(x[i] > maxX) {
            maxX = x[i];
        }
    }

    // y - exp(x) sum_of_y / sum_of_exp(x)
    float sumOfY = 0;
    float sumOfX = 0;
    for(int i = 0;i < size;i++) {
        z[i] = expf(x[i] - maxX);
        sumOfY += y[i];
        sumOfX += z[i];
    }

    float yx = sumOfY/sumOfX;
    for(int i = 0;i < size;i++) {
        z[i] = yx * z[i] - y[i];
    }

    for(int i = 0;i < size;i++) {
        z[i] *= f;
    }
}

template<>
struct Softmax<CPU>
{
    // - \sum y_j * log( exp(x_j) / \sum exp(x_k) )
    void calc(const Matrix<CPU> &x, const Matrix<CPU> &y, Matrix<CPU>& out) const
    {
        assert(x.row == y.row && x.col == y.col);
        assert(out.row == 1 && out.col == x.row);
        for(int i = 0;i < x.row;i++) {
            out[0][i] = softmax_calc(x[i], y[i], x.col);
        }
    }

    void propagate_delta(const Matrix<CPU> &x, const Matrix<CPU> &y, Matrix<CPU>& z) const
    {
        assert(x.row == y.row && x.col == y.col);
        assert(x.row == z.row && x.col == z.col);

        for(int i = 0;i < x.row;i++) {
            softmax_propagate_delta(x[i], y[i], z[i], x.col, 1.0/x.row);
        }
    }
};

#ifdef GPU
__global__ void k_calc(const float *x, const float *y, float *out, size_t row, size_t col)
{
    size_t gidx = blockDim.x * blockIdx.x + threadIdx.x;
    if(gidx >= row) return;
    size_t i = gidx;

    out[i] = softmax_calc(x + i * col, y + i * col, col);
}

__global__ void k_propagate_delta(const float *x, const float *y, float *z, size_t row, size_t col, float f)
{
    size_t gidx = blockDim.x * blockIdx.x + threadIdx.x;
    if(gidx >= row) return;
    size_t i = gidx;

    softmax_propagate_delta(x + col * i, y + col * i, z + col * i, col, f);
}

template<>
struct Softmax<GPU>
{
    void calc(const Matrix<GPU> &x, const Matrix<GPU> &y, Matrix<GPU>& out) const
    {
        assert(x.row == y.row && x.col == y.col);
        assert(out.row == 1 && out.col == x.row);

        size_t thread_cnt = get_thread_cnt(x.row);
        size_t block_cnt = ceil(x.row, thread_cnt);
        k_calc<<<block_cnt, thread_cnt>>>(x.data, y.data, out.data, x.row, x.col);
        /*
        for(int i = 0;i < x.row;i++) {
            out[0][i] = softmax_calc(x[i], y[i], x.col);
        }
        */
    }

    void propagate_delta(const Matrix<GPU> &x, const Matrix<GPU> &y, Matrix<GPU>& z) const
    {
        assert(x.row == y.row && x.col == y.col);
        assert(x.row == z.row && x.col == z.col);

        size_t thread_cnt = get_thread_cnt(x.row);
        size_t block_cnt = ceil(x.row, thread_cnt);
        k_propagate_delta<<<block_cnt, thread_cnt>>>(x.data, y.data, z.data, x.row, x.col, 1.0/x.row);
        /*
        for(int i = 0;i < x.row;i++) {
            softmax_propagate_delta(x[i], y[i], z[i], x.col, 1.0/x.row);
        }
        */
    }
};
#endif

template<size_t IS_GPU>
struct Relu{};

template<>
struct Relu<CPU>
{
    void forward(const Matrix<CPU>& x, Matrix<CPU> &y) const
    {
        assert(x.row == y.row && x.col == y.col);
        for(int i = 0;i < x.row;i++) {
            for(int j = 0;j < x.col;j++) {
                float t = x[i][j];
                y[i][j] = t > 0 ? t : 0;
            }
        }
    }

    void derive_and_dot_into(const Matrix<CPU>& x, Matrix<CPU> &y) const
    {
        assert(x.row == y.row && x.col == y.col);
        for(int i = 0;i < x.row;i++) {
            for(int j = 0;j < x.col;j++) {
                float t = x[i][j];
                y[i][j] *= t > 0 ? 1 : 0;
            }
        }
    }
};

#ifdef GPU
__global__ void k_forward(const float *x, float *y, size_t row, size_t col)
{
    size_t gidx = blockDim.x * blockIdx.x + threadIdx.x;
    int i = gidx / col;
    int j = gidx % col;
    if(i >= row) return;

    float t = x[i * col + j];
    y[i * col + j] = t > 0 ? t : 0;
}

__global__ void k_derive_and_dot_into(const float *x, float *y, size_t row, size_t col)
{
    size_t gidx = blockDim.x * blockIdx.x + threadIdx.x;
    int i = gidx / col;
    int j = gidx % col;
    if(i >= row) return;

    float t = x[i * col + j];
    y[i * col + j] *= t > 0 ? 1 : 0;
}


template<>
struct Relu<GPU>
{
    void forward(const Matrix<GPU>& x, Matrix<GPU> &y) const
    {
        assert(x.row == y.row && x.col == y.col);
        size_t thread_cnt = get_thread_cnt(x.row * x.col);
        size_t block_cnt = ceil(x.row * x.col, thread_cnt);
        k_forward<<<block_cnt, thread_cnt>>>(x.data, y.data, x.row, x.col);
        /*
        for(int i = 0;i < x.row;i++) {
            for(int j = 0;j < x.col;j++) {
                float t = x[i][j];
                y[i][j] = t > 0 ? t : 0;
            }
        }
        */
    }

    void derive_and_dot_into(const Matrix<GPU>& x, Matrix<GPU> &y) const
    {
        assert(x.row == y.row && x.col == y.col);
        size_t thread_cnt = get_thread_cnt(x.row * x.col);
        size_t block_cnt = ceil(x.row * x.col, thread_cnt);
        k_derive_and_dot_into<<<block_cnt, thread_cnt>>>(x.data, y.data, x.row, x.col);
        /*
        for(int i = 0;i < x.row;i++) {
            for(int j = 0;j < x.col;j++) {
                float t = x[i][j];
                y[i][j] *= t > 0 ? 1 : 0;
            }
        }
        */
    }
};
#endif

template<size_t IS_GPU>
struct Layer
{
    size_t in_size, out_size, batch_size;
    Matrix<IS_GPU> w, b;
    Relu<IS_GPU> relu;

    // z = w * in + b
    // a = relu(z)
    Matrix<IS_GPU> z, a, delta;

    Layer(size_t _in_size, size_t _out_size, size_t _batch_size=1) :
        in_size(_in_size), out_size(_out_size), batch_size(_batch_size),
        w(_out_size, _in_size), b(1, _out_size),
        z(_batch_size, _out_size), a(_batch_size, _out_size),
        delta(_batch_size, _out_size)
    {
        assert(in_size > 0);
        assert(out_size > 0);
        assert(batch_size > 0);
        w.init_gauss();
        b.init_gauss();
    }

    void reset_batch_size(size_t _batch_size)
    {
        assert(_batch_size > 0);
        if(_batch_size == batch_size) return;
        batch_size = _batch_size;

        z.free();
        a.free();
        delta.free();
        z = Matrix<IS_GPU>(batch_size, out_size);
        a = Matrix<IS_GPU>(batch_size, out_size);
        delta = Matrix<IS_GPU>(batch_size, out_size);
    }

    void calc(const Matrix<IS_GPU>& in)
    {
        assert(in.row == batch_size);
        assert(in.col == in_size);

        // out x in
        // batch x in
        // batch x out
        w.mul_to(in, z);

        // 1 x out
        // batch x out
        b.cast_add_to(z);

        // batch x out
        // batch x out
        relu.forward(z, a);
    }

    void propagate_delta(Matrix<IS_GPU>& out)
    {
        assert(out.row == batch_size);
        assert(out.col == in_size);

        // out x in
        // batch x out
        // batch x in
        w.t_and_mul_to(delta, out);
    }

    void update_parameters(float alpha, const Matrix<IS_GPU> &prev_a)
    {
        assert(prev_a.row == batch_size);
        assert(prev_a.col == in_size);

        // 1 x out
        // 1 x out
        b.grad(-alpha, delta);

        // out x in
        // batch x out
        // batch x in
        w.grad(-alpha, delta, prev_a);
    }
};

int MsbInt(char buf[])
{
    int base = 1;
    int ret = 0;
    for(int i = 3;i >= 0;i--) {
        ret += (unsigned char)buf[i] * base;
        base *= 256;
    }
    return ret;
}

vector<int> ReadMnistLabels(string fileName)
{
    vector<int> ret;
    ifstream ifs(fileName.c_str(), ios::binary);
    char buf[1000];

    // MAGIC
    ifs.read(buf, 4);
    int magic = MsbInt(buf);
    assert(magic == 0x00000801);

    // num of images
    ifs.read(buf, 4);
    int nImages = MsbInt(buf);

    while(nImages--) {
        ret.push_back(ifs.get());
    }

    return ret;
}

Matrix<CPU> ReadMnistData(const string& fileName)
{
    ifstream ifs(fileName.c_str(), ios::binary);
    char buf[1000];

    // MAGIC
    ifs.read(buf, 4);
    int magic = MsbInt(buf);
    assert(magic == 0x00000803);

    // num of images
    ifs.read(buf, 4);
    int nImages = MsbInt(buf);

    int row, col;
    ifs.read(buf, 4);
    row = MsbInt(buf);
    ifs.read(buf, 4);
    col = MsbInt(buf);
    assert(row == 28 && col == 28);

    size_t size = row * col;

    Matrix<CPU> ret(nImages, size);

    for(int k = 0;k < nImages;k++) {
        for(int i = 0;i < row * col;i++) {
            ret[k][i] = ifs.get() / 256.0; // normalize to [0, 1)
        }
    }

    return ret;
}

Matrix<CPU> translateLabels(const vector<int>& labels, int k=10)
{
    Matrix<CPU> ret(labels.size(), k);

    for(int i = 0;i < labels.size();i++) {
        assert(labels[i] >= 0 && labels[i] < k);
        ret[i][labels[i]] = 1;
    }
    return ret;
}

template<size_t IS_GPU>
void reset_model_batch_size(vector<Layer<IS_GPU> >&model, size_t batch_size)
{
    for(int i = 0;i < model.size();i++) {
        model[i].reset_batch_size(batch_size);
    }
}

template<size_t IS_GPU>
vector<bool> forward(
    vector<Layer<IS_GPU> >& model,
    const Matrix<IS_GPU>& in,
    const Matrix<IS_GPU>& label,
    const vector<int>& raw_label,
    const Softmax<IS_GPU>& s,
    float& error)
{
    size_t batch_size = in.row;
    assert(model[0].batch_size == batch_size);
    assert(label.row == batch_size);
    assert(raw_label.size() == batch_size);

    size_t nLayer = model.size();

    for(int j = 0;j < nLayer;j++) {
        Layer<IS_GPU> &layer = model[j];
        if(j == 0) {
            layer.calc(in);
        } else {
            layer.calc(model[j-1].a);
        }
    }

    Layer<IS_GPU> &lastLayer = model[nLayer - 1];
    Matrix<IS_GPU> error2(1, batch_size);
    s.calc(lastLayer.a, label, error2);
    error = error2.sum() / batch_size;
    error2.free();

    Matrix<CPU> cpu_a = lastLayer.a.to_cpu();
    vector<bool> ret(batch_size);
    for(int i = 0;i < batch_size;i++) {
        ret[i] = cpu_a.splice(i,i + 1).max_idx() == raw_label[i];
    }
    cpu_a.free();
    return ret;
}

template<size_t IS_GPU>
void run(Matrix<IS_GPU> &train_data,
    Matrix<IS_GPU> &train_label,
    vector<int> &raw_train_label,

    Matrix<IS_GPU> &test_data,
    Matrix<IS_GPU> &test_label,
    vector<int> &raw_test_label,

    vector<Layer<IS_GPU> >& model,
    size_t batch_size
    )
{
    clock_t start = clock();

    size_t M = train_data.row;
    size_t T = test_data.row;

    Softmax<IS_GPU> s;
    float avg_error = 0;
    float error;
    for(int i = 0;i < M;i += batch_size) {
        size_t this_batch_size = std::min(batch_size, M - i);
        reset_model_batch_size(model, this_batch_size);
        Matrix<IS_GPU> this_batch_train_data = train_data.splice(i, i + this_batch_size);
        Matrix<IS_GPU> this_batch_train_label = train_label.splice(i, i + this_batch_size);
        vector<int> this_batch_raw_train_label(raw_train_label.begin() + i, raw_train_label.begin() + i + this_batch_size);

        /*
        cout << this_batch_train_data.shape() << endl;
        cout << this_batch_train_label.shape() << endl;
        cout << this_batch_raw_train_label.size() << endl;
        */

        forward<IS_GPU>(
            model,
            this_batch_train_data,
            this_batch_train_label,
            this_batch_raw_train_label,
            s,
            error
        );
        avg_error += error * this_batch_size;

        // backward
        for(int j = model.size() - 1;j >= 0;j--) {
            Layer<IS_GPU> &layer = model[j];
            if(j == model.size() - 1) {
                s.propagate_delta(layer.a, this_batch_train_label, layer.delta);
            } else {
                model[j + 1].propagate_delta(layer.delta);
            }
            layer.relu.derive_and_dot_into(layer.a, layer.delta);
        }

        for(int j = 0;j < model.size();j++) {
            model[j].update_parameters(0.001, j == 0 ? this_batch_train_data : model[j-1].a);
        }
    }
    avg_error /= M;

    clock_t mid = clock();
    cout << "\ttime=" << ((mid-start)*1.0/CLOCKS_PER_SEC) << " error=" << avg_error << endl;

    // evaluation
    reset_model_batch_size(model, M);
    vector<bool> is_good = forward(model,
        train_data.splice(0, M),
        train_label.splice(0, M),
        vector<int>(raw_train_label.begin(), raw_train_label.begin() + M),
        s, error);
    size_t total = 0, good = 0;
    for(int i = 0;i < M;i++) {
        if(is_good[i]) good++;
        total++;
    }
    cout << "\ttrain_accuracy=" << (good*1.0/total) << " ";

    total = good = 0;
    reset_model_batch_size(model, T);
    is_good = forward(model,
        test_data.splice(0, T),
        test_label.splice(0, T),
        vector<int>(raw_test_label.begin(), raw_test_label.begin() + T),
        s, error);
    for(int i = 0;i < T;i++) {
        if(is_good[i]) good++;
        total++;
    }
    cout << "test_accuracy=" << (good*1.0/total) << " ";

    clock_t end = clock();
    cout << "total_time=" << (end-start)*1.0/CLOCKS_PER_SEC << endl;
}

void test()
{
    /*
       0 1
       1 2
       2 3
    */
    Matrix<CPU> a(3, 2);
    for(int i = 0;i < a.row;i++) {
        for(int j = 0;j < a.col;j++) {
            a[i][j] = i + j;
        }
    }
    //Matrix splice(size_t from, size_t to) {
    Matrix<CPU> t = a.splice(1,2);
    assert(t.row == 1 && t.col == 2 && t[0][0] == 1);

    //string shape() const {
    assert(a.shape() == "3 x 2");

    //void free() {
    t = Matrix<CPU>(1,1);
    t.free();
    assert(!t.data);

    //void init(float v) {
    t = Matrix<CPU>(2,2);
    t.init(2);
    assert(t[1][1] == 2);

    //void init_gauss() {
    t = Matrix<CPU>(2,2);
    assert(t[1][1] == 0);
    t.init_gauss();
    assert(t[1][1] != 0);

    //float *operator[](size_t i) {
    //const float *operator[] (size_t i) const {

    //void assert_eq(const Matrix& rhs)
    t = Matrix<CPU>(2,2);
    Matrix<CPU> t2(2,2);
    t.assert_eq(t2);

    //size_t max_idx() const {
    assert(a.splice(0,1).max_idx() == 1);

    //float sum() const {
    assert(a.splice(1,2).sum() == 3);

    //void mul_to(const Matrix& in, Matrix& out) const {
    /*
       0 1
       1 2
       2 3

       x

       1 2
       3 4

       =

       2 5 8
       4 11 18
    */
    Matrix<CPU> b(2, 2);
    b[0][0] = 1; b[0][1] = 2;
    b[1][0] = 3; b[1][1] = 4;
    Matrix<CPU> out(2, 3);
    a.mul_to(b, out);
    assert(out[0][0] == 2 && out[0][1] == 5 && out[0][2] == 8);
    assert(out[1][0] == 4 && out[1][1] == 11 && out[1][2] == 18);


    //void t_and_mul_to(const Matrix& in, Matrix& out) const {
    /*
       0 1
       1 2 -> 0 1 2
       2 3    1 2 3

       x

       1 2 3
       4 5 6

       =

       8 14
       17 32
    */
    //Matrix<CPU> ap = a.t();
    b = Matrix<CPU>(2, 3);
    b[0][0] = 1; b[0][1] = 2; b[0][2] = 3;
    b[1][0] = 4; b[1][1] = 5; b[1][2] = 6;
    out = Matrix<CPU>(2, 2);
    a.t_and_mul_to(b, out);
    //cout << out << endl;
    //Matrix<CPU> out2(2, 2);
    //ap.mul_to(b, out2);
    //cout << out2 << endl;
    //out.assert_eq(out2);
    assert(out[0][0] == 8 && out[0][1] == 14);
    assert(out[1][0] == 17 && out[1][1] == 32);

    //void cast_add_to(Matrix& out) const {
    /*
       1 2
       +
       1 2
       3 4
       =
       2 4
       4 6
    */
    b = Matrix<CPU>(1, 2);
    b[0][0] = 1; b[0][1] = 2;
    out = Matrix<CPU>(2,2);
    out[0][0] = 1; out[0][1] = 2;
    out[1][0] = 3; out[1][1] = 4;
    b.cast_add_to(out);
    assert(out[0][0] == 2 && out[0][1] == 4);
    assert(out[1][0] == 4 && out[1][1] == 6);

    //void grad(float f, Matrix& delta) {
    /*
       1 2
       grad with f = -0.5
       1 2
       3 4
       =
       -1 -1
    */
    b = Matrix<CPU>(1, 2);
    b[0][0] = 1; b[0][1] = 2;
    out = Matrix<CPU>(2,2);
    out[0][0] = 1; out[0][1] = 2;
    out[1][0] = 3; out[1][1] = 4;
    b.grad(-0.5, out);
    assert(b[0][0] == -1 && b[0][1] == -1);

    //void grad(float f, const Matrix& delta, const Matrix& prev_a) {
    /*
       0 1
       1 2
       2 3

       grad with f = -0.5

       delta=
       1 2 3 -> 1
       4 5 6    2
                3

       prev_a=
       7 8 -> 7
       9 10   8

       =

       [0][1] = 1 - 0.5 * (1 * 8 + 4 * 10) = 1 - 0.5 * 48 = 1 - 24 = -23
    */
    Matrix<CPU> w(3, 2);
    w[0][0] = 0; w[0][1] = 1;
    w[1][0] = 1; w[1][1] = 2;
    w[2][0] = 2; w[2][1] = 3;

    Matrix<CPU> delta(2, 3);
    delta[0][0] = 1; delta[0][1] = 2; delta[0][2] = 3;
    delta[1][0] = 4; delta[1][1] = 5; delta[1][2] = 6;

    Matrix<CPU> prev_a(2, 2);
    prev_a[0][0] = 7; prev_a[0][1] = 8;
    prev_a[1][0] = 9; prev_a[1][1] = 10;

    w.grad(-0.5, delta, prev_a);
    //cout << w << endl;
    assert(w[0][1] == -23);

    //Matrix to_cpu() const {
    t = a.to_cpu();
    t.assert_eq(a);
}

int main()
{
    cout << "Testing" << endl;
    test();
    //return 0;

    cout << "Loading data" << endl;
    // load the MNIST data
    vector<int> raw_train_label = ReadMnistLabels("mnist/train-labels-idx1-ubyte");
    //assert(raw_train_label.size() == 60000);
    Matrix<CPU> cpu_train_data = ReadMnistData("mnist/train-images-idx3-ubyte");
    //assert(cpu_train_data.row == 60000 && cpu_train_data.col == 28 * 28);
    Matrix<CPU> cpu_train_label = translateLabels(raw_train_label);
    //assert(cpu_train_label.row == 60000 && cpu_train_label.col == 10);
#ifdef GPU
    Matrix<GPU> gpu_train_data(cpu_train_data.to_gpu());
    Matrix<GPU> gpu_train_label(cpu_train_label.to_gpu());
#endif


    vector<int> raw_test_label = ReadMnistLabels("mnist/t10k-labels-idx1-ubyte");
    //assert(raw_test_label.size() == 10000);
    Matrix<CPU> cpu_test_data = ReadMnistData("mnist/t10k-images-idx3-ubyte");
    //assert(cpu_test_data.row == 10000 && cpu_test_data.col == 28 * 28);
    Matrix<CPU> cpu_test_label = translateLabels(raw_test_label);
    //assert(cpu_test_label.row == 10000 && cpu_test_label.col == 10);
#ifdef GPU
    Matrix<GPU> gpu_test_data(cpu_test_data.to_gpu());
    Matrix<GPU> gpu_test_label(cpu_test_label.to_gpu());
#endif

    size_t n_input = cpu_train_data.col;
    size_t n_output = 10;
    size_t n_mid = 100;
    size_t batch_size = 256;

    cout << "Now run" << endl;
    srand(1000);
    vector<Layer<CPU> > cpu_model;
    cpu_model.push_back(Layer<CPU>(n_input, n_mid));
    cpu_model.push_back(Layer<CPU>(n_mid, n_output));
#ifdef GPU
    srand(1000);
    vector<Layer<GPU> > gpu_model;
    gpu_model.push_back(Layer<GPU>(n_input, n_mid));
    gpu_model.push_back(Layer<GPU>(n_mid, n_output));
#endif

    for(int i = 0; i < 5;i++) {
        cout << "cpu-" << (i+1) << endl;
        run<CPU>(cpu_train_data, cpu_train_label, raw_train_label, cpu_test_data, cpu_test_label, raw_test_label, cpu_model, batch_size);
#ifdef GPU
        cout << "gpu-" << (i+1) << endl;
        run<GPU>(gpu_train_data, gpu_train_label, raw_train_label, gpu_test_data, gpu_test_label, raw_test_label, gpu_model, batch_size);
#endif
    }

    cout << "Done" << endl;
    return 0;
}