A Simple, Direct CUDA Port
I did a simple, direct CUDA port of the MNIST recognition program from the previous post, and the result was poor: one epoch went from roughly 5 seconds to about 50 seconds.

That is understandable. I implemented every operation (mul, add, and so on) as its own device function, and because each one knows nothing about the context it is called from, it has to end with __syncthreads(), so the kernel spends its time in barrier after barrier (see the fused-kernel sketch just before the full listing).

Still, there was some payoff: this was my first CUDA program, and in the end it basically produces correct results.

// Correction: the 5-second vs 50-second comparison was wrong, because the two networks had different structures. After changing the earlier network to match the one used by the CUDA program, the numbers became 30 vs 50 seconds; the CUDA version is still slower.
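For comparison, the sketch below shows what a fused layer would look like: if z = w*x + b and the ReLU are computed in a single loop, each thread owns complete output rows, never reads another thread's partial result, and the layer needs a single barrier instead of one per primitive op. This is only a minimal illustration of the idea, not part of the program that follows; the kernel name forward_layer, the raw-pointer layout, and the single-block assumption are mine.

__global__ void forward_layer(const float *w, const float *b, const float *x,
                              float *a, int inSize, int outSize)
{
    // Each thread computes complete output elements, so no intermediate
    // __syncthreads is needed between the matrix multiply, the bias add,
    // and the activation.
    for (int i = threadIdx.x; i < outSize; i += blockDim.x) {
        float z = b[i];
        for (int j = 0; j < inSize; j++) {
            z += w[i * inSize + j] * x[j];
        }
        a[i] = z >= 0.0f ? z : 0.0f;   // ReLU
    }
    __syncthreads();   // one barrier for the whole layer (single-block launch)
}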
#include <iostream>
#include <cstdlib>
#include <cassert>
#include <string>
#include <cstring>
#include <fstream>
#include <vector>
#include <memory>
#include <cstdlib>
#include <cuda_runtime.h>
#include <math_functions.h>
#include <cmath>
#include <ctime>
using namespace std;

void CheckCudaReturnCode(cudaError_t code, const char *fileName, int lineNo)
{
    if(code == cudaSuccess) return;
    cerr << "Cuda call failed at " << fileName << ":" << lineNo
         << " " << cudaGetErrorString(code) << endl;
    exit(-1);
}

#define CK(x) CheckCudaReturnCode((x), __FILE__, __LINE__)

// For simplicity, only a single block is used
#define BSIZE (blockDim.x)
#define TIDX (threadIdx.x)

bool InitCUDA()
{
    int count;
    cudaGetDeviceCount(&count);
    if(count == 0) {
        cerr << "There is no cuda device" << endl;
        return false;
    }
    cout << "Total " << count << " cuda devices" << endl;

    int i;
    for(i = 0;i < count;i++) {
        cudaDeviceProp prop;
        if(cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if(prop.major >= 1) {
                break;
            }
        }
    }

    if(i == count) {
        cerr << "There is no device supporting CUDA 1.x" << endl;
        return false;
    }

    cudaSetDevice(i);
    return true;
}

// http://www.cnblogs.com/yeahgis/archive/2012/07/13/2590485.html
// Gaussian-distributed random number, mean 0, variance 1
double gaussrand()
{
    static double V1, V2, S;
    static int phase = 0;
    double X;

    if ( phase == 0 ) {
        do {
            double U1 = (double)rand() / RAND_MAX;
            double U2 = (double)rand() / RAND_MAX;

            V1 = 2 * U1 - 1;
            V2 = 2 * U2 - 1;
            S = V1 * V1 + V2 * V2;
        } while(S >= 1 || S == 0);

        X = V1 * sqrt(-2 * log(S) / S);
    } else
        X = V2 * sqrt(-2 * log(S) / S);

    phase = 1 - phase;

    return X;
}

#define ALIGN_FLOAT(x) (((x) + 3) & (~3))

template<size_t ROW, size_t COL>
struct Matrix
{
    const static int row = ROW;
    const static int col = COL;
    float data[ROW][ALIGN_FLOAT(COL)];
    __device__ __host__ inline float* operator[](size_t x)
    {
        assert(x < ROW);
        return data[x];
    }
};

template<size_t SIZE>
struct Vector
{
    const static int size = SIZE;
    float data[SIZE];
    __device__ __host__ inline float &operator[](size_t x)
    {
        assert(x < SIZE);
        return data[x];
    }
};

template<size_t SIZE>
ostream& operator<<(ostream& out, Vector<SIZE>& v)
{
    out << "[(" << v.size << ") ";
    for(int i = 0;i < v.size;i++) {
        if(i > 0) out << ",";
        out << v[i];
    }
    out << "]";
    return out;
}

// Matrix-vector multiplication
template<size_t ROW, size_t COL>
__device__ inline void mul(Matrix<ROW,COL> &w, Vector<COL> &x, Vector<ROW> &out)
{
    /*
    for(int i = 0;i < w.row;i++) {
        out[i] = 0;
        for(int j = 0;j < w.col;j++) {
            out[i] += w[i][j] * x[j];
        }
    }
    */
    for(int i = TIDX;i < w.row;i += BSIZE) {
        out[i] = 0;
        for(int j = 0;j < w.col;j++) {
            out[i] += w[i][j] * x[j];
        }
    }
    __syncthreads();
}

// Element-wise vector product
template<size_t SIZE>
__device__ inline void dot(Vector<SIZE> &x, Vector<SIZE> &y, Vector<SIZE> &out)
{
    /*
    for(int i = 0;i < x.size;i++) {
        out[i] = x[i] * y[i];
    }
    */
    for(int i = TIDX;i < x.size;i += BSIZE) {
        out[i] = x[i] * y[i];
    }
    __syncthreads();
}

// Transpose w, then multiply by v
template<size_t ROW, size_t COL>
__device__ inline void t_and_mul(Matrix<ROW, COL> &w, Vector<ROW> &v, Vector<COL> &out)
{
    /*
    for(int i = 0;i < w.col;i++) {
        out[i] = 0;
        for(int j = 0;j < w.row;j++) {
            out[i] += w[j][i] * v[j];
        }
    }
    */
    for(int i = TIDX;i < w.col;i += BSIZE) {
        out[i] = 0;
        for(int j = 0;j < w.row;j++) {
            out[i] += w[j][i] * v[j];
        }
    }
    __syncthreads();
}

template<size_t SIZE>
__device__ inline void add(Vector<SIZE> &x, Vector<SIZE> &y, Vector<SIZE> &out)
{
    /*
    for(int i = 0;i < x.size;i++) {
        out[i] = x[i] + y[i];
    }
    */
    for(int i = TIDX;i < x.size;i += BSIZE) {
        out[i] = x[i] + y[i];
    }
    __syncthreads();
}

template<size_t SIZE>
__device__ inline void sub(Vector<SIZE> &x, Vector<SIZE> &y, Vector<SIZE> &out)
{
    /*
    for(int i = 0;i < x.size;i++) {
        out[i] = x[i] - y[i];
    }
    */
    for(int i = TIDX;i < x.size;i += BSIZE) {
        out[i] = x[i] - y[i];
    }
    __syncthreads();
}

template<size_t SIZE>
__device__ inline void mul(float x, Vector<SIZE> &y, Vector<SIZE> &out)
{
    /*
    for(int i = 0;i < y.size;i++) {
        out[i] = x * y[i];
    }
    */
    for(int i = TIDX;i < y.size;i += BSIZE) {
        out[i] = x * y[i];
    }
    __syncthreads();
}

template<size_t SIZE>
__device__ inline void mul(Vector<SIZE> &x, float y, Vector<SIZE> &out)
{
    mul(y, x, out);
}

template<size_t SIZE>
__device__ inline void copy(Vector<SIZE> &x, Vector<SIZE> &out)
{
    /*
    for(int i = 0;i < x.size;i++) {
        out[i] = x[i];
    }
    */
    for(int i = TIDX;i < x.size;i += BSIZE) {
        out[i] = x[i];
    }
    __syncthreads();
}

__device__ inline float sum_of_shared(int size)
{
    extern __shared__ float shared[];

    __syncthreads();

    if(TIDX == 0) {
        for(int i = 1;i < size;i++) {
            shared[0] += shared[i];
        }
    }
    __syncthreads();

    float ret = shared[0];

    __syncthreads();

    return ret;
}

__device__ inline float max_of_shared(int size)
{
    extern __shared__ float shared[];

    __syncthreads();

    if(TIDX == 0) {
        for(int i = 1;i < size;i++) {
            if(shared[0] < shared[i]) {
                shared[0] = shared[i];
            }
        }
    }
    __syncthreads();

    float ret = shared[0];

    __syncthreads();

    return ret;
}

template<size_t SIZE>
__device__ inline float max(Vector<SIZE>& x)
{
    assert(x.size > 0);

    extern __shared__ float shared[];

    if(TIDX < x.size) {
        shared[TIDX] = x[TIDX];
    }

    for(int i = TIDX + BSIZE;i < x.size;i += BSIZE) {
        if(shared[TIDX] < x[i]) {
            shared[TIDX] = x[i];
        }
    }

    return max_of_shared(min(BSIZE, x.size));
}

template<size_t SIZE>
__device__ inline float sum(Vector<SIZE>& x)
{
    assert(x.size > 0);

    extern __shared__ float shared[];

    if(TIDX < x.size) {
        shared[TIDX] = 0;
    }

    for(int i = TIDX;i < x.size;i += BSIZE) {
        shared[TIDX] += x[i];
    }

    return sum_of_shared(min(BSIZE, x.size));
}

template<size_t SIZE>
__device__ inline void add_with_xs(Vector<SIZE> &x, float xs, Vector<SIZE> &y, Vector<SIZE> &out)
{
    /*
    for(int i = 0;i < x.size;i++) {
        out[i] = x[i] + xs * y[i];
    }
    */
    for(int i = TIDX;i < x.size;i += BSIZE) {
        out[i] = x[i] + xs * y[i];
    }

    __syncthreads();
}

template<size_t SIZE>
struct Softmax
{
    __device__ static inline float calc(Vector<SIZE>& x, Vector<SIZE>& y)
    {
        // - \sum y_j * log( exp(x_j) / \sum exp(x_k) )
        /*
        log( exp(x_j) / \sum exp(x_k) )
        = x_j - log \sum exp(x_k)
        = x_j - (max + log \sum exp(x_k - max))
        */

        float maxX = max(x);

        /*
        float xSum = 0;
        for(int i = 0;i < x.size;i++) {
            xSum += expf(x[i] - maxX);
        }
        */

        extern __shared__ float shared[];
        if(TIDX < x.size) {
            shared[TIDX] = 0;
        }

        for(int i = TIDX;i < x.size;i += BSIZE) {
            shared[TIDX] += expf(x[i] - maxX);
        }

        float xSum = sum_of_shared(min(BSIZE, x.size));

        /*
        float ret = 0;
        for(int i = 0;i < x.size;i++) {
            ret += y[i] * (x[i] - (maxX + logf(xSum)));
        }
        */
        if(TIDX < x.size) {
            shared[TIDX] = 0;
        }
        for(int i = TIDX;i < x.size;i += BSIZE) {
            shared[TIDX] += y[i] * (x[i] - (maxX + logf(xSum)));
        }

        float ret = sum_of_shared(min(BSIZE, x.size));

        return -ret;
    }

    static inline float calc_host(Vector<SIZE>& x, Vector<SIZE>& y)
    {
        // - \sum y_j * log( exp(x_j) / \sum exp(x_k) )
        /*
        log( exp(x_j) / \sum exp(x_k) )
        = x_j - log \sum exp(x_k)
        = x_j - (max + log \sum exp(x_k - max))
        */

        float maxX = x[0];
        for(int i = 1;i < x.size;i++) {
            if(x[i] > maxX) {
                maxX = x[i];
            }
        }

        float xSum = 0;
        for(int i = 0;i < x.size;i++) {
            xSum += expf(x[i] - maxX);
        }

        float ret = 0;
        for(int i = 0;i < x.size;i++) {
            ret += y[i] * (x[i] - (maxX + logf(xSum)));
        }

        return -ret;
    }

    __device__ static inline void propagate_delta(Vector<SIZE> &x, Vector<SIZE> &y, Vector<SIZE> &out)
    {
        /*
        - d \sum y_j * log( exp(x_j) / \sum exp(x_k) )
        = - d \sum y_j * x_j + d \sum y_j log (\sum exp(x_k) )
        = - y_i + \sum (y_j * exp(x_i) / \sum exp(x_k))
        = - y_i + exp(x_i) (\sum y_j) / (\sum exp(x_k))
        */

        float maxX = max(x);

        // -y + exp(x) sum_of_y / sum_of_exp(x)

        /*
        float sumOfY = 0;
        float sumOfX = 0;

        for(int i = 0;i < x.size;i++) {
            out[i] = expf(x[i] - maxX);
            sumOfY += y[i];
            sumOfX += out[i];
        }
        */

        for(int i = TIDX;i < x.size;i += BSIZE) {
            out[i] = expf(x[i] - maxX);
        }

        float sumOfY = sum(y);
        float sumOfX = sum(out);

        float t = sumOfY/sumOfX;

        mul(t, out, out);
        sub(out, y, out);
    }
};

template<size_t SIZE>
struct Relu
{
    __device__ static inline void forward(Vector<SIZE> &x, Vector<SIZE> &out)
    {
        /*
        for(int i = 0;i < x.size;i++) {
            out[i] = x[i] >= 0 ? x[i] : 0;
        }
        */
        for(int i = TIDX;i < x.size;i += BSIZE) {
            out[i] = x[i] >= 0 ? x[i] : 0;
        }
        __syncthreads();
    }

    __device__ static inline void derive(Vector<SIZE> &x, Vector<SIZE> &out)
    {
        /*
        for(int i = 0;i < x.size;i++) {
            out[i] = x[i] >= 0 ? 1 : 0;
        }
        */
        for(int i = TIDX;i < x.size;i += BSIZE) {
            out[i] = x[i] >= 0 ? 1 : 0;
        }
        __syncthreads();
    }

    __device__ static inline void derive_and_dot_into(Vector<SIZE> &x, Vector<SIZE> &out)
    {
        // out = dot(dx, out)
        /*
        for(int i = 0;i < x.size;i++) {
            out[i] = out[i] * (x[i] >= 0 ? 1 : 0);
        }
        */
        for(int i = TIDX;i < x.size;i += BSIZE) {
            out[i] = out[i] * (x[i] >= 0 ? 1 : 0);
        }
        __syncthreads();
    }
};

// One layer of the NN
// 1. The input does not count as a layer
// 2. A layer's w matrix maps the previous layer's output to this layer, which differs slightly from Ng's definition
// 3. A layer's b likewise belongs to the edge from the previous layer to this layer, again slightly different from Ng's definition
template <size_t IN_SIZE, size_t OUT_SIZE, typename Activator=Relu<OUT_SIZE> >
struct Layer
{
    typedef Activator ActivatorType;

    // Number of outputs of the previous layer, not counting the bias
    const static int inSize = IN_SIZE;
    // Number of outputs of this layer
    const static int outSize = OUT_SIZE;

    Matrix<OUT_SIZE, IN_SIZE> w;
    Vector<OUT_SIZE> b;
    // Activations saved by the last forward pass
    Vector<OUT_SIZE> a;
    Vector<OUT_SIZE> z;
    // Delta saved by the last backward pass
    Vector<OUT_SIZE> delta;

    void initWeights()
    {
        for(int i = 0;i < b.size;i++) {
            b[i] = float(gaussrand() * 0.01);
        }
        for(int i = 0;i < w.row;i++) {
            for(int j = 0;j < w.col;j++) {
                w[i][j] = float(gaussrand() * 0.01);
            }
        }
    }

    Layer()
    {
        initWeights();
    }

    __device__ inline void calc(Vector<IN_SIZE> &in)
    {
        // w * in + b
        mul(w, in, z);
        add(z, b, z);
        Activator::forward(z, a);
    }

    __device__ inline void propagate_delta(Vector<IN_SIZE> &out)
    {
        t_and_mul(w, delta, out);
    }

    // alpha is the learning rate
    // prevA is the previous layer's output
    __device__ inline void update_parameters(float alpha, Vector<IN_SIZE> &prevA)
    {
        // b = b - alpha * delta
        add_with_xs(b, -alpha, delta, b);

        /*
        for(int i = 0;i < w.row;i++) {
            for(int j = 0;j < w.col;j++) {
                w[i][j] = w[i][j] - alpha * prevA[j] * delta[i];
            }
        }
        */
        for(int i = TIDX;i < w.row;i += BSIZE) {
            for(int j = 0;j < w.col;j++) {
                w[i][j] = w[i][j] - alpha * prevA[j] * delta[i];
            }
        }
        __syncthreads();
    }
};

// model
#define INPUT_SIZE (28 * 28)
#define OUTPUT_SIZE 10

typedef Layer<INPUT_SIZE, 100, Relu<100> > L1_TYPE;

typedef Layer<100, OUTPUT_SIZE, Relu<OUTPUT_SIZE> > L2_TYPE;

#define LN_TYPE L2_TYPE
#define LN L2

__global__ void forward(
    L1_TYPE &L1, L2_TYPE &L2,
    Vector<INPUT_SIZE> &input, Vector<OUTPUT_SIZE> &output)
{
    L1.calc(input);

    L2.calc(L1.a);

    copy(L2.a, output);
}

template<typename CostFun>
__global__ void backward(
    L1_TYPE &L1, L2_TYPE &L2,
    Vector<INPUT_SIZE> &input, Vector<OUTPUT_SIZE> &y, float alpha)
{
    // Last layer
    CostFun::propagate_delta(LN.a, y, LN.delta);
    LN_TYPE::ActivatorType::derive_and_dot_into(LN.a, LN.delta);

    // Other layers
    L2.propagate_delta(L1.delta);
    L1_TYPE::ActivatorType::derive_and_dot_into(L1.a, L1.delta);

    // Update all w and b
    L1.update_parameters(alpha, input);
    L2.update_parameters(alpha, L1.a);
}

// Read a big-endian integer
int MsbInt(char buf[], int len=4)
{
    int base = 1;
    int ret = 0;
    for(int i = len - 1;i >= 0;i--) {
        ret += (unsigned char)buf[i] * base;
        base *= 256;
    }
    return ret;
}

vector<int> ReadMnistLabels(string fileName)
{
    vector<int> ret;
    ifstream ifs(fileName.c_str(), ios::binary);
    char buf[1000];

    // MAGIC
    ifs.read(buf, 4);
    int magic = MsbInt(buf);
    if(magic != 0x00000801) {
        cerr << "incorrect label file magic number" << endl;
    }

    // num of images
    ifs.read(buf, 4);
    int nImages = MsbInt(buf);

    while(nImages--) {
        ret.push_back(ifs.get());
    }

    return ret;
}

Vector<INPUT_SIZE> * ReadMnistData(string fileName)
{
    ifstream ifs(fileName.c_str(), ios::binary);
    char buf[1000];

    // MAGIC
    ifs.read(buf, 4);
    int magic = MsbInt(buf);
    if(magic != 0x00000803) {
        cerr << "incorrect data file magic number" << endl;
    }

    // num of images
    ifs.read(buf, 4);
    int nImages = MsbInt(buf);
    Vector<INPUT_SIZE> * ret = new Vector<INPUT_SIZE>[nImages];

    int row, col;
    ifs.read(buf, 4);
    row = MsbInt(buf);
    ifs.read(buf, 4);
    col = MsbInt(buf);
    if(row * col != INPUT_SIZE) {
        cerr << "incorrect image size" << endl;
    }

    for(int k = 0;k < nImages;k++) {
        Vector<INPUT_SIZE> &image = ret[k];
        for(int i = 0;i < row * col;i++) {
            image[i] = ifs.get() / 256.0; // normalize to [0, 1)
        }
    }

    return ret;
}

Vector<OUTPUT_SIZE>* translateLabels(vector<int> &labels, int k=10)
{
    int n = labels.size();
    Vector<OUTPUT_SIZE> * ret = new Vector<OUTPUT_SIZE>[n];

    for(int i = 0;i < labels.size();i++) {
        Vector<OUTPUT_SIZE> &tmp = ret[i];
        memset(&tmp, 0, sizeof(tmp));
        assert(labels[i] >= 0 && labels[i] < k);
        tmp[labels[i]] = 1;
    }
    return ret;
}

int getMaxIdx(Vector<OUTPUT_SIZE>& x)
{
    int maxIdx = 0;
    float maxV = x[0];
    for(int i = 0;i < x.size;i++) {
        if(x[i] > maxV) {
            maxV = x[i];
            maxIdx = i;
        }
    }
    return maxIdx;
}

template <typename T>
void CUDA_ALLOC_AND_COPY(T *&to, T *from, size_t size)
{
    CK(cudaMalloc((void**)&to, size));
    CK(cudaMemcpy(to, from, size, cudaMemcpyHostToDevice));
}

int main()
{
    srand(1000);

    if(!InitCUDA()) {
        return -1;
    }

    L1_TYPE *tmpL1 = new L1_TYPE(), *L1;
    CUDA_ALLOC_AND_COPY(L1, tmpL1, sizeof(*L1));
    delete tmpL1;

    L2_TYPE *tmpL2 = new L2_TYPE(), *L2;
    CUDA_ALLOC_AND_COPY(L2, tmpL2, sizeof(*L2));
    delete tmpL2;

    cout << "Loading data" << endl;

    // Load the data
    vector<int> trainLabels = ReadMnistLabels("mnist/train-labels-idx1-ubyte");
    int nTrain = trainLabels.size();
    Vector<OUTPUT_SIZE>* trainLabels2 = translateLabels(trainLabels);
    Vector<OUTPUT_SIZE>* trainLabels2OnGpu;
    CUDA_ALLOC_AND_COPY(trainLabels2OnGpu, trainLabels2, sizeof(trainLabels2[0]) * nTrain);

    Vector<INPUT_SIZE>* trainData = ReadMnistData("mnist/train-images-idx3-ubyte");
    Vector<INPUT_SIZE>* trainDataOnGpu;
    CUDA_ALLOC_AND_COPY(trainDataOnGpu, trainData, sizeof(trainData[0]) * nTrain);

    vector<int> testLabels = ReadMnistLabels("mnist/t10k-labels-idx1-ubyte");
    int nTest = testLabels.size();

    Vector<INPUT_SIZE>* testData = ReadMnistData("mnist/t10k-images-idx3-ubyte");
    Vector<INPUT_SIZE>* testDataOnGpu;
    CUDA_ALLOC_AND_COPY(testDataOnGpu, testData, sizeof(testData[0]) * nTest);

    int M = nTrain;
    int T = nTest;

    typedef Softmax<OUTPUT_SIZE> CostFun;

    // Start training
    cout << "Start training" << endl;
    clock_t fullStartedAt = clock();

    Vector<OUTPUT_SIZE> *outputOnCuda;
    CK(cudaMalloc((void**)&outputOnCuda, sizeof(*outputOnCuda)));

    Vector<OUTPUT_SIZE> output;

    // For simplicity, only one block is supported
#define N_BLOCK 1
    // but multiple threads can be used
#define N_THREAD 1024
#define SHARED_SIZE (sizeof(float) * N_THREAD)

    for(int step = 0;step < 100000;step++) {
        clock_t step_1 = clock();

        float avgError = 0;

        for(int i = 0;i < M;i++) {
            Vector<INPUT_SIZE> &x = trainDataOnGpu[i];
            Vector<OUTPUT_SIZE> &y = trainLabels2OnGpu[i];

            forward<<<N_BLOCK, N_THREAD, SHARED_SIZE>>>(*L1, *L2, x, *outputOnCuda);

            CK(cudaMemcpy(&output, outputOnCuda, sizeof(output), cudaMemcpyDeviceToHost));

            Vector<OUTPUT_SIZE> &hostY = trainLabels2[i];
            float error = CostFun::calc_host(output, hostY);
            avgError += error;
            //cout << output << " " << hostY << endl;

            backward<CostFun><<<N_BLOCK, N_THREAD, SHARED_SIZE>>>(*L1, *L2, x, y, 0.001);
        }
        avgError /= M;

        clock_t step_2 = clock();

        cout << "step=" << step << " time_cost=" << (step_2 - step_1)*1.0/CLOCKS_PER_SEC << " avgErr=" << avgError << " ";

        // validate
        int nTotal = 0;
        int nGood = 0;
        for(int i = 0;i < M;i++) {
            Vector<INPUT_SIZE> &x = trainDataOnGpu[i];

            forward<<<N_BLOCK, N_THREAD, SHARED_SIZE>>>(*L1, *L2, x, *outputOnCuda);

            CK(cudaMemcpy(&output, outputOnCuda, sizeof(output), cudaMemcpyDeviceToHost));

            int maxIdx = getMaxIdx(output);
            if(maxIdx == trainLabels[i]) {
                nGood++;
            }
            nTotal++;
        }
        cout << "train_accuracy " << nGood << "/" << nTotal << "=" << nGood*1.0/nTotal << " ";
        bool doBreak = false;
        if(nGood * 1.0 / nTotal > 0.95) {
            doBreak = true;
        }

        // check
        nTotal = 0;
        nGood = 0;
        for(int i = 0;i < T;i++) {
            Vector<INPUT_SIZE> &x = testDataOnGpu[i];

            forward<<<N_BLOCK, N_THREAD, SHARED_SIZE>>>(*L1, *L2, x, *outputOnCuda);

            CK(cudaMemcpy(&output, outputOnCuda, sizeof(output), cudaMemcpyDeviceToHost));

            int maxIdx = getMaxIdx(output);
            if(maxIdx == testLabels[i]) {
                nGood++;
            }
            nTotal++;
        }
        cout << "test_accuracy " << nGood << "/" << nTotal << "=" << nGood*1.0/nTotal << " ";

        clock_t step_3 = clock();
        cout << "total_time=" << (step_3-step_1)*1.0/CLOCKS_PER_SEC << endl;
        if(doBreak) {
            break;
        }
    }

    clock_t fullEndedAt = clock();
    cout << "Total cost " << (fullEndedAt - fullStartedAt)/CLOCKS_PER_SEC << " seconds" << endl;

    return 0;
}
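The whole program is a single translation unit, so it should build directly with nvcc, roughly like this (the source file name and the -arch value are placeholders, adjust them to your setup); it expects the four MNIST files under a mnist/ subdirectory, as the paths in main() show:

nvcc -O2 -arch=sm_35 mnist_cuda.cu -o mnist_cuda
./mnist_cuda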