首页 > 代码库 > Spark学习笔记——手写数字识别
Spark学习笔记——手写数字识别
import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.regression.RandomForestRegressor
import org.apache.spark.mllib.classification.{LogisticRegressionWithLBFGS, NaiveBayes, SVMWithSGD}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.optimization.L1Updater
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.{DecisionTree, RandomForest}
import org.apache.spark.mllib.tree.configuration.Algo
import org.apache.spark.mllib.tree.impurity.Entropy

/**
 * Created by common on 17-5-17.
 */

/** A labeled picture: digit label plus the flattened pixel values. */
case class LabeledPic(
  label: Int,
  pic: List[Double] = List()
)

/**
 * Trains classifiers on the Kaggle "Digit Recognizer" data set (MNIST-style
 * CSV: first column is the 0-9 label, remaining columns are pixel values)
 * and reports training-set accuracy. Only the Naive Bayes path is active;
 * logistic regression, decision tree and random forest experiments are kept
 * below as commented-out alternatives.
 *
 * NOTE(review): this file was recovered from a garbled web scrape that had
 * overwritten several assignments; the reconstructed lines are marked below
 * and should be verified against the original post.
 */
object DigitRecognizer {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("DigitRecgonizer").setMaster("local")
    val sc = new SparkContext(conf)

    // Header row was stripped beforehand: sed 1d train.csv > train_noheader.csv
    val trainFile = "file:///media/common/工作/kaggle/DigitRecognizer/train_noheader.csv"
    val trainRawData = sc.textFile(trainFile)
    // Split each line on commas, producing an RDD of string arrays.
    val trainRecords = trainRawData.map(line => line.split(","))
    // First column is the digit label; the rest are pixel intensities.
    // NOTE(review): reconstructed — the scrape destroyed this mapping body.
    val trainData = trainRecords.map { r =>
      val label = r(0).toInt
      val features = r.tail.map(_.toDouble)
      LabeledPoint(label, Vectors.dense(features))
    }

    // Train a multinomial Naive Bayes classifier on the full training set.
    // NOTE(review): reconstructed to match the surviving accuracy println and
    // the parallel lr/dt/rt sections below.
    val nbModel = NaiveBayes.train(trainData)

    // Training-set accuracy: fraction of points whose prediction matches the label.
    val nbTotalCorrect = trainData.map { point =>
      if (nbModel.predict(point.features) == point.label) 1 else 0
    }.sum
    val nbAccuracy = nbTotalCorrect / trainData.count

    println("贝叶斯模型正确率:" + nbAccuracy)

    // // Predict on the (header-stripped) test data and save the results.
    // val testRawData = sc.textFile("file:///media/common/工作/kaggle/DigitRecognizer/test_noheader.csv")
    // // Split each line on commas, producing an RDD of string arrays.
    // val testRecords = testRawData.map(line => line.split(","))
    // val testData = testRecords.map { r => Vectors.dense(r.map(_.toDouble)) }
    // nbModel.predict(testData).saveAsTextFile("file:///media/common/工作/kaggle/DigitRecognizer/test_predict")

    // // Logistic regression model (10 classes, LBFGS optimizer).
    // val lrModel = new LogisticRegressionWithLBFGS()
    //   .setNumClasses(10)
    //   .run(trainData)
    //
    // val lrTotalCorrect = trainData.map { point =>
    //   if (lrModel.predict(point.features) == point.label) 1 else 0
    // }.sum
    // val lrAccuracy = lrTotalCorrect / trainData.count
    //
    // println("线性回归模型正确率:" + lrAccuracy)
    //
    // // Predict on the test data as above; results saved to
    // // "file:///media/common/工作/kaggle/DigitRecognizer/test_predict1"

    // // Decision tree model (entropy impurity, depth 10, 10 classes).
    // val maxTreeDepth = 10
    // val numClass = 10
    // val dtModel = DecisionTree.train(trainData, Algo.Classification, Entropy, maxTreeDepth, numClass)
    //
    // val dtTotalCorrect = trainData.map { point =>
    //   if (dtModel.predict(point.features) == point.label) 1 else 0
    // }.sum
    // val dtAccuracy = dtTotalCorrect / trainData.count
    //
    // println("决策树模型正确率:" + dtAccuracy)
    //
    // // Predict on the test data as above; results saved to
    // // "file:///media/common/工作/kaggle/DigitRecognizer/test_predict2"

    // // Random forest model (50 trees, gini impurity, depth 10).
    // val numClasses = 30
    // val categoricalFeaturesInfo = Map[Int, Int]()
    // val numTrees = 50
    // val featureSubsetStrategy = "auto"
    // val impurity = "gini"
    // val maxDepth = 10
    // val maxBins = 32
    // val rtModel = RandomForest.trainClassifier(trainData, numClasses, categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins)
    //
    // val rtTotalCorrect = trainData.map { point =>
    //   if (rtModel.predict(point.features) == point.label) 1 else 0
    // }.sum
    // val rtAccuracy = rtTotalCorrect / trainData.count
    //
    // println("随机森林模型正确率:" + rtAccuracy)
    //
    // // Predict on the test data as above; results saved to
    // // "file:///media/common/工作/kaggle/DigitRecognizer/test_predict"
  }
}
Spark学习笔记——手写数字识别
声明:以上内容来自用户投稿及互联网公开渠道收集整理发布,本网站不拥有所有权,未作人工编辑处理,也不承担相关法律责任,若内容有误或涉及侵权可进行投诉: 投诉/举报 工作人员会在5个工作日内联系你,一经查实,本站将立刻删除涉嫌侵权内容。