Notes : <Hands-on ML with Sklearn & TF> Chapter 3
Chapter 3 - Classification
MNIST
- MNIST is a dataset of 70,000 small images of handwritten digits
- the "Hello World" of Machine Learning
# fetch MNIST
from sklearn.datasets import fetch_mldata
mnist = fetch_mldata('MNIST original')
# The download kept failing for me; as a workaround, download mnist-original.mat
# into ~/scikit_learn_data/mldata/ manually (search mldata.org / Google for it).
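Note: mldata.org is now defunct and fetch_mldata was removed from scikit-learn entirely. A minimal sketch of the replacement, assuming scikit-learn >= 0.20 (the as_frame flag needs >= 0.24); the instance ordering also differs from the old fetch_mldata, so indices like X[36000] may point at different digits:

from sklearn.datasets import fetch_openml
# same 70000 x 784 data; as_frame=False keeps plain numpy arrays,
# and the labels come back as strings, hence the astype
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
mnist["target"] = mnist["target"].astype('uint8')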
- A DESCR key describing the dataset
- A data key containing an array with one row per instance and one column per feature
- A target key containing an array with the labels
X, y = mnist["data"], mnist["target"]
print(X.shape, y.shape)  # 784 = 28x28 pixels, each 0-255 (white to black)
(70000, 784) (70000,)
# show one digit
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

some_digit = X[12345]
some_digit_image = some_digit.reshape(28, 28)
plt.imshow(some_digit_image, cmap=matplotlib.cm.binary, interpolation="nearest")
plt.axis('off')
plt.show()
# EXTRA
import numpy as np

def plot_digits(instances, images_per_row=10, **options):
    size = 28
    images_per_row = min(len(instances), images_per_row)
    images = [instance.reshape(size, size) for instance in instances]  # convert to 28x28 pixel arrays
    n_rows = (len(instances) - 1) // images_per_row + 1
    row_images = []
    n_empty = n_rows * images_per_row - len(instances)
    images.append(np.zeros((size, size * n_empty)))
    for row in range(n_rows):
        rimages = images[row * images_per_row : (row + 1) * images_per_row]
        row_images.append(np.concatenate(rimages, axis=1))  # reshape the list into image rows
    image = np.concatenate(row_images, axis=0)
    plt.imshow(image, cmap=matplotlib.cm.binary, **options)
    plt.axis("off")

plt.figure(figsize=(9, 9))
example_images = np.r_[X[:12000:600], X[13000:30600:600], X[30600:60000:590]]  # pick 100 digits spread across the dataset
plot_digits(example_images, images_per_row=10)
plt.show()
y[12345]
1.0
- shuffle the training set
- this guarantees that all cross-validation folds will be similar
- some algorithms are sensitive to the order of the training instances, and perform poorly if they get many similar instances in a row
import numpy as np

X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]
shuffle_index = np.random.permutation(60000)
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]
Train a Binary Classifier
- deciding whether an image shows a particular digit is a binary classification problem, e.g. "5" vs. "not-5"
- Stochastic Gradient Descent (SGD) classifier
- trains on instances independently, one at a time
y_train_5 = (y_train == 5)
y_test_5 = (y_test == 5)

from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train_5)
sgd_clf.predict([X[36000]])
array([ True], dtype=bool)
Performance Measures
Measuring Accuracy Using Cross-Validation
- sometimes you need more control over the cross-validation process than cross_val_score() offers
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

# StratifiedKFold performs stratified sampling
# (in newer scikit-learn, random_state only takes effect with shuffle=True)
skfolds = StratifiedKFold(n_splits=3, random_state=42)
for train_index, test_index in skfolds.split(X_train, y_train_5):
    clone_clf = clone(sgd_clf)
    X_train_folds = X_train[train_index]
    y_train_folds = y_train_5[train_index]
    X_test_fold = X_train[test_index]
    y_test_fold = y_train_5[test_index]
    clone_clf.fit(X_train_folds, y_train_folds)
    y_pred = clone_clf.predict(X_test_fold)
    n_correct = sum(y_pred == y_test_fold)
    print(n_correct / len(y_pred))
0.953
0.9525
0.95515
# use cross_val_score
from sklearn.model_selection import cross_val_score
cross_val_score(sgd_clf, X_train, y_train_5, cv=3, scoring='accuracy')
array([ 0.953 , 0.9525 , 0.95515])
This accuracy is less impressive than it looks: only about 10% of the images are 5s, so even a classifier that always predicts "not-5" reaches about 90% accuracy under cross-validation.
from sklearn.base import BaseEstimator

class Never5Classifier(BaseEstimator):
    def fit(self, X, y=None):
        pass
    def predict(self, X):
        return np.zeros((len(X), 1), dtype=bool)

never_5_clf = Never5Classifier()
cross_val_score(never_5_clf, X_train, y_train_5, cv=3, scoring='accuracy')
array([ 0.90825, 0.9112 , 0.9095 ])
Confusion Matrix
Source: Wikipedia, confusion matrix
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict

y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)
confusion_matrix(y_train_5, y_train_pred)
array([[53207,  1372],
       [ 1415,  4006]])
y_train_perfect_predictions = y_train_5
confusion_matrix(y_train_5, y_train_perfect_predictions)
array([[54579,     0],
       [    0,  5421]])
Precision and Recall
from sklearn.metrics import precision_score, recall_score
print(precision_score(y_train_5, y_train_pred))
print(recall_score(y_train_5, y_train_pred))
0.744886574935
0.738978048331
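These scores follow directly from the confusion matrix above (rows = actual class, columns = predicted class), a quick sanity check:

# from the matrix: TP = 4006 (true 5s predicted as 5), FP = 1372, FN = 1415
TP, FP, FN = 4006, 1372, 1415
print(TP / (TP + FP))  # precision ≈ 0.7449
print(TP / (TP + FN))  # recall   ≈ 0.7390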
# F1 is the harmonic mean of precision and recall
from sklearn.metrics import f1_score
f1_score(y_train_5, y_train_pred)
0.7419205481989074
F1 favors classifiers that have similar precision and recall, but that is not always what you want. Sometimes you would rather reject a hundred good instances than let one bad one through (low recall, high precision), e.g. classifying videos as safe for kids; in other cases the opposite trade-off is right, e.g. catching shoplifters, where false alarms are acceptable as long as few thieves slip by.
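As a worked check of the harmonic mean, using the precision and recall printed above:

# F1 = 2 / (1/precision + 1/recall) = 2 * precision * recall / (precision + recall)
p, r = 0.744886574935, 0.738978048331
print(2 * p * r / (p + r))  # ≈ 0.74192, matching f1_score above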
Precision / Recall Tradeoff
- lowering the threshold increases recall and reduces precision
- sklearn doesn't let you set the threshold directly, but it does give you access to the decision scores it uses to make predictions
some_digit_index = 36000
some_digit = X[some_digit_index]
y_scores = sgd_clf.decision_function([some_digit])
y_scores
array([ 45981.28253526])
threshold = 0
y_some_digit_pred = (y_scores > threshold)
y_some_digit_pred
array([ True], dtype=bool)
threshold = 200000
y_some_digit_pred = (y_scores > threshold)
y_some_digit_pred
array([False], dtype=bool)
decide which threshold to use
# get decision scores for every training instance via cross-validation
y_scores = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3, method="decision_function")
from sklearn.metrics import precision_recall_curve
# compute precision and recall for every possible threshold
precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)
# plot them
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
    plt.plot(thresholds, recalls[:-1], "g-", label="Recall")
    plt.xlabel("Threshold")
    plt.legend(loc="upper left")
    plt.ylim([0, 1])

plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.show()
y_train_pred_90 = (y_scores > 250000)
precision_score(y_train_5, y_train_pred_90)
0.96514161220043571
recall_score(y_train_5, y_train_pred_90)
0.32687695997048516
just set a high enough threshold to create a classifier with virtually any precision (though a high-precision classifier is not very useful if its recall is too low)
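The 250000 threshold above is read off the plot by eye; a sketch that instead picks the lowest threshold reaching 90% precision from the arrays precision_recall_curve returned (idx_90 and threshold_90 are names introduced here):

# np.argmax on a boolean array returns the first True, i.e. the first
# index where precision reaches 90% (precisions has one more entry than thresholds)
idx_90 = np.argmax(precisions >= 0.90)
threshold_90 = thresholds[idx_90]
y_train_pred_90 = (y_scores >= threshold_90)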
ROC (receiver operating characteristic) curve: plots the true positive rate against the false positive rate
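For reference, both rates expressed in confusion-matrix counts:

TPR = TP / (TP + FN)  (this is just recall, a.k.a. sensitivity)
FPR = FP / (FP + TN)  (= 1 - TNR, where TNR is the specificity)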
from sklearn.metrics import roc_curve

fpr, tpr, thresholds = roc_curve(y_train_5, y_scores)

def plot_roc_curve(fpr, tpr, label=None):
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--')  # the diagonal is a purely random classifier
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')

plot_roc_curve(fpr, tpr)
plt.show()
# compute the area under the curve (AUC)
from sklearn.metrics import roc_auc_score
roc_auc_score(y_train_5, y_scores)
0.9568006259068953
- prefer the PR curve whenever the positive class is rare, or when you care more about the false positives than the false negatives
- otherwise use the ROC curve (and the ROC AUC score)
- sklearn classifiers expose either decision_function() or predict_proba() (which returns an array with one row per instance and one column per class, each containing the probability that the given instance belongs to the given class)
from sklearn.ensemble import RandomForestClassifier

forest_clf = RandomForestClassifier(random_state=42)
y_probas_forest = cross_val_predict(forest_clf, X_train, y_train_5, cv=3, method="predict_proba")
# use the positive-class probability as the score
y_scores_forest = y_probas_forest[:, 1]
fprs_forest, tprs_forest, thresholds_forest = roc_curve(y_train_5, y_scores_forest)
plt.plot(fpr, tpr, 'b:', label="SGD")
plot_roc_curve(fprs_forest, tprs_forest, "Random Forest")
plt.legend(loc='lower right')  # 'bottom right' is not a valid matplotlib location
plt.show()
roc_auc_score(y_train_5, y_scores_forest)
0.99114321301880992
- how to train a binary classifier
- how to choose the right metric for the task
- how to evaluate classifiers using cross-validation
- how to select the precision/recall trade-off that fits your needs, and how to compare models using ROC curves and ROC AUC scores
Multiclass Classification
- some algorithms support multiclass classification natively
- others can be combined out of multiple binary classifiers instead
- one-versus-all (OvA): train one binary classifier per class; to classify an image, run all of them and pick the class with the highest score
- one-versus-one (OvO): train one classifier for every pair of classes (1 vs 2, 1 vs 3, ..., 8 vs 9, ...), N(N-1)/2 classifiers in total
- OvO suits algorithms that scale poorly with the size of the training set (e.g. SVM), since each classifier only trains on the two relevant classes; most other algorithms prefer OvA
- when you give a binary classifier a multiclass task, sklearn automatically picks OvA or OvO as appropriate
# try SGDClassifier on the full 10-class problem
sgd_clf.fit(X_train, y_train)
sgd_clf.predict([some_digit])
array([ 5.])
some_digit_scores = sgd_clf.decision_function([some_digit])
some_digit_scores
array([[-305117.56076994, -572405.6562905 , -386686.20587505, -198578.92561098,
        -312977.5748752 ,   45981.28253526, -752588.92027703, -425193.41816061,
        -692575.39314386, -732446.97820597]])
np.argmax(some_digit_scores)
5
sgd_clf.classes_
array([ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9.])
sgd_clf.classes_[5]  # conveniently, class 5 happens to sit at index 5
5.0
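It is only a coincidence in the sense that the classes here happen to be the digits 0-9 in order; in general the argmax index must be mapped back through classes_:

# look up the highest-scoring class in the classifier's own class ordering
predicted_class = sgd_clf.classes_[np.argmax(some_digit_scores)]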
# force sklearn to use OvO or OvA: use OneVsOneClassifier or OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier

ovo_clf = OneVsOneClassifier(SGDClassifier(random_state=42))
ovo_clf.fit(X_train, y_train)
ovo_clf.predict([some_digit])
array([ 5.])
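A quick check of the OvO bookkeeping (using the ovo_clf fit above): with N = 10 classes it trains N(N-1)/2 pairwise classifiers.

len(ovo_clf.estimators_)  # 45 = 10 * 9 / 2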
forest_clf.fit(X_train, y_train)
forest_clf.predict([some_digit])
array([ 5.])
forest_clf.predict_proba([some_digit])
array([[ 0. , 0. , 0. , 0. , 0.1, 0.9, 0. , 0. , 0. , 0. ]])
cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring='accuracy')
array([ 0.87037592, 0.88059403, 0.84912737])
# simply scaling the inputs with StandardScaler already boosts accuracy
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))
cross_val_score(sgd_clf, X_train_scaled, y_train, cv=3, scoring='accuracy')
array([ 0.91071786, 0.90684534, 0.91233685])
Error Analysis
- look at the confusion matrix
- plot the errors
- divide each value in the confusion matrix by the number of images in the corresponding class, so you compare error rates instead of absolute counts
- fill the diagonal with zeros to keep only the errors
# 1-1
y_train_pred = cross_val_predict(sgd_clf, X_train_scaled, y_train, cv=3)
conf_mx = confusion_matrix(y_train, y_train_pred)
conf_mx
array([[5729,    2,   23,    8,   11,   50,   49,    9,   40,    2],
       [   1, 6505,   42,   21,    6,   40,    6,   10,  100,   11],
       [  53,   41, 5336,  102,   81,   26,   84,   67,  154,   14],
       [  45,   45,  140, 5359,    6,  220,   36,   49,  134,   97],
       [  16,   30,   38,   10, 5361,   11,   50,   33,   77,  216],
       [  73,   41,   34,  184,   73, 4588,  104,   32,  195,   97],
       [  31,   28,   51,    1,   51,   86, 5613,    8,   49,    0],
       [  22,   21,   70,   30,   55,   12,    5, 5815,   18,  217],
       [  49,  173,   74,  151,   14,  153,   55,   21, 5021,  140],
       [  43,   37,   25,   85,  166,   32,    3,  204,   83, 5271]])
plt.matshow(conf_mx, cmap=plt.cm.gray)
plt.show()
Most images sit on the main diagonal, which means they were classified correctly. The cell for the 5s is slightly darker, which could mean either that there are fewer 5s in the dataset or that the classifier does not perform as well on 5s.
# 2-1
row_sums = conf_mx.sum(axis=1, keepdims=True)
norm_conf_mx = conf_mx / row_sums
# 2-2
np.fill_diagonal(norm_conf_mx, 0)
plt.matshow(norm_conf_mx, cmap=plt.cm.gray)
plt.show()
- rows represent the actual classes, columns the predicted classes
- effort is best spent improving the 8s and 9s and fixing the 3/5 confusion: gather more training data for those digits, or engineer new features
- e.g. a feature that counts the number of closed loops (8 has two, 6 has one, 5 has none); a data-augmentation sketch follows below
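One concrete fix in this spirit (suggested by the chapter's exercises): augment the training set with shifted copies of each image, so the classifier becomes less sensitive to position. A minimal sketch, assuming scipy is installed; shift_digit is a helper name introduced here:

from scipy.ndimage import shift

def shift_digit(image_flat, dx, dy):
    # shift a flattened 28x28 image by (dx, dy) pixels, filling the border with 0
    image = image_flat.reshape(28, 28)
    return shift(image, [dy, dx], cval=0).reshape(784)

# e.g. append a copy of the training set shifted one pixel to the right
X_train_shifted = np.array([shift_digit(img, 1, 0) for img in X_train])
X_train_aug = np.r_[X_train, X_train_shifted]
y_train_aug = np.r_[y_train, y_train]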
cl_a, cl_b = 3, 5
X_aa = X_train[(y_train == cl_a) & (y_train_pred == cl_a)]
X_ab = X_train[(y_train == cl_a) & (y_train_pred == cl_b)]
X_ba = X_train[(y_train == cl_b) & (y_train_pred == cl_a)]
X_bb = X_train[(y_train == cl_b) & (y_train_pred == cl_b)]

plt.figure(figsize=(8, 8))
plt.subplot(221); plot_digits(X_aa[:25], images_per_row=5)
plt.subplot(222); plot_digits(X_ab[:25], images_per_row=5)
plt.subplot(223); plot_digits(X_ba[:25], images_per_row=5)
plt.subplot(224); plot_digits(X_bb[:25], images_per_row=5)
plt.show()
Look at how the top line joins the bottom arc: when the junction is shifted to the left the digit reads as a 5, when shifted to the right it reads as a 3.
Multilabel Classification
from sklearn.neighbors import KNeighborsClassifier  # supports multilabel classification

y_train_large = (y_train >= 7)
y_train_odd = (y_train % 2 == 1)
y_multilabel = np.c_[y_train_large, y_train_odd]

knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_multilabel)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
knn_clf.predict([some_digit])
array([[False, True]], dtype=bool)
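Reading the prediction: some_digit is a 5, so it is not large (5 < 7, hence False) and it is odd (True).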
# evaluate with the F1 score, averaged across the labels
# (note: evaluate against y_multilabel, the multilabel targets, not y_train)
from sklearn.metrics import f1_score
y_train_knn_pred = cross_val_predict(knn_clf, X_train, y_multilabel, cv=3)
f1_score(y_multilabel, y_train_knn_pred, average='macro')
KeyboardInterrupt (the cell was interrupted: this evaluation takes a very long time to run)
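average='macro' gives every label equal weight; if the labels are not equally important (e.g. there are far more odd digits than large ones), average='weighted' instead weights each label's score by its support.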
Multioutput Classification
import numpy.random as rnd

# build a denoising task: input = noisy image, target = the clean image
noise = rnd.randint(0, 100, (len(X_train), 784))
X_train_mod = X_train + noise
y_train_mod = X_train
noise = rnd.randint(0, 100, (len(X_test), 784))
X_test_mod = X_test + noise
y_test_mod = X_test
def plot_digit(data):
    image = data.reshape(28, 28)
    plt.imshow(image, cmap=matplotlib.cm.binary, interpolation="nearest")
    plt.axis("off")

some_index = 5500
plt.subplot(121); plot_digit(X_test_mod[some_index])
plt.subplot(122); plot_digit(y_test_mod[some_index])
plt.show()
knn_clf.fit(X_train_mod, y_train_mod)
clean_digit = knn_clf.predict([X_test_mod[some_index]])
plot_digit(clean_digit)