首页 > 代码库 > ROC检验分类树性能
ROC检验分类树性能
001 #############################################################002 ############# 读取excel文件 ######################003 #############################################################004 #root<-"C:/Users/liming/Desktop/写书/chap7/cbc/"005 root<-"E:/work/写书/chap7/cbc(可以使用-但是未写书书中)/"006 file<-paste(root,"data.xls",sep="")007 library (RODBC)008 excel_file <- odbcConnectExcel(file)009 data<- sqlFetch ( excel_file,"data")[,1:17]#由于excel文件问题010 close ( excel_file )011 ##############################################################012 ####数据处理013 ##############################################################014 data$chineseCook<-as.factor(ifelse(data$chineseCook==0,0,1))#如果购买Cook书则为1,没有购买则为0015 data$chineseHist<-as.factor(ifelse(data$chineseHist==0,0,1))#如果购买Atlas书则为1,没有购买则为0016 data$chineseArt<-as.factor(ifelse(data$chineseArt==0,0,1))#如果购买Art书则为1,没有购买则为0017 data$Gender<-as.factor(data$Gender)#性别018 data$HanDynastyArt<-as.factor(data$HanDynastyArt)#目标变量019 ##############################################################020 ####对不平衡目标样本的样本处理021 ##############################################################022 balance<-function(data,yval){023 y.vector<-with(data,get(yval))#在data数据框中读取列名称为yval参数的向量,例如如果yval设置为HanDynastyArt则y.vector就是data$HanDynastyArt024 index.1<-which(y.vector==1)025 index.0<-which(y.vector==0)026 index.1<-sample(index.1,length(index.0),replace=T)027 result<-data[sample(c(index.0,index.1)),]028 result029 }030 data<-balance(data,"HanDynastyArt")031 ##############################################################032 ####分割训练集数据和测试数据033 ##############################################################034 apart.data<-function(data,train.data.persent=0.7){035 train.index<-sample(c(1:nrow(data)),round(nrow(data)*train.data.persent))036 data.train<-data[train.index,]037 data.test<-data[-c(train.index),]038 result<-list(train=data.train,test=data.test)039 result040 }041 p.data<-apart.data(data)042 data.train<-p.data$train043 data.test<-p.data$test044 ###############################################################045 mod.formula<-as.formula("HanDynastyArt~Gender+M+R+F+FirstPurch+ChildBks+YouthBks+CookBks+DoItYBks+RefBks+ArtBks+GeogBks+chineseCook+chineseHist+chineseArt")046 #### party ####047 library("party")048 ctree.sol<-ctree(mod.formula,data=http://www.mamicode.com/data.train,control= ctree_control(mincriterion =0.95))049 #plot(ctree.sol)050 #### rpart ####051 library("rpart")052 rpart.sol<-rpart(mod.formula,data=http://www.mamicode.com/data.train,control=list(cp=0.001))053 #plot(rpart.sol,uniform=TRUE,compress=TRUE,lty=3,branch=0.7)054 #text(rpart.sol,all=TRUE,digits=7,use.n=TRUE,cex=0.9,xpd=TRUE)055 #### logit ####056 glm.sol<-glm(mod.formula,data=http://www.mamicode.com/data.train,family=binomial("logit"))057 #### nnet ####058 library(nnet)059 nnet.sol<-nnet(mod.formula,data=http://www.mamicode.com/data.train,size=30,maxit=1000)#size越高 树越大060 #### svm ####061 library(e1071)062 svm.sol<-svm(mod.formula,data=http://www.mamicode.com/data.train,probability = TRUE)#probability为T表示计算预测数值为取1和0的概率,063 ##############################################################064 ####性能检验065 ##############################################################066 library(ROCR)067 sol.performance<-function(sol,test,add.logic=F,color=NA){068 ########使用prediction函数初始化数据########069 test.real<-as.numeric(ifelse(test$HanDynastyArt==0,0,1))#把目标变量的factor0变为numeric0。factor1变为numeric1。070 sol.class<-class(sol)[1]071 if(sol.class=="BinaryTree"){#ctree函数072 test.pred<-as.numeric(predict(sol,test))073 test.pred<-ifelse(test.pred==1,0,1)#把目标变量的1变为0;2变为1074 }else{if(sol.class=="rpart"){#rpart函数075 test.pred<-predict(sol,test)[,2]076 }else{if(sol.class=="glm"){#glm函数077 glm.pred<-predict(sol,test)078 test.pred<-1/(1+exp(-glm.pred))079 }else{if(sol.class=="nnet.formula"){#nnet函数080 test.pred<-predict(sol,test)081 }else{if(sol.class=="svm.formula"){#svm函数082 test.pred<-attr(predict(sol,test,probability=T),"probabilities")[,2]083 }else{084 print("ERROR:sol输入有误!")085 return()086 }}}}}087 predictions<-prediction(test.pred,test.real) 088 ########计算混淆矩阵并使用performance函数计算灵敏度和auc########089 if(sol.class=="BinaryTree"){#ctree函数090 print("混淆矩阵:")091 print(table(test.pred,test.real,dnn=c("预测数值","真实数值")))092 sens<-performance(predictions,‘sens‘)@y.values[[1]][2]093 print(paste("灵敏度(Sensitivity):",sens,sep=""))094 spec<-performance(predictions,‘spec‘)@y.values[[1]][2]095 print(paste("特指度(Specicity):",spec,sep=""))096 }else{if(sol.class=="rpart"){#rpart函数097 print("混淆矩阵:")098 tmp<-ifelse(as.numeric(predict(sol,test,type="class"))==1,0,1)#把目标变量的1变为0;2变为1099 print(table(tmp,test.real,dnn=c("预测数值","真实数值")))100 auc<-performance(predictions,‘auc‘)@y.values101 print(paste("ROC曲线下的面积(auc):",auc,sep=""))102 }else{if(sol.class=="glm"){#glm函数103 print("混淆矩阵:")104 tmp<-ifelse(test.pred>0.5,1,0)105 print(table(tmp,test.real,dnn=c("预测数值","真实数值")))106 auc<-performance(predictions,‘auc‘)@y.values107 print(paste("ROC曲线下的面积(auc):",auc,sep=""))108 }else{if(sol.class=="nnet.formula"){#nnet函数109 print("混淆矩阵:")110 tmp<-as.numeric(predict(sol,test,type="class"))111 print(table(tmp,test.real,dnn=c("预测数值","真实数值")))112 auc<-performance(predictions,‘auc‘)@y.values113 print(paste("ROC曲线下的面积(auc):",auc,sep=""))114 }else{if(sol.class=="svm.formula"){#svm函数115 print("混淆矩阵:")116 tmp<-ifelse(as.numeric(predict(sol,test,type="class"))==1,0,1)#把目标变量的1变为0;2变为1117 print(table(tmp,test.real,dnn=c("预测数值","真实数值")))118 auc<-performance(predictions,‘auc‘)@y.values119 print(paste("ROC曲线下的面积(auc):",auc,sep=""))120 }else{121 print("ERROR:sol输入有误!")122 return()123 }}}}}124 ########绘制ROC曲线####125 #如果predict返回的是0/1(tree模型)则roc是一个点的折线,如果是0-1的概率连续值(logic nnet等)则roc是多个点组成的曲线126 plot(performance(predictions,‘tpr‘,‘fpr‘),colorize=T,main="ROC图",ylab="真正率(TPR)=灵敏度(Sensitivity)",xlab="假正率(FPR)=1-特指度(1-Specicity)",add=add.logic,colorize.palette=color)127 128 }129 col<-rainbow(5,start=0,end=4/6)130 sol.performance(ctree.sol,data.test,F,col[1])131 sol.performance(rpart.sol,data.test,T,col[2])132 sol.performance(glm.sol,data.test,T,col[3])133 sol.performance(nnet.sol,data.test,T,col[4])134 sol.performance(svm.sol,data.test,T,col[5])135 id=c("ctree模型","rpart模型","glm模型","nnet模型","svm模型")136 legend("bottomright",legend=id,horiz=T,pch=15,col=col,cex=0.8,bty="n")
数据源下载链接为http://vdisk.weibo.com/s/sFZzV0wnhRPN;
该数据集是某图书出版社研究用户是否会购买新书而做的调查问卷结果,其中的HanDynastyArt是目标变量。
输出的结果为,表示不同的模型下roc曲线:
声明:以上内容来自用户投稿及互联网公开渠道收集整理发布,本网站不拥有所有权,未作人工编辑处理,也不承担相关法律责任,若内容有误或涉及侵权可进行投诉: 投诉/举报 工作人员会在5个工作日内联系你,一经查实,本站将立刻删除涉嫌侵权内容。