首页 > 代码库 > 用R语言将数据转换成Format Vowpal Wabbit Input Files

用R语言将数据转换成Format Vowpal Wabbit Input Files

相关文章:https://github.com/JohnLangford/vowpal_wabbit/wiki

                    https://github.com/JohnLangford/vowpal_wabbit/wiki/Input-format

# Download the Titanic data set (tab-separated text) into a data frame.
titanicDF <- read.csv(
  "http://math.ucdenver.edu/RTutorial/titanic.txt",
  sep = "\t"
)
# Optionally keep a local copy of the raw data:
# write.table(titanicDF, "titanicDF.txt", row.names = FALSE)
# write.csv(titanicDF, "titanicDF.csv", row.names = FALSE)
# Derive a new variable Title from the passenger name. The patterns are tested
# in order ("Mr ", then "Mrs ", then "Miss"); names matching none of them get
# the value "Nothing".
titanicDF$Title <- ifelse(
  grepl("Mr ", titanicDF$Name), "Mr",
  ifelse(
    grepl("Mrs ", titanicDF$Name), "Mrs",
    ifelse(grepl("Miss", titanicDF$Name), "Miss", "Nothing")
  )
)
# Convert Title to a factor.
titanicDF$Title <- as.factor(titanicDF$Title)
head(titanicDF)
##                                            Name PClass   Age    Sex
## 1                  Allen, Miss Elisabeth Walton    1st 29.00 female
## 2                   Allison, Miss Helen Loraine    1st  2.00 female
## 3           Allison, Mr Hudson Joshua Creighton    1st 30.00   male
## 4 Allison, Mrs Hudson JC (Bessie Waldo Daniels)    1st 25.00 female
## 5                 Allison, Master Hudson Trevor    1st  0.92   male
## 6                            Anderson, Mr Harry    1st 47.00   male
##   Survived   Title
## 1        1    Miss
## 2        0    Miss
## 3        0      Mr
## 4        0     Mrs
## 5        1 Nothing
## 6        1      Mr
str(titanicDF)
## 'data.frame':    1313 obs. of  6 variables:
##  $ Name    : Factor w/ 1310 levels "Abbing, Mr Anthony",..: 22 25 26 27 24 31 45 46 50 54 ...
##  $ PClass  : Factor w/ 3 levels "1st","2nd","3rd": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Age     : num  29 2 30 25 0.92 47 63 39 58 71 ...
##  $ Sex     : Factor w/ 2 levels "female","male": 1 1 2 1 2 2 1 2 1 2 ...
##  $ Survived: int  1 0 0 0 1 1 1 0 1 0 ...
##  $ Title   : Factor w/ 4 levels "Miss","Mr","Mrs",..: 1 1 2 3 4 2 1 2 3 2 ...
# Inspect how much data is missing in titanicDF
library(dfexplore)
## Loading required package: ggplot2
dfplot(titanicDF)
技术分享



sum(is.na(titanicDF$Age)) # Age is missing in 557 records
## [1] 557
# Fill the missing Age values with the median of the observed ages
titanicDF$Age[is.na(titanicDF$Age)] <- median(titanicDF$Age, na.rm = TRUE)
# Check titanicDF again for any remaining missing values
dfplot(titanicDF)

技术分享


# Keep the modelling columns only, ordered so the target "Survived" comes last
titanicDF <- titanicDF[
  ,
  c("PClass", "Age", "Sex", "Title", "Survived"),
  drop = FALSE
]
head(titanicDF)
##   PClass   Age    Sex   Title Survived
## 1    1st 29.00 female    Miss        1
## 2    1st  2.00 female    Miss        0
## 3    1st 30.00   male      Mr        0
## 4    1st 25.00 female     Mrs        0
## 5    1st  0.92   male Nothing        1
## 6    1st 47.00   male      Mr        1
# Binarize all factors: expand every factor column into dummy (0/1) variables
library(caret)
## Loading required package: lattice
titanicDummy <- dummyVars(~ ., data = titanicDF, fullRank = FALSE)
titanicDF <- as.data.frame(predict(titanicDummy, newdata = titanicDF))
head(titanicDF)
##   PClass.1st PClass.2nd PClass.3rd   Age Sex.female Sex.male Title.Miss
## 1          1          0          0 29.00          1        0          1
## 2          1          0          0  2.00          1        0          1
## 3          1          0          0 30.00          0        1          0
## 4          1          0          0 25.00          1        0          0
## 5          1          0          0  0.92          0        1          0
## 6          1          0          0 47.00          0        1          0
##   Title.Mr Title.Mrs Title.Nothing Survived
## 1        0         0             0        1
## 2        0         0             0        0
## 3        1         0             0        0
## 4        0         1             0        0
## 5        0         0             1        1
## 6        1         0             0        1
https://github.com/JohnLangford/vowpal_wabbit/wiki/Input-format


[Label] [Importance [Tag]]|Namespace Features |Namespace Features … |Namespace Features


# Settings for turning titanicDF into Vowpal Wabbit input lines.
isclassification <- TRUE
outcomeName <- "Survived"
labelName <- titanicDF[, "Survived"]
weightName <- "" # name of an importance-weight column; "" means no weights
objDF <- titanicDF
# Every column that is neither the outcome nor the weight is a feature.
predictors <- names(objDF)[!names(objDF) %in% c(outcomeName, weightName)]
# LABELS & IMPORTANCE
if (is.null(labelName)) {
  # No label available: fall back to a placeholder label column.
  outcomeName <- "ignoreme"
  objDF[, outcomeName] <- "0 |"
} else {
  if (isclassification) {
    # everything should be -1 1 for classification
    objDF[, outcomeName] <- ifelse(objDF[, outcomeName] > 0, 1, -1)
  }
  if (weightName != "") {
    # "label weight |"
    objDF[, outcomeName] <- paste(
      objDF[, outcomeName], objDF[, weightName], "|"
    )
  } else {
    # "label |"
    objDF[, outcomeName] <- paste(objDF[, outcomeName], "|")
  }
}
# Pairing column names with data.. adding 1 blank character before each variable
# A value of 1 is written as the bare feature name, a value of 0 is left out,
# and any other value is written as "name:value".
for (i in predictors) {
  objDF[, i] <- ifelse(
    objDF[, i] == 1, paste0(" ", i),
    ifelse(objDF[, i] == 0, "", paste0(" ", i, ":", objDF[, i]))
  )
}


# Reorder columns: the label column first, followed by all feature columns
objDF <- objDF[, c(outcomeName, predictors), drop = FALSE]
head(objDF)
##   Survived  PClass.1st PClass.2nd PClass.3rd       Age  Sex.female
## 1      1 |  PClass.1st                          Age:29  Sex.female
## 2     -1 |  PClass.1st                           Age:2  Sex.female
## 3     -1 |  PClass.1st                          Age:30            
## 4     -1 |  PClass.1st                          Age:25  Sex.female
## 5      1 |  PClass.1st                        Age:0.92            
## 6      1 |  PClass.1st                          Age:47            
##    Sex.male  Title.Miss  Title.Mr  Title.Mrs  Title.Nothing
## 1            Title.Miss                                    
## 2            Title.Miss                                    
## 3  Sex.male              Title.Mr                          
## 4                                  Title.Mrs               
## 5  Sex.male                                   Title.Nothing
## 6  Sex.male              Title.Mr
# Write one example per row to vw.txt: columns are concatenated without a
# separator, and quoting, row names and the header line are all suppressed.
write.table(
  objDF, "vw.txt",
  sep = "", quote = FALSE, row.names = FALSE, col.names = FALSE
)

用R语言将数据转换成Format Vowpal Wabbit Input Files