首页 > 代码库 > R语言进行数据预处理
R语言进行数据预处理
R语言进行数据预处理wranging
li_volleyball
2016年3月22日
data wrangling with R
packages:tidyr dplyr
Ground rules
library(tidyr)
library(dplyr)
##
## Attaching package: ‘dplyr‘
## The following objects are masked from ‘package:stats‘:
##
## filter, lag
## The following objects are masked from ‘package:base‘:
##
## intersect, setdiff, setequal, union
View(iris)
View(mtcars)
select(mtcars,am:1)
## am vs qsec wt drat hp disp cyl mpg
## Mazda RX4 1 0 16.46 2.620 3.90 110 160.0 6 21.0
## Mazda RX4 Wag 1 0 17.02 2.875 3.90 110 160.0 6 21.0
## Datsun 710 1 1 18.61 2.320 3.85 93 108.0 4 22.8
## Hornet 4 Drive 0 1 19.44 3.215 3.08 110 258.0 6 21.4
## Hornet Sportabout 0 0 17.02 3.440 3.15 175 360.0 8 18.7
## Valiant 0 1 20.22 3.460 2.76 105 225.0 6 18.1
## Duster 360 0 0 15.84 3.570 3.21 245 360.0 8 14.3
## Merc 240D 0 1 20.00 3.190 3.69 62 146.7 4 24.4
## Merc 230 0 1 22.90 3.150 3.92 95 140.8 4 22.8
## Merc 280 0 1 18.30 3.440 3.92 123 167.6 6 19.2
## Merc 280C 0 1 18.90 3.440 3.92 123 167.6 6 17.8
## Merc 450SE 0 0 17.40 4.070 3.07 180 275.8 8 16.4
## Merc 450SL 0 0 17.60 3.730 3.07 180 275.8 8 17.3
## Merc 450SLC 0 0 18.00 3.780 3.07 180 275.8 8 15.2
## Cadillac Fleetwood 0 0 17.98 5.250 2.93 205 472.0 8 10.4
## Lincoln Continental 0 0 17.82 5.424 3.00 215 460.0 8 10.4
## Chrysler Imperial 0 0 17.42 5.345 3.23 230 440.0 8 14.7
## Fiat 128 1 1 19.47 2.200 4.08 66 78.7 4 32.4
## Honda Civic 1 1 18.52 1.615 4.93 52 75.7 4 30.4
## Toyota Corolla 1 1 19.90 1.835 4.22 65 71.1 4 33.9
## Toyota Corona 0 1 20.01 2.465 3.70 97 120.1 4 21.5
## Dodge Challenger 0 0 16.87 3.520 2.76 150 318.0 8 15.5
## AMC Javelin 0 0 17.30 3.435 3.15 150 304.0 8 15.2
## Camaro Z28 0 0 15.41 3.840 3.73 245 350.0 8 13.3
## Pontiac Firebird 0 0 17.05 3.845 3.08 175 400.0 8 19.2
## Fiat X1-9 1 1 18.90 1.935 4.08 66 79.0 4 27.3
## Porsche 914-2 1 0 16.70 2.140 4.43 91 120.3 4 26.0
## Lotus Europa 1 1 16.90 1.513 3.77 113 95.1 4 30.4
## Ford Pantera L 1 0 14.50 3.170 4.22 264 351.0 8 15.8
## Ferrari Dino 1 0 15.50 2.770 3.62 175 145.0 6 19.7
## Maserati Bora 1 0 14.60 3.570 3.54 335 301.0 8 15.0
## Volvo 142E 1 1 18.60 2.780 4.11 109 121.0 4 21.4
mtcars %>% select(am:1)
## am vs qsec wt drat hp disp cyl mpg
## Mazda RX4 1 0 16.46 2.620 3.90 110 160.0 6 21.0
## Mazda RX4 Wag 1 0 17.02 2.875 3.90 110 160.0 6 21.0
## Datsun 710 1 1 18.61 2.320 3.85 93 108.0 4 22.8
## Hornet 4 Drive 0 1 19.44 3.215 3.08 110 258.0 6 21.4
## Hornet Sportabout 0 0 17.02 3.440 3.15 175 360.0 8 18.7
## Valiant 0 1 20.22 3.460 2.76 105 225.0 6 18.1
## Duster 360 0 0 15.84 3.570 3.21 245 360.0 8 14.3
## Merc 240D 0 1 20.00 3.190 3.69 62 146.7 4 24.4
## Merc 230 0 1 22.90 3.150 3.92 95 140.8 4 22.8
## Merc 280 0 1 18.30 3.440 3.92 123 167.6 6 19.2
## Merc 280C 0 1 18.90 3.440 3.92 123 167.6 6 17.8
## Merc 450SE 0 0 17.40 4.070 3.07 180 275.8 8 16.4
## Merc 450SL 0 0 17.60 3.730 3.07 180 275.8 8 17.3
## Merc 450SLC 0 0 18.00 3.780 3.07 180 275.8 8 15.2
## Cadillac Fleetwood 0 0 17.98 5.250 2.93 205 472.0 8 10.4
## Lincoln Continental 0 0 17.82 5.424 3.00 215 460.0 8 10.4
## Chrysler Imperial 0 0 17.42 5.345 3.23 230 440.0 8 14.7
## Fiat 128 1 1 19.47 2.200 4.08 66 78.7 4 32.4
## Honda Civic 1 1 18.52 1.615 4.93 52 75.7 4 30.4
## Toyota Corolla 1 1 19.90 1.835 4.22 65 71.1 4 33.9
## Toyota Corona 0 1 20.01 2.465 3.70 97 120.1 4 21.5
## Dodge Challenger 0 0 16.87 3.520 2.76 150 318.0 8 15.5
## AMC Javelin 0 0 17.30 3.435 3.15 150 304.0 8 15.2
## Camaro Z28 0 0 15.41 3.840 3.73 245 350.0 8 13.3
## Pontiac Firebird 0 0 17.05 3.845 3.08 175 400.0 8 19.2
## Fiat X1-9 1 1 18.90 1.935 4.08 66 79.0 4 27.3
## Porsche 914-2 1 0 16.70 2.140 4.43 91 120.3 4 26.0
## Lotus Europa 1 1 16.90 1.513 3.77 113 95.1 4 30.4
## Ford Pantera L 1 0 14.50 3.170 4.22 264 351.0 8 15.8
## Ferrari Dino 1 0 15.50 2.770 3.62 175 145.0 6 19.7
## Maserati Bora 1 0 14.60 3.570 3.54 335 301.0 8 15.0
## Volvo 142E 1 1 18.60 2.780 4.11 109 121.0 4 21.4
example1<-data.frame(A=c(paste("x",1:6,sep = "")),
B=seq(1,11,2),
c=1:6,
date=c("2000-08-15","1998-07-15","1995-06-04","1997-07-01","1999-06-01","1996-06-25"))
example1
## A B c date
## 1 x1 1 1 2000-08-15
## 2 x2 3 2 1998-07-15
## 3 x3 5 3 1995-06-04
## 4 x4 7 4 1997-07-01
## 5 x5 9 5 1999-06-01
## 6 x6 11 6 1996-06-25
# 一个变量一列
# 一个观测值一行
#每一种观测在一个表里
#separate()
separate(example1,date,c("Y","m","d"),sep="-")
## A B c Y m d
## 1 x1 1 1 2000 08 15
## 2 x2 3 2 1998 07 15
## 3 x3 5 3 1995 06 04
## 4 x4 7 4 1997 07 01
## 5 x5 9 5 1999 06 01
## 6 x6 11 6 1996 06 25
example12<-example1 %>% separate(date,c("Y","m","d"),sep="-")
#unite()
unite(example12,"YM",Y,m,sep="-")
## A B c YM d
## 1 x1 1 1 2000-08 15
## 2 x2 3 2 1998-07 15
## 3 x3 5 3 1995-06 04
## 4 x4 7 4 1997-07 01
## 5 x5 9 5 1999-06 01
## 6 x6 11 6 1996-06 25
#select()
select(example1,A,B)
## A B
## 1 x1 1
## 2 x2 3
## 3 x3 5
## 4 x4 7
## 5 x5 9
## 6 x6 11
select(example1,-A)
## B c date
## 1 1 1 2000-08-15
## 2 3 2 1998-07-15
## 3 5 3 1995-06-04
## 4 7 4 1997-07-01
## 5 9 5 1999-06-01
## 6 11 6 1996-06-25
select(example1,B:date)
## B c date
## 1 1 1 2000-08-15
## 2 3 2 1998-07-15
## 3 5 3 1995-06-04
## 4 7 4 1997-07-01
## 5 9 5 1999-06-01
## 6 11 6 1996-06-25
select(example1,starts_with("d"))
## date
## 1 2000-08-15
## 2 1998-07-15
## 3 1995-06-04
## 4 1997-07-01
## 5 1999-06-01
## 6 1996-06-25
select(example1,ends_with("e"))
## date
## 1 2000-08-15
## 2 1998-07-15
## 3 1995-06-04
## 4 1997-07-01
## 5 1999-06-01
## 6 1996-06-25
select(example1,contains("a"))
## A date
## 1 x1 2000-08-15
## 2 x2 1998-07-15
## 3 x3 1995-06-04
## 4 x4 1997-07-01
## 5 x5 1999-06-01
## 6 x6 1996-06-25
#filter()
filter(example1,B>=6)
## A B c date
## 1 x4 7 4 1997-07-01
## 2 x5 9 5 1999-06-01
## 3 x6 11 6 1996-06-25
filter(example1,B>=6,A%in%c("x1","x4","x5"))
## A B c date
## 1 x4 7 4 1997-07-01
## 2 x5 9 5 1999-06-01
#mutate()
mutate(example1,ratio=B/c)
## A B c date ratio
## 1 x1 1 1 2000-08-15 1.000000
## 2 x2 3 2 1998-07-15 1.500000
## 3 x3 5 3 1995-06-04 1.666667
## 4 x4 7 4 1997-07-01 1.750000
## 5 x5 9 5 1999-06-01 1.800000
## 6 x6 11 6 1996-06-25 1.833333
mutate(example1,ratio=B/c,inverse=ratio-1)
## A B c date ratio inverse
## 1 x1 1 1 2000-08-15 1.000000 0.0000000
## 2 x2 3 2 1998-07-15 1.500000 0.5000000
## 3 x3 5 3 1995-06-04 1.666667 0.6666667
## 4 x4 7 4 1997-07-01 1.750000 0.7500000
## 5 x5 9 5 1999-06-01 1.800000 0.8000000
## 6 x6 11 6 1996-06-25 1.833333 0.8333333
mutate(example1,cumsum(B))
## A B c date cumsum(B)
## 1 x1 1 1 2000-08-15 1
## 2 x2 3 2 1998-07-15 4
## 3 x3 5 3 1995-06-04 9
## 4 x4 7 4 1997-07-01 16
## 5 x5 9 5 1999-06-01 25
## 6 x6 11 6 1996-06-25 36
mutate(example1,cumsum(B),cummean(B),cumany(B>6),cumall(B>6))
## A B c date cumsum(B) cummean(B) cumany(B > 6) cumall(B > 6)
## 1 x1 1 1 2000-08-15 1 1 FALSE FALSE
## 2 x2 3 2 1998-07-15 4 2 FALSE FALSE
## 3 x3 5 3 1995-06-04 9 3 FALSE FALSE
## 4 x4 7 4 1997-07-01 16 4 TRUE FALSE
## 5 x5 9 5 1999-06-01 25 5 TRUE FALSE
## 6 x6 11 6 1996-06-25 36 6 TRUE FALSE
mutate(example1,cummin(B),cummax(B))
## A B c date cummin(B) cummax(B)
## 1 x1 1 1 2000-08-15 1 1
## 2 x2 3 2 1998-07-15 1 3
## 3 x3 5 3 1995-06-04 1 5
## 4 x4 7 4 1997-07-01 1 7
## 5 x5 9 5 1999-06-01 1 9
## 6 x6 11 6 1996-06-25 1 11
mutate(example1,between(B,4,8))
## A B c date between(B, 4, 8)
## 1 x1 1 1 2000-08-15 FALSE
## 2 x2 3 2 1998-07-15 FALSE
## 3 x3 5 3 1995-06-04 TRUE
## 4 x4 7 4 1997-07-01 TRUE
## 5 x5 9 5 1999-06-01 FALSE
## 6 x6 11 6 1996-06-25 FALSE
mutate(example1,cume_dist(B))
## A B c date cume_dist(B)
## 1 x1 1 1 2000-08-15 0.1666667
## 2 x2 3 2 1998-07-15 0.3333333
## 3 x3 5 3 1995-06-04 0.5000000
## 4 x4 7 4 1997-07-01 0.6666667
## 5 x5 9 5 1999-06-01 0.8333333