6.3 数据重塑

重复测量数据的变形(Reshape Grouped Data):`reshape()` 可以将宽格式(wide)的数据框变为长格式(long),反之亦可。`reshape()` 还支持用正则表达式拆分宽格式数据的列名。

# Inspect the structure of the Indometh data set: column types and attributes
str(Indometh)
## Classes 'nfnGroupedData', 'nfGroupedData', 'groupedData' and 'data.frame':   66 obs. of  3 variables:
##  $ Subject: Ord.factor w/ 6 levels "1"<"4"<"2"<"5"<..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ time   : num  0.25 0.5 0.75 1 1.25 2 3 4 5 6 ...
##  $ conc   : num  1.5 0.94 0.78 0.48 0.37 0.19 0.12 0.11 0.08 0.07 ...
##  - attr(*, "formula")=Class 'formula'  language conc ~ time | Subject
##   .. ..- attr(*, ".Environment")=<environment: R_EmptyEnv> 
##  - attr(*, "labels")=List of 2
##   ..$ x: chr "Time since drug administration"
##   ..$ y: chr "Indomethacin concentration"
##  - attr(*, "units")=List of 2
##   ..$ x: chr "(hr)"
##   ..$ y: chr "(mcg/ml)"
# Per-column summary of Indometh (counts per Subject, quantiles for time/conc)
summary(Indometh)
##  Subject      time            conc       
##  1:11    Min.   :0.250   Min.   :0.0500  
##  4:11    1st Qu.:0.750   1st Qu.:0.1100  
##  2:11    Median :2.000   Median :0.3400  
##  5:11    Mean   :2.886   Mean   :0.5918  
##  6:11    3rd Qu.:5.000   3rd Qu.:0.8325  
##  3:11    Max.   :8.000   Max.   :2.7200
# Long to wide: one row per Subject, with a separate conc.<time> column
# for every measurement time
wide <- reshape(
  Indometh,
  direction = "wide",
  idvar = "Subject",
  timevar = "time",
  v.names = "conc"
)
wide[, 1:6]
##    Subject conc.0.25 conc.0.5 conc.0.75 conc.1 conc.1.25
## 1        1      1.50     0.94      0.78   0.48      0.37
## 12       2      2.03     1.63      0.71   0.70      0.64
## 23       3      2.72     1.49      1.16   0.80      0.80
## 34       4      1.85     1.39      1.02   0.89      0.59
## 45       5      2.05     1.04      0.81   0.39      0.30
....
# Wide back to long. Only `direction` is passed here; per ?reshape, a data
# frame created by reshape() keeps the information needed to reverse it.
reshape(wide, direction = "long")
##        Subject time conc
## 1.0.25       1 0.25 1.50
## 2.0.25       2 0.25 2.03
## 3.0.25       3 0.25 2.72
## 4.0.25       4 0.25 1.85
## 5.0.25       5 0.25 2.05
....

更多例子可参见 Stack Overflow 上的讨论:宽格式变成长格式 https://stackoverflow.com/questions/2185252 ,或者长格式变成宽格式 https://stackoverflow.com/questions/5890584/ 。

# Toy long-format data: two fruits, each measured at positions 1 to 4.
# The seed is fixed so that the random `value` column is reproducible.
set.seed(45)
dat <- data.frame(
  name = rep(c("Orange", "Apple"), each = 4),
  numbers = rep(1:4, times = 2),
  value = rnorm(8)
)
dat
##     name numbers      value
## 1 Orange       1  0.3407997
## 2 Orange       2 -0.7033403
## 3 Orange       3 -0.3795377
## 4 Orange       4 -0.7460474
## 5  Apple       1 -0.8981073
## 6  Apple       2 -0.3347941
## 7  Apple       3 -0.5013782
## 8  Apple       4 -0.1745357
# Long to wide: one row per `name`, one value.<numbers> column per position
reshape(
  dat,
  direction = "wide",
  idvar = "name",
  timevar = "numbers"
)
##     name    value.1    value.2    value.3    value.4
## 1 Orange  0.3407997 -0.7033403 -0.3795377 -0.7460474
## 5  Apple -0.8981073 -0.3347941 -0.5013782 -0.1745357
## times need not be numeric
# `x` and `y` have length 4 and are recycled to fill the 8 rows, so rows 5-8
# repeat the values of rows 1-4. I() marks `visit` to be stored as-is.
df <- data.frame(
  id = rep(1:4, each = 2),
  visit = I(rep(c("Before", "After"), times = 4)),
  x = rnorm(4),
  y = runif(4)
)
df
##   id  visit          x          y
## 1  1 Before  1.8090374 0.89106978
## 2  1  After -0.2301050 0.06920426
## 3  2 Before -1.1304182 0.94623103
## 4  2  After  0.2159889 0.74850150
## 5  3 Before  1.8090374 0.89106978
## 6  3  After -0.2301050 0.06920426
## 7  4 Before -1.1304182 0.94623103
## 8  4  After  0.2159889 0.74850150
# Long to wide with character times: columns become x.Before, y.Before, ...
reshape(
  df,
  direction = "wide",
  idvar = "id",
  timevar = "visit"
)
##   id  x.Before  y.Before    x.After    y.After
## 1  1  1.809037 0.8910698 -0.2301050 0.06920426
## 3  2 -1.130418 0.9462310  0.2159889 0.74850150
## 5  3  1.809037 0.8910698 -0.2301050 0.06920426
## 7  4 -1.130418 0.9462310  0.2159889 0.74850150
## warns that y is really varying
# Only `x` is declared time-varying, so `y` is treated as constant within id.
reshape(
  df,
  direction = "wide",
  idvar = "id",
  timevar = "visit",
  v.names = "x"
)
## Warning in reshapeWide(data, idvar = idvar, timevar = timevar, varying =
## varying, : some constant variables (y) are really varying
##   id         y  x.Before    x.After
## 1  1 0.8910698  1.809037 -0.2301050
## 3  2 0.9462310 -1.130418  0.2159889
## 5  3 0.8910698  1.809037 -0.2301050
## 7  4 0.9462310 -1.130418  0.2159889

一个更加复杂的例子:gambia 数据集。重塑的效果是把个体水平的长格式数据变为村庄水平的宽格式数据。

# data(gambia, package = "geoR")
# Download the data set from the web instead of loading it from geoR
gambia_url <- paste(
  "http://www.leg.ufpr.br/lib/exe/fetch.php",
  "pessoais:paulojus:mbgbook:datasets:gambia.txt",
  sep = "/"
)
gambia <- read.table(file = gambia_url, header = TRUE)
head(gambia)
# Building a "village-level" data frame
# One key per village, built from its coordinates (columns 1 and 2).
ind <- paste0("x", gambia[, 1], "y", gambia[, 2])
# Keep the first row of every village, with the coordinate columns and
# columns 7:8.
first_row <- !duplicated(ind)
village <- gambia[first_row, c(1:2, 7:8)]
# tapply() returns the means ordered by the sorted keys, whereas the rows of
# `village` are in order of first appearance. Look the means up by key so each
# value lands on its own village regardless of how `gambia` is sorted.
prev_by_village <- tapply(gambia$pos, ind, mean)
village$prev <- as.vector(prev_by_village[ind[first_row]])
head(village)