3.1 Importing Data

Base R provides a large number of data import and export functions covering many file formats, which is fitting for statistical software that has focused on data analysis for more than twenty years. Apart from write.ftable and read.ftable, which come from the stats package, they all live in the base and utils packages.

# search path of the current session
searchpaths()
## [1] ".GlobalEnv"                          
## [2] "/opt/R/4.2.3/lib/R/library/stats"    
## [3] "/opt/R/4.2.3/lib/R/library/graphics" 
## [4] "/opt/R/4.2.3/lib/R/library/grDevices"
## [5] "/opt/R/4.2.3/lib/R/library/utils"    
## [6] "/opt/R/4.2.3/lib/R/library/datasets" 
## [7] "/opt/R/4.2.3/lib/R/library/methods"  
## [8] "Autoloads"                           
## [9] "/opt/R/4.2.3/lib/R/library/base"
# return the matches together with the index of the search path entry they come from
apropos("^(read|write)", where = TRUE, mode = "function")
##                  5                  5                  9                  5 
##         "read.csv"        "read.csv2"         "read.dcf"       "read.delim" 
##                  5                  5                  5                  2 
##      "read.delim2"         "read.DIF"     "read.fortran"      "read.ftable" 
##                  5                  5                  5                  9 
##         "read.fwf"      "read.socket"       "read.table"          "readBin" 
##                  9                  5                  9                  9 
##         "readChar" "readCitationFile"         "readline"        "readLines" 
##                  9                  9                  9                  5 
##          "readRDS"     "readRenviron"            "write"        "write.csv" 
##                  5                  9                  2                  5 
##       "write.csv2"        "write.dcf"     "write.ftable"     "write.socket" 
##                  5                  9                  9                  9 
##      "write.table"         "writeBin"        "writeChar"       "writeLines"

3.1.1 scan

scan(file = "", what = double(), nmax = -1, n = -1, sep = "",
     quote = if(identical(sep, "\n")) "" else "'\"", dec = ".",
     skip = 0, nlines = 0, na.strings = "NA",
     flush = FALSE, fill = FALSE, strip.white = FALSE,
     quiet = FALSE, blank.lines.skip = TRUE, multi.line = TRUE,
     comment.char = "", allowEscapes = FALSE,
     fileEncoding = "", encoding = "unknown", text, skipNul = FALSE)

First, let us create a practice data set ex.data with the cat function.

cat("TITLE extra line", "2 3 5 7", "11 13 17")
## TITLE extra line 2 3 5 7 11 13 17
cat("TITLE extra line", "2 3 5 7", "11 13 17", file = "data/ex.data", sep = "\n")

Using this practice data set, we introduce the most commonly used arguments of scan.

scan("data/ex.data")
## Error in scan("data/ex.data"): scan() expected 'a real', got 'TITLE'

The error message above shows that scan reads data of a single type at a time: logical, integer, numeric (double), complex, character, raw, or list. So we set skip = 1 to skip the first line, and the data are read successfully.

scan("data/ex.data", skip = 1)
## [1]  2  3  5  7 11 13 17
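
Alternatively, instead of skipping the first line, we can ask scan to treat every field as character data through the what argument; a small sketch:

scan("data/ex.data", what = character(), quiet = TRUE)
# returns all whitespace-separated fields as strings,
# including "TITLE", "extra" and "line"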

If you set quiet = TRUE, scan does not report how many items were read.

scan("data/ex.data", skip = 1, quiet = TRUE)
## [1]  2  3  5  7 11 13 17

The argument nlines = 1 means that only one line of data is read.

scan("data/ex.data", skip = 1, nlines = 1) # only 1 line after the skipped one
## [1] 2 3 5 7

Setting flush = TRUE (the default is FALSE) tells scan to flush to the end of the line after reading the last requested field, so any remaining fields on that line are discarded. Compare the two results below.

scan("data/ex.data", what = list("", "", "")) # flush is F -> read "7"
## Warning in scan("data/ex.data", what = list("", "", "")): number of items read
## is not a multiple of the number of columns
## [[1]]
## [1] "TITLE" "2"     "7"     "17"   
## 
## [[2]]
## [1] "extra" "3"     "11"    ""     
## 
## [[3]]
## [1] "line" "5"    "13"   ""
scan("data/ex.data", what = list("", "", ""), flush = TRUE)
## [[1]]
## [1] "TITLE" "2"     "11"   
## 
## [[2]]
## [1] "extra" "3"     "13"   
## 
## [[3]]
## [1] "line" "5"    "17"

Now that the temporary file ex.data has served its purpose, we call unlink to delete it so that no junk files are left behind.

unlink("data/ex.data") # tidy up

3.1.2 read.table

read.table(file,
  header = FALSE, sep = "", quote = "\"'",
  dec = ".", numerals = c("allow.loss", "warn.loss", "no.loss"),
  row.names, col.names, as.is = !stringsAsFactors,
  na.strings = "NA", colClasses = NA, nrows = -1,
  skip = 0, check.names = TRUE, fill = !blank.lines.skip,
  strip.white = FALSE, blank.lines.skip = TRUE,
  comment.char = "#",
  allowEscapes = FALSE, flush = FALSE,
  stringsAsFactors = default.stringsAsFactors(),
  fileEncoding = "", encoding = "unknown", text, skipNul = FALSE
)

read.csv(file,
  header = TRUE, sep = ",", quote = "\"",
  dec = ".", fill = TRUE, comment.char = "", ...
)

read.csv2(file,
  header = TRUE, sep = ";", quote = "\"",
  dec = ",", fill = TRUE, comment.char = "", ...
)

read.delim(file,
  header = TRUE, sep = "\t", quote = "\"",
  dec = ".", fill = TRUE, comment.char = "", ...
)

read.delim2(file,
  header = TRUE, sep = "\t", quote = "\"",
  dec = ",", fill = TRUE, comment.char = "", ...
)

Variable names are not allowed to start with an underscore, and likewise column names in a data frame should not start with one. By default, read.table checks the validity of column names through the check.names argument, which in turn calls make.names. If you want to keep the data set as close to its original form as possible, set check.names = FALSE and stringsAsFactors = FALSE.

Before R 4.0.0, read.table also converted strings to factor variables by default (since R 4.0.0 the default is stringsAsFactors = FALSE). This is a historical artifact of R: as a language built for statisticians, strings are commonly used to describe categories in statistical models, categorical variables are represented as factors in R, and many built-in modelling functions, such as lm and glm, treat them as factors.
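
To see what check.names = TRUE does behind the scenes, we can call make.names directly on the underscore-prefixed names used below:

make.names(c("_a", "_b", "_c"))
# an "X" is prepended because a name may not start with an underscore:
# "X_a" "X_b" "X_c"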

dat1 = read.table(header = TRUE, check.names = TRUE, text = "
_a _b _c
1 2 a1
3 4 a2
")
dat1
##   X_a X_b X_c
## 1   1   2  a1
## 2   3   4  a2
dat2 = read.table(header = TRUE, check.names = FALSE, text = "
_a _b _c
1 2 a1
3 4 a2
")
dat2
##   _a _b _c
## 1  1  2 a1
## 2  3  4 a2
dat3 <- read.table(header = TRUE, check.names = FALSE,
  stringsAsFactors = FALSE, text = "
_a _b _c
1 2 a1
3 4 a2
"
)
dat3
##   _a _b _c
## 1  1  2 a1
## 2  3  4 a2
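
If you want explicit control over the column types rather than letting read.table guess them, the colClasses argument accepts a vector of class names; a small sketch reusing the same inline text (the object name dat4 is arbitrary):

dat4 <- read.table(
  header = TRUE, check.names = FALSE, text = "
_a _b _c
1 2 a1
3 4 a2
",
  colClasses = c("integer", "numeric", "character")
)
str(dat4) # _a is integer, _b is numeric (double), _c is character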

3.1.3 readLines

readLines(con = stdin(), n = -1L, ok = TRUE, warn = TRUE,
          encoding = "unknown", skipNul = FALSE)

Now let us do a round trip: read a file in and write it straight back out. Only R 3.5.3 and later reproduce the file exactly, because earlier versions contained a bug here.

writeLines(readLines(system.file("DESCRIPTION", package = "splines")), "data/DESCRIPTION")
# compare the original with the copy
identical(
  readLines(system.file("DESCRIPTION", package = "splines")),
  readLines("data/DESCRIPTION")
)
## [1] TRUE

This time we create a genuine temporary file: once the R session is restarted, the file and the folder that holds it are gone, automatically reclaimed.

fil <- tempfile(fileext = ".data")
cat("TITLE extra line", "2 3 5 7", "", "11 13 17", file = fil,
    sep = "\n")
fil
## [1] "/tmp/Rtmpt76iAL/file3eec7bde3aa5.data"

Setting n = -1 means the contents of the file fil are read from beginning to end.

readLines(fil, n = -1)
## [1] "TITLE extra line" "2 3 5 7"          ""                 "11 13 17"

As an R user with good habits, it is best to destroy such throwaway files as soon as you are done with them.

unlink(fil) # tidy up

For another example, we create a new temporary file fil whose content is nothing more than the following:

cat("123\nabc")
## 123
## abc
fil <- tempfile("test")
cat("123\nabc\n", file = fil, append = TRUE)
fil
## [1] "/tmp/Rtmpt76iAL/test3eec6b742b98"
readLines(fil)
## [1] "123" "abc"

This time the file is read without a warning because fil ends with a newline. Had the last line lacked a terminating newline, readLines would warn about an incomplete final line; warn = TRUE (the default) enables that warning, and warn = FALSE suppresses it. Either way, we recommend following the convention of ending text files with a newline.
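
To see the warning in action, we can write a file whose last line lacks a terminating newline; a small sketch (the temporary file tmp is created only for this demonstration):

tmp <- tempfile()
cat("123\nabc", file = tmp)  # note: no trailing newline after "abc"
readLines(tmp)               # warns about an incomplete final line
readLines(tmp, warn = FALSE) # same content, warning suppressed
unlink(tmp)                  # tidy up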

One more example: reading data from a connection. There are many ways to open a connection, see ?file; below we set the blocking argument to FALSE. Because the connection object remembers how far it has read, the second readLines call returns only the line appended in between.

con <- file(fil, "r", blocking = FALSE)
readLines(con)
## [1] "123" "abc"
cat(" def\n", file = fil, append = TRUE)
readLines(con)
## [1] " def"
# close the connection
close(con)
# remove the temporary file
unlink(fil)

3.1.4 readRDS

For serialized data, the fst package developed by Mark Klik, the qs package by Travers Ching, and the feather package by Hadley Wickham all provide fast reading and writing of data across language environments.

Table 3.1: Performance of serializing a data frame object: base R, data.table, fst and feather

Method         Format   Time (ms)   Size (MB)   Speed (MB/s)     N
readRDS        bin           1577        1000            633   112
saveRDS        bin           2042        1000            489   112
fread          csv           2925        1038            410   232
fwrite         csv           2790        1038            358   241
read_feather   bin           3950         813            253   112
write_feather  bin           1820         813            549   112
read_fst       bin            457         303           2184   282
write_fst      bin            314         303           3180   291

At present, the qs and fst packages are the better choices.
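
As the section title suggests, base R itself serializes a single object with saveRDS and reads it back with readRDS; a minimal sketch using the built-in iris data and a temporary file:

rds <- tempfile(fileext = ".rds")
saveRDS(iris, file = rds) # serialize the data frame to a binary .rds file
dat <- readRDS(rds)       # restore it
identical(dat, iris)      # should be TRUE
unlink(rds)               # tidy up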