Notes for R for Data Sience — Wickham & Grolemund

Chapter 8

In this chapter, the authors cover importing data using the readr package. The focus of the chapter is on reading “flat” text files.

Getting Started

“Flat” text files are set up with separate rows for each observation and separate columns for different variables. The columns are separated by different “separators.” Typically, we see comma, space, tab, and semicolons as separators.

Base R

The most common form of data file is the CSV, or comma separated values file. Base R provides a number of ways of reading CSV files. The easiest to use of these is the read.csv function. This function can be used to read a file from your local drive or from the web. Below we read data on the heights and weights of individuals using both methods.

  ### Read a local file
  htwt.local = read.csv("Data/htwt.csv")
  head(htwt.local)

##   Height Weight Group
## 1     64    159     1
## 2     63    155     2
## 3     67    157     2
## 4     60    125     1
## 5     52    103     2
## 6     58    122     2

  attributes(htwt.local)

## $names
## [1] "Height" "Weight" "Group" 
## 
## $class
## [1] "data.frame"
## 
## $row.names
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20

  ### Read a web file
  htwt.web = read.csv("http://facweb1.redlands.edu/fac/jim_bentley/Data/MATH%20212/HtWt.csv")
  head(htwt.web)

##   Height Weight Group
## 1     64    159     1
## 2     63    155     2
## 3     67    157     2
## 4     60    125     1
## 5     52    103     2
## 6     58    122     2

  attributes(htwt.web)

## $names
## [1] "Height" "Weight" "Group" 
## 
## $class
## [1] "data.frame"
## 
## $row.names
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20

  htwt.names <- names(htwt.web)

R also reads data with different separators/delimiters. Files without headers can also be imported. Below we read a tab delimited file that does not have a header. The generated variable names (V1, V2, V3) are then replaced with the names used above (Height, Weight, Group)

  args(read.csv)

## function (file, header = TRUE, sep = ",", quote = "\"", dec = ".", 
##     fill = TRUE, comment.char = "", ...) 
## NULL

  ### Read a tab delimited file with no header
  htwt.tab = read.csv("http://facweb1.redlands.edu/fac/jim_bentley/Data/MATH%20212/HtWtNoHeader.txt", sep = "\t", header = FALSE)
  head(htwt.tab)

##   V1  V2 V3
## 1 64 159  1
## 2 63 155  2
## 3 67 157  2
## 4 60 125  1
## 5 52 103  2
## 6 58 122  2

  names(htwt.tab) <- htwt.names
  head(htwt.tab)

##   Height Weight Group
## 1     64    159     1
## 2     63    155     2
## 3     67    157     2
## 4     60    125     1
## 5     52    103     2
## 6     58    122     2

  attributes(htwt.web)

## $names
## [1] "Height" "Weight" "Group" 
## 
## $class
## [1] "data.frame"
## 
## $row.names
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20

Exporting CSV

The export version of read.csv is write.csv. It provides similar options to the import function and generates local files.

As an example, we create a European ready version of the htwt data that use a semicolon as the delimiter.

  #?write.csv2
  
  ### Make a copy of htwt with some decimal numbers and a factor version of group
  htwt <- htwt.local
  htwt$group <- factor(htwt$Group, labels=c("Male","Female"))
  htwt$weight <- round(htwt$Weight + rnorm(length(htwt$Weight)), 1)
  htwt$height <- round(htwt$Height + rnorm(length(htwt$Height)), 1)
  
  ### Save the file in Euro format --- semicolons and commas
  write.csv2(htwt, "Data/htwtEuro.txt", row.names=FALSE)
  
  ### Read it non-Euro
  htwt.euro <- read.csv("Data/htwtEuro.txt")

## Error in read.table(file = file, header = header, sep = sep, quote = quote, : more columns than column names

  ### Read it Euro
  htwt.euro <- read.csv2("Data/htwtEuro.txt")
  head(htwt.euro)

##   Height Weight Group  group weight height
## 1     64    159     1   Male  160.1   63.6
## 2     63    155     2 Female  155.7   63.4
## 3     67    157     2 Female  157.2   69.0
## 4     60    125     1   Male  125.3   61.1
## 5     52    103     2 Female  102.6   53.5
## 6     58    122     2 Female  120.7   59.2

Take a look at the file Data/htwtEuro.txt to see why read.csv() failed. Something about too few commas? The moral of the story is that you need to get to know your data file before you read it.

Importing XLSX

  p_load(readxl)

  ### Do a simple read of a file with a single sheet and first line header
  noaa.ont.daily <- read_excel("LennoxData/NOAA010113to071018.xlsx", guess_max = 2000)
  dim(noaa.ont.daily)

## [1] 2017   42

  ### Read a multi-sheet file with some useless information above the header
  ### on the sheet we want.
  noaa.ont.hourly <- read_excel("LennoxData/NOAA_LCD_1526334.xlsx",
                      sheet = #2,  ### Could use sheet number but sheets might move
                      "NOAA_LCD_1526334", ### Use sheet name instead
                      skip = 3,  ### Skip header
                      guess_max = 25000
                      )
  dim(noaa.ont.hourly)

## [1] 78269    90

  ### Grab more simple files with interesting information from SCE
  SCE010117to010918 <- read_excel("LennoxData/SCE010117to010918.xlsx", skip = 13)
  dim(SCE010117to010918)

## [1] 10096     3

  SCE060116to072517 <- read_excel("LennoxData/SCE060116to072517.xlsx", skip = 13)
  dim(SCE060116to072517)

## [1] 1483    3

  SCE090117to092618 <- read_excel("LennoxData/SCE090117to092618.xlsx", skip = 13)
  dim(SCE090117to092618)

## [1] 10554     3

The SCE files overlap. We will see more efficient ways of merging these files in future sections. For now, let’s stack them and then get rid of the replicates.

  ### Bind rows, i.e. stack them
  sce <- rbind(SCE010117to010918, SCE060116to072517)
  sce <- rbind(sce, SCE090117to092618)
  
  ### Check to make sure that we haven't lost observations
  dim(sce)

## [1] 22133     3

  ### Rename the variables to something easy to type
  names(sce) <- c("Time_Period", "Usage_kwh", "Read_Quality")
  
  ### Take only the first of duplicated observations
  sce.duped <- sce[duplicated(sce),]
  sce.unique <- sce[!duplicated(sce),]
  sce <- unique(sce)
  dim(sce)

## [1] 15850     3

Take a graphical look at my usage over time.

  plot(sce$Usage_kwh, type="l", ylab = "kwh", xlab = "Hour")

## Warning in xy.coords(x, y, xlabel, ylabel, log): NAs introduced by coercion

There are ways of making the times more meaningful. We will learn about working with time in the future.

Importing Using readr

The tidyverse package provides its own versions of functions for reading CSV files. These are supposed to be more efficient than the base R versions.

  args(read_csv)

## function (file, col_names = TRUE, col_types = NULL, locale = default_locale(), 
##     na = c("", "NA"), quoted_na = TRUE, quote = "\"", comment = "", 
##     trim_ws = TRUE, skip = 0, n_max = Inf, guess_max = min(1000, 
##         n_max), progress = show_progress(), skip_empty_rows = TRUE) 
## NULL

  ### Read a local file
  htwt.local = read_csv("Data/htwt.csv")

## 
## -- Column specification --------------------------------------------------------
## cols(
##   Height = col_double(),
##   Weight = col_double(),
##   Group = col_double()
## )

  head(htwt.local)

## # A tibble: 6 x 3
##   Height Weight Group
##    <dbl>  <dbl> <dbl>
## 1     64    159     1
## 2     63    155     2
## 3     67    157     2
## 4     60    125     1
## 5     52    103     2
## 6     58    122     2

  attributes(htwt.local)

## $names
## [1] "Height" "Weight" "Group" 
## 
## $class
## [1] "spec_tbl_df" "tbl_df"      "tbl"         "data.frame" 
## 
## $row.names
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
## 
## $spec
## cols(
##   Height = col_double(),
##   Weight = col_double(),
##   Group = col_double()
## )

  ### Read a tab delimited file with no header
  htwt.tab = read_tsv("http://facweb1.redlands.edu/fac/jim_bentley/Data/MATH%20212/HtWtNoHeader.txt", col_names = FALSE)

## 
## -- Column specification --------------------------------------------------------
## cols(
##   X1 = col_double(),
##   X2 = col_double(),
##   X3 = col_double()
## )

  head(htwt.tab)

## # A tibble: 6 x 3
##      X1    X2    X3
##   <dbl> <dbl> <dbl>
## 1    64   159     1
## 2    63   155     2
## 3    67   157     2
## 4    60   125     1
## 5    52   103     2
## 6    58   122     2

  names(htwt.tab) <- htwt.names
  head(htwt.tab)

## # A tibble: 6 x 3
##   Height Weight Group
##    <dbl>  <dbl> <dbl>
## 1     64    159     1
## 2     63    155     2
## 3     67    157     2
## 4     60    125     1
## 5     52    103     2
## 6     58    122     2

  attributes(htwt.web)

## $names
## [1] "Height" "Weight" "Group" 
## 
## $class
## [1] "data.frame"
## 
## $row.names
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20

  ### Read a Euro file non-Euro
  htwt.euro <- read_csv("Data/htwtEuro.txt")

## 
## -- Column specification --------------------------------------------------------
## cols(
##   `Height";"Weight";"Group";"group";"weight";"height` = col_character()
## )

## Warning: 20 parsing failures.
## row col  expected    actual                file
##   1  -- 1 columns 3 columns 'Data/htwtEuro.txt'
##   2  -- 1 columns 3 columns 'Data/htwtEuro.txt'
##   3  -- 1 columns 2 columns 'Data/htwtEuro.txt'
##   4  -- 1 columns 3 columns 'Data/htwtEuro.txt'
##   5  -- 1 columns 3 columns 'Data/htwtEuro.txt'
## ... ... ......... ......... ...................
## See problems(...) for more details.

  head(htwt.euro)

## # A tibble: 6 x 1
##   `Height";"Weight";"Group";"group";"weight";"height`
##   <chr>                                              
## 1 "64;159;1;\"Male\";160"                            
## 2 "63;155;2;\"Female\";155"                          
## 3 "67;157;2;\"Female\";157"                          
## 4 "60;125;1;\"Male\";125"                            
## 5 "52;103;2;\"Female\";102"                          
## 6 "58;122;2;\"Female\";120"

  ### Read it Euro
  htwt.euro <- read_csv2("Data/htwtEuro.txt")

## i Using ',' as decimal and '.' as grouping mark. Use `read_delim()` for more control.

## 
## -- Column specification --------------------------------------------------------
## cols(
##   Height = col_double(),
##   Weight = col_double(),
##   Group = col_double(),
##   group = col_character(),
##   weight = col_double(),
##   height = col_double()
## )

  head(htwt.euro)

## # A tibble: 6 x 6
##   Height Weight Group group  weight height
##    <dbl>  <dbl> <dbl> <chr>   <dbl>  <dbl>
## 1     64    159     1 Male     160.   63.6
## 2     63    155     2 Female   156.   63.4
## 3     67    157     2 Female   157.   69  
## 4     60    125     1 Male     125.   61.1
## 5     52    103     2 Female   103.   53.5
## 6     58    122     2 Female   121.   59.2

We can check the speed of the two packages by using microbenchmark.

  p_load(microbenchmark)
  microbenchmark(
    x <- read.csv("Data/htwt.csv"),
    y <- read_csv("Data/htwt.csv"),
    y <- read.csv("Data/htwt.csv"),
    x <- read_csv("Data/htwt.csv"),
    times = 5
  )

## 
## -- Column specification --------------------------------------------------------
## cols(
##   Height = col_double(),
##   Weight = col_double(),
##   Group = col_double()
## )
## 
## 
## -- Column specification --------------------------------------------------------
## cols(
##   Height = col_double(),
##   Weight = col_double(),
##   Group = col_double()
## )
## 
## 
## -- Column specification --------------------------------------------------------
## cols(
##   Height = col_double(),
##   Weight = col_double(),
##   Group = col_double()
## )
## 
## 
## -- Column specification --------------------------------------------------------
## cols(
##   Height = col_double(),
##   Weight = col_double(),
##   Group = col_double()
## )
## 
## 
## -- Column specification --------------------------------------------------------
## cols(
##   Height = col_double(),
##   Weight = col_double(),
##   Group = col_double()
## )
## 
## 
## -- Column specification --------------------------------------------------------
## cols(
##   Height = col_double(),
##   Weight = col_double(),
##   Group = col_double()
## )
## 
## 
## -- Column specification --------------------------------------------------------
## cols(
##   Height = col_double(),
##   Weight = col_double(),
##   Group = col_double()
## )
## 
## 
## -- Column specification --------------------------------------------------------
## cols(
##   Height = col_double(),
##   Weight = col_double(),
##   Group = col_double()
## )
## 
## 
## -- Column specification --------------------------------------------------------
## cols(
##   Height = col_double(),
##   Weight = col_double(),
##   Group = col_double()
## )
## 
## 
## -- Column specification --------------------------------------------------------
## cols(
##   Height = col_double(),
##   Weight = col_double(),
##   Group = col_double()
## )

## Unit: microseconds
##                            expr    min      lq     mean  median      uq     max
##  x <- read.csv("Data/htwt.csv")  507.0   509.7   570.92   511.6   578.8   747.5
##  y <- read_csv("Data/htwt.csv") 9093.4  9154.7 10943.26 11753.0 11792.1 12923.1
##  y <- read.csv("Data/htwt.csv")  517.8   548.2   589.32   580.4   603.4   696.8
##  x <- read_csv("Data/htwt.csv") 9431.8 10849.1 11175.84 11702.4 11832.2 12063.7
##  neval
##      5
##      5
##      5
##      5

Maybe the tidyverse versions are better for larger files.

  p_load(microbenchmark)
  write_csv(noaa.ont.hourly, "LennoxData/NOAA_LCD_1526334.csv")
  microbenchmark(
    x <- read.csv("LennoxData/NOAA_LCD_1526334.csv"),
    y <- read_csv("LennoxData/NOAA_LCD_1526334.csv"),
    times = 2
  )

## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   STATION = col_character(),
##   STATION_NAME = col_character(),
##   DATE = col_datetime(format = ""),
##   REPORTTPYE = col_character(),
##   HOURLYSKYCONDITIONS = col_character(),
##   HOURLYVISIBILITY = col_character(),
##   HOURLYPRSENTWEATHERTYPE = col_character(),
##   HOURLYDewPointTempF = col_character(),
##   HOURLYDewPointTempC = col_character(),
##   HOURLYWindSpeed = col_character(),
##   HOURLYWindDirection = col_character(),
##   HOURLYPressureChange = col_logical(),
##   HOURLYPrecip = col_character(),
##   DAILYAverageRelativeHumidity = col_logical(),
##   DAILYAverageDewPointTemp = col_logical(),
##   DAILYAverageWetBulbTemp = col_logical(),
##   DAILYWeather = col_character(),
##   DAILYPrecip = col_character(),
##   DAILYSnowfall = col_logical(),
##   DAILYSnowDepth = col_logical()
##   # ... with 26 more columns
## )
## i Use `spec()` for the full column specifications.

## Warning: 617 parsing failures.
##  row                     col               expected actual                              file
## 4639 HOURLYDRYBULBTEMPF      no trailing characters  81s   'LennoxData/NOAA_LCD_1526334.csv'
## 4639 HOURLYDRYBULBTEMPC      no trailing characters  27.2s 'LennoxData/NOAA_LCD_1526334.csv'
## 5106 MonthlyDaysWithGT32Temp no trailing characters  0s    'LennoxData/NOAA_LCD_1526334.csv'
## 5106 MonthlyDaysWithLT0Temp  no trailing characters  0s    'LennoxData/NOAA_LCD_1526334.csv'
## 7214 HOURLYDRYBULBTEMPF      no trailing characters  93s   'LennoxData/NOAA_LCD_1526334.csv'
## .... ....................... ...................... ...... .................................
## See problems(...) for more details.

## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   STATION = col_character(),
##   STATION_NAME = col_character(),
##   DATE = col_datetime(format = ""),
##   REPORTTPYE = col_character(),
##   HOURLYSKYCONDITIONS = col_character(),
##   HOURLYVISIBILITY = col_character(),
##   HOURLYPRSENTWEATHERTYPE = col_character(),
##   HOURLYDewPointTempF = col_character(),
##   HOURLYDewPointTempC = col_character(),
##   HOURLYWindSpeed = col_character(),
##   HOURLYWindDirection = col_character(),
##   HOURLYPressureChange = col_logical(),
##   HOURLYPrecip = col_character(),
##   DAILYAverageRelativeHumidity = col_logical(),
##   DAILYAverageDewPointTemp = col_logical(),
##   DAILYAverageWetBulbTemp = col_logical(),
##   DAILYWeather = col_character(),
##   DAILYPrecip = col_character(),
##   DAILYSnowfall = col_logical(),
##   DAILYSnowDepth = col_logical()
##   # ... with 26 more columns
## )
## i Use `spec()` for the full column specifications.

## Warning: 617 parsing failures.
##  row                     col               expected actual                              file
## 4639 HOURLYDRYBULBTEMPF      no trailing characters  81s   'LennoxData/NOAA_LCD_1526334.csv'
## 4639 HOURLYDRYBULBTEMPC      no trailing characters  27.2s 'LennoxData/NOAA_LCD_1526334.csv'
## 5106 MonthlyDaysWithGT32Temp no trailing characters  0s    'LennoxData/NOAA_LCD_1526334.csv'
## 5106 MonthlyDaysWithLT0Temp  no trailing characters  0s    'LennoxData/NOAA_LCD_1526334.csv'
## 7214 HOURLYDRYBULBTEMPF      no trailing characters  93s   'LennoxData/NOAA_LCD_1526334.csv'
## .... ....................... ...................... ...... .................................
## See problems(...) for more details.

## Unit: milliseconds
##                                              expr       min        lq     mean
##  x <- read.csv("LennoxData/NOAA_LCD_1526334.csv") 1431.3218 1431.3218 1507.474
##  y <- read_csv("LennoxData/NOAA_LCD_1526334.csv")  517.6988  517.6988  545.441
##    median        uq       max neval
##  1507.474 1583.6254 1583.6254     2
##   545.441  573.1832  573.1832     2