Data Importing and Cleaning

Data Types from R I

Load Necessary Libraries

library(readxl)
library(tidyverse)
library(gridExtra)
setwd("~/courses/R_Courses/DataCleaning")

Vectors and Matrices

A vector can be a “collection” of values or a single value 1. Atomic vector - a collection of values 2. Factors - special vectors that represent categorical data

genotype <- c(rep("WT",5),rep("KO",5))
genotype <- factor(genotype,levels=c("WT","KO"))
genotype
 [1] WT WT WT WT WT KO KO KO KO KO
Levels: WT KO
summary(genotype)
WT KO 
 5  5 
  1. Matrix
  • special vector with rows and columns
y <- matrix(1:20, nrow=5,ncol=4,byrow = FALSE)
y
     [,1] [,2] [,3] [,4]
[1,]    1    6   11   16
[2,]    2    7   12   17
[3,]    3    8   13   18
[4,]    4    9   14   19
[5,]    5   10   15   20
y <- matrix(1:20, nrow=5,ncol=4,byrow = TRUE)
y
     [,1] [,2] [,3] [,4]
[1,]    1    2    3    4
[2,]    5    6    7    8
[3,]    9   10   11   12
[4,]   13   14   15   16
[5,]   17   18   19   20
t(y) #transpose
     [,1] [,2] [,3] [,4] [,5]
[1,]    1    5    9   13   17
[2,]    2    6   10   14   18
[3,]    3    7   11   15   19
[4,]    4    8   12   16   20
rowSums(y)
[1] 10 26 42 58 74
colMeans(y)
[1]  9 10 11 12
  1. Data frame
  • a special data structure of rows and columns, the default structure for reading in “excel-like” files
d <- c(1,2,3,4)
e <- c("red", "white", "red", NA)
f <- c(TRUE,TRUE,TRUE,FALSE)
x <- data.frame(d,e,f)
names(x) <- c("ID","Color","Passed")
x
  ID Color Passed
1  1   red   TRUE
2  2 white   TRUE
3  3   red   TRUE
4  4  <NA>  FALSE
  1. Tibble
  • A special type of data.frame
  • has a refined built-in print method to show only the first 10 rows
library(tibble)
as_tibble(iris)
# A tibble: 150 x 5
   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
          <dbl>       <dbl>        <dbl>       <dbl> <fct>  
 1          5.1         3.5          1.4         0.2 setosa 
 2          4.9         3            1.4         0.2 setosa 
 3          4.7         3.2          1.3         0.2 setosa 
 4          4.6         3.1          1.5         0.2 setosa 
 5          5           3.6          1.4         0.2 setosa 
 6          5.4         3.9          1.7         0.4 setosa 
 7          4.6         3.4          1.4         0.3 setosa 
 8          5           3.4          1.5         0.2 setosa 
 9          4.4         2.9          1.4         0.2 setosa 
10          4.9         3.1          1.5         0.1 setosa 
# ... with 140 more rows
tibble(x = 1:5, y = 1, z = x ^ 2 + y)
# A tibble: 5 x 3
      x     y     z
  <int> <dbl> <dbl>
1     1     1     2
2     2     1     5
3     3     1    10
4     4     1    17
5     5     1    26
  1. List
  • a vector of different data types (including other vectors)
a <- c(1:5)
y <- matrix(1:20, nrow=5,ncol=4,byrow = TRUE)
w <- list(name="Fred", mynumbers=a, mymatrix=y, age=5.3)
w
$name
[1] "Fred"

$mynumbers
[1] 1 2 3 4 5

$mymatrix
     [,1] [,2] [,3] [,4]
[1,]    1    2    3    4
[2,]    5    6    7    8
[3,]    9   10   11   12
[4,]   13   14   15   16
[5,]   17   18   19   20

$age
[1] 5.3

Reading Data Into R

Dataframes is the R native data type for storing a table of mixed datatypes. Tribbles are enhanced dataframes used by the tidyverse packages.

  1. read_table
  • Reads a file into a dataframe
  1. read_csv
  • Reads a comma separated file into a dataframe
  1. readxl
  • Reads an XLS or XLSX file into a tribble
  1. readr
  • Reads a file into a tribble

Loading Data with readr and readxl

Load the an excel file on death rates and life expectancy

excel_sheets('Death_rates_and_life_expectancy_at_birth.xlsx')
[1] "NCHS_-_Death_rates_and_life_exp"
deathrates <- read_excel('Death_rates_and_life_expectancy_at_birth.xlsx',sheet = 'NCHS_-_Death_rates_and_life_exp')

-
/
                                                                                                                                           

Load the CSV file demographics

demographics <- read_csv('demographics.csv')
Parsed with column specification:
cols(
  PAT_ID = col_character(),
  GENDER = col_character(),
  BIRTH_DATE = col_character(),
  DEATH_DATE = col_character(),
  RACE = col_character(),
  ETHNICITY = col_character()
)

Create a CSV file on the fly

read_csv("a,b,c
1,2,3
4,5,6")
# A tibble: 2 x 3
      a     b     c
  <int> <int> <int>
1     1     2     3
2     4     5     6

Skipping Lines

read_csv("The first line of metadata
  The second line of metadata
  x,y,z
  1,2,3", skip = 2)
# A tibble: 1 x 3
      x     y     z
  <int> <int> <int>
1     1     2     3
read_csv("# A comment I want to skip
  x,y,z
  1,2,3", comment = "#")
# A tibble: 1 x 3
      x     y     z
  <int> <int> <int>
1     1     2     3

No Header Line

read_csv("1,2,3\n4,5,6", col_names = FALSE)
# A tibble: 2 x 3
     X1    X2    X3
  <int> <int> <int>
1     1     2     3
2     4     5     6

Setting the Colnames

read_csv("1,2,3\n4,5,6", col_names = c("x", "y", "z"))
# A tibble: 2 x 3
      x     y     z
  <int> <int> <int>
1     1     2     3
2     4     5     6

Data Types in Files

readr uses a heuristic to figure out the type of each column

  • logical: contains only “F”, “T”, “FALSE”, or “TRUE”.
  • integer: contains only numeric characters (and -).
  • double: contains only valid doubles (including numbers like 4.5e-5).
  • number: contains valid doubles with the grouping mark inside.
  • time: matches the default time_format.
  • date: matches the default date_format.
  • date-time: any ISO8601 date.
guess_parser("2010-10-01")
[1] "date"
guess_parser("15:01")
[1] "time"
guess_parser(c("TRUE", "FALSE"))
[1] "logical"
guess_parser(c("1", "5", "9"))
[1] "integer"
guess_parser(c("12,352,561"))
[1] "number"
str(parse_guess("2010-10-10"))
 Date[1:1], format: "2010-10-10"

readr allows you to set the data types: - col_guess() - the default - col_character() - col_double(), col_euro_double() • col_datetime(format = “”) Also - col_date(format = “”), col_time(format = “”) - col_factor(levels, ordered = FALSE) - col_integer() - col_logical() - col_number(), col_numeric() - col_skip()

x <- read_csv('demographics.csv',col_types=cols(PAT_ID=col_character(),GENDER=col_character(),BIRTH_DATE=col_date(),DEATH_DATE=col_date(), RACE=col_character(),ETHNICITY=col_character()))
number of columns of result is not a multiple of vector length (arg 1)23 parsing failures.
row # A tibble: 5 x 5 col     row col        expected     actual    file               expected   <int> <chr>      <chr>        <chr>     <chr>              actual 1     1 BIRTH_DATE "date like " 27-JUN-57 'demographics.csv' file 2     2 BIRTH_DATE "date like " 14-MAR-54 'demographics.csv' row 3     2 DEATH_DATE "date like " 26-MAR-12 'demographics.csv' col 4     3 BIRTH_DATE "date like " 13-JUN-40 'demographics.csv' expected 5     4 BIRTH_DATE "date like " 20-APR-81 'demographics.csv'
... ................. ... ............................................................ ........ ............................................................ ...... ............................................................ .... ............................................................ ... ............................................................ ... ............................................................ ........ ............................................................
See problems(...) for more details.

Writing Data

  1. CSV
write_csv(demographics, "test1.csv")
  1. R Binary Format
write_rds(demographics, "demographics.rds")

Data Cleaning: Missing Values and Controlled Language

A table is tidy if:

  • Each variable is in its own column
  • Each observation, or case, is in its own row

Tidy Data:

  • Makes variables easy to access as vectors
  • Preserves cases during vectorized operations

Examine the Data with ggplot2

p1 <- ggplot(deathrates, aes(x=Race, y=AverageLifeExpectancy_Years,color=Race)) + geom_boxplot()
p2 <- ggplot(deathrates, aes(x=Sex, y=AverageLifeExpectancy_Years,color=Sex)) + geom_boxplot()
grid.arrange(p1,p2,nrow=1)

Handle Missing Data with tidyr

There are several ways to deal with missing data:

  1. drop (drop_na)
  • Drop rows containing NA’s in columns.
  1. replace (replace_na)
  • Replace NAs by column
  1. fill (fill_na)
  • Fill in NA’s in columns with most recent non-NA values.

Drop NAs

nrow(deathrates)
[1] 1044
t1 <- drop_na(deathrates,AverageLifeExpectancy_Years)
nrow(t1)
[1] 1035

Replace NAs

t2 <- replace_na(deathrates,list(AverageLifeExpectancy_Years = 68))
head(deathrates)
# A tibble: 6 x 4
   Year Race      Sex        AverageLifeExpectancy_Years
  <dbl> <chr>     <chr>                            <dbl>
1  2015 All Races Both Sexes                        NA  
2  2014 All Races Both Sexes                        78.9
3  2013 All Races Both Sexes                        78.8
4  2012 All Races Both Sexes                        78.8
5  2011 All Races Both Sexes                        78.7
6  2010 All Races Both Sexes                        78.7
head(t2)
# A tibble: 6 x 4
   Year Race      Sex        AverageLifeExpectancy_Years
  <dbl> <chr>     <chr>                            <dbl>
1  2015 All Races Both Sexes                        68  
2  2014 All Races Both Sexes                        78.9
3  2013 All Races Both Sexes                        78.8
4  2012 All Races Both Sexes                        78.8
5  2011 All Races Both Sexes                        78.7
6  2010 All Races Both Sexes                        78.7

Fill NAs

t3 <- fill(deathrates,AverageLifeExpectancy_Years)
nrow(drop_na(deathrates))
[1] 1035
nrow(drop_na(t3))
[1] 1043
head(t3)
# A tibble: 6 x 4
   Year Race      Sex        AverageLifeExpectancy_Years
  <dbl> <chr>     <chr>                            <dbl>
1  2015 All Races Both Sexes                        NA  
2  2014 All Races Both Sexes                        78.9
3  2013 All Races Both Sexes                        78.8
4  2012 All Races Both Sexes                        78.8
5  2011 All Races Both Sexes                        78.7
6  2010 All Races Both Sexes                        78.7

Inconsistent Data Entry

Check the gender categories in the demographics table

unique(demographics$GENDER)
[1] "FEMALE" "F"      "M"      "MALE"   "FE"    

Replace Data Entry Mistakes with stringr

Update the Gender Column to Make 2 GENDER Categories M and F

demographics$GENDER <- str_replace(demographics$GENDER,'F.+','F')
demographics$GENDER <- str_replace(demographics$GENDER,'MALE','M')
unique(demographics$GENDER)
[1] "F" "M"

Reshaping Data: Gather and Spread

deathrates$Race <- str_replace(deathrates$Race,'All Races','All')
head(deathrates)
# A tibble: 6 x 4
   Year Race  Sex        AverageLifeExpectancy_Years
  <dbl> <chr> <chr>                            <dbl>
1  2015 All   Both Sexes                        NA  
2  2014 All   Both Sexes                        78.9
3  2013 All   Both Sexes                        78.8
4  2012 All   Both Sexes                        78.8
5  2011 All   Both Sexes                        78.7
6  2010 All   Both Sexes                        78.7
filter(deathrates,Year==1900)
# A tibble: 9 x 4
   Year Race  Sex        AverageLifeExpectancy_Years
  <dbl> <chr> <chr>                            <dbl>
1  1900 All   Both Sexes                        47.3
2  1900 All   Female                            48.3
3  1900 All   Male                              46.3
4  1900 Black Both Sexes                        33  
5  1900 Black Female                            33.5
6  1900 Black Male                              32.5
7  1900 White Both Sexes                        47.6
8  1900 White Female                            48.7
9  1900 White Male                              46.6
dr1 <- spread(deathrates, Race, AverageLifeExpectancy_Years)
dr2 <- gather(dr1, `All`, `Black`, `White`, key = "Race", value = "AvgLifeExpectancy")
head(dr1)
# A tibble: 6 x 5
   Year Sex          All Black White
  <dbl> <chr>      <dbl> <dbl> <dbl>
1  1900 Both Sexes  47.3  33    47.6
2  1900 Female      48.3  33.5  48.7
3  1900 Male        46.3  32.5  46.6
4  1901 Both Sexes  49.1  33.7  49.4
5  1901 Female      50.6  35.3  51  
6  1901 Male        47.6  32.2  48  
head(dr2)
# A tibble: 6 x 4
   Year Sex        Race  AvgLifeExpectancy
  <dbl> <chr>      <chr>             <dbl>
1  1900 Both Sexes All                47.3
2  1900 Female     All                48.3
3  1900 Male       All                46.3
4  1901 Both Sexes All                49.1
5  1901 Female     All                50.6
6  1901 Male       All                47.6

Transpose A Table

df1 <- data.frame(rows = c("one","two","three"),two = 1:3,three=1:3)
df1
   rows two three
1   one   1     1
2   two   2     2
3 three   3     3
df2 <- df1 %>% gather(newrows,valname,-rows) %>% spread(rows,valname)
df2
  newrows one three two
1   three   1     3   2
2     two   1     3   2

Split and Join Cells

dm1 <- separate(demographics, BIRTH_DATE, into = c("BirthDayOfMonth", "BirthMonth","BirthYear"))
head(dm1)
# A tibble: 6 x 8
  PAT_ID  GENDER BirthDayOfMonth BirthMonth BirthYear DEATH_DATE RACE    ETHNICITY   
  <chr>   <chr>  <chr>           <chr>      <chr>     <chr>      <chr>   <chr>       
1 Z583990 F      27              JUN        57        <NA>       White   Hispanic    
2 Z558145 F      14              MAR        54        26-MAR-12  Black   Non-hispanic
3 Z341881 F      13              JUN        40        <NA>       AA      Non-hispanic
4 Z795049 F      20              APR        81        <NA>       White   Non-hispanic
5 Z741693 M      19              DEC        75        <NA>       White   Non-hispanic
6 Z233337 F      18              DEC        36        <NA>       Unknown @           
dm2 <- unite(demographics,RACE,ETHNICITY,col="Population",sep=' ')
head(dm2)
# A tibble: 6 x 5
  PAT_ID  GENDER BIRTH_DATE DEATH_DATE Population        
  <chr>   <chr>  <chr>      <chr>      <chr>             
1 Z583990 F      27-JUN-57  <NA>       White Hispanic    
2 Z558145 F      14-MAR-54  26-MAR-12  Black Non-hispanic
3 Z341881 F      13-JUN-40  <NA>       AA Non-hispanic   
4 Z795049 F      20-APR-81  <NA>       White Non-hispanic
5 Z741693 M      19-DEC-75  <NA>       White Non-hispanic
6 Z233337 F      18-DEC-36  <NA>       Unknown @         

separate_rows allows for the separation to generate multiple rows

Expand Table

deathrates[is.na(deathrates$AverageLifeExpectancy_Years),]
# A tibble: 9 x 4
   Year Race  Sex        AverageLifeExpectancy_Years
  <dbl> <chr> <chr>                            <dbl>
1  2015 All   Both Sexes                          NA
2  2015 All   Female                              NA
3  2015 All   Male                                NA
4  2015 Black Both Sexes                          NA
5  2015 Black Female                              NA
6  2015 Black Male                                NA
7  2015 White Both Sexes                          NA
8  2015 White Female                              NA
9  2015 White Male                                NA
t1[is.na(t1$AverageLifeExpectancy_Years),]
# A tibble: 0 x 4
# ... with 4 variables: Year <dbl>, Race <chr>, Sex <chr>, AverageLifeExpectancy_Years <dbl>
t3 <- complete(t1,Year=1900:2015,Race=c('All','Black','White'),Sex=c('Both Sexes','Male','Female'))
t1[is.na(t1$AverageLifeExpectancy_Years),]
# A tibble: 0 x 4
# ... with 4 variables: Year <dbl>, Race <chr>, Sex <chr>, AverageLifeExpectancy_Years <dbl>
t3[is.na(t3$AverageLifeExpectancy_Years),]
# A tibble: 354 x 4
    Year Race  Sex        AverageLifeExpectancy_Years
   <dbl> <chr> <chr>                            <dbl>
 1  1900 All   Both Sexes                          NA
 2  1900 All   Female                              NA
 3  1900 All   Male                                NA
 4  1901 All   Both Sexes                          NA
 5  1901 All   Female                              NA
 6  1901 All   Male                                NA
 7  1902 All   Both Sexes                          NA
 8  1902 All   Female                              NA
 9  1902 All   Male                                NA
10  1903 All   Both Sexes                          NA
# ... with 344 more rows
t4 <- complete(t1,Year=1900:2015,Race=c('All','Black','White'),Sex=c('Both Sexes','Male','Female'),fill=list(AverageLifeExpectancy_Years=68))
t4[t4$Year>2014,]
# A tibble: 9 x 4
   Year Race  Sex        AverageLifeExpectancy_Years
  <dbl> <chr> <chr>                            <dbl>
1  2015 All   Both Sexes                          68
2  2015 All   Female                              68
3  2015 All   Male                                68
4  2015 Black Both Sexes                          68
5  2015 Black Female                              68
6  2015 Black Male                                68
7  2015 White Both Sexes                          68
8  2015 White Female                              68
9  2015 White Male                                68
df <- tibble(
  year   = c(2010, 2010, 2010, 2010, 2012, 2012, 2012),
  qtr    = c(   1,    2,    3,    4,    1,    2,    3),
  return = rnorm(7)
)
df %>% expand(year = full_seq(2010:2012, 1), qtr)
# A tibble: 12 x 2
    year   qtr
   <dbl> <dbl>
 1  2010     1
 2  2010     2
 3  2010     3
 4  2010     4
 5  2011     1
 6  2011     2
 7  2011     3
 8  2011     4
 9  2012     1
10  2012     2
11  2012     3
12  2012     4

Merging Tables

  1. Create a data frame of BMI for the patients in demographics