Data IO - MonaJzn - Fundamentals of Analytics

Import the nobel.csv file

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.1     v dplyr   1.0.6
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

# Read the Nobel laureates data from a CSV file in the working directory.
nobel <- read_csv("nobel.csv")

## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_character(),
##   id = col_double(),
##   year = col_double(),
##   born_date = col_date(format = ""),
##   died_date = col_date(format = ""),
##   share = col_double()
## )
## i Use `spec()` for the full column specifications.

# Alternative import: choose the CSV through the interactive file dialog and
# read whichever path is returned.
nobel <- file.choose() %>%
  read_csv()

## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_character(),
##   id = col_double(),
##   year = col_double(),
##   born_date = col_date(format = ""),
##   died_date = col_date(format = ""),
##   share = col_double()
## )
## i Use `spec()` for the full column specifications.

# Preview the first rows of the imported data.
nobel %>%
  head()

## # A tibble: 6 x 26
##      id firstname  surname  year category affiliation   city  country born_date 
##   <dbl> <chr>      <chr>   <dbl> <chr>    <chr>         <chr> <chr>   <date>    
## 1     1 Wilhelm C~ Röntgen  1901 Physics  Munich Unive~ Muni~ Germany 1845-03-27
## 2     2 Hendrik A. Lorentz  1902 Physics  Leiden Unive~ Leid~ Nether~ 1853-07-18
## 3     3 Pieter     Zeeman   1902 Physics  Amsterdam Un~ Amst~ Nether~ 1865-05-25
## 4     4 Henri      Becque~  1903 Physics  École Polyte~ Paris France  1852-12-15
## 5     5 Pierre     Curie    1903 Physics  École munici~ Paris France  1859-05-15
## 6     6 Marie      Curie    1903 Physics  <NA>          <NA>  <NA>    1867-11-07
## # ... with 17 more variables: died_date <date>, gender <chr>, born_city <chr>,
## #   born_country <chr>, born_country_code <chr>, died_city <chr>,
## #   died_country <chr>, died_country_code <chr>, overall_motivation <chr>,
## #   share <dbl>, motivation <chr>, born_country_original <chr>,
## #   born_city_original <chr>, died_country_original <chr>,
## #   died_city_original <chr>, city_original <chr>, country_original <chr>

Write a CSV file

# Build a small three-row example tibble column by column.
df <- tibble(
  x = c(1, 2, 3),
  y = c("a", "b", "c")
)

# Save the tibble as a CSV file in the working directory.
write_csv(df, "df.csv")

Dealing with bad variable names

#edibnb_badnames <- read_csv("edibnb-badnames.csv")
#names(edibnb_badnames)

# Supply clean column names while importing. Because `col_names` is given
# explicitly, readr treats every line of the file as data, so the file's own
# header row has to be dropped with `skip = 1`. Without it the header is read
# in as the first observation and every column is parsed as character.
edibnb_col_names <- read_csv("edibnb-badnames.csv",
                             col_names = c("id", "price",
                                           "neighbourhood", "accommodates",
                                           "bathroom", "bedroom",
                                           "bed", "review_scores_rating",
                                           "n_reviews", "url"),
                             skip = 1)

## 
## -- Column specification --------------------------------------------------------
## cols(
##   id = col_double(),
##   price = col_double(),
##   neighbourhood = col_character(),
##   accommodates = col_double(),
##   bathroom = col_double(),
##   bedroom = col_double(),
##   bed = col_double(),
##   review_scores_rating = col_double(),
##   n_reviews = col_double(),
##   url = col_character()
## )

Importing data with snake case variables

# Import with the original headers, then let janitor rewrite the column
# names into a consistent, syntactic form.
edibnb_clean_names <- janitor::clean_names(
  read_csv("edibnb-badnames.csv")
)

## 
## -- Column specification --------------------------------------------------------
## cols(
##   ID = col_double(),
##   Price = col_double(),
##   neighbourhood = col_character(),
##   accommodates = col_double(),
##   `Number of bathrooms` = col_double(),
##   `Number of Bedrooms` = col_double(),
##   `n beds` = col_double(),
##   `Review Scores Rating` = col_double(),
##   `Number of reviews` = col_double(),
##   listing_url = col_character()
## )

Read df-na.csv

# Import df-na.csv, treating empty strings, "NA", ".", "9999" and
# "Not applicable" as missing values. The result is printed, not stored.
"df-na.csv" %>%
  read_csv(na = c("", "NA", ".", "9999", "Not applicable"))

## 
## -- Column specification --------------------------------------------------------
## cols(
##   x = col_double(),
##   y = col_character(),
##   z = col_character()
## )

## # A tibble: 9 x 3
##       x y     z     
##   <dbl> <chr> <chr> 
## 1     1 a     hi    
## 2    NA b     hello 
## 3     3 <NA>  <NA>  
## 4     4 d     ola   
## 5     5 e     hola  
## 6    NA f     whatup
## 7     7 g     wassup
## 8     8 h     sup   
## 9     9 i     <NA>

Reading an XLSX file

# readxl provides read_excel() for Excel workbooks.
library(readxl)

# Read the spreadsheet, treating "N/A" and "99999" as missing values, then
# clean up the column names with janitor.
fav_food <- janitor::clean_names(
  read_excel("favourite-food.xlsx", na = c("N/A", "99999"))
)
fav_food

## # A tibble: 5 x 6
##   student_id full_name        favourite_food     meal_plan           age   ses  
##        <dbl> <chr>            <chr>              <chr>               <chr> <chr>
## 1          1 Sunil Huffmann   Strawberry yoghurt Lunch only          4     High 
## 2          2 Barclay Lynn     French fries       Lunch only          5     Midd~
## 3          3 Jayendra Lyne    <NA>               Breakfast and lunch 7     Low  
## 4          4 Leon Rossini     Anchovies          Lunch only          <NA>  Midd~
## 5          5 Chidiegwu Dunkel Pizza              Breakfast and lunch five  High

# Repair the one age that was entered as a word ("five"), then convert the
# column from character to numeric. Missing ages stay NA.
fav_food <- mutate(
  fav_food,
  age = as.numeric(case_when(age == "five" ~ "5", TRUE ~ age))
)
fav_food

## # A tibble: 5 x 6
##   student_id full_name        favourite_food     meal_plan             age ses  
##        <dbl> <chr>            <chr>              <chr>               <dbl> <chr>
## 1          1 Sunil Huffmann   Strawberry yoghurt Lunch only              4 High 
## 2          2 Barclay Lynn     French fries       Lunch only              5 Midd~
## 3          3 Jayendra Lyne    <NA>               Breakfast and lunch     7 Low  
## 4          4 Leon Rossini     Anchovies          Lunch only             NA Midd~
## 5          5 Chidiegwu Dunkel Pizza              Breakfast and lunch     5 High

# Compact column-wise overview: one line per column with its type and values.
glimpse(x = fav_food)

## Rows: 5
## Columns: 6
## $ student_id     <dbl> 1, 2, 3, 4, 5
## $ full_name      <chr> "Sunil Huffmann", "Barclay Lynn", "Jayendra Lyne", "Leo~
## $ favourite_food <chr> "Strawberry yoghurt", "French fries", NA, "Anchovies", ~
## $ meal_plan      <chr> "Lunch only", "Lunch only", "Breakfast and lunch", "Lun~
## $ age            <dbl> 4, 5, 7, NA, 5
## $ ses            <chr> "High", "Middle", "Low", "Middle", "High"

ETL of SES variable

# Tally the rows in each ses category.
count(fav_food, ses)

## # A tibble: 3 x 2
##   ses        n
##   <chr>  <int>
## 1 High       2
## 2 Low        1
## 3 Middle     2

# Turn ses into a factor whose levels run Low < Middle < High, so that
# summaries follow that order instead of alphabetical order.
fav_food <- mutate(
  fav_food,
  ses = fct_relevel(ses, "Low", "Middle", "High")
)

# Re-tally to confirm the new level order.
count(fav_food, ses)

## # A tibble: 3 x 2
##   ses        n
##   <fct>  <int>
## 1 Low        1
## 2 Middle     2
## 3 High       2

# Print the tibble after the ses column has been releveled.
fav_food

## # A tibble: 5 x 6
##   student_id full_name        favourite_food     meal_plan             age ses  
##        <dbl> <chr>            <chr>              <chr>               <dbl> <fct>
## 1          1 Sunil Huffmann   Strawberry yoghurt Lunch only              4 High 
## 2          2 Barclay Lynn     French fries       Lunch only              5 Midd~
## 3          3 Jayendra Lyne    <NA>               Breakfast and lunch     7 Low  
## 4          4 Leon Rossini     Anchovies          Lunch only             NA Midd~
## 5          5 Chidiegwu Dunkel Pizza              Breakfast and lunch     5 High