Import Nobel.csv file
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.1 v dplyr 1.0.6
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
nobel <- read_csv(file = "nobel.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_character(),
## id = col_double(),
## year = col_double(),
## born_date = col_date(format = ""),
## died_date = col_date(format = ""),
## share = col_double()
## )
## i Use `spec()` for the full column specifications.
nobel <- read_csv(file.choose())
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_character(),
## id = col_double(),
## year = col_double(),
## born_date = col_date(format = ""),
## died_date = col_date(format = ""),
## share = col_double()
## )
## i Use `spec()` for the full column specifications.
head(nobel)
## # A tibble: 6 x 26
## id firstname surname year category affiliation city country born_date
## <dbl> <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <date>
## 1 1 Wilhelm C~ Röntgen 1901 Physics Munich Unive~ Muni~ Germany 1845-03-27
## 2 2 Hendrik A. Lorentz 1902 Physics Leiden Unive~ Leid~ Nether~ 1853-07-18
## 3 3 Pieter Zeeman 1902 Physics Amsterdam Un~ Amst~ Nether~ 1865-05-25
## 4 4 Henri Becque~ 1903 Physics École Polyte~ Paris France 1852-12-15
## 5 5 Pierre Curie 1903 Physics École munici~ Paris France 1859-05-15
## 6 6 Marie Curie 1903 Physics <NA> <NA> <NA> 1867-11-07
## # ... with 17 more variables: died_date <date>, gender <chr>, born_city <chr>,
## # born_country <chr>, born_country_code <chr>, died_city <chr>,
## # died_country <chr>, died_country_code <chr>, overall_motivation <chr>,
## # share <dbl>, motivation <chr>, born_country_original <chr>,
## # born_city_original <chr>, died_country_original <chr>,
## # died_city_original <chr>, city_original <chr>, country_original <chr>
Write a csv file
df <- tribble(
~x, ~y,
1, "a",
2, "b",
3, "c"
)
write_csv(df, "df.csv")
Dealing with bad variable names
#edibnb_badnames <- read_csv("edibnb-badnames.csv")
#names(edibnb_badnames)
edibnb_col_names <- read_csv("edibnb-badnames.csv",
col_names = c("id", "price",
"neighbourhood", "accommodates",
"bathroom", "bedroom",
"bed", "review_scores_rating",
"n_reviews", "url"))
##
## -- Column specification --------------------------------------------------------
## cols(
## id = col_character(),
## price = col_character(),
## neighbourhood = col_character(),
## accommodates = col_character(),
## bathroom = col_character(),
## bedroom = col_character(),
## bed = col_character(),
## review_scores_rating = col_character(),
## n_reviews = col_character(),
## url = col_character()
## )
Importing data with snake case variables
edibnb_clean_names <- read_csv("edibnb-badnames.csv") %>%
janitor::clean_names()
##
## -- Column specification --------------------------------------------------------
## cols(
## ID = col_double(),
## Price = col_double(),
## neighbourhood = col_character(),
## accommodates = col_double(),
## `Number of bathrooms` = col_double(),
## `Number of Bedrooms` = col_double(),
## `n beds` = col_double(),
## `Review Scores Rating` = col_double(),
## `Number of reviews` = col_double(),
## listing_url = col_character()
## )
Read df_na.csv
read_csv("df-na.csv",
na = c("", "NA", ".", "9999", "Not applicable"))
##
## -- Column specification --------------------------------------------------------
## cols(
## x = col_double(),
## y = col_character(),
## z = col_character()
## )
## # A tibble: 9 x 3
## x y z
## <dbl> <chr> <chr>
## 1 1 a hi
## 2 NA b hello
## 3 3 <NA> <NA>
## 4 4 d ola
## 5 5 e hola
## 6 NA f whatup
## 7 7 g wassup
## 8 8 h sup
## 9 9 i <NA>
Readinf an XLSX file
library(readxl)
fav_food <- read_excel("favourite-food.xlsx",
na = c("N/A", "99999")) %>%
janitor::clean_names()
fav_food
## # A tibble: 5 x 6
## student_id full_name favourite_food meal_plan age ses
## <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 1 Sunil Huffmann Strawberry yoghurt Lunch only 4 High
## 2 2 Barclay Lynn French fries Lunch only 5 Midd~
## 3 3 Jayendra Lyne <NA> Breakfast and lunch 7 Low
## 4 4 Leon Rossini Anchovies Lunch only <NA> Midd~
## 5 5 Chidiegwu Dunkel Pizza Breakfast and lunch five High
fav_food <- fav_food %>%
mutate(
age = if_else(age == "five", "5", age),
age = as.numeric(age)
)
fav_food
## # A tibble: 5 x 6
## student_id full_name favourite_food meal_plan age ses
## <dbl> <chr> <chr> <chr> <dbl> <chr>
## 1 1 Sunil Huffmann Strawberry yoghurt Lunch only 4 High
## 2 2 Barclay Lynn French fries Lunch only 5 Midd~
## 3 3 Jayendra Lyne <NA> Breakfast and lunch 7 Low
## 4 4 Leon Rossini Anchovies Lunch only NA Midd~
## 5 5 Chidiegwu Dunkel Pizza Breakfast and lunch 5 High
glimpse(fav_food)
## Rows: 5
## Columns: 6
## $ student_id <dbl> 1, 2, 3, 4, 5
## $ full_name <chr> "Sunil Huffmann", "Barclay Lynn", "Jayendra Lyne", "Leo~
## $ favourite_food <chr> "Strawberry yoghurt", "French fries", NA, "Anchovies", ~
## $ meal_plan <chr> "Lunch only", "Lunch only", "Breakfast and lunch", "Lun~
## $ age <dbl> 4, 5, 7, NA, 5
## $ ses <chr> "High", "Middle", "Low", "Middle", "High"
ETL of SES variable
fav_food %>%
count(ses)
## # A tibble: 3 x 2
## ses n
## <chr> <int>
## 1 High 2
## 2 Low 1
## 3 Middle 2
fav_food <- fav_food %>%
mutate(ses = fct_relevel(ses, "Low", "Middle", "High"))
fav_food %>%
count(ses)
## # A tibble: 3 x 2
## ses n
## <fct> <int>
## 1 Low 1
## 2 Middle 2
## 3 High 2
fav_food
## # A tibble: 5 x 6
## student_id full_name favourite_food meal_plan age ses
## <dbl> <chr> <chr> <chr> <dbl> <fct>
## 1 1 Sunil Huffmann Strawberry yoghurt Lunch only 4 High
## 2 2 Barclay Lynn French fries Lunch only 5 Midd~
## 3 3 Jayendra Lyne <NA> Breakfast and lunch 7 Low
## 4 4 Leon Rossini Anchovies Lunch only NA Midd~
## 5 5 Chidiegwu Dunkel Pizza Breakfast and lunch 5 High