7 min read

A03 Working with Data in the Tidyverse

Import lego_sales.csv file

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.1     v dplyr   1.0.6
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
lego <- read_csv(file = "lego_sales.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   first_name = col_character(),
##   last_name = col_character(),
##   age = col_double(),
##   phone_number = col_character(),
##   set_id = col_double(),
##   number = col_character(),
##   theme = col_character(),
##   subtheme = col_character(),
##   year = col_double(),
##   name = col_character(),
##   pieces = col_double(),
##   us_price = col_double(),
##   image_url = col_character(),
##   quantity = col_double()
## )
lego <- read_csv(file.choose())
## 
## -- Column specification --------------------------------------------------------
## cols(
##   first_name = col_character(),
##   last_name = col_character(),
##   age = col_double(),
##   phone_number = col_character(),
##   set_id = col_double(),
##   number = col_character(),
##   theme = col_character(),
##   subtheme = col_character(),
##   year = col_double(),
##   name = col_character(),
##   pieces = col_double(),
##   us_price = col_double(),
##   image_url = col_character(),
##   quantity = col_double()
## )
head(lego)
## # A tibble: 6 x 14
##   first_name last_name   age phone_number set_id number theme    subtheme   year
##   <chr>      <chr>     <dbl> <chr>         <dbl> <chr>  <chr>    <chr>     <dbl>
## 1 Kimberly   Beckstead    24 216-555-2549  24701 76062  DC Comi~ Mighty M~  2018
## 2 Neel       Garvin       35 819-555-3189  25626 70595  Ninjago  Rise of ~  2018
## 3 Neel       Garvin       35 819-555-3189  24665 21031  Archite~ <NA>       2018
## 4 Chelsea    Bouchard     41 <NA>          24695 31048  Creator  <NA>       2018
## 5 Chelsea    Bouchard     41 <NA>          25626 70595  Ninjago  Rise of ~  2018
## 6 Chelsea    Bouchard     41 <NA>          24721 10831  Duplo    <NA>       2018
## # ... with 5 more variables: name <chr>, pieces <dbl>, us_price <dbl>,
## #   image_url <chr>, quantity <dbl>

What are the three most common first names of customers?

lego %>%
  count(first_name, sort = TRUE)
## # A tibble: 211 x 2
##    first_name     n
##    <chr>      <int>
##  1 Jackson       13
##  2 Jacob         11
##  3 Joseph        11
##  4 Michael       10
##  5 Audrey         8
##  6 Connor         8
##  7 Kaitlyn        8
##  8 Lucas          8
##  9 Amanda         7
## 10 Joshua         7
## # ... with 201 more rows

Jackson, Jacob, and Joseph are the three most common first names.

What are the three most common themes of lego sets purchased?

lego %>%
  count(theme, sort = TRUE)
## # A tibble: 25 x 2
##    theme            n
##    <chr>        <int>
##  1 Star Wars       75
##  2 Nexo Knights    64
##  3 Gear            55
##  4 Mixels          55
##  5 City            45
##  6 Friends         42
##  7 Ninjago         38
##  8 Duplo           35
##  9 Bionicle        34
## 10 Creator         25
## # ... with 15 more rows

Star Wars, Nexo Knights, and Gear are the three most common themes.

Among the most common theme of lego sets purchased, what is the most common subtheme?

lego %>%
  filter(theme == "Star Wars" | theme == "Nexo Knights" | theme == "Gear") %>%
  count(subtheme, sort = TRUE)
## # A tibble: 27 x 2
##    subtheme              n
##    <chr>             <int>
##  1 <NA>                 42
##  2 Ultimate             22
##  3 The Force Awakens    15
##  4 Buildable Figures    11
##  5 Episode V            10
##  6 MicroFighters        10
##  7 Battlefront           7
##  8 Original Content      7
##  9 Role-Play toys        7
## 10 Stationery            7
## # ... with 17 more rows

Ultimate is the most common subtheme among the most common theme of lego sets.

Create a new variable called age_group and group the ages into the following categories: “18 and under”, “19 - 25”, “26 - 35”, “36 - 50”, “51 and over”. Be sure to save the updated data set so you can use the new variable in other questions.

lego_age <- lego %>%
  mutate(age_group = case_when(between(age, 1, 18) ~ "18 and under", between(age, 19, 25) ~ "19 - 25", between(age, 26, 35) ~ "26 - 35", between(age, 36, 50) ~ "36 - 50", between(age, 51, 68) ~ "51 and over"))
age <- lego_age %>%
  select(first_name, last_name, age_group)
age
## # A tibble: 620 x 3
##    first_name last_name      age_group
##    <chr>      <chr>          <chr>    
##  1 Kimberly   Beckstead      19 - 25  
##  2 Neel       Garvin         26 - 35  
##  3 Neel       Garvin         26 - 35  
##  4 Chelsea    Bouchard       36 - 50  
##  5 Chelsea    Bouchard       36 - 50  
##  6 Chelsea    Bouchard       36 - 50  
##  7 Bryanna    Welsh          19 - 25  
##  8 Bryanna    Welsh          19 - 25  
##  9 Caleb      Garcia-Wideman 36 - 50  
## 10 Caleb      Garcia-Wideman 36 - 50  
## # ... with 610 more rows

What is the probability a randomly selected customer

is in the 19 - 25 age group?

lego_age %>%
count(age_group == "19 - 25")/nrow(lego_age)
##   age_group == "19 - 25"         n
## 1            0.000000000 0.7919355
## 2            0.001612903 0.2080645

Answer: 0.208

is in the 19 - 25 age group and purchased a Duplo theme set?

lego_age %>%
count(age_group == "19 - 25" & theme == "Duplo")/nrow(lego_age)
##   age_group == "19 - 25" & theme == "Duplo"          n
## 1                               0.000000000 0.98548387
## 2                               0.001612903 0.01451613

Answer: 0.0145

is in the 19 - 25 age group given they purchased a Duplo theme set?

lego_age_d <- lego_age %>%
  filter(theme == "Duplo") 
lego_age %>%
filter(theme == "Duplo") %>%
count(age_group == "19 - 25")/nrow(lego_age_d)
##   age_group == "19 - 25"         n
## 1             0.00000000 0.7428571
## 2             0.02857143 0.2571429

Answer: 0.257

Which age group has purchased the largest number of lego sets? How many did they purchase?

lego_age_q <- lego_age %>%
  group_by(age_group) %>%
 summarize(s = sum(quantity))

lego_age_q
## # A tibble: 5 x 2
##   age_group        s
##   <chr>        <dbl>
## 1 18 and under    45
## 2 19 - 25        174
## 3 26 - 35        267
## 4 36 - 50        313
## 5 51 and over     92

Age group “36 - 50” has purchased the largest number of lego sets which is 313.

Which age group has spent the most money on legos? How much did they spend?

lego_age_spend <- lego_age %>%
  mutate(spend = us_price * quantity) %>%
  arrange(desc(spend))
lego_age_spend
## # A tibble: 620 x 16
##    first_name last_name    age phone_number set_id number theme  subtheme   year
##    <chr>      <chr>      <dbl> <chr>         <dbl> <chr>  <chr>  <chr>     <dbl>
##  1 Jennifer   Dana          26 956-555-9719  25597 75827  Ghost~ <NA>       2018
##  2 Cecilia    Mauro         50 <NA>          25921 75157  Star ~ Rebels     2018
##  3 Kaitlyn    Degeer        24 316-555-7890  24994 10251  Advan~ Modular ~  2018
##  4 Elias      Garrett       50 832-555-9165  24915 60130  City   Police     2018
##  5 Hector     Collins       20 340-555-2508  24627 75098  Star ~ Ultimate~  2018
##  6 Nerea      Feldman       28 423-555-9631  25921 75157  Star ~ Rebels     2018
##  7 Carolyn    Quarry        23 567-555-7649  25627 70596  Ninja~ Rise of ~  2018
##  8 Cody       Gonzales      44 973-555-3236  25922 75149  Star ~ The Forc~  2018
##  9 Caleb      Garcia-Wi~    37 907-555-9236  25611 21127  Minec~ Minifig-~  2018
## 10 Melanie    Rollert       43 <NA>          26026 41179  Elves  <NA>       2018
## # ... with 610 more rows, and 7 more variables: name <chr>, pieces <dbl>,
## #   us_price <dbl>, image_url <chr>, quantity <dbl>, age_group <chr>,
## #   spend <dbl>

26 - 35 age group has spent the most money on legos, $699.98.

Come up with a question you want to answer using this data, and write it down. Then, create a data visualization that answers the question, and briefly explain how your visualization answers the question.

Which age group has spend the most on Star Wars theme?

lego_age_s <- lego_age_spend %>%
  filter(theme == "Star Wars")

  ggplot(lego_age_s, aes(age_group, spend)) +
  geom_col(fill = "red")

Age group 26-35 has purchased the most pieces of lego sets, almost 8000 pieces of star wars theme.

Add one element to the plot from the previous exercise to change the look of the plot without changing the underlying data. For example, you can change the theme, background color, add annotations, etc. State the change you’re making and display the updated visualization. We encourage you to be creative!

  lego_age_s <- lego_age_spend %>%
  filter(theme == "Star Wars")

  ggplot(lego_age_s, aes(age_group, spend)) +
  geom_col(alpha = 0.8, fill = "lightgreen") +
  theme_minimal() +
  labs(
    x = "Age group", y = "Spend", 
    title = "Total puchase amount of starwars theme purchased by each age group" 
        )

I added minimal theme and labels for title, x axis, and y axis.