8 min read

Data Wrangling using Tidyverse

Input the Hotels.csv file

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.1     v dplyr   1.0.6
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
hotels <- read_csv("hotels.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   hotel = col_character(),
##   arrival_date_month = col_character(),
##   meal = col_character(),
##   country = col_character(),
##   market_segment = col_character(),
##   distribution_channel = col_character(),
##   reserved_room_type = col_character(),
##   assigned_room_type = col_character(),
##   deposit_type = col_character(),
##   agent = col_character(),
##   company = col_character(),
##   customer_type = col_character(),
##   reservation_status = col_character(),
##   reservation_status_date = col_date(format = "")
## )
## i Use `spec()` for the full column specifications.
hotels <- read_csv(file.choose())
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   hotel = col_character(),
##   arrival_date_month = col_character(),
##   meal = col_character(),
##   country = col_character(),
##   market_segment = col_character(),
##   distribution_channel = col_character(),
##   reserved_room_type = col_character(),
##   assigned_room_type = col_character(),
##   deposit_type = col_character(),
##   agent = col_character(),
##   company = col_character(),
##   customer_type = col_character(),
##   reservation_status = col_character(),
##   reservation_status_date = col_date(format = "")
## )
## i Use `spec()` for the full column specifications.
head(hotels)
## # A tibble: 6 x 32
##   hotel is_canceled lead_time arrival_date_ye~ arrival_date_mo~ arrival_date_we~
##   <chr>       <dbl>     <dbl>            <dbl> <chr>                       <dbl>
## 1 Reso~           0       342             2015 July                           27
## 2 Reso~           0       737             2015 July                           27
## 3 Reso~           0         7             2015 July                           27
## 4 Reso~           0        13             2015 July                           27
## 5 Reso~           0        14             2015 July                           27
## 6 Reso~           0        14             2015 July                           27
## # ... with 26 more variables: arrival_date_day_of_month <dbl>,
## #   stays_in_weekend_nights <dbl>, stays_in_week_nights <dbl>, adults <dbl>,
## #   children <dbl>, babies <dbl>, meal <chr>, country <chr>,
## #   market_segment <chr>, distribution_channel <chr>, is_repeated_guest <dbl>,
## #   previous_cancellations <dbl>, previous_bookings_not_canceled <dbl>,
## #   reserved_room_type <chr>, assigned_room_type <chr>, booking_changes <dbl>,
## #   deposit_type <chr>, agent <chr>, company <chr>, days_in_waiting_list <dbl>,
## #   customer_type <chr>, adr <dbl>, required_car_parking_spaces <dbl>,
## #   total_of_special_requests <dbl>, reservation_status <chr>,
## #   reservation_status_date <date>
names(hotels)
##  [1] "hotel"                          "is_canceled"                   
##  [3] "lead_time"                      "arrival_date_year"             
##  [5] "arrival_date_month"             "arrival_date_week_number"      
##  [7] "arrival_date_day_of_month"      "stays_in_weekend_nights"       
##  [9] "stays_in_week_nights"           "adults"                        
## [11] "children"                       "babies"                        
## [13] "meal"                           "country"                       
## [15] "market_segment"                 "distribution_channel"          
## [17] "is_repeated_guest"              "previous_cancellations"        
## [19] "previous_bookings_not_canceled" "reserved_room_type"            
## [21] "assigned_room_type"             "booking_changes"               
## [23] "deposit_type"                   "agent"                         
## [25] "company"                        "days_in_waiting_list"          
## [27] "customer_type"                  "adr"                           
## [29] "required_car_parking_spaces"    "total_of_special_requests"     
## [31] "reservation_status"             "reservation_status_date"

Select a variable

select(hotels, hotel, lead_time)
## # A tibble: 119,390 x 2
##    hotel        lead_time
##    <chr>            <dbl>
##  1 Resort Hotel       342
##  2 Resort Hotel       737
##  3 Resort Hotel         7
##  4 Resort Hotel        13
##  5 Resort Hotel        14
##  6 Resort Hotel        14
##  7 Resort Hotel         0
##  8 Resort Hotel         9
##  9 Resort Hotel        85
## 10 Resort Hotel        75
## # ... with 119,380 more rows
hotels %>%
  select(hotel, lead_time) %>%
  arrange(desc(lead_time))
## # A tibble: 119,390 x 2
##    hotel        lead_time
##    <chr>            <dbl>
##  1 Resort Hotel       737
##  2 Resort Hotel       709
##  3 City Hotel         629
##  4 City Hotel         629
##  5 City Hotel         629
##  6 City Hotel         629
##  7 City Hotel         629
##  8 City Hotel         629
##  9 City Hotel         629
## 10 City Hotel         629
## # ... with 119,380 more rows

Select a range of variable

hotels %>%
  select(hotel:arrival_date_month)
## # A tibble: 119,390 x 5
##    hotel        is_canceled lead_time arrival_date_year arrival_date_month
##    <chr>              <dbl>     <dbl>             <dbl> <chr>             
##  1 Resort Hotel           0       342              2015 July              
##  2 Resort Hotel           0       737              2015 July              
##  3 Resort Hotel           0         7              2015 July              
##  4 Resort Hotel           0        13              2015 July              
##  5 Resort Hotel           0        14              2015 July              
##  6 Resort Hotel           0        14              2015 July              
##  7 Resort Hotel           0         0              2015 July              
##  8 Resort Hotel           0         9              2015 July              
##  9 Resort Hotel           1        85              2015 July              
## 10 Resort Hotel           1        75              2015 July              
## # ... with 119,380 more rows
hotels %>%
  select(starts_with("arrival"))
## # A tibble: 119,390 x 4
##    arrival_date_year arrival_date_mon~ arrival_date_week_n~ arrival_date_day_of~
##                <dbl> <chr>                            <dbl>                <dbl>
##  1              2015 July                                27                    1
##  2              2015 July                                27                    1
##  3              2015 July                                27                    1
##  4              2015 July                                27                    1
##  5              2015 July                                27                    1
##  6              2015 July                                27                    1
##  7              2015 July                                27                    1
##  8              2015 July                                27                    1
##  9              2015 July                                27                    1
## 10              2015 July                                27                    1
## # ... with 119,380 more rows

Select a range of rows / cases

hotels %>%
  #We will select 1st, 5th, 8th and 10th row
  slice(c(1, 5, 8, 10))
## # A tibble: 4 x 32
##   hotel is_canceled lead_time arrival_date_ye~ arrival_date_mo~ arrival_date_we~
##   <chr>       <dbl>     <dbl>            <dbl> <chr>                       <dbl>
## 1 Reso~           0       342             2015 July                           27
## 2 Reso~           0        14             2015 July                           27
## 3 Reso~           0         9             2015 July                           27
## 4 Reso~           1        75             2015 July                           27
## # ... with 26 more variables: arrival_date_day_of_month <dbl>,
## #   stays_in_weekend_nights <dbl>, stays_in_week_nights <dbl>, adults <dbl>,
## #   children <dbl>, babies <dbl>, meal <chr>, country <chr>,
## #   market_segment <chr>, distribution_channel <chr>, is_repeated_guest <dbl>,
## #   previous_cancellations <dbl>, previous_bookings_not_canceled <dbl>,
## #   reserved_room_type <chr>, assigned_room_type <chr>, booking_changes <dbl>,
## #   deposit_type <chr>, agent <chr>, company <chr>, days_in_waiting_list <dbl>,
## #   customer_type <chr>, adr <dbl>, required_car_parking_spaces <dbl>,
## #   total_of_special_requests <dbl>, reservation_status <chr>,
## #   reservation_status_date <date>
hotels %>%
  filter(hotel == "City Hotel")
## # A tibble: 79,330 x 32
##    hotel      is_canceled lead_time arrival_date_year arrival_date_month
##    <chr>            <dbl>     <dbl>             <dbl> <chr>             
##  1 City Hotel           0         6              2015 July              
##  2 City Hotel           1        88              2015 July              
##  3 City Hotel           1        65              2015 July              
##  4 City Hotel           1        92              2015 July              
##  5 City Hotel           1       100              2015 July              
##  6 City Hotel           1        79              2015 July              
##  7 City Hotel           0         3              2015 July              
##  8 City Hotel           1        63              2015 July              
##  9 City Hotel           1        62              2015 July              
## 10 City Hotel           1        62              2015 July              
## # ... with 79,320 more rows, and 27 more variables:
## #   arrival_date_week_number <dbl>, arrival_date_day_of_month <dbl>,
## #   stays_in_weekend_nights <dbl>, stays_in_week_nights <dbl>, adults <dbl>,
## #   children <dbl>, babies <dbl>, meal <chr>, country <chr>,
## #   market_segment <chr>, distribution_channel <chr>, is_repeated_guest <dbl>,
## #   previous_cancellations <dbl>, previous_bookings_not_canceled <dbl>,
## #   reserved_room_type <chr>, assigned_room_type <chr>, booking_changes <dbl>,
## #   deposit_type <chr>, agent <chr>, company <chr>, days_in_waiting_list <dbl>,
## #   customer_type <chr>, adr <dbl>, required_car_parking_spaces <dbl>,
## #   total_of_special_requests <dbl>, reservation_status <chr>,
## #   reservation_status_date <date>
hotels %>%
  filter( 
    adults == 0,
    children >= 1
    ) %>% 
  select(adults, babies, children)
## # A tibble: 223 x 3
##    adults babies children
##     <dbl>  <dbl>    <dbl>
##  1      0      0        3
##  2      0      0        2
##  3      0      0        2
##  4      0      0        2
##  5      0      0        2
##  6      0      0        3
##  7      0      1        2
##  8      0      0        2
##  9      0      0        2
## 10      0      0        2
## # ... with 213 more rows
# bookings with no adults and some children or babies in the room
hotels %>%
  filter( 
    adults == 0,     
    children >= 1 | babies >= 1     # | means or
    ) %>%
  select(adults, babies, children)
## # A tibble: 223 x 3
##    adults babies children
##     <dbl>  <dbl>    <dbl>
##  1      0      0        3
##  2      0      0        2
##  3      0      0        2
##  4      0      0        2
##  5      0      0        2
##  6      0      0        3
##  7      0      1        2
##  8      0      0        2
##  9      0      0        2
## 10      0      0        2
## # ... with 213 more rows

Summarizing

hotels %>%
  count(market_segment)
## # A tibble: 8 x 2
##   market_segment     n
##   <chr>          <int>
## 1 Aviation         237
## 2 Complementary    743
## 3 Corporate       5295
## 4 Direct         12606
## 5 Groups         19811
## 6 Offline TA/TO  24219
## 7 Online TA      56477
## 8 Undefined          2
hotels %>%
  count(hotel, market_segment)
## # A tibble: 14 x 3
##    hotel        market_segment     n
##    <chr>        <chr>          <int>
##  1 City Hotel   Aviation         237
##  2 City Hotel   Complementary    542
##  3 City Hotel   Corporate       2986
##  4 City Hotel   Direct          6093
##  5 City Hotel   Groups         13975
##  6 City Hotel   Offline TA/TO  16747
##  7 City Hotel   Online TA      38748
##  8 City Hotel   Undefined          2
##  9 Resort Hotel Complementary    201
## 10 Resort Hotel Corporate       2309
## 11 Resort Hotel Direct          6513
## 12 Resort Hotel Groups          5836
## 13 Resort Hotel Offline TA/TO   7472
## 14 Resort Hotel Online TA      17729

Mutation

hotels %>%
  mutate(little_ones = children + babies) %>%
  select(children, babies, little_ones) %>%
  arrange(desc(little_ones))
## # A tibble: 119,390 x 3
##    children babies little_ones
##       <dbl>  <dbl>       <dbl>
##  1       10      0          10
##  2        0     10          10
##  3        0      9           9
##  4        2      1           3
##  5        2      1           3
##  6        2      1           3
##  7        3      0           3
##  8        2      1           3
##  9        2      1           3
## 10        3      0           3
## # ... with 119,380 more rows
little_ones <- hotels
little_ones %<>%
  mutate(little_ones = children + babies) %>%
  select(children, babies, little_ones) %>%
  arrange(desc(little_ones))

Summary

hotels %>%
  group_by(hotel) %>%
  summarise(mean_adr = mean(adr))
## # A tibble: 2 x 2
##   hotel        mean_adr
##   <chr>           <dbl>
## 1 City Hotel      105. 
## 2 Resort Hotel     95.0
summary <- hotels %>%
group_by(hotel) %>%
  summarise(mean_adr = mean(adr))
summary
## # A tibble: 2 x 2
##   hotel        mean_adr
##   <chr>           <dbl>
## 1 City Hotel      105. 
## 2 Resort Hotel     95.0