Input the Hotels.csv file
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.1 v dplyr 1.0.6
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
hotels <- read_csv("hotels.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## hotel = col_character(),
## arrival_date_month = col_character(),
## meal = col_character(),
## country = col_character(),
## market_segment = col_character(),
## distribution_channel = col_character(),
## reserved_room_type = col_character(),
## assigned_room_type = col_character(),
## deposit_type = col_character(),
## agent = col_character(),
## company = col_character(),
## customer_type = col_character(),
## reservation_status = col_character(),
## reservation_status_date = col_date(format = "")
## )
## i Use `spec()` for the full column specifications.
hotels <- read_csv(file.choose())
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## hotel = col_character(),
## arrival_date_month = col_character(),
## meal = col_character(),
## country = col_character(),
## market_segment = col_character(),
## distribution_channel = col_character(),
## reserved_room_type = col_character(),
## assigned_room_type = col_character(),
## deposit_type = col_character(),
## agent = col_character(),
## company = col_character(),
## customer_type = col_character(),
## reservation_status = col_character(),
## reservation_status_date = col_date(format = "")
## )
## i Use `spec()` for the full column specifications.
head(hotels)
## # A tibble: 6 x 32
## hotel is_canceled lead_time arrival_date_ye~ arrival_date_mo~ arrival_date_we~
## <chr> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 Reso~ 0 342 2015 July 27
## 2 Reso~ 0 737 2015 July 27
## 3 Reso~ 0 7 2015 July 27
## 4 Reso~ 0 13 2015 July 27
## 5 Reso~ 0 14 2015 July 27
## 6 Reso~ 0 14 2015 July 27
## # ... with 26 more variables: arrival_date_day_of_month <dbl>,
## # stays_in_weekend_nights <dbl>, stays_in_week_nights <dbl>, adults <dbl>,
## # children <dbl>, babies <dbl>, meal <chr>, country <chr>,
## # market_segment <chr>, distribution_channel <chr>, is_repeated_guest <dbl>,
## # previous_cancellations <dbl>, previous_bookings_not_canceled <dbl>,
## # reserved_room_type <chr>, assigned_room_type <chr>, booking_changes <dbl>,
## # deposit_type <chr>, agent <chr>, company <chr>, days_in_waiting_list <dbl>,
## # customer_type <chr>, adr <dbl>, required_car_parking_spaces <dbl>,
## # total_of_special_requests <dbl>, reservation_status <chr>,
## # reservation_status_date <date>
names(hotels)
## [1] "hotel" "is_canceled"
## [3] "lead_time" "arrival_date_year"
## [5] "arrival_date_month" "arrival_date_week_number"
## [7] "arrival_date_day_of_month" "stays_in_weekend_nights"
## [9] "stays_in_week_nights" "adults"
## [11] "children" "babies"
## [13] "meal" "country"
## [15] "market_segment" "distribution_channel"
## [17] "is_repeated_guest" "previous_cancellations"
## [19] "previous_bookings_not_canceled" "reserved_room_type"
## [21] "assigned_room_type" "booking_changes"
## [23] "deposit_type" "agent"
## [25] "company" "days_in_waiting_list"
## [27] "customer_type" "adr"
## [29] "required_car_parking_spaces" "total_of_special_requests"
## [31] "reservation_status" "reservation_status_date"
Select a variable
select(hotels, hotel, lead_time)
## # A tibble: 119,390 x 2
## hotel lead_time
## <chr> <dbl>
## 1 Resort Hotel 342
## 2 Resort Hotel 737
## 3 Resort Hotel 7
## 4 Resort Hotel 13
## 5 Resort Hotel 14
## 6 Resort Hotel 14
## 7 Resort Hotel 0
## 8 Resort Hotel 9
## 9 Resort Hotel 85
## 10 Resort Hotel 75
## # ... with 119,380 more rows
hotels %>%
select(hotel, lead_time) %>%
arrange(desc(lead_time))
## # A tibble: 119,390 x 2
## hotel lead_time
## <chr> <dbl>
## 1 Resort Hotel 737
## 2 Resort Hotel 709
## 3 City Hotel 629
## 4 City Hotel 629
## 5 City Hotel 629
## 6 City Hotel 629
## 7 City Hotel 629
## 8 City Hotel 629
## 9 City Hotel 629
## 10 City Hotel 629
## # ... with 119,380 more rows
Select a range of variable
hotels %>%
select(hotel:arrival_date_month)
## # A tibble: 119,390 x 5
## hotel is_canceled lead_time arrival_date_year arrival_date_month
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 Resort Hotel 0 342 2015 July
## 2 Resort Hotel 0 737 2015 July
## 3 Resort Hotel 0 7 2015 July
## 4 Resort Hotel 0 13 2015 July
## 5 Resort Hotel 0 14 2015 July
## 6 Resort Hotel 0 14 2015 July
## 7 Resort Hotel 0 0 2015 July
## 8 Resort Hotel 0 9 2015 July
## 9 Resort Hotel 1 85 2015 July
## 10 Resort Hotel 1 75 2015 July
## # ... with 119,380 more rows
hotels %>%
select(starts_with("arrival"))
## # A tibble: 119,390 x 4
## arrival_date_year arrival_date_mon~ arrival_date_week_n~ arrival_date_day_of~
## <dbl> <chr> <dbl> <dbl>
## 1 2015 July 27 1
## 2 2015 July 27 1
## 3 2015 July 27 1
## 4 2015 July 27 1
## 5 2015 July 27 1
## 6 2015 July 27 1
## 7 2015 July 27 1
## 8 2015 July 27 1
## 9 2015 July 27 1
## 10 2015 July 27 1
## # ... with 119,380 more rows
Select a range of rows / cases
hotels %>%
#We will select 1st, 5th, 8th and 10th row
slice(c(1, 5, 8, 10))
## # A tibble: 4 x 32
## hotel is_canceled lead_time arrival_date_ye~ arrival_date_mo~ arrival_date_we~
## <chr> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 Reso~ 0 342 2015 July 27
## 2 Reso~ 0 14 2015 July 27
## 3 Reso~ 0 9 2015 July 27
## 4 Reso~ 1 75 2015 July 27
## # ... with 26 more variables: arrival_date_day_of_month <dbl>,
## # stays_in_weekend_nights <dbl>, stays_in_week_nights <dbl>, adults <dbl>,
## # children <dbl>, babies <dbl>, meal <chr>, country <chr>,
## # market_segment <chr>, distribution_channel <chr>, is_repeated_guest <dbl>,
## # previous_cancellations <dbl>, previous_bookings_not_canceled <dbl>,
## # reserved_room_type <chr>, assigned_room_type <chr>, booking_changes <dbl>,
## # deposit_type <chr>, agent <chr>, company <chr>, days_in_waiting_list <dbl>,
## # customer_type <chr>, adr <dbl>, required_car_parking_spaces <dbl>,
## # total_of_special_requests <dbl>, reservation_status <chr>,
## # reservation_status_date <date>
hotels %>%
filter(hotel == "City Hotel")
## # A tibble: 79,330 x 32
## hotel is_canceled lead_time arrival_date_year arrival_date_month
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 City Hotel 0 6 2015 July
## 2 City Hotel 1 88 2015 July
## 3 City Hotel 1 65 2015 July
## 4 City Hotel 1 92 2015 July
## 5 City Hotel 1 100 2015 July
## 6 City Hotel 1 79 2015 July
## 7 City Hotel 0 3 2015 July
## 8 City Hotel 1 63 2015 July
## 9 City Hotel 1 62 2015 July
## 10 City Hotel 1 62 2015 July
## # ... with 79,320 more rows, and 27 more variables:
## # arrival_date_week_number <dbl>, arrival_date_day_of_month <dbl>,
## # stays_in_weekend_nights <dbl>, stays_in_week_nights <dbl>, adults <dbl>,
## # children <dbl>, babies <dbl>, meal <chr>, country <chr>,
## # market_segment <chr>, distribution_channel <chr>, is_repeated_guest <dbl>,
## # previous_cancellations <dbl>, previous_bookings_not_canceled <dbl>,
## # reserved_room_type <chr>, assigned_room_type <chr>, booking_changes <dbl>,
## # deposit_type <chr>, agent <chr>, company <chr>, days_in_waiting_list <dbl>,
## # customer_type <chr>, adr <dbl>, required_car_parking_spaces <dbl>,
## # total_of_special_requests <dbl>, reservation_status <chr>,
## # reservation_status_date <date>
hotels %>%
filter(
adults == 0,
children >= 1
) %>%
select(adults, babies, children)
## # A tibble: 223 x 3
## adults babies children
## <dbl> <dbl> <dbl>
## 1 0 0 3
## 2 0 0 2
## 3 0 0 2
## 4 0 0 2
## 5 0 0 2
## 6 0 0 3
## 7 0 1 2
## 8 0 0 2
## 9 0 0 2
## 10 0 0 2
## # ... with 213 more rows
# bookings with no adults and some children or babies in the room
hotels %>%
filter(
adults == 0,
children >= 1 | babies >= 1 # | means or
) %>%
select(adults, babies, children)
## # A tibble: 223 x 3
## adults babies children
## <dbl> <dbl> <dbl>
## 1 0 0 3
## 2 0 0 2
## 3 0 0 2
## 4 0 0 2
## 5 0 0 2
## 6 0 0 3
## 7 0 1 2
## 8 0 0 2
## 9 0 0 2
## 10 0 0 2
## # ... with 213 more rows
Summarizing
hotels %>%
count(market_segment)
## # A tibble: 8 x 2
## market_segment n
## <chr> <int>
## 1 Aviation 237
## 2 Complementary 743
## 3 Corporate 5295
## 4 Direct 12606
## 5 Groups 19811
## 6 Offline TA/TO 24219
## 7 Online TA 56477
## 8 Undefined 2
hotels %>%
count(hotel, market_segment)
## # A tibble: 14 x 3
## hotel market_segment n
## <chr> <chr> <int>
## 1 City Hotel Aviation 237
## 2 City Hotel Complementary 542
## 3 City Hotel Corporate 2986
## 4 City Hotel Direct 6093
## 5 City Hotel Groups 13975
## 6 City Hotel Offline TA/TO 16747
## 7 City Hotel Online TA 38748
## 8 City Hotel Undefined 2
## 9 Resort Hotel Complementary 201
## 10 Resort Hotel Corporate 2309
## 11 Resort Hotel Direct 6513
## 12 Resort Hotel Groups 5836
## 13 Resort Hotel Offline TA/TO 7472
## 14 Resort Hotel Online TA 17729
Mutation
hotels %>%
mutate(little_ones = children + babies) %>%
select(children, babies, little_ones) %>%
arrange(desc(little_ones))
## # A tibble: 119,390 x 3
## children babies little_ones
## <dbl> <dbl> <dbl>
## 1 10 0 10
## 2 0 10 10
## 3 0 9 9
## 4 2 1 3
## 5 2 1 3
## 6 2 1 3
## 7 3 0 3
## 8 2 1 3
## 9 2 1 3
## 10 3 0 3
## # ... with 119,380 more rows
little_ones <- hotels
little_ones %<>%
mutate(little_ones = children + babies) %>%
select(children, babies, little_ones) %>%
arrange(desc(little_ones))
Summary
hotels %>%
group_by(hotel) %>%
summarise(mean_adr = mean(adr))
## # A tibble: 2 x 2
## hotel mean_adr
## <chr> <dbl>
## 1 City Hotel 105.
## 2 Resort Hotel 95.0
summary <- hotels %>%
group_by(hotel) %>%
summarise(mean_adr = mean(adr))
summary
## # A tibble: 2 x 2
## hotel mean_adr
## <chr> <dbl>
## 1 City Hotel 105.
## 2 Resort Hotel 95.0