library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.1 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
##
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
##
## The following object is masked from 'package:purrr':
##
## compact
library(doParallel)
## Loading required package: foreach
##
## Attaching package: 'foreach'
##
## The following objects are masked from 'package:purrr':
##
## accumulate, when
##
## Loading required package: iterators
## Loading required package: parallel
library(dplyr)
library(grid)
library(maps)
##
## Attaching package: 'maps'
##
## The following object is masked from 'package:plyr':
##
## ozone
##
## The following object is masked from 'package:purrr':
##
## map
library(sf)
## Linking to GEOS 3.9.3, GDAL 3.5.2, PROJ 8.2.1; sf_use_s2() is TRUE
library(spData)
## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source')`
library(gridExtra)
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
library(maps)
library(corrplot)
## corrplot 0.92 loaded
library(viridis)
## Loading required package: viridisLite
##
## Attaching package: 'viridis'
##
## The following object is masked from 'package:maps':
##
## unemp
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
##
## The following object is masked from 'package:ggplot2':
##
## annotate
library(Metrics)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:gridExtra':
##
## combine
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(neuralnet)
##
## Attaching package: 'neuralnet'
##
## The following object is masked from 'package:dplyr':
##
## compute
library(wordcloud)
## Loading required package: RColorBrewer
library(rpart.plot)
## Loading required package: rpart
list.files(path = "../input")
## character(0)
Load the data
states <- map_data("state")
df <- read.csv("airbnb-listings.csv", sep = ";", header = T)
Let’s limnit our analyze region to only United States
df <- df|>
filter(Country == "United States")
#mutate(price_per_night = round(Price/Minimum.Nights))|>
#head(df,10)
tail(df, 2)
## ID Listing.Url Scrape.ID Last.Scraped
## 134544 927607 https://www.airbnb.com/rooms/927607 20170306202425 2017-03-07
## 134545 1716439 https://www.airbnb.com/rooms/1716439 20170306202425 2017-03-07
## Name
## 134544 Modern Home near Downtown and SXSW venues
## 134545 Vintage Travis Hieghts Bungalow
## Summary
## 134544 Gorgeous sunny house, feels like a retreat in the woods, 5-10 minutes to Austin's best restaurants, bars, live music venues, the convention center, Botanical Gardens, Long Center and 2 min to hike and bike trail and famous Barton Springs pool.
## 134545 Fabulous 2 bedroom bungalow with downtown views. Perched overlooking downtown in Austin's premier neighborhood. Walking distance to abundant shopping and restaurants on South Congress Avenue. Lady Bird Lake and downtown also just a short walk away from this fabulous location. Upscale finish out with beautifully appointed furnishings.
## Space
## 134544 Great vacation or festival house that features amazing proximity to downtown while feeling like a retreat in the woods. Within 5-10 minutes to Austin's best restaurants, bars, live music venues, festivals like ACL and SXSW, the convention center, the Botanical Gardens, the flagship Whole Foods, Zilker Park, Barton Springs Pool, Zach Theatre and the Long Center, Broken Spoke Dance Hall, and two minutes to the hike and bike trail which takes you downtown and all over Austin. The house is over 2000 sq ft. and has 3 large bedrooms plus an office, The master has a queen, the second bedroom has a twin, and the third has an American Leather brand queen sleeper sofa with the highest quality foam. Both master and second bedroom mattresses are organic. The living room couch has a ratchet system that allows it to lie flat as a double bed. There is also a queen and single air mattress. Possible to sleep 8 in a pinch. There are two full bathrooms. Master has separate bath and shower. There is also
## 134545 This home is outfitted with beautiful furninshings, high end finish out, and is located in Austin's best area. Walkable to South Congress shops, restaurants, and night life as well as the Lady Bird Trail.
## Description
## 134544 Gorgeous sunny house, feels like a retreat in the woods, 5-10 minutes to Austin's best restaurants, bars, live music venues, the convention center, Botanical Gardens, Long Center and 2 min to hike and bike trail and famous Barton Springs pool. Great vacation or festival house that features amazing proximity to downtown while feeling like a retreat in the woods. Within 5-10 minutes to Austin's best restaurants, bars, live music venues, festivals like ACL and SXSW, the convention center, the Botanical Gardens, the flagship Whole Foods, Zilker Park, Barton Springs Pool, Zach Theatre and the Long Center, Broken Spoke Dance Hall, and two minutes to the hike and bike trail which takes you downtown and all over Austin. The house is over 2000 sq ft. and has 3 large bedrooms plus an office, The master has a queen, the second bedroom has a twin, and the third has an American Leather brand queen sleeper sofa with the highest quality foam. Both master and second bedroom mattresses are organic. The
## 134545 Fabulous 2 bedroom bungalow with downtown views. Perched overlooking downtown in Austin's premier neighborhood. Walking distance to abundant shopping and restaurants on South Congress Avenue. Lady Bird Lake and downtown also just a short walk away from this fabulous location. Upscale finish out with beautifully appointed furnishings. This home is outfitted with beautiful furninshings, high end finish out, and is located in Austin's best area. Walkable to South Congress shops, restaurants, and night life as well as the Lady Bird Trail. Guests will have access to full house, wifi, and satellite TV. Home also boasts flagstone patio and covered porch. Guests will have entire home. We can meet them for key exchange or leave instructions for them when they arrive. Travis Height's is one of Austin's premiere neighborhoods. Its a great mix of hidden bungalows and historic mansions. The location is what drives people here... with the best walkability in town.
## Experiences.Offered
## 134544 none
## 134545 none
## Neighborhood.Overview
## 134544 It is quiet, peaceful, and very safe, close to the trailheads and surrounded by woods, and a stone's throw from all the action of downtown.
## 134545 Travis Height's is one of Austin's premiere neighborhoods. Its a great mix of hidden bungalows and historic mansions. The location is what drives people here... with the best walkability in town.
## Notes
## 134544 Special summer 2-month price available--please inquire. House was modified to be wheelchair friendly so there is a ramp to front door and no stairs anywhere--ideal for little children or those with mobility issues.
## 134545
## Transit
## 134544 There is a bus stop about 3/4 mile from house.
## 134545
## Access
## 134544 Large trampoline (waiver must be signed), monkey bars, chin-up bars
## 134545 Guests will have access to full house, wifi, and satellite TV. Home also boasts flagstone patio and covered porch.
## Interaction
## 134544 We are available by phone and text to answer any questions. We are also happy to welcome you personally and walk you through the property.
## 134545 Guests will have entire home. We can meet them for key exchange or leave instructions for them when they arrive.
## House.Rules
## 134544 Please try to leave the place the way it was when you arrived. We will provide a cleaning service after you leave. You can use the deck for socializing, but no loud parties are allowed on the premises in order to respect the neighbors. We prefer no shoes in the house please. Other Rules: No Drugs on the premises No Smoking on the premises. Guests are not allowed to use the trampoline or monkey bars, if they do it is at their own risk. If you do decide to use the trampoline at your own risk, manufacturer's instructions are only one adult on the equipment at a time. No Back Flips. Garbage should be taken out to the curb on Monday Night. Try to recycle your used bottles, cans, plastic containers and newspapers. If you use the bikes, you must wear a helmet and lock the bike.
## 134545
## Thumbnail.Url Medium.Url
## 134544
## 134545
## Picture.Url
## 134544 https://public.opendatasoft.com/api/v2/catalog/datasets/airbnb-listings/files/18b58deccfc4d9f6fba97bed64e617ea
## 134545 https://public.opendatasoft.com/api/v2/catalog/datasets/airbnb-listings/files/bab47474c20ba6981894ffaf74da095a
## XL.Picture.Url Host.ID Host.URL
## 134544 719332 https://www.airbnb.com/users/show/719332
## 134545 8690771 https://www.airbnb.com/users/show/8690771
## Host.Name Host.Since Host.Location
## 134544 Mike + Nina 2011-06-19 Austin, Texas, United States
## 134545 Cathy 2013-09-07 Austin, Texas, United States
## Host.About
## 134544 My wife Nina and I like to bike on the hike and bike trail--starts just minutes from our house-- with our son in tow on his tagalong. Our house is convenient for biking to Zilker park and all its festivals, Zach theatre and the Long Center, and both spring fed pools (Deep Eddy and Barton Springs), and to all the shops and restaurants downtown, so we plan some activity out almost every weekend. We also hike through the Greenbelt, and tube on the creek (5 minute walk) whenever there's water. We love to barbecue and enjoy dinners on the deck. And we grow our own vegetables in a garden plot on the side of the house. Nina is a family photographer so spends a lot of time photographing kids in the parks of Austin, and I'm usually at home programming. We are passionate adventure travelers and want to see the world but always seem to end up on the East Coast where our families live. We will do everything possible to make your stay wonderful and to provide you with every resource and amenity we (or you!) can think of. We're here for you!
## 134545
## Host.Response.Time Host.Response.Rate Host.Acceptance.Rate
## 134544 within a day 100
## 134545 NA
## Host.Thumbnail.Url
## 134544 https://a0.muscache.com/im/users/719332/profile_pic/1308543387/original.jpg?aki_policy=profile_small
## 134545 https://a0.muscache.com/im/users/8690771/profile_pic/1378581847/original.jpg?aki_policy=profile_small
## Host.Picture.Url
## 134544 https://a0.muscache.com/im/users/719332/profile_pic/1308543387/original.jpg?aki_policy=profile_x_medium
## 134545 https://a0.muscache.com/im/users/8690771/profile_pic/1378581847/original.jpg?aki_policy=profile_x_medium
## Host.Neighbourhood Host.Listings.Count Host.Total.Listings.Count
## 134544 Barton Hills 1 1
## 134545 Travis Heights 1 1
## Host.Verifications
## 134544 email,phone,facebook,reviews,kba
## 134545 email,phone,kba
## Street Neighbourhood
## 134544 Barton Hills Drive, Austin, TX 78704, United States Barton Hills
## 134545 Sunny Lane, Austin, TX 78704, United States Travis Heights
## Neighbourhood.Cleansed Neighbourhood.Group.Cleansed City State Zipcode
## 134544 78704 Austin TX 78704
## 134545 78704 Austin TX 78704
## Market Smart.Location Country.Code Country Latitude Longitude
## 134544 Austin Austin, TX US United States 30.24666 -97.78398
## 134545 Austin Austin, TX US United States 30.25170 -97.74109
## Property.Type Room.Type Accommodates Bathrooms Bedrooms Beds
## 134544 House Entire home/apt 6 2 3 3
## 134545 House Entire home/apt 3 2 2 2
## Bed.Type
## 134544 Real Bed
## 134545 Real Bed
## Amenities
## 134544 TV,Cable TV,Internet,Wireless Internet,Air conditioning,Wheelchair accessible,Kitchen,Free parking on premises,Heating,Family/kid friendly,Washer,Dryer,Smoke detector,Carbon monoxide detector,Fire extinguisher,Essentials,Shampoo
## 134545 TV,Cable TV,Wireless Internet,Air conditioning,Kitchen,Free parking on premises,Heating,Washer,Dryer
## Square.Feet Price Weekly.Price Monthly.Price Security.Deposit
## 134544 NA 399 NA 2900 500
## 134545 NA 900 NA NA 500
## Cleaning.Fee Guests.Included Extra.People Minimum.Nights Maximum.Nights
## 134544 150 4 25 3 90
## 134545 150 1 0 1 1125
## Calendar.Updated Has.Availability Availability.30 Availability.60
## 134544 today 19 31
## 134545 14 months ago 30 60
## Availability.90 Availability.365 Calendar.last.Scraped Number.of.Reviews
## 134544 40 150 2017-03-06 11
## 134545 90 365 2017-03-06 0
## First.Review Last.Review Review.Scores.Rating Review.Scores.Accuracy
## 134544 2013-03-17 2016-03-15 98 10
## 134545 NA NA
## Review.Scores.Cleanliness Review.Scores.Checkin
## 134544 10 10
## 134545 NA NA
## Review.Scores.Communication Review.Scores.Location Review.Scores.Value
## 134544 10 9 9
## 134545 NA NA NA
## License Jurisdiction.Names Cancellation.Policy
## 134544 strict
## 134545 flexible
## Calculated.host.listings.count Reviews.per.Month
## 134544 1 0.23
## 134545 1 NA
## Geolocation
## 134544 30.246658830356882,-97.78398419366533
## 134545 30.2516986542039,-97.74109388414536
## Features
## 134544 Host Has Profile Pic,Host Identity Verified,Is Location Exact
## 134545 Host Has Profile Pic,Is Location Exact
Zero price listing
One further observation from examining the tail of the data (when sorted by price_percentile) is that there are a number of Airbnbs with a listed price of zero. As nice as this would be, it’s likely some manner of interal issue with the listing (perhaps an incomplete listing, or some other issue). Before we take our 95% of data, we should also get rid of the low end anomalies. In fact, let’s get rid of everything with a price of $10 or less, just to be on the safe side
# remove values with price of $10 or lower
df <- df |>
filter(Price > 10)|>
mutate(price_per_night = round(Price/Minimum.Nights))|>
filter(price_per_night > 10)
Investigate the missing data
Since this is a huge dataset, it is unadvoidably will have NA values. Let’s look at the last review, review_per_month and its NA values:
df|>
select(Last.Review, Reviews.per.Month) |>
filter((is.na(Last.Review) & !is.na(Reviews.per.Month)) | (!is.na(Last.Review) & is.na(Reviews.per.Month))) |>
dim()
## [1] 27396 2
We see that there is no noticeable observation.
NA value in neighbourhood_group
We notice there is a significantly large number in neighbourhood_group. count = 115845. We will group them and observe the abnormality.
df |>
group_by(Neighbourhood.Group.Cleansed) |>
count()|>
tail(1)
## ID Listing.Url Scrape.ID Last.Scraped
## 125274 1716439 https://www.airbnb.com/rooms/1716439 20170306202425 2017-03-07
## Name
## 125274 Vintage Travis Hieghts Bungalow
## Summary
## 125274 Fabulous 2 bedroom bungalow with downtown views. Perched overlooking downtown in Austin's premier neighborhood. Walking distance to abundant shopping and restaurants on South Congress Avenue. Lady Bird Lake and downtown also just a short walk away from this fabulous location. Upscale finish out with beautifully appointed furnishings.
## Space
## 125274 This home is outfitted with beautiful furninshings, high end finish out, and is located in Austin's best area. Walkable to South Congress shops, restaurants, and night life as well as the Lady Bird Trail.
## Description
## 125274 Fabulous 2 bedroom bungalow with downtown views. Perched overlooking downtown in Austin's premier neighborhood. Walking distance to abundant shopping and restaurants on South Congress Avenue. Lady Bird Lake and downtown also just a short walk away from this fabulous location. Upscale finish out with beautifully appointed furnishings. This home is outfitted with beautiful furninshings, high end finish out, and is located in Austin's best area. Walkable to South Congress shops, restaurants, and night life as well as the Lady Bird Trail. Guests will have access to full house, wifi, and satellite TV. Home also boasts flagstone patio and covered porch. Guests will have entire home. We can meet them for key exchange or leave instructions for them when they arrive. Travis Height's is one of Austin's premiere neighborhoods. Its a great mix of hidden bungalows and historic mansions. The location is what drives people here... with the best walkability in town.
## Experiences.Offered
## 125274 none
## Neighborhood.Overview
## 125274 Travis Height's is one of Austin's premiere neighborhoods. Its a great mix of hidden bungalows and historic mansions. The location is what drives people here... with the best walkability in town.
## Notes Transit
## 125274
## Access
## 125274 Guests will have access to full house, wifi, and satellite TV. Home also boasts flagstone patio and covered porch.
## Interaction
## 125274 Guests will have entire home. We can meet them for key exchange or leave instructions for them when they arrive.
## House.Rules Thumbnail.Url Medium.Url
## 125274
## Picture.Url
## 125274 https://public.opendatasoft.com/api/v2/catalog/datasets/airbnb-listings/files/bab47474c20ba6981894ffaf74da095a
## XL.Picture.Url Host.ID Host.URL
## 125274 8690771 https://www.airbnb.com/users/show/8690771
## Host.Name Host.Since Host.Location Host.About
## 125274 Cathy 2013-09-07 Austin, Texas, United States
## Host.Response.Time Host.Response.Rate Host.Acceptance.Rate
## 125274 NA
## Host.Thumbnail.Url
## 125274 https://a0.muscache.com/im/users/8690771/profile_pic/1378581847/original.jpg?aki_policy=profile_small
## Host.Picture.Url
## 125274 https://a0.muscache.com/im/users/8690771/profile_pic/1378581847/original.jpg?aki_policy=profile_x_medium
## Host.Neighbourhood Host.Listings.Count Host.Total.Listings.Count
## 125274 Travis Heights 1 1
## Host.Verifications Street
## 125274 email,phone,kba Sunny Lane, Austin, TX 78704, United States
## Neighbourhood Neighbourhood.Cleansed Neighbourhood.Group.Cleansed
## 125274 Travis Heights 78704
## City State Zipcode Market Smart.Location Country.Code Country
## 125274 Austin TX 78704 Austin Austin, TX US United States
## Latitude Longitude Property.Type Room.Type Accommodates Bathrooms
## 125274 30.2517 -97.74109 House Entire home/apt 3 2
## Bedrooms Beds Bed.Type
## 125274 2 2 Real Bed
## Amenities
## 125274 TV,Cable TV,Wireless Internet,Air conditioning,Kitchen,Free parking on premises,Heating,Washer,Dryer
## Square.Feet Price Weekly.Price Monthly.Price Security.Deposit
## 125274 NA 900 NA NA 500
## Cleaning.Fee Guests.Included Extra.People Minimum.Nights Maximum.Nights
## 125274 150 1 0 1 1125
## Calendar.Updated Has.Availability Availability.30 Availability.60
## 125274 14 months ago 30 60
## Availability.90 Availability.365 Calendar.last.Scraped Number.of.Reviews
## 125274 90 365 2017-03-06 0
## First.Review Last.Review Review.Scores.Rating Review.Scores.Accuracy
## 125274 NA NA
## Review.Scores.Cleanliness Review.Scores.Checkin
## 125274 NA NA
## Review.Scores.Communication Review.Scores.Location Review.Scores.Value
## 125274 NA NA NA
## License Jurisdiction.Names Cancellation.Policy
## 125274 flexible
## Calculated.host.listings.count Reviews.per.Month
## 125274 1 NA
## Geolocation
## 125274 30.2516986542039,-97.74109388414536
## Features price_per_night freq
## 125274 Host Has Profile Pic,Is Location Exact 900 1
Now, let’s plot these neighbourhood_group on the map and observe its NA values.
df |>
filter(Longitude > -140 & Latitude > 25) |>
ggplot() +
geom_polygon(data=states, fill = "white", aes(long, lat, group=group), colour = "black") +
geom_point(aes(x=Longitude, y=Latitude, color=Neighbourhood.Group.Cleansed, size=2, alpha=0.4)) +
coord_map()
As we can see, the NA (gray) scatter through out the country. Hence, it
is ok to change these NA values to “other cities”, which make further
analyze clearer.
df = df |>
mutate(neighbourhood_group = ifelse(Neighbourhood.Group.Cleansed == "Other Cities", "Other LA Cities", Neighbourhood.Group.Cleansed),
neighbourhood_group = ifelse(Neighbourhood.Group.Cleansed == "Other neighborhoods", "Other Seattle neighbourhoods", Neighbourhood.Group.Cleansed),
neighbourhood_group = ifelse(is.na(Neighbourhood.Group.Cleansed), "Other Cities", Neighbourhood.Group.Cleansed))
Take a glimpse at the data, we notice that there is no column of state, which is also a good factor needed to be analyze. Hence, let’s create a function that convert longtitude and latitude of each location into its state.
lonlat_to_state <- function(pointsDF,
states = spData::us_states,
name_col = "NAME") {
## Convert points data.frame to an sf POINTS object
pts <- st_as_sf(pointsDF, coords = 1:2, crs = 4326)
## Transform spatial data to some planar coordinate system
## (e.g. Web Mercator) as required for geometric operations
states <- st_transform(states, crs = 3857)
pts <- st_transform(pts, crs = 3857)
## Find names of state (if any) intersected by each point
state_names <- states[[name_col]]
ii <- as.integer(st_intersects(pts, states))
state_names[ii]
}
lonlat_points <- data.frame(x =df$Longitude, y = df$Latitude)
df$state = lonlat_to_state(lonlat_points)
Number listing per state
Now, let’s have a look at number listing per state. I predict that the highly populated states like California and Newyork should be at the top of then list.
number_of_listings_by_state <- aggregate(cbind(df$ID), by = list(state = df$state), FUN = length)
order_df<- number_of_listings_by_state[order(number_of_listings_by_state$V1, decreasing = TRUE),]
colnames(order_df)[2] = "Number of Listing by state"
head(order_df)
## state Number of Listing by state
## 1 California 43481
## 9 New York 37057
## 12 Texas 9031
## 3 District of Columbia 7169
## 5 Louisiana 5203
## 4 Illinois 4748
removeRowsWithNA <- function(df, desiredCols) {
completeVec <- complete.cases(df[, desiredCols])
return(df[completeVec, ])
}
Indeed, our prediction is correct. Now, let’s visualize it
p<- ggplot(number_of_listings_by_state, aes(x =state, y= V1, fill = state))+
geom_bar(stat="identity")+
theme_minimal()+
xlab("State")+ ylab("Number of listing") + labs(title = "Number of listing per state")+
theme(axis.text.x = element_text(angle = 90, hjust = 1))
p
# Get the room types and their percentages
room_types_counts <- table(df$Room.Type)
room_types <- names(room_types_counts)
counts <- as.vector(room_types_counts)
percentages <- scales::percent(round(counts/sum(counts), 2))
room_types_percentages <- sprintf("%s (%s)", room_types, percentages)
room_types_counts_df <- data.frame(group = room_types, value = counts)
# Plot
pie <- ggplot(room_types_counts_df, aes(x = "", y = value, fill = room_types_percentages))+
geom_bar(width = 1, stat = "identity")+
coord_polar("y", start = 0)+
scale_fill_brewer("Room Types", palette = "Dark2")+
ggtitle("Type of listings")+
ylab("")+
xlab("")+
labs(fill="")+
theme(axis.ticks = element_blank(), panel.grid = element_blank(), axis.text = element_blank())+
geom_text(aes(label = percentages), size = 5, position = position_stack(vjust = 0.5))
pie
Most of listing is entire home or apartments. Private room also takes a
significant portion. Meanwhile, the shared room plays a really small
role in this.
One thing we can take away from this is that traveler prefer privacy at a high degree. That is why the investors focus on operating in entire home, apartment and private room. This is also a strong point of Airbnb versus the traditional hotel format where the staying locations are much more confined and exposed to public.
Pricing
Let’s analyze the most concerned factor of any business, Pricing. First, let start with the average pring per state. Since California and Newyork have the most listing, we believe that they should have the most affordable pricing since the high competition in such crowded states.
# Calculate the average price per state
average_prices_per_state <- aggregate(cbind(df$price_per_night),
by = list(state = df$state),
FUN = function(x) mean(x))
# Plot
ggplot(data = average_prices_per_state, aes(x = average_prices_per_state$state, y = average_prices_per_state$V1))+
geom_bar(stat = "identity", fill = "steelblue", width = 0.7)+
geom_text(aes(label = round(average_prices_per_state$V1, 2)), size=4)+
coord_flip()+
xlab("State")+
ylab("Average Price Per Night")+
labs(title = "Average Price per State") +
theme_minimal()
## Warning: Use of `average_prices_per_state$state` is discouraged.
## ℹ Use `state` instead.
## Warning: Use of `average_prices_per_state$V1` is discouraged.
## ℹ Use `V1` instead.
## Use of `average_prices_per_state$V1` is discouraged.
## ℹ Use `V1` instead.
## Warning: Use of `average_prices_per_state$state` is discouraged.
## ℹ Use `state` instead.
## Warning: Use of `average_prices_per_state$V1` is discouraged.
## ℹ Use `V1` instead.
Indeed, most of the states has the average pricing per night fluctuate around $50 to $125.
highest_price_per_night <- df |>
group_by(state)|>
select(state, price_per_night,Price, Minimum.Nights)|>
filter(Price == max(Price), state != "NA")
#mutate(price_per_night = price/minimum_nights)
highest_price_per_night<-distinct(highest_price_per_night)
highest_price_per_night
## # A tibble: 28 × 4
## # Groups: state [13]
## state price_per_night Price Minimum.Nights
## <chr> <dbl> <int> <int>
## 1 Louisiana 999 999 1
## 2 Texas 333 999 3
## 3 District of Columbia 500 999 2
## 4 Louisiana 500 999 2
## 5 California 999 999 1
## 6 California 200 999 5
## 7 California 333 999 3
## 8 Massachusetts 999 999 1
## 9 Colorado 498 995 2
## 10 Texas 999 999 1
## # ℹ 18 more rows
ggplot(data = highest_price_per_night, aes(x = highest_price_per_night$state, y = highest_price_per_night$price_per_night))+
geom_bar(stat = "identity", fill = "steelblue", width = 0.7)+
#geom_text(aes(label = round(highest_price_per_night$price_per_night, 2)), size=4)+
coord_flip()+
xlab("State")+
ylab("Highest Price")+
labs(title = "Highest price per state")+
theme_minimal()
ggplot(data = df, aes(x = state, y = price_per_night, color = state)) +
geom_boxplot(outlier.shape = NA) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))+
coord_cartesian(ylim = c(0, 750))
Let’s investigate the relation between number of reviews and its pricing.
df|>
select(Name, Number.of.Reviews, price_per_night)|>
arrange(desc(Number.of.Reviews))|>
tail()
## Name Number.of.Reviews price_per_night
## 125269 2 beds available near Domain! 0 250
## 125270 Quite apartment near shopping center 0 100
## 125271 Home Away From Home, Chic and Comfy 0 237
## 125272 Spacious remodeled 3/2 condo 0 90
## 125273 Forest- Great for SXSW! 0 122
## 125274 Vintage Travis Hieghts Bungalow 0 900
p1<-ggplot(df, aes(x = price_per_night, y = Number.of.Reviews )) +
geom_point( size = 2)+
geom_smooth(method = lm, se = FALSE)+
labs(title = "Relation between pricing and number of reviews")+
ylab("Number of reviews")+
xlab("Price per Night")
p1
## `geom_smooth()` using formula = 'y ~ x'
As we notice, on the lower range of price, which means they are more
affordable. This leads to these listing have more reviews.
ggplot(data = df, aes(x = Room.Type, y = price_per_night,fill=Room.Type)) +
geom_boxplot(outlier.shape = NA) +theme(axis.text.x = element_text(angle = 90, hjust = 1)) +coord_cartesian(ylim = c(0, 500))+
xlab("Room type")+ ylab("Price per Night")+ labs(title = "Relation between room type and price")
This also consistent with our observation so far where the Entire home
should takes the most percentage as well as the highest average
value.
ggplot(data = df, aes(x = Host.Total.Listings.Count , y = price_per_night, color=Host.Total.Listings.Count )) +geom_point(size=0.1) +
labs(title = "Total host listing counts vs Price")
## Warning: Removed 260 rows containing missing values (`geom_point()`).
This is also makes sense with the market. The lower the price, the more bookings.
Let’s look at the distribution of property type.
df_no_NA <-subset(df,Property.Type != "N/A" & Property.Type != "")
ggplot(data = df_no_NA, aes(x = Property.Type, y = price_per_night,color=Property.Type)) +geom_boxplot(outlier.shape = NA) +theme(axis.text.x = element_text(angle = 90, hjust = 1)) +coord_cartesian(ylim = c(0, 1700))+
labs(title = "Property Types Distribution")
It seems like townhouse play the major role in this market. However, it is interesting to see such a wide range of different property types, such as castle, train, tent,…
city_to_state <- df |>
group_by(City, state) |>
count() |>
filter(!is.na(state)) |>
arrange(City) |>
ungroup() |>
distinct(City, .keep_all=TRUE) |>
select(City, state)
df_city <- df |> left_join(city_to_state, by="City", suffix=c("_sf", "_imputed"))
# creating a unified state field based on the simple features value if present and the imputed value otherwise
df_city <- df_city |>
mutate(state = ifelse(is.na(state_sf), state_imputed, state_sf)) |>
select(-state_sf, -state_imputed)
head(df_city, 2)
## ID Listing.Url Scrape.ID Last.Scraped
## 1 4917301 https://www.airbnb.com/rooms/4917301 20170502172350 2017-05-02
## 2 18240041 https://www.airbnb.com/rooms/18240041 20170502172350 2017-05-03
## Name
## 1 Studio Under House!
## 2 4BD/3.5Bth Pool Home w/ Game Room in the Hills
## Summary
## 1 This studio is fantastic! Such a beautiful space. Fits 1, or more, depends on if you want to sleep on air mattress and a bed, or bed only! 3 beds in the room - 1 queen bed, 1 twin, and 1 air mattress
## 2 Charming and grand, yet comfortable pool home! The house has 4 bedrooms, 3.5 bathrooms, an office, recreation room with 9' pool/ping pong table and wet-bar, in-home laundry machines, 2-car detached garage + parking space for one more vehicle. Located with amazing views of the hills of Woodland Hills. Close proximity to the famous Mulholland Dr, Woodland Hills Country Club, DTLA, West LA (Santa Monica/Beverly Hills/Malibu/ETC), the 101 Freeway, and plenty of shopping and dining!
## Space
## 1
## 2 -4 bedrooms (each w/ queen beds) -3.5 bathrooms -70" & 55" LED TVs w/ cable -Full kitchen available: stove, fridge, microwave, oven, blender, coffee maker, pantry, cooking & eating utensils, plateware -Dining & living rooms -Recreation room: 9' pool & ping pong table, custom built wet-bar, 55" LED TV w/ cable, access to backyard -Smart home tech: voice activated lights in foyer, living & dining rooms, voice activated thermostats, voice activated front door lock (all with Amazon Echo) -Clean linens & towels upon your arrival
## Description
## 1 This studio is fantastic! Such a beautiful space. Fits 1, or more, depends on if you want to sleep on air mattress and a bed, or bed only! 3 beds in the room - 1 queen bed, 1 twin, and 1 air mattress
## 2 Charming and grand, yet comfortable pool home! The house has 4 bedrooms, 3.5 bathrooms, an office, recreation room with 9' pool/ping pong table and wet-bar, in-home laundry machines, 2-car detached garage + parking space for one more vehicle. Located with amazing views of the hills of Woodland Hills. Close proximity to the famous Mulholland Dr, Woodland Hills Country Club, DTLA, West LA (Santa Monica/Beverly Hills/Malibu/ETC), the 101 Freeway, and plenty of shopping and dining! -4 bedrooms (each w/ queen beds) -3.5 bathrooms -70" & 55" LED TVs w/ cable -Full kitchen available: stove, fridge, microwave, oven, blender, coffee maker, pantry, cooking & eating utensils, plateware -Dining & living rooms -Recreation room: 9' pool & ping pong table, custom built wet-bar, 55" LED TV w/ cable, access to backyard -Smart home tech: voice activated lights in foyer, living & dining rooms, voice activated thermostats, voice activated front door lock (all with Amazon Echo) -Clean linens & towels upon
## Experiences.Offered
## 1 none
## 2 none
## Neighborhood.Overview
## 1
## 2 Woodland Hills is a very nice and affluent city within the greater Los Angeles Metropolitan area. The neighborhood is very safe with families regularly out for walks in the evenings. Our house is located in the hills with views of the Woodland Hills and homes. At just a block from Ventura Blvd, our home is conveniently located near plenty of restaurants and shops.
## Notes Transit
## 1
## 2
## Access
## 1
## 2 Guests can use anything inside the house. Feel free to play around with some of the smart home tech we have installed! With the Amazon Echo in the living room, you can tell "Alexa" to do certain tasks! Some things you can say: -"Alexa, turn on/off the living room lights" -"Alexa, turn on/off the dining room lights" -"Alexa, turn on/off the foyer lights" -"Alexa, set the temperature to 70 degrees" -"Alexa, lock/unlock the front door" -"Alexa, how is the weather in Woodland Hills/LA/Pasadena/etc" -"Alexa, tell me the news" -"Alexa, tell me a joke" -"Alexa, sing me a song" Give it a try!
## Interaction
## 1
## 2 I am available via text messages. Please text me first as I am not always available to talk, and I will call you back!
## House.Rules
## 1
## 2 - No rearranging or modifications to any of the rooms without host approval. - No shoes inside the house. - No loud noise after 10pm in respect to our neighbors. - No unsupervised children in the pool. We are not liable should anything happen. - If you need to ship any packages/mail/parcels to our home, we are not liable should you not receive them. It is your responsibility to take delivery. - If parking any vehicles in the driveway, please park horizontally so as to not block the street with the tail of your car. - NO PARTIES! If we find that there have been any parties, we will keep the entire deposit. - Please make sure to wash any dishes, utensils, glasses/mugs, and cookingware that is used.
## Thumbnail.Url
## 1 https://a0.muscache.com/im/pictures/61653511/dac2562e_original.jpg?aki_policy=small
## 2 https://a0.muscache.com/im/pictures/7ad6459a-2c55-437f-9495-52244635b524.jpg?aki_policy=small
## Medium.Url
## 1 https://a0.muscache.com/im/pictures/61653511/dac2562e_original.jpg?aki_policy=medium
## 2 https://a0.muscache.com/im/pictures/7ad6459a-2c55-437f-9495-52244635b524.jpg?aki_policy=medium
## Picture.Url
## 1 https://public.opendatasoft.com/api/v2/catalog/datasets/airbnb-listings/files/cbb8358379daaf0cc74d8c30f0f82e12
## 2 https://public.opendatasoft.com/api/v2/catalog/datasets/airbnb-listings/files/41b1c3cc72d3567d4e6dc39d6e85b4ff
## XL.Picture.Url
## 1 https://a0.muscache.com/im/pictures/61653511/dac2562e_original.jpg?aki_policy=x_large
## 2 https://a0.muscache.com/im/pictures/7ad6459a-2c55-437f-9495-52244635b524.jpg?aki_policy=x_large
## Host.ID Host.URL Host.Name Host.Since
## 1 24035721 https://www.airbnb.com/users/show/24035721 Michael 2014-11-22
## 2 57197381 https://www.airbnb.com/users/show/57197381 Daniel 2016-02-02
## Host.Location
## 1 Los Angeles, California, United States
## 2 Arcadia, California, United States
## Host.About
## 1
## 2 A 25 year old Taiwanese/Korean American who loves to travel. Currently working as a property manager for 18 vacation rental properties and 3 multi-family residences throughout Southern California. As a property manager, I am fully aware of the ins and outs of Airbnb and vacation rentals in general, and I'll make sure your property is treated as if it were my own!\n\nYou can call me a:\n-Film Buff \n-Auto Enthusiast\n-Music Lover\n-Sports Fanatic\n-Tech Geek\n-Outdoor Traveller\n\nThrough my travels, I've realized the best culture comes from meeting new people. Whether staying in hostels, or meeting Airbnb hosts, the culture does not necessarily come from museum tours, art exhibits, or trying different ethnic foods. Culture comes from social interaction. I hope to learn a little something about your village/town/city/state/country as a guest in your house.
## Host.Response.Time Host.Response.Rate Host.Acceptance.Rate
## 1 NA
## 2 within a day 100
## Host.Thumbnail.Url
## 1 https://a0.muscache.com/im/users/24035721/profile_pic/1419823516/original.jpg?aki_policy=profile_small
## 2 https://a0.muscache.com/im/pictures/bf63b574-7d86-4460-b501-a69af78c2de1.jpg?aki_policy=profile_small
## Host.Picture.Url
## 1 https://a0.muscache.com/im/users/24035721/profile_pic/1419823516/original.jpg?aki_policy=profile_x_medium
## 2 https://a0.muscache.com/im/pictures/bf63b574-7d86-4460-b501-a69af78c2de1.jpg?aki_policy=profile_x_medium
## Host.Neighbourhood Host.Listings.Count Host.Total.Listings.Count
## 1 Woodland Hills/Warner Center 1 1
## 2 Woodland Hills/Warner Center 1 1
## Host.Verifications
## 1 email,phone,jumio
## 2 email,phone,facebook,reviews,kba
## Street
## 1 Woodland Hills/Warner Center, Los Angeles, CA 91364, United States
## 2 Woodland Hills/Warner Center, Los Angeles, CA 91364, United States
## Neighbourhood Neighbourhood.Cleansed
## 1 Woodland Hills/Warner Center Woodland Hills
## 2 Woodland Hills/Warner Center Woodland Hills
## Neighbourhood.Group.Cleansed City State Zipcode Market
## 1 Los Angeles CA 91364 Los Angeles
## 2 Los Angeles CA 91364 Los Angeles
## Smart.Location Country.Code Country Latitude Longitude Property.Type
## 1 Los Angeles, CA US United States 34.14776 -118.5913 Apartment
## 2 Los Angeles, CA US United States 34.16460 -118.6005 House
## Room.Type Accommodates Bathrooms Bedrooms Beds Bed.Type
## 1 Private room 2 1.0 1 1 Real Bed
## 2 Entire home/apt 8 3.5 4 4 Real Bed
## Amenities
## 1 TV,Wireless Internet,Air conditioning,Free parking on premises,Hot tub,Heating,Smoke detector,Carbon monoxide detector,Essentials,Shampoo
## 2 Wireless Internet,Air conditioning,Pool,Kitchen,Free parking on premises,Pets allowed,Indoor fireplace,Heating,Family/kid friendly,Washer,Dryer,Smoke detector,Carbon monoxide detector,First aid kit,Safety card,Essentials,Shampoo,Hangers,Hair dryer,Iron,Laptop friendly workspace,TV
## Square.Feet Price Weekly.Price Monthly.Price Security.Deposit Cleaning.Fee
## 1 NA 150 NA NA NA NA
## 2 NA 175 NA NA 300 150
## Guests.Included Extra.People Minimum.Nights Maximum.Nights Calendar.Updated
## 1 1 0 1 1125 29 months ago
## 2 8 25 2 1125 today
## Has.Availability Availability.30 Availability.60 Availability.90
## 1 30 60 90
## 2 9 19 28
## Availability.365 Calendar.last.Scraped Number.of.Reviews First.Review
## 1 365 2017-05-02 0
## 2 28 2017-05-03 0
## Last.Review Review.Scores.Rating Review.Scores.Accuracy
## 1 NA NA
## 2 NA NA
## Review.Scores.Cleanliness Review.Scores.Checkin Review.Scores.Communication
## 1 NA NA NA
## 2 NA NA NA
## Review.Scores.Location Review.Scores.Value License Jurisdiction.Names
## 1 NA NA City of Los Angeles, CA
## 2 NA NA City of Los Angeles, CA
## Cancellation.Policy Calculated.host.listings.count Reviews.per.Month
## 1 flexible 1 NA
## 2 flexible 1 NA
## Geolocation
## 1 34.14775649234191,-118.59133780081316
## 2 34.164597238320674,-118.60051225075456
## Features price_per_night
## 1 Host Has Profile Pic,Is Location Exact 150
## 2 Host Has Profile Pic,Host Identity Verified,Is Location Exact 88
## neighbourhood_group state
## 1 California
## 2 California
df_city |>
ggplot(aes(x=price_per_night)) +
geom_histogram(bins=40, fill="#69b3a2", color="#e9ecef", alpha=0.9) +
xlim(0, 3000) + xlab("price per night")
## Warning: Removed 2 rows containing missing values (`geom_bar()`).
ggtitle("Distribution of AirBnb Prices in US Dataset")
## $title
## [1] "Distribution of AirBnb Prices in US Dataset"
##
## attr(,"class")
## [1] "labels"
ggplot(data = df, aes(x = Cancellation.Policy, y = price_per_night,color=Cancellation.Policy)) +
geom_boxplot(outlier.shape = NA) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
coord_cartesian(ylim = c(0, 3000))
We can see that the prices are slightly more expensive for the listings
that have a strict cancellation policy
# create copy of dataset
data_clean <- df
# remove values with price of $10 or lower
data_clean <- data_clean %>% filter(price_per_night > 10)
# get rid of the top 5% percentile of prices in each city
data_clean = data_clean %>%
group_by(City) %>%
mutate(price_percentile = rank(price_per_night, ties.method="first") / length(price_per_night)) %>%
filter(price_percentile < 0.9) %>%
ungroup()
top_states <- number_of_listings_by_state|>
arrange(desc(V1))|>
head(7)
top_states
## state V1
## 1 California 43481
## 2 New York 37057
## 3 Texas 9031
## 4 District of Columbia 7169
## 5 Louisiana 5203
## 6 Illinois 4748
## 7 Washington 3657
df_city %>% filter(state %in% top_states$state) %>%
ggplot(aes(x=state, y=price_per_night, fill=state)) +
geom_boxplot(alpha=0.9) +
ylim(0, 1000) +
ggtitle("Price distribution by rich States") +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
scale_fill_viridis_d()
This is also suspicious - why are New York Airbnbs cheaper than other states? Potentially, this could be down to the types of rooms on offer.
data_clean %>% filter(state %in% top_states$state) %>%
mutate(state = ifelse(state == "New York", "New York", "Other Top States")) %>%
ggplot(aes(x=state, y=price_per_night, fill=Room.Type, order = (Room.Type) )) +
geom_bar(position = position_fill(reverse = TRUE), stat = "identity") +
ggtitle("Breakdown by type of room") +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
scale_fill_viridis_d()
Looking at the breakdown, we see that NewYork offers more private room
than Entire home/apt, and the price of private room abviously cheaper
than the entire home, which in average will bring New York relatively
cheaper than other top states.
Review analysis
colnames(df)
## [1] "ID" "Listing.Url"
## [3] "Scrape.ID" "Last.Scraped"
## [5] "Name" "Summary"
## [7] "Space" "Description"
## [9] "Experiences.Offered" "Neighborhood.Overview"
## [11] "Notes" "Transit"
## [13] "Access" "Interaction"
## [15] "House.Rules" "Thumbnail.Url"
## [17] "Medium.Url" "Picture.Url"
## [19] "XL.Picture.Url" "Host.ID"
## [21] "Host.URL" "Host.Name"
## [23] "Host.Since" "Host.Location"
## [25] "Host.About" "Host.Response.Time"
## [27] "Host.Response.Rate" "Host.Acceptance.Rate"
## [29] "Host.Thumbnail.Url" "Host.Picture.Url"
## [31] "Host.Neighbourhood" "Host.Listings.Count"
## [33] "Host.Total.Listings.Count" "Host.Verifications"
## [35] "Street" "Neighbourhood"
## [37] "Neighbourhood.Cleansed" "Neighbourhood.Group.Cleansed"
## [39] "City" "State"
## [41] "Zipcode" "Market"
## [43] "Smart.Location" "Country.Code"
## [45] "Country" "Latitude"
## [47] "Longitude" "Property.Type"
## [49] "Room.Type" "Accommodates"
## [51] "Bathrooms" "Bedrooms"
## [53] "Beds" "Bed.Type"
## [55] "Amenities" "Square.Feet"
## [57] "Price" "Weekly.Price"
## [59] "Monthly.Price" "Security.Deposit"
## [61] "Cleaning.Fee" "Guests.Included"
## [63] "Extra.People" "Minimum.Nights"
## [65] "Maximum.Nights" "Calendar.Updated"
## [67] "Has.Availability" "Availability.30"
## [69] "Availability.60" "Availability.90"
## [71] "Availability.365" "Calendar.last.Scraped"
## [73] "Number.of.Reviews" "First.Review"
## [75] "Last.Review" "Review.Scores.Rating"
## [77] "Review.Scores.Accuracy" "Review.Scores.Cleanliness"
## [79] "Review.Scores.Checkin" "Review.Scores.Communication"
## [81] "Review.Scores.Location" "Review.Scores.Value"
## [83] "License" "Jurisdiction.Names"
## [85] "Cancellation.Policy" "Calculated.host.listings.count"
## [87] "Reviews.per.Month" "Geolocation"
## [89] "Features" "price_per_night"
## [91] "neighbourhood_group" "state"
scores <- c("Review.Scores.Rating","Review.Scores.Accuracy","Review.Scores.Cleanliness","Review.Scores.Checkin","Review.Scores.Communication","Review.Scores.Location","Review.Scores.Value")
scores_data <- df[scores]
scores_data <- removeRowsWithNA(scores_data, scores)
library(cowplot)
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:lubridate':
##
## stamp
library(gridExtra)
a <-ggplot(data = scores_data, aes(x = Review.Scores.Accuracy, y = Review.Scores.Rating )) +
geom_jitter(size = 0.1) + xlab("accuracy") +ylab("Rating")
b <- ggplot(data = scores_data, aes(x = Review.Scores.Cleanliness, y = Review.Scores.Rating )) +
geom_jitter(size = 0.1) + xlab("cleanliness") +ylab("Rating")
c <- ggplot(data = scores_data, aes(x = Review.Scores.Checkin, y = Review.Scores.Rating )) +
geom_jitter(size = 0.1)+ xlab("checkin") +ylab("Rating")
d <- ggplot(data = scores_data, aes(x = Review.Scores.Communication, y = Review.Scores.Rating )) +
geom_jitter(size = 0.1)+ xlab("Communication") +ylab("Rating")
e <- ggplot(data = scores_data, aes(x = Review.Scores.Location, y = Review.Scores.Rating )) +
geom_jitter(size = 0.1)+ xlab("Location") +ylab("Rating")
f <- ggplot(data = scores_data, aes(x = Review.Scores.Value, y = Review.Scores.Rating )) +
geom_jitter(size = 0.1)+ xlab("Value")+ylab("Rating")
grid.arrange(a, b, c, d, e, f , ncol = 2, nrow = 3)
From the plots, we can see that most of the people who give the listings high ratings, give high scores for all the other types of scores (denser in right top corners).
Host behaviors vs Price
df_no_NA <-subset(df,Host.Response.Time != "N/A" & Host.Response.Time != "")
ggplot(data = df_no_NA, aes(x = Host.Response.Time, y = price_per_night,color=Host.Response.Time))+
geom_boxplot(outlier.shape = NA) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))+coord_cartesian(ylim = c(0, 400))
ggplot(data = df, aes(x = Host.Response.Rate, y = price_per_night, color=Host.Response.Rate)) +
geom_point(size=0.5)
## Warning: Removed 27906 rows containing missing values (`geom_point()`).
ggplot(data = df, aes(x = Host.Response.Rate, y = Host.Total.Listings.Count, color=Host.Response.Rate)) +
geom_point(size=0.5)
## Warning: Removed 27906 rows containing missing values (`geom_point()`).
ggplot(data = df, aes(x = Cleaning.Fee, y = Host.Total.Listings.Count, color=Cleaning.Fee)) +
geom_point(size=0.5)
## Warning: Removed 34191 rows containing missing values (`geom_point()`).
Modeling and Prediction
Relationship between ratings.
Let’s start with a basic model linear regression between all the rating reviews.
rating_regression <- lm(data=df, Review.Scores.Rating~Review.Scores.Accuracy+Review.Scores.Cleanliness+Review.Scores.Checkin+Review.Scores.Communication+Review.Scores.Location+Review.Scores.Value)
summary(rating_regression)
##
## Call:
## lm(formula = Review.Scores.Rating ~ Review.Scores.Accuracy +
## Review.Scores.Cleanliness + Review.Scores.Checkin + Review.Scores.Communication +
## Review.Scores.Location + Review.Scores.Value, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -58.04 -1.72 0.28 1.28 46.66
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.14158 0.23937 -25.66 <2e-16 ***
## Review.Scores.Accuracy 2.20828 0.02414 91.46 <2e-16 ***
## Review.Scores.Cleanliness 2.21864 0.01786 124.24 <2e-16 ***
## Review.Scores.Checkin 0.99444 0.02868 34.68 <2e-16 ***
## Review.Scores.Communication 1.64802 0.02976 55.38 <2e-16 ***
## Review.Scores.Location 0.77998 0.01895 41.15 <2e-16 ***
## Review.Scores.Value 2.63684 0.02298 114.77 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.906 on 96150 degrees of freedom
## (29117 observations deleted due to missingness)
## Multiple R-squared: 0.7344, Adjusted R-squared: 0.7343
## F-statistic: 4.43e+04 on 6 and 96150 DF, p-value: < 2.2e-16
The model has good p-value (<2.2e-16). All the factors are significant. Let’s plot this and observe its behavior:
ggplot(data = rating_regression, aes(Review.Scores.Rating,Review.Scores.Accuracy+Review.Scores.Cleanliness+Review.Scores.Checkin+Review.Scores.Communication+Review.Scores.Location+Review.Scores.Value)) +
geom_point()+
geom_smooth(method = 'lm')
## `geom_smooth()` using formula = 'y ~ x'
Let’s plot more plots to see clearer the relationship.
plot(rating_regression)
The “Normal Q-Q” plot shows if residuals are normally distributed. Our residuals are not well lined on the straight dashed line except in the middle of the plot, which is not quite good.
The “Scale-Location” plot lets us check the assumption of equal variance. Our line is not horizontal with randomly spread points, thus, our residuals are not homoscedastic. This was expected, since from the previous plots of the different types of ratings, we could clearly see that the variance depends on the score.
The “Residuals vs Leverage” plot helps us find influential cases. In fact, even though data has outliers, they might not be influential to determine a regression line. In our plot, we can barely see Cook’s distance lines because all cases are well inside of them. i.e: if we exclude the “52474” case for example, the changes in the slope coefficients won’t be important.
Relation between Price and its factors
price_regression <- p_reg1 <- lm(data=df, price_per_night~Host.Response.Rate+Host.Acceptance.Rate+Host.Total.Listings.Count+Property.Type+Room.Type+Accommodates+Bathrooms+Bedrooms+Beds+Bed.Type+Square.Feet+Security.Deposit+Cleaning.Fee+Extra.People+Minimum.Nights+Maximum.Nights+Number.of.Reviews+Cancellation.Policy+State)
summary(price_regression)
##
## Call:
## lm(formula = price_per_night ~ Host.Response.Rate + Host.Acceptance.Rate +
## Host.Total.Listings.Count + Property.Type + Room.Type + Accommodates +
## Bathrooms + Bedrooms + Beds + Bed.Type + Square.Feet + Security.Deposit +
## Cleaning.Fee + Extra.People + Minimum.Nights + Maximum.Nights +
## Number.of.Reviews + Cancellation.Policy + State, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -149.54 -32.08 -7.00 15.27 660.64
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.196e+02 3.929e+01 3.043 0.002424 **
## Host.Response.Rate -4.497e-01 2.087e-01 -2.155 0.031473 *
## Host.Acceptance.Rate0% -2.236e+01 2.994e+01 -0.747 0.455494
## Host.Acceptance.Rate100% -1.250e+01 1.225e+01 -1.020 0.307830
## Host.Acceptance.Rate31% -4.929e+00 7.127e+01 -0.069 0.944879
## Host.Acceptance.Rate33% 2.008e+01 5.179e+01 0.388 0.698378
## Host.Acceptance.Rate50% -3.879e+01 3.678e+01 -1.055 0.291932
## Host.Acceptance.Rate56% -2.652e+01 4.688e+01 -0.566 0.571762
## Host.Acceptance.Rate58% -4.789e+01 7.139e+01 -0.671 0.502491
## Host.Acceptance.Rate63% -1.775e+02 7.829e+01 -2.267 0.023666 *
## Host.Acceptance.Rate67% -2.678e+01 5.262e+01 -0.509 0.610953
## Host.Acceptance.Rate68% 2.500e+02 7.112e+01 3.515 0.000466 ***
## Host.Acceptance.Rate71% 5.685e+00 3.680e+01 0.154 0.877262
## Host.Acceptance.Rate75% -5.647e+01 7.122e+01 -0.793 0.428049
## Host.Acceptance.Rate76% -1.094e+01 5.050e+01 -0.217 0.828477
## Host.Acceptance.Rate78% -5.153e+00 5.420e+01 -0.095 0.924282
## Host.Acceptance.Rate80% -4.977e+01 7.091e+01 -0.702 0.482950
## Host.Acceptance.Rate82% -9.241e-01 7.137e+01 -0.013 0.989672
## Host.Acceptance.Rate83% -9.152e+00 5.119e+01 -0.179 0.858156
## Host.Acceptance.Rate86% -6.195e+01 4.235e+01 -1.463 0.143968
## Host.Acceptance.Rate87% -6.780e+01 6.196e+01 -1.094 0.274146
## Host.Acceptance.Rate88% -5.438e+01 4.188e+01 -1.299 0.194508
## Host.Acceptance.Rate89% 5.936e+01 7.140e+01 0.831 0.405997
## Host.Acceptance.Rate90% 4.168e+00 7.112e+01 0.059 0.953290
## Host.Acceptance.Rate91% -1.037e+02 3.329e+01 -3.115 0.001912 **
## Host.Acceptance.Rate92% -4.572e+01 3.933e+01 -1.162 0.245411
## Host.Acceptance.Rate93% 2.097e+00 7.118e+01 0.029 0.976504
## Host.Acceptance.Rate94% -4.977e+01 7.657e+01 -0.650 0.515856
## Host.Acceptance.Rate95% 2.948e+01 5.041e+01 0.585 0.558958
## Host.Acceptance.Rate96% -4.536e+01 4.173e+01 -1.087 0.277383
## Host.Acceptance.Rate97% -3.110e+01 4.124e+01 -0.754 0.451054
## Host.Acceptance.Rate98% -1.429e+01 4.280e+01 -0.334 0.738593
## Host.Acceptance.Rate99% -1.730e+01 7.300e+01 -0.237 0.812755
## Host.Total.Listings.Count 1.465e-01 3.603e-01 0.407 0.684405
## Property.TypeBed & Breakfast -3.657e+01 4.384e+01 -0.834 0.404483
## Property.TypeBungalow -5.356e+00 2.594e+01 -0.206 0.836470
## Property.TypeCabin 1.224e+01 4.153e+01 0.295 0.768238
## Property.TypeCamper/RV -1.741e+01 4.193e+01 -0.415 0.678048
## Property.TypeCondominium 9.010e+00 1.665e+01 0.541 0.588453
## Property.TypeGuest suite -3.489e+01 5.026e+01 -0.694 0.487772
## Property.TypeGuesthouse 5.393e+01 3.624e+01 1.488 0.137167
## Property.TypeHouse -8.470e+00 6.633e+00 -1.277 0.201984
## Property.TypeLoft 2.178e+01 1.622e+01 1.343 0.179676
## Property.TypeOther 1.134e+02 2.726e+01 4.160 3.55e-05 ***
## Property.TypeTownhouse -3.633e+01 2.506e+01 -1.449 0.147617
## Property.TypeVilla 4.721e+01 7.138e+01 0.661 0.508566
## Room.TypePrivate room -6.104e+00 7.648e+00 -0.798 0.425054
## Room.TypeShared room -6.015e+01 3.790e+01 -1.587 0.112922
## Accommodates 1.303e+01 2.113e+00 6.164 1.15e-09 ***
## Bathrooms 1.106e+01 6.290e+00 1.759 0.079013 .
## Bedrooms -1.389e+01 5.046e+00 -2.753 0.006053 **
## Beds -4.000e-01 2.836e+00 -0.141 0.887861
## Bed.TypeCouch 9.413e+01 5.400e+01 1.743 0.081733 .
## Bed.TypeFuton -3.018e+01 3.663e+01 -0.824 0.410180
## Bed.TypePull-out Sofa -2.080e+01 4.007e+01 -0.519 0.603831
## Bed.TypeReal Bed -1.892e+01 3.235e+01 -0.585 0.558882
## Square.Feet 7.674e-03 4.178e-03 1.837 0.066648 .
## Security.Deposit 6.908e-02 1.688e-02 4.093 4.71e-05 ***
## Cleaning.Fee 3.720e-01 8.125e-02 4.578 5.49e-06 ***
## Extra.People 3.091e-01 1.064e-01 2.906 0.003763 **
## Minimum.Nights -1.347e+01 1.078e+00 -12.495 < 2e-16 ***
## Maximum.Nights -5.103e-08 3.646e-08 -1.400 0.162043
## Number.of.Reviews -2.148e-02 4.579e-02 -0.469 0.639131
## Cancellation.Policymoderate -3.042e+01 1.336e+01 -2.276 0.023103 *
## Cancellation.Policystrict -2.337e+01 1.313e+01 -1.779 0.075613 .
## Cancellation.Policysuper_strict_30 -4.145e+01 5.205e+01 -0.796 0.426108
## StateCO -2.601e-01 1.905e+01 -0.014 0.989111
## StateDC -3.764e+01 1.520e+01 -2.477 0.013482 *
## StateIL -2.643e+01 2.042e+01 -1.294 0.196036
## StateLA -5.269e+00 1.248e+01 -0.422 0.673024
## StateMA 4.763e+00 2.836e+01 0.168 0.866689
## StateNY -1.903e+01 7.348e+00 -2.589 0.009805 **
## StateOR -1.806e+01 1.889e+01 -0.956 0.339312
## StateTN 4.683e+01 2.975e+01 1.574 0.115879
## StateTX 1.281e+01 8.985e+00 1.426 0.154205
## StateWA -1.308e+01 1.583e+01 -0.826 0.408812
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 70.52 on 752 degrees of freedom
## (124446 observations deleted due to missingness)
## Multiple R-squared: 0.4537, Adjusted R-squared: 0.3992
## F-statistic: 8.328 on 75 and 752 DF, p-value: < 2.2e-16
For this model, we have good p-value (<2.2e-16) with good R-squred (0.4537).
plot(price_regression)
## Warning: not plotting observations with leverage one:
## 94, 556, 627, 662, 713, 784, 812
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced