library(tidyverse) 
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## 
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## 
## The following object is masked from 'package:purrr':
## 
##     compact
library(doParallel)
## Loading required package: foreach
## 
## Attaching package: 'foreach'
## 
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## 
## Loading required package: iterators
## Loading required package: parallel
library(dplyr)
library(grid)

library(maps)
## 
## Attaching package: 'maps'
## 
## The following object is masked from 'package:plyr':
## 
##     ozone
## 
## The following object is masked from 'package:purrr':
## 
##     map
library(sf) 
## Linking to GEOS 3.9.3, GDAL 3.5.2, PROJ 8.2.1; sf_use_s2() is TRUE
library(spData) 
## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source')`
library(gridExtra) 
## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
library(maps) 
library(corrplot)
## corrplot 0.92 loaded
library(viridis) 
## Loading required package: viridisLite
## 
## Attaching package: 'viridis'
## 
## The following object is masked from 'package:maps':
## 
##     unemp
library(tm) 
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## 
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(Metrics) 
library(randomForest) 
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:gridExtra':
## 
##     combine
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(neuralnet) 
## 
## Attaching package: 'neuralnet'
## 
## The following object is masked from 'package:dplyr':
## 
##     compute
library(wordcloud) 
## Loading required package: RColorBrewer
library(rpart.plot) 
## Loading required package: rpart
list.files(path = "../input")
## character(0)

Load the data

states <- map_data("state")
df <- read.csv("airbnb-listings.csv", sep = ";", header = T)

Let’s limnit our analyze region to only United States

df <- df|>
  filter(Country == "United States")
  #mutate(price_per_night = round(Price/Minimum.Nights))|>
  #head(df,10)
tail(df, 2)
##             ID                          Listing.Url      Scrape.ID Last.Scraped
## 134544  927607  https://www.airbnb.com/rooms/927607 20170306202425   2017-03-07
## 134545 1716439 https://www.airbnb.com/rooms/1716439 20170306202425   2017-03-07
##                                             Name
## 134544 Modern Home near Downtown and SXSW venues
## 134545           Vintage Travis Hieghts Bungalow
##                                                                                                                                                                                                                                                                                                                                                 Summary
## 134544                                                                                              Gorgeous sunny house, feels like a retreat in the woods, 5-10 minutes to Austin's best restaurants, bars, live music venues, the convention center, Botanical Gardens, Long Center and 2 min to hike and bike trail and famous Barton Springs pool.
## 134545 Fabulous 2 bedroom bungalow with downtown views. Perched overlooking downtown in Austin's premier neighborhood. Walking distance to abundant shopping and restaurants on South Congress Avenue. Lady Bird Lake and downtown also just a short walk away from this fabulous location.  Upscale finish out with beautifully appointed furnishings.
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          Space
## 134544 Great vacation or festival house that features amazing proximity to downtown while feeling like a retreat in the woods. Within 5-10 minutes to Austin's best restaurants, bars, live music venues, festivals like ACL and SXSW, the convention center, the Botanical Gardens, the flagship Whole Foods, Zilker Park, Barton Springs Pool, Zach Theatre and the Long Center, Broken Spoke Dance Hall, and two minutes to the hike and bike trail which takes you downtown and all over Austin. The house is over 2000 sq ft. and has 3 large bedrooms plus an office, The master has a queen, the second bedroom has a twin, and the third has an American Leather brand queen sleeper sofa with the highest quality foam. Both master and second bedroom mattresses are organic. The living room couch has a ratchet system that allows it to lie flat as a double bed. There is also a queen and single air mattress. Possible to sleep 8 in a pinch. There are two full bathrooms. Master has separate bath and shower. There is also
## 134545                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           This home is outfitted with beautiful furninshings, high end finish out, and is located in Austin's best area.  Walkable to South Congress shops, restaurants, and night life as well as the Lady Bird Trail.
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     Description
## 134544 Gorgeous sunny house, feels like a retreat in the woods, 5-10 minutes to Austin's best restaurants, bars, live music venues, the convention center, Botanical Gardens, Long Center and 2 min to hike and bike trail and famous Barton Springs pool. Great vacation or festival house that features amazing proximity to downtown while feeling like a retreat in the woods. Within 5-10 minutes to Austin's best restaurants, bars, live music venues, festivals like ACL and SXSW, the convention center, the Botanical Gardens, the flagship Whole Foods, Zilker Park, Barton Springs Pool, Zach Theatre and the Long Center, Broken Spoke Dance Hall, and two minutes to the hike and bike trail which takes you downtown and all over Austin. The house is over 2000 sq ft. and has 3 large bedrooms plus an office, The master has a queen, the second bedroom has a twin, and the third has an American Leather brand queen sleeper sofa with the highest quality foam. Both master and second bedroom mattresses are organic. The
## 134545                                  Fabulous 2 bedroom bungalow with downtown views. Perched overlooking downtown in Austin's premier neighborhood. Walking distance to abundant shopping and restaurants on South Congress Avenue. Lady Bird Lake and downtown also just a short walk away from this fabulous location.  Upscale finish out with beautifully appointed furnishings. This home is outfitted with beautiful furninshings, high end finish out, and is located in Austin's best area.  Walkable to South Congress shops, restaurants, and night life as well as the Lady Bird Trail. Guests will have access to full house, wifi, and satellite TV.  Home also boasts flagstone patio and covered porch. Guests will have entire home. We can meet them for key exchange or leave instructions for them when they arrive. Travis Height's is one of Austin's premiere neighborhoods. Its a great mix of hidden bungalows and historic mansions. The location is what drives people here... with the best walkability in town.
##        Experiences.Offered
## 134544                none
## 134545                none
##                                                                                                                                                                                      Neighborhood.Overview
## 134544                                                         It is quiet, peaceful, and very safe, close to the trailheads and surrounded by woods, and a stone's throw from all the action of downtown.
## 134545 Travis Height's is one of Austin's premiere neighborhoods. Its a great mix of hidden bungalows and historic mansions. The location is what drives people here... with the best walkability in town.
##                                                                                                                                                                                                                         Notes
## 134544 Special summer 2-month price available--please inquire. House was modified to be wheelchair friendly so there is a ramp to front door and no stairs anywhere--ideal for little children or those with mobility issues.
## 134545                                                                                                                                                                                                                       
##                                               Transit
## 134544 There is a bus stop about 3/4 mile from house.
## 134545                                               
##                                                                                                                     Access
## 134544                                                 Large trampoline (waiver must be signed), monkey bars, chin-up bars
## 134545 Guests will have access to full house, wifi, and satellite TV.  Home also boasts flagstone patio and covered porch.
##                                                                                                                                       Interaction
## 134544 We are available by phone and text to answer any questions. We are also happy to welcome you personally and walk you through the property.
## 134545                           Guests will have entire home. We can meet them for key exchange or leave instructions for them when they arrive.
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               House.Rules
## 134544 Please try to leave the place the way it was when you arrived.  We will provide a cleaning service after you leave. You can use the deck for socializing, but no loud parties are allowed on the premises in order to respect the neighbors. We prefer no shoes in the house please. Other Rules: No Drugs on the premises No Smoking on the premises. Guests are not allowed to use the trampoline or monkey bars, if they do it is at their own risk.   If you do decide to use the trampoline at your own risk, manufacturer's instructions are only one adult on the equipment at a time.  No Back Flips. Garbage should be taken out to the curb on Monday Night.  Try to recycle your used bottles, cans, plastic containers and newspapers. If you use the bikes, you must wear a helmet and lock the bike.
## 134545                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   
##        Thumbnail.Url Medium.Url
## 134544                         
## 134545                         
##                                                                                                           Picture.Url
## 134544 https://public.opendatasoft.com/api/v2/catalog/datasets/airbnb-listings/files/18b58deccfc4d9f6fba97bed64e617ea
## 134545 https://public.opendatasoft.com/api/v2/catalog/datasets/airbnb-listings/files/bab47474c20ba6981894ffaf74da095a
##        XL.Picture.Url Host.ID                                  Host.URL
## 134544                 719332  https://www.airbnb.com/users/show/719332
## 134545                8690771 https://www.airbnb.com/users/show/8690771
##          Host.Name Host.Since                Host.Location
## 134544 Mike + Nina 2011-06-19 Austin, Texas, United States
## 134545       Cathy 2013-09-07 Austin, Texas, United States
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  Host.About
## 134544 My wife Nina and I like to bike on the hike and bike trail--starts just minutes from our house-- with our son in tow on his tagalong. Our house is convenient for biking to Zilker park and all its festivals, Zach theatre and the Long Center, and both spring fed pools (Deep Eddy and Barton Springs), and to all the shops and restaurants downtown, so we plan some activity out almost every weekend. We also hike through the Greenbelt, and tube on the creek (5 minute walk) whenever there's water. We love to barbecue and enjoy dinners on the deck. And we grow our own vegetables in a garden plot on the side of the house. Nina is a family photographer so spends a lot of time photographing kids in the parks of Austin, and I'm usually at home programming.  We are passionate adventure travelers and want to see the world but always seem to end up on the East Coast where our families live. We will do everything possible to make your stay wonderful and to provide you with every resource and amenity we (or you!) can think of. We're here for you!
## 134545                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     
##        Host.Response.Time Host.Response.Rate Host.Acceptance.Rate
## 134544       within a day                100                     
## 134545                                    NA                     
##                                                                                           Host.Thumbnail.Url
## 134544  https://a0.muscache.com/im/users/719332/profile_pic/1308543387/original.jpg?aki_policy=profile_small
## 134545 https://a0.muscache.com/im/users/8690771/profile_pic/1378581847/original.jpg?aki_policy=profile_small
##                                                                                                Host.Picture.Url
## 134544  https://a0.muscache.com/im/users/719332/profile_pic/1308543387/original.jpg?aki_policy=profile_x_medium
## 134545 https://a0.muscache.com/im/users/8690771/profile_pic/1378581847/original.jpg?aki_policy=profile_x_medium
##        Host.Neighbourhood Host.Listings.Count Host.Total.Listings.Count
## 134544       Barton Hills                   1                         1
## 134545     Travis Heights                   1                         1
##                      Host.Verifications
## 134544 email,phone,facebook,reviews,kba
## 134545                  email,phone,kba
##                                                     Street  Neighbourhood
## 134544 Barton Hills Drive, Austin, TX 78704, United States   Barton Hills
## 134545         Sunny Lane, Austin, TX 78704, United States Travis Heights
##        Neighbourhood.Cleansed Neighbourhood.Group.Cleansed   City State Zipcode
## 134544                  78704                              Austin    TX   78704
## 134545                  78704                              Austin    TX   78704
##        Market Smart.Location Country.Code       Country Latitude Longitude
## 134544 Austin     Austin, TX           US United States 30.24666 -97.78398
## 134545 Austin     Austin, TX           US United States 30.25170 -97.74109
##        Property.Type       Room.Type Accommodates Bathrooms Bedrooms Beds
## 134544         House Entire home/apt            6         2        3    3
## 134545         House Entire home/apt            3         2        2    2
##        Bed.Type
## 134544 Real Bed
## 134545 Real Bed
##                                                                                                                                                                                                                                   Amenities
## 134544 TV,Cable TV,Internet,Wireless Internet,Air conditioning,Wheelchair accessible,Kitchen,Free parking on premises,Heating,Family/kid friendly,Washer,Dryer,Smoke detector,Carbon monoxide detector,Fire extinguisher,Essentials,Shampoo
## 134545                                                                                                                                 TV,Cable TV,Wireless Internet,Air conditioning,Kitchen,Free parking on premises,Heating,Washer,Dryer
##        Square.Feet Price Weekly.Price Monthly.Price Security.Deposit
## 134544          NA   399           NA          2900              500
## 134545          NA   900           NA            NA              500
##        Cleaning.Fee Guests.Included Extra.People Minimum.Nights Maximum.Nights
## 134544          150               4           25              3             90
## 134545          150               1            0              1           1125
##        Calendar.Updated Has.Availability Availability.30 Availability.60
## 134544            today                               19              31
## 134545    14 months ago                               30              60
##        Availability.90 Availability.365 Calendar.last.Scraped Number.of.Reviews
## 134544              40              150            2017-03-06                11
## 134545              90              365            2017-03-06                 0
##        First.Review Last.Review Review.Scores.Rating Review.Scores.Accuracy
## 134544   2013-03-17  2016-03-15                   98                     10
## 134545                                            NA                     NA
##        Review.Scores.Cleanliness Review.Scores.Checkin
## 134544                        10                    10
## 134545                        NA                    NA
##        Review.Scores.Communication Review.Scores.Location Review.Scores.Value
## 134544                          10                      9                   9
## 134545                          NA                     NA                  NA
##        License Jurisdiction.Names Cancellation.Policy
## 134544                                         strict
## 134545                                       flexible
##        Calculated.host.listings.count Reviews.per.Month
## 134544                              1              0.23
## 134545                              1                NA
##                                  Geolocation
## 134544 30.246658830356882,-97.78398419366533
## 134545   30.2516986542039,-97.74109388414536
##                                                             Features
## 134544 Host Has Profile Pic,Host Identity Verified,Is Location Exact
## 134545                        Host Has Profile Pic,Is Location Exact

Zero price listing

One further observation from examining the tail of the data (when sorted by price_percentile) is that there are a number of Airbnbs with a listed price of zero. As nice as this would be, it’s likely some manner of interal issue with the listing (perhaps an incomplete listing, or some other issue). Before we take our 95% of data, we should also get rid of the low end anomalies. In fact, let’s get rid of everything with a price of $10 or less, just to be on the safe side

# remove values with price of $10 or lower
df <- df |>
  
  filter(Price > 10)|>
  mutate(price_per_night = round(Price/Minimum.Nights))|>
  filter(price_per_night > 10)

Investigate the missing data

Since this is a huge dataset, it is unadvoidably will have NA values. Let’s look at the last review, review_per_month and its NA values:

df|>
  select(Last.Review, Reviews.per.Month) |>
    filter((is.na(Last.Review) & !is.na(Reviews.per.Month)) | (!is.na(Last.Review) & is.na(Reviews.per.Month))) |>
    dim()
## [1] 27396     2

We see that there is no noticeable observation.

NA value in neighbourhood_group

We notice there is a significantly large number in neighbourhood_group. count = 115845. We will group them and observe the abnormality.

df |>
  group_by(Neighbourhood.Group.Cleansed) |>
  count()|>
  tail(1)
##             ID                          Listing.Url      Scrape.ID Last.Scraped
## 125274 1716439 https://www.airbnb.com/rooms/1716439 20170306202425   2017-03-07
##                                   Name
## 125274 Vintage Travis Hieghts Bungalow
##                                                                                                                                                                                                                                                                                                                                                 Summary
## 125274 Fabulous 2 bedroom bungalow with downtown views. Perched overlooking downtown in Austin's premier neighborhood. Walking distance to abundant shopping and restaurants on South Congress Avenue. Lady Bird Lake and downtown also just a short walk away from this fabulous location.  Upscale finish out with beautifully appointed furnishings.
##                                                                                                                                                                                                                Space
## 125274 This home is outfitted with beautiful furninshings, high end finish out, and is located in Austin's best area.  Walkable to South Congress shops, restaurants, and night life as well as the Lady Bird Trail.
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    Description
## 125274 Fabulous 2 bedroom bungalow with downtown views. Perched overlooking downtown in Austin's premier neighborhood. Walking distance to abundant shopping and restaurants on South Congress Avenue. Lady Bird Lake and downtown also just a short walk away from this fabulous location.  Upscale finish out with beautifully appointed furnishings. This home is outfitted with beautiful furninshings, high end finish out, and is located in Austin's best area.  Walkable to South Congress shops, restaurants, and night life as well as the Lady Bird Trail. Guests will have access to full house, wifi, and satellite TV.  Home also boasts flagstone patio and covered porch. Guests will have entire home. We can meet them for key exchange or leave instructions for them when they arrive. Travis Height's is one of Austin's premiere neighborhoods. Its a great mix of hidden bungalows and historic mansions. The location is what drives people here... with the best walkability in town.
##        Experiences.Offered
## 125274                none
##                                                                                                                                                                                      Neighborhood.Overview
## 125274 Travis Height's is one of Austin's premiere neighborhoods. Its a great mix of hidden bungalows and historic mansions. The location is what drives people here... with the best walkability in town.
##        Notes Transit
## 125274              
##                                                                                                                     Access
## 125274 Guests will have access to full house, wifi, and satellite TV.  Home also boasts flagstone patio and covered porch.
##                                                                                                             Interaction
## 125274 Guests will have entire home. We can meet them for key exchange or leave instructions for them when they arrive.
##        House.Rules Thumbnail.Url Medium.Url
## 125274                                     
##                                                                                                           Picture.Url
## 125274 https://public.opendatasoft.com/api/v2/catalog/datasets/airbnb-listings/files/bab47474c20ba6981894ffaf74da095a
##        XL.Picture.Url Host.ID                                  Host.URL
## 125274                8690771 https://www.airbnb.com/users/show/8690771
##        Host.Name Host.Since                Host.Location Host.About
## 125274     Cathy 2013-09-07 Austin, Texas, United States           
##        Host.Response.Time Host.Response.Rate Host.Acceptance.Rate
## 125274                                    NA                     
##                                                                                           Host.Thumbnail.Url
## 125274 https://a0.muscache.com/im/users/8690771/profile_pic/1378581847/original.jpg?aki_policy=profile_small
##                                                                                                Host.Picture.Url
## 125274 https://a0.muscache.com/im/users/8690771/profile_pic/1378581847/original.jpg?aki_policy=profile_x_medium
##        Host.Neighbourhood Host.Listings.Count Host.Total.Listings.Count
## 125274     Travis Heights                   1                         1
##        Host.Verifications                                      Street
## 125274    email,phone,kba Sunny Lane, Austin, TX 78704, United States
##         Neighbourhood Neighbourhood.Cleansed Neighbourhood.Group.Cleansed
## 125274 Travis Heights                  78704                             
##          City State Zipcode Market Smart.Location Country.Code       Country
## 125274 Austin    TX   78704 Austin     Austin, TX           US United States
##        Latitude Longitude Property.Type       Room.Type Accommodates Bathrooms
## 125274  30.2517 -97.74109         House Entire home/apt            3         2
##        Bedrooms Beds Bed.Type
## 125274        2    2 Real Bed
##                                                                                                   Amenities
## 125274 TV,Cable TV,Wireless Internet,Air conditioning,Kitchen,Free parking on premises,Heating,Washer,Dryer
##        Square.Feet Price Weekly.Price Monthly.Price Security.Deposit
## 125274          NA   900           NA            NA              500
##        Cleaning.Fee Guests.Included Extra.People Minimum.Nights Maximum.Nights
## 125274          150               1            0              1           1125
##        Calendar.Updated Has.Availability Availability.30 Availability.60
## 125274    14 months ago                               30              60
##        Availability.90 Availability.365 Calendar.last.Scraped Number.of.Reviews
## 125274              90              365            2017-03-06                 0
##        First.Review Last.Review Review.Scores.Rating Review.Scores.Accuracy
## 125274                                            NA                     NA
##        Review.Scores.Cleanliness Review.Scores.Checkin
## 125274                        NA                    NA
##        Review.Scores.Communication Review.Scores.Location Review.Scores.Value
## 125274                          NA                     NA                  NA
##        License Jurisdiction.Names Cancellation.Policy
## 125274                                       flexible
##        Calculated.host.listings.count Reviews.per.Month
## 125274                              1                NA
##                                Geolocation
## 125274 30.2516986542039,-97.74109388414536
##                                      Features price_per_night freq
## 125274 Host Has Profile Pic,Is Location Exact             900    1

Now, let’s plot these neighbourhood_group on the map and observe its NA values.

df |>
    filter(Longitude > -140 & Latitude > 25) |>
    ggplot() + 
    geom_polygon(data=states, fill = "white", aes(long, lat, group=group), colour = "black") +
    geom_point(aes(x=Longitude, y=Latitude, color=Neighbourhood.Group.Cleansed, size=2, alpha=0.4)) +     
    coord_map()

As we can see, the NA (gray) scatter through out the country. Hence, it is ok to change these NA values to “other cities”, which make further analyze clearer.

df = df |>
  mutate(neighbourhood_group = ifelse(Neighbourhood.Group.Cleansed == "Other Cities", "Other LA Cities", Neighbourhood.Group.Cleansed),
               neighbourhood_group = ifelse(Neighbourhood.Group.Cleansed == "Other neighborhoods", "Other Seattle neighbourhoods", Neighbourhood.Group.Cleansed),
               neighbourhood_group = ifelse(is.na(Neighbourhood.Group.Cleansed), "Other Cities", Neighbourhood.Group.Cleansed))

Take a glimpse at the data, we notice that there is no column of state, which is also a good factor needed to be analyze. Hence, let’s create a function that convert longtitude and latitude of each location into its state.

lonlat_to_state <- function(pointsDF,
                            states = spData::us_states,
                            name_col = "NAME") {
    ## Convert points data.frame to an sf POINTS object
    pts <- st_as_sf(pointsDF, coords = 1:2, crs = 4326)

    ## Transform spatial data to some planar coordinate system
    ## (e.g. Web Mercator) as required for geometric operations
    states <- st_transform(states, crs = 3857)
    pts <- st_transform(pts, crs = 3857)

    ## Find names of state (if any) intersected by each point
    state_names <- states[[name_col]]
    ii <- as.integer(st_intersects(pts, states))
    state_names[ii]
}

lonlat_points <- data.frame(x =df$Longitude, y = df$Latitude)
df$state = lonlat_to_state(lonlat_points)

Number listing per state

Now, let’s have a look at number listing per state. I predict that the highly populated states like California and Newyork should be at the top of then list.

number_of_listings_by_state <- aggregate(cbind(df$ID), by = list(state = df$state), FUN = length)
order_df<- number_of_listings_by_state[order(number_of_listings_by_state$V1, decreasing = TRUE),]
colnames(order_df)[2] = "Number of Listing by state"
head(order_df)
##                   state Number of Listing by state
## 1            California                      43481
## 9              New York                      37057
## 12                Texas                       9031
## 3  District of Columbia                       7169
## 5             Louisiana                       5203
## 4              Illinois                       4748
removeRowsWithNA <- function(df, desiredCols) {
  completeVec <- complete.cases(df[, desiredCols])
  return(df[completeVec, ])
}

Indeed, our prediction is correct. Now, let’s visualize it

p<- ggplot(number_of_listings_by_state, aes(x =state, y= V1, fill = state))+
  geom_bar(stat="identity")+
  theme_minimal()+
  xlab("State")+ ylab("Number of listing") + labs(title = "Number of listing per state")+
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
p

# Get the room types and their percentages
room_types_counts <- table(df$Room.Type)
room_types <- names(room_types_counts)
counts <- as.vector(room_types_counts)
percentages <- scales::percent(round(counts/sum(counts), 2))
room_types_percentages <- sprintf("%s (%s)", room_types, percentages)
room_types_counts_df <- data.frame(group = room_types, value = counts)

# Plot
pie <- ggplot(room_types_counts_df, aes(x = "", y = value, fill = room_types_percentages))+
  geom_bar(width = 1, stat = "identity")+
  coord_polar("y", start = 0)+
  scale_fill_brewer("Room Types", palette = "Dark2")+
  ggtitle("Type of listings")+
  ylab("")+
  xlab("")+
  labs(fill="")+
  theme(axis.ticks = element_blank(), panel.grid = element_blank(), axis.text = element_blank())+
  geom_text(aes(label = percentages), size = 5, position = position_stack(vjust = 0.5))
pie

Most of listing is entire home or apartments. Private room also takes a significant portion. Meanwhile, the shared room plays a really small role in this.

One thing we can take away from this is that traveler prefer privacy at a high degree. That is why the investors focus on operating in entire home, apartment and private room. This is also a strong point of Airbnb versus the traditional hotel format where the staying locations are much more confined and exposed to public.

Pricing

Let’s analyze the most concerned factor of any business, Pricing. First, let start with the average pring per state. Since California and Newyork have the most listing, we believe that they should have the most affordable pricing since the high competition in such crowded states.

# Calculate the average price per state
average_prices_per_state <- aggregate(cbind(df$price_per_night),
                  by = list(state = df$state),
                  FUN = function(x) mean(x))

# Plot
ggplot(data = average_prices_per_state, aes(x = average_prices_per_state$state, y = average_prices_per_state$V1))+
    geom_bar(stat = "identity", fill = "steelblue", width = 0.7)+
    geom_text(aes(label = round(average_prices_per_state$V1, 2)), size=4)+
    coord_flip()+
    xlab("State")+
    ylab("Average Price Per Night")+ 
  labs(title = "Average Price per State") +
    theme_minimal()
## Warning: Use of `average_prices_per_state$state` is discouraged.
## ℹ Use `state` instead.
## Warning: Use of `average_prices_per_state$V1` is discouraged.
## ℹ Use `V1` instead.
## Use of `average_prices_per_state$V1` is discouraged.
## ℹ Use `V1` instead.
## Warning: Use of `average_prices_per_state$state` is discouraged.
## ℹ Use `state` instead.
## Warning: Use of `average_prices_per_state$V1` is discouraged.
## ℹ Use `V1` instead.

Indeed, most of the states has the average pricing per night fluctuate around $50 to $125.

highest_price_per_night <- df |>
  group_by(state)|>
  select(state, price_per_night,Price,  Minimum.Nights)|>
  filter(Price == max(Price), state != "NA")
  #mutate(price_per_night = price/minimum_nights)
 
highest_price_per_night<-distinct(highest_price_per_night)
highest_price_per_night
## # A tibble: 28 × 4
## # Groups:   state [13]
##    state                price_per_night Price Minimum.Nights
##    <chr>                          <dbl> <int>          <int>
##  1 Louisiana                        999   999              1
##  2 Texas                            333   999              3
##  3 District of Columbia             500   999              2
##  4 Louisiana                        500   999              2
##  5 California                       999   999              1
##  6 California                       200   999              5
##  7 California                       333   999              3
##  8 Massachusetts                    999   999              1
##  9 Colorado                         498   995              2
## 10 Texas                            999   999              1
## # ℹ 18 more rows
ggplot(data = highest_price_per_night, aes(x = highest_price_per_night$state, y = highest_price_per_night$price_per_night))+
    geom_bar(stat = "identity", fill = "steelblue", width = 0.7)+
    #geom_text(aes(label = round(highest_price_per_night$price_per_night, 2)), size=4)+
    coord_flip()+
    xlab("State")+
    ylab("Highest Price")+  
    labs(title = "Highest price per state")+
    theme_minimal()

ggplot(data = df, aes(x = state, y = price_per_night, color = state)) +
  geom_boxplot(outlier.shape = NA) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))+
  coord_cartesian(ylim = c(0, 750))

Let’s investigate the relation between number of reviews and its pricing.

df|>
  select(Name, Number.of.Reviews, price_per_night)|>
  arrange(desc(Number.of.Reviews))|>
  tail()
##                                        Name Number.of.Reviews price_per_night
## 125269        2 beds available near Domain!                 0             250
## 125270 Quite apartment near shopping center                 0             100
## 125271  Home Away From Home, Chic and Comfy                 0             237
## 125272         Spacious remodeled 3/2 condo                 0              90
## 125273              Forest- Great for SXSW!                 0             122
## 125274      Vintage Travis Hieghts Bungalow                 0             900
p1<-ggplot(df, aes(x = price_per_night, y = Number.of.Reviews )) +
  geom_point( size = 2)+
  geom_smooth(method = lm, se = FALSE)+
  labs(title = "Relation between pricing and number of reviews")+
  ylab("Number of reviews")+
  xlab("Price per Night")
p1
## `geom_smooth()` using formula = 'y ~ x'

As we notice, on the lower range of price, which means they are more affordable. This leads to these listing have more reviews.

ggplot(data = df, aes(x = Room.Type, y = price_per_night,fill=Room.Type)) +
  geom_boxplot(outlier.shape = NA) +theme(axis.text.x = element_text(angle = 90, hjust = 1)) +coord_cartesian(ylim = c(0, 500))+
  xlab("Room type")+ ylab("Price per Night")+ labs(title = "Relation between room type and price")

This also consistent with our observation so far where the Entire home should takes the most percentage as well as the highest average value.

ggplot(data = df, aes(x = Host.Total.Listings.Count , y = price_per_night, color=Host.Total.Listings.Count )) +geom_point(size=0.1) +
  labs(title = "Total host listing counts vs Price")
## Warning: Removed 260 rows containing missing values (`geom_point()`).

This is also makes sense with the market. The lower the price, the more bookings.

Let’s look at the distribution of property type.

df_no_NA <-subset(df,Property.Type != "N/A" & Property.Type != "")

ggplot(data = df_no_NA, aes(x = Property.Type, y = price_per_night,color=Property.Type)) +geom_boxplot(outlier.shape = NA) +theme(axis.text.x = element_text(angle = 90, hjust = 1)) +coord_cartesian(ylim = c(0, 1700))+
  labs(title = "Property Types Distribution")

It seems like townhouse play the major role in this market. However, it is interesting to see such a wide range of different property types, such as castle, train, tent,…

city_to_state <- df |>
  group_by(City, state) |>
    count() |> 
    filter(!is.na(state)) |> 
    arrange(City) |> 
    ungroup() |> 
    distinct(City, .keep_all=TRUE) |>
    select(City, state) 
df_city <- df |> left_join(city_to_state, by="City", suffix=c("_sf", "_imputed"))

# creating a unified state field based on the simple features value if present and the imputed value otherwise
df_city <- df_city |> 
  mutate(state = ifelse(is.na(state_sf), state_imputed, state_sf)) |>
  select(-state_sf, -state_imputed)
head(df_city, 2)
##         ID                           Listing.Url      Scrape.ID Last.Scraped
## 1  4917301  https://www.airbnb.com/rooms/4917301 20170502172350   2017-05-02
## 2 18240041 https://www.airbnb.com/rooms/18240041 20170502172350   2017-05-03
##                                             Name
## 1                            Studio Under House!
## 2 4BD/3.5Bth Pool Home w/ Game Room in the Hills
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              Summary
## 1                                                                                                                                                                                                                                                                                            This studio is fantastic! Such a beautiful space. Fits 1, or more, depends on if you want to sleep on air mattress and a bed, or bed only! 3 beds in the room - 1 queen bed, 1 twin, and 1 air mattress
## 2 Charming and grand, yet comfortable pool home! The house has 4 bedrooms, 3.5 bathrooms, an office, recreation room with 9' pool/ping pong table and wet-bar, in-home laundry machines, 2-car detached garage + parking space for one more vehicle. Located with amazing views of the hills of Woodland Hills. Close proximity to the famous Mulholland Dr, Woodland Hills Country Club, DTLA, West LA (Santa Monica/Beverly Hills/Malibu/ETC), the 101 Freeway, and plenty of shopping and dining!
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               Space
## 1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  
## 2 -4 bedrooms (each w/ queen beds) -3.5 bathrooms -70" & 55" LED TVs w/ cable -Full kitchen available: stove, fridge, microwave, oven, blender, coffee maker, pantry, cooking & eating utensils, plateware -Dining & living rooms -Recreation room: 9' pool & ping pong table, custom built wet-bar, 55" LED TV w/ cable, access to backyard -Smart home tech: voice activated lights in foyer, living & dining rooms, voice activated thermostats, voice activated front door lock (all with Amazon Echo) -Clean linens & towels upon your arrival
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               Description
## 1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 This studio is fantastic! Such a beautiful space. Fits 1, or more, depends on if you want to sleep on air mattress and a bed, or bed only! 3 beds in the room - 1 queen bed, 1 twin, and 1 air mattress
## 2 Charming and grand, yet comfortable pool home! The house has 4 bedrooms, 3.5 bathrooms, an office, recreation room with 9' pool/ping pong table and wet-bar, in-home laundry machines, 2-car detached garage + parking space for one more vehicle. Located with amazing views of the hills of Woodland Hills. Close proximity to the famous Mulholland Dr, Woodland Hills Country Club, DTLA, West LA (Santa Monica/Beverly Hills/Malibu/ETC), the 101 Freeway, and plenty of shopping and dining! -4 bedrooms (each w/ queen beds) -3.5 bathrooms -70" & 55" LED TVs w/ cable -Full kitchen available: stove, fridge, microwave, oven, blender, coffee maker, pantry, cooking & eating utensils, plateware -Dining & living rooms -Recreation room: 9' pool & ping pong table, custom built wet-bar, 55" LED TV w/ cable, access to backyard -Smart home tech: voice activated lights in foyer, living & dining rooms, voice activated thermostats, voice activated front door lock (all with Amazon Echo) -Clean linens & towels upon
##   Experiences.Offered
## 1                none
## 2                none
##                                                                                                                                                                                                                                                                                                                                                            Neighborhood.Overview
## 1                                                                                                                                                                                                                                                                                                                                                                               
## 2 Woodland Hills is a very nice and affluent city within the greater Los Angeles Metropolitan area. The neighborhood is very safe with families regularly out for walks in the evenings. Our house is located in the hills with views of the Woodland Hills and homes. At just a block from Ventura Blvd, our home is conveniently located near plenty of restaurants and shops.
##   Notes Transit
## 1              
## 2              
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               Access
## 1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   
## 2 Guests can use anything inside the house. Feel free to play around with some of the smart home tech we have installed! With the Amazon Echo in the living room, you can tell "Alexa" to do certain tasks!  Some things you can say:  -"Alexa, turn on/off the living room lights" -"Alexa, turn on/off the dining room lights" -"Alexa, turn on/off the foyer lights" -"Alexa, set the temperature to 70 degrees" -"Alexa, lock/unlock the front door"  -"Alexa, how is the weather in Woodland Hills/LA/Pasadena/etc" -"Alexa, tell me the news" -"Alexa, tell me a joke" -"Alexa, sing me a song" Give it a try!
##                                                                                                              Interaction
## 1                                                                                                                       
## 2 I am available via text messages. Please text me first as I am not always available to talk, and I will call you back!
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             House.Rules
## 1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      
## 2 - No rearranging or modifications to any of the rooms without host approval. - No shoes inside the house.  - No loud noise after 10pm in respect to our neighbors. - No unsupervised children in the pool. We are not liable should anything happen.  - If you need to ship any packages/mail/parcels to our home, we are not liable should you not receive them. It is your responsibility to take delivery. - If parking any vehicles in the driveway, please park horizontally so as to not block the street with the tail of your car.  - NO PARTIES! If we find that there have been any parties, we will keep the entire deposit.  - Please make sure to wash any dishes, utensils, glasses/mugs, and cookingware that is used.
##                                                                                   Thumbnail.Url
## 1           https://a0.muscache.com/im/pictures/61653511/dac2562e_original.jpg?aki_policy=small
## 2 https://a0.muscache.com/im/pictures/7ad6459a-2c55-437f-9495-52244635b524.jpg?aki_policy=small
##                                                                                       Medium.Url
## 1           https://a0.muscache.com/im/pictures/61653511/dac2562e_original.jpg?aki_policy=medium
## 2 https://a0.muscache.com/im/pictures/7ad6459a-2c55-437f-9495-52244635b524.jpg?aki_policy=medium
##                                                                                                      Picture.Url
## 1 https://public.opendatasoft.com/api/v2/catalog/datasets/airbnb-listings/files/cbb8358379daaf0cc74d8c30f0f82e12
## 2 https://public.opendatasoft.com/api/v2/catalog/datasets/airbnb-listings/files/41b1c3cc72d3567d4e6dc39d6e85b4ff
##                                                                                    XL.Picture.Url
## 1           https://a0.muscache.com/im/pictures/61653511/dac2562e_original.jpg?aki_policy=x_large
## 2 https://a0.muscache.com/im/pictures/7ad6459a-2c55-437f-9495-52244635b524.jpg?aki_policy=x_large
##    Host.ID                                   Host.URL Host.Name Host.Since
## 1 24035721 https://www.airbnb.com/users/show/24035721   Michael 2014-11-22
## 2 57197381 https://www.airbnb.com/users/show/57197381    Daniel 2016-02-02
##                            Host.Location
## 1 Los Angeles, California, United States
## 2     Arcadia, California, United States
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            Host.About
## 1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    
## 2 A 25 year old Taiwanese/Korean American who loves to travel. Currently working as a property manager for 18 vacation rental properties and 3 multi-family residences throughout Southern California. As a property manager, I am fully aware of the ins and outs of Airbnb and vacation rentals in general, and I'll make sure your property is treated as if it were my own!\n\nYou can call me a:\n-Film Buff \n-Auto Enthusiast\n-Music Lover\n-Sports Fanatic\n-Tech Geek\n-Outdoor Traveller\n\nThrough my travels, I've realized the best culture comes from meeting new people. Whether staying in hostels, or meeting Airbnb hosts, the culture does not necessarily come from museum tours, art exhibits, or trying different ethnic foods. Culture comes from social interaction. I hope to learn a little something about your village/town/city/state/country as a guest in your house.
##   Host.Response.Time Host.Response.Rate Host.Acceptance.Rate
## 1                                    NA                     
## 2       within a day                100                     
##                                                                                       Host.Thumbnail.Url
## 1 https://a0.muscache.com/im/users/24035721/profile_pic/1419823516/original.jpg?aki_policy=profile_small
## 2  https://a0.muscache.com/im/pictures/bf63b574-7d86-4460-b501-a69af78c2de1.jpg?aki_policy=profile_small
##                                                                                            Host.Picture.Url
## 1 https://a0.muscache.com/im/users/24035721/profile_pic/1419823516/original.jpg?aki_policy=profile_x_medium
## 2  https://a0.muscache.com/im/pictures/bf63b574-7d86-4460-b501-a69af78c2de1.jpg?aki_policy=profile_x_medium
##             Host.Neighbourhood Host.Listings.Count Host.Total.Listings.Count
## 1 Woodland Hills/Warner Center                   1                         1
## 2 Woodland Hills/Warner Center                   1                         1
##                 Host.Verifications
## 1                email,phone,jumio
## 2 email,phone,facebook,reviews,kba
##                                                               Street
## 1 Woodland Hills/Warner Center, Los Angeles, CA 91364, United States
## 2 Woodland Hills/Warner Center, Los Angeles, CA 91364, United States
##                  Neighbourhood Neighbourhood.Cleansed
## 1 Woodland Hills/Warner Center         Woodland Hills
## 2 Woodland Hills/Warner Center         Woodland Hills
##   Neighbourhood.Group.Cleansed        City State Zipcode      Market
## 1                              Los Angeles    CA   91364 Los Angeles
## 2                              Los Angeles    CA   91364 Los Angeles
##    Smart.Location Country.Code       Country Latitude Longitude Property.Type
## 1 Los Angeles, CA           US United States 34.14776 -118.5913     Apartment
## 2 Los Angeles, CA           US United States 34.16460 -118.6005         House
##         Room.Type Accommodates Bathrooms Bedrooms Beds Bed.Type
## 1    Private room            2       1.0        1    1 Real Bed
## 2 Entire home/apt            8       3.5        4    4 Real Bed
##                                                                                                                                                                                                                                                                                   Amenities
## 1                                                                                                                                                 TV,Wireless Internet,Air conditioning,Free parking on premises,Hot tub,Heating,Smoke detector,Carbon monoxide detector,Essentials,Shampoo
## 2 Wireless Internet,Air conditioning,Pool,Kitchen,Free parking on premises,Pets allowed,Indoor fireplace,Heating,Family/kid friendly,Washer,Dryer,Smoke detector,Carbon monoxide detector,First aid kit,Safety card,Essentials,Shampoo,Hangers,Hair dryer,Iron,Laptop friendly workspace,TV
##   Square.Feet Price Weekly.Price Monthly.Price Security.Deposit Cleaning.Fee
## 1          NA   150           NA            NA               NA           NA
## 2          NA   175           NA            NA              300          150
##   Guests.Included Extra.People Minimum.Nights Maximum.Nights Calendar.Updated
## 1               1            0              1           1125    29 months ago
## 2               8           25              2           1125            today
##   Has.Availability Availability.30 Availability.60 Availability.90
## 1                               30              60              90
## 2                                9              19              28
##   Availability.365 Calendar.last.Scraped Number.of.Reviews First.Review
## 1              365            2017-05-02                 0             
## 2               28            2017-05-03                 0             
##   Last.Review Review.Scores.Rating Review.Scores.Accuracy
## 1                               NA                     NA
## 2                               NA                     NA
##   Review.Scores.Cleanliness Review.Scores.Checkin Review.Scores.Communication
## 1                        NA                    NA                          NA
## 2                        NA                    NA                          NA
##   Review.Scores.Location Review.Scores.Value License      Jurisdiction.Names
## 1                     NA                  NA         City of Los Angeles, CA
## 2                     NA                  NA         City of Los Angeles, CA
##   Cancellation.Policy Calculated.host.listings.count Reviews.per.Month
## 1            flexible                              1                NA
## 2            flexible                              1                NA
##                              Geolocation
## 1  34.14775649234191,-118.59133780081316
## 2 34.164597238320674,-118.60051225075456
##                                                        Features price_per_night
## 1                        Host Has Profile Pic,Is Location Exact             150
## 2 Host Has Profile Pic,Host Identity Verified,Is Location Exact              88
##   neighbourhood_group      state
## 1                     California
## 2                     California
df_city |>
    ggplot(aes(x=price_per_night)) + 
    geom_histogram(bins=40, fill="#69b3a2", color="#e9ecef", alpha=0.9) + 
    xlim(0, 3000) + xlab("price per night")
## Warning: Removed 2 rows containing missing values (`geom_bar()`).

    ggtitle("Distribution of AirBnb Prices in US Dataset")
## $title
## [1] "Distribution of AirBnb Prices in US Dataset"
## 
## attr(,"class")
## [1] "labels"
ggplot(data = df, aes(x = Cancellation.Policy, y = price_per_night,color=Cancellation.Policy)) +
  geom_boxplot(outlier.shape = NA) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  coord_cartesian(ylim = c(0, 3000))

We can see that the prices are slightly more expensive for the listings that have a strict cancellation policy

# create copy of dataset
data_clean <- df 

# remove values with price of $10 or lower
data_clean <- data_clean %>% filter(price_per_night > 10)

# get rid of the top 5% percentile of prices in each city
data_clean = data_clean %>% 
    group_by(City) %>% 
    mutate(price_percentile = rank(price_per_night, ties.method="first") / length(price_per_night)) %>% 
    filter(price_percentile < 0.9) %>% 
    ungroup()
top_states <- number_of_listings_by_state|>
  arrange(desc(V1))|>
  head(7)
top_states
##                  state    V1
## 1           California 43481
## 2             New York 37057
## 3                Texas  9031
## 4 District of Columbia  7169
## 5            Louisiana  5203
## 6             Illinois  4748
## 7           Washington  3657
df_city %>% filter(state %in% top_states$state) %>% 
    ggplot(aes(x=state, y=price_per_night, fill=state)) + 
    geom_boxplot(alpha=0.9) + 
    ylim(0, 1000) +
    ggtitle("Price distribution by rich States")  + 
    theme(axis.text.x=element_text(angle=45, hjust=1)) + 
    scale_fill_viridis_d()

This is also suspicious - why are New York Airbnbs cheaper than other states? Potentially, this could be down to the types of rooms on offer.

data_clean %>% filter(state %in% top_states$state) %>%

    mutate(state = ifelse(state == "New York", "New York", "Other Top States"))  %>%
    ggplot(aes(x=state, y=price_per_night, fill=Room.Type, order = (Room.Type) )) + 
    geom_bar(position = position_fill(reverse = TRUE), stat = "identity") + 
    ggtitle("Breakdown by type of room")  + 
    theme(axis.text.x=element_text(angle=45, hjust=1)) + 
    scale_fill_viridis_d()

Looking at the breakdown, we see that NewYork offers more private room than Entire home/apt, and the price of private room abviously cheaper than the entire home, which in average will bring New York relatively cheaper than other top states.

Review analysis

colnames(df)
##  [1] "ID"                             "Listing.Url"                   
##  [3] "Scrape.ID"                      "Last.Scraped"                  
##  [5] "Name"                           "Summary"                       
##  [7] "Space"                          "Description"                   
##  [9] "Experiences.Offered"            "Neighborhood.Overview"         
## [11] "Notes"                          "Transit"                       
## [13] "Access"                         "Interaction"                   
## [15] "House.Rules"                    "Thumbnail.Url"                 
## [17] "Medium.Url"                     "Picture.Url"                   
## [19] "XL.Picture.Url"                 "Host.ID"                       
## [21] "Host.URL"                       "Host.Name"                     
## [23] "Host.Since"                     "Host.Location"                 
## [25] "Host.About"                     "Host.Response.Time"            
## [27] "Host.Response.Rate"             "Host.Acceptance.Rate"          
## [29] "Host.Thumbnail.Url"             "Host.Picture.Url"              
## [31] "Host.Neighbourhood"             "Host.Listings.Count"           
## [33] "Host.Total.Listings.Count"      "Host.Verifications"            
## [35] "Street"                         "Neighbourhood"                 
## [37] "Neighbourhood.Cleansed"         "Neighbourhood.Group.Cleansed"  
## [39] "City"                           "State"                         
## [41] "Zipcode"                        "Market"                        
## [43] "Smart.Location"                 "Country.Code"                  
## [45] "Country"                        "Latitude"                      
## [47] "Longitude"                      "Property.Type"                 
## [49] "Room.Type"                      "Accommodates"                  
## [51] "Bathrooms"                      "Bedrooms"                      
## [53] "Beds"                           "Bed.Type"                      
## [55] "Amenities"                      "Square.Feet"                   
## [57] "Price"                          "Weekly.Price"                  
## [59] "Monthly.Price"                  "Security.Deposit"              
## [61] "Cleaning.Fee"                   "Guests.Included"               
## [63] "Extra.People"                   "Minimum.Nights"                
## [65] "Maximum.Nights"                 "Calendar.Updated"              
## [67] "Has.Availability"               "Availability.30"               
## [69] "Availability.60"                "Availability.90"               
## [71] "Availability.365"               "Calendar.last.Scraped"         
## [73] "Number.of.Reviews"              "First.Review"                  
## [75] "Last.Review"                    "Review.Scores.Rating"          
## [77] "Review.Scores.Accuracy"         "Review.Scores.Cleanliness"     
## [79] "Review.Scores.Checkin"          "Review.Scores.Communication"   
## [81] "Review.Scores.Location"         "Review.Scores.Value"           
## [83] "License"                        "Jurisdiction.Names"            
## [85] "Cancellation.Policy"            "Calculated.host.listings.count"
## [87] "Reviews.per.Month"              "Geolocation"                   
## [89] "Features"                       "price_per_night"               
## [91] "neighbourhood_group"            "state"
scores <- c("Review.Scores.Rating","Review.Scores.Accuracy","Review.Scores.Cleanliness","Review.Scores.Checkin","Review.Scores.Communication","Review.Scores.Location","Review.Scores.Value")
scores_data <- df[scores]
scores_data <- removeRowsWithNA(scores_data, scores)
library(cowplot)
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:lubridate':
## 
##     stamp
library(gridExtra)
a <-ggplot(data = scores_data, aes(x = Review.Scores.Accuracy, y = Review.Scores.Rating )) +
  geom_jitter(size = 0.1) + xlab("accuracy") +ylab("Rating")

b <- ggplot(data = scores_data, aes(x = Review.Scores.Cleanliness, y = Review.Scores.Rating )) + 
  geom_jitter(size = 0.1)   + xlab("cleanliness") +ylab("Rating")
c <- ggplot(data = scores_data, aes(x = Review.Scores.Checkin, y = Review.Scores.Rating )) +
  geom_jitter(size = 0.1)+ xlab("checkin") +ylab("Rating")

d <-  ggplot(data = scores_data, aes(x = Review.Scores.Communication, y = Review.Scores.Rating )) +
  geom_jitter(size = 0.1)+ xlab("Communication") +ylab("Rating")

e <- ggplot(data = scores_data, aes(x = Review.Scores.Location, y = Review.Scores.Rating )) +
  geom_jitter(size = 0.1)+ xlab("Location") +ylab("Rating")

f <-  ggplot(data = scores_data, aes(x = Review.Scores.Value, y = Review.Scores.Rating )) +
  geom_jitter(size = 0.1)+ xlab("Value")+ylab("Rating")

grid.arrange(a, b, c, d, e, f , ncol = 2, nrow = 3)

From the plots, we can see that most of the people who give the listings high ratings, give high scores for all the other types of scores (denser in right top corners).

Host behaviors vs Price

df_no_NA <-subset(df,Host.Response.Time != "N/A" & Host.Response.Time != "")
ggplot(data = df_no_NA, aes(x = Host.Response.Time, y = price_per_night,color=Host.Response.Time))+
  geom_boxplot(outlier.shape = NA) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))+coord_cartesian(ylim = c(0, 400))

ggplot(data = df, aes(x = Host.Response.Rate, y = price_per_night, color=Host.Response.Rate)) +
  geom_point(size=0.5)
## Warning: Removed 27906 rows containing missing values (`geom_point()`).

ggplot(data = df, aes(x = Host.Response.Rate, y = Host.Total.Listings.Count, color=Host.Response.Rate)) +
  geom_point(size=0.5)
## Warning: Removed 27906 rows containing missing values (`geom_point()`).

ggplot(data = df, aes(x = Cleaning.Fee, y = Host.Total.Listings.Count, color=Cleaning.Fee)) +
  geom_point(size=0.5)
## Warning: Removed 34191 rows containing missing values (`geom_point()`).

Modeling and Prediction

Relationship between ratings.

Let’s start with a basic model linear regression between all the rating reviews.

rating_regression <- lm(data=df, Review.Scores.Rating~Review.Scores.Accuracy+Review.Scores.Cleanliness+Review.Scores.Checkin+Review.Scores.Communication+Review.Scores.Location+Review.Scores.Value)
summary(rating_regression)
## 
## Call:
## lm(formula = Review.Scores.Rating ~ Review.Scores.Accuracy + 
##     Review.Scores.Cleanliness + Review.Scores.Checkin + Review.Scores.Communication + 
##     Review.Scores.Location + Review.Scores.Value, data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -58.04  -1.72   0.28   1.28  46.66 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 -6.14158    0.23937  -25.66   <2e-16 ***
## Review.Scores.Accuracy       2.20828    0.02414   91.46   <2e-16 ***
## Review.Scores.Cleanliness    2.21864    0.01786  124.24   <2e-16 ***
## Review.Scores.Checkin        0.99444    0.02868   34.68   <2e-16 ***
## Review.Scores.Communication  1.64802    0.02976   55.38   <2e-16 ***
## Review.Scores.Location       0.77998    0.01895   41.15   <2e-16 ***
## Review.Scores.Value          2.63684    0.02298  114.77   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.906 on 96150 degrees of freedom
##   (29117 observations deleted due to missingness)
## Multiple R-squared:  0.7344, Adjusted R-squared:  0.7343 
## F-statistic: 4.43e+04 on 6 and 96150 DF,  p-value: < 2.2e-16

The model has good p-value (<2.2e-16). All the factors are significant. Let’s plot this and observe its behavior:

ggplot(data = rating_regression, aes(Review.Scores.Rating,Review.Scores.Accuracy+Review.Scores.Cleanliness+Review.Scores.Checkin+Review.Scores.Communication+Review.Scores.Location+Review.Scores.Value)) +
  geom_point()+
  geom_smooth(method = 'lm')
## `geom_smooth()` using formula = 'y ~ x'

Let’s plot more plots to see clearer the relationship.

plot(rating_regression)

The “Normal Q-Q” plot shows if residuals are normally distributed. Our residuals are not well lined on the straight dashed line except in the middle of the plot, which is not quite good.

The “Scale-Location” plot lets us check the assumption of equal variance. Our line is not horizontal with randomly spread points, thus, our residuals are not homoscedastic. This was expected, since from the previous plots of the different types of ratings, we could clearly see that the variance depends on the score.

The “Residuals vs Leverage” plot helps us find influential cases. In fact, even though data has outliers, they might not be influential to determine a regression line. In our plot, we can barely see Cook’s distance lines because all cases are well inside of them. i.e: if we exclude the “52474” case for example, the changes in the slope coefficients won’t be important.

Relation between Price and its factors

price_regression <- p_reg1 <- lm(data=df, price_per_night~Host.Response.Rate+Host.Acceptance.Rate+Host.Total.Listings.Count+Property.Type+Room.Type+Accommodates+Bathrooms+Bedrooms+Beds+Bed.Type+Square.Feet+Security.Deposit+Cleaning.Fee+Extra.People+Minimum.Nights+Maximum.Nights+Number.of.Reviews+Cancellation.Policy+State)
summary(price_regression)
## 
## Call:
## lm(formula = price_per_night ~ Host.Response.Rate + Host.Acceptance.Rate + 
##     Host.Total.Listings.Count + Property.Type + Room.Type + Accommodates + 
##     Bathrooms + Bedrooms + Beds + Bed.Type + Square.Feet + Security.Deposit + 
##     Cleaning.Fee + Extra.People + Minimum.Nights + Maximum.Nights + 
##     Number.of.Reviews + Cancellation.Policy + State, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -149.54  -32.08   -7.00   15.27  660.64 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                         1.196e+02  3.929e+01   3.043 0.002424 ** 
## Host.Response.Rate                 -4.497e-01  2.087e-01  -2.155 0.031473 *  
## Host.Acceptance.Rate0%             -2.236e+01  2.994e+01  -0.747 0.455494    
## Host.Acceptance.Rate100%           -1.250e+01  1.225e+01  -1.020 0.307830    
## Host.Acceptance.Rate31%            -4.929e+00  7.127e+01  -0.069 0.944879    
## Host.Acceptance.Rate33%             2.008e+01  5.179e+01   0.388 0.698378    
## Host.Acceptance.Rate50%            -3.879e+01  3.678e+01  -1.055 0.291932    
## Host.Acceptance.Rate56%            -2.652e+01  4.688e+01  -0.566 0.571762    
## Host.Acceptance.Rate58%            -4.789e+01  7.139e+01  -0.671 0.502491    
## Host.Acceptance.Rate63%            -1.775e+02  7.829e+01  -2.267 0.023666 *  
## Host.Acceptance.Rate67%            -2.678e+01  5.262e+01  -0.509 0.610953    
## Host.Acceptance.Rate68%             2.500e+02  7.112e+01   3.515 0.000466 ***
## Host.Acceptance.Rate71%             5.685e+00  3.680e+01   0.154 0.877262    
## Host.Acceptance.Rate75%            -5.647e+01  7.122e+01  -0.793 0.428049    
## Host.Acceptance.Rate76%            -1.094e+01  5.050e+01  -0.217 0.828477    
## Host.Acceptance.Rate78%            -5.153e+00  5.420e+01  -0.095 0.924282    
## Host.Acceptance.Rate80%            -4.977e+01  7.091e+01  -0.702 0.482950    
## Host.Acceptance.Rate82%            -9.241e-01  7.137e+01  -0.013 0.989672    
## Host.Acceptance.Rate83%            -9.152e+00  5.119e+01  -0.179 0.858156    
## Host.Acceptance.Rate86%            -6.195e+01  4.235e+01  -1.463 0.143968    
## Host.Acceptance.Rate87%            -6.780e+01  6.196e+01  -1.094 0.274146    
## Host.Acceptance.Rate88%            -5.438e+01  4.188e+01  -1.299 0.194508    
## Host.Acceptance.Rate89%             5.936e+01  7.140e+01   0.831 0.405997    
## Host.Acceptance.Rate90%             4.168e+00  7.112e+01   0.059 0.953290    
## Host.Acceptance.Rate91%            -1.037e+02  3.329e+01  -3.115 0.001912 ** 
## Host.Acceptance.Rate92%            -4.572e+01  3.933e+01  -1.162 0.245411    
## Host.Acceptance.Rate93%             2.097e+00  7.118e+01   0.029 0.976504    
## Host.Acceptance.Rate94%            -4.977e+01  7.657e+01  -0.650 0.515856    
## Host.Acceptance.Rate95%             2.948e+01  5.041e+01   0.585 0.558958    
## Host.Acceptance.Rate96%            -4.536e+01  4.173e+01  -1.087 0.277383    
## Host.Acceptance.Rate97%            -3.110e+01  4.124e+01  -0.754 0.451054    
## Host.Acceptance.Rate98%            -1.429e+01  4.280e+01  -0.334 0.738593    
## Host.Acceptance.Rate99%            -1.730e+01  7.300e+01  -0.237 0.812755    
## Host.Total.Listings.Count           1.465e-01  3.603e-01   0.407 0.684405    
## Property.TypeBed & Breakfast       -3.657e+01  4.384e+01  -0.834 0.404483    
## Property.TypeBungalow              -5.356e+00  2.594e+01  -0.206 0.836470    
## Property.TypeCabin                  1.224e+01  4.153e+01   0.295 0.768238    
## Property.TypeCamper/RV             -1.741e+01  4.193e+01  -0.415 0.678048    
## Property.TypeCondominium            9.010e+00  1.665e+01   0.541 0.588453    
## Property.TypeGuest suite           -3.489e+01  5.026e+01  -0.694 0.487772    
## Property.TypeGuesthouse             5.393e+01  3.624e+01   1.488 0.137167    
## Property.TypeHouse                 -8.470e+00  6.633e+00  -1.277 0.201984    
## Property.TypeLoft                   2.178e+01  1.622e+01   1.343 0.179676    
## Property.TypeOther                  1.134e+02  2.726e+01   4.160 3.55e-05 ***
## Property.TypeTownhouse             -3.633e+01  2.506e+01  -1.449 0.147617    
## Property.TypeVilla                  4.721e+01  7.138e+01   0.661 0.508566    
## Room.TypePrivate room              -6.104e+00  7.648e+00  -0.798 0.425054    
## Room.TypeShared room               -6.015e+01  3.790e+01  -1.587 0.112922    
## Accommodates                        1.303e+01  2.113e+00   6.164 1.15e-09 ***
## Bathrooms                           1.106e+01  6.290e+00   1.759 0.079013 .  
## Bedrooms                           -1.389e+01  5.046e+00  -2.753 0.006053 ** 
## Beds                               -4.000e-01  2.836e+00  -0.141 0.887861    
## Bed.TypeCouch                       9.413e+01  5.400e+01   1.743 0.081733 .  
## Bed.TypeFuton                      -3.018e+01  3.663e+01  -0.824 0.410180    
## Bed.TypePull-out Sofa              -2.080e+01  4.007e+01  -0.519 0.603831    
## Bed.TypeReal Bed                   -1.892e+01  3.235e+01  -0.585 0.558882    
## Square.Feet                         7.674e-03  4.178e-03   1.837 0.066648 .  
## Security.Deposit                    6.908e-02  1.688e-02   4.093 4.71e-05 ***
## Cleaning.Fee                        3.720e-01  8.125e-02   4.578 5.49e-06 ***
## Extra.People                        3.091e-01  1.064e-01   2.906 0.003763 ** 
## Minimum.Nights                     -1.347e+01  1.078e+00 -12.495  < 2e-16 ***
## Maximum.Nights                     -5.103e-08  3.646e-08  -1.400 0.162043    
## Number.of.Reviews                  -2.148e-02  4.579e-02  -0.469 0.639131    
## Cancellation.Policymoderate        -3.042e+01  1.336e+01  -2.276 0.023103 *  
## Cancellation.Policystrict          -2.337e+01  1.313e+01  -1.779 0.075613 .  
## Cancellation.Policysuper_strict_30 -4.145e+01  5.205e+01  -0.796 0.426108    
## StateCO                            -2.601e-01  1.905e+01  -0.014 0.989111    
## StateDC                            -3.764e+01  1.520e+01  -2.477 0.013482 *  
## StateIL                            -2.643e+01  2.042e+01  -1.294 0.196036    
## StateLA                            -5.269e+00  1.248e+01  -0.422 0.673024    
## StateMA                             4.763e+00  2.836e+01   0.168 0.866689    
## StateNY                            -1.903e+01  7.348e+00  -2.589 0.009805 ** 
## StateOR                            -1.806e+01  1.889e+01  -0.956 0.339312    
## StateTN                             4.683e+01  2.975e+01   1.574 0.115879    
## StateTX                             1.281e+01  8.985e+00   1.426 0.154205    
## StateWA                            -1.308e+01  1.583e+01  -0.826 0.408812    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 70.52 on 752 degrees of freedom
##   (124446 observations deleted due to missingness)
## Multiple R-squared:  0.4537, Adjusted R-squared:  0.3992 
## F-statistic: 8.328 on 75 and 752 DF,  p-value: < 2.2e-16

For this model, we have good p-value (<2.2e-16) with good R-squred (0.4537).

plot(price_regression)
## Warning: not plotting observations with leverage one:
##   94, 556, 627, 662, 713, 784, 812

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced