# Title: Exploratory Data Analysis on Airbnb Reviews
# Table of Contents
# 1. Executive Summary
# 2. Introduction
# 3. Data Loading and Preliminary Analysis
# 4. Exploratory Data Analysis
# 4.1 Text Length Distribution
# 4.2 Popular Neighborhoods Analysis
# 4.3 Sentiment Analysis
# 4.4 Word Frequency Analysis
# 5. Word Cloud Visualization
# 6. Concluding Remarks
# 7. References
# 1. Executive Summary
# This report provides an exploratory data analysis on Airbnb reviews, focusing on sentiment analysis
# and common words in the comments. The analysis includes text length distribution, popular neighborhoods,
# sentiment scores, and word frequency. Visualizations such as histograms, bar plots, and word clouds are used
# to present the findings.
# 2. Introduction
# Airbnb reviews contain valuable insights into customer sentiment and preferences. This analysis aims to uncover
# patterns in customer reviews, identify popular neighborhoods, and analyze sentiment scores.
# 3. Data Loading and Preliminary Analysis
# Load necessary libraries
library(stringr)
## Warning: package 'stringr' was built under R version 4.3.2
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.2
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.3.2
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(topicmodels)
## Warning: package 'topicmodels' was built under R version 4.3.2
library(wordcloud2)
## Warning: package 'wordcloud2' was built under R version 4.3.2
# Fetch the AFINN sentiment lexicon through tidytext
sentiment_scores <- get_sentiments(lexicon = "afinn")
# Read the raw reviews; the path is resolved against the working directory
airbnb_reviews <- read.csv(file = "airbnb_reviews.csv")
# Flag each row whose comment text already appeared in an earlier row,
# then report how many such rows there are
is_repeat_comment <- duplicated(airbnb_reviews$comments)
duplicates <- airbnb_reviews[is_repeat_comment, ]
cat("Number of duplicates:", nrow(duplicates), "\n")
## Number of duplicates: 4446
# Keep only the first occurrence of each distinct comment text.
# NOTE(review): nothing in the visible script reads `airbnb_reviews_unique`;
# every later step works on `airbnb_reviews`, duplicates included. Confirm
# whether the de-duplicated frame was meant to feed the analysis.
airbnb_reviews_unique <- airbnb_reviews[!duplicated(airbnb_reviews$comments), ]
# Add "comments_len": the number of characters in each comment. str_length()
# counts every character, whereas counting matches of the regex "." skips
# line breaks and so under-reports multi-line comments.
airbnb_reviews$comments_len <- str_length(airbnb_reviews$comments)
# Show summary statistics of the data
summary(airbnb_reviews)
## listing_id date comments neighbourhood
## Length:149905 Length:149905 Length:149905 Length:149905
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## property_type accommodates price review_scores_value
## Length:149905 Min. : 1.00 Length:149905 Min. : 2.000
## Class :character 1st Qu.: 2.00 Class :character 1st Qu.: 9.000
## Mode :character Median : 2.00 Mode :character Median : 9.000
## Mean : 3.07 Mean : 9.235
## 3rd Qu.: 4.00 3rd Qu.:10.000
## Max. :16.00 Max. :10.000
## NA's :287
## comments_len
## Min. : 0.0
## 1st Qu.: 144.0
## Median : 261.0
## Mean : 336.9
## 3rd Qu.: 435.0
## Max. :3109.0
##
# 4. Exploratory Data Analysis
# 4.1 Text Length Distribution
# Histogram of comment lengths, using bins 10 units wide
ggplot(data = airbnb_reviews, mapping = aes(x = comments_len)) +
  geom_histogram(
    binwidth = 10,
    fill = "blue",
    color = "black",
    alpha = 0.7
  ) +
  ggtitle("Distribution of Text Lengths") +
  xlab("Comments Length") +
  ylab("Frequency")

# 4.2 Popular Neighborhoods Analysis
# Count the reviews in each neighbourhood ...
neighbourhood_totals <- airbnb_reviews %>%
  group_by(neighbourhood) %>%
  summarize(total_reviews = n())
# ... and rank the neighbourhoods from most to least reviewed
popular_neighborhoods <- arrange(neighbourhood_totals, desc(total_reviews))
# Horizontal bar chart of review totals, bars ordered by total
ggplot(
  data = popular_neighborhoods,
  mapping = aes(x = reorder(neighbourhood, total_reviews), y = total_reviews)
) +
  geom_col(fill = "blue") +
  coord_flip() +
  ggtitle("Popular Neighborhoods by Total Reviews") +
  xlab("Neighborhood") +
  ylab("Total Reviews")

# Take the first three neighbourhoods of the ranked table and keep only
# the reviews that belong to them
top_neighborhoods <- head(popular_neighborhoods$neighbourhood, n = 3)
airbnb_reviews_top <- filter(
  airbnb_reviews,
  neighbourhood %in% top_neighborhoods
)
# 4.3 Sentiment Analysis
# Tokenise the top-neighbourhood reviews into words, drop stop words and any
# token containing a digit, then attach AFINN scores. left_join keeps words
# without a score (value = NA), so a listing whose remaining words are all
# unscored still appears with a total of 0.
# Join keys and the grouping of the result are spelled out, so the pipeline
# no longer depends on dplyr's defaults and runs without its "Joining with"
# and "grouped output" messages.
# The score is a sum over all of a listing's tokens, so it grows with the
# amount of review text a listing has.
tidy_reviews_sentiment <- airbnb_reviews_top %>%
  unnest_tokens(word, comments) %>%
  anti_join(stop_words, by = "word") %>%
  filter(!str_detect(word, "\\d+")) %>%
  left_join(sentiment_scores, by = "word") %>%
  group_by(neighbourhood, listing_id) %>%
  summarize(
    sentiment_score = sum(value, na.rm = TRUE),
    .groups = "drop"
  )
# Box plot of the per-listing sentiment totals, one box per neighbourhood
ggplot(
  data = tidy_reviews_sentiment,
  mapping = aes(x = neighbourhood, y = sentiment_score, fill = neighbourhood)
) +
  geom_boxplot() +
  ggtitle("Sentiment Scores by Neighborhood") +
  xlab("Neighborhood") +
  ylab("Sentiment Score")

# 4.4 Word Frequency Analysis
# Tokenise the top-neighbourhood comments into one row per word, dropping
# stop words and any token that contains a digit. The join key is named
# explicitly so anti_join() does not have to infer it (and stays silent).
tidy_comments <- airbnb_reviews_top %>%
  unnest_tokens(word, comments) %>%
  anti_join(stop_words, by = "word") %>%
  filter(!str_detect(word, "\\d+"))
# Display the first few rows of the tidy data
head(tidy_comments)
## listing_id date neighbourhood property_type accommodates price
## 1 x15194207 2016-10-04 Harlem Apartment 4 $90.00
## 2 x15194207 2016-10-04 Harlem Apartment 4 $90.00
## 3 x15194207 2016-10-04 Harlem Apartment 4 $90.00
## 4 x15194207 2016-10-04 Harlem Apartment 4 $90.00
## 5 x15194207 2016-10-04 Harlem Apartment 4 $90.00
## 6 x15194207 2016-10-04 Harlem Apartment 4 $90.00
## review_scores_value comments_len word
## 1 10 137 welcoming
## 2 10 137 pretty
## 3 10 137 house
## 4 10 137 kryshana
## 5 10 137 boyfriend
## 6 10 137 helpful
# Tally how often each word occurs, most frequent first
word_freq <- count(tidy_comments, word, sort = TRUE)
# Show the ten most frequent words
head(word_freq, n = 10)
## word n
## 1 apartment 29644
## 2 stay 24391
## 3 location 18417
## 4 clean 16035
## 5 host 15536
## 6 nice 12952
## 7 de 12868
## 8 time 10801
## 9 comfortable 9983
## 10 subway 9564
# Horizontal bar chart of the ten most frequent words, ordered by count
ggplot(
  data = head(word_freq, n = 10),
  mapping = aes(x = reorder(word, n), y = n)
) +
  geom_col(fill = "blue") +
  coord_flip() +
  ggtitle("Top 10 Most Common Words in Comments") +
  xlab("Word") +
  ylab("Frequency")

# 5. Word Cloud Visualization
# Build the word / count table that feeds the word cloud
dtm_wordcloud <- as.data.frame(count(tidy_comments, word))
# Render the word cloud from that table
wordcloud2(data = dtm_wordcloud, size = 1.5)
# 6. Concluding Remarks
# The analysis provides insights into Airbnb reviews, highlighting sentiment scores, popular neighborhoods,
# and common words in comments. The word cloud visualization offers a concise representation of frequently
# occurring words.
# 7. References
# [1] Wickham, H., & Henry, L. (2019). Tidyr: Easily Tidy Data with 'spread()' and 'gather()' Functions.
# [2] Chang, W., Cheng, J., Allaire, J. J., Xie, Y., & McPherson, J. (2019). Shiny: Web Application Framework for R.
# [3] Robinson, D., & Silge, J. (2017). Tidytext: Text Mining and Analysis Using Tidy Data Principles.
# [4] Lang, D., & Chien, G. (2018). Wordcloud2: Create Word Cloud by 'htmlwidget'.