# Title: Exploratory Data Analysis on Airbnb Reviews
# Table of Contents
# 1. Executive Summary
# 2. Introduction
# 3. Data Loading and Preliminary Analysis
# 4. Exploratory Data Analysis
#     4.1 Text Length Distribution
#     4.2 Popular Neighborhoods Analysis
#     4.3 Sentiment Analysis
#     4.4 Word Frequency Analysis
# 5. Word Cloud Visualization
# 6. Concluding Remarks
# 7. References
# 1. Executive Summary
# This report provides an exploratory data analysis on Airbnb reviews, focusing on sentiment analysis
# and common words in the comments. The analysis includes text length distribution, popular neighborhoods,
# sentiment scores, and word frequency. Visualizations such as histograms, bar plots, and word clouds are used
# to present the findings.
# 2. Introduction
# Airbnb reviews contain valuable insights into customer sentiment and preferences. This analysis aims to uncover
# patterns in customer reviews, identify popular neighborhoods, and analyze sentiment scores.
# 3. Data Loading and Preliminary Analysis
# Load necessary libraries
library(stringr)
## Warning: package 'stringr' was built under R version 4.3.2
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.2
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.3.2
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(topicmodels)
## Warning: package 'topicmodels' was built under R version 4.3.2
library(wordcloud2)
## Warning: package 'wordcloud2' was built under R version 4.3.2
# Fetch the AFINN lexicon; the sentiment section joins review tokens against
# its `word` column and sums its `value` column.
sentiment_scores <- tidytext::get_sentiments(lexicon = "afinn")

# Load the Airbnb reviews dataset
airbnb_reviews <- read.csv("airbnb_reviews.csv")

# Check for duplicates in the "comments" column. duplicated() flags every
# repeat of an already-seen comment text, so the first occurrence of each text
# is kept. The mask is computed once and reused for the removal step below.
is_duplicate_comment <- duplicated(airbnb_reviews$comments)
duplicates <- airbnb_reviews[is_duplicate_comment, ]
cat("Number of duplicates:", nrow(duplicates), "\n")
## Number of duplicates: 4446
# Remove duplicates based on the "comments" column and carry the de-duplicated
# data forward. Previously the de-duplicated copy was created but never used,
# so every later step still ran on data containing the duplicates.
# NOTE: recorded `##` outputs elsewhere in this file were captured before this
# change; row counts and statistics will differ on re-run.
airbnb_reviews_unique <- airbnb_reviews[!is_duplicate_comment, ]
airbnb_reviews <- airbnb_reviews_unique

# Create a new column "comments_len" with the length of the comment text.
# str_length() counts every character; the former str_count(pattern = ".")
# skipped line breaks because regex "." does not match line terminators.
airbnb_reviews$comments_len <- str_length(airbnb_reviews$comments)

# Show summary statistics of the data
summary(airbnb_reviews)
##   listing_id            date             comments         neighbourhood     
##  Length:149905      Length:149905      Length:149905      Length:149905     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  property_type       accommodates      price           review_scores_value
##  Length:149905      Min.   : 1.00   Length:149905      Min.   : 2.000     
##  Class :character   1st Qu.: 2.00   Class :character   1st Qu.: 9.000     
##  Mode  :character   Median : 2.00   Mode  :character   Median : 9.000     
##                     Mean   : 3.07                      Mean   : 9.235     
##                     3rd Qu.: 4.00                      3rd Qu.:10.000     
##                     Max.   :16.00                      Max.   :10.000     
##                                                        NA's   :287        
##   comments_len   
##  Min.   :   0.0  
##  1st Qu.: 144.0  
##  Median : 261.0  
##  Mean   : 336.9  
##  3rd Qu.: 435.0  
##  Max.   :3109.0  
## 
# 4. Exploratory Data Analysis
# 4.1 Text Length Distribution
# Histogram of `comments_len`, using bins 10 units wide
ggplot(data = airbnb_reviews, mapping = aes(x = comments_len)) +
  geom_histogram(
    binwidth = 10,
    fill = "blue",
    color = "black",
    alpha = 0.7
  ) +
  ggtitle("Distribution of Text Lengths") +
  xlab("Comments Length") +
  ylab("Frequency")

# 4.2 Popular Neighborhoods Analysis
# Number of reviews per neighbourhood, most-reviewed first
popular_neighborhoods <- airbnb_reviews %>%
  group_by(neighbourhood) %>%
  tally(name = "total_reviews", sort = TRUE)

# Horizontal bar chart of review counts, bars ordered by count
ggplot(
  data = popular_neighborhoods,
  mapping = aes(x = reorder(neighbourhood, total_reviews), y = total_reviews)
) +
  geom_col(fill = "blue") +
  coord_flip() +
  ggtitle("Popular Neighborhoods by Total Reviews") +
  xlab("Neighborhood") +
  ylab("Total Reviews")

# Keep only reviews from the three most-reviewed neighbourhoods
top_neighborhoods <- popular_neighborhoods %>%
  slice_head(n = 3) %>%
  pull(neighbourhood)
airbnb_reviews_top <- filter(
  airbnb_reviews,
  neighbourhood %in% top_neighborhoods
)
# 4.3 Sentiment Analysis
# Tokenize the top-neighbourhood reviews, score each token against AFINN, and
# total the scores per (neighbourhood, listing).
#
# Stop words are deliberately NOT removed before scoring: the tidytext
# `stop_words` list overlaps with AFINN (e.g. "best", "like", "thanks",
# "welcome"), so filtering it first silently discards sentiment-bearing words.
# Tokens without an AFINN entry (including numbers) get NA from the left join
# and are ignored by sum(na.rm = TRUE), so no extra filtering is needed.
# The left join (rather than an inner join) keeps listings whose reviews
# contain no scored word; they receive a score of 0.
#
# NOTE(review): the score is a sum over all of a listing's reviews, so it grows
# with review volume; consider a per-word or per-review mean if listings are
# meant to be directly comparable.
tidy_reviews_sentiment <- airbnb_reviews_top %>%
  unnest_tokens(word, comments) %>%
  left_join(sentiment_scores, by = "word") %>%
  group_by(neighbourhood, listing_id) %>%
  summarize(sentiment_score = sum(value, na.rm = TRUE), .groups = "drop")
# Box plot of per-listing sentiment scores, one box per neighbourhood
ggplot(
  data = tidy_reviews_sentiment,
  mapping = aes(
    x = neighbourhood,
    y = sentiment_score,
    fill = neighbourhood
  )
) +
  geom_boxplot() +
  ggtitle("Sentiment Scores by Neighborhood") +
  xlab("Neighborhood") +
  ylab("Sentiment Score")

# 4.4 Word Frequency Analysis
# One row per token of each top-neighbourhood review, after dropping stop
# words and any token that contains a digit
tidy_comments <- airbnb_reviews_top %>%
  unnest_tokens(output = word, input = comments) %>%
  anti_join(stop_words) %>%
  filter(str_detect(word, "\\d+", negate = TRUE))
## Joining with `by = join_by(word)`
# Peek at the first rows of the tokenized data
head(tidy_comments)
##   listing_id       date neighbourhood property_type accommodates  price
## 1  x15194207 2016-10-04        Harlem     Apartment            4 $90.00
## 2  x15194207 2016-10-04        Harlem     Apartment            4 $90.00
## 3  x15194207 2016-10-04        Harlem     Apartment            4 $90.00
## 4  x15194207 2016-10-04        Harlem     Apartment            4 $90.00
## 5  x15194207 2016-10-04        Harlem     Apartment            4 $90.00
## 6  x15194207 2016-10-04        Harlem     Apartment            4 $90.00
##   review_scores_value comments_len      word
## 1                  10          137 welcoming
## 2                  10          137    pretty
## 3                  10          137     house
## 4                  10          137  kryshana
## 5                  10          137 boyfriend
## 6                  10          137   helpful
# Tally how often each word occurs, most frequent first
word_freq <- count(tidy_comments, word, sort = TRUE)

# Show the ten most frequent words
head(word_freq, n = 10)
##           word     n
## 1    apartment 29644
## 2         stay 24391
## 3     location 18417
## 4        clean 16035
## 5         host 15536
## 6         nice 12952
## 7           de 12868
## 8         time 10801
## 9  comfortable  9983
## 10      subway  9564
# Horizontal bar chart of the ten most frequent words, ordered by count
ggplot(
  data = head(word_freq, n = 10),
  mapping = aes(x = reorder(word, n), y = n)
) +
  geom_col(fill = "blue") +
  coord_flip() +
  ggtitle("Top 10 Most Common Words in Comments") +
  xlab("Word") +
  ylab("Frequency")

# 5. Word Cloud Visualization
# Per-word counts as a plain data frame (columns: word, n) to feed the cloud
dtm_wordcloud <- as.data.frame(count(tidy_comments, word))

# Render the word cloud from the word/count table
wordcloud2(data = dtm_wordcloud, size = 1.5)
# 6. Concluding Remarks
# The analysis provides insights into Airbnb reviews, highlighting sentiment scores, popular neighborhoods,
# and common words in comments. The word cloud visualization offers a concise representation of frequently
# occurring words.
# 7. References
# [1] Wickham, H., & Henry, L. (2019). Tidyr: Easily Tidy Data with 'spread()' and 'gather()' Functions.
# [2] Chang, W., Cheng, J., Allaire, J. J., Xie, Y., & McPherson, J. (2019). Shiny: Web Application Framework for R.
# [3] Robinson, D., & Silge, J. (2017). Tidytext: Text Mining and Analysis Using Tidy Data Principles.
# [4] Lang, D., & Chien, G. (2018). Wordcloud2: Create Word Cloud by 'htmlwidget'.