In [1]:
print('helloworld')
helloworld
In [2]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk import tokenize

import string
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

from wordcloud import WordCloud
import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score  # Added missing imports here
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix as cn

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB

from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.dummy import DummyClassifier
In [3]:
# Begin EDA
df = pd.read_csv ('ClothingReviews.csv')

print(df.shape)
df.info()
(23486, 8)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23486 entries, 0 to 23485
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Age                      23486 non-null  int64 
 1   Title                    19676 non-null  object
 2   Review Text              22641 non-null  object
 3   Rating                   23486 non-null  int64 
 4   Recommended IND          23486 non-null  int64 
 5   Positive Feedback Count  23486 non-null  int64 
 6   Department Name          23472 non-null  object
 7   Class Name               23472 non-null  object
dtypes: int64(4), object(4)
memory usage: 1.4+ MB
In [4]:
print(df.describe())
                Age        Rating  Recommended IND  Positive Feedback Count
count  23486.000000  23486.000000     23486.000000             23486.000000
mean      43.198544      4.196032         0.822362                 2.535936
std       12.279544      1.110031         0.382216                 5.702202
min       18.000000      1.000000         0.000000                 0.000000
25%       34.000000      4.000000         1.000000                 0.000000
50%       41.000000      5.000000         1.000000                 1.000000
75%       52.000000      5.000000         1.000000                 3.000000
max       99.000000      5.000000         1.000000               122.000000
In [5]:
df.head(3)
Out[5]:
Age Title Review Text Rating Recommended IND Positive Feedback Count Department Name Class Name
0 33 NaN Absolutely wonderful - silky and sexy and comf... 4 1 0 Intimate Intimates
1 34 NaN Love this dress! it's sooo pretty. i happene... 5 1 4 Dresses Dresses
2 60 Some major design flaws I had such high hopes for this dress and reall... 3 0 0 Dresses Dresses
In [6]:
def print_missing():
    cols = df.columns[df.isnull().any()].tolist()
    for c in cols:
        print(c, df[c].isnull().sum())

print_missing()
Title 3810
Review Text 845
Department Name 14
Class Name 14
In [8]:
df['Review Text'].value_counts()
Out[8]:
Perfect fit and i've gotten so many compliments. i buy all my suits from here now!                                                                                                                                                                                                                                                                                                                                                                                                                                        3
I purchased this and another eva franco dress during retailer's recent 20% off sale. i was looking for dresses that were work appropriate, but that would also transition well to happy hour or date night. they both seemed to be just what i was looking for. i ordered a 4 regular and a 6 regular, as i am usually in between sizes. the 4 was definitely too small. the 6 fit, technically, but was very ill fitting. not only is the dress itself short, but it is very short-waisted. i am only 5'3", but it fe    2
The sweater and skirt are so pretty! they're really soft and have such an easy, comfortable look together. really love this gorgeous outfit.\n\ni am borderline small/medium and kept the size small after trying both on.                                                                                                                                                                                                                                                                                                2
Love, love these jeans. being short they come right to my ankle. super soft and don?t require any hemming. i ordered my typical jean size of 26 and they fit like a glove. would love to have these in black and grey.                                                                                                                                                                                                                                                                                                    2
Lightweight, soft cotton top and shorts. i think it's meant to be a beach cover-up but i'm wearing it as a thin, light-weight summer outfit on these hot hot days. the top has a loose elastic around the bottom which i didn't realize when i ordered it, but i like it and it matches the look in the photos. and the shorts are very low-cut - don't expect them up around your waist. again, i like that. some might want to wear a cami underneath because it's a thin cotton but i'm fine as-is. i bought it i      2
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         ..
I love charlie pants. i've bought a few in the past, and i've never been disappointed with quality, style, or fit. i love the side zip on these and the nice flat front as a result. i wish more pants were side zip- so much more flattering. i got these in a 4, my usual size, and the fit is perfect. i also got these in the regular, rather than petite, but the length works just fine for my height. i bought these in the grey stripe, but like them so much i think i'll order another pair in the orange.      1
I'm a size 4 all day everyday. but, these pants wouldn't even meet around my hips. sadly, i'm returning them tomorrow.                                                                                                                                                                                                                                                                                                                                                                                                    1
These are beautiful pants, especially when you walk, they are simply beautiful and flow nicely. however, they are soooo very long (i am 5 5 1/2) and even with a big wedge's i would have to get these altered. the waist seemed very small, i am a sz 12 (10 in some stores) and i got the large and it was way too tight in the waist and have no stretch or give at all. . probably a sz 10 at most. so unfortunetly they have to go back.                                                                             1
I like the pattern and material on this top but the bubble hem isn't flattering on my figure. i would recommend this top if you are tiny and petite.                                                                                                                                                                                                                                                                                                                                                                      1
This dress in a lovely platinum is feminine and fits perfectly, easy to wear and comfy, too! highly recommend!                                                                                                                                                                                                                                                                                                                                                                                                            1
Name: Review Text, Length: 22634, dtype: int64
In [9]:
print(df.columns)
Index(['Age', 'Title', 'Review Text', 'Rating', 'Recommended IND',
       'Positive Feedback Count', 'Department Name', 'Class Name'],
      dtype='object')
In [16]:
# Cleaning
dups = df[df.duplicated()]
print('duplicates across all columns: ', dups.shape)

print('duplicates in review text', df['Review Text'].duplicated().any())
duplicates across all columns:  (0, 8)
duplicates in review text False
In [15]:
# Remove duplicates across all columns
df = df.drop_duplicates()

# Remove duplicates in the 'Review Text' column, keeping the first occurrence
df = df.drop_duplicates(subset='Review Text', keep='first')
In [17]:
print('duplicates in review text', df['Review Text'].duplicated().any())
duplicates in review text False
In [18]:
# Check if it worked 
print(df.shape)
df.info()
(22635, 8)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 22635 entries, 0 to 23485
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Age                      22635 non-null  int64 
 1   Title                    19669 non-null  object
 2   Review Text              22634 non-null  object
 3   Rating                   22635 non-null  int64 
 4   Recommended IND          22635 non-null  int64 
 5   Positive Feedback Count  22635 non-null  int64 
 6   Department Name          22622 non-null  object
 7   Class Name               22622 non-null  object
dtypes: int64(4), object(4)
memory usage: 1.6+ MB
In [26]:
#Clean Text

def clean_text(text):
    if pd.isnull(text):  # Checks if the text is a NaN value
        return ""  # Returns an empty string for NaN values
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = ''.join([i for i in text if not i.isdigit()])
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    clean_text = ' '.join(tokens)
    print("Cleaned text:", clean_text)
    return clean_text

# Next line works, it just returns absolutely everything so we're just
# gonna print df.head
# df['clean_text'] = df['Review Text'].apply(clean_text)

# Print the first few rows of the DataFrame to verify the changes
print(df.head())
   Age                    Title  \
0   33                      NaN   
1   34                      NaN   
2   60  Some major design flaws   
3   50         My favorite buy!   
4   47         Flattering shirt   

                                         Review Text  Rating  Recommended IND  \
0  Absolutely wonderful - silky and sexy and comf...       4                1   
1  Love this dress!  it's sooo pretty.  i happene...       5                1   
2  I had such high hopes for this dress and reall...       3                0   
3  I love, love, love this jumpsuit. it's fun, fl...       5                1   
4  This shirt is very flattering to all due to th...       5                1   

   Positive Feedback Count Department Name Class Name  \
0                        0        Intimate  Intimates   
1                        4         Dresses    Dresses   
2                        0         Dresses    Dresses   
3                        0         Bottoms      Pants   
4                        6            Tops    Blouses   

                                          clean_text  
0        absolutely wonderful silky sexy comfortable  
1  love dress sooo pretty happened find store im ...  
2  high hope dress really wanted work initially o...  
3  love love love jumpsuit fun flirty fabulous ev...  
4  shirt flattering due adjustable front tie perf...  
In [27]:
#Distribution of Ratings
plt.figure(figsize=(8, 6))
sns.countplot(x='Rating', data=df, palette='viridis')
plt.title('Distribution of Ratings')
plt.xlabel('Rating')
plt.ylabel('Count')
plt.show()
In [28]:
#Distribution of Review lengths
In [34]:
# Drop rows with missing values in the 'Review Text' column
df = df.dropna(subset=['clean_text'])

# Calculate review lengths
df['Review Length'] = df['clean_text'].apply(lambda x: len(x.split()))

# Plot distribution of review lengths
plt.figure(figsize=(10, 6))
sns.histplot(df['Review Length'], bins=30, kde=True, color='skyblue')
plt.title('Distribution of Review Lengths')
plt.xlabel('Review Length')
plt.ylabel('Count')
plt.show()
In [35]:
# Tokenize each review into words and calculate word lengths
word_lengths = df['clean_text'].apply(lambda x: [len(word) for word in x.split()])

# Flatten the list of word lengths
word_lengths_flat = [length for sublist in word_lengths for length in sublist]

# Plot distribution of word lengths
plt.figure(figsize=(10, 6))
sns.histplot(word_lengths_flat, bins=20, kde=True, color='skyblue')
plt.title('Distribution of Word Lengths')
plt.xlabel('Word Length')
plt.ylabel('Count')
plt.show()
In [41]:
# Combine all review texts into a single string
all_reviews = ' '.join(df['clean_text'].tolist())

# Generate word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white', colormap='viridis').generate(all_reviews)

# Plot word cloud
plt.figure(figsize=(12, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title('Word Cloud of Clothing Reviews')
plt.axis('off')
plt.show()
In [52]:
#Positive and negative word clouds

# Extract positive and negative reviews
positive_reviews = df[df['sentiment'] == 1]['clean_text'].tolist()
negative_reviews = df[df['sentiment'] == 0]['clean_text'].tolist()

# Combine positive and negative reviews into single strings
positive_text = ' '.join(positive_reviews)
negative_text = ' '.join(negative_reviews)

# Create word clouds for positive and negative reviews
positive_wordcloud = WordCloud(width=800, height=400, background_color='white', colormap='viridis').generate(positive_text)
negative_wordcloud = WordCloud(width=800, height=400, background_color='white', colormap='inferno').generate(negative_text)

# Plot word clouds
plt.figure(figsize=(15, 8))
plt.subplot(1, 2, 1)
plt.imshow(positive_wordcloud, interpolation='bilinear')
plt.title('Word Cloud of Positive Reviews')
plt.axis('off')

plt.subplot(1, 2, 2)
plt.imshow(negative_wordcloud, interpolation='bilinear')
plt.title('Word Cloud of Negative Reviews')
plt.axis('off')

plt.show()
In [53]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

# Function to calculate polarity score using VADER
def calculate_polarity_score(text):
    # Initialize variables to store compound scores
    compound_score = 0
    
    # Tokenize the text into sentences
    sentences = tokenize.sent_tokenize(text)
    
    # Calculate compound score for each sentence and aggregate
    for sentence in sentences:
        vs = analyzer.polarity_scores(sentence)
        compound_score += vs['compound']
    
    # Calculate average compound score
    avg_compound_score = compound_score / len(sentences)
    
    return avg_compound_score

# Apply sentiment analysis and calculate polarity score for each review
df['polarity_score'] = df['clean_text'].apply(calculate_polarity_score)

# Function to map polarity score to sentiment
def map_sentiment(score):
    if score >= 0.05:
        return 1  # Positive
    elif score <= -0.05:
        return 0  # Negative
    else:
        return -1  # Neutral

# Apply mapping to create binary sentiment column
df['sentiment'] = df['polarity_score'].apply(map_sentiment)

# Display the updated DataFrame with polarity score and sentiment columns
print(df[['clean_text', 'polarity_score', 'sentiment']].head())
                                          clean_text  polarity_score  \
0        absolutely wonderful silky sexy comfortable          0.8991   
1  love dress sooo pretty happened find store im ...          0.9710   
2  high hope dress really wanted work initially o...          0.9081   
3  love love love jumpsuit fun flirty fabulous ev...          0.9437   
4  shirt flattering due adjustable front tie perf...          0.9062   

   sentiment  
0          1  
1          1  
2          1  
3          1  
4          1  
In [40]:
# Distribution of Polarity Score
plt.figure(figsize=(8, 6))
sns.histplot(df['polarity_score'], bins=20, kde=True, color='skyblue')
plt.title('Distribution of Polarity Score')
plt.xlabel('Polarity Score')
plt.ylabel('Count')
plt.show()

# Sentiment Distribution
plt.figure(figsize=(8, 6))
sns.countplot(x='sentiment', data=df, palette='viridis')
plt.title('Sentiment Distribution')
plt.xlabel('Sentiment')
plt.ylabel('Count')
plt.show()
In [46]:
# Ensure the NLTK pos_tag function is imported correctly
from nltk import pos_tag
# Attempt tagging
pos_tagged_corpus = [pos_tag(word_tokenize(review)) for review in df['clean_text']]
In [47]:
# Perform Part-of-Speech Tagging
pos_tagged_corpus = [pos_tag(word_tokenize(review)) for review in df['clean_text']]

# Show Word Counts for Different Parts of Speech
# Initialize dictionaries to store word counts for different parts of speech
noun_counts = {}
verb_counts = {}
adj_counts = {}
adv_counts = {}

# Loop through each review and count occurrences of different parts of speech
for pos_tags_review in pos_tagged_corpus:
    for word, pos_tag in pos_tags_review:
        if pos_tag.startswith('N'):  # Nouns
            noun_counts[word] = noun_counts.get(word, 0) + 1
        elif pos_tag.startswith('V'):  # Verbs
            verb_counts[word] = verb_counts.get(word, 0) + 1
        elif pos_tag.startswith('J'):  # Adjectives
            adj_counts[word] = adj_counts.get(word, 0) + 1
        elif pos_tag.startswith('R'):  # Adverbs
            adv_counts[word] = adv_counts.get(word, 0) + 1

# Sort dictionaries by counts in descending order
sorted_noun_counts = sorted(noun_counts.items(), key=lambda x: x[1], reverse=True)
sorted_verb_counts = sorted(verb_counts.items(), key=lambda x: x[1], reverse=True)
sorted_adj_counts = sorted(adj_counts.items(), key=lambda x: x[1], reverse=True)
sorted_adv_counts = sorted(adv_counts.items(), key=lambda x: x[1], reverse=True)

# Print top 10 words for each part of speech
print("Top 10 Nouns:")
print(sorted_noun_counts[:10])
print("\nTop 10 Verbs:")
print(sorted_verb_counts[:10])
print("\nTop 10 Adjectives:")
print(sorted_adj_counts[:10])
print("\nTop 10 Adverbs:")
print(sorted_adv_counts[:10])

# Identify Popular Products
# Identify nouns that can be used to tag products
product_nouns = ['dress', 'jacket', 'bottom', 'top', 'shirt', 'pants', 'skirt', 'blouse', 'coat', 'shoes']

# Filter noun counts for popular products
popular_products = {noun: count for noun, count in noun_counts.items() if noun in product_nouns}

# Sort popular products by counts in descending order
sorted_popular_products = sorted(popular_products.items(), key=lambda x: x[1], reverse=True)

# Print top popular products
print("\nTop Popular Products:")
print(sorted_popular_products[:10])

# Identify Top Adjectives and Adverbs for Positive vs Negative Reviews
# Extract adjectives and adverbs from positive and negative reviews separately
positive_adjectives = {}
negative_adjectives = {}
positive_adverbs = {}
negative_adverbs = {}

# Loop through each review and extract adjectives and adverbs for positive and negative sentiments
for idx, (pos_tags_review, sentiment) in enumerate(zip(pos_tagged_corpus, df['sentiment'])):
    for word, pos_tag in pos_tags_review:
        if pos_tag.startswith('J'):  # Adjectives
            if sentiment == 1:  # Positive sentiment
                positive_adjectives[word] = positive_adjectives.get(word, 0) + 1
            elif sentiment == 0:  # Negative sentiment
                negative_adjectives[word] = negative_adjectives.get(word, 0) + 1
        elif pos_tag.startswith('R'):  # Adverbs
            if sentiment == 1:  # Positive sentiment
                positive_adverbs[word] = positive_adverbs.get(word, 0) + 1
            elif sentiment == 0:  # Negative sentiment
                negative_adverbs[word] = negative_adverbs.get(word, 0) + 1

# Sort adjectives and adverbs by counts in descending order
sorted_positive_adjectives = sorted(positive_adjectives.items(), key=lambda x: x[1], reverse=True)
sorted_negative_adjectives = sorted(negative_adjectives.items(), key=lambda x: x[1], reverse=True)
sorted_positive_adverbs = sorted(positive_adverbs.items(), key=lambda x: x[1], reverse=True)
sorted_negative_adverbs = sorted(negative_adverbs.items(), key=lambda x: x[1], reverse=True)

# Print top 10 adjectives and adverbs for positive and negative reviews
print("\nTop 10 Adjectives for Positive Reviews:")
print(sorted_positive_adjectives[:10])
print("\nTop 10 Adjectives for Negative Reviews:")
print(sorted_negative_adjectives[:10])
print("\nTop 10 Adverbs for Positive Reviews:")
print(sorted_positive_adverbs[:10])
print("\nTop 10 Adverbs for Negative Reviews:")
print(sorted_negative_adverbs[:10])
Top 10 Nouns:
[('dress', 9774), ('size', 9352), ('color', 6739), ('fit', 5297), ('look', 4829), ('im', 4145), ('love', 3829), ('fabric', 2498), ('shirt', 2411), ('length', 2314)]

Top 10 Verbs:
[('love', 4375), ('ordered', 3406), ('flattering', 2748), ('bought', 2727), ('got', 2388), ('run', 2025), ('look', 1890), ('made', 1823), ('make', 1792), ('go', 1717)]

Top 10 Adjectives:
[('top', 6356), ('great', 6075), ('small', 4572), ('fit', 3751), ('little', 3263), ('soft', 3237), ('wear', 2896), ('comfortable', 2885), ('large', 2786), ('nice', 2742)]

Top 10 Adverbs:
[('really', 3920), ('back', 3116), ('well', 3081), ('also', 2574), ('even', 2148), ('usually', 2005), ('pretty', 1750), ('still', 1750), ('long', 1518), ('however', 1478)]

Top Popular Products:
[('dress', 9774), ('shirt', 2411), ('skirt', 1973), ('top', 1810), ('jacket', 1270), ('bottom', 842), ('blouse', 725), ('coat', 487)]

Top 10 Adjectives for Positive Reviews:
[('top', 6229), ('great', 6033), ('small', 4402), ('fit', 3691), ('little', 3184), ('soft', 3169), ('comfortable', 2863), ('wear', 2826), ('nice', 2702), ('large', 2660)]

Top 10 Adjectives for Negative Reviews:
[('small', 121), ('top', 110), ('large', 88), ('fabric', 80), ('short', 76), ('disappointed', 70), ('big', 70), ('thin', 58), ('little', 54), ('bad', 54)]

Top 10 Adverbs for Positive Reviews:
[('really', 3796), ('well', 3024), ('back', 2931), ('also', 2449), ('even', 2030), ('usually', 1952), ('pretty', 1721), ('still', 1696), ('long', 1469), ('however', 1421)]

Top 10 Adverbs for Negative Reviews:
[('back', 162), ('also', 107), ('really', 96), ('even', 92), ('unfortunately', 58), ('however', 44), ('well', 43), ('usually', 43), ('long', 37), ('sadly', 36)]
In [97]:
# Initialize dictionaries to store word counts for different parts of speech
pos_counts = {'Noun': 0, 'Verb': 0, 'Adjective': 0, 'Adverb': 0, 'Other': 0}

# Loop through each review and count occurrences of different parts of speech
for pos_tags_review in pos_tagged_corpus:
    for word, pos_tag in pos_tags_review:
        if pos_tag.startswith('N'):  # Nouns
            pos_counts['Noun'] += 1
        elif pos_tag.startswith('V'):  # Verbs
            pos_counts['Verb'] += 1
        elif pos_tag.startswith('J'):  # Adjectives
            pos_counts['Adjective'] += 1
        elif pos_tag.startswith('R'):  # Adverbs
            pos_counts['Adverb'] += 1
        else:
            pos_counts['Other'] += 1

# Plotting the counts for each part of speech
plt.figure(figsize=(10, 6))
plt.bar(pos_counts.keys(), pos_counts.values(), color='skyblue')
plt.title('Word Counts by Part of Speech')
plt.xlabel('Part of Speech')
plt.ylabel('Word Count')
plt.show()
In [49]:
# Visualize the popularity of products
def visualize_popular_products(products, counts, title):
    plt.figure(figsize=(10, 6))
    plt.bar(products, counts, color='skyblue')
    plt.title(title)
    plt.xlabel('Products')
    plt.ylabel('Counts')
    plt.xticks(rotation=45, ha='right')
    plt.show()

products, product_counts = zip(*sorted_popular_products[:10])
visualize_popular_products(products, product_counts, 'Top Popular Products')
In [94]:
import matplotlib.pyplot as plt

# Function to visualize word counts for different parts of speech
def visualize_word_counts(word_counts, title):
    parts_of_speech, counts = zip(*word_counts)
    plt.figure(figsize=(10, 6))
    plt.bar(parts_of_speech, counts, color='skyblue')
    plt.title(title)
    plt.xlabel('Part of Speech')
    plt.ylabel('Count')
    plt.show()

# Visualize the word counts for nouns
visualize_word_counts(sorted_noun_counts[:10], 'Top 10 Nouns')

# Visualize the word counts for verbs
visualize_word_counts(sorted_verb_counts[:10], 'Top 10 Verbs')

# Visualize the word counts for adjectives
visualize_word_counts(sorted_adj_counts[:10], 'Top 10 Adjectives')

# Visualize the word counts for adverbs
visualize_word_counts(sorted_adv_counts[:10], 'Top 10 Adverbs')
In [51]:
# Visualize the top adjectives and adverbs for positive and negative reviews
def visualize_top_words(word_counts, title):
    words, counts = zip(*word_counts)
    plt.figure(figsize=(10, 6))
    plt.bar(words, counts, color='skyblue')
    plt.title(title)
    plt.xlabel('Words')
    plt.ylabel('Counts')
    plt.xticks(rotation=45, ha='right')
    plt.show()

visualize_top_words(sorted_positive_adjectives[:10], 'Top 10 Adjectives for Positive Reviews')
visualize_top_words(sorted_negative_adjectives[:10], 'Top 10 Adjectives for Negative Reviews')
visualize_top_words(sorted_positive_adverbs[:10], 'Top 10 Adverbs for Positive Reviews')
visualize_top_words(sorted_negative_adverbs[:10], 'Top 10 Adverbs for Negative Reviews')
In [72]:
# Data Preprocessing
# Drop any missing values in the relevant columns
df = df.dropna(subset=['clean_text', 'Review Length', 'polarity_score'])
In [55]:
# Encode the target variable to binary format
df['Recommended IND'] = df['Recommended IND'].apply(lambda x: 1 if x == 1 else 0)
In [56]:
# Feature Selection
X = df[['clean_text', 'Review Length', 'polarity_score']]
y = df['Recommended IND']
In [57]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
In [58]:
# Define the pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('model', RandomForestClassifier(random_state=42))
])
In [59]:
# Parameter grid for grid search
param_grid = {
    'tfidf__max_features': [1000, 2000, 3000],
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [None, 10, 20, 30]
}
In [60]:
# Grid search with cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train['clean_text'], y_train)
Out[60]:
GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                       ('model',
                                        RandomForestClassifier(random_state=42))]),
             n_jobs=-1,
             param_grid={'model__max_depth': [None, 10, 20, 30],
                         'model__n_estimators': [100, 200, 300],
                         'tfidf__max_features': [1000, 2000, 3000]},
             scoring='accuracy')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                       ('model',
                                        RandomForestClassifier(random_state=42))]),
             n_jobs=-1,
             param_grid={'model__max_depth': [None, 10, 20, 30],
                         'model__n_estimators': [100, 200, 300],
                         'tfidf__max_features': [1000, 2000, 3000]},
             scoring='accuracy')
Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('model', RandomForestClassifier(random_state=42))])
TfidfVectorizer()
RandomForestClassifier(random_state=42)
In [61]:
# Best parameters
print("Best Parameters:", grid_search.best_params_)
Best Parameters: {'model__max_depth': None, 'model__n_estimators': 300, 'tfidf__max_features': 1000}
In [64]:
# Best model
best_model = grid_search.best_estimator_
In [63]:
# Model evaluation
y_pred = best_model.predict(X_test['clean_text'])
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("Confusion Matrix:\n", cn(y_test, y_pred))
Accuracy: 0.8661365142478462
Precision: 0.8666031443544545
Recall: 0.987513572204126
F1 Score: 0.9231159604161382
Confusion Matrix:
 [[ 283  560]
 [  46 3638]]
In [116]:
from sklearn.metrics import classification_report
from sklearn.metrics.plot import ConfusionMatrixDisplay

# Plot confusion matrix
plt.figure(figsize=(8, 6))
ConfusionMatrixDisplay.from_estimator(best_model, X_test['clean_text'], y_test, cmap=plt.cm.Blues, normalize='true')
plt.title('Confusion Matrix for Text Classification')
plt.show()

# Classification Report Visualization
y_pred = best_model.predict(X_test['clean_text'])
report = classification_report(y_test, y_pred, output_dict=True)
df_report = pd.DataFrame(report).transpose()

plt.figure(figsize=(10, 6))
sns.heatmap(df_report.iloc[:-1, :3], annot=True, cmap='Blues', fmt=".2f")
plt.title('Classification Report for Text Classification')
plt.xlabel('Metrics')
plt.ylabel('Classes')
plt.show()
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[116], line 2
      1 from sklearn.metrics import classification_report
----> 2 from sklearn.metrics.plot import ConfusionMatrixDisplay
      4 # Plot confusion matrix
      5 plt.figure(figsize=(8, 6))

ModuleNotFoundError: No module named 'sklearn.metrics.plot'
In [68]:
# Sample a subset of the data
df_sample = df.sample(frac=0.5, random_state=42)

# Feature Selection for Text Classification Model
X_text_classification = df_sample['clean_text']
y_text_classification = df_sample['Department Name']

# Train-Test Split for Text Classification Model
X_train_text, X_test_text, y_train_text, y_test_text = train_test_split(X_text_classification, y_text_classification, test_size=0.2, random_state=42)

# Define the pipeline for text classification with simplified parameters
pipeline_text_classification = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('model', LogisticRegression(random_state=42))
])

# Parameter grid for grid search with simplified parameters
param_grid_text_classification = {
    'tfidf__max_features': [1000],
    'model__C': [1.0, 10.0]
}

# Grid search with cross-validation for text classification with simplified parameters
grid_search_text_classification = GridSearchCV(pipeline_text_classification, param_grid_text_classification, cv=3, scoring='accuracy', n_jobs=-1)
grid_search_text_classification.fit(X_train_text, y_train_text)

# Best parameters for text classification
print("Best Parameters for Text Classification:", grid_search_text_classification.best_params_)

# Best model for text classification
best_model_text_classification = grid_search_text_classification.best_estimator_

# Model evaluation for text classification
y_pred_text = best_model_text_classification.predict(X_test_text)
print("Accuracy for Text Classification:", accuracy_score(y_test_text, y_pred_text))
print("Classification Report:\n", classification_report(y_test_text, y_pred_text))
print("Confusion Matrix for Text Classification:\n", cn(y_test_text, y_pred_text))
Best Parameters for Text Classification: {'model__C': 1.0, 'tfidf__max_features': 1000}
Accuracy for Text Classification: 0.8116710875331565
Classification Report:
               precision    recall  f1-score   support

     Bottoms       0.79      0.74      0.76       356
     Dresses       0.89      0.89      0.89       653
    Intimate       0.83      0.23      0.36       169
     Jackets       0.66      0.41      0.51        99
        Tops       0.78      0.93      0.85       978
       Trend       0.00      0.00      0.00         7

    accuracy                           0.81      2262
   macro avg       0.66      0.53      0.56      2262
weighted avg       0.81      0.81      0.79      2262

Confusion Matrix for Text Classification:
 [[263  20   2   2  69   0]
 [ 14 579   2   1  57   0]
 [ 22  19  39   5  84   0]
 [ 10   7   0  41  41   0]
 [ 25  24   4  11 914   0]
 [  1   3   0   2   1   0]]
In [ ]:
 
In [112]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(y_true, y_pred, classes, normalize=False, cmap=plt.cm.Reds):
    cm = confusion_matrix(y_true, y_pred)
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='.2f', cmap=cmap, xticklabels=classes, yticklabels=classes)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title('Confusion Matrix')
    plt.show()

# Plot confusion matrix
plot_confusion_matrix(y_test_text, y_pred_text, classes=np.unique(y_test_text), normalize=True)


# Bottoms are predicted accurately 74% of the time
# Dresses are predicted accurately 89% of the time
# Intimates are predicted accurately 50% of the time
    # 23% of intimates are being misclassified as jackets
# Jackets are predicted correctly 41% of the time
# Tops are predicted correctly 93% of the time
# Trend is often confused with bottoms (43% of the time)
    # with a correct prediction rate of 14%
    # Trend was misclassified 100% of the time
In [66]:
# Check the data type of the target variable and ensure it contains only string values
print("Data type of y_text_classification:", y_text_classification.dtype)

# Check for missing values in the target variable
print("Missing values in y_text_classification:", y_text_classification.isnull().sum())

# If there are any missing values, drop them from the dataset
df.dropna(subset=['Department Name'], inplace=True)

# Convert the target variable to string data type
y_text_classification = df['Department Name'].astype(str)

# Rerun the train-test split and the grid search for text classification
Data type of y_text_classification: object
Missing values in y_text_classification: 13
In [70]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Plot confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix for Text Classification')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()
In [113]:
# Instantiate TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the text data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['clean_text'])

# Instantiate Naive Bayes model
naive_bayes = MultinomialNB()

# Fit the Naive Bayes model
naive_bayes.fit(X_train_tfidf, y_train)

# Transform the test data
X_test_tfidf = tfidf_vectorizer.transform(X_test['clean_text'])

# Predict with Naive Bayes model
y_pred_naive_bayes = naive_bayes.predict(X_test_tfidf)

# Compute confusion matrix for Naive Bayes
cm_naive_bayes = confusion_matrix(y_test, y_pred_naive_bayes)

# Plot confusion matrix for Naive Bayes
plt.figure(figsize=(8, 6))
sns.heatmap(cm_naive_bayes, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix for Naive Bayes')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()
In [89]:
from sklearn.ensemble import RandomForestClassifier

# Instantiate Random Forest model
random_forest = RandomForestClassifier(random_state=42)

# Fit and predict with Random Forest model
random_forest.fit(X_train_tfidf, y_train)
y_pred_random_forest = random_forest.predict(X_test_tfidf)

# Compute confusion matrix for Random Forest
cm_random_forest = confusion_matrix(y_test, y_pred_random_forest)

# Plot confusion matrix for Random Forest
plt.figure(figsize=(8, 6))
sns.heatmap(cm_random_forest, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix for Random Forest')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()
In [79]:
 
In [90]:
# Calculate evaluation metrics for Random Forest model
accuracy_rf = accuracy_score(y_test, y_pred_random_forest)
precision_rf = precision_score(y_test, y_pred_random_forest)
recall_rf = recall_score(y_test, y_pred_random_forest)
f1_rf = f1_score(y_test, y_pred_random_forest)

# Calculate evaluation metrics for Naive Bayes model
accuracy_nb = accuracy_score(y_test, y_pred_naive_bayes)
precision_nb = precision_score(y_test, y_pred_naive_bayes)
recall_nb = recall_score(y_test, y_pred_naive_bayes)
f1_nb = f1_score(y_test, y_pred_naive_bayes)

# Confusion matrix for Random Forest
cm_rf = cn(y_test, y_pred_random_forest)

# Confusion matrix for Naive Bayes
cm_nb = cn(y_test, y_pred_naive_bayes)

# Conclusion
print("Based on the evaluation metrics obtained:")
print("Random Forest Model:")
print(f"- Accuracy: {accuracy_rf:.2f}")
print(f"- Precision: {precision_rf:.2f}")
print(f"- Recall: {recall_rf:.2f}")
print(f"- F1 Score: {f1_rf:.2f}")
print("\nNaive Bayes Model:")
print(f"- Accuracy: {accuracy_nb:.2f}")
print(f"- Precision: {precision_nb:.2f}")
print(f"- Recall: {recall_nb:.2f}")
print(f"- F1 Score: {f1_nb:.2f}")

# Print confusion matrices
print("\nConfusion Matrix for Random Forest:")
print(cm_rf)
print("\nConfusion Matrix for Naive Bayes:")
print(cm_nb)
Based on the evaluation metrics obtained:
Random Forest Model:
- Accuracy: 0.85
- Precision: 0.85
- Recall: 0.99
- F1 Score: 0.91

Naive Bayes Model:
- Accuracy: 0.82
- Precision: 0.82
- Recall: 1.00
- F1 Score: 0.90

Confusion Matrix for Random Forest:
[[ 171  672]
 [  20 3664]]

Confusion Matrix for Naive Bayes:
[[  19  824]
 [   0 3684]]
In [91]:
# Conclusion

# During EDA and Preprocessing, we cleaned the text,
# did sentiment analysis with VADER, part of speech tagging,
# and visualized the word distribution. 
# Because of this we were able to implement the classification models.

# Both Machine Learning models performed well with regards to accuracy, 
# precision, recall and F1 sscore. 
# The Random Forest had a higher accuracy and F1 score though, 
# Which means it has a better predictive capability. 

# While both models did well with regards to performance, 
# the Random Forest model was the best overall,
# because it offered a balance between precision and recall
# and the Naive Bayes did well in recall, but not precision. 
In [93]:
# References
# https://medium.com/analytics-vidhya/evaluating-a-random-forest-model-9d165595ad56
# https://medium.com/@dtuk81/confusion-matrix-visualization-fc31e3f30fea
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
# https://stackoverflow.com/questions/67979512/how-to-find-nlp-words-count-and-plot-it
# https://www.datacamp.com/tutorial/wordcloud-python
# https://www.datacamp.com/tutorial/how-to-make-a-seaborn-histogram
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.drop_duplicates.html
# https://www.geeksforgeeks.org/generating-word-cloud-python/
# https://medium.com/product-ai/text-preprocessing-in-python-steps-tools-and-examples-bf025f872908
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [105]:
 
In [ ]:
 
In [ ]: