# Sanity check that the notebook kernel is alive.
greeting = 'helloworld'
print(greeting)
helloworld
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk import tokenize
import string
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score # Added missing imports here
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix as cn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.dummy import DummyClassifier
# Begin EDA
# Load the clothing-reviews dataset from the working directory.
df = pd.read_csv ('ClothingReviews.csv')
# Quick shape/schema check (23486 rows x 8 columns per the captured output).
print(df.shape)
df.info()
(23486, 8) <class 'pandas.core.frame.DataFrame'> RangeIndex: 23486 entries, 0 to 23485 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Age 23486 non-null int64 1 Title 19676 non-null object 2 Review Text 22641 non-null object 3 Rating 23486 non-null int64 4 Recommended IND 23486 non-null int64 5 Positive Feedback Count 23486 non-null int64 6 Department Name 23472 non-null object 7 Class Name 23472 non-null object dtypes: int64(4), object(4) memory usage: 1.4+ MB
# Summary statistics for the numeric columns (Age, Rating, Recommended IND, Positive Feedback Count).
print(df.describe())
Age Rating Recommended IND Positive Feedback Count count 23486.000000 23486.000000 23486.000000 23486.000000 mean 43.198544 4.196032 0.822362 2.535936 std 12.279544 1.110031 0.382216 5.702202 min 18.000000 1.000000 0.000000 0.000000 25% 34.000000 4.000000 1.000000 0.000000 50% 41.000000 5.000000 1.000000 1.000000 75% 52.000000 5.000000 1.000000 3.000000 max 99.000000 5.000000 1.000000 122.000000
# Peek at the first three rows.
df.head(3)
| Age | Title | Review Text | Rating | Recommended IND | Positive Feedback Count | Department Name | Class Name | |
|---|---|---|---|---|---|---|---|---|
| 0 | 33 | NaN | Absolutely wonderful - silky and sexy and comf... | 4 | 1 | 0 | Intimate | Intimates |
| 1 | 34 | NaN | Love this dress! it's sooo pretty. i happene... | 5 | 1 | 4 | Dresses | Dresses |
| 2 | 60 | Some major design flaws | I had such high hopes for this dress and reall... | 3 | 0 | 0 | Dresses | Dresses |
def print_missing(frame=None):
    """Print the missing-value count for each column that has any nulls.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame to inspect; defaults to the module-level ``df`` so the
        original zero-argument call still works.

    Returns
    -------
    dict
        {column name: null count}. Returning the counts (previously the
        function returned None) lets callers reuse them; existing callers
        that ignore the return value are unaffected.
    """
    if frame is None:
        frame = df  # fall back to the notebook's global DataFrame
    counts = {col: int(frame[col].isnull().sum())
              for col in frame.columns[frame.isnull().any()]}
    for col, n in counts.items():
        print(col, n)
    return counts
print_missing()
Title 3810 Review Text 845 Department Name 14 Class Name 14
# Frequency of each distinct review text — helps spot repeated reviews.
df['Review Text'].value_counts()
Perfect fit and i've gotten so many compliments. i buy all my suits from here now! 3
I purchased this and another eva franco dress during retailer's recent 20% off sale. i was looking for dresses that were work appropriate, but that would also transition well to happy hour or date night. they both seemed to be just what i was looking for. i ordered a 4 regular and a 6 regular, as i am usually in between sizes. the 4 was definitely too small. the 6 fit, technically, but was very ill fitting. not only is the dress itself short, but it is very short-waisted. i am only 5'3", but it fe 2
The sweater and skirt are so pretty! they're really soft and have such an easy, comfortable look together. really love this gorgeous outfit.\n\ni am borderline small/medium and kept the size small after trying both on. 2
Love, love these jeans. being short they come right to my ankle. super soft and don?t require any hemming. i ordered my typical jean size of 26 and they fit like a glove. would love to have these in black and grey. 2
Lightweight, soft cotton top and shorts. i think it's meant to be a beach cover-up but i'm wearing it as a thin, light-weight summer outfit on these hot hot days. the top has a loose elastic around the bottom which i didn't realize when i ordered it, but i like it and it matches the look in the photos. and the shorts are very low-cut - don't expect them up around your waist. again, i like that. some might want to wear a cami underneath because it's a thin cotton but i'm fine as-is. i bought it i 2
..
I love charlie pants. i've bought a few in the past, and i've never been disappointed with quality, style, or fit. i love the side zip on these and the nice flat front as a result. i wish more pants were side zip- so much more flattering. i got these in a 4, my usual size, and the fit is perfect. i also got these in the regular, rather than petite, but the length works just fine for my height. i bought these in the grey stripe, but like them so much i think i'll order another pair in the orange. 1
I'm a size 4 all day everyday. but, these pants wouldn't even meet around my hips. sadly, i'm returning them tomorrow. 1
These are beautiful pants, especially when you walk, they are simply beautiful and flow nicely. however, they are soooo very long (i am 5 5 1/2) and even with a big wedge's i would have to get these altered. the waist seemed very small, i am a sz 12 (10 in some stores) and i got the large and it was way too tight in the waist and have no stretch or give at all. . probably a sz 10 at most. so unfortunetly they have to go back. 1
I like the pattern and material on this top but the bubble hem isn't flattering on my figure. i would recommend this top if you are tiny and petite. 1
This dress in a lovely platinum is feminine and fits perfectly, easy to wear and comfy, too! highly recommend! 1
Name: Review Text, Length: 22634, dtype: int64
# List the column names.
print(df.columns)
Index(['Age', 'Title', 'Review Text', 'Rating', 'Recommended IND',
'Positive Feedback Count', 'Department Name', 'Class Name'],
dtype='object')
# Cleaning
# Rows that are duplicated across every column.
dups = df[df.duplicated()]
print('duplicates across all columns: ', dups.shape)
# Whether any review text repeats, independent of the other columns.
print('duplicates in review text', df['Review Text'].duplicated().any())
duplicates across all columns: (0, 8) duplicates in review text False
# Drop full-row duplicates, then duplicated review texts (first kept),
# and confirm no repeated review text remains.
df = df.drop_duplicates().drop_duplicates(subset='Review Text', keep='first')
print('duplicates in review text', df['Review Text'].duplicated().any())
duplicates in review text False
# Check if it worked: the row count should have dropped after de-duplication.
print(df.shape)
df.info()
(22635, 8) <class 'pandas.core.frame.DataFrame'> Int64Index: 22635 entries, 0 to 23485 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Age 22635 non-null int64 1 Title 19669 non-null object 2 Review Text 22634 non-null object 3 Rating 22635 non-null int64 4 Recommended IND 22635 non-null int64 5 Positive Feedback Count 22635 non-null int64 6 Department Name 22622 non-null object 7 Class Name 22622 non-null object dtypes: int64(4), object(4) memory usage: 1.6+ MB
#Clean Text
def clean_text(text):
    """Normalize one review string for modelling.

    Lowercases, strips punctuation and digits, removes English stopwords,
    and lemmatizes the remaining tokens. A NaN (missing review) becomes ''.

    Fix: the per-row debug ``print`` was removed — it flooded the output and
    is why the author could not run ``df['Review Text'].apply(clean_text)``
    (see the comment below this function). Also renamed the local result
    variable, which shadowed the function's own name.
    """
    if pd.isnull(text):  # missing review -> empty string
        return ""
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = ''.join(ch for ch in text if not ch.isdigit())
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(tok)
              for tok in word_tokenize(text)
              if tok not in stop_words]
    return ' '.join(tokens)
# Next line works, it just returns absolutely everything so we're just
# gonna print df.head
# NOTE(review): later cells read df['clean_text']; with the next line left
# commented out that column is never created in this file — confirm it is
# built somewhere before those cells run.
# df['clean_text'] = df['Review Text'].apply(clean_text)
# Print the first few rows of the DataFrame to verify the changes
print(df.head())
Age Title \
0 33 NaN
1 34 NaN
2 60 Some major design flaws
3 50 My favorite buy!
4 47 Flattering shirt
Review Text Rating Recommended IND \
0 Absolutely wonderful - silky and sexy and comf... 4 1
1 Love this dress! it's sooo pretty. i happene... 5 1
2 I had such high hopes for this dress and reall... 3 0
3 I love, love, love this jumpsuit. it's fun, fl... 5 1
4 This shirt is very flattering to all due to th... 5 1
Positive Feedback Count Department Name Class Name \
0 0 Intimate Intimates
1 4 Dresses Dresses
2 0 Dresses Dresses
3 0 Bottoms Pants
4 6 Tops Blouses
clean_text
0 absolutely wonderful silky sexy comfortable
1 love dress sooo pretty happened find store im ...
2 high hope dress really wanted work initially o...
3 love love love jumpsuit fun flirty fabulous ev...
4 shirt flattering due adjustable front tie perf...
#Distribution of Ratings
# Bar chart of how many reviews carry each 1-5 star rating.
plt.figure(figsize=(8, 6))
sns.countplot(x='Rating', data=df, palette='viridis')
plt.title('Distribution of Ratings')
plt.xlabel('Rating')
plt.ylabel('Count')
plt.show()
#Distribution of Review lengths
# Fix: the cell that built df['clean_text'] is commented out earlier in the
# file, so dropna(subset=['clean_text']) would raise a KeyError; guard-create
# the column here if it is missing.
if 'clean_text' not in df.columns:
    df['clean_text'] = df['Review Text'].apply(clean_text)
# Drop rows with missing values in the 'clean_text' column
df = df.dropna(subset=['clean_text'])
# Calculate review lengths (number of whitespace-separated tokens)
df['Review Length'] = df['clean_text'].apply(lambda x: len(x.split()))
# Plot distribution of review lengths
plt.figure(figsize=(10, 6))
sns.histplot(df['Review Length'], bins=30, kde=True, color='skyblue')
plt.title('Distribution of Review Lengths')
plt.xlabel('Review Length')
plt.ylabel('Count')
plt.show()
# Tokenize each review into words and calculate word lengths
# (character count per token in the cleaned text).
word_lengths = df['clean_text'].apply(lambda x: [len(word) for word in x.split()])
# Flatten the list of word lengths
word_lengths_flat = [length for sublist in word_lengths for length in sublist]
# Plot distribution of word lengths
plt.figure(figsize=(10, 6))
sns.histplot(word_lengths_flat, bins=20, kde=True, color='skyblue')
plt.title('Distribution of Word Lengths')
plt.xlabel('Word Length')
plt.ylabel('Count')
plt.show()
# Combine all review texts into a single string
all_reviews = ' '.join(df['clean_text'].tolist())
# Generate word cloud over the full cleaned corpus
wordcloud = WordCloud(width=800, height=400, background_color='white', colormap='viridis').generate(all_reviews)
# Plot word cloud
plt.figure(figsize=(12, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title('Word Cloud of Clothing Reviews')
plt.axis('off')
plt.show()
#Positive and negative word clouds
# NOTE(review): df['sentiment'] is only created further down in the VADER
# section; this cell must run after that one or it raises a KeyError —
# confirm the intended execution order.
# Extract positive and negative reviews
positive_reviews = df[df['sentiment'] == 1]['clean_text'].tolist()
negative_reviews = df[df['sentiment'] == 0]['clean_text'].tolist()
# Combine positive and negative reviews into single strings
positive_text = ' '.join(positive_reviews)
negative_text = ' '.join(negative_reviews)
# Create word clouds for positive and negative reviews
positive_wordcloud = WordCloud(width=800, height=400, background_color='white', colormap='viridis').generate(positive_text)
negative_wordcloud = WordCloud(width=800, height=400, background_color='white', colormap='inferno').generate(negative_text)
# Plot word clouds side by side
plt.figure(figsize=(15, 8))
plt.subplot(1, 2, 1)
plt.imshow(positive_wordcloud, interpolation='bilinear')
plt.title('Word Cloud of Positive Reviews')
plt.axis('off')
plt.subplot(1, 2, 2)
plt.imshow(negative_wordcloud, interpolation='bilinear')
plt.title('Word Cloud of Negative Reviews')
plt.axis('off')
plt.show()
# Re-import VADER (already imported at the top of the file) and build one
# analyzer instance shared by the scoring function below.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()
# Function to calculate polarity score using VADER
def calculate_polarity_score(text):
    """Return the mean VADER compound score over the sentences of *text*.

    Fix: empty/blank text (clean_text() maps NaN reviews to "") produced an
    empty sentence list and a ZeroDivisionError; it now returns 0.0, which
    map_sentiment() classifies as neutral.
    """
    # Tokenize the text into sentences
    sentences = tokenize.sent_tokenize(text)
    if not sentences:  # guard: no sentences -> neutral score
        return 0.0
    # Average the per-sentence compound scores
    total = sum(analyzer.polarity_scores(s)['compound'] for s in sentences)
    return total / len(sentences)
# Apply sentiment analysis and calculate polarity score for each review
# (row-wise apply over the whole corpus — slow but one-off).
df['polarity_score'] = df['clean_text'].apply(calculate_polarity_score)
# Function to map polarity score to sentiment
def map_sentiment(score):
    """Bucket a VADER compound score using the standard +/-0.05 thresholds.

    Returns 1 for positive (score >= 0.05), 0 for negative (score <= -0.05),
    and -1 for neutral (everything in between).
    """
    if score <= -0.05:
        return 0  # Negative
    if score < 0.05:
        return -1  # Neutral
    return 1  # Positive
# Apply mapping to create binary sentiment column
# (note: map_sentiment can also return -1 for neutral, so the column is
# effectively ternary, not strictly binary).
df['sentiment'] = df['polarity_score'].apply(map_sentiment)
# Display the updated DataFrame with polarity score and sentiment columns
print(df[['clean_text', 'polarity_score', 'sentiment']].head())
clean_text polarity_score \ 0 absolutely wonderful silky sexy comfortable 0.8991 1 love dress sooo pretty happened find store im ... 0.9710 2 high hope dress really wanted work initially o... 0.9081 3 love love love jumpsuit fun flirty fabulous ev... 0.9437 4 shirt flattering due adjustable front tie perf... 0.9062 sentiment 0 1 1 1 2 1 3 1 4 1
# Distribution of Polarity Score
# Histogram of the averaged VADER compound scores per review.
plt.figure(figsize=(8, 6))
sns.histplot(df['polarity_score'], bins=20, kde=True, color='skyblue')
plt.title('Distribution of Polarity Score')
plt.xlabel('Polarity Score')
plt.ylabel('Count')
plt.show()
# Sentiment Distribution
# Counts per sentiment class (1 positive, 0 negative, -1 neutral).
plt.figure(figsize=(8, 6))
sns.countplot(x='sentiment', data=df, palette='viridis')
plt.title('Sentiment Distribution')
plt.xlabel('Sentiment')
plt.ylabel('Count')
plt.show()
# Ensure the NLTK pos_tag function is imported correctly
from nltk import pos_tag
# Perform Part-of-Speech Tagging over the cleaned corpus.
# Fix: the original ran this identical comprehension twice back-to-back;
# tagging the whole corpus is expensive, so do it exactly once.
pos_tagged_corpus = [pos_tag(word_tokenize(review)) for review in df['clean_text']]
# Show Word Counts for Different Parts of Speech
# Initialize dictionaries to store word counts for different parts of speech
noun_counts = {}
verb_counts = {}
adj_counts = {}
adv_counts = {}
# Loop through each review and count occurrences of different parts of speech.
# Fix: the inner loop variable was named 'pos_tag', which shadowed nltk's
# pos_tag function imported above and broke any later call to it.
for pos_tags_review in pos_tagged_corpus:
    for word, tag in pos_tags_review:
        if tag.startswith('N'):  # Nouns
            noun_counts[word] = noun_counts.get(word, 0) + 1
        elif tag.startswith('V'):  # Verbs
            verb_counts[word] = verb_counts.get(word, 0) + 1
        elif tag.startswith('J'):  # Adjectives
            adj_counts[word] = adj_counts.get(word, 0) + 1
        elif tag.startswith('R'):  # Adverbs
            adv_counts[word] = adv_counts.get(word, 0) + 1
# Sort dictionaries by counts in descending order
sorted_noun_counts = sorted(noun_counts.items(), key=lambda x: x[1], reverse=True)
sorted_verb_counts = sorted(verb_counts.items(), key=lambda x: x[1], reverse=True)
sorted_adj_counts = sorted(adj_counts.items(), key=lambda x: x[1], reverse=True)
sorted_adv_counts = sorted(adv_counts.items(), key=lambda x: x[1], reverse=True)
# Print top 10 words for each part of speech
print("Top 10 Nouns:")
print(sorted_noun_counts[:10])
print("\nTop 10 Verbs:")
print(sorted_verb_counts[:10])
print("\nTop 10 Adjectives:")
print(sorted_adj_counts[:10])
print("\nTop 10 Adverbs:")
print(sorted_adv_counts[:10])
# Identify Popular Products
# Identify nouns that can be used to tag products
product_nouns = ['dress', 'jacket', 'bottom', 'top', 'shirt', 'pants', 'skirt', 'blouse', 'coat', 'shoes']
# Filter noun counts for popular products
# (keeps only the noun counts whose word is one of the product nouns above)
popular_products = {noun: count for noun, count in noun_counts.items() if noun in product_nouns}
# Sort popular products by counts in descending order
sorted_popular_products = sorted(popular_products.items(), key=lambda x: x[1], reverse=True)
# Print top popular products
print("\nTop Popular Products:")
print(sorted_popular_products[:10])
# Identify Top Adjectives and Adverbs for Positive vs Negative Reviews
# Extract adjectives and adverbs from positive and negative reviews separately
positive_adjectives = {}
negative_adjectives = {}
positive_adverbs = {}
negative_adverbs = {}
# Loop through each review and extract adjectives and adverbs for positive
# and negative sentiments.
# Fixes: the inner loop variable was named 'pos_tag', shadowing nltk's
# pos_tag function; the enumerate() index was never used and is removed.
for pos_tags_review, sentiment in zip(pos_tagged_corpus, df['sentiment']):
    for word, tag in pos_tags_review:
        if tag.startswith('J'):  # Adjectives
            if sentiment == 1:  # Positive sentiment
                positive_adjectives[word] = positive_adjectives.get(word, 0) + 1
            elif sentiment == 0:  # Negative sentiment
                negative_adjectives[word] = negative_adjectives.get(word, 0) + 1
        elif tag.startswith('R'):  # Adverbs
            if sentiment == 1:  # Positive sentiment
                positive_adverbs[word] = positive_adverbs.get(word, 0) + 1
            elif sentiment == 0:  # Negative sentiment
                negative_adverbs[word] = negative_adverbs.get(word, 0) + 1
# Sort adjectives and adverbs by counts in descending order
sorted_positive_adjectives = sorted(positive_adjectives.items(), key=lambda x: x[1], reverse=True)
sorted_negative_adjectives = sorted(negative_adjectives.items(), key=lambda x: x[1], reverse=True)
sorted_positive_adverbs = sorted(positive_adverbs.items(), key=lambda x: x[1], reverse=True)
sorted_negative_adverbs = sorted(negative_adverbs.items(), key=lambda x: x[1], reverse=True)
# Print top 10 adjectives and adverbs for positive and negative reviews
print("\nTop 10 Adjectives for Positive Reviews:")
print(sorted_positive_adjectives[:10])
print("\nTop 10 Adjectives for Negative Reviews:")
print(sorted_negative_adjectives[:10])
print("\nTop 10 Adverbs for Positive Reviews:")
print(sorted_positive_adverbs[:10])
print("\nTop 10 Adverbs for Negative Reviews:")
print(sorted_negative_adverbs[:10])
Top 10 Nouns:
[('dress', 9774), ('size', 9352), ('color', 6739), ('fit', 5297), ('look', 4829), ('im', 4145), ('love', 3829), ('fabric', 2498), ('shirt', 2411), ('length', 2314)]
Top 10 Verbs:
[('love', 4375), ('ordered', 3406), ('flattering', 2748), ('bought', 2727), ('got', 2388), ('run', 2025), ('look', 1890), ('made', 1823), ('make', 1792), ('go', 1717)]
Top 10 Adjectives:
[('top', 6356), ('great', 6075), ('small', 4572), ('fit', 3751), ('little', 3263), ('soft', 3237), ('wear', 2896), ('comfortable', 2885), ('large', 2786), ('nice', 2742)]
Top 10 Adverbs:
[('really', 3920), ('back', 3116), ('well', 3081), ('also', 2574), ('even', 2148), ('usually', 2005), ('pretty', 1750), ('still', 1750), ('long', 1518), ('however', 1478)]
Top Popular Products:
[('dress', 9774), ('shirt', 2411), ('skirt', 1973), ('top', 1810), ('jacket', 1270), ('bottom', 842), ('blouse', 725), ('coat', 487)]
Top 10 Adjectives for Positive Reviews:
[('top', 6229), ('great', 6033), ('small', 4402), ('fit', 3691), ('little', 3184), ('soft', 3169), ('comfortable', 2863), ('wear', 2826), ('nice', 2702), ('large', 2660)]
Top 10 Adjectives for Negative Reviews:
[('small', 121), ('top', 110), ('large', 88), ('fabric', 80), ('short', 76), ('disappointed', 70), ('big', 70), ('thin', 58), ('little', 54), ('bad', 54)]
Top 10 Adverbs for Positive Reviews:
[('really', 3796), ('well', 3024), ('back', 2931), ('also', 2449), ('even', 2030), ('usually', 1952), ('pretty', 1721), ('still', 1696), ('long', 1469), ('however', 1421)]
Top 10 Adverbs for Negative Reviews:
[('back', 162), ('also', 107), ('really', 96), ('even', 92), ('unfortunately', 58), ('however', 44), ('well', 43), ('usually', 43), ('long', 37), ('sadly', 36)]
# Initialize dictionaries to store word counts for different parts of speech
pos_counts = {'Noun': 0, 'Verb': 0, 'Adjective': 0, 'Adverb': 0, 'Other': 0}
# Loop through each review and count occurrences of different parts of speech.
# Fix: loop variable renamed from 'pos_tag' to 'tag' so it no longer shadows
# nltk's pos_tag function imported earlier in the file.
for pos_tags_review in pos_tagged_corpus:
    for word, tag in pos_tags_review:
        if tag.startswith('N'):  # Nouns
            pos_counts['Noun'] += 1
        elif tag.startswith('V'):  # Verbs
            pos_counts['Verb'] += 1
        elif tag.startswith('J'):  # Adjectives
            pos_counts['Adjective'] += 1
        elif tag.startswith('R'):  # Adverbs
            pos_counts['Adverb'] += 1
        else:
            pos_counts['Other'] += 1
# Plotting the counts for each part of speech
plt.figure(figsize=(10, 6))
plt.bar(pos_counts.keys(), pos_counts.values(), color='skyblue')
plt.title('Word Counts by Part of Speech')
plt.xlabel('Part of Speech')
plt.ylabel('Word Count')
plt.show()
# Visualize the popularity of products
def visualize_popular_products(products, counts, title):
    """Draw a bar chart of how often each product noun appears in reviews."""
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(products, counts, color='skyblue')
    ax.set_title(title)
    ax.set_xlabel('Products')
    ax.set_ylabel('Counts')
    plt.xticks(rotation=45, ha='right')
    plt.show()
products, product_counts = zip(*sorted_popular_products[:10])
visualize_popular_products(products, product_counts, 'Top Popular Products')
import matplotlib.pyplot as plt
# NOTE(review): matplotlib.pyplot is already imported at the top of the
# file; this re-import is harmless but redundant.
# Function to visualize word counts for different parts of speech
def visualize_word_counts(word_counts, title):
    """Bar chart of (word, count) pairs.

    NOTE(review): despite the variable name, each entry is an individual
    word with its count, not a part-of-speech category — so the
    'Part of Speech' x-axis label is misleading; confirm intent.
    """
    parts_of_speech, counts = zip(*word_counts)
    plt.figure(figsize=(10, 6))
    plt.bar(parts_of_speech, counts, color='skyblue')
    plt.title(title)
    plt.xlabel('Part of Speech')
    plt.ylabel('Count')
    plt.show()
# Visualize the word counts for nouns
visualize_word_counts(sorted_noun_counts[:10], 'Top 10 Nouns')
# Visualize the word counts for verbs
visualize_word_counts(sorted_verb_counts[:10], 'Top 10 Verbs')
# Visualize the word counts for adjectives
visualize_word_counts(sorted_adj_counts[:10], 'Top 10 Adjectives')
# Visualize the word counts for adverbs
visualize_word_counts(sorted_adv_counts[:10], 'Top 10 Adverbs')
# Visualize the top adjectives and adverbs for positive and negative reviews
def visualize_top_words(word_counts, title):
    """Bar chart of (word, count) pairs with slanted x-axis labels."""
    words, counts = zip(*word_counts)
    plt.figure(figsize=(10, 6))
    plt.bar(words, counts, color='skyblue')
    plt.title(title)
    plt.xlabel('Words')
    plt.ylabel('Counts')
    plt.xticks(rotation=45, ha='right')
    plt.show()
visualize_top_words(sorted_positive_adjectives[:10], 'Top 10 Adjectives for Positive Reviews')
visualize_top_words(sorted_negative_adjectives[:10], 'Top 10 Adjectives for Negative Reviews')
visualize_top_words(sorted_positive_adverbs[:10], 'Top 10 Adverbs for Positive Reviews')
visualize_top_words(sorted_negative_adverbs[:10], 'Top 10 Adverbs for Negative Reviews')
# Data Preprocessing
# Drop any missing values in the relevant columns
df = df.dropna(subset=['clean_text', 'Review Length', 'polarity_score'])
# Encode the target variable to binary format
# (Recommended IND is already 0/1 per df.describe(), so this is a safeguard)
df['Recommended IND'] = df['Recommended IND'].apply(lambda x: 1 if x == 1 else 0)
# Feature Selection
# NOTE(review): three columns are selected, but only X_train['clean_text']
# is ever passed to the pipeline below, so 'Review Length' and
# 'polarity_score' are unused by the model — confirm that is intended.
X = df[['clean_text', 'Review Length', 'polarity_score']]
y = df['Recommended IND']
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Define the pipeline: TF-IDF features feeding a random forest
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('model', RandomForestClassifier(random_state=42))
])
# Parameter grid for grid search
param_grid = {
    'tfidf__max_features': [1000, 2000, 3000],
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [None, 10, 20, 30]
}
# Grid search with cross-validation (5 folds, accuracy, all cores)
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train['clean_text'], y_train)
GridSearchCV(cv=5,
estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
('model',
RandomForestClassifier(random_state=42))]),
n_jobs=-1,
param_grid={'model__max_depth': [None, 10, 20, 30],
'model__n_estimators': [100, 200, 300],
'tfidf__max_features': [1000, 2000, 3000]},
scoring='accuracy')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=5,
estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
('model',
RandomForestClassifier(random_state=42))]),
n_jobs=-1,
param_grid={'model__max_depth': [None, 10, 20, 30],
'model__n_estimators': [100, 200, 300],
'tfidf__max_features': [1000, 2000, 3000]},
scoring='accuracy')Pipeline(steps=[('tfidf', TfidfVectorizer()),
('model', RandomForestClassifier(random_state=42))])TfidfVectorizer()
RandomForestClassifier(random_state=42)
# Best parameters found by the grid search
print("Best Parameters:", grid_search.best_params_)
Best Parameters: {'model__max_depth': None, 'model__n_estimators': 300, 'tfidf__max_features': 1000}
# Best model (pipeline refit on the full training split)
best_model = grid_search.best_estimator_
# Model evaluation on the held-out test set (text feature only)
y_pred = best_model.predict(X_test['clean_text'])
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("Confusion Matrix:\n", cn(y_test, y_pred))
Accuracy: 0.8661365142478462 Precision: 0.8666031443544545 Recall: 0.987513572204126 F1 Score: 0.9231159604161382 Confusion Matrix: [[ 283 560] [ 46 3638]]
from sklearn.metrics import classification_report
# Fix: ConfusionMatrixDisplay lives directly in sklearn.metrics;
# 'sklearn.metrics.plot' does not exist and raised ModuleNotFoundError
# (see the traceback captured below this cell).
from sklearn.metrics import ConfusionMatrixDisplay
# Plot confusion matrix (from_estimator runs the fitted pipeline on X)
plt.figure(figsize=(8, 6))
ConfusionMatrixDisplay.from_estimator(best_model, X_test['clean_text'], y_test, cmap=plt.cm.Blues, normalize='true')
plt.title('Confusion Matrix for Text Classification')
plt.show()
# Classification Report Visualization
y_pred = best_model.predict(X_test['clean_text'])
report = classification_report(y_test, y_pred, output_dict=True)
df_report = pd.DataFrame(report).transpose()
plt.figure(figsize=(10, 6))
# Drop the last aggregate row; keep precision/recall/f1 columns
sns.heatmap(df_report.iloc[:-1, :3], annot=True, cmap='Blues', fmt=".2f")
plt.title('Classification Report for Text Classification')
plt.xlabel('Metrics')
plt.ylabel('Classes')
plt.show()
--------------------------------------------------------------------------- ModuleNotFoundError Traceback (most recent call last) Cell In[116], line 2 1 from sklearn.metrics import classification_report ----> 2 from sklearn.metrics.plot import ConfusionMatrixDisplay 4 # Plot confusion matrix 5 plt.figure(figsize=(8, 6)) ModuleNotFoundError: No module named 'sklearn.metrics.plot'
# Sample a subset of the data (half, fixed seed for reproducibility)
df_sample = df.sample(frac=0.5, random_state=42)
# Feature Selection for Text Classification Model
# Target is the multi-class 'Department Name' column.
# NOTE(review): 'Department Name' still contains nulls at this point (they
# are only dropped further below) — confirm the labels are clean enough here.
X_text_classification = df_sample['clean_text']
y_text_classification = df_sample['Department Name']
# Train-Test Split for Text Classification Model
X_train_text, X_test_text, y_train_text, y_test_text = train_test_split(X_text_classification, y_text_classification, test_size=0.2, random_state=42)
# Define the pipeline for text classification with simplified parameters
pipeline_text_classification = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('model', LogisticRegression(random_state=42))
])
# Parameter grid for grid search with simplified parameters
param_grid_text_classification = {
    'tfidf__max_features': [1000],
    'model__C': [1.0, 10.0]
}
# Grid search with cross-validation for text classification with simplified parameters
grid_search_text_classification = GridSearchCV(pipeline_text_classification, param_grid_text_classification, cv=3, scoring='accuracy', n_jobs=-1)
grid_search_text_classification.fit(X_train_text, y_train_text)
# Best parameters for text classification
print("Best Parameters for Text Classification:", grid_search_text_classification.best_params_)
# Best model for text classification
best_model_text_classification = grid_search_text_classification.best_estimator_
# Model evaluation for text classification
y_pred_text = best_model_text_classification.predict(X_test_text)
print("Accuracy for Text Classification:", accuracy_score(y_test_text, y_pred_text))
print("Classification Report:\n", classification_report(y_test_text, y_pred_text))
print("Confusion Matrix for Text Classification:\n", cn(y_test_text, y_pred_text))
Best Parameters for Text Classification: {'model__C': 1.0, 'tfidf__max_features': 1000}
Accuracy for Text Classification: 0.8116710875331565
Classification Report:
precision recall f1-score support
Bottoms 0.79 0.74 0.76 356
Dresses 0.89 0.89 0.89 653
Intimate 0.83 0.23 0.36 169
Jackets 0.66 0.41 0.51 99
Tops 0.78 0.93 0.85 978
Trend 0.00 0.00 0.00 7
accuracy 0.81 2262
macro avg 0.66 0.53 0.56 2262
weighted avg 0.81 0.81 0.79 2262
Confusion Matrix for Text Classification:
[[263 20 2 2 69 0]
[ 14 579 2 1 57 0]
[ 22 19 39 5 84 0]
[ 10 7 0 41 41 0]
[ 25 24 4 11 914 0]
[ 1 3 0 2 1 0]]
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
def plot_confusion_matrix(y_true, y_pred, classes, normalize=False, cmap=plt.cm.Reds):
    """Heatmap of the confusion matrix for y_true vs y_pred.

    When normalize=True each row is divided by its total, so cells show
    the fraction of each true class assigned to each predicted class.
    """
    cm = confusion_matrix(y_true, y_pred)
    if normalize:
        # Row-normalize: per-true-class proportions.
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='.2f', cmap=cmap, xticklabels=classes, yticklabels=classes)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title('Confusion Matrix')
    plt.show()
# Plot confusion matrix (row-normalized) for the department classifier
plot_confusion_matrix(y_test_text, y_pred_text, classes=np.unique(y_test_text), normalize=True)
# Reading the normalized confusion matrix above:
# Bottoms are predicted accurately 74% of the time
# Dresses are predicted accurately 89% of the time
# Intimate is predicted accurately only 23% of the time;
# about half of intimates are misclassified as Tops
# Jackets are predicted correctly 41% of the time
# Tops are predicted correctly 93% of the time
# Trend was misclassified 100% of the time (0/7 correct in the matrix
# above); it is most often confused with Dresses
# Check the data type of the target variable and ensure it contains only string values
print("Data type of y_text_classification:", y_text_classification.dtype)
# Check for missing values in the target variable
print("Missing values in y_text_classification:", y_text_classification.isnull().sum())
# If there are any missing values, drop them from the dataset
df.dropna(subset=['Department Name'], inplace=True)
# Convert the target variable to string data type
# (NOTE(review): these fixes run *after* the model was already trained on
# the unclean labels — the split and grid search would need a rerun.)
y_text_classification = df['Department Name'].astype(str)
# Rerun the train-test split and the grid search for text classification
Data type of y_text_classification: object Missing values in y_text_classification: 13
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
# Compute confusion matrix
# NOTE(review): y_test / y_pred here were last assigned in the
# *recommendation* model's evaluation above, so despite the title this
# matrix is the binary recommendation one, not the department classifier's.
cm = confusion_matrix(y_test, y_pred)
# Plot confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix for Text Classification')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()
# Instantiate TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()
# Fit and transform the text data (vocabulary learned on the training split only)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['clean_text'])
# Instantiate Naive Bayes model
naive_bayes = MultinomialNB()
# Fit the Naive Bayes model on the recommendation target
naive_bayes.fit(X_train_tfidf, y_train)
# Transform the test data with the already-fitted vectorizer
X_test_tfidf = tfidf_vectorizer.transform(X_test['clean_text'])
# Predict with Naive Bayes model
y_pred_naive_bayes = naive_bayes.predict(X_test_tfidf)
# Compute confusion matrix for Naive Bayes
cm_naive_bayes = confusion_matrix(y_test, y_pred_naive_bayes)
# Plot confusion matrix for Naive Bayes
plt.figure(figsize=(8, 6))
sns.heatmap(cm_naive_bayes, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix for Naive Bayes')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()
from sklearn.ensemble import RandomForestClassifier
# Instantiate Random Forest model (same TF-IDF features as Naive Bayes above)
random_forest = RandomForestClassifier(random_state=42)
# Fit and predict with Random Forest model
random_forest.fit(X_train_tfidf, y_train)
y_pred_random_forest = random_forest.predict(X_test_tfidf)
# Compute confusion matrix for Random Forest
cm_random_forest = confusion_matrix(y_test, y_pred_random_forest)
# Plot confusion matrix for Random Forest
plt.figure(figsize=(8, 6))
sns.heatmap(cm_random_forest, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix for Random Forest')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()
# Calculate evaluation metrics for Random Forest model
accuracy_rf = accuracy_score(y_test, y_pred_random_forest)
precision_rf = precision_score(y_test, y_pred_random_forest)
recall_rf = recall_score(y_test, y_pred_random_forest)
f1_rf = f1_score(y_test, y_pred_random_forest)
# Calculate evaluation metrics for Naive Bayes model
accuracy_nb = accuracy_score(y_test, y_pred_naive_bayes)
precision_nb = precision_score(y_test, y_pred_naive_bayes)
recall_nb = recall_score(y_test, y_pred_naive_bayes)
f1_nb = f1_score(y_test, y_pred_naive_bayes)
# Confusion matrix for Random Forest (cn is sklearn's confusion_matrix,
# aliased at the top of the file)
cm_rf = cn(y_test, y_pred_random_forest)
# Confusion matrix for Naive Bayes
cm_nb = cn(y_test, y_pred_naive_bayes)
# Conclusion: side-by-side metric summary for both models
print("Based on the evaluation metrics obtained:")
print("Random Forest Model:")
print(f"- Accuracy: {accuracy_rf:.2f}")
print(f"- Precision: {precision_rf:.2f}")
print(f"- Recall: {recall_rf:.2f}")
print(f"- F1 Score: {f1_rf:.2f}")
print("\nNaive Bayes Model:")
print(f"- Accuracy: {accuracy_nb:.2f}")
print(f"- Precision: {precision_nb:.2f}")
print(f"- Recall: {recall_nb:.2f}")
print(f"- F1 Score: {f1_nb:.2f}")
# Print confusion matrices
print("\nConfusion Matrix for Random Forest:")
print(cm_rf)
print("\nConfusion Matrix for Naive Bayes:")
print(cm_nb)
Based on the evaluation metrics obtained: Random Forest Model: - Accuracy: 0.85 - Precision: 0.85 - Recall: 0.99 - F1 Score: 0.91 Naive Bayes Model: - Accuracy: 0.82 - Precision: 0.82 - Recall: 1.00 - F1 Score: 0.90 Confusion Matrix for Random Forest: [[ 171 672] [ 20 3664]] Confusion Matrix for Naive Bayes: [[ 19 824] [ 0 3684]]
# Conclusion
# During EDA and Preprocessing, we cleaned the text,
# did sentiment analysis with VADER, part of speech tagging,
# and visualized the word distribution.
# Because of this we were able to implement the classification models.
# Both Machine Learning models performed well with regards to accuracy,
# precision, recall and F1 score.
# The Random Forest had a higher accuracy and F1 score though,
# which means it has a better predictive capability.
# While both models did well with regards to performance,
# the Random Forest model was the best overall,
# because it offered a balance between precision and recall
# and the Naive Bayes did well in recall, but not precision.
# References
# https://medium.com/analytics-vidhya/evaluating-a-random-forest-model-9d165595ad56
# https://medium.com/@dtuk81/confusion-matrix-visualization-fc31e3f30fea
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
# https://stackoverflow.com/questions/67979512/how-to-find-nlp-words-count-and-plot-it
# https://www.datacamp.com/tutorial/wordcloud-python
# https://www.datacamp.com/tutorial/how-to-make-a-seaborn-histogram
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.drop_duplicates.html
# https://www.geeksforgeeks.org/generating-word-cloud-python/
# https://medium.com/product-ai/text-preprocessing-in-python-steps-tools-and-examples-bf025f872908