import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
#1) INTRODUCTION The objective of the analysis is to gain valuable insights into the customer data
#of an imaginary shop, helping the business better understand its customer base. This analysis aims to support
#data-driven decision-making and marketing strategies to enhance the shop's performance.
#2) EDA/DATA PREP:
# Read the data from the CSV file
data = pd.read_csv("Customers.csv")

# Clean data: drop rows where Age or Work Experience is 0, treating 0 as a
# placeholder for missing/invalid entries (describe() below confirms min Age
# and min Work Experience are 1 after this filter).
data = data[(data['Age'] != 0) & (data['Work Experience'] != 0)]

# Display the first few rows of the dataset
print(data.head())

# Summary statistics of numerical columns
print(data.describe())

# Checking for missing values
# NOTE(review): 'Profession' contains NaNs that are left in place here;
# the numeric models below do not use that column.
print(data.isnull().sum())

# --- Data Distribution Visualizations ---

# Distribution based on Gender
sns.countplot(x='Gender', data=data)
plt.xlabel('Gender')
plt.ylabel('Count')
plt.title('Distribution of Customers by Gender')
plt.show()

# Distribution of Age
plt.figure(figsize=(8, 5))
sns.histplot(data['Age'], kde=True, bins=20, color='skyblue')
plt.title('Distribution of Age', fontsize=16, color='navy')
plt.xlabel('Age', fontsize=12, color='navy')
plt.ylabel('Count', fontsize=12, color='navy')
plt.show()

# Distribution of Annual Income
plt.figure(figsize=(8, 5))
sns.histplot(data['Annual Income ($)'], kde=True, bins=20, color='lightcoral')
plt.title('Distribution of Annual Income', fontsize=16, color='maroon')
plt.xlabel('Annual Income ($)', fontsize=12, color='maroon')
plt.ylabel('Count', fontsize=12, color='maroon')
plt.show()

# Gender distribution
plt.figure(figsize=(6, 6))
# hue + legend=False keeps the per-category colors; passing palette without
# hue is deprecated in seaborn >= 0.13.
sns.countplot(x='Gender', hue='Gender', data=data, palette='Set2', legend=False)
plt.title('Gender Distribution', fontsize=16, color='darkslategray')
plt.xlabel('Gender', fontsize=12, color='darkslategray')
plt.ylabel('Count', fontsize=12, color='darkslategray')
plt.show()

# Correlation heatmap
# numeric_only=True restricts corr() to numeric columns; calling corr() on a
# frame with string columns ('Gender', 'Profession') is deprecated in pandas
# and raises TypeError in pandas >= 2.0.
correlation_matrix = data.corr(numeric_only=True)
plt.figure(figsize=(10, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title('Correlation Heatmap', fontsize=16, color='royalblue')
plt.show()
# --- Cell output (commented out so the file remains valid Python) ---
#    CustomerID  Gender  Age  Annual Income ($)  Spending Score (1-100) \
# 0           1    Male   19              15000                      39
# 1           2    Male   21              35000                      81
# 2           3  Female   20              86000                       6
# 4           5  Female   31              38000                      40
# 6           7  Female   35              31000                       6
#       Profession  Work Experience  Family Size
# 0     Healthcare                1            4
# 1       Engineer                3            3
# 2       Engineer                1            1
# 4  Entertainment                2            6
# 6     Healthcare                1            3
#         CustomerID          Age  Annual Income ($)  Spending Score (1-100) \
# count  1550.000000  1550.000000        1550.000000             1550.000000
# mean   1014.848387    49.022581      112018.722581               50.820645
# std     585.016597    28.098878       45361.798371               27.882768
# min       1.000000     1.000000           0.000000                0.000000
# 25%     498.250000    25.000000       76000.000000               28.000000
# 50%    1016.500000    48.000000      111738.500000               50.000000
# 75%    1508.750000    73.000000      149911.250000               75.000000
# max    2000.000000    99.000000      189974.000000              100.000000
#        Work Experience  Family Size
# count      1550.000000  1550.000000
# mean          5.232258     3.763226
# std           3.698305     1.969113
# min           1.000000     1.000000
# 25%           1.000000     2.000000
# 50%           5.000000     4.000000
# 75%           8.000000     5.000000
# max          17.000000     8.000000
# CustomerID                 0
# Gender                     0
# Age                        0
# Annual Income ($)          0
# Spending Score (1-100)     0
# Profession                28
# Work Experience            0
# Family Size                0
# dtype: int64
# C:\Users\tommy\AppData\Local\Temp\ipykernel_25992\4219903513.py:70: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning. correlation_matrix = data.corr()
#3. FEATURE SELECTION/ MODELING: determine features to be applied and create a model.
# Feature selection: predict Spending Score from Age and Annual Income
selected_features = ['Age', 'Annual Income ($)']
X = data[selected_features]
y = data['Spending Score (1-100)']

# Split the data into training (80%) and testing (20%) sets; fixed seed for
# a reproducible split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the held-out test data
y_pred = model.predict(X_test)

# Evaluate the model's performance.
# RMSE computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error is deprecated and removed in scikit-learn 1.6.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error:", rmse)

# Define new (unseen) customers for prediction; column names must match the
# training features exactly.
new_data = pd.DataFrame({'Age': [25, 30, 35, 40], 'Annual Income ($)': [50000, 60000, 70000, 80000]})

# Use the trained model to make predictions
predicted_scores = model.predict(new_data)

# Display the predicted scores
print("Predicted Spending Scores:", predicted_scores)

# Scatter plot for actual data and predicted scores
plt.figure(figsize=(8, 6))
# Actual data, colored by true spending score
plt.scatter(data['Age'], data['Annual Income ($)'], c=data['Spending Score (1-100)'], cmap='viridis', label='Actual Data', s=50)
# Predicted scores for the new customers, drawn larger as 'x' markers
plt.scatter(new_data['Age'], new_data['Annual Income ($)'], c=predicted_scores, cmap='plasma', label='Predicted Scores', s=100, marker='x')
plt.xlabel('Age')
plt.ylabel('Annual Income ($)')
plt.title('Actual Data vs. Predicted Scores')
plt.legend()
plt.colorbar(label='Spending Score (1-100)')
plt.show()
# Cell output: Root Mean Squared Error: 28.092864424293744 Predicted Spending Scores: [51.05625453 50.93528322 50.81431192 50.69334061]
# Select features for clustering
X = data[['Age', 'Annual Income ($)', 'Family Size']]
# NOTE(review): these features are on very different scales, so Annual Income
# dominates the Euclidean distance; consider StandardScaler before K-Means.

# Create a K-Means clustering model with 3 clusters; random_state pins the
# centroid initialization so the cluster labels are reproducible run-to-run.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
data['Cluster'] = kmeans.fit_predict(X)

# Create a scatter plot to visualize the clustering
plt.figure(figsize=(8, 6))
plt.scatter(data['Annual Income ($)'], data['Age'], c=data['Cluster'], cmap='viridis')
plt.xlabel('Annual Income ($)')
plt.ylabel('Age')
plt.title('K-Means Clustering')
plt.show()
# Box plots of Age / Income / Spending Score across Work Experience levels.
# hue= mirrors x= with legend=False: passing palette without hue is
# deprecated in seaborn >= 0.13.
plt.figure(figsize=(12, 4))
plt.subplot(1, 3, 1)
sns.boxplot(x='Work Experience', y='Age', hue='Work Experience', data=data, palette='Set2', legend=False)
plt.title('Work Experience vs. Age')
plt.subplot(1, 3, 2)
sns.boxplot(x='Work Experience', y='Annual Income ($)', hue='Work Experience', data=data, palette='Set2', legend=False)
plt.title('Work Experience vs. Annual Income')
plt.subplot(1, 3, 3)
sns.boxplot(x='Work Experience', y='Spending Score (1-100)', hue='Work Experience', data=data, palette='Set2', legend=False)
plt.title('Work Experience vs. Spending Score')
plt.tight_layout()
plt.show()
# Box plots of Age / Income / Spending Score across Family Size levels.
# hue= mirrors x= with legend=False: passing palette without hue is
# deprecated in seaborn >= 0.13.
plt.figure(figsize=(12, 4))
plt.subplot(1, 3, 1)
sns.boxplot(x='Family Size', y='Age', hue='Family Size', data=data, palette='Set2', legend=False)
plt.title('Family Size vs. Age')
plt.subplot(1, 3, 2)
sns.boxplot(x='Family Size', y='Annual Income ($)', hue='Family Size', data=data, palette='Set2', legend=False)
plt.title('Family Size vs. Annual Income')
plt.subplot(1, 3, 3)
sns.boxplot(x='Family Size', y='Spending Score (1-100)', hue='Family Size', data=data, palette='Set2', legend=False)
plt.title('Family Size vs. Spending Score')
plt.tight_layout()
plt.show()
# Box plots of Age / Income / Spending Score across Professions.
# hue= mirrors x= with legend=False: passing palette without hue is
# deprecated in seaborn >= 0.13. Profession labels are long, so the x-tick
# labels are rotated vertically.
plt.figure(figsize=(12, 4))
plt.subplot(1, 3, 1)
sns.boxplot(x='Profession', y='Age', hue='Profession', data=data, palette='Set2', legend=False)
plt.xticks(rotation=90)
plt.title('Profession vs. Age')
plt.subplot(1, 3, 2)
sns.boxplot(x='Profession', y='Annual Income ($)', hue='Profession', data=data, palette='Set2', legend=False)
plt.xticks(rotation=90)
plt.title('Profession vs. Annual Income')
plt.subplot(1, 3, 3)
sns.boxplot(x='Profession', y='Spending Score (1-100)', hue='Profession', data=data, palette='Set2', legend=False)
plt.xticks(rotation=90)
plt.title('Profession vs. Spending Score')
plt.tight_layout()
plt.show()
# Multivariate pairplots: the same three numeric variables, colored first by
# Profession and then by Gender, to compare patterns across both groupings.
_pair_vars = ['Age', 'Annual Income ($)', 'Spending Score (1-100)']
for _hue_col, _plot_title in [
    ('Profession', 'Multivariate Analysis by Profession'),
    ('Gender', 'Multivariate Analysis'),
]:
    sns.pairplot(data, hue=_hue_col, vars=_pair_vars)
    plt.suptitle(_plot_title, y=1.02)
    plt.show()
# Select features for the decision tree
selected_features = ['Age', 'Annual Income ($)']
X = data[selected_features]
y = data['Spending Score (1-100)']

# Create a decision tree regressor, limiting depth to 3 so the plotted tree
# stays legible. random_state pins the tie-breaking among equally good
# splits so the fitted tree is reproducible run-to-run.
regressor = DecisionTreeRegressor(max_depth=3, random_state=42)
regressor.fit(X, y)

# Increase the figure size for a larger and more legible visualization
plt.figure(figsize=(12, 10))
# Plot the tree with node ids and sample proportions shown in each node
plot_tree(regressor, filled=True, feature_names=selected_features, node_ids=True, proportion=True)
plt.title("Decision Tree for Customer Spending Score")
plt.show()
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_squared_log_error
#4) PERFORMANCE ASSESSMENT: assess model performance using appropriate metrics.
# Make predictions on the test data
y_pred = model.predict(X_test)

# Calculate performance metrics.
# RMSE via sqrt(MSE): the `squared=False` keyword of mean_squared_error is
# deprecated and removed in scikit-learn 1.6.
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
# MSLE is undefined for negative values and a linear model can predict
# below 0, so clip predictions at 0 first (a no-op when all predictions are
# already non-negative).
msle = mean_squared_log_error(y_test, np.clip(y_pred, 0, None))

# Display the performance metrics
print("Mean Absolute Error (MAE):", mae)
print("Root Mean Squared Error (RMSE):", rmse)
print("R-squared (R2) Score:", r2)
print("Mean Squared Logarithmic Error (MSLE):", msle)

# Define the performance metrics (matplotlib is already imported at the top
# of the file, so the redundant mid-file import was dropped)
metrics = ['MAE', 'RMSE', 'R2 Score', 'MSLE']
values = [mae, rmse, r2, msle]

# Create a bar chart to visualize the metrics
plt.figure(figsize=(10, 6))
plt.bar(metrics, values, color=['skyblue', 'lightcoral', 'lightgreen', 'lightsalmon'])
plt.xlabel('Metric')
plt.ylabel('Value')
plt.title('Model Performance Metrics')
plt.grid(axis='y', linestyle='--', alpha=0.7)  # horizontal grid lines for clarity
plt.show()
# Cell output: Mean Absolute Error (MAE): 23.94165787913068 Root Mean Squared Error (RMSE): 28.092864424293744 R-squared (R2) Score: 0.0016803341421710583 Mean Squared Logarithmic Error (MSLE): 0.7150881561392551
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans

# Sample a smaller subset of the data to reduce memory usage; random_state
# makes the sample (and therefore the plot) reproducible.
sample_size = 500
sample_data = data.sample(sample_size, random_state=42)

# Extract relevant features from the sampled data
X = sample_data[['Age', 'Annual Income ($)']]  # Features

# Create a K-NN classifier whose neighborhood spans the whole sample
# (n_neighbors == number of points), mimicking an unsupervised smoother.
knn_classifier = KNeighborsClassifier(n_neighbors=len(X))

# Fit the K-NN classifier to pseudo-labels produced by K-Means clustering.
# n_init is set explicitly (its default changes to 'auto' in sklearn 1.4,
# which otherwise emits a FutureWarning); random_state pins the clusters.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
pseudo_labels = kmeans.fit_predict(X)
knn_classifier.fit(X, pseudo_labels)

# Create a meshgrid of points covering the feature ranges (coarse step size
# keeps the grid small).
x_min, x_max = X['Age'].min() - 1, X['Age'].max() + 1
y_min, y_max = X['Annual Income ($)'].min() - 1, X['Annual Income ($)'].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 20), np.arange(y_min, y_max, 20))

# Predict the cluster for each grid point. Wrapping the grid in a DataFrame
# with the same column names used for fitting avoids sklearn's
# "X does not have valid feature names" warning.
grid = pd.DataFrame({'Age': xx.ravel(), 'Annual Income ($)': yy.ravel()})
Z = knn_classifier.predict(grid)
Z = Z.reshape(xx.shape)

# Plot the decision boundary with the sampled points overlaid
plt.contourf(xx, yy, Z, alpha=0.4)
plt.scatter(X['Age'], X['Annual Income ($)'], c=pseudo_labels, s=20, edgecolor='k')
plt.xlabel('Age')
plt.ylabel('Annual Income ($)')
plt.title('K-Nearest Neighbors Decision Boundary (Unsupervised)')
plt.show()
# Cell output (warnings): C:\Users\tommy\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10) C:\Users\tommy\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2. warnings.warn( C:\Users\tommy\anaconda3\Lib\site-packages\sklearn\base.py:464: UserWarning: X does not have valid feature names, but KNeighborsClassifier was fitted with feature names warnings.warn(
#5) Conclusion
#In conclusion, the analysis of customer data from the imaginary shop has provided
#valuable insights into the factors influencing customer spending behavior.
#The analysis was aimed at helping the business better understand its customer base,
#enabling data-driven decision-making and marketing strategies to enhance the shop's performance.
#Key findings and results from the analysis include:
#A Linear Regression model was created to predict customer Spending Scores
#using features such as Age and Annual Income.
#The model's performance was assessed using multiple metrics,
#including Mean Absolute Error (MAE), Root Mean Squared Error (RMSE), R-squared (R2) Score,
#and Mean Squared Logarithmic Error (MSLE).
#The RMSE value of approximately 28.09 indicates the average
#prediction error of the model, with lower values indicating
#better predictive accuracy.
#The R2 Score, which measures the proportion of variance
#explained by the model, was positive but very close to zero, suggesting that
#the model provides only limited predictive information on its own.
#The other metrics, such as MAE and MSLE, provide additional insights into the model's performance,
#especially when dealing with specific aspects of the data.
#Visualizations were created to present the model's performance
#metrics in an easily understandable manner, making it simpler to evaluate
#the strengths and weaknesses of the model.
#The pairplot is a valuable tool for understanding the relationships and interactions between
#multiple variables in the dataset. By using colors to represent 'Gender,' you can explore how
#'Age,' 'Annual Income,' and 'Spending Score' vary with respect to gender, allowing you to identify patterns
#and trends in the data.
#The box plots show how "Age," "Annual Income," and "Spending Score" vary across
#levels of "Work Experience," "Family Size," and "Profession." They show the central
#tendency and spread of each variable for the different levels of each grouping factor,
#helping to identify trends and variations in the data. These visualizations can be valuable for understanding the
#relationships between these variables and making informed decisions or further analysis based on these insights.
#The Decision Tree code simplifies the decision tree visualization and makes it larger for better legibility,
#while still retaining essential information about the model's structure and splits.
#It's a helpful tool for understanding how the decision tree uses 'Age' and 'Annual Income' to
#predict customer spending scores.
#The KNN code combines unsupervised clustering (K-Means) and unsupervised classification (K-NN)
#to visually demonstrate how K-NN can group data points into clusters, even when it doesn't have access to true class labels.
#The decision boundary plotted shows how K-NN assigns data points to different clusters based on their proximity to one
#another in the feature space. This technique can be useful for exploring the natural grouping of data when true labels
#are not available. It's important to note that this is a visualization technique and might not always provide clear and
#meaningful clusters, depending on the data.
#While the analysis has provided valuable insights and a predictive model for customer
#spending behavior, the specific objectives outlined in the introduction may vary based on
#the business's goals. Further analysis and refinements can be made to improve the model's
#accuracy and to align with more specific business objectives, such as targeted marketing
#strategies or customer segmentation.