In [28]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


#1) INTRODUCTION: The objective of the analysis is to gain valuable insights into the customer data 
#of an imaginary shop, helping the business better understand its customer base. This analysis aims to support 
#data-driven decision-making and marketing strategies to enhance the shop's performance.

#2) EDA/DATA PREP:

# Read the data from the CSV file
data = pd.read_csv("Customers.csv")

# Clean data: drop rows where Age or Work Experience is recorded as 0
# (treated here as invalid placeholder entries rather than true zeros).
data = data[(data['Age'] != 0) & (data['Work Experience'] != 0)]

# Display the first few rows of the dataset
print(data.head())

# Summary statistics of numerical columns
print(data.describe())

# Checking for missing values
# NOTE(review): 'Profession' shows missing values (28 on the sample run) that
# are never imputed or dropped — confirm downstream plots tolerate NaN there.
print(data.isnull().sum())

# Data Distribution Visualizations

# Distribution of Age
plt.figure(figsize=(8, 5))
sns.histplot(data['Age'], kde=True, bins=20, color='skyblue')
plt.title('Distribution of Age', fontsize=16, color='navy')
plt.xlabel('Age', fontsize=12, color='navy')
plt.ylabel('Count', fontsize=12, color='navy')
plt.show()

# Distribution of Annual Income
plt.figure(figsize=(8, 5))
sns.histplot(data['Annual Income ($)'], kde=True, bins=20, color='lightcoral')
plt.title('Distribution of Annual Income', fontsize=16, color='maroon')
plt.xlabel('Annual Income ($)', fontsize=12, color='maroon')
plt.ylabel('Count', fontsize=12, color='maroon')
plt.show()

# Gender distribution (an identical unstyled countplot that used to precede the
# Age histogram was removed — this single plot conveys the same information)
plt.figure(figsize=(6, 6))
sns.countplot(x='Gender', data=data, palette='Set2')
plt.title('Gender Distribution', fontsize=16, color='darkslategray')
plt.xlabel('Gender', fontsize=12, color='darkslategray')
plt.ylabel('Count', fontsize=12, color='darkslategray')
plt.show()

# Correlation heatmap
# numeric_only=True restricts the correlation to numeric columns explicitly:
# calling corr() on a frame with string columns emits the FutureWarning seen in
# this cell's output on pandas 1.x and raises a TypeError on pandas >= 2.0.
correlation_matrix = data.corr(numeric_only=True)
plt.figure(figsize=(10, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title('Correlation Heatmap', fontsize=16, color='royalblue')
plt.show()
   CustomerID  Gender  Age  Annual Income ($)  Spending Score (1-100)  \
0           1    Male   19              15000                      39   
1           2    Male   21              35000                      81   
2           3  Female   20              86000                       6   
4           5  Female   31              38000                      40   
6           7  Female   35              31000                       6   

      Profession  Work Experience  Family Size  
0     Healthcare                1            4  
1       Engineer                3            3  
2       Engineer                1            1  
4  Entertainment                2            6  
6     Healthcare                1            3  
        CustomerID          Age  Annual Income ($)  Spending Score (1-100)  \
count  1550.000000  1550.000000        1550.000000             1550.000000   
mean   1014.848387    49.022581      112018.722581               50.820645   
std     585.016597    28.098878       45361.798371               27.882768   
min       1.000000     1.000000           0.000000                0.000000   
25%     498.250000    25.000000       76000.000000               28.000000   
50%    1016.500000    48.000000      111738.500000               50.000000   
75%    1508.750000    73.000000      149911.250000               75.000000   
max    2000.000000    99.000000      189974.000000              100.000000   

       Work Experience  Family Size  
count      1550.000000  1550.000000  
mean          5.232258     3.763226  
std           3.698305     1.969113  
min           1.000000     1.000000  
25%           1.000000     2.000000  
50%           5.000000     4.000000  
75%           8.000000     5.000000  
max          17.000000     8.000000  
CustomerID                 0
Gender                     0
Age                        0
Annual Income ($)          0
Spending Score (1-100)     0
Profession                28
Work Experience            0
Family Size                0
dtype: int64
C:\Users\tommy\AppData\Local\Temp\ipykernel_25992\4219903513.py:70: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  correlation_matrix = data.corr()
In [27]:
#3. FEATURE SELECTION/ MODELING: determine features to be applied and create a model.


# Feature selection: predict Spending Score from Age and Annual Income
selected_features = ['Age', 'Annual Income ($)']
X = data[selected_features]
y = data['Spending Score (1-100)']


# Split the data into training and testing sets (seeded for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the model's performance.
# np.sqrt(MSE) instead of mean_squared_error(..., squared=False): the `squared`
# keyword is deprecated and removed in scikit-learn 1.6, while this form works
# on every version.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error:", rmse)

# Define new data for prediction
new_data = pd.DataFrame({'Age': [25, 30, 35, 40], 'Annual Income ($)': [50000, 60000, 70000, 80000]})

# Use the trained model to make predictions
predicted_scores = model.predict(new_data)

# Display the predicted scores
print("Predicted Spending Scores:", predicted_scores)


# Scatter plot for actual data and predicted scores
plt.figure(figsize=(8, 6))

# Actual data, colored by the true spending score
plt.scatter(data['Age'], data['Annual Income ($)'], c=data['Spending Score (1-100)'], cmap='viridis', label='Actual Data', s=50)

# Predicted scores for the new data points, drawn as larger x markers
plt.scatter(new_data['Age'], new_data['Annual Income ($)'], c=predicted_scores, cmap='plasma', label='Predicted Scores', s=100, marker='x')

plt.xlabel('Age')
plt.ylabel('Annual Income ($)')
plt.title('Actual Data vs. Predicted Scores')
plt.legend()
plt.colorbar(label='Spending Score (1-100)')
plt.show()
Root Mean Squared Error: 28.092864424293744
Predicted Spending Scores: [51.05625453 50.93528322 50.81431192 50.69334061]
In [5]:
# Select features for clustering
X = data[['Age', 'Annual Income ($)', 'Family Size']]

# K-Means clustering model with 3 clusters.
# n_init is set explicitly (the sklearn default is changing to 'auto') and
# random_state fixes the otherwise nondeterministic centroid initialisation so
# the cluster labels are reproducible across notebook re-runs.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)

# Assign each customer to a cluster
data['Cluster'] = kmeans.fit_predict(X)

# Create a scatter plot to visualize the clustering
plt.figure(figsize=(8, 6))
plt.scatter(data['Annual Income ($)'], data['Age'], c=data['Cluster'], cmap='viridis')
plt.xlabel('Annual Income ($)')
plt.ylabel('Age')
plt.title('K-Means Clustering')
plt.show()
In [15]:
# Box plots: how Age, Annual Income and Spending Score vary with Work Experience.
fig, axes = plt.subplots(1, 3, figsize=(12, 4))

panels = [
    ('Age', 'Work Experience vs. Age'),
    ('Annual Income ($)', 'Work Experience vs. Annual Income'),
    ('Spending Score (1-100)', 'Work Experience vs. Spending Score'),
]
for ax, (column, panel_title) in zip(axes, panels):
    sns.boxplot(x='Work Experience', y=column, data=data, palette='Set2', ax=ax)
    ax.set_title(panel_title)

fig.tight_layout()
plt.show()
In [16]:
# Box plots: Age, Annual Income and Spending Score broken down by Family Size.
fig, (ax_age, ax_income, ax_score) = plt.subplots(1, 3, figsize=(12, 4))

sns.boxplot(x='Family Size', y='Age', data=data, palette='Set2', ax=ax_age)
ax_age.set_title('Family Size vs. Age')

sns.boxplot(x='Family Size', y='Annual Income ($)', data=data, palette='Set2', ax=ax_income)
ax_income.set_title('Family Size vs. Annual Income')

sns.boxplot(x='Family Size', y='Spending Score (1-100)', data=data, palette='Set2', ax=ax_score)
ax_score.set_title('Family Size vs. Spending Score')

fig.tight_layout()
plt.show()
In [17]:
# Box plots by Profession; x tick labels are rotated because the category
# names are long.
fig, axes = plt.subplots(1, 3, figsize=(12, 4))

specs = [
    ('Age', 'Profession vs. Age'),
    ('Annual Income ($)', 'Profession vs. Annual Income'),
    ('Spending Score (1-100)', 'Profession vs. Spending Score'),
]
for ax, (column, panel_title) in zip(axes, specs):
    sns.boxplot(x='Profession', y=column, data=data, palette='Set2', ax=ax)
    ax.tick_params(axis='x', rotation=90)
    ax.set_title(panel_title)

fig.tight_layout()
plt.show()
In [20]:
# Multivariate scatterplot matrix, coloured by Profession
grid = sns.pairplot(data=data, vars=['Age', 'Annual Income ($)', 'Spending Score (1-100)'], hue='Profession')
grid.fig.suptitle("Multivariate Analysis by Profession", y=1.02)
plt.show()
In [21]:
# Pairplot for multivariate analysis, coloured by Gender
grid = sns.pairplot(data=data, vars=['Age', 'Annual Income ($)', 'Spending Score (1-100)'], hue='Gender')
grid.fig.suptitle("Multivariate Analysis", y=1.02)
plt.show()
In [22]:
# Select features for the decision tree
selected_features = ['Age', 'Annual Income ($)']
X = data[selected_features]
y = data['Spending Score (1-100)']

# Create a decision tree regressor and limit the tree depth.
# random_state is set because scikit-learn randomly permutes candidate features
# at each split even with the default splitter='best', so the fitted tree is
# otherwise not fully reproducible across runs.
regressor = DecisionTreeRegressor(max_depth=3, random_state=42)
regressor.fit(X, y)

# Increase the figure size for a larger and more legible visualization
plt.figure(figsize=(12, 10))

# Plot the tree with node ids and class proportions shown in each node
plot_tree(regressor, filled=True, feature_names=selected_features, node_ids=True, proportion=True)

plt.title("Decision Tree for Customer Spending Score")
plt.show()
In [29]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_squared_log_error

#4) PERFORMANCE ASSESSMENT: assess model performance using appropriate metrics.

# Make predictions on the test data
# (model / X_test / y_test come from the modeling cell above)
y_pred = model.predict(X_test)

# Calculate performance metrics.
# RMSE via np.sqrt(MSE): the `squared` keyword of mean_squared_error is
# deprecated and removed in scikit-learn 1.6.
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
# NOTE(review): mean_squared_log_error raises if any prediction is negative;
# it worked on this run, but a linear model can extrapolate below zero.
msle = mean_squared_log_error(y_test, y_pred)

# Display the performance metrics
print("Mean Absolute Error (MAE):", mae)
print("Root Mean Squared Error (RMSE):", rmse)
print("R-squared (R2) Score:", r2)
print("Mean Squared Logarithmic Error (MSLE):", msle)

# Define the performance metrics for plotting.
# (The duplicate `import matplotlib.pyplot as plt` that used to sit here was
# redundant — pyplot is already imported in the first cell.)
metrics = ['MAE', 'RMSE', 'R2 Score', 'MSLE']
values = [mae, rmse, r2, msle]

# Create a bar chart to visualize the metrics.
# Caveat: these metrics live on very different scales (R2/MSLE near 0-1, MAE/RMSE
# in the tens), so this chart is only a rough visual summary.
plt.figure(figsize=(10, 6))
plt.bar(metrics, values, color=['skyblue', 'lightcoral', 'lightgreen', 'lightsalmon'])
plt.xlabel('Metric')
plt.ylabel('Value')
plt.title('Model Performance Metrics')
plt.grid(axis='y', linestyle='--', alpha=0.7)  # Add horizontal grid lines for clarity
plt.show()
Mean Absolute Error (MAE): 23.94165787913068
Root Mean Squared Error (RMSE): 28.092864424293744
R-squared (R2) Score: 0.0016803341421710583
Mean Squared Logarithmic Error (MSLE): 0.7150881561392551
In [75]:
from sklearn.neighbors import KNeighborsClassifier
# (numpy, matplotlib and KMeans are already imported in the first cell)

# Sample a smaller subset of the data to reduce memory usage.
# random_state makes the sample — and everything downstream — reproducible.
sample_size = 500
sample_data = data.sample(sample_size, random_state=42)

# Extract relevant features from the sampled data
X = sample_data[['Age', 'Annual Income ($)']]  # Features

# Pseudo-label the sample with K-Means clustering, then fit a K-NN classifier
# on those labels so a decision boundary can be drawn over the feature space.
# n_init/random_state: silence the sklearn FutureWarning about the changing
# n_init default and make the clustering deterministic.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
pseudo_labels = kmeans.fit_predict(X)

# BUGFIX: the original used n_neighbors=len(X), which turns every prediction
# into a majority vote over the entire sample — the "decision boundary" then
# collapses to a single cluster everywhere. A small neighborhood actually
# shows the cluster regions.
knn_classifier = KNeighborsClassifier(n_neighbors=15)
knn_classifier.fit(X, pseudo_labels)

# Create a meshgrid of points covering the feature space
x_min, x_max = X['Age'].min() - 1, X['Age'].max() + 1
y_min, y_max = X['Annual Income ($)'].min() - 1, X['Annual Income ($)'].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 20), np.arange(y_min, y_max, 20))  # coarse step keeps this fast

# Predict the cluster for each point in the meshgrid.
# Wrapping the grid in a DataFrame with the training column names avoids the
# sklearn "X does not have valid feature names" warning seen in the output.
grid_points = pd.DataFrame({'Age': xx.ravel(), 'Annual Income ($)': yy.ravel()})
Z = knn_classifier.predict(grid_points)
Z = Z.reshape(xx.shape)

# Plot the decision boundary with the sampled points on top
plt.contourf(xx, yy, Z, alpha=0.4)
plt.scatter(X['Age'], X['Annual Income ($)'], c=pseudo_labels, s=20, edgecolor='k')
plt.xlabel('Age')
plt.ylabel('Annual Income ($)')
plt.title('K-Nearest Neighbors Decision Boundary (Unsupervised)')
plt.show()
C:\Users\tommy\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
C:\Users\tommy\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2.
  warnings.warn(
C:\Users\tommy\anaconda3\Lib\site-packages\sklearn\base.py:464: UserWarning: X does not have valid feature names, but KNeighborsClassifier was fitted with feature names
  warnings.warn(
In [ ]:
#5) Conclusion

#In conclusion, the analysis of customer data from the imaginary shop has provided 
#valuable insights into the factors influencing customer spending behavior. 
#The analysis was aimed at helping the business better understand its customer base, 
#enabling data-driven decision-making and marketing strategies to enhance the shop's performance.

#Key findings and results from the analysis include:

#A Linear Regression model was created to predict customer Spending Scores 
#using features such as Age and Annual Income.

#The model's performance was assessed using multiple metrics, 
#including Mean Absolute Error (MAE), Root Mean Squared Error (RMSE), R-squared (R2) Score, 
#and Mean Squared Logarithmic Error (MSLE).

#The RMSE value of approximately 28.09 indicates the average 
#prediction error of the model, with lower values indicating 
#better predictive accuracy.

#The R2 Score, which measures the proportion of variance 
#explained by the model, was only marginally positive (about 0.002), 
#indicating the model explains almost none of the variance in spending scores.

#The other metrics, such as MAE and MSLE, provide additional insights into the model's performance, 
#especially when dealing with specific aspects of the data.

#Visualizations were created to present the model's performance 
#metrics in an easily understandable manner, making it simpler to evaluate 
#the strengths and weaknesses of the model.

#The pairplot is a valuable tool for understanding the relationships and interactions between 
#multiple variables in the dataset. By using colors to represent 'Gender,' you can explore how 
#'Age,' 'Annual Income,' and 'Spending Score' vary with respect to gender, allowing you to identify patterns 
#and trends in the data.

#The box plots provide insights into how "Age," "Annual Income," and "Spending Score" 
#vary across levels of "Work Experience," "Family Size," and "Profession." They show the central 
#tendency and spread of each variable for the different groups, 
#helping to identify trends and variations in the data. These visualizations can be valuable for understanding the 
#relationships between these variables and making informed decisions or further analysis based on these insights.

#The Decision Tree code simplifies the decision tree visualization and makes it larger for better legibility, 
#while still retaining essential information about the model's structure and splits. 
#It's a helpful tool for understanding how the decision tree uses 'Age' and 'Annual Income' to 
#predict customer spending scores.

#The KNN code combines unsupervised clustering (K-Means) and unsupervised classification (K-NN) 
#to visually demonstrate how K-NN can group data points into clusters, even when it doesn't have access to true class labels.
#The decision boundary plotted shows how K-NN assigns data points to different clusters based on their proximity to one 
#another in the feature space. This technique can be useful for exploring the natural grouping of data when true labels 
#are not available. It's important to note that this is a visualization technique and might not always provide clear and 
#meaningful clusters, depending on the data.

#While the analysis has provided valuable insights and a predictive model for customer 
#spending behavior, the specific objectives outlined in the introduction may vary based on 
#the business's goals. Further analysis and refinements can be made to improve the model's 
#accuracy and to align with more specific business objectives, such as targeted marketing 
#strategies or customer segmentation.
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:

In [66]:
 
In [ ]: