import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Load the Iris dataset from a local CSV into a DataFrame and echo it.
# Expected columns (per the output below): sepal-length, sepal-width,
# petal-length, petal-width, species.
data = pd.read_csv("irisdata.csv")
print(data)
sepal-length sepal-width petal-length petal-width species 0 5.1 3.5 1.4 0.2 Iris-setosa 1 4.9 3.0 1.4 0.2 Iris-setosa 2 4.7 3.2 1.3 0.2 Iris-setosa 3 4.6 3.1 1.5 0.2 Iris-setosa 4 5.0 3.6 1.4 0.2 Iris-setosa 5 5.4 3.9 1.7 0.4 Iris-setosa 6 4.6 3.4 1.4 0.3 Iris-setosa 7 5.0 3.4 1.5 0.2 Iris-setosa 8 4.4 2.9 1.4 0.2 Iris-setosa 9 4.9 3.1 1.5 0.1 Iris-setosa 10 5.4 3.7 1.5 0.2 Iris-setosa 11 4.8 3.4 1.6 0.2 Iris-setosa 12 4.8 3.0 1.4 0.1 Iris-setosa 13 4.3 3.0 1.1 0.1 Iris-setosa 14 5.8 4.0 1.2 0.2 Iris-setosa 15 5.7 4.4 1.5 0.4 Iris-setosa 16 5.4 3.9 1.3 0.4 Iris-setosa 17 5.1 3.5 1.4 0.3 Iris-setosa 18 5.7 3.8 1.7 0.3 Iris-setosa 19 5.1 3.8 1.5 0.3 Iris-setosa 20 5.4 3.4 1.7 0.2 Iris-setosa 21 5.1 3.7 1.5 0.4 Iris-setosa 22 4.6 3.6 1.0 0.2 Iris-setosa 23 5.1 3.3 1.7 0.5 Iris-setosa 24 4.8 3.4 1.9 0.2 Iris-setosa 25 5.0 3.0 1.6 0.2 Iris-setosa 26 5.0 3.4 1.6 0.4 Iris-setosa 27 5.2 3.5 1.5 0.2 Iris-setosa 28 5.2 3.4 1.4 0.2 Iris-setosa 29 4.7 3.2 1.6 0.2 Iris-setosa 30 4.8 3.1 1.6 0.2 Iris-setosa 31 5.4 3.4 1.5 0.4 Iris-setosa 32 5.2 4.1 1.5 0.1 Iris-setosa 33 5.5 4.2 1.4 0.2 Iris-setosa 34 4.9 3.1 1.5 0.1 Iris-setosa 35 5.0 3.2 1.2 0.2 Iris-setosa 36 5.5 3.5 1.3 0.2 Iris-setosa 37 4.9 3.1 1.5 0.1 Iris-setosa 38 4.4 3.0 1.3 0.2 Iris-setosa 39 5.1 3.4 1.5 0.2 Iris-setosa 40 5.0 3.5 1.3 0.3 Iris-setosa 41 4.5 2.3 1.3 0.3 Iris-setosa 42 4.4 3.2 1.3 0.2 Iris-setosa 43 5.0 3.5 1.6 0.6 Iris-setosa 44 5.1 3.8 1.9 0.4 Iris-setosa 45 4.8 3.0 1.4 0.3 Iris-setosa 46 5.1 3.8 1.6 0.2 Iris-setosa 47 4.6 3.2 1.4 0.2 Iris-setosa 48 5.3 3.7 1.5 0.2 Iris-setosa 49 5.0 3.3 1.4 0.2 Iris-setosa 50 7.0 3.2 4.7 1.4 Iris-versicolor 51 6.4 3.2 4.5 1.5 Iris-versicolor 52 6.9 3.1 4.9 1.5 Iris-versicolor 53 5.5 2.3 4.0 1.3 Iris-versicolor 54 6.5 2.8 4.6 1.5 Iris-versicolor 55 5.7 2.8 4.5 1.3 Iris-versicolor 56 6.3 3.3 4.7 1.6 Iris-versicolor 57 4.9 2.4 3.3 1.0 Iris-versicolor 58 6.6 2.9 4.6 1.3 Iris-versicolor 59 5.2 2.7 3.9 1.4 Iris-versicolor 60 5.0 2.0 3.5 1.0 Iris-versicolor 61 5.9 3.0 4.2 
1.5 Iris-versicolor 62 6.0 2.2 4.0 1.0 Iris-versicolor 63 6.1 2.9 4.7 1.4 Iris-versicolor 64 5.6 2.9 3.6 1.3 Iris-versicolor 65 6.7 3.1 4.4 1.4 Iris-versicolor 66 5.6 3.0 4.5 1.5 Iris-versicolor 67 5.8 2.7 4.1 1.0 Iris-versicolor 68 6.2 2.2 4.5 1.5 Iris-versicolor 69 5.6 2.5 3.9 1.1 Iris-versicolor 70 5.9 3.2 4.8 1.8 Iris-versicolor 71 6.1 2.8 4.0 1.3 Iris-versicolor 72 6.3 2.5 4.9 1.5 Iris-versicolor 73 6.1 2.8 4.7 1.2 Iris-versicolor 74 6.4 2.9 4.3 1.3 Iris-versicolor 75 6.6 3.0 4.4 1.4 Iris-versicolor 76 6.8 2.8 4.8 1.4 Iris-versicolor 77 6.7 3.0 5.0 1.7 Iris-versicolor 78 6.0 2.9 4.5 1.5 Iris-versicolor 79 5.7 2.6 3.5 1.0 Iris-versicolor 80 5.5 2.4 3.8 1.1 Iris-versicolor 81 5.5 2.4 3.7 1.0 Iris-versicolor 82 5.8 2.7 3.9 1.2 Iris-versicolor 83 6.0 2.7 5.1 1.6 Iris-versicolor 84 5.4 3.0 4.5 1.5 Iris-versicolor 85 6.0 3.4 4.5 1.6 Iris-versicolor 86 6.7 3.1 4.7 1.5 Iris-versicolor 87 6.3 2.3 4.4 1.3 Iris-versicolor 88 5.6 3.0 4.1 1.3 Iris-versicolor 89 5.5 2.5 4.0 1.3 Iris-versicolor 90 5.5 2.6 4.4 1.2 Iris-versicolor 91 6.1 3.0 4.6 1.4 Iris-versicolor 92 5.8 2.6 4.0 1.2 Iris-versicolor 93 5.0 2.3 3.3 1.0 Iris-versicolor 94 5.6 2.7 4.2 1.3 Iris-versicolor 95 5.7 3.0 4.2 1.2 Iris-versicolor 96 5.7 2.9 4.2 1.3 Iris-versicolor 97 6.2 2.9 4.3 1.3 Iris-versicolor 98 5.1 2.5 3.0 1.1 Iris-versicolor 99 5.7 2.8 4.1 1.3 Iris-versicolor 100 6.3 3.3 6.0 2.5 Iris-virginica 101 5.8 2.7 5.1 1.9 Iris-virginica 102 7.1 3.0 5.9 2.1 Iris-virginica 103 6.3 2.9 5.6 1.8 Iris-virginica 104 6.5 3.0 5.8 2.2 Iris-virginica 105 7.6 3.0 6.6 2.1 Iris-virginica 106 4.9 2.5 4.5 1.7 Iris-virginica 107 7.3 2.9 6.3 1.8 Iris-virginica 108 6.7 2.5 5.8 1.8 Iris-virginica 109 7.2 3.6 6.1 2.5 Iris-virginica 110 6.5 3.2 5.1 2.0 Iris-virginica 111 6.4 2.7 5.3 1.9 Iris-virginica 112 6.8 3.0 5.5 2.1 Iris-virginica 113 5.7 2.5 5.0 2.0 Iris-virginica 114 5.8 2.8 5.1 2.4 Iris-virginica 115 6.4 3.2 5.3 2.3 Iris-virginica 116 6.5 3.0 5.5 1.8 Iris-virginica 117 7.7 3.8 6.7 2.2 Iris-virginica 118 7.7 2.6 6.9 2.3 
Iris-virginica 119 6.0 2.2 5.0 1.5 Iris-virginica 120 6.9 3.2 5.7 2.3 Iris-virginica 121 5.6 2.8 4.9 2.0 Iris-virginica 122 7.7 2.8 6.7 2.0 Iris-virginica 123 6.3 2.7 4.9 1.8 Iris-virginica 124 6.7 3.3 5.7 2.1 Iris-virginica 125 7.2 3.2 6.0 1.8 Iris-virginica 126 6.2 2.8 4.8 1.8 Iris-virginica 127 6.1 3.0 4.9 1.8 Iris-virginica 128 6.4 2.8 5.6 2.1 Iris-virginica 129 7.2 3.0 5.8 1.6 Iris-virginica 130 7.4 2.8 6.1 1.9 Iris-virginica 131 7.9 3.8 6.4 2.0 Iris-virginica 132 6.4 2.8 5.6 2.2 Iris-virginica 133 6.3 2.8 5.1 1.5 Iris-virginica 134 6.1 2.6 5.6 1.4 Iris-virginica 135 7.7 3.0 6.1 2.3 Iris-virginica 136 6.3 3.4 5.6 2.4 Iris-virginica 137 6.4 3.1 5.5 1.8 Iris-virginica 138 6.0 3.0 4.8 1.8 Iris-virginica 139 6.9 3.1 5.4 2.1 Iris-virginica 140 6.7 3.1 5.6 2.4 Iris-virginica 141 6.9 3.1 5.1 2.3 Iris-virginica 142 5.8 2.7 5.1 1.9 Iris-virginica 143 6.8 3.2 5.9 2.3 Iris-virginica 144 6.7 3.3 5.7 2.5 Iris-virginica 145 6.7 3.0 5.2 2.3 Iris-virginica 146 6.3 2.5 5.0 1.9 Iris-virginica 147 6.5 3.0 5.2 2.0 Iris-virginica 148 6.2 3.4 5.4 2.3 Iris-virginica 149 5.9 3.0 5.1 1.8 Iris-virginica
# Shape of the dataset as a (rows, columns) tuple.
dimensions = data.shape
print(dimensions)
(150, 5)
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 150 entries, 0 to 149 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 sepal-length 150 non-null float64 1 sepal-width 150 non-null float64 2 petal-length 150 non-null float64 3 petal-width 150 non-null float64 4 species 150 non-null object dtypes: float64(4), object(1) memory usage: 6.0+ KB
# Preview the dataset: first five rows, a blank separator, then the last five.
first_rows = data.head()
last_rows = data.tail()
print(first_rows)
print("\n")
print(last_rows)
sepal-length sepal-width petal-length petal-width species
0 5.1 3.5 1.4 0.2 Iris-setosa
1 4.9 3.0 1.4 0.2 Iris-setosa
2 4.7 3.2 1.3 0.2 Iris-setosa
3 4.6 3.1 1.5 0.2 Iris-setosa
4 5.0 3.6 1.4 0.2 Iris-setosa
sepal-length sepal-width petal-length petal-width species
145 6.7 3.0 5.2 2.3 Iris-virginica
146 6.3 2.5 5.0 1.9 Iris-virginica
147 6.5 3.0 5.2 2.0 Iris-virginica
148 6.2 3.4 5.4 2.3 Iris-virginica
149 5.9 3.0 5.1 1.8 Iris-virginica
# --- Data quality checks ---
# Per-column count of missing (NaN) values.
null_counts = data.isnull().sum()
print("Missing Values:")
print(null_counts)

# Number of rows that exactly repeat an earlier row.
dup_rows = data.duplicated().sum()
print("\nNumber of Duplicate Rows:", dup_rows)

# Cleansing: keep only the first occurrence of each duplicated row.
# NOTE(review): the analysis below keeps using `data`, not this
# de-duplicated `dataset` — confirm which one was intended downstream.
dataset = data.drop_duplicates()
print("\nCleaned Dataset (First few rows):")
print(dataset.head())
print("\n")

# Uncomment to display every row and column of the DataFrame:
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
# print(data)
Missing Values: sepal-length 0 sepal-width 0 petal-length 0 petal-width 0 species 0 dtype: int64 Number of Duplicate Rows: 3 Cleaned Dataset (First few rows): sepal-length sepal-width petal-length petal-width species 0 5.1 3.5 1.4 0.2 Iris-setosa 1 4.9 3.0 1.4 0.2 Iris-setosa 2 4.7 3.2 1.3 0.2 Iris-setosa 3 4.6 3.1 1.5 0.2 Iris-setosa 4 5.0 3.6 1.4 0.2 Iris-setosa
# Statistical summary (count/mean/std/min/quartiles/max) of the numeric
# attributes. Wrapped in print() so the table also appears when this runs
# as a plain script — a bare expression only auto-displays in a notebook.
print(data.describe())
| sepal-length | sepal-width | petal-length | petal-width | |
|---|---|---|---|---|
| count | 150.000000 | 150.000000 | 150.000000 | 150.000000 |
| mean | 5.843333 | 3.054000 | 3.758667 | 1.198667 |
| std | 0.828066 | 0.433594 | 1.764420 | 0.763161 |
| min | 4.300000 | 2.000000 | 1.000000 | 0.100000 |
| 25% | 5.100000 | 2.800000 | 1.600000 | 0.300000 |
| 50% | 5.800000 | 3.000000 | 4.350000 | 1.300000 |
| 75% | 6.400000 | 3.300000 | 5.100000 | 1.800000 |
| max | 7.900000 | 4.400000 | 6.900000 | 2.500000 |
# Summary statistics of every numeric attribute, computed per species.
grouped = data.groupby("species")
species_summary = grouped.describe()
print(species_summary)
sepal-length \
count mean std min 25% 50% 75% max
species
Iris-setosa 50.0 5.006 0.352490 4.3 4.800 5.0 5.2 5.8
Iris-versicolor 50.0 5.936 0.516171 4.9 5.600 5.9 6.3 7.0
Iris-virginica 50.0 6.588 0.635880 4.9 6.225 6.5 6.9 7.9
sepal-width \
count mean std min 25% 50% 75% max
species
Iris-setosa 50.0 3.418 0.381024 2.3 3.125 3.4 3.675 4.4
Iris-versicolor 50.0 2.770 0.313798 2.0 2.525 2.8 3.000 3.4
Iris-virginica 50.0 2.974 0.322497 2.2 2.800 3.0 3.175 3.8
petal-length \
count mean std min 25% 50% 75% max
species
Iris-setosa 50.0 1.464 0.173511 1.0 1.4 1.50 1.575 1.9
Iris-versicolor 50.0 4.260 0.469911 3.0 4.0 4.35 4.600 5.1
Iris-virginica 50.0 5.552 0.551895 4.5 5.1 5.55 5.875 6.9
petal-width
count mean std min 25% 50% 75% max
species
Iris-setosa 50.0 0.244 0.107210 0.1 0.2 0.2 0.3 0.6
Iris-versicolor 50.0 1.326 0.197753 1.0 1.2 1.3 1.5 1.8
Iris-virginica 50.0 2.026 0.274650 1.4 1.8 2.0 2.3 2.5
# Univariate distributions: one histogram + KDE per feature in a 2x2 grid.
# sns.set() is a deprecated alias; set_theme() is the supported call.
sns.set_theme(style="whitegrid")
plt.figure(figsize=(12, 6))
# (column name, subplot title, color) for each of the four features —
# replaces four copy-pasted subplot stanzas with a single loop.
features = [
    ('sepal-length', 'Sepal Length', 'b'),
    ('sepal-width', 'Sepal Width', 'g'),
    ('petal-length', 'Petal Length', 'r'),
    ('petal-width', 'Petal Width', 'purple'),
]
for position, (column, title, color) in enumerate(features, start=1):
    plt.subplot(2, 2, position)
    sns.histplot(data[column], kde=True, color=color)
    plt.title(title)
plt.tight_layout()
plt.show()
# Plain matplotlib histograms of the same four features (15 bins each),
# again as a 2x2 grid — a loop over (column, label, color) replaces the
# four near-identical copy-pasted subplot stanzas.
plt.figure(figsize=(12, 6))
histogram_specs = [
    ('sepal-length', 'Sepal Length', 'b'),
    ('sepal-width', 'Sepal Width', 'g'),
    ('petal-length', 'Petal Length', 'r'),
    ('petal-width', 'Petal Width', 'purple'),
]
for position, (column, label, color) in enumerate(histogram_specs, start=1):
    plt.subplot(2, 2, position)
    plt.hist(data[column], bins=15, color=color, alpha=0.7)
    plt.title(f'{label} Histogram')
    plt.xlabel(label)
    plt.ylabel('Frequency')
plt.tight_layout()
plt.show()
# Load seaborn's bundled copy of the Iris dataset (its columns use
# underscores, e.g. sepal_length, unlike the CSV loaded above).
iris = sns.load_dataset("iris")
# Pairwise scatter/KDE matrix for multivariate analysis, colored by species.
sns.set_theme(style="whitegrid")
# `size=` was renamed to `height=` in seaborn 0.9 — the old name only
# triggers the UserWarning seen in the captured output.
sns.pairplot(iris, hue="species", height=3)
plt.show()
C:\Users\tommy\anaconda3\Lib\site-packages\seaborn\axisgrid.py:2095: UserWarning: The `size` parameter has been renamed to `height`; please update your code. warnings.warn(msg, UserWarning)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier, plot_tree
# Load the Iris dataset (features X, integer class labels y).
iris = load_iris()
X = iris.data
y = iris.target
# Fit a decision tree on the full dataset. random_state is pinned so the
# tree is reproducible (unseeded trees can break ties differently run to
# run), matching the seed used by every other model in this script.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X, y)
# Visualize the fitted tree; filled=True colors nodes by majority class.
plt.figure(figsize=(12, 8))
plot_tree(clf, filled=True, feature_names=iris.feature_names, class_names=list(iris.target_names))
plt.title("Decision Tree for Iris Dataset")
plt.show()
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import neighbors, datasets
# Load the Iris dataset
iris = datasets.load_iris()
X = iris.data[:, :2]  # Use only the first two features (sepal length/width)
y = iris.target
# Dense grid over the 2-D feature space, padded by 1 unit on every side.
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02), np.arange(y_min, y_max, 0.02))
# Fit a k-NN classifier (k=5) on the full dataset.
knn = neighbors.KNeighborsClassifier(n_neighbors=5)
knn.fit(X, y)
# Predict the class of every grid point to paint the decision regions.
Z = knn.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
# Light colors for regions, bold colors for the training points.
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
plt.figure()
# shading='auto' is required when Z has the same shape as xx/yy:
# matplotlib >= 3.3 deprecated (and later rejects) the old 'flat'
# default for same-shaped inputs.
plt.pcolormesh(xx, yy, Z, cmap=cmap_light, shading='auto')
# Plot the training points
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, edgecolor='k', s=20)
plt.xlabel('Sepal Length')
plt.ylabel('Sepal Width')
plt.title('K-Nearest Neighbors (K-NN) Decision Boundaries (k=5)')
plt.show()
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_iris
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
import seaborn as sns
# Gaussian Naive Bayes on the first two iris features, with the model's
# decision regions drawn as a filled contour plot.
iris = load_iris()
X = iris.data[:, :2]  # sepal length and sepal width only
y = iris.target
# Hold out 20% of the rows for testing; train on the remaining 80%.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Fit the Naive Bayes model on the training split.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
# Fine grid spanning the feature space, padded by 1 unit on each side.
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02), np.arange(y_min, y_max, 0.02))
# Classify every grid point, then reshape back to the grid for plotting.
grid_points = np.c_[xx.ravel(), yy.ravel()]
Z = gnb.predict(grid_points).reshape(xx.shape)
# Decision regions plus the raw samples colored by species name.
plt.contourf(xx, yy, Z, cmap='YlGnBu', alpha=0.8)
sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=iris.target_names[y], palette="Set1")
plt.xlabel(iris.feature_names[0])
plt.ylabel(iris.feature_names[1])
plt.title('Naive Bayes Classification')
plt.show()
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import seaborn as sns
# Random Forest (100 trees) on the first two iris features, with its
# decision regions drawn as a filled contour plot.
iris = load_iris()
X = iris.data[:, :2]  # sepal length and sepal width only
y = iris.target
# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Fit the ensemble on the training split.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)
# Fine grid over the (padded) feature space for visualization.
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02), np.arange(y_min, y_max, 0.02))
# Label every grid point and reshape to the grid for contour plotting.
mesh_labels = rf_classifier.predict(np.c_[xx.ravel(), yy.ravel()])
Z = mesh_labels.reshape(xx.shape)
# Decision regions plus the raw samples colored by species name.
plt.contourf(xx, yy, Z, cmap='YlGnBu', alpha=0.8)
sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=iris.target_names[y], palette="Set1")
plt.xlabel(iris.feature_names[0])
plt.ylabel(iris.feature_names[1])
plt.title('Random Forests Classification')
plt.show()
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import seaborn as sns
# Load the Iris dataset
iris = datasets.load_iris()
X = iris.data[:, :2]  # Consider only the first two features for visualization
y = iris.target
# Split the data into a training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create SVM and Logistic Regression classifiers.
svm_classifier = SVC(kernel='linear', C=1, random_state=42)
# The lbfgs solver fits a multinomial model by default; the explicit
# multi_class='multinomial' argument was deprecated in scikit-learn 1.5
# and removed in 1.7, so it is omitted here (behavior is unchanged).
logistic_regression = LogisticRegression(solver='lbfgs', C=1, random_state=42)
# Fit the models to the training data
svm_classifier.fit(X_train, y_train)
logistic_regression.fit(X_train, y_train)
# Meshgrid covering the (padded) 2-D feature space.
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02), np.arange(y_min, y_max, 0.02))
# Predict the class labels for the meshgrid points
Z_svm = svm_classifier.predict(np.c_[xx.ravel(), yy.ravel()])
Z_svm = Z_svm.reshape(xx.shape)
Z_logistic = logistic_regression.predict(np.c_[xx.ravel(), yy.ravel()])
Z_logistic = Z_logistic.reshape(xx.shape)
# Side-by-side decision boundaries for the two models.
plt.figure(figsize=(12, 5))
# SVM decision boundary
plt.subplot(1, 2, 1)
plt.contourf(xx, yy, Z_svm, cmap=plt.cm.YlGnBu, alpha=0.8)
sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=iris.target_names[y], palette="Set1")
plt.title("SVM Decision Boundary")
plt.xlabel(iris.feature_names[0])
plt.ylabel(iris.feature_names[1])
# Multinomial Logistic Regression decision boundary
plt.subplot(1, 2, 2)
plt.contourf(xx, yy, Z_logistic, cmap=plt.cm.YlGnBu, alpha=0.8)
sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=iris.target_names[y], palette="Set1")
plt.title("Multinomial Logistic Regression Decision Boundary")
plt.xlabel(iris.feature_names[0])
plt.ylabel(iris.feature_names[1])
plt.tight_layout()
plt.show()