import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


# create columns
columns = ["age", "work-class", "fnlwgt", "education", "education-num","marital-status", "occupation", "relationship",
          "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"]
           
# create a dataframe; data is read from the local file adult.data, with column names supplied via names=
data = pd.read_csv("adult.data", names=columns, sep=',')

print(data)

       age         work-class  fnlwgt    education  education-num  \
0       39          State-gov   77516    Bachelors             13   
1       50   Self-emp-not-inc   83311    Bachelors             13   
2       38            Private  215646      HS-grad              9   
3       53            Private  234721         11th              7   
4       28            Private  338409    Bachelors             13   
...    ...                ...     ...          ...            ...   
32556   27            Private  257302   Assoc-acdm             12   
32557   40            Private  154374      HS-grad              9   
32558   58            Private  151910      HS-grad              9   
32559   22            Private  201490      HS-grad              9   
32560   52       Self-emp-inc  287927      HS-grad              9   

            marital-status          occupation    relationship    race  \
0            Never-married        Adm-clerical   Not-in-family   White   
1       Married-civ-spouse     Exec-managerial         Husband   White   
2                 Divorced   Handlers-cleaners   Not-in-family   White   
3       Married-civ-spouse   Handlers-cleaners         Husband   Black   
4       Married-civ-spouse      Prof-specialty            Wife   Black   
...                    ...                 ...             ...     ...   
32556   Married-civ-spouse        Tech-support            Wife   White   
32557   Married-civ-spouse   Machine-op-inspct         Husband   White   
32558              Widowed        Adm-clerical       Unmarried   White   
32559        Never-married        Adm-clerical       Own-child   White   
32560   Married-civ-spouse     Exec-managerial            Wife   White   

           sex  capital-gain  capital-loss  hours-per-week  native-country  \
0         Male          2174             0              40   United-States   
1         Male             0             0              13   United-States   
2         Male             0             0              40   United-States   
3         Male             0             0              40   United-States   
4       Female             0             0              40            Cuba   
...        ...           ...           ...             ...             ...   
32556   Female             0             0              38   United-States   
32557     Male             0             0              40   United-States   
32558   Female             0             0              40   United-States   
32559     Male             0             0              20   United-States   
32560   Female         15024             0              40   United-States   

       income  
0       <=50K  
1       <=50K  
2       <=50K  
3       <=50K  
4       <=50K  
...       ...  
32556   <=50K  
32557    >50K  
32558   <=50K  
32559   <=50K  
32560    >50K  

[32561 rows x 15 columns]


# shape of the data
print (data.shape, '\n')

# get details of the data type for each variable
data.info()

(32561, 15) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   work-class      32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


# Get numerical data only.
# 'number' matches every numeric dtype (integers and floats of any width);
# the previous 'int' filter would silently leave out any float column.
numeric_data = data.select_dtypes(include='number')

numeric_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   age             32561 non-null  int64
 1   fnlwgt          32561 non-null  int64
 2   education-num   32561 non-null  int64
 3   capital-gain    32561 non-null  int64
 4   capital-loss    32561 non-null  int64
 5   hours-per-week  32561 non-null  int64
dtypes: int64(6)
memory usage: 1.5 MB


# get categorical data
cat_data = data.select_dtypes(include=['object'])

cat_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   work-class      32561 non-null  object
 1   education       32561 non-null  object
 2   marital-status  32561 non-null  object
 3   occupation      32561 non-null  object
 4   relationship    32561 non-null  object
 5   race            32561 non-null  object
 6   sex             32561 non-null  object
 7   native-country  32561 non-null  object
 8   income          32561 non-null  object
dtypes: object(9)
memory usage: 2.2+ MB


# print the frequency table of every categorical column
for colname, column in cat_data.items():
    print(colname)
    print(column.value_counts(), '\n')

work-class
 Private             22696
 Self-emp-not-inc     2541
 Local-gov            2093
 ?                    1836
 State-gov            1298
 Self-emp-inc         1116
 Federal-gov           960
 Without-pay            14
 Never-worked            7
Name: work-class, dtype: int64 

education
 HS-grad         10501
 Some-college     7291
 Bachelors        5355
 Masters          1723
 Assoc-voc        1382
 11th             1175
 Assoc-acdm       1067
 10th              933
 7th-8th           646
 Prof-school       576
 9th               514
 12th              433
 Doctorate         413
 5th-6th           333
 1st-4th           168
 Preschool          51
Name: education, dtype: int64 

marital-status
 Married-civ-spouse       14976
 Never-married            10683
 Divorced                  4443
 Separated                 1025
 Widowed                    993
 Married-spouse-absent      418
 Married-AF-spouse           23
Name: marital-status, dtype: int64 

occupation
 Prof-specialty       4140
 Craft-repair         4099
 Exec-managerial      4066
 Adm-clerical         3770
 Sales                3650
 Other-service        3295
 Machine-op-inspct    2002
 ?                    1843
 Transport-moving     1597
 Handlers-cleaners    1370
 Farming-fishing       994
 Tech-support          928
 Protective-serv       649
 Priv-house-serv       149
 Armed-Forces            9
Name: occupation, dtype: int64 

relationship
 Husband           13193
 Not-in-family      8305
 Own-child          5068
 Unmarried          3446
 Wife               1568
 Other-relative      981
Name: relationship, dtype: int64 

race
 White                 27816
 Black                  3124
 Asian-Pac-Islander     1039
 Amer-Indian-Eskimo      311
 Other                   271
Name: race, dtype: int64 

sex
 Male      21790
 Female    10771
Name: sex, dtype: int64 

native-country
 United-States                 29170
 Mexico                          643
 ?                               583
 Philippines                     198
 Germany                         137
 Canada                          121
 Puerto-Rico                     114
 El-Salvador                     106
 India                           100
 Cuba                             95
 England                          90
 Jamaica                          81
 South                            80
 China                            75
 Italy                            73
 Dominican-Republic               70
 Vietnam                          67
 Guatemala                        64
 Japan                            62
 Poland                           60
 Columbia                         59
 Taiwan                           51
 Haiti                            44
 Iran                             43
 Portugal                         37
 Nicaragua                        34
 Peru                             31
 France                           29
 Greece                           29
 Ecuador                          28
 Ireland                          24
 Hong                             20
 Cambodia                         19
 Trinadad&Tobago                  19
 Laos                             18
 Thailand                         18
 Yugoslavia                       16
 Outlying-US(Guam-USVI-etc)       14
 Honduras                         13
 Hungary                          13
 Scotland                         12
 Holand-Netherlands                1
Name: native-country, dtype: int64 

income
 <=50K    24720
 >50K      7841
Name: income, dtype: int64


# first 5 lines of the dataset. (you can specify how many lines by providing an int parameter )
data.head()


# show unique values for education
data['education'].unique()

array([' Bachelors', ' HS-grad', ' 11th', ' Masters', ' 9th',
       ' Some-college', ' Assoc-acdm', ' Assoc-voc', ' 7th-8th',
       ' Doctorate', ' Prof-school', ' 5th-6th', ' 10th', ' 1st-4th',
       ' Preschool', ' 12th'], dtype=object)


# show the unique values of each categorical variable
for colname in cat_data.columns:
    print (colname)
    print (cat_data[colname].unique(), '\n')

work-class
[' State-gov' ' Self-emp-not-inc' ' Private' ' Federal-gov' ' Local-gov'
 ' ?' ' Self-emp-inc' ' Without-pay' ' Never-worked'] 

education
[' Bachelors' ' HS-grad' ' 11th' ' Masters' ' 9th' ' Some-college'
 ' Assoc-acdm' ' Assoc-voc' ' 7th-8th' ' Doctorate' ' Prof-school'
 ' 5th-6th' ' 10th' ' 1st-4th' ' Preschool' ' 12th'] 

marital-status
[' Never-married' ' Married-civ-spouse' ' Divorced'
 ' Married-spouse-absent' ' Separated' ' Married-AF-spouse' ' Widowed'] 

occupation
[' Adm-clerical' ' Exec-managerial' ' Handlers-cleaners' ' Prof-specialty'
 ' Other-service' ' Sales' ' Craft-repair' ' Transport-moving'
 ' Farming-fishing' ' Machine-op-inspct' ' Tech-support' ' ?'
 ' Protective-serv' ' Armed-Forces' ' Priv-house-serv'] 

relationship
[' Not-in-family' ' Husband' ' Wife' ' Own-child' ' Unmarried'
 ' Other-relative'] 

race
[' White' ' Black' ' Asian-Pac-Islander' ' Amer-Indian-Eskimo' ' Other'] 

sex
[' Male' ' Female'] 

native-country
[' United-States' ' Cuba' ' Jamaica' ' India' ' ?' ' Mexico' ' South'
 ' Puerto-Rico' ' Honduras' ' England' ' Canada' ' Germany' ' Iran'
 ' Philippines' ' Italy' ' Poland' ' Columbia' ' Cambodia' ' Thailand'
 ' Ecuador' ' Laos' ' Taiwan' ' Haiti' ' Portugal' ' Dominican-Republic'
 ' El-Salvador' ' France' ' Guatemala' ' China' ' Japan' ' Yugoslavia'
 ' Peru' ' Outlying-US(Guam-USVI-etc)' ' Scotland' ' Trinadad&Tobago'
 ' Greece' ' Nicaragua' ' Vietnam' ' Hong' ' Ireland' ' Hungary'
 ' Holand-Netherlands'] 

income
[' <=50K' ' >50K']


# Clean-up: trim leading/trailing whitespace from every text column
data[cat_data.columns] = cat_data.apply(lambda col: col.str.strip())

# verify: re-select the text columns and list their distinct values
cat_data = data.select_dtypes(include=['object'])
for colname, column in cat_data.items():
    print(colname)
    print(column.unique(), '\n')

work-class
['State-gov' 'Self-emp-not-inc' 'Private' 'Federal-gov' 'Local-gov' '?'
 'Self-emp-inc' 'Without-pay' 'Never-worked'] 

education
['Bachelors' 'HS-grad' '11th' 'Masters' '9th' 'Some-college' 'Assoc-acdm'
 'Assoc-voc' '7th-8th' 'Doctorate' 'Prof-school' '5th-6th' '10th'
 '1st-4th' 'Preschool' '12th'] 

marital-status
['Never-married' 'Married-civ-spouse' 'Divorced' 'Married-spouse-absent'
 'Separated' 'Married-AF-spouse' 'Widowed'] 

occupation
['Adm-clerical' 'Exec-managerial' 'Handlers-cleaners' 'Prof-specialty'
 'Other-service' 'Sales' 'Craft-repair' 'Transport-moving'
 'Farming-fishing' 'Machine-op-inspct' 'Tech-support' '?'
 'Protective-serv' 'Armed-Forces' 'Priv-house-serv'] 

relationship
['Not-in-family' 'Husband' 'Wife' 'Own-child' 'Unmarried' 'Other-relative'] 

race
['White' 'Black' 'Asian-Pac-Islander' 'Amer-Indian-Eskimo' 'Other'] 

sex
['Male' 'Female'] 

native-country
['United-States' 'Cuba' 'Jamaica' 'India' '?' 'Mexico' 'South'
 'Puerto-Rico' 'Honduras' 'England' 'Canada' 'Germany' 'Iran'
 'Philippines' 'Italy' 'Poland' 'Columbia' 'Cambodia' 'Thailand' 'Ecuador'
 'Laos' 'Taiwan' 'Haiti' 'Portugal' 'Dominican-Republic' 'El-Salvador'
 'France' 'Guatemala' 'China' 'Japan' 'Yugoslavia' 'Peru'
 'Outlying-US(Guam-USVI-etc)' 'Scotland' 'Trinadad&Tobago' 'Greece'
 'Nicaragua' 'Vietnam' 'Hong' 'Ireland' 'Hungary' 'Holand-Netherlands'] 

income
['<=50K' '>50K']


# Collapse the detailed education levels into broader groups.
#
# The result is assigned back to the column instead of calling
# data['education'].replace(..., inplace=True): an inplace method on a
# column selected via data[...] is chained assignment, which newer pandas
# versions warn about and which does not update `data` at all once
# Copy-on-Write is enabled.
education_groups = {
    'Preschool': 'LessHS',
    '1st-4th': 'LessHS',
    '5th-6th': 'LessHS',
    '7th-8th': 'LessHS',
    '9th': 'LessHS',
    '10th': 'LessHS',
    '11th': 'LessHS',
    '12th': 'LessHS',
    'Assoc-acdm': 'Some-college',
    'Assoc-voc': 'Some-college',
    'Prof-school': 'Masters',
}
data['education'] = data['education'].replace(education_groups)

# show the remaining education categories
data['education'].unique()

array(['Bachelors', 'HS-grad', 'LessHS', 'Masters', 'Some-college',
       'Doctorate'], dtype=object)


print(data.columns)

# determine if there are any nulls
print (data.isnull().values.any(), '\n\ncount of nulls')

# count number of nulls
print ( data.isnull().sum())

Index(['age', 'work-class', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')
False 

count of nulls
age               0
work-class        0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64


# missing values by columns
print("Missing value counts for occupation: ", data['occupation'].isnull().sum())

Missing value counts for occupation:  0


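# if there were missing values (NaN) we could drop them like this.
# NOTE: the checks above found no NaN; unknown values in this dataset appear as '?', which dropna() does not remove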
data = data.dropna()


# find rows that repeat an earlier row (all columns compared)
is_duplicate = data.duplicated()
duplicateRows = data.loc[is_duplicate]
print('duplicates size: ', duplicateRows.shape[0])
duplicateRows.head()

duplicates size:  24


# check duplicates for a specific columns

duplicateRows = data[data.duplicated(['age', 'marital-status'])]
duplicateRows.sort_values(by=['age']).head()


# drop duplicates
data = data.drop_duplicates()

# Dropping outliers based on condition
#i = data[data['capital-gain'] > 80000].index
#data = data.drop(i)


# drop un-used columns
del data['fnlwgt'] 

# display new structure
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32537 entries, 0 to 32560
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32537 non-null  int64 
 1   work-class      32537 non-null  object
 2   education       32537 non-null  object
 3   education-num   32537 non-null  int64 
 4   marital-status  32537 non-null  object
 5   occupation      32537 non-null  object
 6   relationship    32537 non-null  object
 7   race            32537 non-null  object
 8   sex             32537 non-null  object
 9   capital-gain    32537 non-null  int64 
 10  capital-loss    32537 non-null  int64 
 11  hours-per-week  32537 non-null  int64 
 12  native-country  32537 non-null  object
 13  income          32537 non-null  object
dtypes: int64(5), object(9)
memory usage: 3.7+ MB


# get summary statistics for all numeric columns via the describe method.
# To suppress scientific notation, I'm also using the round function
data.describe().round()


# summary statistics and median of the age column
age_column = data['age']
print("Age Statistics")
print(age_column.describe())
print("Median Age: ", age_column.median())

Age Statistics
count    32537.000000
mean        38.585549
std         13.637984
min         17.000000
25%         28.000000
50%         37.000000
75%         48.000000
max         90.000000
Name: age, dtype: float64
Median Age:  37.0


# statistical summary by class variable (use round to suppress scientific notation as needed)
print("Statistics by Gender")

gender = data.groupby('sex').describe().round()

# transposed index and columns for long format (easier to view) vs wide
gender.transpose()

Statistics by Gender


def custom_describe(df):
    """Return ``df.describe()`` extended with median, skew and kurtosis rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. It may contain non-numeric columns; the three
        extra statistics are computed over the numeric columns only.

    Returns
    -------
    pandas.DataFrame
        One row per statistic (the rows produced by ``describe()`` followed
        by ``median``, ``skew`` and ``kurt``) and one column per summarised
        column.
    """
    # numeric_only=True is passed explicitly: relying on the old default
    # emits a FutureWarning on pandas 1.5 and raises on pandas 2.x as soon
    # as the frame holds a text column.
    extra_stats = [
        df.median(numeric_only=True).rename('median'),  # include median
        df.skew(numeric_only=True).rename('skew'),      # include skewness
        df.kurt(numeric_only=True).rename('kurt'),      # include kurtosis
    ]
    # describe() is transposed so that every statistic becomes a column and
    # the extra series line up on the column names; the final .T restores
    # the usual "one row per statistic" layout.
    return pd.concat([df.describe().T, *extra_stats], axis=1).T

custom_describe(data).round()  # round values

C:\Users\tommy\AppData\Local\Temp\ipykernel_9752\3915157787.py:3: FutureWarning: The default value of numeric_only in DataFrame.median is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning.
  df.median().rename('median'),  # include median
C:\Users\tommy\AppData\Local\Temp\ipykernel_9752\3915157787.py:4: FutureWarning: The default value of numeric_only in DataFrame.skew is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning.
  df.skew().rename('skew'),      # include skewness
C:\Users\tommy\AppData\Local\Temp\ipykernel_9752\3915157787.py:5: FutureWarning: The default value of numeric_only in DataFrame.kurt is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning.
  df.kurt().rename('kurt'),      # include kurtosis


# histogram
data.hist(figsize=(12,10));


data.plot(figsize=(15,10), kind='density', subplots=True, layout=(2,3), sharex=False)

array([[<Axes: ylabel='Density'>, <Axes: ylabel='Density'>,
        <Axes: ylabel='Density'>],
       [<Axes: ylabel='Density'>, <Axes: ylabel='Density'>,
        <Axes: ylabel='Density'>]], dtype=object)


data.plot(figsize=(12,5), kind='box');


# Using the seaborn library, draw one vertical box plot per column on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Each series is passed explicitly as `y`. The first positional argument of
# sns.boxplot has meant `x` in some seaborn releases and `data` in others,
# so a positional series does not reliably give a vertical box; `y=` does.
boxplot_columns = ['age', 'education-num', 'hours-per-week', 'capital-gain']
for column, target_ax in zip(boxplot_columns, axes.flat):
    ax = sns.boxplot(y=data[column], ax=target_ax)


# show columns for categorical dataset created earlier
cat_data.columns

Index(['work-class', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'native-country', 'income'],
      dtype='object')


# Bar chart of the number of rows per work class.
plt.figure(figsize=(10,5))
# Plot from `data` rather than the earlier `cat_data` snapshot: `cat_data`
# was selected before the duplicate rows were dropped, so its counts would
# not match the other summaries computed from `data`.
ax=sns.countplot(data = data, x = "work-class")
ax.set_title('Counts Work Class')
plt.xticks(rotation=45)
ax.set(xlabel='work classification', ylabel='');


from pandas.plotting import scatter_matrix

scatter_matrix(data, figsize=(12, 12));


# Using seaborn, pairwise plot coloured by income

g = sns.pairplot(data, diag_kind='kde', hue='income')
# `figure` is the preferred accessor; the older `fig` alias is deprecated
g.figure.set_size_inches(12, 12);

# make the y-axis labels horizontal and right-aligned
for ax in g.axes.flatten():
    # rotate y axis labels
    ax.set_ylabel(ax.get_ylabel(), rotation=0)
    # set y labels alignment
    ax.yaxis.get_label().set_horizontalalignment('right')


# Compute the correlation matrix of the numeric columns.
# numeric_only=True is passed explicitly: `data` still holds text columns,
# and relying on the old default emits a FutureWarning on pandas 1.5 and
# raises on pandas 2.x.
corr = data.corr(numeric_only=True)
corr.style.background_gradient(cmap='coolwarm')

C:\Users\tommy\AppData\Local\Temp\ipykernel_9752\4176814396.py:2: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  corr = data.corr()


# using seaborn to plot correlation heat map

f, ax = plt.subplots(figsize=(12, 6))

# generate a custom diverging colormap
cmap = sns.diverging_palette(220,10, as_cmap=True)

sns.heatmap(corr, cmap=cmap, annot=True, square=True, ax=ax,  linewidth = 0.1)
plt.title('Pearson Correlation of Features')
plt.yticks(rotation=45)
plt.xticks(rotation=45);


# scatter plots

sns.relplot(x="education-num", y="capital-gain", hue='income', data=data);


# Cross tabulation between work-class and sex

pd.crosstab(data['work-class'],data['sex'], margins=True)


# Distributions of observations within categories
sns.catplot(x="work-class", y="age", kind="box", data=data,
           height=5, aspect=11/8)
plt.title('Work Class distribution by Age')
plt.xticks(rotation=45);


# Box plot between work-class and age for different sex

plt.figure(figsize=(12,6))
sns.boxplot(x="work-class",y="age", hue="sex", data=data)
plt.title('Work Class distribution by Age and Gender')
plt.xticks(rotation=45);


data['income'].value_counts()

<=50K    24698
>50K      7839
Name: income, dtype: int64


# share of rows in each income class
income_column = data['income']
row_total = len(data)
low = len(data[income_column == '<=50K']) / row_total
high = len(data[income_column == '>50K']) / row_total
print(f'<=50K: {round(low, 2)}  >50K {round(high, 2)}')

<=50K: 0.76  >50K 0.24


data['education'].value_counts()

HS-grad         10494
Some-college     9731
Bachelors        5353
LessHS           4248
Masters          2298
Doctorate         413
Name: education, dtype: int64


# cross tab as percentages
pd.crosstab(data['education'],data['income'], margins=True, normalize='index')


# dataframe from cross tab: income split per education-num, normalised per row
df = pd.crosstab(data['education-num'],data['income'], margins=True, normalize='index')
df = df[:-1] # exclude last row (All)

# scatter plot
Y=df.index
# normalize='index' yields fractions between 0 and 1; scale to percent so the
# plotted values match the '%' axis label
X=df['>50K'] * 100

plt.scatter(X, Y, marker='o');
plt.xlabel('% of income > 50K')
plt.ylabel('years of education');


fig = plt.figure(figsize=(12,5))
ax=sns.countplot(x="education", hue="income", data=data)
ax.set_title('Income by Education');

# change the colors

# Define a custom color palette with your desired colors
custom_palette = ["#FF5733", "#33FF57"]  

fig = plt.figure(figsize=(12, 5))
ax = sns.countplot(x="education", hue="income", data=data, palette=custom_palette)
ax.set_title('Income by Education')

plt.show()


import matplotlib.pyplot as plt
import pandas as pd


column_name = 'age'  

# Create a customized histogram
plt.figure(figsize=(12, 10))
plt.hist(data[column_name], bins=20, color='skyblue', edgecolor='black', alpha=0.7)
plt.xlabel(column_name.capitalize())  # Use the column name as the x-axis label
plt.ylabel('Frequency')
plt.title(f'Histogram of {column_name.capitalize()}')
plt.grid(True)

# Show the histogram
plt.show()


# Group data by 'sex' and calculate statistics
gender_stats = data.groupby('sex').describe().round()

# Transpose the DataFrame for a long format
gender_stats_long = gender_stats.transpose()

# Create a bar plot for the statistical summary.
# DataFrame.plot creates its own figure, so the size is passed through its
# figsize argument; a preceding plt.figure(...) call would only leave an
# empty extra figure behind. The labels are set on the returned Axes so
# they land on this plot.
ax = gender_stats_long.plot(kind='bar', rot=0, colormap='viridis', figsize=(12, 6))
ax.set_title('Statistics by Gender')
ax.set_xlabel('Statistics')
ax.set_ylabel('Values')
ax.legend(title='Gender')

# Show the plot
plt.show()

<Figure size 1200x600 with 0 Axes>


# Calculate the correlation matrix from the cleaned frame.
# The earlier `numeric_data` selection was made before the duplicate rows and
# the unused fnlwgt column were removed from `data`, so the numeric columns
# are re-selected here to keep the heatmap consistent with the cleaned data.
correlation_matrix = data.select_dtypes(include='number').corr()

# Create a heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)
plt.title('Correlation Heatmap of Numerical Features')
plt.show()


import matplotlib.pyplot as plt
import numpy as np

# Extract the first 5 rows of the "age" column
first_5_age = data['age'].head()

# Define custom colors for the bars
colors = ['skyblue', 'salmon', 'lightgreen', 'gold', 'lightcoral']

# Bar positions and their labels are derived from the number of rows actually
# returned, so tick positions and tick labels always have matching lengths
# (head() returns fewer than 5 values when the frame is shorter).
positions = range(len(first_5_age))
row_labels = [f'Row {position + 1}' for position in positions]

# Create a bar chart with custom colors
plt.figure(figsize=(8, 6))
bars = plt.bar(positions, first_5_age, color=colors)
plt.xlabel('Row Index')
plt.ylabel('Age')
plt.title('Age Distribution for the First 5 Rows')
plt.xticks(positions, row_labels)

# Add data labels above each bar
for bar, age in zip(bars, first_5_age):
    plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 1, str(age), ha='center', va='bottom')

plt.show()

	age	education-num	capital-gain	capital-loss	hours-per-week
age	1.000000	0.036224	0.077676	0.057745	0.068515
education-num	0.036224	1.000000	0.122664	0.079892	0.148422
capital-gain	0.077676	0.122664	1.000000	-0.031639	0.078408
capital-loss	0.057745	0.079892	-0.031639	1.000000	0.054229
hours-per-week	0.068515	0.148422	0.078408	0.054229	1.000000

income	<=50K	>50K
education
Bachelors	0.585092	0.414908
Doctorate	0.259080	0.740920
HS-grad	0.840480	0.159520
LessHS	0.942561	0.057439
Masters	0.398607	0.601393
Some-college	0.793238	0.206762
All	0.759074	0.240926

	age	work-class	fnlwgt	education	education-num	marital-status	occupation	relationship	race	sex	capital-gain	hours-per-week	native-country	income
0	39	State-gov	77516	Bachelors	13	Never-married	Adm-clerical	Not-in-family	White	Male	2174	40	United-States	<=50K
1	50	Self-emp-not-inc	83311	Bachelors	13	Married-civ-spouse	Exec-managerial	Husband	White	Male	0	13	United-States	<=50K
2	38	Private	215646	HS-grad	9	Divorced	Handlers-cleaners	Not-in-family	White	Male	0	40	United-States	<=50K
3	53	Private	234721	11th	7	Married-civ-spouse	Handlers-cleaners	Husband	Black	Male	0	40	United-States	<=50K
4	28	Private	338409	Bachelors	13	Married-civ-spouse	Prof-specialty	Wife	Black	Female	0	40	Cuba	<=50K

	age	work-class	fnlwgt	education	education-num	marital-status	occupation	relationship	race	sex	hours-per-week	native-country	income
4881	25	Private	308144	Bachelors	13	Never-married	Craft-repair	Not-in-family	White	Male	40	Mexico	<=50K
5104	90	Private	52386	Some-college	10	Never-married	Other-service	Not-in-family	Asian-Pac-Islander	Male	35	United-States	<=50K
9171	21	Private	250051	Some-college	10	Never-married	Prof-specialty	Own-child	White	Female	10	United-States	<=50K
11631	20	Private	107658	Some-college	10	Never-married	Tech-support	Not-in-family	White	Female	10	United-States	<=50K
13084	25	Private	195994	LessHS	2	Never-married	Priv-house-serv	Not-in-family	White	Female	40	Guatemala	<=50K

	age	work-class	fnlwgt	education	education-num	marital-status	occupation	relationship	race	sex	hours-per-week	native-country	income
3618	17	Private	354201	HS-grad	9	Never-married	Other-service	Own-child	White	Male	40	United-States	<=50K
14514	17	Self-emp-inc	61838	LessHS	6	Never-married	Craft-repair	Own-child	White	Male	40	United-States	<=50K
21888	17	Private	171080	LessHS	8	Never-married	Handlers-cleaners	Own-child	White	Male	20	United-States	<=50K
14544	17	Private	73145	LessHS	5	Never-married	Craft-repair	Own-child	White	Female	16	United-States	<=50K
28945	17	Private	147069	LessHS	6	Never-married	Sales	Own-child	White	Female	16	United-States	<=50K

	age	education-num	capital-gain	capital-loss	hours-per-week
count	32537.0	32537.0	32537.0	32537.0	32537.0
mean	39.0	10.0	1078.0	87.0	40.0
std	14.0	3.0	7388.0	403.0	12.0
min	17.0	1.0	0.0	0.0	1.0
25%	28.0	9.0	0.0	0.0	40.0
50%	37.0	10.0	0.0	0.0	40.0
75%	48.0	12.0	0.0	0.0	45.0
max	90.0	16.0	99999.0	4356.0	99.0

	sex	Female	Male
age	count	10762.0	21775.0
	mean	37.0	39.0
	std	14.0	13.0
	min	17.0	17.0
	25%	25.0	29.0
	50%	35.0	38.0
	75%	46.0	48.0
	max	90.0	90.0
education-num	count	10762.0	21775.0
	mean	10.0	10.0
	std	2.0	3.0
	min	1.0	1.0
	25%	9.0	9.0
	50%	10.0	10.0
	75%	12.0	13.0
	max	16.0	16.0
capital-gain	count	10762.0	21775.0
	mean	569.0	1330.0
	std	4926.0	8329.0
	min	0.0	0.0
	25%	0.0	0.0
	50%	0.0	0.0
	75%	0.0	0.0
	max	99999.0	99999.0
capital-loss	count	10762.0	21775.0
	mean	61.0	100.0
	std	341.0	430.0
	min	0.0	0.0
	25%	0.0	0.0
	50%	0.0	0.0
	75%	0.0	0.0
	max	4356.0	3770.0
hours-per-week	count	10762.0	21775.0
	mean	36.0	42.0
	std	12.0	12.0
	min	1.0	1.0
	25%	30.0	40.0
	50%	40.0	40.0
	75%	40.0	49.0
	max	99.0	99.0

sex	Female	Male	All
work-class
?	839	997	1836
Federal-gov	315	645	960
Local-gov	835	1258	2093
Never-worked	2	5	7
Private	7743	14930	22673
Self-emp-inc	135	981	1116
Self-emp-not-inc	399	2141	2540
State-gov	489	809	1298
Without-pay	5	9	14
All	10762	21775	32537