In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
In [2]:
# create columns
columns = ["age", "work-class", "fnlwgt", "education", "education-num","marital-status", "occupation", "relationship",
          "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"]
           
# create a dataframe, data loaded from url
data = pd.read_csv("adult.data", names=columns, sep=',')

print(data)
       age         work-class  fnlwgt    education  education-num  \
0       39          State-gov   77516    Bachelors             13   
1       50   Self-emp-not-inc   83311    Bachelors             13   
2       38            Private  215646      HS-grad              9   
3       53            Private  234721         11th              7   
4       28            Private  338409    Bachelors             13   
...    ...                ...     ...          ...            ...   
32556   27            Private  257302   Assoc-acdm             12   
32557   40            Private  154374      HS-grad              9   
32558   58            Private  151910      HS-grad              9   
32559   22            Private  201490      HS-grad              9   
32560   52       Self-emp-inc  287927      HS-grad              9   

            marital-status          occupation    relationship    race  \
0            Never-married        Adm-clerical   Not-in-family   White   
1       Married-civ-spouse     Exec-managerial         Husband   White   
2                 Divorced   Handlers-cleaners   Not-in-family   White   
3       Married-civ-spouse   Handlers-cleaners         Husband   Black   
4       Married-civ-spouse      Prof-specialty            Wife   Black   
...                    ...                 ...             ...     ...   
32556   Married-civ-spouse        Tech-support            Wife   White   
32557   Married-civ-spouse   Machine-op-inspct         Husband   White   
32558              Widowed        Adm-clerical       Unmarried   White   
32559        Never-married        Adm-clerical       Own-child   White   
32560   Married-civ-spouse     Exec-managerial            Wife   White   

           sex  capital-gain  capital-loss  hours-per-week  native-country  \
0         Male          2174             0              40   United-States   
1         Male             0             0              13   United-States   
2         Male             0             0              40   United-States   
3         Male             0             0              40   United-States   
4       Female             0             0              40            Cuba   
...        ...           ...           ...             ...             ...   
32556   Female             0             0              38   United-States   
32557     Male             0             0              40   United-States   
32558   Female             0             0              40   United-States   
32559     Male             0             0              20   United-States   
32560   Female         15024             0              40   United-States   

       income  
0       <=50K  
1       <=50K  
2       <=50K  
3       <=50K  
4       <=50K  
...       ...  
32556   <=50K  
32557    >50K  
32558   <=50K  
32559   <=50K  
32560    >50K  

[32561 rows x 15 columns]
In [3]:
# shape of the data
print (data.shape, '\n')

# get details of the data type for each variable
data.info()
(32561, 15) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   work-class      32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB
In [4]:
# get numerical data only
numeric_data = data.select_dtypes(include=['int'])

numeric_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   age             32561 non-null  int64
 1   fnlwgt          32561 non-null  int64
 2   education-num   32561 non-null  int64
 3   capital-gain    32561 non-null  int64
 4   capital-loss    32561 non-null  int64
 5   hours-per-week  32561 non-null  int64
dtypes: int64(6)
memory usage: 1.5 MB
In [5]:
# get categorical data
cat_data = data.select_dtypes(include=['object'])

cat_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   work-class      32561 non-null  object
 1   education       32561 non-null  object
 2   marital-status  32561 non-null  object
 3   occupation      32561 non-null  object
 4   relationship    32561 non-null  object
 5   race            32561 non-null  object
 6   sex             32561 non-null  object
 7   native-country  32561 non-null  object
 8   income          32561 non-null  object
dtypes: object(9)
memory usage: 2.2+ MB
In [6]:
# show counts values of each categorical variable
for colname in cat_data.columns:
    print (colname)
    print (cat_data[colname].value_counts(), '\n')
work-class
 Private             22696
 Self-emp-not-inc     2541
 Local-gov            2093
 ?                    1836
 State-gov            1298
 Self-emp-inc         1116
 Federal-gov           960
 Without-pay            14
 Never-worked            7
Name: work-class, dtype: int64 

education
 HS-grad         10501
 Some-college     7291
 Bachelors        5355
 Masters          1723
 Assoc-voc        1382
 11th             1175
 Assoc-acdm       1067
 10th              933
 7th-8th           646
 Prof-school       576
 9th               514
 12th              433
 Doctorate         413
 5th-6th           333
 1st-4th           168
 Preschool          51
Name: education, dtype: int64 

marital-status
 Married-civ-spouse       14976
 Never-married            10683
 Divorced                  4443
 Separated                 1025
 Widowed                    993
 Married-spouse-absent      418
 Married-AF-spouse           23
Name: marital-status, dtype: int64 

occupation
 Prof-specialty       4140
 Craft-repair         4099
 Exec-managerial      4066
 Adm-clerical         3770
 Sales                3650
 Other-service        3295
 Machine-op-inspct    2002
 ?                    1843
 Transport-moving     1597
 Handlers-cleaners    1370
 Farming-fishing       994
 Tech-support          928
 Protective-serv       649
 Priv-house-serv       149
 Armed-Forces            9
Name: occupation, dtype: int64 

relationship
 Husband           13193
 Not-in-family      8305
 Own-child          5068
 Unmarried          3446
 Wife               1568
 Other-relative      981
Name: relationship, dtype: int64 

race
 White                 27816
 Black                  3124
 Asian-Pac-Islander     1039
 Amer-Indian-Eskimo      311
 Other                   271
Name: race, dtype: int64 

sex
 Male      21790
 Female    10771
Name: sex, dtype: int64 

native-country
 United-States                 29170
 Mexico                          643
 ?                               583
 Philippines                     198
 Germany                         137
 Canada                          121
 Puerto-Rico                     114
 El-Salvador                     106
 India                           100
 Cuba                             95
 England                          90
 Jamaica                          81
 South                            80
 China                            75
 Italy                            73
 Dominican-Republic               70
 Vietnam                          67
 Guatemala                        64
 Japan                            62
 Poland                           60
 Columbia                         59
 Taiwan                           51
 Haiti                            44
 Iran                             43
 Portugal                         37
 Nicaragua                        34
 Peru                             31
 France                           29
 Greece                           29
 Ecuador                          28
 Ireland                          24
 Hong                             20
 Cambodia                         19
 Trinadad&Tobago                  19
 Laos                             18
 Thailand                         18
 Yugoslavia                       16
 Outlying-US(Guam-USVI-etc)       14
 Honduras                         13
 Hungary                          13
 Scotland                         12
 Holand-Netherlands                1
Name: native-country, dtype: int64 

income
 <=50K    24720
 >50K      7841
Name: income, dtype: int64 

In [7]:
# first 5 lines of the dataset. (you can specify how many lines by providing an int parameter )
data.head()
Out[7]:
age work-class fnlwgt education education-num marital-status occupation relationship race sex capital-gain capital-loss hours-per-week native-country income
0 39 State-gov 77516 Bachelors 13 Never-married Adm-clerical Not-in-family White Male 2174 0 40 United-States <=50K
1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 0 0 13 United-States <=50K
2 38 Private 215646 HS-grad 9 Divorced Handlers-cleaners Not-in-family White Male 0 0 40 United-States <=50K
3 53 Private 234721 11th 7 Married-civ-spouse Handlers-cleaners Husband Black Male 0 0 40 United-States <=50K
4 28 Private 338409 Bachelors 13 Married-civ-spouse Prof-specialty Wife Black Female 0 0 40 Cuba <=50K
In [8]:
# show unique values for education
data['education'].unique()
Out[8]:
array([' Bachelors', ' HS-grad', ' 11th', ' Masters', ' 9th',
       ' Some-college', ' Assoc-acdm', ' Assoc-voc', ' 7th-8th',
       ' Doctorate', ' Prof-school', ' 5th-6th', ' 10th', ' 1st-4th',
       ' Preschool', ' 12th'], dtype=object)
In [9]:
# show counts values of each categorical variable
for colname in cat_data.columns:
    print (colname)
    print (cat_data[colname].unique(), '\n')
    
work-class
[' State-gov' ' Self-emp-not-inc' ' Private' ' Federal-gov' ' Local-gov'
 ' ?' ' Self-emp-inc' ' Without-pay' ' Never-worked'] 

education
[' Bachelors' ' HS-grad' ' 11th' ' Masters' ' 9th' ' Some-college'
 ' Assoc-acdm' ' Assoc-voc' ' 7th-8th' ' Doctorate' ' Prof-school'
 ' 5th-6th' ' 10th' ' 1st-4th' ' Preschool' ' 12th'] 

marital-status
[' Never-married' ' Married-civ-spouse' ' Divorced'
 ' Married-spouse-absent' ' Separated' ' Married-AF-spouse' ' Widowed'] 

occupation
[' Adm-clerical' ' Exec-managerial' ' Handlers-cleaners' ' Prof-specialty'
 ' Other-service' ' Sales' ' Craft-repair' ' Transport-moving'
 ' Farming-fishing' ' Machine-op-inspct' ' Tech-support' ' ?'
 ' Protective-serv' ' Armed-Forces' ' Priv-house-serv'] 

relationship
[' Not-in-family' ' Husband' ' Wife' ' Own-child' ' Unmarried'
 ' Other-relative'] 

race
[' White' ' Black' ' Asian-Pac-Islander' ' Amer-Indian-Eskimo' ' Other'] 

sex
[' Male' ' Female'] 

native-country
[' United-States' ' Cuba' ' Jamaica' ' India' ' ?' ' Mexico' ' South'
 ' Puerto-Rico' ' Honduras' ' England' ' Canada' ' Germany' ' Iran'
 ' Philippines' ' Italy' ' Poland' ' Columbia' ' Cambodia' ' Thailand'
 ' Ecuador' ' Laos' ' Taiwan' ' Haiti' ' Portugal' ' Dominican-Republic'
 ' El-Salvador' ' France' ' Guatemala' ' China' ' Japan' ' Yugoslavia'
 ' Peru' ' Outlying-US(Guam-USVI-etc)' ' Scotland' ' Trinadad&Tobago'
 ' Greece' ' Nicaragua' ' Vietnam' ' Hong' ' Ireland' ' Hungary'
 ' Holand-Netherlands'] 

income
[' <=50K' ' >50K'] 

In [10]:
# Cleanup by strip leading/trailing spaces
data[cat_data.columns] = cat_data.apply(lambda x: x.str.strip())

# verify
cat_data = data.select_dtypes(include=['object'])
for colname in cat_data.columns:
    print (colname)
    print (cat_data[colname].unique(), '\n')
work-class
['State-gov' 'Self-emp-not-inc' 'Private' 'Federal-gov' 'Local-gov' '?'
 'Self-emp-inc' 'Without-pay' 'Never-worked'] 

education
['Bachelors' 'HS-grad' '11th' 'Masters' '9th' 'Some-college' 'Assoc-acdm'
 'Assoc-voc' '7th-8th' 'Doctorate' 'Prof-school' '5th-6th' '10th'
 '1st-4th' 'Preschool' '12th'] 

marital-status
['Never-married' 'Married-civ-spouse' 'Divorced' 'Married-spouse-absent'
 'Separated' 'Married-AF-spouse' 'Widowed'] 

occupation
['Adm-clerical' 'Exec-managerial' 'Handlers-cleaners' 'Prof-specialty'
 'Other-service' 'Sales' 'Craft-repair' 'Transport-moving'
 'Farming-fishing' 'Machine-op-inspct' 'Tech-support' '?'
 'Protective-serv' 'Armed-Forces' 'Priv-house-serv'] 

relationship
['Not-in-family' 'Husband' 'Wife' 'Own-child' 'Unmarried' 'Other-relative'] 

race
['White' 'Black' 'Asian-Pac-Islander' 'Amer-Indian-Eskimo' 'Other'] 

sex
['Male' 'Female'] 

native-country
['United-States' 'Cuba' 'Jamaica' 'India' '?' 'Mexico' 'South'
 'Puerto-Rico' 'Honduras' 'England' 'Canada' 'Germany' 'Iran'
 'Philippines' 'Italy' 'Poland' 'Columbia' 'Cambodia' 'Thailand' 'Ecuador'
 'Laos' 'Taiwan' 'Haiti' 'Portugal' 'Dominican-Republic' 'El-Salvador'
 'France' 'Guatemala' 'China' 'Japan' 'Yugoslavia' 'Peru'
 'Outlying-US(Guam-USVI-etc)' 'Scotland' 'Trinadad&Tobago' 'Greece'
 'Nicaragua' 'Vietnam' 'Hong' 'Ireland' 'Hungary' 'Holand-Netherlands'] 

income
['<=50K' '>50K'] 

In [11]:
#data2 = data.copy()
data['education'].replace('Preschool', 'LessHS', inplace=True)

data['education'].replace('10th', 'LessHS',inplace=True)
data['education'].replace('11th', 'LessHS',inplace=True)
data['education'].replace('12th', 'LessHS',inplace=True)
data['education'].replace('1st-4th', 'LessHS',inplace=True)
data['education'].replace('5th-6th', 'LessHS',inplace=True)
data['education'].replace('7th-8th', 'LessHS',inplace=True)
data['education'].replace('9th', 'LessHS',inplace=True)
data['education'].replace('Assoc-acdm', 'Some-college',inplace=True)
data['education'].replace('Assoc-voc', 'Some-college',inplace=True)
data['education'].replace('Prof-school', 'Masters',inplace=True)

data['education'].unique()
Out[11]:
array(['Bachelors', 'HS-grad', 'LessHS', 'Masters', 'Some-college',
       'Doctorate'], dtype=object)
In [12]:
print(data.columns)

# determins if there are any nulls
print (data.isnull().values.any(), '\n\ncount of nulls')

# count number of nulls
print ( data.isnull().sum())
Index(['age', 'work-class', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')
False 

count of nulls
age               0
work-class        0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64
In [13]:
# missing values by columns
print("Missing value counts for occupation: ", data['occupation'].isnull().sum())
Missing value counts for occupation:  0
In [14]:
# if there were missing values we can drop them if we decided
data = data.dropna()
In [15]:
# check duplicates across all rows

duplicateRows = data[data.duplicated()]
print('duplicates size: ', len(duplicateRows ))
duplicateRows.head()
duplicates size:  24
Out[15]:
age work-class fnlwgt education education-num marital-status occupation relationship race sex capital-gain capital-loss hours-per-week native-country income
4881 25 Private 308144 Bachelors 13 Never-married Craft-repair Not-in-family White Male 0 0 40 Mexico <=50K
5104 90 Private 52386 Some-college 10 Never-married Other-service Not-in-family Asian-Pac-Islander Male 0 0 35 United-States <=50K
9171 21 Private 250051 Some-college 10 Never-married Prof-specialty Own-child White Female 0 0 10 United-States <=50K
11631 20 Private 107658 Some-college 10 Never-married Tech-support Not-in-family White Female 0 0 10 United-States <=50K
13084 25 Private 195994 LessHS 2 Never-married Priv-house-serv Not-in-family White Female 0 0 40 Guatemala <=50K
In [16]:
# check duplicates for a specific columns

duplicateRows = data[data.duplicated(['age', 'marital-status'])]
duplicateRows.sort_values(by=['age']).head()
Out[16]:
age work-class fnlwgt education education-num marital-status occupation relationship race sex capital-gain capital-loss hours-per-week native-country income
3618 17 Private 354201 HS-grad 9 Never-married Other-service Own-child White Male 0 0 40 United-States <=50K
14514 17 Self-emp-inc 61838 LessHS 6 Never-married Craft-repair Own-child White Male 0 0 40 United-States <=50K
21888 17 Private 171080 LessHS 8 Never-married Handlers-cleaners Own-child White Male 0 0 20 United-States <=50K
14544 17 Private 73145 LessHS 5 Never-married Craft-repair Own-child White Female 0 0 16 United-States <=50K
28945 17 Private 147069 LessHS 6 Never-married Sales Own-child White Female 0 0 16 United-States <=50K
In [17]:
# drop duplicates
data = data.drop_duplicates()

# Dropping outliers based on condition
#i = data[data['capital-gain'] > 80000].index
#data = data.drop(i)
In [18]:
# drop un-used columns
del data['fnlwgt'] 

# display new structure
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 32537 entries, 0 to 32560
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32537 non-null  int64 
 1   work-class      32537 non-null  object
 2   education       32537 non-null  object
 3   education-num   32537 non-null  int64 
 4   marital-status  32537 non-null  object
 5   occupation      32537 non-null  object
 6   relationship    32537 non-null  object
 7   race            32537 non-null  object
 8   sex             32537 non-null  object
 9   capital-gain    32537 non-null  int64 
 10  capital-loss    32537 non-null  int64 
 11  hours-per-week  32537 non-null  int64 
 12  native-country  32537 non-null  object
 13  income          32537 non-null  object
dtypes: int64(5), object(9)
memory usage: 3.7+ MB
In [19]:
# gets all statistics for all numeric values via describe mehtod.  
# To suppress scientific notation, I'm using round function also
data.describe().round()
Out[19]:
age education-num capital-gain capital-loss hours-per-week
count 32537.0 32537.0 32537.0 32537.0 32537.0
mean 39.0 10.0 1078.0 87.0 40.0
std 14.0 3.0 7388.0 403.0 12.0
min 17.0 1.0 0.0 0.0 1.0
25% 28.0 9.0 0.0 0.0 40.0
50% 37.0 10.0 0.0 0.0 40.0
75% 48.0 12.0 0.0 0.0 45.0
max 90.0 16.0 99999.0 4356.0 99.0
In [20]:
# age statistics
print("Age Statistics")
print(data['age'].describe())
print("Median Age: ", data['age'].median())
Age Statistics
count    32537.000000
mean        38.585549
std         13.637984
min         17.000000
25%         28.000000
50%         37.000000
75%         48.000000
max         90.000000
Name: age, dtype: float64
Median Age:  37.0
In [21]:
#statistical summary by class variable (use round to suppress scientif notation as needed)
print("Statistics by Gender")

gender = data.groupby('sex').describe().round()

# transposed index and columns for long format (easier to view) vs wide
gender.transpose() 
Statistics by Gender
Out[21]:
sex Female Male
age count 10762.0 21775.0
mean 37.0 39.0
std 14.0 13.0
min 17.0 17.0
25% 25.0 29.0
50% 35.0 38.0
75% 46.0 48.0
max 90.0 90.0
education-num count 10762.0 21775.0
mean 10.0 10.0
std 2.0 3.0
min 1.0 1.0
25% 9.0 9.0
50% 10.0 10.0
75% 12.0 13.0
max 16.0 16.0
capital-gain count 10762.0 21775.0
mean 569.0 1330.0
std 4926.0 8329.0
min 0.0 0.0
25% 0.0 0.0
50% 0.0 0.0
75% 0.0 0.0
max 99999.0 99999.0
capital-loss count 10762.0 21775.0
mean 61.0 100.0
std 341.0 430.0
min 0.0 0.0
25% 0.0 0.0
50% 0.0 0.0
75% 0.0 0.0
max 4356.0 3770.0
hours-per-week count 10762.0 21775.0
mean 36.0 42.0
std 12.0 12.0
min 1.0 1.0
25% 30.0 40.0
50% 40.0 40.0
75% 40.0 49.0
max 99.0 99.0
In [22]:
def custom_describe(df):
    return pd.concat([df.describe().T,               # transpose
                      df.median().rename('median'),  # include median
                      df.skew().rename('skew'),      # include skewness
                      df.kurt().rename('kurt'),      # include kurtosis 
                     ], axis=1).T

custom_describe(data).round()  # round values
C:\Users\tommy\AppData\Local\Temp\ipykernel_9752\3915157787.py:3: FutureWarning: The default value of numeric_only in DataFrame.median is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning.
  df.median().rename('median'),  # include median
C:\Users\tommy\AppData\Local\Temp\ipykernel_9752\3915157787.py:4: FutureWarning: The default value of numeric_only in DataFrame.skew is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning.
  df.skew().rename('skew'),      # include skewness
C:\Users\tommy\AppData\Local\Temp\ipykernel_9752\3915157787.py:5: FutureWarning: The default value of numeric_only in DataFrame.kurt is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning.
  df.kurt().rename('kurt'),      # include kurtosis
Out[22]:
age education-num capital-gain capital-loss hours-per-week
count 32537.0 32537.0 32537.0 32537.0 32537.0
mean 39.0 10.0 1078.0 87.0 40.0
std 14.0 3.0 7388.0 403.0 12.0
min 17.0 1.0 0.0 0.0 1.0
25% 28.0 9.0 0.0 0.0 40.0
50% 37.0 10.0 0.0 0.0 40.0
75% 48.0 12.0 0.0 0.0 45.0
max 90.0 16.0 99999.0 4356.0 99.0
median 37.0 10.0 0.0 0.0 40.0
skew 1.0 -0.0 12.0 5.0 0.0
kurt -0.0 1.0 155.0 20.0 3.0
In [23]:
# histogram
data.hist(figsize=(12,10));
In [24]:
data.plot(figsize=(15,10), kind='density', subplots=True, layout=(2,3), sharex=False)
Out[24]:
array([[<Axes: ylabel='Density'>, <Axes: ylabel='Density'>,
        <Axes: ylabel='Density'>],
       [<Axes: ylabel='Density'>, <Axes: ylabel='Density'>,
        <Axes: ylabel='Density'>]], dtype=object)
In [25]:
data.plot(figsize=(12,5), kind='box');
In [40]:
# using seaborn library, plot each individually
fig, axes = plt.subplots(2, 2, figsize=(14,10))

ax = sns.boxplot(data['age'], orient='v', ax=axes[0, 0])
ax = sns.boxplot(data['education-num'], orient='v', ax=axes[0, 1])
ax = sns.boxplot(data['hours-per-week'], orient='v', ax=axes[1, 0])
ax = sns.boxplot(data['capital-gain'], orient='v',  ax=axes[1, 1])
In [29]:
# show columns for categorical dataset created earlier
cat_data.columns
Out[29]:
Index(['work-class', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'native-country', 'income'],
      dtype='object')
In [30]:
plt.figure(figsize=(10,5))
ax=sns.countplot(data = cat_data, x = "work-class")
ax.set_title('Counts Work Class')
plt.xticks(rotation=45)
ax.set(xlabel='work classification', ylabel='');
In [31]:
from pandas.plotting import scatter_matrix

scatter_matrix(data, figsize=(12, 12));
In [32]:
# Using seaborn, pairwise plot by income

g=sns.pairplot(data, diag_kind = 'kde', hue='income')
g.fig.set_size_inches(12,12);

# rotate labels
for ax in g.axes.flatten():
    # rotate y axis labels
    ax.set_ylabel(ax.get_ylabel(), rotation = 0)
    # set y labels alignment
    ax.yaxis.get_label().set_horizontalalignment('right')
In [74]:
# Compute the correlation matrix
corr = data.corr()
corr.style.background_gradient(cmap='coolwarm')
C:\Users\tommy\AppData\Local\Temp\ipykernel_9752\4176814396.py:2: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  corr = data.corr()
Out[74]:
  age education-num capital-gain capital-loss hours-per-week
age 1.000000 0.036224 0.077676 0.057745 0.068515
education-num 0.036224 1.000000 0.122664 0.079892 0.148422
capital-gain 0.077676 0.122664 1.000000 -0.031639 0.078408
capital-loss 0.057745 0.079892 -0.031639 1.000000 0.054229
hours-per-week 0.068515 0.148422 0.078408 0.054229 1.000000
In [34]:
# using seaboarn to plot correlation heat map

f, ax = plt.subplots(figsize=(12, 6))

# generate a custom diverging colormap
cmap = sns.diverging_palette(220,10, as_cmap=True)

sns.heatmap(corr, cmap=cmap, annot=True, square=True, ax=ax,  linewidth = 0.1)
plt.title('Pearson Correlation of Features')
plt.yticks(rotation=45)
plt.xticks(rotation=45);
In [38]:
# scatter plots

sns.relplot(x="education-num", y="capital-gain", hue='income', data=data);
In [41]:
# Cross tabulation between work-class and sex

pd.crosstab(data['work-class'],data['sex'], margins=True)
Out[41]:
sex Female Male All
work-class
? 839 997 1836
Federal-gov 315 645 960
Local-gov 835 1258 2093
Never-worked 2 5 7
Private 7743 14930 22673
Self-emp-inc 135 981 1116
Self-emp-not-inc 399 2141 2540
State-gov 489 809 1298
Without-pay 5 9 14
All 10762 21775 32537
In [42]:
# Distributions of observations within categories
sns.catplot(x="work-class", y="age", kind="box", data=data,
           height=5, aspect=11/8)
plt.title('Work Class distribution by Age')
plt.xticks(rotation=45);
In [43]:
# Box plot between work-class and age for different sex

plt.figure(figsize=(12,6))
sns.boxplot(x="work-class",y="age", hue="sex", data=data)
plt.title('Work Class distribution by Age and Gender')
plt.xticks(rotation=45);
In [44]:
data['income'].value_counts()
Out[44]:
<=50K    24698
>50K      7839
Name: income, dtype: int64
In [45]:
low = len(data[data['income'] == '<=50K']) / len(data)
high = len(data[data['income'] == '>50K']) / len(data)
print(f'<=50K: {round(low, 2)}  >50K {round(high, 2)}')
<=50K: 0.76  >50K 0.24
In [46]:
data['education'].value_counts()
Out[46]:
HS-grad         10494
Some-college     9731
Bachelors        5353
LessHS           4248
Masters          2298
Doctorate         413
Name: education, dtype: int64
In [47]:
# cross tab as percentages
pd.crosstab(data['education'],data['income'], margins=True, normalize='index')
Out[47]:
income <=50K >50K
education
Bachelors 0.585092 0.414908
Doctorate 0.259080 0.740920
HS-grad 0.840480 0.159520
LessHS 0.942561 0.057439
Masters 0.398607 0.601393
Some-college 0.793238 0.206762
All 0.759074 0.240926
In [73]:
# datafrom cross tab
df = pd.crosstab(data['education-num'],data['income'], margins=True, normalize='index')
df = df[:-1] # exclude last row (All)

# scatter plot
Y=df.index
X=df['>50K']

plt.scatter(X, Y, marker='o');
plt.xlabel('% of income > 50K')
plt.ylabel('years of education');
In [61]:
fig = plt.figure(figsize=(12,5))
ax=sns.countplot(x="education", hue="income", data=data)
ax.set_title('Income by Education');

# change the colors

# Define a custom color palette with your desired colors
custom_palette = ["#FF5733", "#33FF57"]  

fig = plt.figure(figsize=(12, 5))
ax = sns.countplot(x="education", hue="income", data=data, palette=custom_palette)
ax.set_title('Income by Education')

plt.show()
In [63]:
import matplotlib.pyplot as plt
import pandas as pd


column_name = 'age'  

# Create a customized histogram
plt.figure(figsize=(12, 10))
plt.hist(data[column_name], bins=20, color='skyblue', edgecolor='black', alpha=0.7)
plt.xlabel(column_name.capitalize())  # Use the column name as the x-axis label
plt.ylabel('Frequency')
plt.title(f'Histogram of {column_name.capitalize()}')
plt.grid(True)

# Show the histogram
plt.show()
In [64]:
# Group data by 'sex' and calculate statistics
gender_stats = data.groupby('sex').describe().round()

# Transpose the DataFrame for a long format
gender_stats_long = gender_stats.transpose()

# Create a bar plot for the statistical summary
plt.figure(figsize=(12, 6))
gender_stats_long.plot(kind='bar', rot=0, colormap='viridis')
plt.title('Statistics by Gender')
plt.xlabel('Statistics')
plt.ylabel('Values')
plt.legend(title='Gender')

# Show the plot
plt.show()
<Figure size 1200x600 with 0 Axes>
In [65]:
# Calculate the correlation matrix
correlation_matrix = numeric_data.corr()

# Create a heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)
plt.title('Correlation Heatmap of Numerical Features')
plt.show()
In [72]:
import matplotlib.pyplot as plt
import numpy as np

# Extract the first 5 rows of the "age" column
first_5_age = data['age'].head()

# Define custom colors for the bars
colors = ['skyblue', 'salmon', 'lightgreen', 'gold', 'lightcoral']

# Create a bar chart with custom colors
plt.figure(figsize=(8, 6))
bars = plt.bar(range(len(first_5_age)), first_5_age, color=colors)
plt.xlabel('Row Index')
plt.ylabel('Age')
plt.title('Age Distribution for the First 5 Rows')
plt.xticks(range(len(first_5_age)), ['Row 1', 'Row 2', 'Row 3', 'Row 4', 'Row 5'])

# Add data labels above each bar
for bar, age in zip(bars, first_5_age):
    plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 1, str(age), ha='center', va='bottom')

plt.show()
In [ ]: