import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Column names, in file order; they are handed to read_csv below via names=.
columns = [
    "age",
    "work-class",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "income",
]
# Build the dataframe from the local comma-separated file "adult.data".
data = pd.read_csv("adult.data", header=None, names=columns)
print(data)
age work-class fnlwgt education education-num \
0 39 State-gov 77516 Bachelors 13
1 50 Self-emp-not-inc 83311 Bachelors 13
2 38 Private 215646 HS-grad 9
3 53 Private 234721 11th 7
4 28 Private 338409 Bachelors 13
... ... ... ... ... ...
32556 27 Private 257302 Assoc-acdm 12
32557 40 Private 154374 HS-grad 9
32558 58 Private 151910 HS-grad 9
32559 22 Private 201490 HS-grad 9
32560 52 Self-emp-inc 287927 HS-grad 9
marital-status occupation relationship race \
0 Never-married Adm-clerical Not-in-family White
1 Married-civ-spouse Exec-managerial Husband White
2 Divorced Handlers-cleaners Not-in-family White
3 Married-civ-spouse Handlers-cleaners Husband Black
4 Married-civ-spouse Prof-specialty Wife Black
... ... ... ... ...
32556 Married-civ-spouse Tech-support Wife White
32557 Married-civ-spouse Machine-op-inspct Husband White
32558 Widowed Adm-clerical Unmarried White
32559 Never-married Adm-clerical Own-child White
32560 Married-civ-spouse Exec-managerial Wife White
sex capital-gain capital-loss hours-per-week native-country \
0 Male 2174 0 40 United-States
1 Male 0 0 13 United-States
2 Male 0 0 40 United-States
3 Male 0 0 40 United-States
4 Female 0 0 40 Cuba
... ... ... ... ... ...
32556 Female 0 0 38 United-States
32557 Male 0 0 40 United-States
32558 Female 0 0 40 United-States
32559 Male 0 0 20 United-States
32560 Female 15024 0 40 United-States
income
0 <=50K
1 <=50K
2 <=50K
3 <=50K
4 <=50K
... ...
32556 <=50K
32557 >50K
32558 <=50K
32559 <=50K
32560 >50K
[32561 rows x 15 columns]
# (rows, columns) of the dataframe, followed by a blank line
print(data.shape, '\n')
# column names, non-null counts and dtypes
data.info()
(32561, 15) <class 'pandas.core.frame.DataFrame'> RangeIndex: 32561 entries, 0 to 32560 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 32561 non-null int64 1 work-class 32561 non-null object 2 fnlwgt 32561 non-null int64 3 education 32561 non-null object 4 education-num 32561 non-null int64 5 marital-status 32561 non-null object 6 occupation 32561 non-null object 7 relationship 32561 non-null object 8 race 32561 non-null object 9 sex 32561 non-null object 10 capital-gain 32561 non-null int64 11 capital-loss 32561 non-null int64 12 hours-per-week 32561 non-null int64 13 native-country 32561 non-null object 14 income 32561 non-null object dtypes: int64(6), object(9) memory usage: 3.7+ MB
# keep only the integer-typed columns
numeric_data = data.select_dtypes(include='int')
numeric_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 32561 entries, 0 to 32560 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 32561 non-null int64 1 fnlwgt 32561 non-null int64 2 education-num 32561 non-null int64 3 capital-gain 32561 non-null int64 4 capital-loss 32561 non-null int64 5 hours-per-week 32561 non-null int64 dtypes: int64(6) memory usage: 1.5 MB
# keep only the object-typed (string) columns
cat_data = data.select_dtypes(include='object')
cat_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 32561 entries, 0 to 32560 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 work-class 32561 non-null object 1 education 32561 non-null object 2 marital-status 32561 non-null object 3 occupation 32561 non-null object 4 relationship 32561 non-null object 5 race 32561 non-null object 6 sex 32561 non-null object 7 native-country 32561 non-null object 8 income 32561 non-null object dtypes: object(9) memory usage: 2.2+ MB
# frequency table for every categorical column
for colname, column_values in cat_data.items():
    print(colname)
    print(column_values.value_counts(), '\n')
work-class Private 22696 Self-emp-not-inc 2541 Local-gov 2093 ? 1836 State-gov 1298 Self-emp-inc 1116 Federal-gov 960 Without-pay 14 Never-worked 7 Name: work-class, dtype: int64 education HS-grad 10501 Some-college 7291 Bachelors 5355 Masters 1723 Assoc-voc 1382 11th 1175 Assoc-acdm 1067 10th 933 7th-8th 646 Prof-school 576 9th 514 12th 433 Doctorate 413 5th-6th 333 1st-4th 168 Preschool 51 Name: education, dtype: int64 marital-status Married-civ-spouse 14976 Never-married 10683 Divorced 4443 Separated 1025 Widowed 993 Married-spouse-absent 418 Married-AF-spouse 23 Name: marital-status, dtype: int64 occupation Prof-specialty 4140 Craft-repair 4099 Exec-managerial 4066 Adm-clerical 3770 Sales 3650 Other-service 3295 Machine-op-inspct 2002 ? 1843 Transport-moving 1597 Handlers-cleaners 1370 Farming-fishing 994 Tech-support 928 Protective-serv 649 Priv-house-serv 149 Armed-Forces 9 Name: occupation, dtype: int64 relationship Husband 13193 Not-in-family 8305 Own-child 5068 Unmarried 3446 Wife 1568 Other-relative 981 Name: relationship, dtype: int64 race White 27816 Black 3124 Asian-Pac-Islander 1039 Amer-Indian-Eskimo 311 Other 271 Name: race, dtype: int64 sex Male 21790 Female 10771 Name: sex, dtype: int64 native-country United-States 29170 Mexico 643 ? 583 Philippines 198 Germany 137 Canada 121 Puerto-Rico 114 El-Salvador 106 India 100 Cuba 95 England 90 Jamaica 81 South 80 China 75 Italy 73 Dominican-Republic 70 Vietnam 67 Guatemala 64 Japan 62 Poland 60 Columbia 59 Taiwan 51 Haiti 44 Iran 43 Portugal 37 Nicaragua 34 Peru 31 France 29 Greece 29 Ecuador 28 Ireland 24 Hong 20 Cambodia 19 Trinadad&Tobago 19 Laos 18 Thailand 18 Yugoslavia 16 Outlying-US(Guam-USVI-etc) 14 Honduras 13 Hungary 13 Scotland 12 Holand-Netherlands 1 Name: native-country, dtype: int64 income <=50K 24720 >50K 7841 Name: income, dtype: int64
# first 5 rows of the dataset (pass a different n to see more or fewer)
data.head(n=5)
| age | work-class | fnlwgt | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | income | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 39 | State-gov | 77516 | Bachelors | 13 | Never-married | Adm-clerical | Not-in-family | White | Male | 2174 | 0 | 40 | United-States | <=50K |
| 1 | 50 | Self-emp-not-inc | 83311 | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 13 | United-States | <=50K |
| 2 | 38 | Private | 215646 | HS-grad | 9 | Divorced | Handlers-cleaners | Not-in-family | White | Male | 0 | 0 | 40 | United-States | <=50K |
| 3 | 53 | Private | 234721 | 11th | 7 | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | 0 | 0 | 40 | United-States | <=50K |
| 4 | 28 | Private | 338409 | Bachelors | 13 | Married-civ-spouse | Prof-specialty | Wife | Black | Female | 0 | 0 | 40 | Cuba | <=50K |
# distinct values found in the education column
data.loc[:, 'education'].unique()
array([' Bachelors', ' HS-grad', ' 11th', ' Masters', ' 9th',
' Some-college', ' Assoc-acdm', ' Assoc-voc', ' 7th-8th',
' Doctorate', ' Prof-school', ' 5th-6th', ' 10th', ' 1st-4th',
' Preschool', ' 12th'], dtype=object)
# show the unique values of each categorical variable
for colname, column_values in cat_data.items():
    print(colname)
    print(column_values.unique(), '\n')
work-class [' State-gov' ' Self-emp-not-inc' ' Private' ' Federal-gov' ' Local-gov' ' ?' ' Self-emp-inc' ' Without-pay' ' Never-worked'] education [' Bachelors' ' HS-grad' ' 11th' ' Masters' ' 9th' ' Some-college' ' Assoc-acdm' ' Assoc-voc' ' 7th-8th' ' Doctorate' ' Prof-school' ' 5th-6th' ' 10th' ' 1st-4th' ' Preschool' ' 12th'] marital-status [' Never-married' ' Married-civ-spouse' ' Divorced' ' Married-spouse-absent' ' Separated' ' Married-AF-spouse' ' Widowed'] occupation [' Adm-clerical' ' Exec-managerial' ' Handlers-cleaners' ' Prof-specialty' ' Other-service' ' Sales' ' Craft-repair' ' Transport-moving' ' Farming-fishing' ' Machine-op-inspct' ' Tech-support' ' ?' ' Protective-serv' ' Armed-Forces' ' Priv-house-serv'] relationship [' Not-in-family' ' Husband' ' Wife' ' Own-child' ' Unmarried' ' Other-relative'] race [' White' ' Black' ' Asian-Pac-Islander' ' Amer-Indian-Eskimo' ' Other'] sex [' Male' ' Female'] native-country [' United-States' ' Cuba' ' Jamaica' ' India' ' ?' ' Mexico' ' South' ' Puerto-Rico' ' Honduras' ' England' ' Canada' ' Germany' ' Iran' ' Philippines' ' Italy' ' Poland' ' Columbia' ' Cambodia' ' Thailand' ' Ecuador' ' Laos' ' Taiwan' ' Haiti' ' Portugal' ' Dominican-Republic' ' El-Salvador' ' France' ' Guatemala' ' China' ' Japan' ' Yugoslavia' ' Peru' ' Outlying-US(Guam-USVI-etc)' ' Scotland' ' Trinadad&Tobago' ' Greece' ' Nicaragua' ' Vietnam' ' Hong' ' Ireland' ' Hungary' ' Holand-Netherlands'] income [' <=50K' ' >50K']
# Clean up: remove leading/trailing spaces from every string column
for colname in cat_data.columns:
    data[colname] = cat_data[colname].str.strip()
# verify: re-select the string columns and list their unique values again
cat_data = data.select_dtypes(include='object')
for colname, column_values in cat_data.items():
    print(colname)
    print(column_values.unique(), '\n')
work-class ['State-gov' 'Self-emp-not-inc' 'Private' 'Federal-gov' 'Local-gov' '?' 'Self-emp-inc' 'Without-pay' 'Never-worked'] education ['Bachelors' 'HS-grad' '11th' 'Masters' '9th' 'Some-college' 'Assoc-acdm' 'Assoc-voc' '7th-8th' 'Doctorate' 'Prof-school' '5th-6th' '10th' '1st-4th' 'Preschool' '12th'] marital-status ['Never-married' 'Married-civ-spouse' 'Divorced' 'Married-spouse-absent' 'Separated' 'Married-AF-spouse' 'Widowed'] occupation ['Adm-clerical' 'Exec-managerial' 'Handlers-cleaners' 'Prof-specialty' 'Other-service' 'Sales' 'Craft-repair' 'Transport-moving' 'Farming-fishing' 'Machine-op-inspct' 'Tech-support' '?' 'Protective-serv' 'Armed-Forces' 'Priv-house-serv'] relationship ['Not-in-family' 'Husband' 'Wife' 'Own-child' 'Unmarried' 'Other-relative'] race ['White' 'Black' 'Asian-Pac-Islander' 'Amer-Indian-Eskimo' 'Other'] sex ['Male' 'Female'] native-country ['United-States' 'Cuba' 'Jamaica' 'India' '?' 'Mexico' 'South' 'Puerto-Rico' 'Honduras' 'England' 'Canada' 'Germany' 'Iran' 'Philippines' 'Italy' 'Poland' 'Columbia' 'Cambodia' 'Thailand' 'Ecuador' 'Laos' 'Taiwan' 'Haiti' 'Portugal' 'Dominican-Republic' 'El-Salvador' 'France' 'Guatemala' 'China' 'Japan' 'Yugoslavia' 'Peru' 'Outlying-US(Guam-USVI-etc)' 'Scotland' 'Trinadad&Tobago' 'Greece' 'Nicaragua' 'Vietnam' 'Hong' 'Ireland' 'Hungary' 'Holand-Netherlands'] income ['<=50K' '>50K']
# Collapse the detailed education levels into broader groups.
# The result is assigned back to the column instead of calling
# `.replace(..., inplace=True)` on `data['education']`: an in-place call on a
# column selection is a chained assignment, which newer pandas versions warn
# about and which does not modify `data` when Copy-on-Write is enabled.
education_groups = {
    # everything below a high-school diploma
    'Preschool': 'LessHS',
    '1st-4th': 'LessHS',
    '5th-6th': 'LessHS',
    '7th-8th': 'LessHS',
    '9th': 'LessHS',
    '10th': 'LessHS',
    '11th': 'LessHS',
    '12th': 'LessHS',
    # associate degrees are grouped with "Some-college"
    'Assoc-acdm': 'Some-college',
    'Assoc-voc': 'Some-college',
    # professional school is grouped with "Masters"
    'Prof-school': 'Masters',
}
data['education'] = data['education'].replace(education_groups)
# show the remaining education levels
data['education'].unique()
array(['Bachelors', 'HS-grad', 'LessHS', 'Masters', 'Some-college',
'Doctorate'], dtype=object)
print(data.columns)
# NOTE: the '?' placeholders seen in work-class, occupation and native-country
# are ordinary strings, so they are not counted as nulls by the checks below.
null_mask = data.isna()
# determine whether there are any nulls at all
print(null_mask.values.any(), '\n\ncount of nulls')
# number of nulls per column
print(null_mask.sum())
Index(['age', 'work-class', 'fnlwgt', 'education', 'education-num',
'marital-status', 'occupation', 'relationship', 'race', 'sex',
'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
'income'],
dtype='object')
False
count of nulls
age 0
work-class 0
fnlwgt 0
education 0
education-num 0
marital-status 0
occupation 0
relationship 0
race 0
sex 0
capital-gain 0
capital-loss 0
hours-per-week 0
native-country 0
income 0
dtype: int64
# missing values in a single column
occupation_nulls = data['occupation'].isna().sum()
print("Missing value counts for occupation: ", occupation_nulls)
Missing value counts for occupation: 0
# rows containing missing values, if any, are dropped here
data = data.dropna()
# rows that repeat an earlier row across all columns
duplicateRows = data.loc[data.duplicated()]
print('duplicates size: ', duplicateRows.shape[0])
duplicateRows.head(n=5)
duplicates size: 24
| age | work-class | fnlwgt | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | income | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4881 | 25 | Private | 308144 | Bachelors | 13 | Never-married | Craft-repair | Not-in-family | White | Male | 0 | 0 | 40 | Mexico | <=50K |
| 5104 | 90 | Private | 52386 | Some-college | 10 | Never-married | Other-service | Not-in-family | Asian-Pac-Islander | Male | 0 | 0 | 35 | United-States | <=50K |
| 9171 | 21 | Private | 250051 | Some-college | 10 | Never-married | Prof-specialty | Own-child | White | Female | 0 | 0 | 10 | United-States | <=50K |
| 11631 | 20 | Private | 107658 | Some-college | 10 | Never-married | Tech-support | Not-in-family | White | Female | 0 | 0 | 10 | United-States | <=50K |
| 13084 | 25 | Private | 195994 | LessHS | 2 | Never-married | Priv-house-serv | Not-in-family | White | Female | 0 | 0 | 40 | Guatemala | <=50K |
# rows that repeat an earlier (age, marital-status) combination
duplicateRows = data.loc[data.duplicated(subset=['age', 'marital-status'])]
duplicateRows.sort_values(by='age').head(n=5)
| age | work-class | fnlwgt | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | income | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3618 | 17 | Private | 354201 | HS-grad | 9 | Never-married | Other-service | Own-child | White | Male | 0 | 0 | 40 | United-States | <=50K |
| 14514 | 17 | Self-emp-inc | 61838 | LessHS | 6 | Never-married | Craft-repair | Own-child | White | Male | 0 | 0 | 40 | United-States | <=50K |
| 21888 | 17 | Private | 171080 | LessHS | 8 | Never-married | Handlers-cleaners | Own-child | White | Male | 0 | 0 | 20 | United-States | <=50K |
| 14544 | 17 | Private | 73145 | LessHS | 5 | Never-married | Craft-repair | Own-child | White | Female | 0 | 0 | 16 | United-States | <=50K |
| 28945 | 17 | Private | 147069 | LessHS | 6 | Never-married | Sales | Own-child | White | Female | 0 | 0 | 16 | United-States | <=50K |
# remove fully duplicated rows, then discard the unused 'fnlwgt' column
data = data.drop_duplicates().drop(columns='fnlwgt')
# Optional outlier filter based on a condition, left disabled:
#i = data[data['capital-gain'] > 80000].index
#data = data.drop(i)
# display new structure
data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 32537 entries, 0 to 32560 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 32537 non-null int64 1 work-class 32537 non-null object 2 education 32537 non-null object 3 education-num 32537 non-null int64 4 marital-status 32537 non-null object 5 occupation 32537 non-null object 6 relationship 32537 non-null object 7 race 32537 non-null object 8 sex 32537 non-null object 9 capital-gain 32537 non-null int64 10 capital-loss 32537 non-null int64 11 hours-per-week 32537 non-null int64 12 native-country 32537 non-null object 13 income 32537 non-null object dtypes: int64(5), object(9) memory usage: 3.7+ MB
# Summary statistics for all numeric columns via the describe method.
# Rounding to 0 decimals keeps the table free of scientific notation.
data.describe().round(0)
| age | education-num | capital-gain | capital-loss | hours-per-week | |
|---|---|---|---|---|---|
| count | 32537.0 | 32537.0 | 32537.0 | 32537.0 | 32537.0 |
| mean | 39.0 | 10.0 | 1078.0 | 87.0 | 40.0 |
| std | 14.0 | 3.0 | 7388.0 | 403.0 | 12.0 |
| min | 17.0 | 1.0 | 0.0 | 0.0 | 1.0 |
| 25% | 28.0 | 9.0 | 0.0 | 0.0 | 40.0 |
| 50% | 37.0 | 10.0 | 0.0 | 0.0 | 40.0 |
| 75% | 48.0 | 12.0 | 0.0 | 0.0 | 45.0 |
| max | 90.0 | 16.0 | 99999.0 | 4356.0 | 99.0 |
# summary statistics and median for the age column
age_column = data['age']
print("Age Statistics")
print(age_column.describe())
print("Median Age: ", age_column.median())
Age Statistics count 32537.000000 mean 38.585549 std 13.637984 min 17.000000 25% 28.000000 50% 37.000000 75% 48.000000 max 90.000000 Name: age, dtype: float64 Median Age: 37.0
# statistical summary per class variable (rounded to suppress scientific notation)
print("Statistics by Gender")
gender = data.groupby('sex').describe().round(0)
# swap index and columns: the long format is easier to read than the wide one
gender.T
Statistics by Gender
| sex | Female | Male | |
|---|---|---|---|
| age | count | 10762.0 | 21775.0 |
| mean | 37.0 | 39.0 | |
| std | 14.0 | 13.0 | |
| min | 17.0 | 17.0 | |
| 25% | 25.0 | 29.0 | |
| 50% | 35.0 | 38.0 | |
| 75% | 46.0 | 48.0 | |
| max | 90.0 | 90.0 | |
| education-num | count | 10762.0 | 21775.0 |
| mean | 10.0 | 10.0 | |
| std | 2.0 | 3.0 | |
| min | 1.0 | 1.0 | |
| 25% | 9.0 | 9.0 | |
| 50% | 10.0 | 10.0 | |
| 75% | 12.0 | 13.0 | |
| max | 16.0 | 16.0 | |
| capital-gain | count | 10762.0 | 21775.0 |
| mean | 569.0 | 1330.0 | |
| std | 4926.0 | 8329.0 | |
| min | 0.0 | 0.0 | |
| 25% | 0.0 | 0.0 | |
| 50% | 0.0 | 0.0 | |
| 75% | 0.0 | 0.0 | |
| max | 99999.0 | 99999.0 | |
| capital-loss | count | 10762.0 | 21775.0 |
| mean | 61.0 | 100.0 | |
| std | 341.0 | 430.0 | |
| min | 0.0 | 0.0 | |
| 25% | 0.0 | 0.0 | |
| 50% | 0.0 | 0.0 | |
| 75% | 0.0 | 0.0 | |
| max | 4356.0 | 3770.0 | |
| hours-per-week | count | 10762.0 | 21775.0 |
| mean | 36.0 | 42.0 | |
| std | 12.0 | 12.0 | |
| min | 1.0 | 1.0 | |
| 25% | 30.0 | 40.0 | |
| 50% | 40.0 | 40.0 | |
| 75% | 40.0 | 49.0 | |
| max | 99.0 | 99.0 |
def custom_describe(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df.describe()`` extended with median, skewness and kurtosis.

    Args:
        df: Frame to summarise. It may contain non-numeric columns; they are
            left out of the median/skew/kurt rows.

    Returns:
        A frame with one row per statistic (the ``describe()`` rows followed
        by ``median``, ``skew`` and ``kurt``) and one column per summarised
        column of ``df``.
    """
    # numeric_only=True is passed explicitly: relying on the default for
    # frames with non-numeric columns is deprecated and fails on newer pandas.
    return pd.concat(
        [
            df.describe().T,  # one row per column, one column per statistic
            df.median(numeric_only=True).rename('median'),
            df.skew(numeric_only=True).rename('skew'),
            df.kurt(numeric_only=True).rename('kurt'),
        ],
        axis=1,
    ).T  # transpose back so statistics are rows again
custom_describe(data).round(0)  # round to whole numbers
C:\Users\tommy\AppData\Local\Temp\ipykernel_9752\3915157787.py:3: FutureWarning: The default value of numeric_only in DataFrame.median is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning.
df.median().rename('median'), # include median
C:\Users\tommy\AppData\Local\Temp\ipykernel_9752\3915157787.py:4: FutureWarning: The default value of numeric_only in DataFrame.skew is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning.
df.skew().rename('skew'), # include skewness
C:\Users\tommy\AppData\Local\Temp\ipykernel_9752\3915157787.py:5: FutureWarning: The default value of numeric_only in DataFrame.kurt is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning.
df.kurt().rename('kurt'), # include kurtosis
| age | education-num | capital-gain | capital-loss | hours-per-week | |
|---|---|---|---|---|---|
| count | 32537.0 | 32537.0 | 32537.0 | 32537.0 | 32537.0 |
| mean | 39.0 | 10.0 | 1078.0 | 87.0 | 40.0 |
| std | 14.0 | 3.0 | 7388.0 | 403.0 | 12.0 |
| min | 17.0 | 1.0 | 0.0 | 0.0 | 1.0 |
| 25% | 28.0 | 9.0 | 0.0 | 0.0 | 40.0 |
| 50% | 37.0 | 10.0 | 0.0 | 0.0 | 40.0 |
| 75% | 48.0 | 12.0 | 0.0 | 0.0 | 45.0 |
| max | 90.0 | 16.0 | 99999.0 | 4356.0 | 99.0 |
| median | 37.0 | 10.0 | 0.0 | 0.0 | 40.0 |
| skew | 1.0 | -0.0 | 12.0 | 5.0 | 0.0 |
| kurt | -0.0 | 1.0 | 155.0 | 20.0 | 3.0 |
# histograms of the dataframe's columns
data.hist(figsize=(12, 10));
# density plot per column, each on its own panel of a 2x3 grid with independent x axes
data.plot.density(figsize=(15, 10), subplots=True, layout=(2, 3), sharex=False)
array([[<Axes: ylabel='Density'>, <Axes: ylabel='Density'>,
<Axes: ylabel='Density'>],
[<Axes: ylabel='Density'>, <Axes: ylabel='Density'>,
<Axes: ylabel='Density'>]], dtype=object)
# box plots of the dataframe's columns on one shared axis
data.plot.box(figsize=(12, 5));
# using seaborn library, plot each individually on a 2x2 grid (filled row by row)
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
for boxplot_column, panel in zip(
    ['age', 'education-num', 'hours-per-week', 'capital-gain'], axes.flat
):
    ax = sns.boxplot(data[boxplot_column], orient='v', ax=panel)
# Show the column names of cat_data, the frame of object-typed columns
# selected earlier with select_dtypes.
cat_data.columns
Index(['work-class', 'education', 'marital-status', 'occupation',
'relationship', 'race', 'sex', 'native-country', 'income'],
dtype='object')
# Bar chart of the number of rows per work class.
# The counts are taken from `data` rather than `cat_data`: `cat_data` was
# selected before duplicate rows were dropped, so plotting it would show
# counts that disagree with the other summaries of the cleaned data.
plt.figure(figsize=(10, 5))
ax = sns.countplot(data=data, x="work-class")
ax.set_title('Counts Work Class')
# rotate the category labels so the longer names do not overlap
plt.xticks(rotation=45)
ax.set(xlabel='work classification', ylabel='');
from pandas.plotting import scatter_matrix
# pairwise scatter plots with pandas
scatter_matrix(data, figsize=(12, 12));
# Using seaborn, pairwise plot coloured by income, with KDE curves on the diagonal
g = sns.pairplot(data, hue='income', diag_kind='kde')
g.fig.set_size_inches(12, 12);
# make the y axis labels horizontal and right-aligned on every panel
for ax in g.axes.flat:
    ax.set_ylabel(ax.get_ylabel(), rotation=0)
    ax.yaxis.get_label().set_horizontalalignment('right')
# Compute the correlation matrix of the numeric columns.
# numeric_only=True is passed explicitly because `data` also holds string
# columns; relying on the default is deprecated and fails on newer pandas.
corr = data.corr(numeric_only=True)
# render the matrix as a colour-graded table
corr.style.background_gradient(cmap='coolwarm')
C:\Users\tommy\AppData\Local\Temp\ipykernel_9752\4176814396.py:2: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning. corr = data.corr()
| age | education-num | capital-gain | capital-loss | hours-per-week | |
|---|---|---|---|---|---|
| age | 1.000000 | 0.036224 | 0.077676 | 0.057745 | 0.068515 |
| education-num | 0.036224 | 1.000000 | 0.122664 | 0.079892 | 0.148422 |
| capital-gain | 0.077676 | 0.122664 | 1.000000 | -0.031639 | 0.078408 |
| capital-loss | 0.057745 | 0.079892 | -0.031639 | 1.000000 | 0.054229 |
| hours-per-week | 0.068515 | 0.148422 | 0.078408 | 0.054229 | 1.000000 |
# using seaborn to plot the correlation heat map
f, ax = plt.subplots(figsize=(12, 6))
# custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(corr, annot=True, square=True, linewidth=0.1, cmap=cmap, ax=ax)
plt.title('Pearson Correlation of Features')
# tilt the tick labels on both axes
plt.xticks(rotation=45)
plt.yticks(rotation=45);
# scatter plot of capital-gain against education-num, coloured by income
sns.relplot(data=data, x="education-num", y="capital-gain", hue='income');
# Cross tabulation between work-class (rows) and sex (columns), with totals
pd.crosstab(index=data['work-class'], columns=data['sex'], margins=True)
| sex | Female | Male | All |
|---|---|---|---|
| work-class | |||
| ? | 839 | 997 | 1836 |
| Federal-gov | 315 | 645 | 960 |
| Local-gov | 835 | 1258 | 2093 |
| Never-worked | 2 | 5 | 7 |
| Private | 7743 | 14930 | 22673 |
| Self-emp-inc | 135 | 981 | 1116 |
| Self-emp-not-inc | 399 | 2141 | 2540 |
| State-gov | 489 | 809 | 1298 |
| Without-pay | 5 | 9 | 14 |
| All | 10762 | 21775 | 32537 |
# Distribution of age within each work-class category, as box plots
sns.catplot(
    data=data,
    x="work-class",
    y="age",
    kind="box",
    height=5,
    aspect=11/8,
)
plt.title('Work Class distribution by Age')
plt.xticks(rotation=45);
# Box plot of age per work-class, split by sex
plt.figure(figsize=(12, 6))
sns.boxplot(data=data, x="work-class", y="age", hue="sex")
plt.title('Work Class distribution by Age and Gender')
plt.xticks(rotation=45);
# number of rows in each income class
data['income'].value_counts()
<=50K 24698 >50K 7839 Name: income, dtype: int64
# fraction of rows in each income class
total_rows = len(data)
low = int((data['income'] == '<=50K').sum()) / total_rows
high = int((data['income'] == '>50K').sum()) / total_rows
print(f'<=50K: {round(low, 2)} >50K {round(high, 2)}')
<=50K: 0.76 >50K 0.24
# number of rows per education level
data['education'].value_counts()
HS-grad 10494 Some-college 9731 Bachelors 5353 LessHS 4248 Masters 2298 Doctorate 413 Name: education, dtype: int64
# cross tab as fractions: each education row is normalised to sum to 1
pd.crosstab(
    index=data['education'],
    columns=data['income'],
    margins=True,
    normalize='index',
)
| income | <=50K | >50K |
|---|---|---|
| education | ||
| Bachelors | 0.585092 | 0.414908 |
| Doctorate | 0.259080 | 0.740920 |
| HS-grad | 0.840480 | 0.159520 |
| LessHS | 0.942561 | 0.057439 |
| Masters | 0.398607 | 0.601393 |
| Some-college | 0.793238 | 0.206762 |
| All | 0.759074 | 0.240926 |
# dataframe from cross tab: income class fractions per education-num level
df = pd.crosstab(
    index=data['education-num'],
    columns=data['income'],
    margins=True,
    normalize='index',
)
df = df.iloc[:-1]  # exclude last row (All)
# scatter plot of the >50K fraction (x) against education-num (y)
X = df['>50K']
Y = df.index
plt.scatter(X, Y, marker='o');
plt.xlabel('% of income > 50K')
plt.ylabel('years of education');
# row counts per education level, split by income class
fig = plt.figure(figsize=(12, 5))
ax = sns.countplot(data=data, x="education", hue="income")
ax.set_title('Income by Education');
# Same chart as a fresh figure, drawn with a custom two-colour palette
custom_palette = ["#FF5733", "#33FF57"]
fig = plt.figure(figsize=(12, 5))
ax = sns.countplot(data=data, x="education", hue="income", palette=custom_palette)
ax.set_title('Income by Education')
plt.show()
import matplotlib.pyplot as plt
import pandas as pd
column_name = 'age'
# the capitalised column name is reused for the x-axis label and the title
axis_label = column_name.capitalize()
# Create a customized histogram
plt.figure(figsize=(12, 10))
plt.hist(data[column_name], bins=20, color='skyblue', edgecolor='black', alpha=0.7)
plt.xlabel(axis_label)
plt.ylabel('Frequency')
plt.title(f'Histogram of {axis_label}')
plt.grid(True)
# Show the histogram
plt.show()
# Group data by 'sex' and calculate summary statistics (rounded)
gender_stats = data.groupby('sex').describe().round()
# Transpose the DataFrame for a long format: one row per (column, statistic)
gender_stats_long = gender_stats.transpose()
# Create a bar plot for the statistical summary.
# The size is passed to DataFrame.plot directly: the plot call opens its own
# figure, so a preceding plt.figure(figsize=...) would only leave an empty,
# unused figure behind and the chart itself would keep the default size.
gender_stats_long.plot(kind='bar', rot=0, colormap='viridis', figsize=(12, 6))
plt.title('Statistics by Gender')
plt.xlabel('Statistics')
plt.ylabel('Values')
plt.legend(title='Gender')
# Show the plot
plt.show()
<Figure size 1200x600 with 0 Axes>
# Correlation matrix of numeric_data, the integer-column frame selected
# earlier with select_dtypes.
# NOTE(review): that selection was made before duplicates and 'fnlwgt' were
# removed from `data`, so it presumably still includes them - confirm intended.
correlation_matrix = numeric_data.corr()
# Heatmap with each cell annotated to two decimals
plt.figure(figsize=(12, 8))
sns.heatmap(
    correlation_matrix,
    cmap='coolwarm',
    annot=True,
    fmt=".2f",
    linewidths=.5,
)
plt.title('Correlation Heatmap of Numerical Features')
plt.show()
import matplotlib.pyplot as plt
import numpy as np
# First 5 values of the "age" column
first_5_age = data['age'].head()
# One colour per bar
colors = ['skyblue', 'salmon', 'lightgreen', 'gold', 'lightcoral']
# Bars are placed at positions 0..len-1
positions = range(len(first_5_age))
plt.figure(figsize=(8, 6))
bars = plt.bar(positions, first_5_age, color=colors)
plt.xlabel('Row Index')
plt.ylabel('Age')
plt.title('Age Distribution for the First 5 Rows')
plt.xticks(positions, [f'Row {i}' for i in range(1, 6)])
# Write each age just above the centre of its bar
for bar, age in zip(bars, first_5_age):
    label_x = bar.get_x() + bar.get_width() / 2
    label_y = bar.get_height() + 1
    plt.text(label_x, label_y, str(age), ha='center', va='bottom')
plt.show()