In [1]:
# COVID-19 Global Statistics Analysis
## Exploratory Data Analysis, Data Preparation, and Inferential Analysis

#Name: Thomas Hollingshead
#Course: MSBA 320
#Project: COVID-19 Global Statistics with IMF GDP Data
In [2]:
# Import Required Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

#ignore seaborn warnings
warnings.filterwarnings("ignore")

# statsmodels and scipy will be useful later for regression/inferential analysis
import statsmodels.api as sm
import scipy.stats as stats

# Display options for cleaner notebook output
pd.set_option("display.max_columns", None)
pd.set_option("display.float_format", "{:,.2f}".format)

# Plot style
sns.set_theme()
In [3]:
## 1. Data Read / Loading

# In this section, the dataset is loaded into a pandas DataFrame.  
# After loading, the data is inspected to understand its structure, variable types, 
# and overall contents.
In [4]:
# 2. Load the Dataset
file_path = "covid19_global_statistics_2026_with_imf_gdp.csv"

# Read the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)

# Display the first few rows
df.head()
Out[4]:
continent country population cases_per_million total_cases deaths_per_million total_deaths GDP ( USD billions) GDP Per Capita(USD)
0 Africa Niger 26083660 381 9931 12 312 26.11 864.48
1 Asia China 1448471400 347 503302 4 5272 20,650.75 14,730.31
2 Africa Liberia 5305117 1525 8090 56 295 5.59 955.67
3 Oceania Nauru 10903 494635 5393 92 1 0.18 14,958.98
4 Africa Comoros 907419 10038 9109 177 161 1.77 1,904.28
In [5]:
# 3. Structure of the Data

# Number of rows and columns
print("Shape of dataset:", df.shape)

# Column names
print("\nColumns:")
print(df.columns.tolist())
Shape of dataset: (184, 9)

Columns:
['continent', 'country', 'population', 'cases_per_million', 'total_cases', 'deaths_per_million', 'total_deaths', 'GDP ( USD billions)', 'GDP Per Capita(USD)']
In [6]:
# View data types and non-null counts
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 184 entries, 0 to 183
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   continent            184 non-null    object 
 1   country              184 non-null    object 
 2   population           184 non-null    int64  
 3   cases_per_million    184 non-null    int64  
 4   total_cases          184 non-null    int64  
 5   deaths_per_million   184 non-null    int64  
 6   total_deaths         184 non-null    int64  
 7   GDP ( USD billions)  184 non-null    float64
 8   GDP Per Capita(USD)  184 non-null    float64
dtypes: float64(2), int64(5), object(2)
memory usage: 13.1+ KB
In [7]:
# Structural Observations:

# The categorical variables are "country" and "continent"
# The remaining columns are numerical and can be used for descriptive/inferential analysis 

# This structure is appropriate for:
# country comparisons
# economic relationship analysis
# correlation and regression analysis 
In [8]:
# 4. Check for Missing Values 
missing_values = df.isnull().sum()

print("Missing values by column:")
print(missing_values)

print("\nTotal missing values in dataset:", missing_values.sum())
Missing values by column:
continent              0
country                0
population             0
cases_per_million      0
total_cases            0
deaths_per_million     0
total_deaths           0
GDP ( USD billions)    0
GDP Per Capita(USD)    0
dtype: int64

Total missing values in dataset: 0
In [9]:
# 5. Summary Statistics

#numerical columns
df.describe()
Out[9]:
population cases_per_million total_cases deaths_per_million total_deaths GDP ( USD billions) GDP Per Capita(USD)
count 184.00 184.00 184.00 184.00 184.00 184.00 184.00
mean 40,830,702.17 178,202.89 3,570,773.47 1,303.53 37,295.62 656.42 22,403.25
std 153,150,656.87 197,194.72 10,883,346.97 1,389.99 121,415.94 2,878.40 32,239.24
min 10,903.00 347.00 2,943.00 3.00 1.00 0.06 368.83
25% 1,832,623.50 12,820.50 43,973.75 132.75 415.25 15.94 2,954.81
50% 8,727,237.00 100,561.00 346,717.00 775.50 3,618.50 51.59 8,700.21
75% 29,506,638.75 284,247.25 1,920,548.00 2,160.25 19,127.50 334.64 29,996.03
max 1,448,471,400.00 771,655.00 111,820,082.00 6,595.00 1,219,487.00 31,821.29 246,738.26
In [10]:
# Transposed for readability 

df.describe().T
Out[10]:
count mean std min 25% 50% 75% max
population 184.00 40,830,702.17 153,150,656.87 10,903.00 1,832,623.50 8,727,237.00 29,506,638.75 1,448,471,400.00
cases_per_million 184.00 178,202.89 197,194.72 347.00 12,820.50 100,561.00 284,247.25 771,655.00
total_cases 184.00 3,570,773.47 10,883,346.97 2,943.00 43,973.75 346,717.00 1,920,548.00 111,820,082.00
deaths_per_million 184.00 1,303.53 1,389.99 3.00 132.75 775.50 2,160.25 6,595.00
total_deaths 184.00 37,295.62 121,415.94 1.00 415.25 3,618.50 19,127.50 1,219,487.00
GDP ( USD billions) 184.00 656.42 2,878.40 0.06 15.94 51.59 334.64 31,821.29
GDP Per Capita(USD) 184.00 22,403.25 32,239.24 368.83 2,954.81 8,700.21 29,996.03 246,738.26
In [11]:
# Key descriptive Statistics for Numerical Variables
df.describe().T[['mean','50%','std','min','max']]
Out[11]:
mean 50% std min max
population 40,830,702.17 8,727,237.00 153,150,656.87 10,903.00 1,448,471,400.00
cases_per_million 178,202.89 100,561.00 197,194.72 347.00 771,655.00
total_cases 3,570,773.47 346,717.00 10,883,346.97 2,943.00 111,820,082.00
deaths_per_million 1,303.53 775.50 1,389.99 3.00 6,595.00
total_deaths 37,295.62 3,618.50 121,415.94 1.00 1,219,487.00
GDP ( USD billions) 656.42 51.59 2,878.40 0.06 31,821.29
GDP Per Capita(USD) 22,403.25 8,700.21 32,239.24 368.83 246,738.26
In [ ]:
 
In [12]:
# 6. Select Numerical Columns

num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
print("Numerical columns:", num_cols)
Numerical columns: ['population', 'cases_per_million', 'total_cases', 'deaths_per_million', 'total_deaths', 'GDP ( USD billions)', 'GDP Per Capita(USD)']
In [13]:
# 7. Histograms for numerical columns

# Draw one histogram (30 bins, KDE overlay) per numerical column,
# each on its own figure so the very different scales do not interfere.
for col in num_cols:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.histplot(data=df, x=col, bins=30, kde=True, ax=ax)
    ax.set_title(f"Distribution of {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Frequency")
    fig.tight_layout()
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [14]:
# 8. Boxplots for detecting outliers

# One horizontal boxplot per numerical column, each on its own figure.
for col in num_cols:
    fig, ax = plt.subplots(figsize=(8, 3))
    sns.boxplot(data=df, x=col, ax=ax)
    ax.set_title(f"Boxplot of {col}")
    ax.set_xlabel(col)
    fig.tight_layout()
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [15]:
### Distribution Observations

# Based on the histograms and boxplots generated above, several clear patterns emerge across the numerical variables.

# Population
 # The population distribution is extremely right skewed, with most countries having relatively small populations and a few countries having very large populations.
 # The boxplot confirms the presence of multiple extreme outliers, representing highly populated countries such as those exceeding one billion people.
 # Because population varies so dramatically between countries, it is expected that total case and death counts will also vary widely.

# Cases per Million
 # The distribution of cases per million is right skewed but more spread out than population.
 # Most countries fall within the lower range of cases per million, while a smaller number of countries exhibit much higher infection rates.
 # The boxplot shows several high-end outliers, indicating that some countries experienced significantly higher infection rates relative to their population size.

# Total Cases
 # Total COVID-19 case counts are highly right skewed, with most countries reporting relatively low total case numbers compared to a small group of countries with extremely large totals.
 # The boxplot highlights several large outliers, reflecting countries with massive outbreaks or very large populations.
 # This confirms that raw totals are heavily influenced by population size.

# Deaths per Million
 # Deaths per million also show a right skewed distribution, although the spread is somewhat more gradual than total death counts.
 # Most countries have relatively low deaths per million, while a smaller group shows significantly higher mortality rates.
 # The boxplot indicates a few extreme values above 5000 deaths per million, suggesting substantial variation in pandemic severity and health system outcomes across countries.

# Total Deaths
 # Similar to total cases, the distribution of total deaths is heavily right skewed.
 # The majority of countries have relatively low death totals, while a small number of countries exhibit very high counts.
 # The boxplot reveals numerous high-value outliers, again indicating that population size and pandemic scale differ greatly across countries.

# GDP (USD billions)
 # GDP values are strongly right-skewed, with most countries having relatively small economies and a few countries having extremely large economies.
 # The boxplot shows clear outliers representing the largest global economies.
 # This distribution is typical in global economic datasets where a few countries dominate total economic output.

# GDP per Capita (USD)
 # GDP per capita also shows right skewness, though less extreme than total GDP.
 # Most countries fall within a lower to middle income range, while a smaller number of countries have very high per-capita income levels.
 # The boxplot indicates several high-income outliers, suggesting strong economic inequality between countries.

# Overall Observation
 # Nearly all numerical variables exhibit right-skewed distributions and visible outliers.
 # These patterns are expected in global datasets where country sizes, economic power, and pandemic impacts vary widely.
 # Because of these skewed distributions, relationships between variables will be further explored using correlation analysis and regression models in the inferential analysis section.
In [16]:
# 9. Countries by Continent

plt.figure(figsize=(8, 5))
sns.countplot(data=df, x="continent", order=df["continent"].value_counts().index)
plt.title("Number of Countries in Dataset by Continent")
plt.xlabel("Continent")
plt.ylabel("Count")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [17]:
# 10. Reusable Plot function for Section B

def plot_bar(data, x, y, title, xlabel, ylabel, figsize=(12, 8)):
    """
    Draw a seaborn bar plot of ``data`` and display it.

    A new figure of size ``figsize`` is created, ``x`` and ``y`` are passed
    straight through to ``sns.barplot``, and the title and axis labels are
    applied before the figure is shown. Intended for country comparisons:
    pass the numeric column as ``x`` and the country column as ``y`` to get
    horizontal bars.

    Parameters
    ----------
    data : table passed to ``sns.barplot`` (a DataFrame in this notebook)
    x, y : column names for the two axes
    title, xlabel, ylabel : text for the figure title and axis labels
    figsize : (width, height) of the figure in inches

    Returns
    -------
    None. The figure is rendered with ``plt.show()``.
    """
    fig, ax = plt.subplots(figsize=figsize)
    sns.barplot(data=data, x=x, y=y, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    fig.tight_layout()
    plt.show()
In [18]:
# B - Data Visualization

# In this section, we visualize the COVID-19 data to explore pandemic outcomes
# and economic indicators across countries.

# Variables to be analyzed:
# Total COVID-19 Cases
# Total Deaths
# Deaths per million
# GDP (USD Billions)
# GDP per capita (USD)

# Bar plots are used to compare countries, and the data is sorted so that patterns and extreme values 
# can be more easily identified.
In [19]:
# Total COVID-19 Cases by Country (Top 20)

# Styling
sns.set_style("whitegrid")

# Keep only the 20 countries with the largest total case counts
df_cases = df.sort_values(by="total_cases", ascending=False).head(20)

plt.figure(figsize=(10,8))

# Draw the bars exactly once and keep the Axes handle for the value labels.
# (A second identical barplot call would stack a duplicate set of bars on
# the same Axes.)
ax = sns.barplot(data=df_cases, x="total_cases", y="country", palette="Blues_r")

# Labels for the bars: bar i sits at y-position i, so the text is placed at
# the end of each bar (x = value) and vertically centered on it
for i, v in enumerate(df_cases["total_cases"]):
    ax.text(v, i, f"{int(v):,}", va='center')

plt.title("Top 20 Countries by Total COVID-19 Cases")
plt.xlabel("Total Cases")
plt.ylabel("Country")

plt.tight_layout()
plt.show()
No description has been provided for this image
In [20]:
### Observation

# The distribution of total COVID-19 cases is highly concentrated among a small number of countries. 
# The United States reports the highest total case count at over 111 million cases, 
# followed by India with approximately 45 million cases. 

# There is a significant drop after the top two countries, with the remaining countries showing progressively lower totals. 
# This pattern reflects both the influence of large population sizes and differences in pandemic spread across countries. 
# Large and densely populated countries tend to dominate the upper end of total case counts.
In [21]:
# Cases Per Million by Country (Top 20)

df_cases_pm = df.sort_values(by="cases_per_million", ascending=False).head(20)

plt.figure(figsize=(10,8))

ax = sns.barplot(data=df_cases_pm, x="cases_per_million", y="country", palette="Oranges_r")

# Labels for the bars
for i, v in enumerate(df_cases_pm["cases_per_million"]):
    ax.text(v, i, f"{int(v):,}", va='center')

plt.title("Top 20 Countries by COVID-19 Cases per Million")
plt.xlabel("Cases per Million")
plt.ylabel("Country")

plt.tight_layout()
plt.show()
No description has been provided for this image
In [22]:
### Observation

# When adjusting for population size, the countries with the highest infection rates differ significantly from those with the highest 
# total cases. Smaller countries and microstates such as Brunei, San Marino, and Austria appear
# among the highest in cases per million.

# This indicates that although some countries may have relatively small populations and lower total case counts, 
# a large proportion of their population experienced infection. Cases per million therefore provide a more comparable measure 
# of pandemic intensity across countries.
In [23]:
# Total Deaths by Country (Top 20)

df_deaths = df.sort_values(by="total_deaths", ascending=False).head(20)

plt.figure(figsize=(10,8))

ax = sns.barplot(data=df_deaths, x="total_deaths", y="country", palette="Reds_r")

# Labels for the bars
for i, v in enumerate(df_deaths["total_deaths"]):
    ax.text(v, i, f"{int(v):,}", va='center')

plt.title("Top 20 Countries by Total COVID-19 Deaths")
plt.xlabel("Total Deaths")
plt.ylabel("Country")

plt.tight_layout()
plt.show()
No description has been provided for this image
In [24]:
### Observation

# Total COVID-19 deaths are also concentrated among a small number of countries. The United States reports the highest number 
# of deaths, exceeding 1.2 million, followed by Brazil and India. 

# Similar to total case counts, countries with large populations and widespread outbreaks tend to have the highest 
# death totals. However, differences between countries may also reflect variations in healthcare capacity, 
# public health policies, and demographic factors.
In [25]:
# Deaths per Million by Country (Top 20)

df_deaths_pm = df.sort_values(by="deaths_per_million", ascending=False).head(20)

plt.figure(figsize=(10,8))

ax = sns.barplot(data=df_deaths_pm, x="deaths_per_million", y="country", palette="Purples_r")

# Labels for the bars
for i, v in enumerate(df_deaths_pm["deaths_per_million"]):
    ax.text(v, i, f"{int(v):,}", va='center')

plt.title("Top 20 Countries by COVID-19 Deaths per Million")
plt.xlabel("Deaths per Million")
plt.ylabel("Country")

plt.tight_layout()
plt.show()
No description has been provided for this image
In [26]:
### Observation

# Deaths per million provide a population-adjusted measure of mortality and reveal a different pattern compared to total deaths. 
# Countries such as Peru, Bulgaria, and Hungary report some of the highest death rates relative to their population size.

# This suggests that certain countries experienced disproportionately high mortality during the pandemic. 
# Factors such as healthcare system capacity, population age structure, and government response measures may contribute to 
# these differences.
In [27]:
# GDP per Country (Top 20)


df_gdp = df.sort_values(by="GDP ( USD billions)", ascending=False).head(20)

plt.figure(figsize=(10,8))

ax = sns.barplot(data=df_gdp, x="GDP ( USD billions)", y="country", palette="Greens_r")

# Labels for the bars
for i, v in enumerate(df_gdp["GDP ( USD billions)"]):
    ax.text(v, i, f"{v:,.0f}", va='center')

plt.title("Top 20 Countries by GDP (USD Billions)")
plt.xlabel("GDP (USD Billions)")
plt.ylabel("Country")

plt.tight_layout()
plt.show()
No description has been provided for this image
In [28]:
### Observation

# The GDP distribution highlights the dominance of a few large global economies. The United States and China have substantially 
# larger economies than all other countries, followed by Germany, India, and Japan.

# There is a sharp decline in GDP values after the top economies, illustrating the unequal distribution of economic output worldwide. 
# Most countries operate on a significantly smaller economic scale compared to the largest global economies.
In [29]:
# GDP per Capita by country

df_gdp_pc = df.sort_values(by="GDP Per Capita(USD)", ascending=False).head(20)

plt.figure(figsize=(10,8))

ax = sns.barplot(data=df_gdp_pc, x="GDP Per Capita(USD)", y="country", palette="BuGn_r")

# Labels for the bars
for i, v in enumerate(df_gdp_pc["GDP Per Capita(USD)"]):
    ax.text(v, i, f"{int(v):,}", va='center')

plt.title("Top 20 Countries by GDP per Capita")
plt.xlabel("GDP per Capita (USD)")
plt.ylabel("Country")

plt.tight_layout()
plt.show()
No description has been provided for this image
In [30]:
### Observation

# GDP per capita highlights differences in economic prosperity across countries. 
# Smaller, high-income economies such as Liechtenstein and Luxembourg report the highest GDP per capita values.

# In contrast to total GDP, this measure emphasizes wealth per individual rather than total economic output. 
# Several smaller countries appear at the top of this ranking, indicating that high income levels are not necessarily associated with large economies.
In [31]:
### Section Summary

# The visualizations reveal significant differences across countries in both pandemic outcomes and economic indicators. 
# Total case and death counts tend to be highest in large and populous countries, while population-adjusted measures such 
# as cases per million and deaths per million highlight the relative severity of the pandemic across countries.

# Economic indicators also show substantial disparities, with a small number of countries dominating global GDP 
# while smaller high-income countries lead in GDP per capita. These patterns suggest that population size and economic 
# conditions may influence pandemic outcomes, which will be explored further through correlation and regression analysis 
# in the next section.
In [32]:
# C Inferential Analysis

# We will conduct inferential analysis to examine the relationships between COVID-19 outcomes and country-level demographic 
# and economic indicators. 

# Correlation analysis among numerical variables
# Simple Linear regression for the required variable pairs
# Diagnostic plots for regression models
# Interpretation of statistical significance / behavior
In [33]:
# Correlation Analysis

# Select numerical columns
num_df = df.select_dtypes(include=["number"])

# Correlation matrix
corr_matrix = num_df.corr()

# Display matrix
corr_matrix
Out[33]:
population cases_per_million total_cases deaths_per_million total_deaths GDP ( USD billions) GDP Per Capita(USD)
population 1.00 -0.12 0.37 -0.08 0.40 0.58 -0.05
cases_per_million -0.12 1.00 0.20 0.52 0.07 0.07 0.67
total_cases 0.37 0.20 1.00 0.24 0.90 0.77 0.22
deaths_per_million -0.08 0.52 0.24 1.00 0.30 0.12 0.30
total_deaths 0.40 0.07 0.90 0.30 1.00 0.71 0.12
GDP ( USD billions) 0.58 0.07 0.77 0.12 0.71 1.00 0.19
GDP Per Capita(USD) -0.05 0.67 0.22 0.30 0.12 0.19 1.00
In [34]:
# Correlation Heatmap

sns.set_theme(style="white", font_scale=1.1)

plt.figure(figsize=(10,8))

sns.heatmap(
    corr_matrix,
    annot=True,
    cmap="icefire",
    center=0,
    fmt=".2f",
    linewidths=0.8,
    square=True,
    annot_kws={"weight":"bold"}
)

plt.title("Correlation Matrix of COVID-19 Metrics and Economic Indicators", fontsize=15, weight="bold")
plt.xticks(rotation=45)
plt.yticks(rotation=0)

plt.tight_layout()
plt.show()
No description has been provided for this image
In [35]:
### Correlation Analysis Observation

# The correlation matrix reveals several meaningful relationships between the COVID-19 metrics and economic indicators in the dataset.

# The strongest relationship appears between total cases and total deaths (0.90), indicating a very strong positive association. 
# This is expected, as countries with higher infection counts generally experience higher numbers of deaths.

# There is also a strong positive correlation between GDP and total cases (0.77) and between GDP and total deaths (0.71). 
# This likely reflects the fact that larger economies often correspond to larger populations, more urbanization, 
# and greater international connectivity, all of which can influence the scale of pandemic spread.

# Population shows moderate positive correlations with total cases (0.37) and total deaths (0.40), 
# suggesting that population size contributes to overall case and death counts, though it is not the sole determining factor.

# For the population-adjusted metrics, GDP per capita shows a relatively strong positive correlation with cases per 
# million (0.67) and a moderate correlation with deaths per million (0.30). This suggests that wealthier countries 
# may report higher infection rates per capita, potentially due to differences in testing capacity, healthcare reporting systems, 
# or population mobility.

# Additionally, cases per million and deaths per million show a moderate correlation (0.52), indicating that countries 
# with higher infection rates per capita tend to also experience higher mortality rates per capita.

# Overall, the correlation results suggest that total pandemic impact is more strongly associated with economic 
# scale and total case counts, while per-capita measures reveal different patterns related to economic development and healthcare 
# reporting differences. These relationships will be further examined using regression analysis in the following section.
In [36]:
# Making a copy of the dataset that's regression friendly

reg_df = df.rename(columns={
    "GDP ( USD billions)": "gdp_usd_billions",
    "GDP Per Capita(USD)": "gdp_per_capita"
})

reg_df.head()
Out[36]:
continent country population cases_per_million total_cases deaths_per_million total_deaths gdp_usd_billions gdp_per_capita
0 Africa Niger 26083660 381 9931 12 312 26.11 864.48
1 Asia China 1448471400 347 503302 4 5272 20,650.75 14,730.31
2 Africa Liberia 5305117 1525 8090 56 295 5.59 955.67
3 Oceania Nauru 10903 494635 5393 92 1 0.18 14,958.98
4 Africa Comoros 907419 10038 9109 177 161 1.77 1,904.28
In [37]:
# Regression Function

import statsmodels.formula.api as smf

# Fits a simple linear regression model, and displays:
# Regression summary
# residual plot
# normal Q-Q plot

def run_regression(data, x_var, y_var):
    """
    Fit a simple OLS regression of ``y_var`` on ``x_var`` and show diagnostics.

    The statsmodels summary is printed between two banner lines, then a
    residuals-vs-fitted plot (with a red lowess trend) and a normal Q-Q plot
    of the residuals are drawn side by side.

    Parameters
    ----------
    data : table holding both columns (a DataFrame in this notebook)
    x_var : str
        Name of the predictor column.
    y_var : str
        Name of the response column.

    Both names are interpolated directly into the formula string
    ``"<y_var> ~ <x_var>"``, so they must be usable as formula terms
    (presumably why the GDP columns are renamed before calling this).

    Returns
    -------
    The fitted statsmodels results object.
    """
    model_spec = f"{y_var} ~ {x_var}"
    model = smf.ols(formula=model_spec, data=data).fit()

    banner = "=" * 80
    print(banner)
    print(f"Regression Model: {model_spec}")
    print(banner)
    print(model.summary())

    # Two diagnostic panels: residuals vs fitted (left), Q-Q plot (right)
    fig, (resid_ax, qq_ax) = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: residuals against fitted values with a lowess trend line
    sns.residplot(
        x=model.fittedvalues,
        y=model.resid,
        lowess=True,
        ax=resid_ax,
        scatter_kws={"alpha": 0.7},
        line_kws={"color": "red"},
    )
    resid_ax.set_title(f"Residual Plot: {model_spec}")
    resid_ax.set_xlabel("Fitted Values")
    resid_ax.set_ylabel("Residuals")

    # Right panel: residual quantiles against a fitted normal, with 45-degree line
    sm.qqplot(model.resid, line="45", fit=True, ax=qq_ax)
    qq_ax.set_title(f"Normal Q-Q Plot: {model_spec}")

    plt.tight_layout()
    plt.show()

    return model

model_1 = run_regression(reg_df, "population", "total_cases")
================================================================================
Regression Model: total_cases ~ population
================================================================================
                            OLS Regression Results                            
==============================================================================
Dep. Variable:            total_cases   R-squared:                       0.135
Model:                            OLS   Adj. R-squared:                  0.131
Method:                 Least Squares   F-statistic:                     28.51
Date:                Mon, 09 Mar 2026   Prob (F-statistic):           2.76e-07
Time:                        02:20:34   Log-Likelihood:                -3228.5
No. Observations:                 184   AIC:                             6461.
Df Residuals:                     182   BIC:                             6467.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   2.503e+06   7.74e+05      3.232      0.001    9.75e+05    4.03e+06
population     0.0262      0.005      5.339      0.000       0.016       0.036
==============================================================================
Omnibus:                      258.018   Durbin-Watson:                   1.138
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            23916.398
Skew:                           5.913   Prob(JB):                         0.00
Kurtosis:                      57.586   Cond. No.                     1.64e+08
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.64e+08. This might indicate that there are
strong multicollinearity or other numerical problems.
No description has been provided for this image
In [38]:
# Redo Residual plot without extremes for better visual


fitted = model_1.fittedvalues
residuals = model_1.resid

# Remove extreme outliers (top 2% of fitted values)
threshold = fitted.quantile(0.98)
mask = fitted < threshold

plt.figure(figsize=(8,6))

# Use regplot rather than residplot here: residplot regresses y on x again
# and plots the residuals of that new fit. On the trimmed subset that would
# no longer be model_1's residuals. regplot with lowess=True scatters the
# actual residuals and overlays the lowess trend.
sns.regplot(
    x=fitted[mask],
    y=residuals[mask],
    lowess=True,
    scatter_kws={"alpha":0.7},
    line_kws={"color":"red"}
)

# Zero reference line (residplot draws one automatically; regplot does not)
plt.axhline(0, ls=":", c=".2")

plt.title("Residual Plot (Trimmed): Total Cases vs Population")
plt.xlabel("Fitted Values")
plt.ylabel("Residuals")

plt.tight_layout()
plt.show()
No description has been provided for this image
In [39]:
# Regression analysis

# Population does explain some variation in total cases, but isn't the whole story
# Larger countries show much more unpredictable case counts
# The model has increasing error as countries get larger 

# Population matters - countries with larger populations, in general, have more COVID cases
# That's what we learn from the positive coefficient (0.0262)
# The relationship exists between larger populations and COVID cases, and is statistically significant. 

# Population explains a small part of the story
# R^2 = 0.135
# Population only explains around 13.5% of the variance in total cases between countries
# The most variation comes from the other factors, like 
# Testing availability
# Public health policy
# Population Density
# Travel Patterns
# Healthcare systems
# Timing of Outbreaks

# The residual plot shows prediction errors get bigger for larger countries 
# It shows a fan shape
# This means the model is less reliable for large countries 
# Small countries are clustered near the line
# Large countries have a huge unpredictable variation
# In Data analysis, this is called heteroskedasticity

# The Q-Q plot shows the assumptions are violated
# It curves upward instead of following the line
# This shows us the residuals are not normally distributed
# This is because the dataset has extreme outliers (large countries) and very skewed data
# Extremely common in global country-level datasets 

# Although population is a statistically significant predictor of total COVID-19 cases, the relatively low explanatory power 
# and diagnostic plots suggest that population alone does not adequately explain the variation in case counts across countries.
In [40]:
model_2 = run_regression(reg_df, "population", "total_deaths")
================================================================================
Regression Model: total_deaths ~ population
================================================================================
                            OLS Regression Results                            
==============================================================================
Dep. Variable:           total_deaths   R-squared:                       0.159
Model:                            OLS   Adj. R-squared:                  0.155
Method:                 Least Squares   F-statistic:                     34.47
Date:                Mon, 09 Mar 2026   Prob (F-statistic):           2.02e-08
Time:                        02:20:34   Log-Likelihood:                -2398.7
No. Observations:                 184   AIC:                             4801.
Df Residuals:                     182   BIC:                             4808.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   2.438e+04   8518.969      2.862      0.005    7570.529    4.12e+04
population     0.0003   5.39e-05      5.871      0.000       0.000       0.000
==============================================================================
Omnibus:                      259.145   Durbin-Watson:                   1.529
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            24453.383
Skew:                           5.953   Prob(JB):                         0.00
Kurtosis:                      58.207   Cond. No.                     1.64e+08
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.64e+08. This might indicate that there are
strong multicollinearity or other numerical problems.
No description has been provided for this image
In [41]:
model_3 = run_regression(reg_df, "gdp_usd_billions", "total_cases")
================================================================================
Regression Model: total_cases ~ gdp_usd_billions
================================================================================
                            OLS Regression Results                            
==============================================================================
Dep. Variable:            total_cases   R-squared:                       0.586
Model:                            OLS   Adj. R-squared:                  0.584
Method:                 Least Squares   F-statistic:                     257.6
Date:                Mon, 09 Mar 2026   Prob (F-statistic):           1.09e-36
Time:                        02:20:47   Log-Likelihood:                -3160.8
No. Observations:                 184   AIC:                             6326.
Df Residuals:                     182   BIC:                             6332.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
====================================================================================
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept         1.671e+06   5.31e+05      3.146      0.002    6.23e+05    2.72e+06
gdp_usd_billions  2894.2937    180.344     16.049      0.000    2538.460    3250.128
==============================================================================
Omnibus:                      131.271   Durbin-Watson:                   0.882
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             9658.070
Skew:                          -1.790   Prob(JB):                         0.00
Kurtosis:                      38.312   Cond. No.                     3.02e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.02e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
No description has been provided for this image
In [42]:
# Model 4: total_cases regressed on gdp_per_capita (printed header: total_cases ~ gdp_per_capita).
# NOTE(review): this same call is repeated in a later cell and prints an identical summary; keep only one run.
model_4 = run_regression(reg_df, "gdp_per_capita", "total_cases")
================================================================================
Regression Model: total_cases ~ gdp_per_capita
================================================================================
                            OLS Regression Results                            
==============================================================================
Dep. Variable:            total_cases   R-squared:                       0.047
Model:                            OLS   Adj. R-squared:                  0.042
Method:                 Least Squares   F-statistic:                     8.982
Date:                Mon, 09 Mar 2026   Prob (F-statistic):            0.00311
Time:                        02:20:53   Log-Likelihood:                -3237.5
No. Observations:                 184   AIC:                             6479.
Df Residuals:                     182   BIC:                             6485.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==================================================================================
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept       1.931e+06   9.57e+05      2.017      0.045    4.19e+04    3.82e+06
gdp_per_capita    73.2092     24.428      2.997      0.003      25.011     121.407
==============================================================================
Omnibus:                      259.673   Durbin-Watson:                   0.711
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            20019.139
Skew:                           6.096   Prob(JB):                         0.00
Kurtosis:                      52.624   Cond. No.                     4.78e+04
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 4.78e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
No description has been provided for this image
In [43]:
# Model 6: cases_per_million regressed on gdp_per_capita (printed header: cases_per_million ~ gdp_per_capita).
# NOTE(review): this same call is repeated in a later cell and prints an identical summary; keep only one run.
model_6 = run_regression(reg_df, "gdp_per_capita", "cases_per_million")
================================================================================
Regression Model: cases_per_million ~ gdp_per_capita
================================================================================
                            OLS Regression Results                            
==============================================================================
Dep. Variable:      cases_per_million   R-squared:                       0.443
Model:                            OLS   Adj. R-squared:                  0.440
Method:                 Least Squares   F-statistic:                     144.9
Date:                Mon, 09 Mar 2026   Prob (F-statistic):           6.32e-25
Time:                        02:21:01   Log-Likelihood:                -2450.0
No. Observations:                 184   AIC:                             4904.
Df Residuals:                     182   BIC:                             4910.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==================================================================================
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept       8.697e+04   1.33e+04      6.560      0.000    6.08e+04    1.13e+05
gdp_per_capita     4.0722      0.338     12.037      0.000       3.405       4.740
==============================================================================
Omnibus:                       29.713   Durbin-Watson:                   1.681
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               49.015
Skew:                           0.846   Prob(JB):                     2.27e-11
Kurtosis:                       4.878   Cond. No.                     4.78e+04
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 4.78e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
No description has been provided for this image
In [44]:
# Model 8: deaths_per_million regressed on gdp_per_capita (printed header: deaths_per_million ~ gdp_per_capita).
# NOTE(review): this same call is repeated in a later cell and prints an identical summary; keep only one run.
model_8 = run_regression(reg_df, "gdp_per_capita", "deaths_per_million")
================================================================================
Regression Model: deaths_per_million ~ gdp_per_capita
================================================================================
                            OLS Regression Results                            
==============================================================================
Dep. Variable:     deaths_per_million   R-squared:                       0.091
Model:                            OLS   Adj. R-squared:                  0.086
Method:                 Least Squares   F-statistic:                     18.21
Date:                Mon, 09 Mar 2026   Prob (F-statistic):           3.18e-05
Time:                        02:21:13   Log-Likelihood:                -1583.4
No. Observations:                 184   AIC:                             3171.
Df Residuals:                     182   BIC:                             3177.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==================================================================================
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept       1012.2171    119.406      8.477      0.000     776.619    1247.815
gdp_per_capita     0.0130      0.003      4.267      0.000       0.007       0.019
==============================================================================
Omnibus:                       49.027   Durbin-Watson:                   1.526
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               82.585
Skew:                           1.388   Prob(JB):                     1.17e-18
Kurtosis:                       4.752   Cond. No.                     4.78e+04
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 4.78e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
No description has been provided for this image
In [47]:
# Model 2: total_deaths regressed on population
# (this cell runs only this one model; the remaining models are run in the cells that follow)

model_2 = run_regression(reg_df, "population", "total_deaths")
================================================================================
Regression Model: total_deaths ~ population
================================================================================
                            OLS Regression Results                            
==============================================================================
Dep. Variable:           total_deaths   R-squared:                       0.159
Model:                            OLS   Adj. R-squared:                  0.155
Method:                 Least Squares   F-statistic:                     34.47
Date:                Mon, 09 Mar 2026   Prob (F-statistic):           2.02e-08
Time:                        02:23:32   Log-Likelihood:                -2398.7
No. Observations:                 184   AIC:                             4801.
Df Residuals:                     182   BIC:                             4808.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   2.438e+04   8518.969      2.862      0.005    7570.529    4.12e+04
population     0.0003   5.39e-05      5.871      0.000       0.000       0.000
==============================================================================
Omnibus:                      259.145   Durbin-Watson:                   1.529
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            24453.383
Skew:                           5.953   Prob(JB):                         0.00
Kurtosis:                      58.207   Cond. No.                     1.64e+08
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.64e+08. This might indicate that there are
strong multicollinearity or other numerical problems.
No description has been provided for this image
In [50]:
# Population | Deaths

# Countries with larger populations tend to report more total COVID deaths

# R^2 = 0.159
# This means population explains only about 15.9% of the variation in total deaths

# This is probably because more populous countries simply have
# more people who can become infected,
# although death counts also depend on things like
# healthcare capacity, age structure,
# and the strength of testing/reporting systems

# Population explains some of the differences in total deaths,
# but it still leaves a lot of variation unexplained

# The residual plot suggests that prediction errors grow
# as the predicted number of deaths increases

# The Q-Q plot shows strong skewness in the residuals (skew ≈ 5.95),
# meaning a few countries behave very differently from the rest
In [49]:
# Model 3: total_cases regressed on gdp_usd_billions (total GDP)
model_3 = run_regression(reg_df, "gdp_usd_billions", "total_cases")
================================================================================
Regression Model: total_cases ~ gdp_usd_billions
================================================================================
                            OLS Regression Results                            
==============================================================================
Dep. Variable:            total_cases   R-squared:                       0.586
Model:                            OLS   Adj. R-squared:                  0.584
Method:                 Least Squares   F-statistic:                     257.6
Date:                Mon, 09 Mar 2026   Prob (F-statistic):           1.09e-36
Time:                        02:24:39   Log-Likelihood:                -3160.8
No. Observations:                 184   AIC:                             6326.
Df Residuals:                     182   BIC:                             6332.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
====================================================================================
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept         1.671e+06   5.31e+05      3.146      0.002    6.23e+05    2.72e+06
gdp_usd_billions  2894.2937    180.344     16.049      0.000    2538.460    3250.128
==============================================================================
Omnibus:                      131.271   Durbin-Watson:                   0.882
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             9658.070
Skew:                          -1.790   Prob(JB):                         0.00
Kurtosis:                      38.312   Cond. No.                     3.02e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.02e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
No description has been provided for this image
In [ ]:
# Countries with larger economies tend to report more total COVID cases

# R^2 = 0.586
# This means GDP explains about 58.6% of the variation in total COVID cases between countries

# This is a much stronger relationship than population alone
# suggesting that economic scale is closely tied to the size of pandemic outbreaks

# Larger economies often have:
# Larger populations
# More international travel
# Higher levels of urbanization
# More developed testing and reporting systems

# All of these factors can contribute to higher reported case counts

# The GDP coefficient is statistically significant (p < 0.001)
# meaning the relationship between GDP and total cases is very unlikely to be due to random chance

# The residual plot still shows some spread for countries with very large GDP
# indicating the model becomes less precise for the largest economies

# The Q-Q plot shows noticeable deviations from the straight line
# which means the residuals are not perfectly normally distributed
# likely due to extreme values from very large countries

# Overall, GDP is a strong predictor of total COVID cases in this dataset
# but it still does not capture all of the factors influencing case counts across countries
In [51]:
# Model 4: total_cases regressed on gdp_per_capita
model_4 = run_regression(reg_df, "gdp_per_capita", "total_cases")
================================================================================
Regression Model: total_cases ~ gdp_per_capita
================================================================================
                            OLS Regression Results                            
==============================================================================
Dep. Variable:            total_cases   R-squared:                       0.047
Model:                            OLS   Adj. R-squared:                  0.042
Method:                 Least Squares   F-statistic:                     8.982
Date:                Mon, 09 Mar 2026   Prob (F-statistic):            0.00311
Time:                        02:26:33   Log-Likelihood:                -3237.5
No. Observations:                 184   AIC:                             6479.
Df Residuals:                     182   BIC:                             6485.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==================================================================================
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept       1.931e+06   9.57e+05      2.017      0.045    4.19e+04    3.82e+06
gdp_per_capita    73.2092     24.428      2.997      0.003      25.011     121.407
==============================================================================
Omnibus:                      259.673   Durbin-Watson:                   0.711
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            20019.139
Skew:                           6.096   Prob(JB):                         0.00
Kurtosis:                      52.624   Cond. No.                     4.78e+04
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 4.78e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
No description has been provided for this image
In [52]:
# GDP per capita has a statistically significant relationship with total COVID cases
# but it explains very little of the differences between countries

# R^2 = 0.047
# This means GDP per capita explains only about 4.7% of the variation in total cases

# In other words, wealth per person does not do a good job predicting
# how many total cases a country experienced

# This makes sense because total cases are heavily influenced by
# how large a country is, not just how wealthy it is

# Large countries can have huge case counts even if they are not very wealthy,
# while small wealthy countries may still have relatively low total cases

# The GDP per capita coefficient is statistically significant (p ≈ 0.003),
# meaning the relationship likely exists, but it is very weak

# The residual plot shows a lot of spread around the prediction line,
# suggesting the model does not fit the data very well

# The Q-Q plot shows strong deviation from normality,
# indicating the presence of extreme outliers and skewed data

# Overall, GDP per capita alone is a poor predictor of total COVID case counts
# because population size and other pandemic-related factors play a much larger role
In [53]:
# Model 5: cases_per_million regressed on gdp_usd_billions (total GDP)
model_5 = run_regression(reg_df, "gdp_usd_billions", "cases_per_million")
================================================================================
Regression Model: cases_per_million ~ gdp_usd_billions
================================================================================
                            OLS Regression Results                            
==============================================================================
Dep. Variable:      cases_per_million   R-squared:                       0.006
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     1.027
Date:                Mon, 09 Mar 2026   Prob (F-statistic):              0.312
Time:                        02:27:20   Log-Likelihood:                -2503.4
No. Observations:                 184   AIC:                             5011.
Df Residuals:                     182   BIC:                             5017.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
====================================================================================
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept         1.748e+05   1.49e+04     11.725      0.000    1.45e+05    2.04e+05
gdp_usd_billions     5.1313      5.064      1.013      0.312      -4.860      15.123
==============================================================================
Omnibus:                       28.212   Durbin-Watson:                   1.650
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               37.210
Skew:                           1.099   Prob(JB):                     8.32e-09
Kurtosis:                       3.154   Cond. No.                     3.02e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.02e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
No description has been provided for this image
In [ ]:
# GDP does not appear to explain differences in COVID infection rates between countries

# R^2 = 0.006
# This means GDP explains less than 1% of the variation in cases per million

# In other words, the size of a country's economy tells us almost nothing
# about how widespread COVID infections were relative to population

# The GDP coefficient is not statistically significant (p ≈ 0.31)
# which suggests there is no meaningful linear relationship
# between GDP and cases per million in this dataset

# This makes sense because infection rates depend much more on factors like:
# government containment policies
# population density
# mobility and travel patterns
# vaccination rates
# and public behavior during the pandemic

# The residual plot shows a crooked trend line rather than a flat one
# which suggests the relationship between GDP and cases per million
# is not well described by a simple linear model

# We also see very large vertical spread in the residuals
# meaning countries with similar GDP values can have extremely different infection rates

# The Q-Q plot forms an S-shaped curve instead of following the straight reference line
# which indicates the residuals are not normally distributed

# This S-shape usually means the residuals have skewed or heavy tails, or that the data contains outliers
# A one-variable model like this is also likely missing important explanatory variables

# Overall, this model shows that GDP alone is a poor predictor
# of how widely COVID spread within a country's population
In [54]:
# Model 6: cases_per_million regressed on gdp_per_capita
model_6 = run_regression(reg_df, "gdp_per_capita", "cases_per_million")
================================================================================
Regression Model: cases_per_million ~ gdp_per_capita
================================================================================
                            OLS Regression Results                            
==============================================================================
Dep. Variable:      cases_per_million   R-squared:                       0.443
Model:                            OLS   Adj. R-squared:                  0.440
Method:                 Least Squares   F-statistic:                     144.9
Date:                Mon, 09 Mar 2026   Prob (F-statistic):           6.32e-25
Time:                        02:28:18   Log-Likelihood:                -2450.0
No. Observations:                 184   AIC:                             4904.
Df Residuals:                     182   BIC:                             4910.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==================================================================================
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept       8.697e+04   1.33e+04      6.560      0.000    6.08e+04    1.13e+05
gdp_per_capita     4.0722      0.338     12.037      0.000       3.405       4.740
==============================================================================
Omnibus:                       29.713   Durbin-Watson:                   1.681
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               49.015
Skew:                           0.846   Prob(JB):                     2.27e-11
Kurtosis:                       4.878   Cond. No.                     4.78e+04
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 4.78e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
No description has been provided for this image
In [55]:
# GDP per capita shows a much stronger relationship with cases per million
# than total GDP did in the previous model

# R^2 = 0.443
# This means GDP per capita explains about 44.3% of the variation
# in COVID cases per million between countries

# The coefficient is positive and statistically significant (p < 0.001)
# meaning countries with higher income per person tend to report
# higher infection rates per million people

# One likely explanation is that wealthier countries tend to have:
# stronger testing systems
# more complete reporting of infections
# higher international travel and mobility
# more urbanized populations

# The residual plot shows a curved LOWESS line instead of a flat one
# which suggests the relationship may not be perfectly linear

# In particular, the model appears to slightly overpredict or underpredict
# cases per million at certain GDP per capita levels, meaning the true relationship may be more complex

# The Q-Q plot again shows an S-shaped pattern
# indicating that the residuals are not perfectly normally distributed

# This usually happens when the data contains outliers or heavy tails,
# which is common in global country-level datasets

# Overall, GDP per capita appears to be a meaningful predictor
# of infection rates across countries, but it still does not capture
# all of the factors influencing COVID spread
In [56]:
# Model 7: total_deaths regressed on gdp_per_capita
model_7 = run_regression(reg_df, "gdp_per_capita", "total_deaths")
================================================================================
Regression Model: total_deaths ~ gdp_per_capita
================================================================================
                            OLS Regression Results                            
==============================================================================
Dep. Variable:           total_deaths   R-squared:                       0.014
Model:                            OLS   Adj. R-squared:                  0.008
Method:                 Least Squares   F-statistic:                     2.494
Date:                Mon, 09 Mar 2026   Prob (F-statistic):              0.116
Time:                        02:29:05   Log-Likelihood:                -2413.4
No. Observations:                 184   AIC:                             4831.
Df Residuals:                     182   BIC:                             4837.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==================================================================================
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept       2.749e+04   1.09e+04      2.530      0.012    6046.846    4.89e+04
gdp_per_capita     0.4379      0.277      1.579      0.116      -0.109       0.985
==============================================================================
Omnibus:                      267.515   Durbin-Watson:                   1.136
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            20965.904
Skew:                           6.447   Prob(JB):                         0.00
Kurtosis:                      53.680   Cond. No.                     4.78e+04
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 4.78e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
No description has been provided for this image
In [ ]:
# GDP per capita does not appear to explain differences in total COVID deaths

# R^2 = 0.014
# This means GDP per capita explains only about 1.4% of the variation
# in total deaths between countries

# The coefficient is not statistically significant (p ≈ 0.116)
# which means we cannot conclude there is a meaningful relationship
# between GDP per capita and total deaths in this dataset

# This result makes sense because total death counts depend heavily
# on the size of a country's population rather than how wealthy it is

# Large countries can have very high death counts even if they are not very wealthy,
# while smaller wealthy countries may still have relatively low totals

# The residual plot shows a wide spread of prediction errors,
# indicating the model does a poor job explaining differences in death totals

# The red LOWESS line slopes slightly downward instead of staying flat,
# suggesting a straight line does not fully capture how these variables are related

# The Q-Q plot shows a strong S-shaped curve and several extreme points
# meaning the residuals are not normally distributed

# This indicates the presence of outliers and heavy tails,
# which is common when analyzing global country-level data

# Overall, GDP per capita alone is not a useful predictor
# of total COVID deaths across countries
In [57]:
# Model 8: deaths_per_million regressed on gdp_per_capita
model_8 = run_regression(reg_df, "gdp_per_capita", "deaths_per_million")
================================================================================
Regression Model: deaths_per_million ~ gdp_per_capita
================================================================================
                            OLS Regression Results                            
==============================================================================
Dep. Variable:     deaths_per_million   R-squared:                       0.091
Model:                            OLS   Adj. R-squared:                  0.086
Method:                 Least Squares   F-statistic:                     18.21
Date:                Mon, 09 Mar 2026   Prob (F-statistic):           3.18e-05
Time:                        02:29:54   Log-Likelihood:                -1583.4
No. Observations:                 184   AIC:                             3171.
Df Residuals:                     182   BIC:                             3177.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==================================================================================
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept       1012.2171    119.406      8.477      0.000     776.619    1247.815
gdp_per_capita     0.0130      0.003      4.267      0.000       0.007       0.019
==============================================================================
Omnibus:                       49.027   Durbin-Watson:                   1.526
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               82.585
Skew:                           1.388   Prob(JB):                     1.17e-18
Kurtosis:                       4.752   Cond. No.                     4.78e+04
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 4.78e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
No description has been provided for this image
In [58]:
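# GDP per capita shows a small but statistically significant positive relationship
# with COVID deaths per million (coefficient = 0.0130, i.e. roughly 13 more deaths
# per million for every additional $1,000 of GDP per capita)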

# R^2 = 0.091
# This means GDP per capita explains about 9.1% of the variation
# in deaths per million across countries

# The coefficient is statistically significant (p < 0.001)
# meaning there is evidence of a relationship between wealth per person
# and mortality rates during the pandemic

# However, the overall explanatory power of the model is still low
# which means GDP per capita alone does not explain most of the
# differences in death rates between countries

# Other factors likely played a much larger role, such as:
# healthcare system capacity
# vaccination availability
# population age structure
# government response policies
# timing of pandemic waves

# The residual plot shows a curved LOWESS line instead of a flat one
# suggesting the relationship between GDP per capita and deaths per million
# may not be perfectly linear

# We also see wide variation in residual values,
# meaning countries with similar GDP per capita can still have very different mortality outcomes

# The Q-Q plot shows an S-shaped curve rather than following the straight reference line
# indicating the residuals are not perfectly normally distributed

# This pattern suggests the presence of outliers and heavy-tailed data,
# which is common in global datasets where some countries experienced
# unusually high or unusually low mortality rates

# Overall, GDP per capita has a statistically significant relationship
# with deaths per million, but it explains only a small portion
# of the differences in pandemic mortality between countries
In [60]:
# Final Conclusion


# This analysis examined country-level COVID-19 outcomes alongside demographic and economic indicators. 
# The exploratory data analysis showed that most numerical variables were strongly right-skewed and contained 
# several outliers, reflecting major differences in country size, economic scale, and pandemic impact.

# The visualization section showed that total case counts and total deaths were concentrated in a small 
# number of large countries, while population-adjusted measures such as cases per million and deaths 
# per million revealed a different set of countries with relatively high pandemic burden. GDP and GDP 
# per capita also showed substantial inequality across countries.

# The inferential analysis found that total cases and total deaths were strongly associated with broader measures 
# of country scale, especially GDP and, to a lesser extent, population. GDP per capita was more informative 
# for population-adjusted case rates than for total counts. Several regression models were statistically 
# significant, but many also showed low explanatory power, non-normal residuals, and signs of heteroskedasticity. 
# This suggests that while population and economic indicators help explain some variation in COVID-19 outcomes, 
# they do not fully capture the complexity of pandemic patterns across countries.

# Higher-income countries often reported more cases per million.
# That doesn’t necessarily mean the virus spread more there — it can mean:
# better testing
# better reporting
# more transparent health systems
# Lower-income countries often under-detect infections.
# So the relationship is partly measurement/reporting effects, not just true spread.
# Overall, the results show that country size, economic scale, and income level are related to COVID-19 outcomes, 
# but other factors such as healthcare capacity, policy response, testing availability, 
# and outbreak timing also played major roles.

# The analysis shows that larger and more economically active countries tended to report more total 
# COVID-19 cases and deaths, but the reasons are more complex than population or wealth alone.
# When we looked at population, it did help explain why some countries had more total cases and deaths 
# than others, but it only explained a small portion of the differences between countries. 
# This tells us that population size matters, but it does not determine pandemic outcomes by itself.

# Overall, the results suggest that country size and economic scale help explain some of the differences 
# in COVID-19 outcomes, but they do not tell the full story. The spread and impact of the pandemic were 
# shaped by a combination of population size, mobility, healthcare systems, government response, 
# and timing of outbreaks across different regions.
# Big countries had more total cases and deaths, richer countries reported more cases per person, 
# but no single factor fully explains how COVID affected different countries.
In [ ]:
# Extra Credit
In [ ]:
 
In [84]:
# Country-name fixes for map matching.
# Keys are the spellings found in the dataset's "country" column (abbreviations
# and hyphenated names); values are the full names to use when matching.
COUNTRY_NAME_FIXES = {
    "USA": "United States",
    "UK": "United Kingdom",
    "Russia": "Russian Federation",
    "Bosnia-and-Herzegovina": "Bosnia and Herzegovina",
    "North-Macedonia": "North Macedonia",
    "South-Africa": "South Africa",
    "New-Zealand": "New Zealand",
    "San-Marino": "San Marino"
}

# Derive map_df from df instead of mutating a pre-existing map_df: at this point
# in the notebook no earlier cell has created map_df, so the in-place version
# raises NameError on Restart Kernel -> Run All. Building a new frame also keeps
# the cell idempotent (re-running it always yields the same result).
map_df = df.assign(country=df["country"].replace(COUNTRY_NAME_FIXES))
In [65]:
import plotly.express as px

# Interactive world choropleth with a dropdown that switches the colored metric.

# Country spellings in the dataset -> full names used for map matching.
# Applied here because map_df is rebuilt from df in this cell; fixes applied to
# map_df anywhere else are discarded as soon as this cell runs.
country_name_fixes = {
    "USA": "United States",
    "UK": "United Kingdom",
    "Russia": "Russian Federation",
    "Bosnia-and-Herzegovina": "Bosnia and Herzegovina",
    "North-Macedonia": "North Macedonia",
    "South-Africa": "South Africa",
    "New-Zealand": "New Zealand",
    "San-Marino": "San Marino"
}

# Clean copy: rename() returns a new frame, so df itself is left untouched
map_df = (
    df
    .rename(columns={
        "GDP ( USD billions)": "gdp_usd_billions",
        "GDP Per Capita(USD)": "gdp_per_capita"
    })
    .assign(country=lambda frame: frame["country"].replace(country_name_fixes))
)

# Metrics dropdown: button label -> column that supplies the color values
map_metrics = {
    "Total COVID Cases": "total_cases",
    "Cases per Million": "cases_per_million",
    "Total Deaths": "total_deaths",
    "Deaths per Million": "deaths_per_million",
    "GDP (USD Billions)": "gdp_usd_billions",
    "GDP per Capita (USD)": "gdp_per_capita"
}

# Swag color palette (low -> high values)
swag_colors = [
    "#b8f2e6",
    "#5dd9c1",
    "#00bcd4",
    "#4ea8de",
    "#c1121f",
    "#780000"
]

# Initial view is colored by total_cases; the dropdown below swaps the metric
fig = px.choropleth(
    map_df,
    locations="country",
    locationmode="country names",
    color="total_cases",
    hover_name="country",
    hover_data={
        "continent": True,
        "population": ":,",
        "total_cases": ":,",
        "cases_per_million": ":,.0f",
        "total_deaths": ":,",
        "deaths_per_million": ":,.0f",
        "gdp_usd_billions": ":,.1f",
        "gdp_per_capita": ":,.0f"
    },
    color_continuous_scale=swag_colors,
    projection="natural earth",
    title=" Global COVID-19 and Economic Indicators"
)

# Dropdown buttons: each one replaces the trace's z values (first args entry)
# and the figure title (second args entry). The title is addressed through
# "title.text" because the bare-string `title` shorthand is deprecated in
# Plotly's layout reference.
buttons = [
    dict(
        method="update",
        label=label,
        args=[
            {"z": [map_df[metric].tolist()]},
            {"title.text": f" Global Map: {label}"}
        ]
    )
    for label, metric in map_metrics.items()
]

fig.update_layout(
    template="plotly_dark",
    width=1150,
    height=700,
    margin=dict(l=20, r=20, t=80, b=20),

    updatemenus=[
        dict(
            buttons=buttons,
            direction="down",
            showactive=True,
            x=0.02,
            y=1.08,
            xanchor="left",
            yanchor="top",
            bgcolor="#1e1e1e",
            bordercolor="white",
            font=dict(size=12)
        )
    ],

    # Generic colorbar label, since the displayed metric changes with the dropdown
    coloraxis_colorbar=dict(
        title="Metric Value",
        thickness=20,
        len=0.7
    ),

    geo=dict(
        showframe=False,
        showcoastlines=True,
        coastlinecolor="white",
        showocean=True,
        oceancolor="#0f1c2e",
        showland=True,
        landcolor="#1c1c1c"
    )
)

fig.show()
No description has been provided for this image
In [67]:
import plotly.express as px

# Bubble map: marker size = total COVID cases, marker color = GDP per capita.

# Clean copy: rename() returns a new frame, so df itself is left untouched
bubble_df = df.rename(columns={
    "GDP ( USD billions)": "gdp_usd_billions",
    "GDP Per Capita(USD)": "gdp_per_capita"
})


# Swaggy alive palette (low -> high values)
bubble__swag_colors = [
    "#b8f2e6",
    "#5dd9c1",
    "#00bcd4",
    "#4ea8de",
    "#c1121f",
    "#780000"
]

fig = px.scatter_geo(
    bubble_df,
    locations="country",
    locationmode="country names",
    size="total_cases",
    color="gdp_per_capita",
    hover_name="country",
    hover_data={
        "continent": True,
        "population": ":,",
        "total_cases": ":,",
        "cases_per_million": ":,.0f",
        "total_deaths": ":,",
        "deaths_per_million": ":,.0f",
        "gdp_usd_billions": ":,.1f",
        "gdp_per_capita": ":,.0f"
    },
    projection="natural earth",
    size_max=45,
    # Pass the palette defined in this cell; the previous name `bubble_colors`
    # is not defined anywhere in this cell and raises NameError on a fresh kernel.
    color_continuous_scale=bubble__swag_colors,
    title=" Global Bubble Map: COVID-19 Burden and Economic Prosperity"
)

fig.update_layout(
    width=1200,
    height=700,
    title_x=0.5,
    title_font_size=22,
    margin=dict(l=20, r=20, t=70, b=20),

    geo=dict(
        showframe=False,
        showcoastlines=True,
        coastlinecolor="white",
        showocean=True,
        oceancolor="#4ea8de",
        showland=True,
        landcolor="#e9ecef",
        bgcolor="#4ea8de"
    ),

    coloraxis_colorbar=dict(
        title="GDP per Capita",
        thickness=18,
        len=0.7
    ),

    hoverlabel=dict(
        bgcolor="white",
        font_size=12
    )
)

fig.show()
No description has been provided for this image