In [1]:
# COVID-19 Global Statistics Analysis
## Exploratory Data Analysis, Data Preparation, and Inferential Analysis
#Name: Thomas Hollingshead
#Course: MSBA 320
#Project: COVID-19 Global Statistics with IMF GDP Data
In [2]:
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
#ignore seaborn warnings
warnings.filterwarnings("ignore")
# statsmodels and scipy will be useful later for regression/inferential analysis
import statsmodels.api as sm
import scipy.stats as stats
# Display options for cleaner notebook output
pd.set_option("display.max_columns", None)
pd.set_option("display.float_format", "{:,.2f}".format)
# Plot style
sns.set_theme()
In [3]:
## 1. Data Read / Loading
# In this section, the dataset is loaded into a pandas DataFrame.
# After loading, the data is inspected to understand its structure, variable types,
# and overall contents.
In [4]:
# 2. Load the Dataset
# The CSV is referenced by a relative path, so it must sit in the notebook's working directory.
file_path = "covid19_global_statistics_2026_with_imf_gdp.csv"
# Read the CSV file into a pandas DataFrame; `df` is the frame used by the cells that follow
df = pd.read_csv(file_path)
# Display the first few rows as a quick sanity check of columns and values
df.head()
Out[4]:
| continent | country | population | cases_per_million | total_cases | deaths_per_million | total_deaths | GDP ( USD billions) | GDP Per Capita(USD) | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Africa | Niger | 26083660 | 381 | 9931 | 12 | 312 | 26.11 | 864.48 |
| 1 | Asia | China | 1448471400 | 347 | 503302 | 4 | 5272 | 20,650.75 | 14,730.31 |
| 2 | Africa | Liberia | 5305117 | 1525 | 8090 | 56 | 295 | 5.59 | 955.67 |
| 3 | Oceania | Nauru | 10903 | 494635 | 5393 | 92 | 1 | 0.18 | 14,958.98 |
| 4 | Africa | Comoros | 907419 | 10038 | 9109 | 177 | 161 | 1.77 | 1,904.28 |
In [5]:
# 3. Structure of the Data
# (rows, columns) of the loaded frame
print("Shape of dataset:", df.shape)
# Column labels, printed as a plain Python list
print("\nColumns:")
print(list(df.columns))
Shape of dataset: (184, 9) Columns: ['continent', 'country', 'population', 'cases_per_million', 'total_cases', 'deaths_per_million', 'total_deaths', 'GDP ( USD billions)', 'GDP Per Capita(USD)']
In [6]:
# View data types and non-null counts
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 184 entries, 0 to 183 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 continent 184 non-null object 1 country 184 non-null object 2 population 184 non-null int64 3 cases_per_million 184 non-null int64 4 total_cases 184 non-null int64 5 deaths_per_million 184 non-null int64 6 total_deaths 184 non-null int64 7 GDP ( USD billions) 184 non-null float64 8 GDP Per Capita(USD) 184 non-null float64 dtypes: float64(2), int64(5), object(2) memory usage: 13.1+ KB
In [7]:
# Structural Observations:
# The main categorical variables are "country" and "continent"
# The remaining columns are numerical and can be used for descriptive/inferential analysis
# This structure is appropriate for:
# country comparisons
# economic relationship analysis
# correlation and regression analysis
In [8]:
# 4. Check for Missing Values
# Per-column count of null entries, plus the grand total across the frame
missing_values = df.isna().sum()
total_missing = missing_values.sum()
print("Missing values by column:")
print(missing_values)
print("\nTotal missing values in dataset:", total_missing)
Missing values by column: continent 0 country 0 population 0 cases_per_million 0 total_cases 0 deaths_per_million 0 total_deaths 0 GDP ( USD billions) 0 GDP Per Capita(USD) 0 dtype: int64 Total missing values in dataset: 0
In [9]:
# 5. Summary Statistics
#numerical columns
df.describe()
Out[9]:
| population | cases_per_million | total_cases | deaths_per_million | total_deaths | GDP ( USD billions) | GDP Per Capita(USD) | |
|---|---|---|---|---|---|---|---|
| count | 184.00 | 184.00 | 184.00 | 184.00 | 184.00 | 184.00 | 184.00 |
| mean | 40,830,702.17 | 178,202.89 | 3,570,773.47 | 1,303.53 | 37,295.62 | 656.42 | 22,403.25 |
| std | 153,150,656.87 | 197,194.72 | 10,883,346.97 | 1,389.99 | 121,415.94 | 2,878.40 | 32,239.24 |
| min | 10,903.00 | 347.00 | 2,943.00 | 3.00 | 1.00 | 0.06 | 368.83 |
| 25% | 1,832,623.50 | 12,820.50 | 43,973.75 | 132.75 | 415.25 | 15.94 | 2,954.81 |
| 50% | 8,727,237.00 | 100,561.00 | 346,717.00 | 775.50 | 3,618.50 | 51.59 | 8,700.21 |
| 75% | 29,506,638.75 | 284,247.25 | 1,920,548.00 | 2,160.25 | 19,127.50 | 334.64 | 29,996.03 |
| max | 1,448,471,400.00 | 771,655.00 | 111,820,082.00 | 6,595.00 | 1,219,487.00 | 31,821.29 | 246,738.26 |
In [10]:
# Transposed for readability
df.describe().T
Out[10]:
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| population | 184.00 | 40,830,702.17 | 153,150,656.87 | 10,903.00 | 1,832,623.50 | 8,727,237.00 | 29,506,638.75 | 1,448,471,400.00 |
| cases_per_million | 184.00 | 178,202.89 | 197,194.72 | 347.00 | 12,820.50 | 100,561.00 | 284,247.25 | 771,655.00 |
| total_cases | 184.00 | 3,570,773.47 | 10,883,346.97 | 2,943.00 | 43,973.75 | 346,717.00 | 1,920,548.00 | 111,820,082.00 |
| deaths_per_million | 184.00 | 1,303.53 | 1,389.99 | 3.00 | 132.75 | 775.50 | 2,160.25 | 6,595.00 |
| total_deaths | 184.00 | 37,295.62 | 121,415.94 | 1.00 | 415.25 | 3,618.50 | 19,127.50 | 1,219,487.00 |
| GDP ( USD billions) | 184.00 | 656.42 | 2,878.40 | 0.06 | 15.94 | 51.59 | 334.64 | 31,821.29 |
| GDP Per Capita(USD) | 184.00 | 22,403.25 | 32,239.24 | 368.83 | 2,954.81 | 8,700.21 | 29,996.03 | 246,738.26 |
In [11]:
# Key descriptive Statistics for Numerical Variables
# Keeps only mean, median (the "50%" row of describe()), standard deviation, min and max,
# with one row per numerical variable after the transpose.
df.describe().T[['mean','50%','std','min','max']]
Out[11]:
| mean | 50% | std | min | max | |
|---|---|---|---|---|---|
| population | 40,830,702.17 | 8,727,237.00 | 153,150,656.87 | 10,903.00 | 1,448,471,400.00 |
| cases_per_million | 178,202.89 | 100,561.00 | 197,194.72 | 347.00 | 771,655.00 |
| total_cases | 3,570,773.47 | 346,717.00 | 10,883,346.97 | 2,943.00 | 111,820,082.00 |
| deaths_per_million | 1,303.53 | 775.50 | 1,389.99 | 3.00 | 6,595.00 |
| total_deaths | 37,295.62 | 3,618.50 | 121,415.94 | 1.00 | 1,219,487.00 |
| GDP ( USD billions) | 656.42 | 51.59 | 2,878.40 | 0.06 | 31,821.29 |
| GDP Per Capita(USD) | 22,403.25 | 8,700.21 | 32,239.24 | 368.83 | 246,738.26 |
In [ ]:
In [12]:
# 6. Select Numerical Columns
# Names of every numeric-dtype column; reused by the plotting loops below
numeric_frame = df.select_dtypes(include=[np.number])
num_cols = list(numeric_frame.columns)
print("Numerical columns:", num_cols)
Numerical columns: ['population', 'cases_per_million', 'total_cases', 'deaths_per_million', 'total_deaths', 'GDP ( USD billions)', 'GDP Per Capita(USD)']
In [13]:
# 7. Histograms for numerical columns
# One figure per numerical column: a 30-bin histogram with a KDE curve overlaid
for col in num_cols:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.histplot(df[col], bins=30, kde=True, ax=ax)
    ax.set_title(f"Distribution of {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Frequency")
    fig.tight_layout()
    plt.show()
In [14]:
# 8. Boxplots for detecting outliers
# One wide, short figure per numerical column so the whiskers and outlier points are easy to read
for col in num_cols:
    fig, ax = plt.subplots(figsize=(8, 3))
    sns.boxplot(x=df[col], ax=ax)
    ax.set_title(f"Boxplot of {col}")
    ax.set_xlabel(col)
    fig.tight_layout()
    plt.show()
In [15]:
### Distribution Observations
# Based on the histograms and boxplots generated above, several clear patterns emerge across the numerical variables.
# Population
# The population distribution is extremely right skewed, with most countries having relatively small populations and a few countries having very large populations.
# The boxplot confirms the presence of multiple extreme outliers, representing highly populated countries such as those exceeding one billion people.
# Because population varies so dramatically between countries, it is expected that total case and death counts will also vary widely.
# Cases per Million
# The distribution of cases per million is right skewed but more spread out than population.
# Most countries fall within the lower range of cases per million, while a smaller number of countries exhibit much higher infection rates.
# The boxplot shows several high-end outliers, indicating that some countries experienced significantly higher infection rates relative to their population size.
# Total Cases
# Total COVID-19 case counts are highly right-skewed, with most countries reporting relatively low total case numbers compared to a small group of countries with extremely large totals.
# The boxplot highlights several large outliers, reflecting countries with massive outbreaks or very large populations.
# This confirms that raw totals are heavily influenced by population size.
# Deaths per Million
# Deaths per million also show a right skewed distribution, although the spread is somewhat more gradual than total death counts.
# Most countries have relatively low deaths per million, while a smaller group shows significantly higher mortality rates.
# The boxplot indicates a few extreme values above 5000 deaths per million, suggesting substantial variation in pandemic severity and health system outcomes across countries.
# Total Deaths
# Similar to total cases, the distribution of total deaths is heavily right skewed.
# The majority of countries have relatively low death totals, while a small number of countries exhibit very high counts.
# The boxplot reveals numerous high-value outliers, again indicating that population size and pandemic scale differ greatly across countries.
# GDP (USD billions)
# GDP values are strongly right-skewed, with most countries having relatively small economies and a few countries having extremely large economies.
# The boxplot shows clear outliers representing the largest global economies.
# This distribution is typical in global economic datasets where a few countries dominate total economic output.
# GDP per Capita (USD)
# GDP per capita also shows right skewness, though less extreme than total GDP.
# Most countries fall within a lower-to-middle income range, while a smaller number of countries have very high per-capita income levels.
# The boxplot indicates several high-income outliers, suggesting strong economic inequality between countries.
# Overall Observation
# Nearly all numerical variables exhibit right-skewed distributions and visible outliers.
# These patterns are expected in global datasets where country sizes, economic power, and pandemic impacts vary widely.
# Because of these skewed distributions, relationships between variables will be further explored using correlation analysis and regression models in the inferential analysis section.
In [16]:
# 9. Countries by Continent
# Bars are ordered from the continent with the most countries to the one with the fewest
continent_order = df["continent"].value_counts().index
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x="continent", order=continent_order, ax=ax)
ax.set_title("Number of Countries in Dataset by Continent")
ax.set_xlabel("Continent")
ax.set_ylabel("Count")
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()
In [17]:
# 10. Reusable Plot function for Section B
def plot_bar(data, x, y, title, xlabel, ylabel, figsize=(12, 8)):
    """
    Draw a single seaborn bar plot on a new figure and show it.

    Passing the numeric column as `x` and the country column as `y` gives
    horizontal bars, which are easier to compare across countries.

    Parameters
    ----------
    data : DataFrame holding the columns named by `x` and `y`.
    x, y : column names forwarded to `sns.barplot`.
    title, xlabel, ylabel : text for the figure title and axis labels.
    figsize : (width, height) of the figure in inches.

    Returns
    -------
    None. The figure is displayed with `plt.show()`.
    """
    fig, ax = plt.subplots(figsize=figsize)
    sns.barplot(data=data, x=x, y=y, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    fig.tight_layout()
    plt.show()
In [18]:
# B - Data Visualization
# In this section, we visualize COVID-19 outcomes and economic indicators
# and compare them across countries
# Variables to be analyzed:
# Total COVID-19 Cases
# Total Deaths
# Deaths per million
# GDP (USD Billions)
# GDP per capita (USD)
# Bar plots are used to compare countries, and the data is sorted so that patterns and extreme values
# can be more easily identified.
In [19]:
# Total COVID-19 Cases by Country (Top 20)
# Styling
sns.set_style("whitegrid")
# Keep only the 20 countries with the largest total case counts, largest first
df_cases = df.sort_values(by="total_cases", ascending=False).head(20)
plt.figure(figsize=(10,8))
# Draw the bars exactly once and keep the Axes handle for the value labels;
# a second barplot call on the same figure would overplot an identical set of bars.
ax = sns.barplot(data=df_cases, x="total_cases", y="country", palette="Blues_r")
# Labels for the bars: the enumerate index is the bar's y position, the value its x position
for i, v in enumerate(df_cases["total_cases"]):
    ax.text(v, i, f"{int(v):,}", va='center')
plt.title("Top 20 Countries by Total COVID-19 Cases")
plt.xlabel("Total Cases")
plt.ylabel("Country")
plt.tight_layout()
plt.show()
In [20]:
### Observation
# The distribution of total COVID-19 cases is highly concentrated among a small number of countries.
# The United States reports the highest total case count at over 111 million cases,
# followed by India with approximately 45 million cases.
# There is a significant drop after the top two countries, with the remaining countries showing progressively lower totals.
# This pattern reflects both the influence of large population sizes and differences in pandemic spread across countries.
# Large and densely populated countries tend to dominate the upper end of total case counts.
In [21]:
# Cases Per Million by Country (Top 20)
# The 20 countries with the highest population-adjusted case rate, highest first
df_cases_pm = df.sort_values("cases_per_million", ascending=False).iloc[:20]
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=df_cases_pm, x="cases_per_million", y="country", palette="Oranges_r", ax=ax)
# Value label at the tip of each bar (row index = bar position on the y axis)
for row, value in enumerate(df_cases_pm["cases_per_million"]):
    ax.text(value, row, f"{int(value):,}", va="center")
ax.set_title("Top 20 Countries by COVID-19 Cases per Million")
ax.set_xlabel("Cases per Million")
ax.set_ylabel("Country")
fig.tight_layout()
plt.show()
In [22]:
### Observation
# When adjusting for population size, the countries with the highest infection rates differ significantly from those with the highest
# total cases. Smaller countries and microstates, such as Brunei, San Marino, and Austria, appear
# among the highest cases per million.
# This indicates that although some countries may have relatively small populations and lower total case counts,
# a large proportion of their population experienced infection. Cases per million therefore provide a more comparable measure
# of pandemic intensity across countries.
In [23]:
# Total Deaths by Country (Top 20)
# The 20 countries with the largest absolute death counts, largest first
df_deaths = df.sort_values("total_deaths", ascending=False).iloc[:20]
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=df_deaths, x="total_deaths", y="country", palette="Reds_r", ax=ax)
# Value label at the tip of each bar (row index = bar position on the y axis)
for row, value in enumerate(df_deaths["total_deaths"]):
    ax.text(value, row, f"{int(value):,}", va="center")
ax.set_title("Top 20 Countries by Total COVID-19 Deaths")
ax.set_xlabel("Total Deaths")
ax.set_ylabel("Country")
fig.tight_layout()
plt.show()
In [24]:
### Observation
# Total COVID-19 deaths are also concentrated among a small number of countries. The United States reports the highest number
# of deaths, exceeding 1.2 million, followed by Brazil and India.
# Similar to total case counts, countries with large populations and widespread outbreaks tend to have the highest
# death totals. However, differences between countries may also reflect variations in healthcare capacity,
# public health policies, and demographic factors.
In [25]:
# Deaths per Million by Country (Top 20)
# The 20 countries with the highest population-adjusted death rate, highest first
df_deaths_pm = df.sort_values("deaths_per_million", ascending=False).iloc[:20]
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=df_deaths_pm, x="deaths_per_million", y="country", palette="Purples_r", ax=ax)
# Value label at the tip of each bar (row index = bar position on the y axis)
for row, value in enumerate(df_deaths_pm["deaths_per_million"]):
    ax.text(value, row, f"{int(value):,}", va="center")
ax.set_title("Top 20 Countries by COVID-19 Deaths per Million")
ax.set_xlabel("Deaths per Million")
ax.set_ylabel("Country")
fig.tight_layout()
plt.show()
In [26]:
### Observation
# Deaths per million provide a population-adjusted measure of mortality and reveal a different pattern compared to total deaths.
# Countries such as Peru, Bulgaria, and Hungary report some of the highest death rates relative to their population size.
# This suggests that certain countries experienced disproportionately high mortality during the pandemic.
# Factors such as healthcare system capacity, population age structure, and government response measures may contribute to
# these differences.
In [27]:
# GDP per Country (Top 20)
# The 20 largest economies by total GDP, largest first
df_gdp = df.sort_values("GDP ( USD billions)", ascending=False).iloc[:20]
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=df_gdp, x="GDP ( USD billions)", y="country", palette="Greens_r", ax=ax)
# Value label at the tip of each bar, rounded to whole billions
for row, value in enumerate(df_gdp["GDP ( USD billions)"]):
    ax.text(value, row, f"{value:,.0f}", va="center")
ax.set_title("Top 20 Countries by GDP (USD Billions)")
ax.set_xlabel("GDP (USD Billions)")
ax.set_ylabel("Country")
fig.tight_layout()
plt.show()
In [28]:
### Observation
# The GDP distribution highlights the dominance of a few large global economies. The United States and China have substantially
# larger economies than all other countries, followed by Germany, India, and Japan.
# There is a sharp decline in GDP values after the top economies, illustrating the unequal distribution of economic output worldwide.
# Most countries operate on a significantly smaller economic scale compared to the largest global economies.
In [29]:
# GDP per Capita by country (Top 20)
# The 20 countries with the highest GDP per capita, highest first
df_gdp_pc = df.sort_values("GDP Per Capita(USD)", ascending=False).iloc[:20]
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=df_gdp_pc, x="GDP Per Capita(USD)", y="country", palette="BuGn_r", ax=ax)
# Value label at the tip of each bar, truncated to whole dollars
for row, value in enumerate(df_gdp_pc["GDP Per Capita(USD)"]):
    ax.text(value, row, f"{int(value):,}", va="center")
ax.set_title("Top 20 Countries by GDP per Capita")
ax.set_xlabel("GDP per Capita (USD)")
ax.set_ylabel("Country")
fig.tight_layout()
plt.show()
In [30]:
### Observation
# GDP per capita highlights differences in economic prosperity across countries.
# Smaller, high-income economies such as Liechtenstein and Luxembourg report the highest GDP per capita values.
# In contrast to total GDP, this measure emphasizes wealth per individual rather than total economic output.
# Several smaller countries appear at the top of this ranking, indicating that high income levels are not necessarily associated with large economies.
In [31]:
### Section Summary
# The visualizations reveal significant differences across countries in both pandemic outcomes and economic indicators.
# Total case and death counts tend to be highest in large and populous countries, while population-adjusted measures such
# as cases per million and deaths per million highlight the relative severity of the pandemic across countries.
# Economic indicators also show substantial disparities, with a small number of countries dominating global GDP
# while smaller high-income countries lead in GDP per capita. These patterns suggest that population size and economic
# conditions may influence pandemic outcomes, which will be explored further through correlation and regression analysis
# in the next section.
In [32]:
# C Inferential Analysis
# We will conduct inferential analysis to examine the relationships between COVID-19 outcomes and country-level demographic
# and economic indicators.
# Correlation analysis among numerical variables
# Simple Linear regression for the required variable pairs
# Diagnostic plots for regression models
# Interpretation of statistical significance / behavior
In [33]:
# Correlation Analysis
# Restrict to numeric columns, then compute the pairwise correlation matrix
# (DataFrame.corr() with its default method)
num_df = df.select_dtypes(include="number")
corr_matrix = num_df.corr()
# Show the matrix as the cell output
corr_matrix
Out[33]:
| population | cases_per_million | total_cases | deaths_per_million | total_deaths | GDP ( USD billions) | GDP Per Capita(USD) | |
|---|---|---|---|---|---|---|---|
| population | 1.00 | -0.12 | 0.37 | -0.08 | 0.40 | 0.58 | -0.05 |
| cases_per_million | -0.12 | 1.00 | 0.20 | 0.52 | 0.07 | 0.07 | 0.67 |
| total_cases | 0.37 | 0.20 | 1.00 | 0.24 | 0.90 | 0.77 | 0.22 |
| deaths_per_million | -0.08 | 0.52 | 0.24 | 1.00 | 0.30 | 0.12 | 0.30 |
| total_deaths | 0.40 | 0.07 | 0.90 | 0.30 | 1.00 | 0.71 | 0.12 |
| GDP ( USD billions) | 0.58 | 0.07 | 0.77 | 0.12 | 0.71 | 1.00 | 0.19 |
| GDP Per Capita(USD) | -0.05 | 0.67 | 0.22 | 0.30 | 0.12 | 0.19 | 1.00 |
In [34]:
# Correlation Heatmap
sns.set_theme(style="white", font_scale=1.1)
# Diverging palette centred on zero so positive and negative correlations get opposite colours;
# every cell is annotated with the coefficient to two decimals, in bold.
heatmap_options = {
    "annot": True,
    "cmap": "icefire",
    "center": 0,
    "fmt": ".2f",
    "linewidths": 0.8,
    "square": True,
    "annot_kws": {"weight": "bold"},
}
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, ax=ax, **heatmap_options)
ax.set_title("Correlation Matrix of COVID-19 Metrics and Economic Indicators", fontsize=15, weight="bold")
plt.xticks(rotation=45)
plt.yticks(rotation=0)
fig.tight_layout()
plt.show()
In [35]:
### Correlation Analysis Observation
# The correlation matrix reveals several meaningful relationships between the COVID-19 metrics and economic indicators in the dataset.
# The strongest relationship appears between total cases and total deaths (0.90), indicating a very strong positive association.
# This is expected, as countries with higher infection counts generally experience higher numbers of deaths.
# There is also a strong positive correlation between GDP and total cases (0.77) and between GDP and total deaths (0.71).
# This likely reflects the fact that larger economies often correspond to larger populations, more urbanization,
# and greater international connectivity, all of which can influence the scale of pandemic spread.
# Population shows moderate positive correlations with total cases (0.37) and total deaths (0.40),
# suggesting that population size contributes to overall case and death counts, though it is not the sole determining factor.
# For the population-adjusted metrics, GDP per capita shows a relatively strong positive correlation with cases per
# million (0.67) and a moderate correlation with deaths per million (0.30). This suggests that wealthier countries
# may report higher infection rates per capita, potentially due to differences in testing capacity, healthcare reporting systems,
# or population mobility.
# Additionally, cases per million and deaths per million show a moderate correlation (0.52), indicating that countries
# with higher infection rates per capita tend to also experience higher mortality rates per capita.
# Overall, the correlation results suggest that total pandemic impact is more strongly associated with economic
# scale and total case counts, while per-capita measures reveal different patterns related to economic development and healthcare
# reporting differences. These relationships will be further examined using regression analysis in the following section.
In [36]:
# Making a copy of the dataset that's regression friendly
# The two GDP column labels contain spaces and parentheses, so they are mapped to
# plain snake_case names; `df` itself is left unchanged (rename returns a new frame).
regression_column_names = {
    "GDP ( USD billions)": "gdp_usd_billions",
    "GDP Per Capita(USD)": "gdp_per_capita",
}
reg_df = df.rename(columns=regression_column_names)
reg_df.head()
Out[36]:
| continent | country | population | cases_per_million | total_cases | deaths_per_million | total_deaths | gdp_usd_billions | gdp_per_capita | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Africa | Niger | 26083660 | 381 | 9931 | 12 | 312 | 26.11 | 864.48 |
| 1 | Asia | China | 1448471400 | 347 | 503302 | 4 | 5272 | 20,650.75 | 14,730.31 |
| 2 | Africa | Liberia | 5305117 | 1525 | 8090 | 56 | 295 | 5.59 | 955.67 |
| 3 | Oceania | Nauru | 10903 | 494635 | 5393 | 92 | 1 | 0.18 | 14,958.98 |
| 4 | Africa | Comoros | 907419 | 10038 | 9109 | 177 | 161 | 1.77 | 1,904.28 |
In [37]:
# Regression Function
import statsmodels.formula.api as smf


def _formula_term(data, name):
    """Return `name` as a formula term, wrapping it in patsy's Q() when it is a column of
    `data` whose label is not a valid Python identifier (e.g. contains spaces or parentheses)."""
    if isinstance(name, str) and name in data and not name.isidentifier():
        # repr() yields a correctly escaped Python string literal for Q(...)
        return f"Q({name!r})"
    # Identifiers and formula expressions are passed through untouched
    return name


def run_regression(data, x_var, y_var):
    """
    Fit a simple linear regression of `y_var` on `x_var` and display diagnostics.

    Displays, in order:
      * the statsmodels OLS regression summary
      * a residuals-vs-fitted plot with a LOWESS trend line
      * a normal Q-Q plot of the residuals

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both variables.
    x_var : str
        Name of the predictor column (or a formula expression).
    y_var : str
        Name of the response column (or a formula expression).

    Returns
    -------
    The fitted results object returned by `smf.ols(...).fit()`.
    """
    # Column labels that are not valid identifiers would break a raw formula string,
    # so they are quoted; ordinary names produce exactly "y_var ~ x_var".
    formula = f"{_formula_term(data, y_var)} ~ {_formula_term(data, x_var)}"
    model = smf.ols(formula=formula, data=data).fit()

    print("=" * 80)
    print(f"Regression Model: {y_var} ~ {x_var}")
    print("=" * 80)
    print(model.summary())

    # Create diagnostic plots
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    # Residual plot
    # NOTE: residplot regresses y on x again before plotting. For an OLS fit with an
    # intercept the residuals are uncorrelated with the fitted values, so the plotted
    # points should match model.resid here; this would not hold on a subset of the data.
    sns.residplot(x=model.fittedvalues, y=model.resid, lowess=True, ax=axes[0],
                  scatter_kws={"alpha": 0.7}, line_kws={"color": "red"})
    axes[0].set_title(f"Residual Plot: {y_var} ~ {x_var}")
    axes[0].set_xlabel("Fitted Values")
    axes[0].set_ylabel("Residuals")

    # Q-Q plot: residuals are standardised (fit=True) and compared against the 45-degree line
    sm.qqplot(model.resid, line="45", fit=True, ax=axes[1])
    axes[1].set_title(f"Normal Q-Q Plot: {y_var} ~ {x_var}")

    plt.tight_layout()
    plt.show()
    return model
model_1 = run_regression(reg_df, "population", "total_cases")
================================================================================
Regression Model: total_cases ~ population
================================================================================
OLS Regression Results
==============================================================================
Dep. Variable: total_cases R-squared: 0.135
Model: OLS Adj. R-squared: 0.131
Method: Least Squares F-statistic: 28.51
Date: Mon, 09 Mar 2026 Prob (F-statistic): 2.76e-07
Time: 02:20:34 Log-Likelihood: -3228.5
No. Observations: 184 AIC: 6461.
Df Residuals: 182 BIC: 6467.
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 2.503e+06 7.74e+05 3.232 0.001 9.75e+05 4.03e+06
population 0.0262 0.005 5.339 0.000 0.016 0.036
==============================================================================
Omnibus: 258.018 Durbin-Watson: 1.138
Prob(Omnibus): 0.000 Jarque-Bera (JB): 23916.398
Skew: 5.913 Prob(JB): 0.00
Kurtosis: 57.586 Cond. No. 1.64e+08
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.64e+08. This might indicate that there are
strong multicollinearity or other numerical problems.
In [38]:
# Redo Residual plot without extremes for better visual
fitted = model_1.fittedvalues
residuals = model_1.resid
# Hide the extreme right tail (top 2% of fitted values) for display purposes only;
# the model is not refit and the residuals shown are still those of model_1.
threshold = fitted.quantile(0.98)
mask = fitted < threshold
plt.figure(figsize=(8,6))
# sns.residplot regresses y on x and plots the residuals of THAT fit. On a trimmed
# subset this would no longer show model_1's residuals, so the residuals are plotted
# directly with regplot (scatter + LOWESS trend, no confidence band).
resid_ax = sns.regplot(
    x=fitted[mask],
    y=residuals[mask],
    lowess=True,
    ci=None,
    scatter_kws={"alpha":0.7},
    line_kws={"color":"red"}
)
# Zero reference line, as a residual plot conventionally shows
resid_ax.axhline(0, ls=":", c=".2")
plt.title("Residual Plot (Trimmed): Total Cases vs Population")
plt.xlabel("Fitted Values")
plt.ylabel("Residuals")
plt.tight_layout()
plt.show()
In [39]:
# Regression analysis
# Population does explain some variation in total cases, but isn't the whole story
# Larger countries show much more unpredictable case counts
# The model has increasing error as countries get larger
# Population matters - countries with larger populations, in general, have more COVID cases
# That's what we learn from the positive coefficient (0.0262)
# The relationship exists between larger populations and COVID cases, and is statistically significant.
# Population explains a small part of the story
# R^2 = 0.135
# Population only explains around 13.5% of the variance in total cases between countries
# The most variation comes from the other factors, like
# Testing availability
# Public health policy
# Population Density
# Travel Patterns
# Healthcare systems
# Timing of Outbreaks
# The residual plot shows prediction errors get bigger for larger countries
# It shows a fan shape
# This means the model is less reliable for large countries
# Small countries are clustered near the line
# Large countries have a huge unpredictable variation
# In Data analysis, this is called heteroskedasticity
# The Q-Q plot shows the assumptions are violated
# It curves upward instead of following the line
# This shows us the residuals are not normally distributed
# This is because the dataset has extreme outliers (large countries) and very skewed data
# Extremely common in global country-level datasets
# Although population is a statistically significant predictor of total COVID-19 cases, the relatively low explanatory power
# and diagnostic plots suggest that population alone does not adequately explain the variation in case counts across countries.
In [40]:
model_2 = run_regression(reg_df, "population", "total_deaths")
================================================================================
Regression Model: total_deaths ~ population
================================================================================
OLS Regression Results
==============================================================================
Dep. Variable: total_deaths R-squared: 0.159
Model: OLS Adj. R-squared: 0.155
Method: Least Squares F-statistic: 34.47
Date: Mon, 09 Mar 2026 Prob (F-statistic): 2.02e-08
Time: 02:20:34 Log-Likelihood: -2398.7
No. Observations: 184 AIC: 4801.
Df Residuals: 182 BIC: 4808.
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 2.438e+04 8518.969 2.862 0.005 7570.529 4.12e+04
population 0.0003 5.39e-05 5.871 0.000 0.000 0.000
==============================================================================
Omnibus: 259.145 Durbin-Watson: 1.529
Prob(Omnibus): 0.000 Jarque-Bera (JB): 24453.383
Skew: 5.953 Prob(JB): 0.00
Kurtosis: 58.207 Cond. No. 1.64e+08
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.64e+08. This might indicate that there are
strong multicollinearity or other numerical problems.
In [41]:
model_3 = run_regression(reg_df, "gdp_usd_billions", "total_cases")
================================================================================
Regression Model: total_cases ~ gdp_usd_billions
================================================================================
OLS Regression Results
==============================================================================
Dep. Variable: total_cases R-squared: 0.586
Model: OLS Adj. R-squared: 0.584
Method: Least Squares F-statistic: 257.6
Date: Mon, 09 Mar 2026 Prob (F-statistic): 1.09e-36
Time: 02:20:47 Log-Likelihood: -3160.8
No. Observations: 184 AIC: 6326.
Df Residuals: 182 BIC: 6332.
Df Model: 1
Covariance Type: nonrobust
====================================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------------
Intercept 1.671e+06 5.31e+05 3.146 0.002 6.23e+05 2.72e+06
gdp_usd_billions 2894.2937 180.344 16.049 0.000 2538.460 3250.128
==============================================================================
Omnibus: 131.271 Durbin-Watson: 0.882
Prob(Omnibus): 0.000 Jarque-Bera (JB): 9658.070
Skew: -1.790 Prob(JB): 0.00
Kurtosis: 38.312 Cond. No. 3.02e+03
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.02e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
In [42]:
model_4 = run_regression(reg_df, "gdp_per_capita", "total_cases")
================================================================================
Regression Model: total_cases ~ gdp_per_capita
================================================================================
OLS Regression Results
==============================================================================
Dep. Variable: total_cases R-squared: 0.047
Model: OLS Adj. R-squared: 0.042
Method: Least Squares F-statistic: 8.982
Date: Mon, 09 Mar 2026 Prob (F-statistic): 0.00311
Time: 02:20:53 Log-Likelihood: -3237.5
No. Observations: 184 AIC: 6479.
Df Residuals: 182 BIC: 6485.
Df Model: 1
Covariance Type: nonrobust
==================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------
Intercept 1.931e+06 9.57e+05 2.017 0.045 4.19e+04 3.82e+06
gdp_per_capita 73.2092 24.428 2.997 0.003 25.011 121.407
==============================================================================
Omnibus: 259.673 Durbin-Watson: 0.711
Prob(Omnibus): 0.000 Jarque-Bera (JB): 20019.139
Skew: 6.096 Prob(JB): 0.00
Kurtosis: 52.624 Cond. No. 4.78e+04
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 4.78e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
In [43]:
# Model 6: simple OLS with cases_per_million as the response and gdp_per_capita as the predictor.
# NOTE(review): this exact call is repeated in a later cell, which reassigns model_6;
# one of the two cells looks redundant.
model_6 = run_regression(reg_df, "gdp_per_capita", "cases_per_million")
================================================================================
Regression Model: cases_per_million ~ gdp_per_capita
================================================================================
OLS Regression Results
==============================================================================
Dep. Variable: cases_per_million R-squared: 0.443
Model: OLS Adj. R-squared: 0.440
Method: Least Squares F-statistic: 144.9
Date: Mon, 09 Mar 2026 Prob (F-statistic): 6.32e-25
Time: 02:21:01 Log-Likelihood: -2450.0
No. Observations: 184 AIC: 4904.
Df Residuals: 182 BIC: 4910.
Df Model: 1
Covariance Type: nonrobust
==================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------
Intercept 8.697e+04 1.33e+04 6.560 0.000 6.08e+04 1.13e+05
gdp_per_capita 4.0722 0.338 12.037 0.000 3.405 4.740
==============================================================================
Omnibus: 29.713 Durbin-Watson: 1.681
Prob(Omnibus): 0.000 Jarque-Bera (JB): 49.015
Skew: 0.846 Prob(JB): 2.27e-11
Kurtosis: 4.878 Cond. No. 4.78e+04
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 4.78e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
In [44]:
# Model 8: simple OLS with deaths_per_million as the response and gdp_per_capita as the predictor.
# NOTE(review): this exact call is repeated in a later cell, which reassigns model_8;
# one of the two cells looks redundant.
model_8 = run_regression(reg_df, "gdp_per_capita", "deaths_per_million")
================================================================================
Regression Model: deaths_per_million ~ gdp_per_capita
================================================================================
OLS Regression Results
==============================================================================
Dep. Variable: deaths_per_million R-squared: 0.091
Model: OLS Adj. R-squared: 0.086
Method: Least Squares F-statistic: 18.21
Date: Mon, 09 Mar 2026 Prob (F-statistic): 3.18e-05
Time: 02:21:13 Log-Likelihood: -1583.4
No. Observations: 184 AIC: 3171.
Df Residuals: 182 BIC: 3177.
Df Model: 1
Covariance Type: nonrobust
==================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------
Intercept 1012.2171 119.406 8.477 0.000 776.619 1247.815
gdp_per_capita 0.0130 0.003 4.267 0.000 0.007 0.019
==============================================================================
Omnibus: 49.027 Durbin-Watson: 1.526
Prob(Omnibus): 0.000 Jarque-Bera (JB): 82.585
Skew: 1.388 Prob(JB): 1.17e-18
Kurtosis: 4.752 Cond. No. 4.78e+04
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 4.78e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
In [47]:
# Model 2: simple OLS with total_deaths as the response and population as the predictor.
# (This cell fits only this one model; the remaining models are run in their own cells.)
model_2 = run_regression(reg_df, "population", "total_deaths")
================================================================================
Regression Model: total_deaths ~ population
================================================================================
OLS Regression Results
==============================================================================
Dep. Variable: total_deaths R-squared: 0.159
Model: OLS Adj. R-squared: 0.155
Method: Least Squares F-statistic: 34.47
Date: Mon, 09 Mar 2026 Prob (F-statistic): 2.02e-08
Time: 02:23:32 Log-Likelihood: -2398.7
No. Observations: 184 AIC: 4801.
Df Residuals: 182 BIC: 4808.
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 2.438e+04 8518.969 2.862 0.005 7570.529 4.12e+04
population 0.0003 5.39e-05 5.871 0.000 0.000 0.000
==============================================================================
Omnibus: 259.145 Durbin-Watson: 1.529
Prob(Omnibus): 0.000 Jarque-Bera (JB): 24453.383
Skew: 5.953 Prob(JB): 0.00
Kurtosis: 58.207 Cond. No. 1.64e+08
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.64e+08. This might indicate that there are
strong multicollinearity or other numerical problems.
In [50]:
# Population | Deaths
# Countries with larger populations tend to report more total COVID deaths
# R^2 = 0.159
# This means population explains only about 15.9% of the variation in total deaths between countries
# The population coefficient is positive and statistically significant (p < 0.001)
# This is probably because more populous countries often have:
# more people exposed to the virus
# more international travel
# more urban environments
# and stronger testing/reporting systems
# Population explains some of the differences in total deaths,
# but it still leaves a lot of variation unexplained
# The residual plot suggests that prediction errors grow
# as the predicted number of deaths increases
# The Q-Q plot shows some skewness in the residuals,
# meaning a few countries behave very differently from the rest
In [49]:
# Model 3: simple OLS with total_cases as the response and gdp_usd_billions as the predictor.
model_3 = run_regression(reg_df, "gdp_usd_billions", "total_cases")
================================================================================
Regression Model: total_cases ~ gdp_usd_billions
================================================================================
OLS Regression Results
==============================================================================
Dep. Variable: total_cases R-squared: 0.586
Model: OLS Adj. R-squared: 0.584
Method: Least Squares F-statistic: 257.6
Date: Mon, 09 Mar 2026 Prob (F-statistic): 1.09e-36
Time: 02:24:39 Log-Likelihood: -3160.8
No. Observations: 184 AIC: 6326.
Df Residuals: 182 BIC: 6332.
Df Model: 1
Covariance Type: nonrobust
====================================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------------
Intercept 1.671e+06 5.31e+05 3.146 0.002 6.23e+05 2.72e+06
gdp_usd_billions 2894.2937 180.344 16.049 0.000 2538.460 3250.128
==============================================================================
Omnibus: 131.271 Durbin-Watson: 0.882
Prob(Omnibus): 0.000 Jarque-Bera (JB): 9658.070
Skew: -1.790 Prob(JB): 0.00
Kurtosis: 38.312 Cond. No. 3.02e+03
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.02e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
In [ ]:
# Countries with larger economies tend to report more total COVID cases
# R^2 = 0.586
# This means GDP explains about 58.6% of the variation in total COVID cases between countries
# This is a much stronger relationship than population alone
# suggesting that economic scale is closely tied to the size of pandemic outbreaks
# Larger economies often have:
# Larger populations
# More international travel
# Higher levels of urbanization
# More developed testing and reporting systems
# All of these factors can contribute to higher reported case counts
# The GDP coefficient is statistically significant (p < 0.001)
# meaning the relationship between GDP and total cases is very unlikely to be due to random chance
# The residual plot still shows some spread for countries with very large GDP
# indicating the model becomes less precise for the largest economies
# The Q-Q plot shows noticeable deviations from the straight line
# which means the residuals are not perfectly normally distributed
# likely due to extreme values from very large countries
# Overall, GDP is a strong predictor of total COVID cases in this dataset
# but it still does not capture all of the factors influencing case counts across countries
In [51]:
# Model 4: simple OLS with total_cases as the response and gdp_per_capita as the predictor.
model_4 = run_regression(reg_df, "gdp_per_capita", "total_cases")
================================================================================
Regression Model: total_cases ~ gdp_per_capita
================================================================================
OLS Regression Results
==============================================================================
Dep. Variable: total_cases R-squared: 0.047
Model: OLS Adj. R-squared: 0.042
Method: Least Squares F-statistic: 8.982
Date: Mon, 09 Mar 2026 Prob (F-statistic): 0.00311
Time: 02:26:33 Log-Likelihood: -3237.5
No. Observations: 184 AIC: 6479.
Df Residuals: 182 BIC: 6485.
Df Model: 1
Covariance Type: nonrobust
==================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------
Intercept 1.931e+06 9.57e+05 2.017 0.045 4.19e+04 3.82e+06
gdp_per_capita 73.2092 24.428 2.997 0.003 25.011 121.407
==============================================================================
Omnibus: 259.673 Durbin-Watson: 0.711
Prob(Omnibus): 0.000 Jarque-Bera (JB): 20019.139
Skew: 6.096 Prob(JB): 0.00
Kurtosis: 52.624 Cond. No. 4.78e+04
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 4.78e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
In [52]:
# GDP per capita has a statistically significant relationship with total COVID cases
# but it explains very little of the differences between countries
# R^2 = 0.047
# This means GDP per capita explains only about 4.7% of the variation in total cases
# In other words, wealth per person does not do a good job predicting
# how many total cases a country experienced
# This makes sense because total cases are heavily influenced by
# how large a country is, not just how wealthy it is
# Large countries can have huge case counts even if they are not very wealthy,
# while small wealthy countries may still have relatively low total cases
# The GDP per capita coefficient is statistically significant (p ≈ 0.003),
# meaning the relationship likely exists, but it is very weak
# The residual plot shows a lot of spread around the prediction line,
# suggesting the model does not fit the data very well
# The Q-Q plot shows strong deviation from normality,
# indicating the presence of extreme outliers and skewed data
# Overall, GDP per capita alone is a poor predictor of total COVID case counts
# because population size and other pandemic-related factors play a much larger role
In [53]:
# Model 5: simple OLS with cases_per_million as the response and gdp_usd_billions as the predictor.
model_5 = run_regression(reg_df, "gdp_usd_billions", "cases_per_million")
================================================================================
Regression Model: cases_per_million ~ gdp_usd_billions
================================================================================
OLS Regression Results
==============================================================================
Dep. Variable: cases_per_million R-squared: 0.006
Model: OLS Adj. R-squared: 0.000
Method: Least Squares F-statistic: 1.027
Date: Mon, 09 Mar 2026 Prob (F-statistic): 0.312
Time: 02:27:20 Log-Likelihood: -2503.4
No. Observations: 184 AIC: 5011.
Df Residuals: 182 BIC: 5017.
Df Model: 1
Covariance Type: nonrobust
====================================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------------
Intercept 1.748e+05 1.49e+04 11.725 0.000 1.45e+05 2.04e+05
gdp_usd_billions 5.1313 5.064 1.013 0.312 -4.860 15.123
==============================================================================
Omnibus: 28.212 Durbin-Watson: 1.650
Prob(Omnibus): 0.000 Jarque-Bera (JB): 37.210
Skew: 1.099 Prob(JB): 8.32e-09
Kurtosis: 3.154 Cond. No. 3.02e+03
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.02e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
In [ ]:
# GDP does not appear to explain differences in COVID infection rates between countries
# R^2 = 0.006
# This means GDP explains less than 1% of the variation in cases per million
# In other words, the size of a country's economy tells us almost nothing
# about how widespread COVID infections were relative to population
# The GDP coefficient is not statistically significant (p ≈ 0.31)
# which suggests there is no meaningful linear relationship
# between GDP and cases per million in this dataset
# This makes sense because infection rates depend much more on factors like:
# government containment policies
# population density
# mobility and travel patterns
# vaccination rates
# and public behavior during the pandemic
# The residual plot shows a crooked trend line rather than a flat one
# which suggests the relationship between GDP and cases per million
# is not well described by a simple linear model
# We also see very large vertical spread in the residuals
# meaning countries with similar GDP values can have extremely different infection rates
# The Q-Q plot forms an S-shaped curve instead of following the straight reference line
# which indicates the residuals are not normally distributed
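# This S-shape usually means the residuals are skewed or heavy-tailed (here Skew = 1.099),
# typically because a few outlying countries are poorly fit by the line;
# it may also hint that important explanatory variables are missing from the model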
# Overall, this model shows that GDP alone is a poor predictor
# of how widely COVID spread within a country's population
In [54]:
# Model 6: simple OLS with cases_per_million as the response and gdp_per_capita as the predictor.
model_6 = run_regression(reg_df, "gdp_per_capita", "cases_per_million")
================================================================================
Regression Model: cases_per_million ~ gdp_per_capita
================================================================================
OLS Regression Results
==============================================================================
Dep. Variable: cases_per_million R-squared: 0.443
Model: OLS Adj. R-squared: 0.440
Method: Least Squares F-statistic: 144.9
Date: Mon, 09 Mar 2026 Prob (F-statistic): 6.32e-25
Time: 02:28:18 Log-Likelihood: -2450.0
No. Observations: 184 AIC: 4904.
Df Residuals: 182 BIC: 4910.
Df Model: 1
Covariance Type: nonrobust
==================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------
Intercept 8.697e+04 1.33e+04 6.560 0.000 6.08e+04 1.13e+05
gdp_per_capita 4.0722 0.338 12.037 0.000 3.405 4.740
==============================================================================
Omnibus: 29.713 Durbin-Watson: 1.681
Prob(Omnibus): 0.000 Jarque-Bera (JB): 49.015
Skew: 0.846 Prob(JB): 2.27e-11
Kurtosis: 4.878 Cond. No. 4.78e+04
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 4.78e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
In [55]:
# GDP per capita shows a much stronger relationship with cases per million
# than total GDP did in the previous model
# R^2 = 0.443
# This means GDP per capita explains about 44.3% of the variation
# in COVID cases per million between countries
# The coefficient is positive and statistically significant (p < 0.001)
# meaning countries with higher income per person tend to report
# higher infection rates per million people
# One likely explanation is that wealthier countries tend to have:
# stronger testing systems
# more complete reporting of infections
# higher international travel and mobility
# more urbanized populations
# The residual plot shows a curved LOWESS line instead of a flat one
# which suggests the relationship may not be perfectly linear
# In particular, the model appears to slightly overpredict or underpredict
# cases at certain GDP levels, meaning the true relationship may be more complex
# The Q-Q plot again shows an S-shaped pattern
# indicating that the residuals are not perfectly normally distributed
# This usually happens when the data contains outliers or heavy tails,
# which is common in global country-level datasets
# Overall, GDP per capita appears to be a meaningful predictor
# of infection rates across countries, but it still does not capture
# all of the factors influencing COVID spread
In [56]:
# Model 7: simple OLS with total_deaths as the response and gdp_per_capita as the predictor.
model_7 = run_regression(reg_df, "gdp_per_capita", "total_deaths")
================================================================================
Regression Model: total_deaths ~ gdp_per_capita
================================================================================
OLS Regression Results
==============================================================================
Dep. Variable: total_deaths R-squared: 0.014
Model: OLS Adj. R-squared: 0.008
Method: Least Squares F-statistic: 2.494
Date: Mon, 09 Mar 2026 Prob (F-statistic): 0.116
Time: 02:29:05 Log-Likelihood: -2413.4
No. Observations: 184 AIC: 4831.
Df Residuals: 182 BIC: 4837.
Df Model: 1
Covariance Type: nonrobust
==================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------
Intercept 2.749e+04 1.09e+04 2.530 0.012 6046.846 4.89e+04
gdp_per_capita 0.4379 0.277 1.579 0.116 -0.109 0.985
==============================================================================
Omnibus: 267.515 Durbin-Watson: 1.136
Prob(Omnibus): 0.000 Jarque-Bera (JB): 20965.904
Skew: 6.447 Prob(JB): 0.00
Kurtosis: 53.680 Cond. No. 4.78e+04
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 4.78e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
In [ ]:
# GDP per capita does not appear to explain differences in total COVID deaths
# R^2 = 0.014
# This means GDP per capita explains only about 1.4% of the variation
# in total deaths between countries
# The coefficient is not statistically significant (p ≈ 0.116)
# which means we cannot conclude there is a meaningful relationship
# between GDP per capita and total deaths in this dataset
# This result makes sense because total death counts depend heavily
# on the size of a country's population rather than how wealthy it is
# Large countries can have very high death counts even if they are not very wealthy,
# while smaller wealthy countries may still have relatively low totals
# The residual plot shows a wide spread of prediction errors,
# indicating the model does a poor job explaining differences in death totals
# The red LOWESS line slopes slightly downward,
# suggesting the linear relationship between these variables is weak
# The Q-Q plot shows a strong S-shaped curve and several extreme points
# meaning the residuals are not normally distributed
# This indicates the presence of outliers and heavy tails,
# which is common when analyzing global country-level data
# Overall, GDP per capita alone is not a useful predictor
# of total COVID deaths across countries
In [57]:
# Model 8: simple OLS with deaths_per_million as the response and gdp_per_capita as the predictor.
model_8 = run_regression(reg_df, "gdp_per_capita", "deaths_per_million")
================================================================================
Regression Model: deaths_per_million ~ gdp_per_capita
================================================================================
OLS Regression Results
==============================================================================
Dep. Variable: deaths_per_million R-squared: 0.091
Model: OLS Adj. R-squared: 0.086
Method: Least Squares F-statistic: 18.21
Date: Mon, 09 Mar 2026 Prob (F-statistic): 3.18e-05
Time: 02:29:54 Log-Likelihood: -1583.4
No. Observations: 184 AIC: 3171.
Df Residuals: 182 BIC: 3177.
Df Model: 1
Covariance Type: nonrobust
==================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------
Intercept 1012.2171 119.406 8.477 0.000 776.619 1247.815
gdp_per_capita 0.0130 0.003 4.267 0.000 0.007 0.019
==============================================================================
Omnibus: 49.027 Durbin-Watson: 1.526
Prob(Omnibus): 0.000 Jarque-Bera (JB): 82.585
Skew: 1.388 Prob(JB): 1.17e-18
Kurtosis: 4.752 Cond. No. 4.78e+04
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 4.78e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
In [58]:
# GDP per capita shows a small but statistically significant relationship
# with COVID deaths per million
# R^2 = 0.091
# This means GDP per capita explains about 9.1% of the variation
# in deaths per million across countries
# The coefficient is statistically significant (p < 0.001)
# meaning there is evidence of a relationship between wealth per person
# and mortality rates during the pandemic
# However, the overall explanatory power of the model is still low
# which means GDP per capita alone does not explain most of the
# differences in death rates between countries
# Other factors likely played a much larger role, such as:
# healthcare system capacity
# vaccination availability
# population age structure
# government response policies
# timing of pandemic waves
# The residual plot shows a curved LOWESS line instead of a flat one
# suggesting the relationship between GDP per capita and deaths per million
# may not be perfectly linear
# We also see wide variation in residual values,
# meaning countries with similar GDP per capita can still have very different mortality outcomes
# The Q-Q plot shows an S-shaped curve rather than following the straight reference line
# indicating the residuals are not perfectly normally distributed
# This pattern suggests the presence of outliers and heavy-tailed data,
# which is common in global datasets where some countries experienced
# unusually high or unusually low mortality rates
# Overall, GDP per capita has a statistically significant relationship
# with deaths per million, but it explains only a small portion
# of the differences in pandemic mortality between countries
In [60]:
# Final Conclusion
# This analysis examined country-level COVID-19 outcomes alongside demographic and economic indicators.
# The exploratory data analysis showed that most numerical variables were strongly right-skewed and contained
# several outliers, reflecting major differences in country size, economic scale, and pandemic impact.
# The visualization section showed that total case counts and total deaths were concentrated in a small
# number of large countries, while population-adjusted measures such as cases per million and deaths
# per million revealed a different set of countries with relatively high pandemic burden. GDP and GDP
# per capita also showed substantial inequality across countries.
# The inferential analysis found that total cases and total deaths were strongly associated with broader measures
# of country scale, especially GDP and, to a lesser extent, population. GDP per capita was more informative
# for population-adjusted case rates than for total counts. Several regression models were statistically
# significant, but many also showed low explanatory power, non-normal residuals, and signs of heteroskedasticity.
# This suggests that while population and economic indicators help explain some variation in COVID-19 outcomes,
# they do not fully capture the complexity of pandemic patterns across countries.
# Higher-income countries often reported more cases per million.
# That doesn’t necessarily mean the virus spread more there — it can also mean:
# better testing
# better reporting
# more transparent health systems
# Lower-income countries often under-detect infections.
# So the relationship is partly measurement/reporting effects, not just true spread.
# Overall, the results show that country size, economic scale, and income level are related to COVID-19 outcomes,
# but other factors such as healthcare capacity, policy response, testing availability,
# and outbreak timing also played major roles.
# The analysis shows that larger and more economically active countries tended to report more total
# COVID-19 cases and deaths, but the reasons are more complex than population or wealth alone.
# When we looked at population, it did help explain why some countries had more total cases and deaths
# than others, but it only explained a small portion of the differences between countries.
# This tells us that population size matters, but it does not determine pandemic outcomes by itself.
# Overall, the results suggest that country size and economic scale help explain some of the differences
# in COVID-19 outcomes, but they do not tell the full story. The spread and impact of the pandemic were
# shaped by a combination of population size, mobility, healthcare systems, government response,
# and timing of outbreaks across different regions.
# Big countries had more total cases and deaths, richer countries reported more cases per person,
# but no single factor fully explains how COVID affected different countries.
In [ ]:
# Extra Credit
In [ ]:
In [84]:
# country-name fixes for map matching
# Country-name fixes for map matching.
#
# This cell sits above the cell that first creates `map_df`, so on a fresh
# kernel (Restart & Run All) the original in-place update raised NameError.
# Build `map_df` from `df` here instead, so the cell is self-contained and
# idempotent: re-running it always yields the same frame.
map_df = df.copy().rename(columns={
    "GDP ( USD billions)": "gdp_usd_billions",
    "GDP Per Capita(USD)": "gdp_per_capita",
})

# Dataset label -> label to use when matching countries on the map.
map_df["country"] = map_df["country"].replace({
    "USA": "United States",
    "UK": "United Kingdom",
    "Russia": "Russian Federation",
    "Bosnia-and-Herzegovina": "Bosnia and Herzegovina",
    "North-Macedonia": "North Macedonia",
    "South-Africa": "South Africa",
    "New-Zealand": "New Zealand",
    "San-Marino": "San Marino",
})
In [65]:
import pandas as pd
import plotly.express as px

# Dataset label -> label to use when matching countries on the map.
# Applied inside this cell because `map_df` is rebuilt from `df` below; fixes
# made to an earlier `map_df` would otherwise be thrown away before plotting.
country_name_fixes = {
    "USA": "United States",
    "UK": "United Kingdom",
    "Russia": "Russian Federation",
    "Bosnia-and-Herzegovina": "Bosnia and Herzegovina",
    "North-Macedonia": "North Macedonia",
    "South-Africa": "South Africa",
    "New-Zealand": "New Zealand",
    "San-Marino": "San Marino",
}

# Clean copy: rename the two GDP columns to identifier-friendly names and
# normalise country labels; `df` itself is left untouched.
map_df = df.copy().rename(columns={
    "GDP ( USD billions)": "gdp_usd_billions",
    "GDP Per Capita(USD)": "gdp_per_capita",
})
map_df["country"] = map_df["country"].replace(country_name_fixes)

# Metrics dropdown: button label -> column plotted as the map colour
map_metrics = {
    "Total COVID Cases": "total_cases",
    "Cases per Million": "cases_per_million",
    "Total Deaths": "total_deaths",
    "Deaths per Million": "deaths_per_million",
    "GDP (USD Billions)": "gdp_usd_billions",
    "GDP per Capita (USD)": "gdp_per_capita",
}

# Swag color palette (low -> high values)
swag_colors = [
    "#b8f2e6",
    "#5dd9c1",
    "#00bcd4",
    "#4ea8de",
    "#c1121f",
    "#780000",
]

# Initial view shows total cases; the dropdown below swaps the colour values.
fig = px.choropleth(
    map_df,
    locations="country",
    locationmode="country names",
    color="total_cases",
    hover_name="country",
    hover_data={
        "continent": True,
        "population": ":,",
        "total_cases": ":,",
        "cases_per_million": ":,.0f",
        "total_deaths": ":,",
        "deaths_per_million": ":,.0f",
        "gdp_usd_billions": ":,.1f",
        "gdp_per_capita": ":,.0f",
    },
    color_continuous_scale=swag_colors,
    projection="natural earth",
    title=" Global COVID-19 and Economic Indicators",
)

# Dropdown buttons: each one restyles the trace's `z` values and relabels the title.
# - `.tolist()` hands Plotly a plain JSON-serialisable list rather than a pandas Series.
# - "title.text" is the explicit attribute path for the title string.
# NOTE(review): the hover template is built once for the initial colour column, so
# after switching metrics the hover's colour entry may still be labelled
# "total_cases" — confirm in the rendered figure.
buttons = [
    dict(
        method="update",
        label=label,
        args=[
            {"z": [map_df[metric].tolist()]},
            {"title.text": f" Global Map: {label}"},
        ],
    )
    for label, metric in map_metrics.items()
]

fig.update_layout(
    template="plotly_dark",
    width=1150,
    height=700,
    margin=dict(l=20, r=20, t=80, b=20),
    updatemenus=[
        dict(
            buttons=buttons,
            direction="down",
            showactive=True,
            x=0.02,
            y=1.08,
            xanchor="left",
            yanchor="top",
            bgcolor="#1e1e1e",
            bordercolor="white",
            font=dict(size=12),
        )
    ],
    coloraxis_colorbar=dict(
        title="Metric Value",
        thickness=20,
        len=0.7,
    ),
    geo=dict(
        showframe=False,
        showcoastlines=True,
        coastlinecolor="white",
        showocean=True,
        oceancolor="#0f1c2e",
        showland=True,
        landcolor="#1c1c1c",
    ),
)
fig.show()
In [67]:
import pandas as pd
import plotly.express as px

# Clean copy: rename the two GDP columns to identifier-friendly names;
# `df` itself is left untouched.
bubble_df = df.copy().rename(columns={
    "GDP ( USD billions)": "gdp_usd_billions",
    "GDP Per Capita(USD)": "gdp_per_capita",
})

# Colour palette for the GDP-per-capita scale (low -> high values).
# The original defined this list as `bubble__swag_colors` but passed
# `bubble_colors` to the plot, which is a NameError on a fresh kernel;
# a single name is now used for both.
bubble_colors = [
    "#b8f2e6",
    "#5dd9c1",
    "#00bcd4",
    "#4ea8de",
    "#c1121f",
    "#780000",
]

# One bubble per country: size encodes total cases, colour encodes GDP per capita.
fig = px.scatter_geo(
    bubble_df,
    locations="country",
    locationmode="country names",
    size="total_cases",
    color="gdp_per_capita",
    hover_name="country",
    hover_data={
        "continent": True,
        "population": ":,",
        "total_cases": ":,",
        "cases_per_million": ":,.0f",
        "total_deaths": ":,",
        "deaths_per_million": ":,.0f",
        "gdp_usd_billions": ":,.1f",
        "gdp_per_capita": ":,.0f",
    },
    projection="natural earth",
    size_max=45,
    color_continuous_scale=bubble_colors,
    title=" Global Bubble Map: COVID-19 Burden and Economic Prosperity",
)

fig.update_layout(
    width=1200,
    height=700,
    title_x=0.5,
    title_font_size=22,
    margin=dict(l=20, r=20, t=70, b=20),
    geo=dict(
        showframe=False,
        showcoastlines=True,
        coastlinecolor="white",
        showocean=True,
        oceancolor="#4ea8de",
        showland=True,
        landcolor="#e9ecef",
        bgcolor="#4ea8de",
    ),
    coloraxis_colorbar=dict(
        title="GDP per Capita",
        thickness=18,
        len=0.7,
    ),
    hoverlabel=dict(
        bgcolor="white",
        font_size=12,
    ),
)
fig.show()