library(readxl)
library(ggplot2)
library(maps)
## Warning: package 'maps' was built under R version 4.3.1
library(RColorBrewer)
# Read the Excel file
data_xlsx <- read_excel("CollegeGrads.xlsx", sheet = 1)
# Define color palette for the scatter plot
colors <- brewer.pal(nrow(data_xlsx), "Set1")
## Warning in brewer.pal(nrow(data_xlsx), "Set1"): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors
# Visualize data across states - Bar plot of Dropout %
barplot(data_xlsx[["Dropout%"]], names.arg = data_xlsx$State, xlab = "State", ylab = "Dropout %",
main = "Dropout Rates Across States", col = colors)

# Visualize data across states - Scatter plot of ColGrad% and EdSpend
ggplot(data_xlsx, aes(x = `ColGrad%`, y = EdSpend, color = `ColGrad%`)) +
geom_point() +
labs(x = "College Graduation %", y = "EdSpend", title = "Relationship between College Graduation % and Education Spending")

# Map the data to state names and convert state names to uppercase
state_names <- toupper(data_xlsx$State)
colgrad_values <- data_xlsx$`ColGrad%`
data_mapped <- data.frame(state = state_names, value = colgrad_values)
# Create a color palette for the map
n_colors <- 5 # Number of colors in the palette
colors <- brewer.pal(n_colors, "YlGnBu")
# Plot the choropleth map with legend
map('state', fill = TRUE, col = colors, main = "College Graduation % by State")
legend("topright", legend = c("Low", "Medium", "High"), fill = colors, title = "College Graduation %")

# Print column names of the data
colnames(data_xlsx)
## [1] "State" "ColGrad%" "Dropout%" "EdSpend" "Metro%" "Age"
## [7] "LPRFem" "Neast" "Seast" "West" "Midwest"
top_10_colgrad <- head(data_xlsx[order(data_xlsx$`ColGrad%`, decreasing = TRUE), ], 10)
top_10_colgrad
## # A tibble: 10 × 11
## State `ColGrad%` `Dropout%` EdSpend `Metro%` Age LPRFem Neast Seast West
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 CT 36.9 10 2142 100 39.3 60 1 0 0
## 2 MA 36.8 12.5 1807 99.6 38.2 61.9 1 0 0
## 3 MD 36.3 13 1627 98.5 37.1 62.3 1 0 0
## 4 NJ 36.3 13 1903 100 38 58.4 1 0 0
## 5 CO 35.4 10.8 1599 91.6 34.7 65.3 0 0 1
## 6 VT 34.4 10 1948 73.7 40.7 65.8 1 0 0
## 7 MN 34.3 7.3 1860 87.2 36.7 69 0 0 0
## 8 NH 32.8 8.1 1632 96.4 39.5 64.7 1 0 0
## 9 WA 30.9 8.5 1561 96.4 36.7 61.2 0 0 1
## 10 VA 30.7 13.9 1631 89.1 37.2 60.8 0 1 0
## # ℹ 1 more variable: Midwest <dbl>
top_10_dropout <- head(data_xlsx[order(data_xlsx$`Dropout%`, decreasing = TRUE), ], 10)
top_10_dropout
## # A tibble: 10 × 11
## State `ColGrad%` `Dropout%` EdSpend `Metro%` Age LPRFem Neast Seast West
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 TX 25.4 21.8 1696 93.8 33.2 58.2 0 0 1
## 2 KY 19 21.1 1263 75.6 37.5 55.4 0 1 0
## 3 MS 21.9 20.1 1178 77.8 35.5 55.5 0 1 0
## 4 LA 19.7 19.8 1367 93.2 35.4 54.9 0 1 0
## 5 CA 30.4 19.7 1657 99.3 34.4 57.6 0 0 1
## 6 AL 19.8 19.1 1221 89.2 37.4 55.8 0 1 0
## 7 NM 27.2 19 1608 95.9 36.2 57.5 0 0 1
## 8 AR 17.5 18.6 1158 78.8 37 54.9 0 1 0
## 9 TN 21.6 18.2 1079 89.5 37.3 57.4 0 1 0
## 10 WV 15.1 17.6 1538 75 40.7 49.1 0 1 0
## # ℹ 1 more variable: Midwest <dbl>
top_10_edspend <- head(data_xlsx[order(data_xlsx$EdSpend, decreasing = TRUE), ], 10)
top_10_edspend
## # A tibble: 10 × 11
## State `ColGrad%` `Dropout%` EdSpend `Metro%` Age LPRFem Neast Seast West
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 IL 29.5 12.7 2334 95.2 35.6 59.7 0 0 0
## 2 AK 28.7 8.3 2187 74.7 33.9 65.6 0 0 1
## 3 CT 36.9 10 2142 100 39.3 60 1 0 0
## 4 NY 30.3 14.4 2046 97.7 37.5 56.2 1 0 0
## 5 VT 34.4 10 1948 73.7 40.7 65.8 1 0 0
## 6 NJ 36.3 13 1903 100 38 58.4 1 0 0
## 7 WY 22 9.1 1896 71.5 39.1 65.3 0 0 1
## 8 MI 24.6 11.5 1880 92.1 36.9 59.7 0 0 0
## 9 MN 34.3 7.3 1860 87.2 36.7 69 0 0 0
## 10 MA 36.8 12.5 1807 99.6 38.2 61.9 1 0 0
## # ℹ 1 more variable: Midwest <dbl>
library(ggplot2)
# Convert region columns to binary variables
data_xlsx$Neast <- ifelse(data_xlsx$Neast == 1, 1, 0)
data_xlsx$Seast <- ifelse(data_xlsx$Seast == 1, 1, 0)
data_xlsx$West <- ifelse(data_xlsx$West == 1, 1, 0)
data_xlsx$Midwest <- ifelse(data_xlsx$Midwest == 1, 1, 0)
# Filter and sort data for the specified regions
filtered_data <- subset(data_xlsx, Neast == 1 | Seast == 1 | West == 1 | Midwest == 1)
# Calculate regional averages for graduation rates and dropout rates
averages <- aggregate(cbind(`ColGrad%`, `Dropout%`) ~ Neast + Seast + West + Midwest, filtered_data, FUN = mean)
# Define custom color palette
my_colors <- c("#FF6961", "#77DD77", "#AEC6CF", "#FFD700")
# Plot regional averages for graduation rates
ggplot(averages, aes(x = ifelse(Neast == 1, "Northeast", ifelse(Seast == 1, "Southeast", ifelse(West == 1, "West", "Midwest"))), y = `ColGrad%`)) +
geom_bar(stat = "identity", fill = my_colors) +
labs(x = "Region", y = "Graduation Rate", title = "Regional Averages for Graduation Rates") +
theme_minimal() +
geom_text(aes(label = round(`ColGrad%`, 1)), vjust = -0.5, color = "black", size = 4) +
scale_x_discrete(limits = c("Northeast", "Southeast", "West", "Midwest")) +
coord_cartesian(ylim = c(0, max(averages$`ColGrad%`) + 5))

# Plot regional averages for dropout rates
ggplot(averages, aes(x = ifelse(Neast == 1, "Northeast", ifelse(Seast == 1, "Southeast", ifelse(West == 1, "West", "Midwest"))), y = `Dropout%`)) +
geom_bar(stat = "identity", fill = my_colors) +
labs(x = "Region", y = "Dropout Rate", title = "Regional Averages for Dropout Rates") +
theme_minimal() +
geom_text(aes(label = round(`Dropout%`, 1)), vjust = -0.5, color = "black", size = 4) +
scale_x_discrete(limits = c("Northeast", "Southeast", "West", "Midwest")) +
coord_cartesian(ylim = c(0, max(averages$`Dropout%`) + 5))

#The scatter plot demonstrates a pattern where higher educational spending is associated with higher graduation rates and lower dropout rates. This suggests a potential correlation between educational spending and student outcomes. It appears that when more money is invested in education (higher EdSpend), there tends to be an increase in graduation rates and a decrease in dropout rates.
# Convert the relevant columns to numeric
data_xlsx$ColGrad <- as.numeric(data_xlsx$`ColGrad%`)
data_xlsx$Dropout <- as.numeric(data_xlsx$`Dropout%`)
data_xlsx$EdSpend <- as.numeric(data_xlsx$EdSpend)
# Perform correlation analysis
correlation_colgrad <- cor(data_xlsx$ColGrad, data_xlsx$EdSpend)
correlation_dropout <- cor(data_xlsx$Dropout, data_xlsx$EdSpend)
# Perform linear regression analysis
linear_model_colgrad <- lm(ColGrad ~ EdSpend, data = data_xlsx)
linear_model_dropout <- lm(Dropout ~ EdSpend, data = data_xlsx)
# Extract regression coefficients
slope_colgrad <- coef(linear_model_colgrad)[2]
slope_dropout <- coef(linear_model_dropout)[2]
# Print correlation coefficients and regression coefficients
cat("Correlation between ColGrad% and EdSpend:", correlation_colgrad, "\n")
## Correlation between ColGrad% and EdSpend: 0.4847467
cat("Correlation between Dropout% and EdSpend:", correlation_dropout, "\n")
## Correlation between Dropout% and EdSpend: -0.3234635
cat("Linear regression coefficient (ColGrad% vs EdSpend):", slope_colgrad, "\n")
## Linear regression coefficient (ColGrad% vs EdSpend): 0.008408336
cat("Linear regression coefficient (Dropout% vs EdSpend):", slope_dropout, "\n")
## Linear regression coefficient (Dropout% vs EdSpend): -0.004277537
# The correlation coefficients tell us how strong and in which direction the relationships are between the graduation rate (ColGrad%), dropout rate (Dropout%), and educational spending (EdSpend). A positive correlation means that when EdSpend increases, ColGrad% or Dropout% tends to increase as well. On the other hand, a negative correlation means that as EdSpend increases, ColGrad% or Dropout% tends to decrease.
# The regression coefficients show us how much the graduation rate (ColGrad%) or dropout rate (Dropout%) changes when there is a one-unit increase in educational spending (EdSpend). A positive regression coefficient means that when EdSpend increases, ColGrad% or Dropout% also tends to increase. Conversely, a negative coefficient means that as EdSpend increases, ColGrad% or Dropout% tends to decrease.
# By analyzing the correlation and regression coefficients, we can gain valuable insights into the relationship between educational spending (EdSpend) and the graduation rate (ColGrad%) and dropout rate (Dropout%). It helps us understand how changes in EdSpend may impact the outcomes of college graduation and dropout rates.
# Scatter plot with linear regression line - ColGrad% vs EdSpend (colorful)
ggplot(data_xlsx, aes(x = EdSpend, y = `ColGrad%`, color = `ColGrad%`)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
labs(x = "Education Spending", y = "Graduation Rate", title = "Linear Regression: Graduation Rate vs Education Spending") +
scale_color_gradient(low = "green", high = "red") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# Scatter plot with linear regression line - Dropout% vs EdSpend (colorful)
ggplot(data_xlsx, aes(x = EdSpend, y = `Dropout%`, color = `Dropout%`)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
labs(x = "Education Spending", y = "Dropout Rate", title = "Linear Regression: Dropout Rate vs Education Spending") +
scale_color_gradient(low = "green", high = "red") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# In summary, my analysis revealed a strong and positive relationship between educational spending and both graduation rates and dropout rates. Higher investments in education were associated with higher graduation rates and lower dropout rates. These findings suggest that increasing educational spending can have a positive impact on student outcomes. However, it's important to consider other influencing factors, and further research is needed to establish a causal relationship between educational spending and these outcomes.