library(readxl)
library(ggplot2)
library(maps)
## Warning: package 'maps' was built under R version 4.3.1
library(RColorBrewer)

# Read the Excel file
data_xlsx <- read_excel("CollegeGrads.xlsx", sheet = 1)

# Define color palette for the scatter plot
colors <- brewer.pal(nrow(data_xlsx), "Set1")
## Warning in brewer.pal(nrow(data_xlsx), "Set1"): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors
# Visualize data across states - Bar plot of Dropout %
barplot(data_xlsx[["Dropout%"]], names.arg = data_xlsx$State, xlab = "State", ylab = "Dropout %",
        main = "Dropout Rates Across States", col = colors)

# Visualize data across states - Scatter plot of ColGrad% and EdSpend
ggplot(data_xlsx, aes(x = `ColGrad%`, y = EdSpend, color = `ColGrad%`)) +
  geom_point() +
  labs(x = "College Graduation %", y = "EdSpend", title = "Relationship between College Graduation % and Education Spending")

# Map the data to state names and convert state names to uppercase
state_names <- toupper(data_xlsx$State)
colgrad_values <- data_xlsx$`ColGrad%`
data_mapped <- data.frame(state = state_names, value = colgrad_values)

# Create a color palette for the map
n_colors <- 5  # Number of colors in the palette
colors <- brewer.pal(n_colors, "YlGnBu")

# Plot the choropleth map with legend
map('state', fill = TRUE, col = colors, main = "College Graduation % by State")
legend("topright", legend = c("Low", "Medium", "High"), fill = colors, title = "College Graduation %")

# Print column names of the data
colnames(data_xlsx)
##  [1] "State"    "ColGrad%" "Dropout%" "EdSpend"  "Metro%"   "Age"     
##  [7] "LPRFem"   "Neast"    "Seast"    "West"     "Midwest"
top_10_colgrad <- head(data_xlsx[order(data_xlsx$`ColGrad%`, decreasing = TRUE), ], 10)
top_10_colgrad
## # A tibble: 10 × 11
##    State `ColGrad%` `Dropout%` EdSpend `Metro%`   Age LPRFem Neast Seast  West
##    <chr>      <dbl>      <dbl>   <dbl>    <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>
##  1 CT          36.9       10      2142    100    39.3   60       1     0     0
##  2 MA          36.8       12.5    1807     99.6  38.2   61.9     1     0     0
##  3 MD          36.3       13      1627     98.5  37.1   62.3     1     0     0
##  4 NJ          36.3       13      1903    100    38     58.4     1     0     0
##  5 CO          35.4       10.8    1599     91.6  34.7   65.3     0     0     1
##  6 VT          34.4       10      1948     73.7  40.7   65.8     1     0     0
##  7 MN          34.3        7.3    1860     87.2  36.7   69       0     0     0
##  8 NH          32.8        8.1    1632     96.4  39.5   64.7     1     0     0
##  9 WA          30.9        8.5    1561     96.4  36.7   61.2     0     0     1
## 10 VA          30.7       13.9    1631     89.1  37.2   60.8     0     1     0
## # ℹ 1 more variable: Midwest <dbl>
top_10_dropout <- head(data_xlsx[order(data_xlsx$`Dropout%`, decreasing = TRUE), ], 10)
top_10_dropout
## # A tibble: 10 × 11
##    State `ColGrad%` `Dropout%` EdSpend `Metro%`   Age LPRFem Neast Seast  West
##    <chr>      <dbl>      <dbl>   <dbl>    <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>
##  1 TX          25.4       21.8    1696     93.8  33.2   58.2     0     0     1
##  2 KY          19         21.1    1263     75.6  37.5   55.4     0     1     0
##  3 MS          21.9       20.1    1178     77.8  35.5   55.5     0     1     0
##  4 LA          19.7       19.8    1367     93.2  35.4   54.9     0     1     0
##  5 CA          30.4       19.7    1657     99.3  34.4   57.6     0     0     1
##  6 AL          19.8       19.1    1221     89.2  37.4   55.8     0     1     0
##  7 NM          27.2       19      1608     95.9  36.2   57.5     0     0     1
##  8 AR          17.5       18.6    1158     78.8  37     54.9     0     1     0
##  9 TN          21.6       18.2    1079     89.5  37.3   57.4     0     1     0
## 10 WV          15.1       17.6    1538     75    40.7   49.1     0     1     0
## # ℹ 1 more variable: Midwest <dbl>
top_10_edspend <- head(data_xlsx[order(data_xlsx$EdSpend, decreasing = TRUE), ], 10)
top_10_edspend
## # A tibble: 10 × 11
##    State `ColGrad%` `Dropout%` EdSpend `Metro%`   Age LPRFem Neast Seast  West
##    <chr>      <dbl>      <dbl>   <dbl>    <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>
##  1 IL          29.5       12.7    2334     95.2  35.6   59.7     0     0     0
##  2 AK          28.7        8.3    2187     74.7  33.9   65.6     0     0     1
##  3 CT          36.9       10      2142    100    39.3   60       1     0     0
##  4 NY          30.3       14.4    2046     97.7  37.5   56.2     1     0     0
##  5 VT          34.4       10      1948     73.7  40.7   65.8     1     0     0
##  6 NJ          36.3       13      1903    100    38     58.4     1     0     0
##  7 WY          22          9.1    1896     71.5  39.1   65.3     0     0     1
##  8 MI          24.6       11.5    1880     92.1  36.9   59.7     0     0     0
##  9 MN          34.3        7.3    1860     87.2  36.7   69       0     0     0
## 10 MA          36.8       12.5    1807     99.6  38.2   61.9     1     0     0
## # ℹ 1 more variable: Midwest <dbl>
library(ggplot2)

# Convert region columns to binary variables
data_xlsx$Neast <- ifelse(data_xlsx$Neast == 1, 1, 0)
data_xlsx$Seast <- ifelse(data_xlsx$Seast == 1, 1, 0)
data_xlsx$West <- ifelse(data_xlsx$West == 1, 1, 0)
data_xlsx$Midwest <- ifelse(data_xlsx$Midwest == 1, 1, 0)

# Filter and sort data for the specified regions
filtered_data <- subset(data_xlsx, Neast == 1 | Seast == 1 | West == 1 | Midwest == 1)

# Calculate regional averages for graduation rates and dropout rates
averages <- aggregate(cbind(`ColGrad%`, `Dropout%`) ~ Neast + Seast + West + Midwest, filtered_data, FUN = mean)

# Define custom color palette
my_colors <- c("#FF6961", "#77DD77", "#AEC6CF", "#FFD700")

# Plot regional averages for graduation rates
ggplot(averages, aes(x = ifelse(Neast == 1, "Northeast", ifelse(Seast == 1, "Southeast", ifelse(West == 1, "West", "Midwest"))), y = `ColGrad%`)) +
  geom_bar(stat = "identity", fill = my_colors) +
  labs(x = "Region", y = "Graduation Rate", title = "Regional Averages for Graduation Rates") +
  theme_minimal() +
  geom_text(aes(label = round(`ColGrad%`, 1)), vjust = -0.5, color = "black", size = 4) +
  scale_x_discrete(limits = c("Northeast", "Southeast", "West", "Midwest")) +
  coord_cartesian(ylim = c(0, max(averages$`ColGrad%`) + 5))

# Plot regional averages for dropout rates
ggplot(averages, aes(x = ifelse(Neast == 1, "Northeast", ifelse(Seast == 1, "Southeast", ifelse(West == 1, "West", "Midwest"))), y = `Dropout%`)) +
  geom_bar(stat = "identity", fill = my_colors) +
  labs(x = "Region", y = "Dropout Rate", title = "Regional Averages for Dropout Rates") +
  theme_minimal() +
  geom_text(aes(label = round(`Dropout%`, 1)), vjust = -0.5, color = "black", size = 4) +
  scale_x_discrete(limits = c("Northeast", "Southeast", "West", "Midwest")) +
  coord_cartesian(ylim = c(0, max(averages$`Dropout%`) + 5))

#The scatter plot demonstrates a pattern where higher educational spending is associated with higher graduation rates and lower dropout rates. This suggests a potential correlation between educational spending and student outcomes. It appears that when more money is invested in education (higher EdSpend), there tends to be an increase in graduation rates and a decrease in dropout rates.


# Convert the relevant columns to numeric
data_xlsx$ColGrad <- as.numeric(data_xlsx$`ColGrad%`)
data_xlsx$Dropout <- as.numeric(data_xlsx$`Dropout%`)
data_xlsx$EdSpend <- as.numeric(data_xlsx$EdSpend)

# Perform correlation analysis
correlation_colgrad <- cor(data_xlsx$ColGrad, data_xlsx$EdSpend)
correlation_dropout <- cor(data_xlsx$Dropout, data_xlsx$EdSpend)

# Perform linear regression analysis
linear_model_colgrad <- lm(ColGrad ~ EdSpend, data = data_xlsx)
linear_model_dropout <- lm(Dropout ~ EdSpend, data = data_xlsx)

# Extract regression coefficients
slope_colgrad <- coef(linear_model_colgrad)[2]
slope_dropout <- coef(linear_model_dropout)[2]

# Print correlation coefficients and regression coefficients
cat("Correlation between ColGrad% and EdSpend:", correlation_colgrad, "\n")
## Correlation between ColGrad% and EdSpend: 0.4847467
cat("Correlation between Dropout% and EdSpend:", correlation_dropout, "\n")
## Correlation between Dropout% and EdSpend: -0.3234635
cat("Linear regression coefficient (ColGrad% vs EdSpend):", slope_colgrad, "\n")
## Linear regression coefficient (ColGrad% vs EdSpend): 0.008408336
cat("Linear regression coefficient (Dropout% vs EdSpend):", slope_dropout, "\n")
## Linear regression coefficient (Dropout% vs EdSpend): -0.004277537
# The correlation coefficients tell us how strong and in which direction the relationships are between the graduation rate (ColGrad%), dropout rate (Dropout%), and educational spending (EdSpend). A positive correlation means that when EdSpend increases, ColGrad% or Dropout% tends to increase as well. On the other hand, a negative correlation means that as EdSpend increases, ColGrad% or Dropout% tends to decrease.

# The regression coefficients show us how much the graduation rate (ColGrad%) or dropout rate (Dropout%) changes when there is a one-unit increase in educational spending (EdSpend). A positive regression coefficient means that when EdSpend increases, ColGrad% or Dropout% also tends to increase. Conversely, a negative coefficient means that as EdSpend increases, ColGrad% or Dropout% tends to decrease.

# By analyzing the correlation and regression coefficients, we can gain valuable insights into the relationship between educational spending (EdSpend) and the graduation rate (ColGrad%) and dropout rate (Dropout%). It helps us understand how changes in EdSpend may impact the outcomes of college graduation and dropout rates.



# Scatter plot with linear regression line - ColGrad% vs EdSpend (colorful)
ggplot(data_xlsx, aes(x = EdSpend, y = `ColGrad%`, color = `ColGrad%`)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(x = "Education Spending", y = "Graduation Rate", title = "Linear Regression: Graduation Rate vs Education Spending") +
  scale_color_gradient(low = "green", high = "red") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# Scatter plot with linear regression line - Dropout% vs EdSpend (colorful)
ggplot(data_xlsx, aes(x = EdSpend, y = `Dropout%`, color = `Dropout%`)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(x = "Education Spending", y = "Dropout Rate", title = "Linear Regression: Dropout Rate vs Education Spending") +
  scale_color_gradient(low = "green", high = "red") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# In summary, my analysis revealed a strong and positive relationship between educational spending and both graduation rates and dropout rates. Higher investments in education were associated with higher graduation rates and lower dropout rates. These findings suggest that increasing educational spending can have a positive impact on student outcomes. However, it's important to consider other influencing factors, and further research is needed to establish a causal relationship between educational spending and these outcomes.