top of page

 
rm(list=ls())
if(!require(readxl)){install.packages("readxl")
  library(readxl)}
library(psych)
library(fixest)
if (!require(openxlsx)){
  install.packages("openxlsx")
  library(openxlsx)
}
if (!require(broom)) {
  install.packages("broom")
}
library(broom)
library(tidyr)
library(dplyr)
new_cap=read.csv("/Users/jayryu/Library/CloudStorage/OneDrive-OhioUniversity/WonJay/Data/NCES/FiscalData/NCES_NEW_cap_stocks.csv") 
 # some variables are not read in, so I used IMPORT function (with reader); when reading from OneDrive, not Google Drive
 # it works
#rm(test)
Fisc_Vars=c("F12","CapConst","CapConst_Short","CapEquip","CapEquip_Short","CapLandBuild","CapLandBuild_2")
Share_Vars=c("fed_share","state_share","loc_share")
PP_REAL=NULL;PP_INSTRUMENTAL=NULL
for (i in 1: length(Fisc_Vars)){
  temp=c(paste0("pp_real_",Fisc_Vars[i]))
  PP_REAL=c(PP_REAL, temp)
}
for (i in 1: length(Fisc_Vars)){
  temp=c(paste0("pp_instrumental_real_",Fisc_Vars[i]))
  PP_INSTRUMENTAL=c(PP_INSTRUMENTAL, temp)
}
lag_names=paste0("lag_",1:5,"_pp_real_");cat_names=paste0("cat_lag_",1:5,"_pp_real_")
LAG_REAL=NULL; CAT_REAL=NULL
for (i in 1:length(Fisc_Vars)){
  for (j in 1:length(lag_names)){
    temp=c(paste0(lag_names[j],Fisc_Vars[i]))
    LAG_REAL=c(LAG_REAL,temp)
  }
}
for (i in 1:length(Fisc_Vars)){
  for (j in 1:length(cat_names)){
    temp=c(paste0(cat_names[j],Fisc_Vars[i]))
    CAT_REAL=c(CAT_REAL,temp)
  }
}
LAG_SHARE=NULL;CAT_SHARE=NULL
lag_name=paste0("lag_",1:5,"_");cat_name=paste0("cat_lag_",1:5,"_")
for (i in 1:length(Share_Vars)){
  for (j in 1:length(lag_name)){
    temp=c(paste0(lag_name[j],Share_Vars[i]))
    LAG_SHARE=c(LAG_SHARE,temp)
  }
}
for (i in 1:length(Share_Vars)){
  for (j in 1:length(cat_name)){
    temp=c(paste0(cat_name[j],Share_Vars[i]))
    CAT_SHARE=c(CAT_SHARE,temp)
  }
}
Vars_To_Read=c("LEAID","year",PP_REAL,PP_INSTRUMENTAL,LAG_REAL,CAT_REAL,LAG_SHARE,CAT_SHARE,
               Fisc_Vars,Share_Vars,"real_F12")
new_cap2=new_cap[,colnames(new_cap)%in%Vars_To_Read]
setdiff(Vars_To_Read,colnames(new_cap2))
## since merging new_cap2 with the df below causes vector memory exhausted, I need to filter
# only needed variables from df below
df=read.csv("/Users/jayryu/Library/CloudStorage/GoogleDrive-jayryujungle@gmail.com/My Drive/EduData/ALL_COMBINED_Stata2.csv",sep=",") 
df2=df[df$year>2009&df$year<2020,]
lg1 <- function(x) lag(x, 1)
lg2 <- function(x) lag(x, 2)
lg3 <- function(x) lag(x, 3)
lg4 <- function(x) lag(x, 4)
lg5 <- function(x) lag(x, 5)
var_names <- c("pp_real_Z33", "pp_real_TCURINST", "pp_Teachers","pp_real_V15",
               "pp_real_V17","pp_LEAAdministrators","pp_Schooladministrators","pp_StudentSupportServicesStaff",
               "pp_real_T06","pp_real_TLOCREV","pp_real_TCURSSVC","pp_real_CapOtherStock","pct_ALL_White",
               "pp_real_TFEDREV", "pp_real_TSTREV","pp_real_TOTALEXP")
var_names_lag <- paste0("lag_", 1:5)
for (i in 1:length(var_names)) {
  df2 <- df2 %>%
    group_by(LEAID) %>%
    mutate(across(all_of(var_names[i]), lg1, .names = "{var_names_lag[1]}_{var_names[i]}"),
           across(all_of(var_names[i]), lg2, .names = "{var_names_lag[2]}_{var_names[i]}"),
           across(all_of(var_names[i]), lg3, .names = "{var_names_lag[3]}_{var_names[i]}"),
           across(all_of(var_names[i]), lg4, .names = "{var_names_lag[4]}_{var_names[i]}"),
           across(all_of(var_names[i]), lg5, .names = "{var_names_lag[5]}_{var_names[i]}"))
}
cat_names=paste0("cat_lag_",1:5,"_")
for (h in 1:length(var_names_lag)){
  for (i in 1: length(var_names)){
    percentile_33=quantile(df2[,paste0(var_names_lag[h],"_",var_names[i])],na.rm=TRUE, probs=c(0.33))
    percentile_67=quantile(df2[,paste0(var_names_lag[h],"_",var_names[i])],na.rm=TRUE, probs=c(0.67))
    df2[,paste0(cat_names[h],var_names[i])]=ifelse(df2[,paste0(var_names_lag[h],"_",var_names[i])]< percentile_33, 1,
                                                   ifelse(df2[,paste0(var_names_lag[h],"_",var_names[i])] >= percentile_33 & df2[,paste0(var_names_lag[h],"_",var_names[i])] < percentile_67, 2, 3))
  }
}
df2=df2[df2$year>2010,] 
#df <- read.xlsx("/Users/jayryu/Library/CloudStorage/OneDrive-OhioUniversity/WonJay/Result/EdFacts/Fisc_Grad_Prof_Combined/ALL_COMBINED.xlsx",
# sheet = "ALL_COMBINED", na.strings = "NA") # this loses many obs, so use the above code
#desc_check=describe(df2)
## data prep
Variable_Names=c("ALL_MTHHSPCTPROF2","pp_real_Z33","pp_real_CapStock","pp_lag_1_real_CapStock",
                 "pp_lag_2_real_CapStock","pp_lag_3_real_CapStock",
                 "pp_lag_4_real_CapStock","pp_lag_5_real_CapStock",
                 "pp_real_CapOtherStock","pp_real_F12","pp_lag_1_real_F12",
                 "pp_lag_2_real_F12","pp_lag_3_real_F12",
                 "pp_lag_4_real_F12","pp_lag_5_real_F12",
                 "pct_ALL_White","pp_Teachers","pp_LEAAdministrators","pp_StudentSupportServicesStaff",
                 "pp_Schooladministrators",sprintf("ALL_MTH%02dPCTPROF2", seq(3, 8)),
                 sprintf("ALL_RLA%02dPCTPROF2", seq(3, 8))
                 ,"ALL_RLAHSPCTPROF2","ALL2",
                 "MHI2","ECD2","MWH2","year","LEAID")
df3=df2[,colnames(df2)%in%Variable_Names]
df3_Descriptive_Stat=describe(df3)
# Check for missing values
missing_values <- colSums(is.na(df3))
print(missing_values)
# Check for constant columns
constant_columns <- colnames(df3)[sapply(df3, function(x) length(unique(x)) == 1)]
print(constant_columns)
# Remove missing values and constant columns
df3_cleaned <- df3[, !(colnames(df3) %in% c("year", "LEAID", constant_columns))]
df3_cleaned <- df3_cleaned[complete.cases(df3_cleaned), ]
# Calculate correlation matrix
cor_matrix <- cor(df3_cleaned)
# Print the correlation matrix
print(cor_matrix)
#### to prepare a data file that includes non-missing cases only
#### and to create a table for descriptive statistics
Cap_Rank_CapStock_Lag1=c(sprintf("pp_lag_%01d_real_CapStock", seq(1,1)))
for (i in 1:length(Cap_Rank_CapStock_Lag1)){
  percentile_33=quantile(df2[[Cap_Rank_CapStock_Lag1[i]]],na.rm=TRUE, probs=c(0.33))
  percentile_67=quantile(df2[[Cap_Rank_CapStock_Lag1[i]]],na.rm=TRUE, probs=c(0.67))
  df2[,paste0("cat_",Cap_Rank_CapStock_Lag1[i])]=ifelse(df2[[Cap_Rank_CapStock_Lag1[i]]]< percentile_33, 1,
                                                        ifelse(df2[[Cap_Rank_CapStock_Lag1[i]]] >= percentile_33 & df2[[Cap_Rank_CapStock_Lag1[i]]] < percentile_67, 2, 3))
}
IV_List=c("pp_real_CapStock", "pp_lag_1_real_CapStock",   "pp_lag_2_real_CapStock",  "pp_lag_3_real_CapStock",  "pp_lag_4_real_CapStock",
          "pp_real_F12", "pp_lag_1_real_F12", "pp_lag_2_real_F12", "pp_lag_3_real_F12", "pp_lag_4_real_F12",
          "pp_real_Z33",  "pp_real_V17",  "pp_real_V15",  "pp_real_T06", "pp_Teachers",  "pp_Schooladministrators", 
          "pp_StudentSupportServicesStaff",  "pp_real_CapOtherStock",   "pp_real_TFEDREV",  "pct_ALL_White",
          "log_cat_lag_1_pp_real_CapOtherStock", "pp_lag_5_real_F12", "pp_instrumental_real_F12",
          "log_cat_lag_1_pp_real_Z33",  "log_cat_lag_1_pp_real_real_V17", # V17 differs b/c I created in other data steps
          "log_cat_lag_1_pp_real_V15",  "log_cat_lag_1_pp_real_T06",  "log_cat_lag_1_pp_Teachers", 
          "log_cat_lag_1_pp_Schooladministrators", "log_cat_lag_1_pp_StudentSupportServicesStaff", "pp_instrumental_real_CapStock",
          "cat_lag_1_pp_real_CapOtherStock", "pp_lag_5_real_CapStock",
          "cat_lag_1_pp_real_Z33",  "cat_lag_1_pp_real_V17", 
          "cat_lag_1_pp_real_V15",  "cat_lag_1_pp_real_T06",  "cat_lag_1_pp_Teachers", 
          "cat_lag_1_pp_Schooladministrators", "cat_lag_1_pp_StudentSupportServicesStaff","ALL2","cat_pp_lag_1_real_F12",
          "cat_pp_lag_1_real_CapStock","pp_real_TOTALREV","pp_real_TFEDREV","pp_real_TSTREV","pp_real_TLOCREV",
          "pp_real_TOTALEXP","pp_real_TCURINST","pp_real_TCURSSVC","STUDENT_COUNT", "V33","cat_pp_lag_1_real_CapStock",
          "cat_lag_1_pct_ALL_White","cat_lag_1_pp_real_TFEDREV", "cat_lag_1_pp_real_TSTREV",
          "cat_lag_1_pp_real_TOTALEXP")
filtered_IV_List=IV_List[!grepl("CapStock|F12|CapOtherStock",IV_List)]
Proficiency_Variables=c("ALL_MTHHSPCTPROF2",sprintf("ALL_MTH%02dPCTPROF2", seq(3, 8)),
                        "ALL_RLAHSPCTPROF2",sprintf("ALL_RLA%02dPCTPROF2", seq(3, 8)))
Full_Variable_List=c(filtered_IV_List,Proficiency_Variables,"ALL2")
df_for_descriptive=df2[,colnames(df2)%in%Full_Variable_List]
df_for_descriptive$year=df2$year;df_for_descriptive$LEAID=df2$LEAID
#df_for_descriptive=na.omit(df_for_descriptive)
#rm(df2,df3,df3_cleaned,new_cap,df)
new_cap3=new_cap2[new_cap2$year>2010&new_cap2$year<2020 &!is.na(new_cap2$LEAID),]
df_cleaned=merge(x=df_for_descriptive,y=new_cap3,by=c("LEAID","year"),all.x = TRUE)
library(dplyr)
filtered_data <- df_cleaned %>%
  filter(if_all(all_of(Proficiency_Variables), ~ !is.na(.)))
df_cleaned=filtered_data[!is.na(filtered_data$ALL2),]
desc_check=describe(df_cleaned)
desc_check_yearly=describeBy(df_cleaned,group=df_cleaned$year)
treat=data.frame(desc_check)
treat2=tibble::rownames_to_column(as.data.frame(treat),"variable")[,-2]
datalist=list() 
for (i in 1:length(desc_check_yearly)){
  dat=data.frame(desc_check_yearly[[i]])
  dat2=tibble::rownames_to_column(as.data.frame(dat),"variable")[,-2]
  datalist[[i]]=dat2
}
namelist=list()
for(i in 1:length(desc_check_yearly)){
  l_names=2010+i
  namelist[[i]]=l_names
}
namelist2=as.vector(do.call(rbind,namelist))
names(datalist)=namelist2
# to create Excel tables
sig_notes=c("0 '***' 0.001","'**' 0.01","'*' 0.05","'.' 0.1 '' 1")
sig_codes = function(p) {
  ifelse(p < 0.001, "***",
         ifelse(p < 0.01, "**",
                ifelse(p < 0.05, "*",
                       ifelse(p < 0.1,".",""))))}
reg_col=c("Variable","Estimate","Std. Error","t value","Pr(>|t|)","Sig")
########## proficiency
Proficiency_Variables=c("ALL_MTHHSPCTPROF2",sprintf("ALL_MTH%02dPCTPROF2", seq(3, 8)),
                        "ALL_RLAHSPCTPROF2",sprintf("ALL_RLA%02dPCTPROF2", seq(3, 8)))
########### to see various types of standard errors and unit-levels
file_path="/Users/jayryu/Library/CloudStorage/OneDrive-OhioUniversity/WonJay/Data/NCES/FiscalData/sdf19_2a_xlsx.xlsx"
temp=read_excel(file_path)
temp2=temp[,c("FIPST","STNAME","STABBR")]
temp3=data.frame("FIPST"=unique(temp2$FIPST),
                 "STNAME"=unique(temp2$STNAME),"STABBR"=unique(temp2$STABBR))
df_cleaned$STATE_ID=substr(df_cleaned$LEAID,1,2)
desc_check_state=describeBy(df_cleaned,group=df_cleaned$STATE_ID)
datalist2=list() 
for (i in 1:length(desc_check_state)){
  dat=data.frame(desc_check_state[[i]])
  dat2=tibble::rownames_to_column(as.data.frame(dat),"variable")[,-2]
  datalist2[[i]]=dat2
}
namelist2=list()
for(i in 1:length(unique(df_cleaned$STATE_ID))){
  l_names=unique(df_cleaned$STATE_ID)[i]
  namelist2[[i]]=l_names
}
namelist2_2=as.vector(do.call(rbind,namelist2))
# to check whether the first two letters in LEAID are the same as FIPST
check=read_excel("/Users/jayryu/Library/CloudStorage/OneDrive-OhioUniversity/WonJay/Data/NCES/FiscalData/sdf19_2a_xlsx.xlsx",
                 na="NA",sheet="sdf19_2a")
check2=check[,c("LEAID","FIPST")]
check2$STATE_ID=substr(check2$LEAID,1,2)
setdiff(check2$FIPST,check2$STATE_ID)
# Map FIPST in temp3 to STABBR based on namelist2_2
mapped_values <- temp3$STABBR[match(namelist2_2, temp3$FIPST)]
names(datalist2)=mapped_values
ordered_list <- datalist2[order(names(datalist2))]
setwd("/Users/jayryu/Library/CloudStorage/OneDrive-OhioUniversity/WonJay/Result/EdFacts/Fisc_Grad_Prof_Combined")
#treat2%>%writexl::write_xlsx(paste("NEW_Cleaned","_Descriptive_Stat",".xlsx"))
#datalist%>%writexl::write_xlsx(paste("NEW_Yearly","_Descriptive_Stat",".xlsx"))
#datalist2%>%writexl::write_xlsx(paste("NEW_STATE","_Descriptive_Stat",".xlsx"))
#ordered_list%>%writexl::write_xlsx(paste("NEW_STATE","LEAID",".xlsx"))
## endogeneity test needs to be done for each endogenous variable separately, then run IV reg with all of them
Fit_Statistics=list()
for (i in 1:length(Proficiency_Variables)  ) { #length(Proficiency_Variables) 
  formula_str=paste(Proficiency_Variables[i],
                    "~   lag_2_pp_real_F12  + 
                     + pp_real_CapConst  + pp_real_CapLandBuild  + pp_real_Z33 +pp_real_V17 + pp_real_CapEquip +
                    pp_real_T06 + fed_share +loc_share|
   STATE_ID + year|  pp_real_F12   ~   cat_lag_1_pp_real_F12 + pp_instrumental_real_F12  ",collapse="") 
  # with STATE_ID + year, some models fail to satisfy Sargan tests; STATE_ID would be fine for checking 
  # endogeneity for each variable
  formula=as.formula(formula_str)
  reg_base=feols(formula,df_cleaned,cluster=~STATE_ID^year)
  #reg_base=feols(formula,df_cleaned,vcov="cluster") 
  # when vcov="twoway" was used, 
  #Variance contained negative values in the diagonal and was 'fixed' (a la Cameron, Gelbach & Miller 2011).
  #Coefficient_Tables[[i]]=summary(reg_base,DK~year) # with 8 years, estimates are not highly reliable for autocorrelation
  Fit_Statistics[[i]]=list(fitstat(reg_base,~cd+rmse+wh+ivf+ivf2+ivfall+ivwald+sargan)) #kpr is not running; Error 
  # in vcov + (-1)^(i + 1) * vcovClust(index, bread, scores, adj = ssc$cluster.adj &&  : non-conformable arrays
}
names(Fit_Statistics)=Proficiency_Variables
Fit_Statistics
# to check conditional f statistics for multiple instruments; example code lines are in Recycle Bin
#install.packages("OneSampleMR")
library(OneSampleMR)
#install.packages("ivreg", dependencies = TRUE)
library(ivreg)
df_cleaned$STATE_ID2=as.factor(df_cleaned$STATE_ID)
df_cleaned$year2=as.factor(df_cleaned$year)
df_cleaned$log_cat_lag_1_pp_real_CapOtherStock=log(df_cleaned$cat_lag_1_pp_real_CapOtherStock)
df_cleaned$log_cat_lag_1_pp_real_Z33=log(df_cleaned$cat_lag_1_pp_real_Z33)
df_cleaned$log_cat_lag_1_pp_real_real_V17=log(df_cleaned$cat_lag_1_pp_real_V17)
df_cleaned$log_cat_lag_1_pp_real_V15=log(df_cleaned$cat_lag_1_pp_real_V15)
df_cleaned$log_cat_lag_1_pp_real_T06=log(df_cleaned$cat_lag_1_pp_real_T06)
df_cleaned$log_cat_lag_1_pp_Teachers=log(df_cleaned$cat_lag_1_pp_Teachers)
df_cleaned$log_cat_lag_1_pp_Schooladministrators=log(df_cleaned$cat_lag_1_pp_Schooladministrators)
df_cleaned$log_cat_lag_1_pp_StudentSupportServicesStaff=log(df_cleaned$cat_lag_1_pp_StudentSupportServicesStaff)
Conditional_F_Stat_HS=list()
for (i in 1:1){ # length 1 because test is run for each "endogenous" variable
  formula_str=paste(Proficiency_Variables[i],
                    "~ pp_real_F12 + lag_2_pp_real_F12 +
                     pp_real_Z33 + pp_real_V17 + pp_real_T06 + loc_share + fed_share + pp_real_CapConst +
                     pp_real_CapLandBuild + pp_real_CapEquip | pp_real_Z33 + pp_real_V17 + pp_real_T06 + loc_share + fed_share + pp_real_CapConst +
                     pp_real_CapLandBuild + pp_real_CapEquip + cat_lag_1_pp_real_F12 + 
                    cat_lag_3_pp_real_F12 + pp_instrumental_real_F12",collapse="")
  formula=as.formula(formula_str)
  fsw_check=ivreg::ivreg(formula,data=df_cleaned)
  Conditional_F_Stat_HS[[i]]=list(fsw(fsw_check))
  names(Conditional_F_Stat_HS[[i]])=Proficiency_Variables[[i]]
}
Conditional_F_Stat_HS
Conditional_F_Stat_2_6_12=list()
for (i in 2:2){ # length 1 because test is run for each "endogenous" variable
  formula_str=paste(Proficiency_Variables[i],
                    "~  lag_2_pp_real_F12 +
                     pp_real_F12 + pp_real_Z33 + pp_real_V17 + pp_real_T06 + loc_share + fed_share + pp_real_CapConst +
                     pp_real_CapLandBuild + pp_real_CapEquip | pp_real_Z33 + pp_real_V17 + pp_real_T06 + 
                     loc_share + fed_share + lag_2_pp_real_F12 +
                      cat_lag_1_pp_real_F12 + 
                    pp_instrumental_real_CapConst + cat_lag_1_pp_real_CapLandBuild +
                    cat_lag_1_pp_real_CapEquip + lag_5_pp_real_CapConst",collapse="")
  formula=as.formula(formula_str)
  fsw_check=ivreg::ivreg(formula,data=df_cleaned)
  Conditional_F_Stat_2_6_12[[i]]=list(fsw(fsw_check))
  names(Conditional_F_Stat_2_6_12[[i]])=Proficiency_Variables[[i]]
}
Conditional_F_Stat_2_6_12

Conditional_F_Stat_3_to_14=list()
for (i in 3:3){ # length 1 because test is run for each "endogenous" variable
  formula_str=paste(Proficiency_Variables[i],
                    "~  lag_2_pp_real_F12 +
                     pp_real_F12 + pp_real_Z33 + pp_real_V17 + pp_real_T06 + loc_share + fed_share + pp_real_CapConst +
                     pp_real_CapLandBuild + pp_real_CapEquip | pp_real_Z33 + pp_real_V17 + pp_real_T06 + 
                     loc_share + fed_share + pp_real_CapEquip +
                     cat_lag_1_pp_real_F12 + 
                    cat_lag_3_pp_real_F12 + pp_instrumental_real_CapConst + cat_lag_1_pp_real_CapLandBuild +
                     lag_5_pp_real_CapConst
                      ",collapse="")
  formula=as.formula(formula_str)
  fsw_check=ivreg::ivreg(formula,data=df_cleaned)
  Conditional_F_Stat_3_to_14[[i]]=list(fsw(fsw_check))
  names(Conditional_F_Stat_3_to_14[[i]])=Proficiency_Variables[[i]]
}
Conditional_F_Stat_3_to_14
df_cleaned$log_pp_instrumental_real_CapConst=log(df_cleaned$pp_instrumental_real_CapConst)
Conditional_F_Stat_math06=list()
for (i in 5:5){ # length 1 because test is run for each "endogenous" variable
  formula_str=paste(Proficiency_Variables[i],
                    "~ pp_real_F12 + lag_2_pp_real_F12 +
                     pp_real_Z33 + pp_real_V17 + pp_real_T06 + loc_share + fed_share + pp_real_CapConst +
                     pp_real_CapLandBuild + pp_real_CapEquip | pp_real_Z33 + pp_real_V17 + pp_real_T06 + 
                     loc_share + fed_share + pp_real_CapEquip + lag_2_pp_real_F12 +
                    cat_lag_1_pp_real_F12 + 
                     cat_lag_1_pp_real_CapConst + cat_lag_1_pp_real_CapLandBuild + log_pp_instrumental_real_CapConst
                    ",collapse="")
  formula=as.formula(formula_str)
  fsw_check=ivreg::ivreg(formula,data=df_cleaned)
  Conditional_F_Stat_math06[[i]]=list(fsw(fsw_check))
  names(Conditional_F_Stat_math06[[i]])=Proficiency_Variables[[i]]
}
Conditional_F_Stat_math06

## only for the cases with extra control variables missing
#Extra_Variables=c("pct_ALL_White","pp_Teachers", "pp_Schooladministrators","pp_StudentSupportServicesStaff")
#df_cleaned3= df_cleaned %>% filter(if_all(all_of(Extra_Variables), ~ is.na(.)))
#unique(df_cleaned3$STATE_ID)

## IV regression
# *(HS)
Regression_Results_IV_HS=list()
Fit_Statistics_IV=list()
#Coefficient_Tables=list()
index_HS=c(1,8)
for (i in 1:length(index_HS)  ) { #length(Proficiency_Variables) 
  selected_index=index_HS[i]
  formula_str=paste(Proficiency_Variables[selected_index],
                    "~ pp_real_Z33 + pp_real_V17 + pp_real_T06 +  loc_share +fed_share + 
                    pp_real_CapConst + pp_real_CapLandBuild + pp_real_CapEquip |
                    STATE_ID +year |  pp_real_F12 + lag_2_pp_real_F12   ~ cat_lag_1_pp_real_F12 + 
                    cat_lag_3_pp_real_F12 + pp_instrumental_real_F12  ",collapse="") 
  formula=as.formula(formula_str)
  reg_base2=feols(formula,df_cleaned,cluster=~STATE_ID^year) # when vcov="twoway" was used, 
  #Variance contained negative values in the diagonal and was 'fixed' (a la Cameron, Gelbach & Miller 2011).
  reg_base=summary(reg_base2,DK~year) 
  Fit_Statistics_IV[[i]]=list(fitstat(reg_base2,~cd+rmse+wh+ivf+ivf2+ivfall+ivwald+sargan)) #kpr is not running; Error 
  # in vcov + (-1)^(i + 1) * vcovClust(index, bread, scores, adj = ssc$cluster.adj &&  : non-conformable arrays
  
  # Use broom to tidy the results
  tidy_results <- tidy(reg_base)
  glance_results <- glance(reg_base)
  
  # Combine the tidy and glance results into a single data frame
  result_df <- cbind(tidy_results, glance_results)
  
  # regression results
  result_df2=data.frame(cbind(result_df[,1],(round(result_df[,-1],5))))
  result_parameter=result_df2[,1:5]
  result_parameter$Sig=sig_codes(result_parameter$p.value)
  colnames(result_parameter)=reg_col
  result_parameter
  
  result_df3=data.frame((round(result_df2[,6:14],4)))
  result_df3=result_df3[,-4]
  df_long <- pivot_longer(result_df3, cols = colnames(result_df3), names_to = "Variable", values_to = "Value")
  df_long2=df_long[1:length(colnames(result_df3))  ,]
  df_long3=data.frame(df_long2,Var1="",Var2="",Var3="",Var4="")
  colnames(df_long3)=reg_col
  Dependent_Variable=data.frame(Variable="Dependent Variable",Estimate=Proficiency_Variables[i],Var1="",
                                Var2="",Var3="",Var4="")
  colnames(Dependent_Variable)=reg_col
  STATE_ID=data.frame(Variable="STATE_ID",Var1=reg_base$fixef_sizes[1],Var2="",Var3="",Var4="",Var5="")
  #year=data.frame(Variable="year",Var1=reg_base$fixef_sizes[2],Var2="",Var3="",Var4="",Var5="")
  # STATE_ID_year=rbind(STATE_ID,year)
  colnames(STATE_ID)=reg_col
  Regression_Results_IV_HS[[i]]=rbind(result_parameter,df_long3,Dependent_Variable,STATE_ID)
}
names(Regression_Results_IV_HS) <- Proficiency_Variables[index_HS]
Regression_Results_IV_HS
#names(Coefficient_Tables) <- Proficiency_Variables
#Coefficient_Tables
names(Fit_Statistics_IV) = Proficiency_Variables[index_HS]
Fit_Statistics_IV
 
# *(2,6,12)
Regression_Results_IV_2_6_12=list()
Fit_Statistics_IV_2_6_12=list()
#Coefficient_Tables=list()
index_2_6_12=c(2,6,12)
for (i in 1:length(index_2_6_12)  ) { #length(Proficiency_Variables) 
  selected_index=index_2_6_12[i]
  formula_str=paste(Proficiency_Variables[selected_index],
                    "~ lag_2_pp_real_F12 + pp_real_Z33 + pp_real_V17 + pp_real_T06 +  loc_share +fed_share   |
                    STATE_ID +year |  pp_real_F12 + pp_real_CapConst + pp_real_CapLandBuild + pp_real_CapEquip   ~ cat_lag_1_pp_real_F12 + 
                    pp_instrumental_real_CapConst + cat_lag_1_pp_real_CapLandBuild +
                    cat_lag_1_pp_real_CapEquip + cat_lag_5_pp_real_CapConst",collapse="") 
  formula=as.formula(formula_str)
  reg_base2=feols(formula,df_cleaned,cluster=~STATE_ID^year) # when vcov="twoway" was used, 
  #Variance contained negative values in the diagonal and was 'fixed' (a la Cameron, Gelbach & Miller 2011).
  reg_base=summary(reg_base2,DK~year) 
  Fit_Statistics_IV_2_6_12[[i]]=list(fitstat(reg_base2,~cd+rmse+wh+ivf+ivf2+ivfall+ivwald+sargan)) #kpr is not running; Error 
  # in vcov + (-1)^(i + 1) * vcovClust(index, bread, scores, adj = ssc$cluster.adj &&  : non-conformable arrays
  
  # Use broom to tidy the results
  tidy_results <- tidy(reg_base)
  glance_results <- glance(reg_base)
  
  # Combine the tidy and glance results into a single data frame
  result_df <- cbind(tidy_results, glance_results)
  
  # regression results
  result_df2=data.frame(cbind(result_df[,1],(round(result_df[,-1],5))))
  result_parameter=result_df2[,1:5]
  result_parameter$Sig=sig_codes(result_parameter$p.value)
  colnames(result_parameter)=reg_col
  result_parameter
  
  result_df3=data.frame((round(result_df2[,6:14],4)))
  result_df3=result_df3[,-4]
  df_long <- pivot_longer(result_df3, cols = colnames(result_df3), names_to = "Variable", values_to = "Value")
  df_long2=df_long[1:length(colnames(result_df3))  ,]
  df_long3=data.frame(df_long2,Var1="",Var2="",Var3="",Var4="")
  colnames(df_long3)=reg_col
  Dependent_Variable=data.frame(Variable="Dependent Variable",Estimate=Proficiency_Variables[i],Var1="",
                                Var2="",Var3="",Var4="")
  colnames(Dependent_Variable)=reg_col
  STATE_ID=data.frame(Variable="STATE_ID",Var1=reg_base$fixef_sizes[1],Var2="",Var3="",Var4="",Var5="")
  #year=data.frame(Variable="year",Var1=reg_base$fixef_sizes[2],Var2="",Var3="",Var4="",Var5="")
  # STATE_ID_year=rbind(STATE_ID,year)
  colnames(STATE_ID)=reg_col
  Regression_Results_IV_2_6_12[[i]]=rbind(result_parameter,df_long3,Dependent_Variable,STATE_ID)
}
names(Regression_Results_IV_2_6_12) <- Proficiency_Variables[index_2_6_12]
Regression_Results_IV_2_6_12
#names(Coefficient_Tables) <- Proficiency_Variables
#Coefficient_Tables
names(Fit_Statistics_IV_2_6_12) = Proficiency_Variables[index_2_6_12]
Fit_Statistics_IV_2_6_12
 
# *(3_to_14)
Regression_Results_IV_3_to_14=list()
Fit_Statistics_IV_3_to_14=list()
#Coefficient_Tables=list()
index_3_to_14=c(3,4,7,9,10,11,13,14)
for (i in 1:length(index_3_to_14)  ) { #length(Proficiency_Variables) 
  selected_index=index_3_to_14[i]
  formula_str=paste(Proficiency_Variables[selected_index],
                    "~  pp_real_Z33 + pp_real_V17 + pp_real_T06 +  loc_share +fed_share + pp_real_CapEquip  |
                    STATE_ID +year |  pp_real_F12 + lag_2_pp_real_F12 + pp_real_CapConst + pp_real_CapLandBuild    ~ cat_lag_1_pp_real_F12 + 
                    cat_lag_3_pp_real_F12 + pp_instrumental_real_CapConst + cat_lag_1_pp_real_CapLandBuild +
                     lag_5_pp_real_CapConst",collapse="") 
  formula=as.formula(formula_str)
  reg_base2=feols(formula,df_cleaned,cluster=~STATE_ID^year) # when vcov="twoway" was used, 
  #Variance contained negative values in the diagonal and was 'fixed' (a la Cameron, Gelbach & Miller 2011).
  reg_base=summary(reg_base2,DK~year) 
  Fit_Statistics_IV_3_to_14[[i]]=list(fitstat(reg_base2,~cd+rmse+wh+ivf+ivf2+ivfall+ivwald+sargan)) #kpr is not running; Error 
  # in vcov + (-1)^(i + 1) * vcovClust(index, bread, scores, adj = ssc$cluster.adj &&  : non-conformable arrays
  
  # Use broom to tidy the results
  tidy_results <- tidy(reg_base)
  glance_results <- glance(reg_base)
  
  # Combine the tidy and glance results into a single data frame
  result_df <- cbind(tidy_results, glance_results)
  
  # regression results
  result_df2=data.frame(cbind(result_df[,1],(round(result_df[,-1],5))))
  result_parameter=result_df2[,1:5]
  result_parameter$Sig=sig_codes(result_parameter$p.value)
  colnames(result_parameter)=reg_col
  result_parameter
  
  result_df3=data.frame((round(result_df2[,6:14],4)))
  result_df3=result_df3[,-4]
  df_long <- pivot_longer(result_df3, cols = colnames(result_df3), names_to = "Variable", values_to = "Value")
  df_long2=df_long[1:length(colnames(result_df3))  ,]
  df_long3=data.frame(df_long2,Var1="",Var2="",Var3="",Var4="")
  colnames(df_long3)=reg_col
  Dependent_Variable=data.frame(Variable="Dependent Variable",Estimate=Proficiency_Variables[i],Var1="",
                                Var2="",Var3="",Var4="")
  colnames(Dependent_Variable)=reg_col
  STATE_ID=data.frame(Variable="STATE_ID",Var1=reg_base$fixef_sizes[1],Var2="",Var3="",Var4="",Var5="")
  #year=data.frame(Variable="year",Var1=reg_base$fixef_sizes[2],Var2="",Var3="",Var4="",Var5="")
  # STATE_ID_year=rbind(STATE_ID,year)
  colnames(STATE_ID)=reg_col
  Regression_Results_IV_3_to_14[[i]]=rbind(result_parameter,df_long3,Dependent_Variable,STATE_ID)
}
names(Regression_Results_IV_3_to_14) <- Proficiency_Variables[index_3_to_14]
Regression_Results_IV_3_to_14
#names(Coefficient_Tables) <- Proficiency_Variables
#Coefficient_Tables
names(Fit_Statistics_IV_3_to_14) = Proficiency_Variables[index_3_to_14]
Fit_Statistics_IV_3_to_14
 
Regression_Results_IV_math06=list()
Fit_Statistics_IV_math06=list()
#Coefficient_Tables=list()
index_math06=c(5)
for (i in 1:length(index_math06)  ) { #length(Proficiency_Variables) 
  selected_index=index_math06[i]
  formula_str=paste(Proficiency_Variables[selected_index],
                    "~ lag_2_pp_real_F12 + pp_real_Z33 + pp_real_V17 + pp_real_T06 +  loc_share +fed_share + 
                     pp_real_CapEquip |
                    STATE_ID +year |  pp_real_F12 + pp_real_CapConst + pp_real_CapLandBuild    ~ cat_lag_1_pp_real_F12 + 
                     cat_lag_1_pp_real_CapConst + cat_lag_1_pp_real_CapLandBuild + log(pp_instrumental_real_CapConst)  ",collapse="") 
  formula=as.formula(formula_str)
  reg_base2=feols(formula,df_cleaned,cluster=~STATE_ID^year) # when vcov="twoway" was used, 
  #Variance contained negative values in the diagonal and was 'fixed' (a la Cameron, Gelbach & Miller 2011).
  reg_base=summary(reg_base2,DK~year) 
  Fit_Statistics_IV_math06[[i]]=list(fitstat(reg_base2,~cd+rmse+wh+ivf+ivf2+ivfall+ivwald+sargan)) #kpr is not running; Error 
  # in vcov + (-1)^(i + 1) * vcovClust(index, bread, scores, adj = ssc$cluster.adj &&  : non-conformable arrays
  
  # Use broom to tidy the results
  tidy_results <- tidy(reg_base)
  glance_results <- glance(reg_base)
  
  # Combine the tidy and glance results into a single data frame
  result_df <- cbind(tidy_results, glance_results)
  
  # regression results
  result_df2=data.frame(cbind(result_df[,1],(round(result_df[,-1],5))))
  result_parameter=result_df2[,1:5]
  result_parameter$Sig=sig_codes(result_parameter$p.value)
  colnames(result_parameter)=reg_col
  result_parameter
  
  result_df3=data.frame((round(result_df2[,6:14],4)))
  result_df3=result_df3[,-4]
  df_long <- pivot_longer(result_df3, cols = colnames(result_df3), names_to = "Variable", values_to = "Value")
  df_long2=df_long[1:length(colnames(result_df3))  ,]
  df_long3=data.frame(df_long2,Var1="",Var2="",Var3="",Var4="")
  colnames(df_long3)=reg_col
  Dependent_Variable=data.frame(Variable="Dependent Variable",Estimate=Proficiency_Variables[i],Var1="",
                                Var2="",Var3="",Var4="")
  colnames(Dependent_Variable)=reg_col
  STATE_ID=data.frame(Variable="STATE_ID",Var1=reg_base$fixef_sizes[1],Var2="",Var3="",Var4="",Var5="")
  #year=data.frame(Variable="year",Var1=reg_base$fixef_sizes[2],Var2="",Var3="",Var4="",Var5="")
  # STATE_ID_year=rbind(STATE_ID,year)
  colnames(STATE_ID)=reg_col
  Regression_Results_IV_math06[[i]]=rbind(result_parameter,df_long3,Dependent_Variable,STATE_ID)
}
names(Regression_Results_IV_math06) <- Proficiency_Variables[index_math06]
Regression_Results_IV_math06
#names(Coefficient_Tables) <- Proficiency_Variables
#Coefficient_Tables
names(Fit_Statistics_IV_math06) = Proficiency_Variables[index_math06]
Fit_Statistics_IV_math06

setwd("/Users/jayryu/Library/CloudStorage/OneDrive-OhioUniversity/WonJay/Result/EdFacts/Fisc_Grad_Prof_Combined/SEL_State_RegressionResults")
Regression_Results_IV_HS%>%writexl::write_xlsx(paste("Regression_","Results_IV_HS",".xlsx"))
Regression_Results_IV_2_6_12%>%writexl::write_xlsx(paste("Regression_","Results_IV_2_6_12",".xlsx"))
Regression_Results_IV_3_to_14%>%writexl::write_xlsx(paste("Regression_","Results_IV_3_to_14",".xlsx"))
Regression_Results_IV_math06%>%writexl::write_xlsx(paste("Regression_","Results_IV_math06",".xlsx"))
######## high school graduation rate
# endogeneity test for each endogenous variable 
reg_base <- feols(ALL2 ~  pp_real_F12 + lag_2_pp_real_F12 + pp_real_CapConst +
                    pp_real_CapLandBuild + pp_real_Z33 + pp_real_V17 + pp_real_T06 +
                    loc_share + fed_share|
                    STATE_ID + year |  pp_real_CapEquip  ~   cat_lag_1_pp_real_CapEquip + 
                    cat_lag_5_pp_real_CapEquip    ,
                  data = df_cleaned, cluster=~STATE_ID^year)
fit_grad=list(fitstat(reg_base,~cd+rmse+wh+ivf+ivf2+ivfall+ivwald +sargan))
fit_grad
#IV regression
reg_base2=feols(ALL2 ~  pp_real_F12 + lag_2_pp_real_F12 + 
                  pp_real_CapLandBuild + pp_real_Z33 + pp_real_V17 + pp_real_T06 +
                  loc_share + fed_share|
                  STATE_ID + year | pp_real_CapConst + pp_real_CapEquip  ~ cat_lag_1_pp_real_CapConst +
                  cat_lag_1_pp_real_CapEquip + cat_lag_5_pp_real_CapConst ,
                df_cleaned,cluster=~STATE_ID^year)
reg_base3=summary(reg_base2,DK~year) 
reg_base3
fit_grad2=list(fitstat(reg_base2,~cd+rmse+wh+ivf+ivf2+ivfall+ivwald+sargan))
fit_grad2
# conditional F test
formula_str=paste("ALL2",
                  "~ pp_real_F12 + lag_2_pp_real_F12 + pp_real_CapConst +
                  pp_real_CapLandBuild + pp_real_CapEquip + pp_real_Z33 + pp_real_V17 + pp_real_T06 +
                  loc_share + fed_share | pp_real_F12 + lag_2_pp_real_F12 + 
                  pp_real_CapLandBuild +  pp_real_Z33 + pp_real_V17 + pp_real_T06 +
                  loc_share + fed_share +
                    cat_lag_1_pp_real_CapConst + 
                    cat_lag_1_pp_real_CapEquip + lag_5_pp_real_CapConst",collapse="")
formula=as.formula(formula_str)
fsw_check=ivreg::ivreg(formula,data=df_cleaned)
Conditional_F_Stat_GradRate=(fsw(fsw_check))
Conditional_F_Stat_GradRate

# Use broom to tidy the results
tidy_results <- tidy(reg_base3)
glance_results <- glance(reg_base3)
# Combine the tidy and glance results into a single data frame
result_df <- cbind(tidy_results, glance_results)
# regression results
result_df2=data.frame(cbind(result_df[,1],(round(result_df[,-1],5))))
result_parameter=result_df2[,1:5]
result_parameter$Sig=sig_codes(result_parameter$p.value)
colnames(result_parameter)=reg_col
result_df3=data.frame((round(result_df2[,6:14],4)))
result_df3=result_df3[,-4]
df_long <- pivot_longer(result_df3, cols = colnames(result_df3), names_to = "Variable", values_to = "Value")
df_long2=df_long[1:length(colnames(result_df3))  ,]
df_long3=data.frame(df_long2,Var1="",Var2="",Var3="",Var4="")
colnames(df_long3)=reg_col
Dependent_Variable=data.frame(Variable="Dependent Variable",Estimate="ALL2",Var1="",
                              Var2="",Var3="",Var4="")
colnames(Dependent_Variable)=reg_col
STATE_ID=data.frame(Variable="STATE_ID",Var1=reg_base2$fixef_sizes[1],Var2="",Var3="",Var4="",Var5="")
colnames(STATE_ID)=reg_col
Regression_Results_Grad_Rate_IV=rbind(result_parameter,df_long3,Dependent_Variable,STATE_ID)
Grad_Rate_Regression_Combined=list(Regression_Results_Grad_Rate_IV
)
names(Grad_Rate_Regression_Combined)=c("GradR_IV_Regression")
Grad_Rate_Regression_Combined%>%writexl::write_xlsx(paste("NEW_Grad_Rate_Regression_","IV",".xlsx"))
#############################################################################
########## robustness checks with pct_ALL_White interacted, etc. #####################
#############################################################################
data_merge=df2[,c("LEAID","year","LEAAdministrators")]
df_cleaned2=merge(x=df_cleaned,y=data_merge,by=c("LEAID","year"),all.x = TRUE)
df_cleaned2$pp_LEAAdministrators=df_cleaned2$LEAAdministrators/df_cleaned2$V33
cap_var=c("pp_real_F12","lag_2_pp_real_F12","pp_real_CapConst","pp_real_CapLandBuild","pp_real_CapEquip")
for (i in 1:length(cap_var)){
  df_cleaned2[[paste0("int_",cap_var[i])]]=df_cleaned2$pct_ALL_White * df_cleaned2[[cap_var[i]]]
}
cap_var2=cap_var[-2]
for (i in 1:length(cap_var2)){
  df_cleaned2[[paste0("int_cat_lag_1_",cap_var2[i])]]=
    df_cleaned2$pct_ALL_White * df_cleaned2[[paste0("cat_lag_1_",cap_var2[i])]]
}
df_cleaned2$int_cat_lag_3_pp_real_F12=df_cleaned2$pct_ALL_White * df_cleaned2$cat_lag_3_pp_real_F12
corr.test(df_cleaned2$pct_ALL_White,df_cleaned2$int_pp_real_F12)
## IV regression
# *(HS)
Regression_Results_IV_HS=list()
Fit_Statistics_IV=list()
#Coefficient_Tables=list()
index_HS=c(1,8)
for (i in 1:length(index_HS)  ) { #length(Proficiency_Variables) 
  selected_index=index_HS[i]
  formula_str=paste(Proficiency_Variables[selected_index],
                    "~ pp_real_Z33 + pp_real_V17 +  pp_real_T06 +   loc_share +fed_share + 
                    pp_real_CapConst + pp_real_CapLandBuild + pp_real_CapEquip + 
                   int_pp_real_CapConst + int_pp_real_CapLandBuild + int_pp_real_CapEquip +
                   pp_Schooladministrators + pp_Teachers + pp_StudentSupportServicesStaff |
                    STATE_ID +year | int_pp_real_F12 + #when pp_real_F12 and lag_2_pp_real_F12 are included, negative r^2 & weak IV
                    int_lag_2_pp_real_F12 + pct_ALL_White  ~  
                    int_cat_lag_1_pp_real_F12 + int_cat_lag_3_pp_real_F12 + pp_instrumental_real_F12 
                    + cat_lag_1_pct_ALL_White ",collapse="") 
  formula=as.formula(formula_str)
  reg_base2=feols(formula,df_cleaned2,cluster=~STATE_ID^year) # when vcov="twoway" was used, 
  #Variance contained negative values in the diagonal and was 'fixed' (a la Cameron, Gelbach & Miller 2011).
  reg_base=summary(reg_base2,DK~year) 
  Fit_Statistics_IV[[i]]=list(fitstat(reg_base2,~cd+rmse+wh+ivf+ivf2+ivfall+ivwald+sargan)) #kpr is not running; Error 
  # in vcov + (-1)^(i + 1) * vcovClust(index, bread, scores, adj = ssc$cluster.adj &&  : non-conformable arrays
  
  # Use broom to tidy the results
  tidy_results <- tidy(reg_base)
  glance_results <- glance(reg_base)
  
  # Combine the tidy and glance results into a single data frame
  result_df <- cbind(tidy_results, glance_results)
  
  # regression results
  result_df2=data.frame(cbind(result_df[,1],(round(result_df[,-1],5))))
  result_parameter=result_df2[,1:5]
  result_parameter$Sig=sig_codes(result_parameter$p.value)
  colnames(result_parameter)=reg_col
  result_parameter
  
  result_df3=data.frame((round(result_df2[,6:14],4)))
  result_df3=result_df3[,-4]
  df_long <- pivot_longer(result_df3, cols = colnames(result_df3), names_to = "Variable", values_to = "Value")
  df_long2=df_long[1:length(colnames(result_df3))  ,]
  df_long3=data.frame(df_long2,Var1="",Var2="",Var3="",Var4="")
  colnames(df_long3)=reg_col
  Dependent_Variable=data.frame(Variable="Dependent Variable",Estimate=Proficiency_Variables[index_HS][i],Var1="",
                                Var2="",Var3="",Var4="")
  colnames(Dependent_Variable)=reg_col
  STATE_ID=data.frame(Variable="STATE_ID",Var1=reg_base$fixef_sizes[1],Var2="",Var3="",Var4="",Var5="")
  #year=data.frame(Variable="year",Var1=reg_base$fixef_sizes[2],Var2="",Var3="",Var4="",Var5="")
  # STATE_ID_year=rbind(STATE_ID,year)
  colnames(STATE_ID)=reg_col
  Regression_Results_IV_HS[[i]]=rbind(result_parameter,df_long3,Dependent_Variable,STATE_ID)
}
names(Regression_Results_IV_HS) <- Proficiency_Variables[index_HS]
Regression_Results_IV_HS
#names(Coefficient_Tables) <- Proficiency_Variables
#Coefficient_Tables
names(Fit_Statistics_IV) = Proficiency_Variables[index_HS]
Fit_Statistics_IV

Conditional_F_Stat_HS=list()
for (i in 1:1){ # length 1 because test is run for each "endogenous" variable
  formula_str=paste(Proficiency_Variables[i],
                    "~ int_pp_real_F12 + int_lag_2_pp_real_F12 +
                     pp_real_Z33 + pp_real_V17 + pp_real_T06 + loc_share + fed_share + pp_real_CapConst +
                     pp_real_CapLandBuild + pp_real_CapEquip +
                      pp_Schooladministrators + pp_Teachers + pp_StudentSupportServicesStaff +
                      pct_ALL_White| pp_real_Z33 + pp_real_V17 + pp_real_T06 + loc_share + fed_share + pp_real_CapConst +
                     pp_real_CapLandBuild + pp_real_CapEquip + int_cat_lag_1_pp_real_F12 + 
                    int_cat_lag_3_pp_real_F12 + pp_instrumental_real_F12 +
                     pp_Schooladministrators + pp_Teachers + pp_StudentSupportServicesStaff +
                    cat_lag_1_pct_ALL_White",collapse="")
  formula=as.formula(formula_str)
  fsw_check=ivreg::ivreg(formula,data=df_cleaned2)
  Conditional_F_Stat_HS[[i]]=list(fsw(fsw_check))
  names(Conditional_F_Stat_HS[[i]])=Proficiency_Variables[[i]]
}
Conditional_F_Stat_HS
 
# *(2,6,12)
Regression_Results_IV_2_6_12=list()
Fit_Statistics_IV_2_6_12=list()
#Coefficient_Tables=list()
index_2_6_12=c(2,6,12)
for (i in 1:length(index_2_6_12)  ) { #length(Proficiency_Variables) 
  selected_index=index_2_6_12[i]
  formula_str=paste(Proficiency_Variables[selected_index],
                    "~ lag_2_pp_real_F12 + int_lag_2_pp_real_F12 + pp_real_Z33 + pp_real_V17 +
                    pp_real_T06 +  loc_share +fed_share + pp_Schooladministrators + pp_Teachers + pp_StudentSupportServicesStaff |
                    STATE_ID +year | pp_real_F12 + pp_real_CapConst + pp_real_CapLandBuild + pp_real_CapEquip +
                    int_pp_real_F12 + int_pp_real_CapConst + int_pp_real_CapLandBuild +
                    int_pp_real_CapEquip + pct_ALL_White  ~ cat_lag_1_pp_real_F12 + pp_instrumental_real_CapConst +
                    cat_lag_1_pp_real_CapLandBuild + cat_lag_1_pp_real_CapEquip +
                    int_cat_lag_1_pp_real_F12 + int_cat_lag_1_pp_real_CapConst + int_cat_lag_1_pp_real_CapLandBuild +
                    int_cat_lag_1_pp_real_CapEquip + cat_lag_5_pp_real_CapConst +
                    cat_lag_1_pct_ALL_White",collapse="")
  
  formula=as.formula(formula_str)
  reg_base2=feols(formula,df_cleaned2,cluster=~STATE_ID^year) # when vcov="twoway" was used, 
  #Variance contained negative values in the diagonal and was 'fixed' (a la Cameron, Gelbach & Miller 2011).
  reg_base=summary(reg_base2,DK~year) 
  Fit_Statistics_IV_2_6_12[[i]]=list(fitstat(reg_base2,~cd+rmse+wh+ivf+ivf2+ivfall+ivwald+sargan)) #kpr is not running; Error 
  # in vcov + (-1)^(i + 1) * vcovClust(index, bread, scores, adj = ssc$cluster.adj &&  : non-conformable arrays
  
  # Use broom to tidy the results
  tidy_results <- tidy(reg_base)
  glance_results <- glance(reg_base)
  
  # Combine the tidy and glance results into a single data frame
  result_df <- cbind(tidy_results, glance_results)
  
  # regression results
  result_df2=data.frame(cbind(result_df[,1],(round(result_df[,-1],5))))
  result_parameter=result_df2[,1:5]
  result_parameter$Sig=sig_codes(result_parameter$p.value)
  colnames(result_parameter)=reg_col
  result_parameter
  
  result_df3=data.frame((round(result_df2[,6:14],4)))
  result_df3=result_df3[,-4]
  df_long <- pivot_longer(result_df3, cols = colnames(result_df3), names_to = "Variable", values_to = "Value")
  df_long2=df_long[1:length(colnames(result_df3))  ,]
  df_long3=data.frame(df_long2,Var1="",Var2="",Var3="",Var4="")
  colnames(df_long3)=reg_col
  Dependent_Variable=data.frame(Variable="Dependent Variable",Estimate=Proficiency_Variables[index_2_6_12][i],Var1="",
                                Var2="",Var3="",Var4="")
  colnames(Dependent_Variable)=reg_col
  STATE_ID=data.frame(Variable="STATE_ID",Var1=reg_base$fixef_sizes[1],Var2="",Var3="",Var4="",Var5="")
  #year=data.frame(Variable="year",Var1=reg_base$fixef_sizes[2],Var2="",Var3="",Var4="",Var5="")
  # STATE_ID_year=rbind(STATE_ID,year)
  colnames(STATE_ID)=reg_col
  Regression_Results_IV_2_6_12[[i]]=rbind(result_parameter,df_long3,Dependent_Variable,STATE_ID)
}
names(Regression_Results_IV_2_6_12) <- Proficiency_Variables[index_2_6_12]
Regression_Results_IV_2_6_12
#names(Coefficient_Tables) <- Proficiency_Variables
#Coefficient_Tables
names(Fit_Statistics_IV_2_6_12) = Proficiency_Variables[index_2_6_12]
Fit_Statistics_IV_2_6_12
Conditional_F_Stat_2_6_12=list()
for (i in 2:2){ # length 1 because test is run for each "endogenous" variable
  formula_str=paste(Proficiency_Variables[i],
                    "~  lag_2_pp_real_F12 +
                     pp_real_Z33 + pp_real_V17 + pp_real_T06 + loc_share + fed_share + pp_real_CapConst +
                     pp_real_CapLandBuild + pp_real_CapEquip + pp_real_F12 + pp_real_CapConst + pp_real_CapLandBuild + pp_real_CapEquip +
                    int_pp_real_F12 + int_pp_real_CapConst + int_pp_real_CapLandBuild +
                    int_pp_real_CapEquip +
                    pp_Schooladministrators + pp_Teachers + pp_StudentSupportServicesStaff +
                    pct_ALL_White| pp_real_Z33 + pp_real_V17 + pp_real_T06 + 
                     loc_share + fed_share + lag_2_pp_real_F12 +
                      cat_lag_1_pp_real_F12 + pp_instrumental_real_CapConst +
                    cat_lag_1_pp_real_CapLandBuild + cat_lag_1_pp_real_CapEquip +
                    int_cat_lag_1_pp_real_F12 + int_cat_lag_1_pp_real_CapConst + int_cat_lag_1_pp_real_CapLandBuild +
                    int_cat_lag_1_pp_real_CapEquip + cat_lag_5_pp_real_CapConst +
                    pp_Schooladministrators + pp_Teachers + pp_StudentSupportServicesStaff +
                    cat_lag_1_pct_ALL_White",collapse="")
  formula=as.formula(formula_str)
  fsw_check=ivreg::ivreg(formula,data=df_cleaned2)
  Conditional_F_Stat_2_6_12[[i]]=list(fsw(fsw_check))
  names(Conditional_F_Stat_2_6_12[[i]])=Proficiency_Variables[[i]]
}
Conditional_F_Stat_2_6_12

# *(3_to_14)
Regression_Results_IV_3_to_14=list()
Fit_Statistics_IV_3_to_14=list()
#Coefficient_Tables=list()
index_3_to_14=c(3,4,7,9,10,11,13,14)
for (i in 1:length(index_3_to_14)  ) { #length(Proficiency_Variables) 
  selected_index=index_3_to_14[i]
  formula_str=paste(Proficiency_Variables[selected_index],
                    "~ pp_real_Z33 + pp_real_V17  + pp_real_T06 +  loc_share +fed_share + pp_real_CapEquip +int_pp_real_CapEquip +
                    + pp_Schooladministrators + pp_Teachers + pp_StudentSupportServicesStaff |
                    STATE_ID +year |  pp_real_F12 + lag_2_pp_real_F12 + pp_real_CapConst + pp_real_CapLandBuild +
                    int_pp_real_F12 + int_lag_2_pp_real_F12 + int_pp_real_CapConst + 
                    int_pp_real_CapLandBuild + pct_ALL_White ~ cat_lag_1_pp_real_F12 + 
                    cat_lag_3_pp_real_F12 + cat_lag_1_pp_real_CapConst + cat_lag_1_pp_real_CapLandBuild +
                     pp_instrumental_real_CapConst + int_cat_lag_1_pp_real_F12 + 
                    int_cat_lag_3_pp_real_F12 + int_cat_lag_1_pp_real_CapConst + 
                    int_cat_lag_1_pp_real_CapLandBuild + cat_lag_1_pct_ALL_White",collapse="")
  formula=as.formula(formula_str)
  reg_base2=feols(formula,df_cleaned2,cluster=~STATE_ID^year) # when vcov="twoway" was used, 
  #Variance contained negative values in the diagonal and was 'fixed' (a la Cameron, Gelbach & Miller 2011).
  reg_base=summary(reg_base2,DK~year) 
  Fit_Statistics_IV_3_to_14[[i]]=list(fitstat(reg_base2,~cd+rmse+wh+ivf+ivf2+ivfall+ivwald+sargan)) #kpr is not running; Error 
  # in vcov + (-1)^(i + 1) * vcovClust(index, bread, scores, adj = ssc$cluster.adj &&  : non-conformable arrays
  
  # Use broom to tidy the results
  tidy_results <- tidy(reg_base)
  glance_results <- glance(reg_base)
  
  # Combine the tidy and glance results into a single data frame
  result_df <- cbind(tidy_results, glance_results)
  
  # regression results
  result_df2=data.frame(cbind(result_df[,1],(round(result_df[,-1],5))))
  result_parameter=result_df2[,1:5]
  result_parameter$Sig=sig_codes(result_parameter$p.value)
  colnames(result_parameter)=reg_col
  result_parameter
  
  result_df3=data.frame((round(result_df2[,6:14],4)))
  result_df3=result_df3[,-4]
  df_long <- pivot_longer(result_df3, cols = colnames(result_df3), names_to = "Variable", values_to = "Value")
  df_long2=df_long[1:length(colnames(result_df3))  ,]
  df_long3=data.frame(df_long2,Var1="",Var2="",Var3="",Var4="")
  colnames(df_long3)=reg_col
  Dependent_Variable=data.frame(Variable="Dependent Variable",Estimate=Proficiency_Variables[index_3_to_14][i],Var1="",
                                Var2="",Var3="",Var4="")
  colnames(Dependent_Variable)=reg_col
  STATE_ID=data.frame(Variable="STATE_ID",Var1=reg_base$fixef_sizes[1],Var2="",Var3="",Var4="",Var5="")
  #year=data.frame(Variable="year",Var1=reg_base$fixef_sizes[2],Var2="",Var3="",Var4="",Var5="")
  # STATE_ID_year=rbind(STATE_ID,year)
  colnames(STATE_ID)=reg_col
  Regression_Results_IV_3_to_14[[i]]=rbind(result_parameter,df_long3,Dependent_Variable,STATE_ID)
}
names(Regression_Results_IV_3_to_14) <- Proficiency_Variables[index_3_to_14]
Regression_Results_IV_3_to_14
#names(Coefficient_Tables) <- Proficiency_Variables
#Coefficient_Tables
names(Fit_Statistics_IV_3_to_14) = Proficiency_Variables[index_3_to_14]
Fit_Statistics_IV_3_to_14

Conditional_F_Stat_3_to_14=list()
for (i in 3:3){ # length 1 because test is run for each "endogenous" variable
  formula_str=paste(Proficiency_Variables[i],
                    "~   pp_real_Z33 + pp_real_V17 + pp_real_T06 + loc_share + fed_share  + pp_real_CapEquip +
                     pp_real_F12 + lag_2_pp_real_F12 + pp_real_CapConst + pp_real_CapLandBuild +
                    int_pp_real_F12 + int_lag_2_pp_real_F12 + int_pp_real_CapConst + int_pp_real_CapLandBuild +
                    pct_ALL_White + pp_Schooladministrators + pp_Teachers + pp_StudentSupportServicesStaff | pp_real_Z33 + pp_real_V17 + pp_real_T06 + 
                     loc_share + fed_share + pp_real_CapEquip +
                     cat_lag_1_pp_real_F12 + 
                    cat_lag_3_pp_real_F12 + cat_lag_1_pp_real_CapConst + cat_lag_1_pp_real_CapLandBuild +
                     pp_instrumental_real_CapConst + int_cat_lag_1_pp_real_F12 + 
                    int_cat_lag_3_pp_real_F12 + int_cat_lag_1_pp_real_CapConst + int_cat_lag_1_pp_real_CapLandBuild +
                    + pp_Schooladministrators + pp_Teachers + pp_StudentSupportServicesStaff +
                    cat_lag_1_pct_ALL_White
                      ",collapse="")
  formula=as.formula(formula_str)
  fsw_check=ivreg::ivreg(formula,data=df_cleaned2)
  Conditional_F_Stat_3_to_14[[i]]=list(fsw(fsw_check))
  names(Conditional_F_Stat_3_to_14[[i]])=Proficiency_Variables[[i]]
}
Conditional_F_Stat_3_to_14
Regression_Results_IV_math06=list()
Fit_Statistics_IV_math06=list()
#Coefficient_Tables=list()
index_math06=c(5)
for (i in 1:length(index_math06)  ) { #length(Proficiency_Variables) 
  selected_index=index_math06[i]
  formula_str=paste(Proficiency_Variables[selected_index],
                    "~ lag_2_pp_real_F12 + int_lag_2_pp_real_F12 + pp_real_Z33 + pp_real_V17  + pp_real_T06 +   loc_share +fed_share + 
                     pp_real_CapEquip + int_pp_real_CapEquip +
pp_Schooladministrators + pp_Teachers + pp_StudentSupportServicesStaff|
                    STATE_ID +year |  pp_real_F12 + pp_real_CapConst + pp_real_CapLandBuild +
                    int_pp_real_F12 + int_pp_real_CapConst + int_pp_real_CapLandBuild +
                    pct_ALL_White ~ cat_lag_1_pp_real_F12 + 
                     cat_lag_1_pp_real_CapConst + cat_lag_1_pp_real_CapLandBuild + 
                    int_cat_lag_1_pp_real_F12 + int_cat_lag_1_pp_real_CapConst +
                    int_cat_lag_1_pp_real_CapLandBuild + log(pp_instrumental_real_CapConst) +
                    cat_lag_1_pct_ALL_White",collapse="")
  formula=as.formula(formula_str)
  reg_base2=feols(formula,df_cleaned2,cluster=~STATE_ID^year) # when vcov="twoway" was used, 
  #Variance contained negative values in the diagonal and was 'fixed' (a la Cameron, Gelbach & Miller 2011).
  reg_base=summary(reg_base2,DK~year) 
  Fit_Statistics_IV_math06[[i]]=list(fitstat(reg_base2,~cd+rmse+wh+ivf+ivf2+ivfall+ivwald+sargan)) #kpr is not running; Error 
  # in vcov + (-1)^(i + 1) * vcovClust(index, bread, scores, adj = ssc$cluster.adj &&  : non-conformable arrays
  
  # Use broom to tidy the results
  tidy_results <- tidy(reg_base)
  glance_results <- glance(reg_base)
  
  # Combine the tidy and glance results into a single data frame
  result_df <- cbind(tidy_results, glance_results)
  
  # regression results
  result_df2=data.frame(cbind(result_df[,1],(round(result_df[,-1],5))))
  result_parameter=result_df2[,1:5]
  result_parameter$Sig=sig_codes(result_parameter$p.value)
  colnames(result_parameter)=reg_col
  result_parameter
  
  result_df3=data.frame((round(result_df2[,6:14],4)))
  result_df3=result_df3[,-4]
  df_long <- pivot_longer(result_df3, cols = colnames(result_df3), names_to = "Variable", values_to = "Value")
  df_long2=df_long[1:length(colnames(result_df3))  ,]
  df_long3=data.frame(df_long2,Var1="",Var2="",Var3="",Var4="")
  colnames(df_long3)=reg_col
  Dependent_Variable=data.frame(Variable="Dependent Variable",Estimate=Proficiency_Variables[index_math06][i],Var1="",
                                Var2="",Var3="",Var4="")
  colnames(Dependent_Variable)=reg_col
  STATE_ID=data.frame(Variable="STATE_ID",Var1=reg_base$fixef_sizes[1],Var2="",Var3="",Var4="",Var5="")
  #year=data.frame(Variable="year",Var1=reg_base$fixef_sizes[2],Var2="",Var3="",Var4="",Var5="")
  # STATE_ID_year=rbind(STATE_ID,year)
  colnames(STATE_ID)=reg_col
  Regression_Results_IV_math06[[i]]=rbind(result_parameter,df_long3,Dependent_Variable,STATE_ID)
}
names(Regression_Results_IV_math06) <- Proficiency_Variables[index_math06]
Regression_Results_IV_math06
#names(Coefficient_Tables) <- Proficiency_Variables
#Coefficient_Tables
names(Fit_Statistics_IV_math06) = Proficiency_Variables[index_math06]
Fit_Statistics_IV_math06

df_cleaned$log_pp_instrumental_real_CapConst=log(df_cleaned$pp_instrumental_real_CapConst)
Conditional_F_Stat_math06=list()
for (i in 5:5){ # length 1 because test is run for each "endogenous" variable
  formula_str=paste(Proficiency_Variables[i],
                    "~ pp_real_F12 + lag_2_pp_real_F12 +
                     pp_real_Z33 + pp_real_V17 + pp_real_T06 + loc_share + fed_share + pp_real_CapConst +
                     pp_real_CapLandBuild + pp_real_CapEquip + int_pp_real_F12 +
                     int_pp_real_CapConst + int_pp_real_CapLandBuild + pct_ALL_White +
                     pp_Schooladministrators + pp_Teachers + pp_StudentSupportServicesStaff| pp_real_Z33 + pp_real_V17 + pp_real_T06 + 
                     loc_share + fed_share + 
                     pp_real_CapEquip + lag_2_pp_real_F12 + 
                     pp_Schooladministrators + pp_Teachers + pp_StudentSupportServicesStaff +
                    cat_lag_1_pp_real_F12 + 
                     cat_lag_1_pp_real_CapConst + cat_lag_1_pp_real_CapLandBuild + 
                    int_cat_lag_1_pp_real_F12 + int_cat_lag_1_pp_real_CapConst +
                    int_cat_lag_1_pp_real_CapLandBuild  + log_pp_instrumental_real_CapConst +
                    cat_lag_1_pct_ALL_White
                    ",collapse="")
  formula=as.formula(formula_str)
  fsw_check=ivreg::ivreg(formula,data=df_cleaned2)
  Conditional_F_Stat_math06[[i]]=list(fsw(fsw_check))
  names(Conditional_F_Stat_math06[[i]])=Proficiency_Variables[[i]]
}
Conditional_F_Stat_math06

setwd("/Users/jayryu/Library/CloudStorage/OneDrive-OhioUniversity/WonJay/Result/EdFacts/Fisc_Grad_Prof_Combined/SEL_State_RegressionResults")
Regression_Results_IV_HS%>%writexl::write_xlsx(paste("Regression_","Results_IV_HS_INT_White",".xlsx"))
Regression_Results_IV_2_6_12%>%writexl::write_xlsx(paste("Regression_","Results_IV_2_6_12_INT_White",".xlsx"))
Regression_Results_IV_3_to_14%>%writexl::write_xlsx(paste("Regression_","Results_IV_3_to_14_INT_White",".xlsx"))
Regression_Results_IV_math06%>%writexl::write_xlsx(paste("Regression_","Results_IV_math06_INT_White",".xlsx"))
######## high school graduation rate

#IV regression
reg_base2=feols(ALL2 ~  pp_real_F12 + lag_2_pp_real_F12 + int_pp_real_F12 + int_lag_2_pp_real_F12 +
                  pp_real_CapLandBuild + int_pp_real_CapLandBuild + pp_real_Z33 + pp_real_V17 + pp_real_T06 +
                  loc_share + fed_share  +
                  pp_Schooladministrators + pp_Teachers + pp_StudentSupportServicesStaff|
                  STATE_ID + year | pp_real_CapConst + pp_real_CapEquip +
                  int_pp_real_CapConst + int_pp_real_CapEquip + pct_ALL_White ~ cat_lag_1_pp_real_CapConst +
                  cat_lag_1_pp_real_CapEquip + int_cat_lag_1_pp_real_CapConst +
                  int_cat_lag_1_pp_real_CapEquip + cat_lag_5_pp_real_CapConst + cat_lag_1_pct_ALL_White,
                df_cleaned2,cluster=~STATE_ID^year)
reg_base3=summary(reg_base2,DK~year) 
reg_base3
fit_grad2=list(fitstat(reg_base2,~cd+rmse+wh+ivf+ivf2+ivfall+ivwald+sargan))
fit_grad2
# conditional F test
formula_str=paste("ALL2",
                  "~ pp_real_F12 + lag_2_pp_real_F12 + pp_real_CapConst +
                  pp_real_CapLandBuild + pp_real_CapEquip + pp_real_Z33 + pp_real_V17 + pp_real_T06 +
                  loc_share + fed_share + int_pp_real_CapConst + int_pp_real_CapEquip +
                  pp_Schooladministrators + pp_Teachers + pp_StudentSupportServicesStaff +
                  pct_ALL_White| pp_real_F12 + lag_2_pp_real_F12 + 
                  pp_real_CapLandBuild +  pp_real_Z33 + pp_real_V17 + pp_real_T06 +
                  loc_share + fed_share +
                    cat_lag_1_pp_real_CapConst +
                  cat_lag_1_pp_real_CapEquip + int_cat_lag_1_pp_real_CapConst +
                  int_cat_lag_1_pp_real_CapEquip + cat_lag_5_pp_real_CapConst +
                  pp_Schooladministrators + pp_Teachers + pp_StudentSupportServicesStaff +
                  cat_lag_1_pct_ALL_White",collapse="")
formula=as.formula(formula_str)
fsw_check=ivreg::ivreg(formula,data=df_cleaned2)
Conditional_F_Stat_GradRate=(fsw(fsw_check))
Conditional_F_Stat_GradRate

# Use broom to tidy the results
tidy_results <- tidy(reg_base3)
glance_results <- glance(reg_base3)
# Combine the tidy and glance results into a single data frame
result_df <- cbind(tidy_results, glance_results)
# regression results
result_df2=data.frame(cbind(result_df[,1],(round(result_df[,-1],5))))
result_parameter=result_df2[,1:5]
result_parameter$Sig=sig_codes(result_parameter$p.value)
colnames(result_parameter)=reg_col
result_df3=data.frame((round(result_df2[,6:14],4)))
result_df3=result_df3[,-4]
df_long <- pivot_longer(result_df3, cols = colnames(result_df3), names_to = "Variable", values_to = "Value")
df_long2=df_long[1:length(colnames(result_df3))  ,]
df_long3=data.frame(df_long2,Var1="",Var2="",Var3="",Var4="")
colnames(df_long3)=reg_col
Dependent_Variable=data.frame(Variable="Dependent Variable",Estimate="ALL2",Var1="",
                              Var2="",Var3="",Var4="")
colnames(Dependent_Variable)=reg_col
STATE_ID=data.frame(Variable="STATE_ID",Var1=reg_base2$fixef_sizes[1],Var2="",Var3="",Var4="",Var5="")
colnames(STATE_ID)=reg_col
Regression_Results_Grad_Rate_IV=rbind(result_parameter,df_long3,Dependent_Variable,STATE_ID)
Grad_Rate_Regression_Combined=list(Regression_Results_Grad_Rate_IV
)
names(Grad_Rate_Regression_Combined)=c("GradR_IV_Regression_INT_White")
Grad_Rate_Regression_Combined%>%writexl::write_xlsx(paste("NEW_Grad_Rate_Regression_","IV_INT",".xlsx"))
 
#############################################################################
########## robustness checks with four more covariates #####################
#############################################################################
## endogeneity test needs to be done for each endogenous variable separately, then run IV reg with all of them
II_Fit_Statistics=list()
for (i in 1:length(Proficiency_Variables)  ) { #length(Proficiency_Variables) 
  formula_str=paste(Proficiency_Variables[i],
                    "~   lag_2_pp_real_F12  + 
                      + pp_real_CapConst  + pp_real_CapLandBuild  + pp_real_Z33 +pp_real_V17 + pp_real_CapEquip +
                     pp_real_T06 + fed_share +loc_share +
                    pct_ALL_White + pp_Schooladministrators + pp_Teachers + pp_StudentSupportServicesStaff|
   STATE_ID + year|  pp_real_F12   ~   cat_lag_1_pp_real_F12 + cat_lag_5_pp_real_F12  
 ",collapse="") 
  # with STATE_ID + year, some models fail to satisfy Sargan tests; STATE_ID would be fine for checking 
  # endogeneity for each variable
  formula=as.formula(formula_str)
  reg_base=feols(formula,df_cleaned,cluster=~STATE_ID^year)
  #reg_base=feols(formula,df_cleaned,vcov="cluster") 
  # when vcov="twoway" was used, 
  #Variance contained negative values in the diagonal and was 'fixed' (a la Cameron, Gelbach & Miller 2011).
  #Coefficient_Tables[[i]]=summary(reg_base,DK~year) # with 8 years, estimates are not highly reliable for autocorrelation
  II_Fit_Statistics[[i]]=list(fitstat(reg_base,~cd+rmse+wh+ivf+ivf2+ivfall+ivwald+sargan)) #kpr is not running; Error 
  # in vcov + (-1)^(i + 1) * vcovClust(index, bread, scores, adj = ssc$cluster.adj &&  : non-conformable arrays
}
names(II_Fit_Statistics)=Proficiency_Variables
II_Fit_Statistics
## to check correlations
# Define the variables
variables <- c("pp_real_F12" , "lag_2_pp_real_F12" , "pp_real_CapConst" ,
                 "pp_real_CapLandBuild" , "pp_real_Z33" , "pp_real_V17",  "pp_real_T06",
                 "loc_share" , "fed_share","pp_real_CapEquip","pct_ALL_White",
               "pp_Teachers","pp_Schooladministrators","pp_StudentSupportServicesStaff")
data_cor=na.omit(df_cleaned[,variables])
cor_results <- list()
for (i in 1:(ncol(data_cor) - 1)) {
  for (j in (i + 1):ncol(data_cor)) {
    x <- data_cor[, i]
    y <- data_cor[, j]
    cor_results[[paste(colnames(data_cor)[i], colnames(data_cor)[j], sep = "_")]] <- cor.test(x, y)
  }
}
cor_results
# Access results using names like "pp_real_F12_lag_2_pp_real_F12", etc.
corr.test(data_cor)

## IV regression
# *(1,4,5)
II_Regression_Results_IV_HS=list()
II_Fit_Statistics_IV=list()
#Coefficient_Tables=list()
index_HS=c(1,4,5)
for (i in 1:length(index_HS)  ) { #length(Proficiency_Variables) 
  selected_index=index_HS[i]
  formula_str=paste(Proficiency_Variables[selected_index],
                    "~ pp_real_Z33 + pp_real_V17 + pp_real_T06 +  loc_share +fed_share + 
                    pp_real_F12 + lag_2_pp_real_F12 + pp_real_CapConst + pp_real_CapLandBuild + pp_real_CapEquip +
                    pct_ALL_White + pp_Schooladministrators + pp_Teachers + pp_StudentSupportServicesStaff |STATE_ID +year   ",collapse="") 
  formula=as.formula(formula_str)
  reg_base2=feols(formula,df_cleaned,cluster=~STATE_ID^year) # when vcov="twoway" was used, 
  #Variance contained negative values in the diagonal and was 'fixed' (a la Cameron, Gelbach & Miller 2011).
  reg_base=summary(reg_base2,DK~year) 
  II_Fit_Statistics_IV[[i]]=list(fitstat(reg_base2,~cd+rmse+wh+ivf+ivf2+ivfall+ivwald+sargan)) #kpr is not running; Error 
  # in vcov + (-1)^(i + 1) * vcovClust(index, bread, scores, adj = ssc$cluster.adj &&  : non-conformable arrays
  
  # Use broom to tidy the results
  tidy_results <- tidy(reg_base)
  glance_results <- glance(reg_base)
  
  # Combine the tidy and glance results into a single data frame
  result_df <- cbind(tidy_results, glance_results)
  
  # regression results
  result_df2=data.frame(cbind(result_df[,1],(round(result_df[,-1],5))))
  result_parameter=result_df2[,1:5]
  result_parameter$Sig=sig_codes(result_parameter$p.value)
  colnames(result_parameter)=reg_col
  result_parameter
  
  result_df3=data.frame((round(result_df2[,6:14],4)))
  result_df3=result_df3[,-4]
  df_long <- pivot_longer(result_df3, cols = colnames(result_df3), names_to = "Variable", values_to = "Value")
  df_long2=df_long[1:length(colnames(result_df3))  ,]
  df_long3=data.frame(df_long2,Var1="",Var2="",Var3="",Var4="")
  colnames(df_long3)=reg_col
  Dependent_Variable=data.frame(Variable="Dependent Variable",Estimate=Proficiency_Variables[i],Var1="",
                                Var2="",Var3="",Var4="")
  colnames(Dependent_Variable)=reg_col
  STATE_ID=data.frame(Variable="STATE_ID",Var1=reg_base$fixef_sizes[1],Var2="",Var3="",Var4="",Var5="")
  #year=data.frame(Variable="year",Var1=reg_base$fixef_sizes[2],Var2="",Var3="",Var4="",Var5="")
  # STATE_ID_year=rbind(STATE_ID,year)
  colnames(STATE_ID)=reg_col
  II_Regression_Results_IV_HS[[i]]=rbind(result_parameter,df_long3,Dependent_Variable,STATE_ID)
}
names(II_Regression_Results_IV_HS) <- Proficiency_Variables[index_HS]
II_Regression_Results_IV_HS
#names(Coefficient_Tables) <- Proficiency_Variables
#Coefficient_Tables
names(II_Fit_Statistics_IV) = Proficiency_Variables[index_HS]
II_Fit_Statistics_IV
 
# *(2,3,9,10)
II_Regression_Results_IV_2_6_12=list()
II_Fit_Statistics_IV_2_6_12=list()
#Coefficient_Tables=list()
index_2_6_12=c(2,3,9,10)
for (i in 1:length(index_2_6_12)  ) { #length(Proficiency_Variables) 
  selected_index=index_2_6_12[i]
  formula_str=paste(Proficiency_Variables[selected_index],
                    "~ lag_2_pp_real_F12 + pp_real_Z33 + pp_real_V17 + pp_real_T06 +  loc_share +fed_share +
                    pct_ALL_White + pp_Schooladministrators + pp_Teachers + pp_StudentSupportServicesStaff  |
                    STATE_ID +year |  pp_real_F12 + pp_real_CapConst + pp_real_CapLandBuild + pp_real_CapEquip   ~ cat_lag_1_pp_real_F12 + 
                    pp_instrumental_real_CapConst + cat_lag_1_pp_real_CapLandBuild +
                    cat_lag_1_pp_real_CapEquip + cat_lag_5_pp_real_CapConst",collapse="") 
  formula=as.formula(formula_str)
  reg_base2=feols(formula,df_cleaned,cluster=~STATE_ID^year) # when vcov="twoway" was used, 
  #Variance contained negative values in the diagonal and was 'fixed' (a la Cameron, Gelbach & Miller 2011).
  reg_base=summary(reg_base2,DK~year) 
  II_Fit_Statistics_IV_2_6_12[[i]]=list(fitstat(reg_base2,~cd+rmse+wh+ivf+ivf2+ivfall+ivwald+sargan)) #kpr is not running; Error 
  # in vcov + (-1)^(i + 1) * vcovClust(index, bread, scores, adj = ssc$cluster.adj &&  : non-conformable arrays
  
  # Use broom to tidy the results
  tidy_results <- tidy(reg_base)
  glance_results <- glance(reg_base)
  
  # Combine the tidy and glance results into a single data frame
  result_df <- cbind(tidy_results, glance_results)
  
  # regression results
  result_df2=data.frame(cbind(result_df[,1],(round(result_df[,-1],5))))
  result_parameter=result_df2[,1:5]
  result_parameter$Sig=sig_codes(result_parameter$p.value)
  colnames(result_parameter)=reg_col
  result_parameter
  
  result_df3=data.frame((round(result_df2[,6:14],4)))
  result_df3=result_df3[,-4]
  df_long <- pivot_longer(result_df3, cols = colnames(result_df3), names_to = "Variable", values_to = "Value")
  df_long2=df_long[1:length(colnames(result_df3))  ,]
  df_long3=data.frame(df_long2,Var1="",Var2="",Var3="",Var4="")
  colnames(df_long3)=reg_col
  Dependent_Variable=data.frame(Variable="Dependent Variable",Estimate=Proficiency_Variables[i],Var1="",
                                Var2="",Var3="",Var4="")
  colnames(Dependent_Variable)=reg_col
  STATE_ID=data.frame(Variable="STATE_ID",Var1=reg_base$fixef_sizes[1],Var2="",Var3="",Var4="",Var5="")
  #year=data.frame(Variable="year",Var1=reg_base$fixef_sizes[2],Var2="",Var3="",Var4="",Var5="")
  # STATE_ID_year=rbind(STATE_ID,year)
  colnames(STATE_ID)=reg_col
  II_Regression_Results_IV_2_6_12[[i]]=rbind(result_parameter,df_long3,Dependent_Variable,STATE_ID)
}
names(II_Regression_Results_IV_2_6_12) <- Proficiency_Variables[index_2_6_12]
II_Regression_Results_IV_2_6_12
#names(Coefficient_Tables) <- Proficiency_Variables
#Coefficient_Tables
names(II_Fit_Statistics_IV_2_6_12) = Proficiency_Variables[index_2_6_12]
II_Fit_Statistics_IV_2_6_12
 
# *(6_to_14)
II_Regression_Results_IV_3_to_14=list()
II_Fit_Statistics_IV_3_to_14=list()
#Coefficient_Tables=list()
index_3_to_14=c(6,7,8,11,12,14)
for (i in 1:length(index_3_to_14)  ) { #length(Proficiency_Variables) 
  selected_index=index_3_to_14[i]
  formula_str=paste(Proficiency_Variables[selected_index],
                    "~  pp_real_Z33 + pp_real_V17 + pp_real_T06 +  loc_share +fed_share +
                    pp_real_F12 + lag_2_pp_real_F12 + pp_real_CapConst + pp_real_CapLandBuild  +
                    pct_ALL_White + pp_Schooladministrators + pp_Teachers + pp_StudentSupportServicesStaff|
                    STATE_ID +year |  pp_real_CapEquip    ~ cat_lag_1_pp_real_CapEquip + cat_lag_5_pp_real_CapEquip
                    ",collapse="") 
  formula=as.formula(formula_str)
  reg_base2=feols(formula,df_cleaned,cluster=~STATE_ID^year) # when vcov="twoway" was used, 
  #Variance contained negative values in the diagonal and was 'fixed' (a la Cameron, Gelbach & Miller 2011).
  reg_base=summary(reg_base2,DK~year) 
  II_Fit_Statistics_IV_3_to_14[[i]]=list(fitstat(reg_base2,~cd+rmse+wh+ivf+ivf2+ivfall+ivwald+sargan)) #kpr is not running; Error 
  # in vcov + (-1)^(i + 1) * vcovClust(index, bread, scores, adj = ssc$cluster.adj &&  : non-conformable arrays
  
  # Use broom to tidy the results
  tidy_results <- tidy(reg_base)
  glance_results <- glance(reg_base)
  
  # Combine the tidy and glance results into a single data frame
  result_df <- cbind(tidy_results, glance_results)
  
  # regression results
  result_df2=data.frame(cbind(result_df[,1],(round(result_df[,-1],5))))
  result_parameter=result_df2[,1:5]
  result_parameter$Sig=sig_codes(result_parameter$p.value)
  colnames(result_parameter)=reg_col
  result_parameter
  
  result_df3=data.frame((round(result_df2[,6:14],4)))
  result_df3=result_df3[,-4]
  df_long <- pivot_longer(result_df3, cols = colnames(result_df3), names_to = "Variable", values_to = "Value")
  df_long2=df_long[1:length(colnames(result_df3))  ,]
  df_long3=data.frame(df_long2,Var1="",Var2="",Var3="",Var4="")
  colnames(df_long3)=reg_col
  Dependent_Variable=data.frame(Variable="Dependent Variable",Estimate=Proficiency_Variables[i],Var1="",
                                Var2="",Var3="",Var4="")
  colnames(Dependent_Variable)=reg_col
  STATE_ID=data.frame(Variable="STATE_ID",Var1=reg_base$fixef_sizes[1],Var2="",Var3="",Var4="",Var5="")
  #year=data.frame(Variable="year",Var1=reg_base$fixef_sizes[2],Var2="",Var3="",Var4="",Var5="")
  # STATE_ID_year=rbind(STATE_ID,year)
  colnames(STATE_ID)=reg_col
  II_Regression_Results_IV_3_to_14[[i]]=rbind(result_parameter,df_long3,Dependent_Variable,STATE_ID)
}
names(II_Regression_Results_IV_3_to_14) <- Proficiency_Variables[index_3_to_14]
II_Regression_Results_IV_3_to_14
#names(Coefficient_Tables) <- Proficiency_Variables
#Coefficient_Tables
names(II_Fit_Statistics_IV_3_to_14) = Proficiency_Variables[index_3_to_14]
II_Fit_Statistics_IV_3_to_14
 
II_Regression_Results_IV_math06=list()
II_Fit_Statistics_IV_math06=list()
#Coefficient_Tables=list()
index_math06=c(13)
for (i in 1:length(index_math06)  ) { #length(Proficiency_Variables) 
  selected_index=index_math06[i]
  formula_str=paste(Proficiency_Variables[selected_index],
                    "~ pp_real_Z33 + pp_real_V17 + pp_real_T06 +  loc_share +fed_share +
                     pp_real_CapLandBuild  + pct_ALL_White + pp_Schooladministrators + pp_Teachers + pp_StudentSupportServicesStaff|
                    STATE_ID +year | pp_real_F12 + lag_2_pp_real_F12 + pp_real_CapConst + pp_real_CapEquip ~ cat_lag_2_pp_real_F12 + cat_lag_3_pp_real_F12 +
                    cat_lag_2_pp_real_CapConst + cat_lag_1_pp_real_CapEquip + cat_lag_1_pp_real_TOTALEXP  ",collapse="") 
  formula=as.formula(formula_str)
  reg_base2=feols(formula,df_cleaned,cluster=~STATE_ID^year) # when vcov="twoway" was used, 
  #Variance contained negative values in the diagonal and was 'fixed' (a la Cameron, Gelbach & Miller 2011).
  reg_base=summary(reg_base2,DK~year) 
  II_Fit_Statistics_IV_math06[[i]]=list(fitstat(reg_base2,~cd+rmse+wh+ivf+ivf2+ivfall+ivwald+sargan)) #kpr is not running; Error 
  # in vcov + (-1)^(i + 1) * vcovClust(index, bread, scores, adj = ssc$cluster.adj &&  : non-conformable arrays
  
  # Use broom to tidy the results
  tidy_results <- tidy(reg_base)
  glance_results <- glance(reg_base)
  
  # Combine the tidy and glance results into a single data frame
  result_df <- cbind(tidy_results, glance_results)
  
  # regression results
  result_df2=data.frame(cbind(result_df[,1],(round(result_df[,-1],5))))
  result_parameter=result_df2[,1:5]
  result_parameter$Sig=sig_codes(result_parameter$p.value)
  colnames(result_parameter)=reg_col
  result_parameter
  
  result_df3=data.frame((round(result_df2[,6:14],4)))
  result_df3=result_df3[,-4]
  df_long <- pivot_longer(result_df3, cols = colnames(result_df3), names_to = "Variable", values_to = "Value")
  df_long2=df_long[1:length(colnames(result_df3))  ,]
  df_long3=data.frame(df_long2,Var1="",Var2="",Var3="",Var4="")
  colnames(df_long3)=reg_col
  Dependent_Variable=data.frame(Variable="Dependent Variable",Estimate=Proficiency_Variables[i],Var1="",
                                Var2="",Var3="",Var4="")
  colnames(Dependent_Variable)=reg_col
  STATE_ID=data.frame(Variable="STATE_ID",Var1=reg_base$fixef_sizes[1],Var2="",Var3="",Var4="",Var5="")
  #year=data.frame(Variable="year",Var1=reg_base$fixef_sizes[2],Var2="",Var3="",Var4="",Var5="")
  # STATE_ID_year=rbind(STATE_ID,year)
  colnames(STATE_ID)=reg_col
  II_Regression_Results_IV_math06[[i]]=rbind(result_parameter,df_long3,Dependent_Variable,STATE_ID)
}
names(II_Regression_Results_IV_math06) <- Proficiency_Variables[index_math06]
II_Regression_Results_IV_math06
#names(Coefficient_Tables) <- Proficiency_Variables
#Coefficient_Tables
names(II_Fit_Statistics_IV_math06) = Proficiency_Variables[index_math06]
II_Fit_Statistics_IV_math06

setwd("/Users/jayryu/Library/CloudStorage/OneDrive-OhioUniversity/WonJay/Result/EdFacts/Fisc_Grad_Prof_Combined/SEL_State_RegressionResults")
II_Regression_Results_IV_HS%>%writexl::write_xlsx(paste("II_Regression_","Results_IV_1_4_5",".xlsx"))
II_Regression_Results_IV_2_6_12%>%writexl::write_xlsx(paste("II_Regression_","Results_IV_2_3_9_10",".xlsx"))
II_Regression_Results_IV_3_to_14%>%writexl::write_xlsx(paste("II_Regression_","Results_IV_6_to_14",".xlsx"))
II_Regression_Results_IV_math06%>%writexl::write_xlsx(paste("II_Regression_","Results_IV_RLA07",".xlsx"))
######## high school graduation rate
# endogeneity test for each endogenous variable 
reg_base <- feols(ALL2 ~  pp_real_F12 + lag_2_pp_real_F12 + pp_real_CapConst +
                    pp_real_CapLandBuild + pp_real_Z33 + pp_real_V17 + pp_real_T06 +
                    loc_share + fed_share +
                    pct_ALL_White + pp_Schooladministrators + pp_Teachers + pp_StudentSupportServicesStaff|
                    STATE_ID + year |  pp_real_CapEquip  ~   cat_lag_1_pp_real_CapEquip + 
                    cat_lag_5_pp_real_CapEquip    ,
                  data = df_cleaned, cluster=~STATE_ID^year)
fit_grad=list(fitstat(reg_base,~cd+rmse+wh+ivf+ivf2+ivfall+ivwald +sargan))
fit_grad
#IV regression
reg_base2=feols(ALL2 ~  pp_real_F12 + lag_2_pp_real_F12 + 
                  pp_real_CapLandBuild + pp_real_Z33 + pp_real_V17 + pp_real_T06 +
                  loc_share + fed_share +
                  pct_ALL_White + pp_Schooladministrators + pp_Teachers + pp_StudentSupportServicesStaff|
                  STATE_ID + year | pp_real_CapConst + pp_real_CapEquip  ~ cat_lag_1_pp_real_CapConst +
                  cat_lag_1_pp_real_CapEquip + cat_lag_5_pp_real_CapConst ,
                df_cleaned,cluster=~STATE_ID^year)
reg_base3=summary(reg_base2,DK~year) 
reg_base3
fit_grad2=list(fitstat(reg_base2,~cd+rmse+wh+ivf+ivf2+ivfall+ivwald+sargan))
fit_grad2
# conditional F test
formula_str=paste("ALL2",
                  "~ pp_real_F12 + lag_2_pp_real_F12 + pp_real_CapConst +
                  pp_real_CapLandBuild + pp_real_CapEquip + pp_real_Z33 + pp_real_V17 + pp_real_T06 +
                  loc_share + fed_share | pp_real_F12 + lag_2_pp_real_F12 + 
                  pp_real_CapLandBuild +  pp_real_Z33 + pp_real_V17 + pp_real_T06 +
                  loc_share + fed_share +
                    pp_instrumental_real_CapConst + 
                    cat_lag_1_pp_real_CapEquip + lag_5_pp_real_CapConst",collapse="")
formula=as.formula(formula_str)
fsw_check=ivreg::ivreg(formula,data=df_cleaned)
II_Conditional_F_Stat_GradRate=(fsw(fsw_check))
II_Conditional_F_Stat_GradRate

# Use broom to tidy the results
tidy_results <- tidy(reg_base2)
glance_results <- glance(reg_base2)
# Combine the tidy and glance results into a single data frame
result_df <- cbind(tidy_results, glance_results)
# regression results
result_df2=data.frame(cbind(result_df[,1],(round(result_df[,-1],5))))
result_parameter=result_df2[,1:5]
result_parameter$Sig=sig_codes(result_parameter$p.value)
colnames(result_parameter)=reg_col
result_df3=data.frame((round(result_df2[,6:14],4)))
result_df3=result_df3[,-4]
df_long <- pivot_longer(result_df3, cols = colnames(result_df3), names_to = "Variable", values_to = "Value")
df_long2=df_long[1:length(colnames(result_df3))  ,]
df_long3=data.frame(df_long2,Var1="",Var2="",Var3="",Var4="")
colnames(df_long3)=reg_col
Dependent_Variable=data.frame(Variable="Dependent Variable","ALL2",Var1="",
                              Var2="",Var3="",Var4="")
colnames(Dependent_Variable)=reg_col
STATE_ID=data.frame(Variable="STATE_ID",Var1=reg_base2$fixef_sizes[1],Var2="",Var3="",Var4="",Var5="")
colnames(STATE_ID)=reg_col
II_Regression_Results_Grad_Rate_IV=rbind(result_parameter,df_long3,Dependent_Variable,STATE_ID)
II_Grad_Rate_Regression_Combined=list(II_Regression_Results_Grad_Rate_IV
)
names(II_Grad_Rate_Regression_Combined)=c("II_GradR_IV_Regression")
II_Grad_Rate_Regression_Combined%>%writexl::write_xlsx(paste("II_NEW_Grad_Rate_Regression_","IV",".xlsx"))

bottom of page