/*

This program is a version of cjive which uses Leverage and Mata for a faster runtime

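syntax: cjive y x (d = z), cluster(clustervar) [gen(newvarnames)]

where y is the dependent variable,
x is a varlist of covariates, placed before and/or after the parentheses,
d is the treatment (endogenous) variable or variables,
z are the instruments,
cluster() names the variable that identifies the cluster each observation belongs to,
and gen() optionally names new variables (one per treatment variable) that receive
the constructed instruments used in the final 2SLS step.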

Created by Samuel McIntyre 
January 2024
spm42@byu.edu

*/

capture program drop cjive
*! cjive: cluster jackknife instrumental variable estimation (leverage-based)
*
* Syntax:
*   cjive depvar [exog] (endog = instruments) [exog] [if] [in], cluster(varname) [gen(newvarnames)]
*
* Steps:
*   1. Partial the exogenous covariates out of depvar, each endogenous variable
*      and each instrument (OLS residuals).
*   2. For each cluster c, with Z_c / X_c the residualised instruments / endogenous
*      variables of that cluster and H_c = Z_c (Z'Z)^-1 Z_c', build
*          L_c = (I - H_c)^-1 (Z_c * Pi - H_c * X_c)
*      where Pi holds the full-sample first-stage coefficients.
*   3. Run 2SLS of the residualised depvar on the residualised endogenous
*      variables using the stacked L_c as instruments, with cluster-robust VCE.
*
* Options:
*   cluster(varname)  variable identifying clusters (required)
*   gen(newvarnames)  one new variable per endogenous variable; receives the
*                     constructed instruments (missing outside the estimation sample)
*
* Stored results: e(b), e(V), e(N), e(sample), e(cmd), e(title), e(depvar).
*
* sortpreserve: the data must be sorted by cluster internally; the caller's sort
* order is restored on exit.
program cjive, eclass sortpreserve

	syntax anything(equalok) [if] [in], cluster(varname) [gen(string)]

	*Every name in gen() must be a valid name that does not exist yet
	*(the loop is skipped entirely when gen() was not specified)
	foreach g of local gen {
		capture confirm variable `g', exact
		if _rc == 0 {
			disp in red "Variable `g' is already defined. Choose a new argument for gen()"
			exit 110
		}
		confirm new variable `g'
	}

	*Split "depvar exog (endog = instruments) exog" at the parentheses and the equal sign
	local p_open  = strpos(`"`anything'"', "(")
	local p_eq    = strpos(`"`anything'"', "=")
	local p_close = strpos(`"`anything'"', ")")
	if `p_open' == 0 | `p_eq' < `p_open' | `p_close' < `p_eq' {
		di as err "syntax is: cjive depvar [exog] (endog = instruments) [exog], cluster(varname)"
		exit 198
	}

	local lhs         = substr(`"`anything'"', 1, `p_open' - 1)
	local endogenous  = substr(`"`anything'"', `p_open' + 1, `p_eq' - `p_open' - 1)
	local instruments = substr(`"`anything'"', `p_eq' + 1, `p_close' - `p_eq' - 1)
	local rhs         = substr(`"`anything'"', `p_close' + 1, .)

	*First token before the parenthesis is the dependent variable; gettoken also
	*copes with "y(d = z)" (no blank) and does not leave a trailing blank in the name
	gettoken dependent lhs : lhs
	if "`dependent'" == "" {
		di as err "dependent variable required"
		exit 198
	}
	local exogenous `lhs' `rhs'

	*Expand abbreviations / wildcards so counts and coefficient names use full names
	unab endogenous  : `endogenous'
	unab instruments : `instruments'

	*Check that an appropriate number of gen names specified
	local num_gen : word count `gen'
	local d_count : word count `endogenous'
	if `num_gen' != `d_count' & "`gen'" != "" {
		di as err "Error: The number of gen variables must match the number of leverage variables."
		exit 198
	}

	*Estimation sample: if/in, minus observations with missing values
	tempvar touse
	mark `touse' `if' `in'
	markout `touse' `endogenous' `instruments'
	markout `touse' `cluster', strok

	*Partialling Out
	*Dependent. e(sample) of this regression additionally removes observations with
	*missing depvar / covariates (this also works when covariates use factor notation)
	tempvar yres
	qui reg `dependent' `exogenous' if `touse'
	qui replace `touse' = e(sample)
	qui predict double `yres' if `touse', resid

	*Endogenous (each -tempvar- call returns a fresh name)
	local d_reslist
	foreach var of local endogenous {
		tempvar dres
		qui reg `var' `exogenous' if `touse'
		qui predict double `dres' if `touse', resid
		local d_reslist `d_reslist' `dres'
	}

	*Instruments
	local z_reslist
	foreach var of local instruments {
		tempvar zres
		qui reg `var' `exogenous' if `touse'
		qui predict double `zres' if `touse', resid
		local z_reslist `z_reslist' `zres'
	}

	*Cluster sizes within the estimation sample, in sorted cluster order
	tempname Csize
	tempvar ones

	preserve
	qui keep if `touse'
	qui keep `cluster'
	qui gen long `ones' = 1
	qui collapse (sum) `ones', by(`cluster')
	mata: `Csize' = st_data(., "`ones'")
	restore

	mata: st_local("numclusters", strofreal(rows(`Csize'), "%18.0g"))

	*Temporary Mata names (reused across clusters, dropped below)
	tempname Z X ZTZI Pi Zc Xc Hc A L

	*Create data matrices; rows are the estimation sample in cluster order,
	*which matches the order of Csize
	sort `cluster'
	mata: `Z' = st_data(., "`z_reslist'", "`touse'")
	mata: `X' = st_data(., "`d_reslist'", "`touse'")
	mata: `ZTZI' = invsym(`Z'' * `Z')

	*Get the coefficients in Pi, one first-stage regression (one column) at a time
	mata: `Pi' = J(cols(`Z'), 0, .)
	foreach dres of local d_reslist {
		qui reg `dres' `z_reslist' if `touse', nocons
		mata: `Pi' = `Pi', st_matrix("e(b)")'
	}

	*Leverage Trick, cluster by cluster
	mata: `L' = J(rows(`Z'), cols(`X'), .)
	local first = 1
	forvalues g = 1/`numclusters' {
		mata: st_local("len", strofreal(`Csize'[`g'], "%18.0g"))
		local last = `first' + `len' - 1

		mata: `Zc' = `Z'[|`first', 1 \ `last', .|]
		mata: `Xc' = `X'[|`first', 1 \ `last', .|]
		mata: `Hc' = `Zc' * `ZTZI' * `Zc''
		mata: `A'  = `Zc' * `Pi' - `Hc' * `Xc'
		mata: `L'[|`first', 1 \ `last', .|] = invsym(I(`len') - `Hc') * `A'

		local first = `last' + 1
	}

	*Convert matrix L to data for use in IV regression
	local leverage
	forvalues i = 1/`d_count' {
		tempvar lev
		qui gen double `lev' = .
		mata: st_store(., "`lev'", "`touse'", `L'[., `i'])
		local leverage `leverage' `lev'
	}

	*Mata globals are not released automatically when the program ends
	capture mata: mata drop `Csize' `Z' `X' `ZTZI' `Pi' `Zc' `Xc' `Hc' `A' `L'

	*2SLS for the answer
	qui ivregress 2sls `yres' (`d_reslist' = `leverage') if `touse', noconstant vce(cluster `cluster')

	*Returned Values
	tempname b V
	mat `b' = e(b)
	mat `V' = e(V)
	local N = e(N)

	*Label results with the original variable names instead of the tempvar names
	matrix rownames `b' = `dependent'
	matrix colnames `b' = `endogenous'
	matrix colnames `V' = `endogenous'
	matrix rownames `V' = `endogenous'

	*Generate variables if gen
	if "`gen'" != "" {
		forvalues i = 1/`num_gen' {
			local gen_var : word `i' of `gen'
			local leverage_var : word `i' of `leverage'
			gen double `gen_var' = `leverage_var'
		}
	}

	** ERETURN POST (esample() turns `touse' into e(sample))
	ereturn post `b' `V', depname("`dependent'") obs(`N') esample(`touse')

	ereturn scalar N = `N'
	ereturn local cmd "cjive"
	ereturn local title "CJIVE"
	ereturn local depvar "`dependent'"

	display "Cluster Jackknife Instrumental Variable Estimation"
	ereturn display

end