// Ariel Gu (ariel.gu@northumbria.ac.uk) and Hong Il Yoo (h.i.yoo@durham.ac.uk): 12 June 2022.
// M-Way Clustered Standard Errors
// v1.0.4 (12 June 2022) 
//  -where relevant, the code correctly prints a warning message that an eigenvalue adjustment has been applied to produce a p.s.d. covariance matrix.
//  -now support post-estimation command -suest-. See the option vmsuest(.) below.
// v1.0.3 (21 May 2020) 
//  - the code is now compatible with commands that only allow for vce(cluster clustvar) instead of both vce(cluster clustvar) and cluster(clustvar)
// v1.0.2 (16 Feb 2020)
//	- in Stata 16 or above, some commands (e.g. probit & tobit)do not allow e(N_clust) to be local. when c(stata_version) > 15, we now set the ereturn object to missing to avoid an error message.
// v1.0.1 (22 Jun 2019)
//	- like cluster(varname), do not report standard errors when one of clustering variables is a constant 
// v1.0.0 (11 Jan 2019)
//	- allows for [if] [in] and [weight]
program vcemway
	version 13.1
	if replay() {
		if ("`e(vcemway)'" != "yes") error 301
		Replay `0'
	}
	else Estimate `0'
end

program define Estimate, eclass 
	version 13.1
	syntax anything(id="command line" name=command_line) [if] [in] [fweight  aweight  pweight  iweight], CLuster(varlist min=2) ///
																								    [VMCFACTOR(string) VMDFR(integer 0) VMSUEST(string) *]  
	// Check for syntax errors
	if (`vmdfr' < 0) {
		di as red "Residual degrees of freedom for t tests and F tests, # in option -vmdfr(#)-, must be a positive integer."
		exit 197
	}
	if ("`vmcfactor'" == "") local vmcfactor default
	if ("`vmcfactor'" != "default" & "`vmcfactor'" != "minimum" & "`vmcfactor'" != "none") {
		di as red "The small-cluster correction type, str in option -vmcfactor(str)-, must be default or minimum or none."
		exit 197
	}
	if ("`vmsuest'" == "") local vmsuest no
	if ("`vmsuest'" != "no" & "`vmsuest'" != "yes") {
		di as red "The input str in option -vmsuest(str)- must be yes or no."
		exit 197
	}	
																									
	// Mark sample
	marksample touse
	markout `touse' `cluster', strok
	
	// Remove vcemway options from the rest 
	local remove vmcfactor(`vmcfactor') vmdfr(`vmdfr') vmsuest(`vmsuest')
	local options : list options - remove
	
	// Set up weight options
	if ("`weight'" != "") local weight [`weight' `exp']		
	
	// Identify the total number of clustering dimensions (k_way)
	local k_way = wordcount("`cluster'")

	// Install user-written command -tuples- if not already available
	capture which tuples
	if (_rc != 0) ssc install tuples	
	
	// Identify tuples of cluster variables. Stored as locals tuple1, tuple2, and so on, where first k_way tuples are singletons.   
	tuples `cluster', varlist
	
	// Define temporary objects (note: local ntuples counts the total number of tuples of cluster variables, and has been generated by command -tuples-).
	forvalues i = 1/`ntuples' {
		tempname V`i'
	}	
	forvalues i = `=`k_way'+1'/`ntuples' {
		tempvar cluster`i'
	}
	tempname b V_oneway V_mway V_mway_raw rank
	tempvar uhat	

	// Define local macros for cluster variables
	forvalues i = 1/`k_way' {
		local cluster`i': word `i' of `cluster'
	}
	
	// Generate temporary variables to capture intersections of clusters
	forvalues i = `=`k_way'+1'/`ntuples' {
		qui egen double `cluster`i'' = group(`tuple`i'') if `touse' `in'
	}		
	
	quietly {
		//---------------------------------------
		// Step 1: one-way cluster on `cluster1' 
		//---------------------------------------
		if ("`vmsuest'" == "yes") {
			capture	`command_line', cluster(`cluster1') `options'
			if (_rc != 0) `command_line', vce(cluster `cluster1') `options'	
			if ("`e(cmd)'" != "suest") {
			 	di as red "The option -vmsuest(yes)- must be dropped unless the command -suest- is being used."
				exit 197	
			}
		}
		else {
			capture	`command_line' `weight' if `touse' `in', cluster(`cluster1') `options'
			if (_rc != 0) `command_line' `weight' if `touse' `in', vce(cluster `cluster1') `options'
		}
		matrix `b'        = e(b)
		matrix `V_oneway' = e(V)
		local N_clust1    = e(N_clust)
		local N_clustALL `N_clust1'
		
		//===================================================================
		// Step 1.A: check if the underlying command allows -predict, score-
		//===================================================================
		if (e(k_eq) == . | e(k_eq) == 1) {
			capture predict double `uhat' if e(sample), score	
			local step4 = _rc
			local scorevar `uhat'	
		}
		else {
			capture predict double `uhat'* if e(sample), scores
			local step4 = _rc
			local scorevar `uhat'*
		}
		
		//=============================================================================
		// Step 1.B: check if -_robust- can replicate one-way clustering on `cluster1' 
		//============================================================================= 
		if (`step4' == 0) {
			// use _robust to replicate e(V) to identify which minus() option to apply:
			//	some Stata commands (e.g. probit) use minus(1), others use minus(k) (e.g. regress) where k is # of model parameters.
			local minus = colsof(`b') 
			matrix `V1' = e(V_modelbased)
			_robust `scorevar' `weight' if e(sample), cluster(`cluster1') variance(`V1') minus(`minus') 
			capture assert `=mreldif(`V_oneway', `V1')' <= 1e-8	
			
			if (_rc != 0) {
				// if minus = k does not replicate e(V), try to replicate it using minus = 1
				local minus = 1 
				matrix `V1' = e(V_modelbased)
				_robust `scorevar' `weight' if e(sample), cluster(`cluster1') variance(`V1') minus(`minus') 
				capture assert `=mreldif(`V_oneway', `V1')' <= 1e-8
				
				// if minus = 1 doesn't work either, proceed to step 4
				if (_rc != 0) {
					local step4 = 1
				}
			}
		}
		
		// If both 1.A and 1.B go through successfully, complete Step 2 and Step 3.
		if (`step4' == 0) {
			//-------------------------------------------------------------------------------------
			// Step 2: Use -_robust- to achieve one-way clustering on other variables in cluster()
			//-------------------------------------------------------------------------------------
			forvalues i = 2/`k_way' {
				matrix `V`i'' = e(V_modelbased)
				_robust `scorevar' `weight' if e(sample), cluster(`cluster`i'') variance(`V`i'') minus(`minus') 
				local N_clust`i' = r(N_clust)
				local N_clustALL `N_clustALL', `N_clust`i''
			}
			
			//--------------------------------------------------------------------------------------------
			// Step 3: Use -_robust- to achieve one-way clustering on tuples of several cluster variables
			//--------------------------------------------------------------------------------------------
			forvalues i = `=`k_way'+1'/`ntuples' {
				matrix `V`i'' = e(V_modelbased)
				_robust `scorevar' `weight' if e(sample), cluster(`cluster`i'') variance(`V`i'') minus(`minus') 
				local N_clust`i' = r(N_clust)
			}
		}
		
		// If either 1.A or 1.B fails, complete Step 4 instead.    
		// Now, obtain `ntuples' one-way clustered matrices from `ntuples' separate estimation runs.
		if (`step4' != 0) {
			//-------------------------------------------------------------------------
			// Step 4. Obtain one-way clustered covariance matricies using a long way
			//-------------------------------------------------------------------------
			tempname output_cluster1
			matrix `V1' = `V_oneway'
			est store `output_cluster1'
			forvalues i = 2/`ntuples' {
				if ("`vmsuest'" == "yes") {
					capture	`command_line', cluster(`cluster`i'') `options'
					if (_rc != 0) `command_line', vce(cluster `cluster`i'') `options'				
				}
				else {
					capture `command_line' `weight' if `touse' `in', cluster(`cluster`i'') `options'
					if (_rc != 0) `command_line' `weight' if `touse' `in', vce(cluster `cluster`i'') `options'
				}
				matrix `V`i'' = e(V)
				local N_clust`i' = e(N_clust)
				if (`i' <= `k_way') local N_clustALL `N_clustALL', `N_clust`i''			
			}
			est restore `output_cluster1'
		}
		
		//-----------------------------------------------------------
		// Step 5: compute the multi-way clustered covariance matrix 
		//-----------------------------------------------------------
		if ("`vmcfactor'" == "minimum") {
			// use G/(G-1) as the common correction factor where G = minimum of `k_way' cluster sizes		
			forvalues i = 1/`ntuples' {
				matrix `V`i'' = min(`N_clustALL') / (min(`N_clustALL') - 1) * (`N_clust`i'' - 1) / (`N_clust`i'') * `V`i''
			}
		}
		if ("`vmcfactor'" == "none") {
			// cancel out Stata's default small-cluster correction factor		
			forvalues i = 1/`ntuples' {
				matrix `V`i'' = (`N_clust`i'' - 1) / (`N_clust`i'') * `V`i''
			}
		}			
		
		// continue using G/(G-1) * (N - 1)/(N - K) as the multiplication factor where G varies across `ntuples' matrices 
		matrix `V_mway' = `V1'
		forvalues i = 2/`ntuples' {
			// in case the tuple comprises an even number of clustering dimensions, subtract the covariance matrix:
			// see formula (2.13) in Cameron, Gelbach and Miller [2012]
			if (`=mod(wordcount("`tuple`i''"),2)' == 0) {
				matrix `V_mway' = `V_mway' - `V`i''
			}
			// otherwise, add the matrix
			else matrix `V_mway' = `V_mway' + `V`i''
		}
		
		//------------------------------------------------------------------------------------------
		// Step 6. check V_mway is p.s.d.; if not, replace eigenvalues with zeroes and resconstruct
		//------------------------------------------------------------------------------------------
		mata {
			V_mway = st_matrix(st_local("V_mway"))
			st_numscalar(st_local("rank"), rank(V_mway))
			symeigensystem(V_mway, EVEC = ., eval = .)
			if (min(eval) < 0) {
				eval = eval :* (eval :> 0) 
				st_matrix(st_local("V_mway_raw"), V_mway)
				st_matrix(st_local("V_mway"), EVEC*diag(eval)*EVEC')
				st_local("replace","yes")	
			}
		}

		//------------------------------------------------------------------------------------------
		// Step 7. like cluster(varname), set V_mway to 0s if any clustering variable is a constant
		//------------------------------------------------------------------------------------------		
		if (min(`N_clustALL') == 1) {
			mata: V_mway = st_matrix(st_local("V_mway"))
			mata: st_matrix(st_local("V_mway"), J(rows(V_mway), cols(V_mway), 0))											
			local rank = 0
		}
		
		//--------------------------------------------------------------------------------------------
		// Final Step: add extra items to ereturn list and post multi-way clustered covariance matrix
		//--------------------------------------------------------------------------------------------
		ereturn local vcemway "yes"
		ereturn local clustvar "`cluster'"
		forvalues i = 1/`k_way' {
			ereturn local clustvar`i' "`cluster`i''"
		}
	
		// see vcemway release note v1.0.2 for why next two command lines differ for version 16 or above vs others
		if (`=int(c(stata_version))' > 15) ereturn local N_clust = .
		else ereturn local N_clust "N_clust'i' reports the number of clusters in clustvar'i'"
		
		ereturn local vmcfactor "`vmcfactor'"

		forvalues i = 1/`k_way' {
			ereturn scalar N_clust`i' = `N_clust`i''
		}
		ereturn scalar rank	= `rank'
		if (e(df_r) != .) {
			if (`vmdfr' > 0) ereturn scalar df_r = `vmdfr'
			else ereturn scalar df_r = min(`N_clustALL') - 1 
		}
		if ("`replace'" == "yes") ereturn matrix V_raw = `V_mway_raw'
		ereturn repost V=`V_mway'
	}
	
	// Display results
	Replay
	if (e(df_r) == . & `vmdfr' > 0) {
		di as text ""
		di as text "	# residual degrees of freedom in vmdfr(#) is irrelevant: `e(cmd)' does not use t tests and F tests."
	}	
end

program define Replay	
	version 13.1
	
	// Display estimation results with two-way clustered standard errors
	if ("`e(cmd)'" != "") `e(cmd)'
	else estimates replay
	
	// Display any notes (NOTE: Replay and Estimate are separate programs. Locals N_clustALL and k_way must be regenerated).
	local k_way = wordcount("`e(clustvar)'")
	local N_clustALL e(N_clust1)

	di as text "Notes:"
	di as text "	Std. Err. adjusted for `k_way'-way clustering on `e(clustvar)'"	
	
	forvalues i = 1/`k_way' {
		di as text "	  Number of clusters in " as text %-12s abbrev("`e(clustvar`i')'",12) " = " as result `e(N_clust`i')'
		if (`i' > 1) local N_clustALL `N_clustALL', e(N_clust`i')
	}
	if ("`e(vmcfactor)'" == "default") {
		di as text ""
		di as text "    Stata's default small-cluster correction factors have been applied. See {helpb vcemway} for detail."
	}
	if ("`e(vmcfactor)'" == "minimum") {
		di as text ""
		di as text "    The small-cluster correction factor is G/(G-1) where G = " as result `=min(`N_clustALL')' as text ", the minimum of `k_way' cluster sizes."
	}	
	if ("`e(vmcfactor)'" == "none") {
		di as text ""
		di as text "    No small-cluster correction factor has been applied."
	}		
	
	if (e(df_r) != .) {
		di as text "" 
		di as text "	Residual degrees of freedom for t tests and F tests = " as result `e(df_r)' 	
	}	
	if (e(F) != .) {
		local stat F(,)
		local prob Prob > F
	}
	if (e(chi2) != .) {
		local stat chi2()
		local prob Prob > chi2
	}		
	if ("`e(chi2type)'" != "LR" & "`stat'" != "") {
		di as text ""
		di as text "    `stat' and `prob' above only account for one-way clustering on `e(clustvar1)'."       
		di as text "      Use {helpb test} to compute `stat' and `prob' that account for `k_way'-way clustering."		
	}
	capture confirm matrix e(V_raw)
	if (_rc == 0) {
		di as text ""
		di as text "	The initial variance-covariance matrix, " as result "e(V_raw)" as text ", was not positive semi-definite." 
		di as text "	  The final matrix, " as result "e(V)" as text ", was computed by replacing negative eigenvalues with 0s."
	}
end 

exit