/******************************
* cgmwildboot
* Regression with wild cluster
* bootstrap standard errors as
* in Cameron, Gelbach and Miller
* (2008 ReStat)
* Updates 2013-09-06:
* - Changed p-value computation to base p-values for negative coefficients on
*   the lower tail of the bootstrap distribution and for positive coefficients
*   on the upper tail of the bootstrap distribution
*
* - Changed covariance matrix entries to deal with extreme p-values of 0 or 1,
*   which can occur if initial-tstat is the larger or smaller than any of the
*   bootstrap t-stats. The covariance matrix now puts maxfloat() for a p-value
*   of one (approximately infinite variance, which will give a p-value of 1)
*   and 1/maxfloat() for a p-value of zero (approximately zero variance, which
*   will give a p-value of zero)
* Updates 2013-10-15
* - Added check for the "unique" command
*
* Updates 2013-10-18
* - Corrected number of t-stats divisor for the t-statistic
* - Corrected computation of confidence interval
*
* Updates 2014-02-04
* - Added adjusted R2 and R2 labels
* - Updated cgmreg (does not modify this ado file, but affects results from it)
*
* Updates 2014-02-19
* - Added code to deal with dropped variables (e.g., fixed effects dropped due to collinearity)
* - Similar updates to cgmreg (does not modify this ado file, but affects results from it)
*
* Updates 2014-03-26
* - Corrected error with confidence interval (force negative coefficients to look at
*   lower tail for p-values, and force positive to look at upper tail)
*
* Updates 2015-03-10
* - Corrected confidence interval (again) - Put code to avoid p-values greater than one
*   when, for example, the coefficient is positive and lies within the bottom half of
*   the bootstrap sample (previously, if t-stat on positive coefficient was at the, say
*   25th percentile, the p-value would be 2*(1 - 0.25) = 1.5; this puts it to 1)
*
******************************/


program define cgmwildboot, eclass byable(onecall) sortpreserve
	syntax varlist [if] [in] [aweight fweight iweight pweight /], Cluster(varlist) bootcluster(varlist min=1 max=1) [null(string) reps(integer 1000) seed(numlist integer >=0) maxbad(integer 10) *]

	/* Temporary variables, etc */
	tempname depvar numx numxc tmpi tmpj tmpv b covmat depv xnull xlist xlistc tmpx tmpb tmpb tmpp opt tmat imat bmat e_NC e_S nbad numt p ig iy c1 c2 c3 c4 c5 ///
		bsb cH cL cN e_N e_df_m e_df_r e_title e_properties e_predict e_model e_estat_cmd e_r2 e_r2_a gmat nreps matnames colnames omit colN kC
	tempvar tmpy tmp pos we wy yhat ehat tmpx n regvar clusvar bootvar
	tempfile data

	capture which unique
	if _rc != 0 {
		di as error `"You need the unique command. You can obtain it by typing "findit unique" in Stata."'
		exit
		}

	
	preserve

	qui query memory
	if r(matsize)<2*`reps' {
		qui set matsize `=2*`reps''
		}

	/* Mark sample */
	marksample touse
	qui egen `regvar'=rowmiss(`varlist')
	qui egen `clusvar'=rowmiss(`cluster')
	qui egen `bootvar'=rowmiss(`bootcluster')
	qui replace `touse'=(`touse' & `regvar'==0 & `clusvar'==0 & `bootvar'==0)


	/* deal with weights */
	if "`weight'"~="" {
		local weight "[`weight'=`exp']"
	} 
	else {
		local weight ""
	}




	/* Clean up options */
	local `opt' "`options'"
	while ( regexm("``opt''","robust")==1 ) {

		di " -> Removing string 'robust' from your options line: it's unnecessary as an option,"
		di "    but it can cause problems if we leave it in."
		di "    If some variable in your options list contains the string 'robust', you will"
		di "    have to rename it."
		di 
		local `opt' = regexr("``opt''", "robust", "")

		} 



	while regexm("``opt''","cluster")==1 {
		local `opt'=regexr("``opt''","cluster","")
		}

	while regexm("``opt''","bootcluster")==1 {
		local `opt'=regexr("``opt''","bootcluster","")
		}


	/* Remove boot cluster from cluster list */

	while regexm("`cluster'","`bootcluster'")==1 {
		local cluster=regexr("`cluster'","`bootcluster'","")
		}

	scalar `e_NC'=wordcount("`cluster' `bootcluster'")

	/* Get regression coefficients */

	capture cgmreg `varlist' if `touse' `weight', cluster(`cluster' `bootcluster') ``opt''


	if _rc != 0 {
		di as error "Initial estimtation does not work with cgmreg. Try"
		di as error "cgmreg by itself to see what the problem is."
		exit
		}


	mat `b'=e(b)
	local `tmpx' : colnames `b'
	local `xlistc' : colnames `b'

	_ms_omit_info `b'
	matrix `omit' = r(omit) // Identify omitted variables

	/* Identify regressors */
	local `xlist' : subinstr local `tmpx' " _cons" ""

	local `numx' : word count ``xlist''

	if "`noconstant'"=="" {
		scalar `numxc'=``numx''+1
		}
	else {
		di as error "Sorry, this program does not allow the noconstant option."
		exit
		}


	/* Check for null hypothesis list */

	matrix `xnull'=J(1,`numxc',.)
	if "`null'" != "" {
		if wordcount("`null'") == ``numx'' {
			forvalues k=1(1)``numx'' {
				matrix `xnull'[1,`k']=real(word("`null'",`k'))
				}
			}
		else {
			di as error "Incorrect elements for null. Must contain ``numx''"
			di as error "elements, use missing (.) for regressors without null."
			exit
			}
		}



	mat `covmat'=J(`numxc',`numxc',0)
	mat rownames `covmat'= ``tmpx''
	mat colnames `covmat'= ``tmpx''
	scalar `e_df_r'=e(df_r)
	scalar `e_df_m'=e(df_m)
	local `depvar'=e(depvar)
	scalar `e_N'=e(N)
	scalar `e_S'=e(S)
	local `e_title'=e(title)
	local `e_properties'=e(properties)
	local `e_predict'=e(predict)
	local `e_model'=e(model)
	local `e_estat_cmd'=e(estat_cmd)
	scalar `e_r2'=e(r2)
	scalar `e_r2_a'=e(r2_a)


	/* Write initial t-stats */

	scalar `tmpi'=`numxc'+3
	matrix `tmat'=[1,0,1,J(1,`numxc',.)]
	matrix colnames `tmat'=iter src type ``xlistc''
	matrix `bmat'=`tmat'
	matrix `bmat'[1,3]=0
	
	forvalues j=1(1)`=`numxc'' {
		local `tmpj' : word `j' of ``xlistc''
		if missing(`xnull'[1,`j']) matrix `tmat'[1,`j'+3]=`b'[1,`j']/_se[``tmpj'']
		else matrix `tmat'[1,`j'+3]=(`b'[1,`j']-`xnull'[1,`j'])/_se[``tmpj'']

		matrix `bmat'[1,`j'+3]=`b'[1,`j']
		}

	/*
	* Generate variables with null imposed 
	* - The variable tmpx contains the imposed null
	* - The local tmpx contains the variables without an imposed null
	*/


	qui gen `tmpy'=``depvar''
	qui gen `tmpx'=0
	local `tmpx'=""
	forvalues k=1(1)``numx'' {
		local `tmpv' : word `k' of ``xlist''
		if missing(`xnull'[1,`k']) local `tmpx' "``tmpx'' ``tmpv''"
		else {
			qui replace `tmpy'=`tmpy'-`xnull'[1,`k']*``tmpv''
			qui replace `tmpx'=`tmpx'+`xnull'[1,`k']*``tmpv''
			}
		}

	qui reg `tmpy' ``tmpx'' if `touse' `weight', ``opt''
	qui predict `yhat' if `touse', xb
	qui predict `ehat' if `touse', resid
	qui replace `yhat'=`yhat'+`tmpx' if `touse'

	qui sort `bootcluster'
	qui save `data'


	/* Get number of clusters */

	scalar `tmpi'=wordcount("`cluster' `bootcluster'")
	matrix `gmat'=J(1,`tmpi',.)
	scalar `tmpi'=0
	foreach k in `cluster' `bootcluster' {
		scalar `tmpi'=`tmpi'+1
		qui unique `k' if `touse'
		matrix `gmat'[1,`tmpi']=_result(18)
		}


	/*  Begin bootstrap */

	scalar `nreps'=1
	_dots 0, title("Bootstrap reps") reps(`reps')
	_dots 1 0
	if "`seed'" ~= "" set seed `seed'
	scalar `nbad'=0
	scalar `tmpi' = 2
	while `=`tmpi''<=`reps' {
		qui use `data', replace
		qui by `bootcluster': gen `tmp'=uniform()
		qui by `bootcluster': gen `pos'=(`tmp'[1]<0.5)
		qui gen `we'=`ehat'*(2*`pos'-1)
		qui gen `wy'=`yhat'+`we'
		capture cgmreg `wy' ``xlist'' if `touse' `weight', ``opt'' cluster(`cluster' `bootcluster')
		if _rc==0 {
			scalar `tmpi' = `tmpi'+1
			_dots `tmpi' 0
			scalar `nreps'=`nreps'+1
			local `tmpx' : colnames e(b)
			matrix `tmat'=[`tmat' \ `nreps' ,1,1,J(1,`numxc',.)]
			matrix `bmat'=[`bmat' \ `nreps' ,1,0,J(1,`numxc',.)]
			forvalues j=1(1)`=`numxc'' {
				local `tmpj' : word `j' of ``tmpx''
				if missing(`xnull'[1,`j']) matrix `tmat'[`nreps',`j'+3]=(_b[``tmpj'']-`b'[1,`j'])/_se[``tmpj'']
				else matrix `tmat'[`nreps',`j'+3]=(_b[``tmpj'']-`xnull'[1,`j'])/_se[``tmpj'']
				matrix `bmat'[`nreps',`j'+3]=_b[``tmpj'']
				}
			scalar `nbad'=0
			}
		else if `nbad'<`maxbad' scalar `nbad'=`nbad'+1
		else {
			di as error "Number of failed regressions exceeded `maxbad' in a row"
			exit
			}

		qui drop `we' `wy'
		}


	
	/* End of bootstrap */

	/* Summarize and display results */
	matrix `tmat'=`tmat'[1..`nreps',1...]
	matrix `bmat'=`bmat'[1..`nreps',1...]
	matrix `tmat'=[ `tmat' \ `bmat' ]

	qui drop _all

	local `matnames' : colnames `tmat'

	foreach k of local `matnames' {
		_ms_parse_parts `k'
		if length("`r(level)'")==0 local `colN' `"`=r(name)'"'
		else local `colN' `"`=r(name)'_`r(level)'"
		local `colN' : subinstr local `colN' "_cons" "cons"
		local `colnames' "``colnames'' ``colN''"
		}

	*noisily di "``colnames''"


	matrix colnames `tmat' = ``colnames''

	qui svmat `tmat', names(col)


	/*

	Matrix has columns
		iter				Bootstrap iteration
		src				Original (0) or bootstrapped (1)
		type				Coefficient (0) or t-stat (1)
		<variable names>	Either coefficient or t-stat, depending on type

	*/

	local `ig' "in green"
	local `iy' "in yellow"
	local `c1' "_col(16) %10.0g"
	local `c2' "_col(28) %10.0g"
	local `c3' "_col(40) %10.0g"
	local `c4' "_col(52) %10.0g"
	local `c5' "_col(64) %10.0g"
	


	di
	di ``ig'' "Regress with clustered SEs/Wild bootstrap (" ``iy'' `=`nreps'' ``ig'' " successful resamples)"
	di ``ig'' "Number of clustvars" _column(20) "=" _column(21) %5.0f ``iy'' `=`e_NC'' ///
		``ig'' _col (50) "Number of obs" _col(64) "=" _column(66) %8.0f ``iy'' `=`e_N''
   	di ``ig'' "Num combinations" _column(20) "=" _column(21) %5.0f in yellow `=`e_S'' ///
		``ig'' _column(50) "R-squared" _column(64) "=" _column(66) %8.4f ``iy'' `=`e_r2''
	di ``ig'' _column(50) "Adj R-squared" _column(64) "=" _column(66) %8.4f ``iy'' `=`e_r2_a''

	if "`if'"~="" di in green _column(50) "If condition" _column(64) "= `if'"
	if "`in'"~="" di in green _column(50)     "In condition" _column(64) "= `in'"
	if "`weight'"~="" di in green _column(50) "Weights are" _column(64) "= `weight'"

	scalar `tmpi'=0
	foreach k in `cluster' `bootcluster' {
		scalar `tmpi' = `tmpi' + 1
		di _column(50) ``ig'' "G(`k')" _column(64) "=" _column(66) %8.0f ``iy'' `gmat'[1,`tmpi']
		tempname N_`=`tmpi'' NM_`=`tmpi''
		local `NM_`=`tmpi''' "`k'"
		scalar `N_`=`tmpi''' = `gmat'[1,`tmpi']
		} /* end getting num obs by cluster var */
	di _column(50) ``ig'' "(Bootstrapped)"


	di ``ig'' "{hline 12}{c TT}{hline 60}"
	di ``ig'' %12s abbrev("``depvar''",12) "{c |}" %12s "Coef." %12s "Null" %12s "p-value" %24s "[95% Conf. Interval]"
	di ``ig'' "{hline 12}{c +}{hline 60}"

	qui gen `n'=.

	forvalues k=1(1)`=`numxc'' {
		local `kC' = `k'+3
		local `tmpv' : word ``kC'' of ``colnames''

		if `omit'[1,`k']==0 {

			qui summ ``tmpv'' if type==1
			scalar `numt'=r(N)
			qui sort type ``tmpv''
			qui by type: replace `n'=_n
			/* Average observations if several close to observed t-stat */
			qui summ ``tmpv'' if type==1 & src==0
			scalar `tmpi'=r(mean)

			qui summ `n' if abs(``tmpv''-`tmpi') < 0.000001 & type==1
			scalar `p'=2*cond(`tmpi'<0,min(r(mean)/`numt',0.5),cond(`tmpi'>0,min(1-r(mean)/`numt',0.5),1,.),.)


			/* Fake covariance matrix and (real) confidence interval */
			/* Pick biggest/smallest if no match */
			qui summ ``tmpv'' if type==0 & src==0
			scalar `tmpi'=r(mean)
			mat `covmat'[`k',`k']=cond(`p'==0,1/maxfloat(),cond(`tmpi'==0 | `p'==1,maxfloat(),(`tmpi'/invttail(`e_df_r',`p'/2))^2,.),.)


			if `covmat'[`k',`k']==. mat `covmat'[`k',`k']=0
			qui summ `n' if type==0 & ~missing(``tmpv'')
			scalar `cL'=r(min)
			scalar `cH'=r(max)
			scalar `cN'=r(max)-r(min)+1
			qui summ ``tmpv'' if type==0 & `n'==floor(`cL'+0.025*(`cN'))
			if r(N)==0 {
				qui summ ``tmpv'' if type==0
				scalar `cL'=r(min)
				}
			else scalar `cL'=r(mean)
	
			qui summ ``tmpv'' if type==0 & `n'==ceil(`cH'-0.025*(`cN'))
			if r(N)==0 {
				qui summ ``tmpv'' if type==0
				scalar `cH'=r(max)
				}
			else scalar `cH'=r(mean)
			di ``ig'' %12s abbrev("``tmpv''",12) "{c |}" ``iy'' ``c1'' `b'[1,`k'] ``c2'' `xnull'[1,`k'] ``c3'' `p' ``c4'' `cL' ``c5'' `cH'
			}
		else {
			di ``ig'' %12s abbrev("``tmpv''",12) "{c |}" ``c1'' "(dropped)"
			}
		}

	di ``ig'' "{hline 12}{c BT}{hline 60}"


	/* Post results */
	qui use `data', clear

	ereturn post `b' `covmat', e(`touse') depname(``depvar'')

	ereturn scalar N		= `e_N'
	ereturn scalar df_m	= `e_df_m'
	ereturn scalar df_r	= `e_df_r'
	ereturn scalar r2		= `e_r2'
	ereturn scalar r2_a	= `e_r2_a'
	ereturn scalar nreps	= `nreps'
	ereturn scalar NC		= `e_NC'
	ereturn scalar S		= `e_S'
	forvalues k=1(1)`=`e_NC'' {
		ereturn scalar N_``NM_`k''' = `N_`k''
		}
	
	ereturn local title		= "``e_title''"
	ereturn local depvar		= "``depvar''"
	ereturn local cmd			= "cgmwildboot"
	ereturn local properties	= "``e_properties''"
	ereturn local predict		= "``e_predict''"
	ereturn local model		= "``e_model''"
	ereturn local estat_cmd		= "``e_estat_cmd''"
	ereturn local vcetype		= "FAKE"
	ereturn local clustvar		= trim("`cluster' `bootcluster'")
	ereturn local bootclust		= "`bootcluster'"

	ereturn matrix bootresults=`tmat'

	restore

end