/* change log
2015-04-24: extraction of cluster variable from models fixed.
- more robust treatment of unidentified covariates
2015-08-26: word count more robust to very long lists
*/
program rhausman, rclass
	syntax [anything] [if] [in] [aw pw /] [, reps(integer 100) subset(string) ///
	bsdata(string) cluster detail]
	version 12
	
	*errors
	if wordcount("`anything'")!=2 {
		di as err "you must specify two models"
		exit 198
	}
	local e1 : word 1 of `anything'
	local e2 : word 2 of `anything'
	if "`e1'" == "`e2'" {
		di as err "the two models need to be different"
		exit 198
   	}
	
	if "`detail'"=="detail" 	loc hide ""
	else						loc hide "qui"
	
	*time stamp	
	loc t1=string(  clock("$S_DATE $S_TIME", "DMYhms"), "%14.0f"  )
	
	*model 1
	`hide' est res `e1'
	loc x1: colf e(b)
	*di "x1: `x1'"
	mat b1=e(b)'
	loc k1= rowsof(b1)
	loc cmd1 "`e(cmdline)'"
	loc dep "`e(depvar)'"
	gettoken (local) regcmd1 (local) regopt1: cmd1, p(,)

	*extract cluster variable if specified
	if "`cluster'"!="" {
		if "`e(clustvar)'"!="" 	loc clustvar "`e(clustvar)'"
		else 					loc clustvar "`e(ivar)'"
	}
	
	*produce error if factor variables are used
	if regexm("`regcmd1'","(.*)[#](.*)") | regexm("`regcmd1'","[ci]\.(.*)") {	
		di as err "Please avoid factor variables in your model."
		exit
	}
	
	*model 2
	`hide' est res `e2'
	loc cmd2 "`e(cmdline)'"
	mat b2=e(b)'
	loc k2= rowsof(b2)
	loc x2: colf e(b)
	`hide' di "x1: `x1'"
	`hide' di "x2: `x2'"
	
	*obtain union of variable lists
	loc x1: list x1 | x2

	*clean the x-list from prefixes
	loc length: word count `x1' 
	foreach v of loc x1 {
		if regexm("`v'","(.?)o[.](.*)") { // if omitted
		*di "`v'"
			loc vclean = regexr("`v'","(.*)o[.]","")
			loc xzero "`xzero' `vclean'"
		}
		else { // if variable not omitted
		loc vclean = regexr("`v'","(.*)[:]","") // drop equation name
		loc xclean "`xclean' `vclean'"
		}
	}
	
	* remove duplicates
	loc xclean: list uniq xclean
		
	*replace x-list with subset if specified
	if "`subset'"!="" {
	loc xclean: list xclean & subset
	}
	*exclude constant
	loc constant "_cons"
	`hide' di "xclean before: `xclean'"
	loc xclean: list xclean - constant
	`hide' di "xclean after: `xclean'"
	`hide' di "xzero: 	`xzero'"
	
	* remove duplicates
	loc xclean: list uniq xclean
	
	* exclude coefficients that are not identified (if not yet excluded)
	`hide' est res `e1'
	foreach v of loc xclean {
		cap di _b[`v']
		if !_rc {
			if (_b[`v']==0)	loc xclean: list xclean - v
		}
	}
	`hide' est res `e2'
	foreach w of loc xclean {
		cap di _b[`w']
		if !_rc {
			if _b[`w']==0	loc xclean: list xclean - w
		}
	}
	
	
	*extract cluster variable if not yet extracted in model 1
	if "`cluster'"!="" & "`clustvar'"=="" {
		if "`e(clustvar)'"!="" 	loc clustvar "`e(clustvar)'"
		else 					loc clustvar "`e(ivar)'"
	}
	if "`clustvar'"=="" {
		di as err "You must specify a cluster variable in your models " /*
		*/ "when using the 'cluster' option in rhausman"
		exit
	}

	*scalar names: bootstrapped coefficients
	foreach v of loc xclean {
		loc sim1 "`sim1' (b1_`v')"
		loc sim2 "`sim2' (b2_`v')"
	}
	`hide' di "`sim1' " _n "`sim2'"
	
	*for postfile: define coefficient names
	foreach v of loc xclean {
		loc b1names = "`b1names' b1_`v'"
		loc b2names = "`b2names' b2_`v'"
	}
	`hide' di "b1names: `b1names' "
	`hide' di "b2names: `b2names' "

	*--------------
	*start boostrap code
	cap postclose bootstr
	tempfile bootstr
	`hide' postfile bootstr `b1names' `b2names' using `bootstr', replace
	
	*iterate
	forv i=1/`reps' {
	
		preserve
		
		*resample
		if "`cluster'"=="" {
			bsample
		}
		else {
			cap drop `id'
			tempvar id 
			bsample, cluster(`clustvar') idcluster(`id')
			qui xtset `id'
		}
		*estimate model 1
		qui `cmd1'
		foreach v of loc xclean {
		sca b1_`v' = _b[`v']
		}
		mat b1=e(b)'
		*estimate model 2
		qui `cmd2'
		mat b2=e(b)'
		foreach v of loc xclean {
		sca b2_`v' = _b[`v']
		}
		
		*save bootstrapped coefficients in postfile
		post bootstr `sim1' `sim2'
		
		* forecast of computation time
		if (`i'==50 & `reps'>100) {
		loc t2=string(  clock("$S_DATE $S_TIME", "DMYhms"), "%14.0f"  )
		loc trem=(`reps'/50 - 1) * (`t2'-`t1')
		}
		*bootstrap dots
		if (`i'==1) {
		di as text "bootstrap in progress" 
		di as txt "{hline 4}{c +}{hline 3} 1 "	///
				  "{hline 3}{c +}{hline 3} 2 "	///
				  "{hline 3}{c +}{hline 3} 3 "	///
				  "{hline 3}{c +}{hline 3} 4 "	///
				  "{hline 3}{c +}{hline 3} 5 "		
		}
		loc round=`i'/50
		capture confirm integer number `round'
		if !_rc 			di in gr ". " `i'
		if  _rc				di in gr "." _c
		if (`i'==50 & `reps'>100) {
		di "(This bootstrap will approximately take another " ///
		floor(hours(`trem')) "h. " ///
		floor(minutes(`trem') - floor(hours(`trem'))*60 ) "min. " ///
		seconds(`trem') - floor(minutes(`trem'))*60 "sec.)"
		}
		
		restore
		
	}
	postclose bootstr
	*--------------	end of boostrap code

	
	*open file with bootstrapped coefficients
	preserve
	use `bootstr', clear

	*vector of differences in coefficients
	tempname bdif
	foreach v of loc xclean {
		qui est res `e1'
		sca b1_`v' = _b[`v']
		qui est res `e2'
		sca b2_`v' = _b[`v']
		mat `bdif'=nullmat(`bdif') , (b1_`v' - b2_`v')
	}
	`hide' matlist  `bdif'
	local df = colsof(`bdif')

	*generate bootstrapped differences in coefficients
	foreach v of loc xclean {
	g double d_`v'=b1_`v' - b2_`v'
	loc dbeta "`dbeta' d_`v'" 
	}
	
	*covariance matrix of bootstrapped differences
	qui cor `dbeta', cov
	mata:Vdif=st_matrix("r(C)")
	tempname Vdif 
	mat `Vdif' =r(C)
	
	*Hausman test statistic
	mata:bdif=st_matrix("`bdif'")
	//mata:bdif
	//mata:invsym(Vdif)
	mata:chi2 = bdif * invsym(Vdif) * bdif'
	tempname chi2 rank
	mata:st_numscalar("`chi2'",chi2)
	mata:st_numscalar("`rank'",rank(Vdif))
	`hide' di "chi2-statistic: " `chi2'
	`hide' di "df: " `df'
	`hide' di "rank: " `rank'
	`hide' di "pval: " chiprob(`rank',`chi2')

	`hide' mean `dbeta'
	tempname b_dif_boot
	mat `b_dif_boot' = e(b)

	*generate dataset
	if "`bsdata'"!=""  save `bsdata'
	
	restore

	*display results
	di as txt "{hline 80}"
	if "`cluster'"=="cluster" loc clu_label "Cluster-"
	di as res "`clu_label'Robust Hausman Test" 
	di as txt "(based on `reps' bootstrap repetitions)"  _n
	di as txt "{lalign 80:b1: obtained from `cmd1'}"
	di as txt "{lalign 80:b2: obtained from `cmd2'}"
	if "`subset'"!="" {
	di as txt "Included in the test: `xclean'"
	}
	if "`xzero'"!="" {
	di as text "Excluded (not identified, or only identified in one model): `xzero'"
	}


	di _n as txt "    Test:  Ho:  difference in coefficients not systematic"
	di _n as txt "{ralign 25:chi2({res:`=`df''})}" ///
	   " = (b1-b2)' * [V_bootstrapped(b1-b2)]^(-1) * (b1-b2)"

	di as txt _col(27) "=  " as res %10.2f `chi2' _n ///
	   as txt _col(17) "Prob>chi2 =  "               ///
	   as res %10.4f chiprob(`df', `chi2')

	if `rank'<`df' {
	di _n as res _col(5) "Warning: your covariance matrix is rank deficient. " ///
		"It may be that the number of bootstrap repetitions was smaller " ///
		"than the number of independent variables."
	}
	
	
	*return results in r()
	ret scalar p    	= chiprob(`df', `chi2')
	ret scalar df   	= `df'
	ret scalar rank   	= `rank'
	ret scalar chi2 	= `chi2'
	
	ret mat V_dif = `Vdif'
	ret mat b_dif_boot=`b_dif_boot'
	ret mat b_dif = `bdif'
	
	
end