/* surveybias.ado --- 
 * 
 * Filename: surveybias.ado
*! Author: Kai Arzheimer & Jocelyn Evans
*! 1.4 Aug 07 2015
 * URL: 
 * Keywords: 
 * Compatibility: 
 * 
 */

/* Commentary: 
 * 
 * 
 * 
 */

/* Change log:
 * 
 * 
 */

/* Code: */


*! Calculate A, A', B and Chi-2

program define surveybias, eclass byable(recall) properties(svyb svyj svyr)
version 11
	if !replay() {
		syntax varname(numeric)  [if] [in] [fweight pweight iweight aweight], POPvalues(numlist min=2) [VERBose] [prop] [NUMerical] [Level(cilevel)] [VCE(passthru)] [subpop(passthru)] [CLuster(passthru)] [svy]
* prop is a synonym for option previously known as numerical
		if "`prop'" != "" {
			local numerical "numerical"
			}
		marksample touse
		local commandline   "surveybias `*'"
* switch to svy if subpop-Option set & check that var exists & is unambiguous
		if "`subpop'" != "" {
			local checkvar : subinstr local subpop "subpop(" ""
			local checkvar : subinstr local checkvar ")" ""
			confirm var `checkvar'
			local svy "svy"
			}
		if "`svy'" != "" {
			display "Using survey characteristics of your data" _n
			local setsurveyprefix "svy , `subpop' : "
			if "`numerical'" == "" {
				local numerical "numerical"
				local complex "complex"
				}
			}
		if "`cluster'" != "" | "`vce'" != "" {
* Does clustering variable exist? Is it unambiguous? 
			if "`cluster'" != "" {
				local checkvar : subinstr local cluster "cluster(" ""
				local checkvar : subinstr local checkvar ")" ""
				confirm var `checkvar'
				}
			if "`numerical'" == "" {
				}
			local numerical "numerical"
			local complex "complex"
			}
* Take care of weights
		if "`weight'" != "" {
			local weighting "[`weight' `exp']"
			}
		if "`weight'" == "pweight" & "`numerical'" == "" {
			local numerical "numerical"
			local complex "complex"
			}
		if "`weight'" == "aweight" {
			if "`cluster'" != "" | "`vce'" != "" {
				display as error _n "You may not combine aweights with complex variance estimators" _n
				error 406
				}
			if "`numerical'" != "" {
				display  "Switching to analytical methods"
				local numerical ""
				local complex "complex"
				}
			}

* unique values + integer values
		qui inspect `varlist' if `touse'
		local nunique=r(N_unique)
		tempname intvalues
		scalar `intvalues' = r(N_posint) + r(N_negint)
* Min/max
		qui summarize `varlist' if `touse'
		local mincode=`r(min)' 
		local maxcode=`r(max)'

* Test for problems with variable
	
		if `nunique' <2 {
			display as error "There is no variation in `varlist'" _newline
			error 409
			* No variation in variable
		}
		else if `nunique' >12 {
			display "Warning: `varlist' has many (`nunique') unique values" _newline
    		display "combining values/recoding `varlist' is strongly recommended" _newline
		}
		else if `r(N)' > `intvalues' {
			display as error "`varlist' has non-integer values" _newline
			exit 498
			}
		else if `nunique' > (`maxcode' - `mincode' +1) {
			display "Warning: category codes non-consecutive"
			}

* Test for problems with numlist
		local listnum : word count `popvalues'
		if `listnum' != `nunique' {
			display as error "Number of categories in  sample (`nunique') and population (`listnum') do not match" _newline
			exit 498
			}

* Test for non-positive values in numlist
		foreach value of local popvalues  {
			if `value'  <=0 {
				display as error "All proportions must be strictly positive" _newline
				error 125
				}
			}
		
		if "`verbose'" != "" {
			dis as txt "Information on `varlist'"	
			codebook `varlist'	
			dis as txt "Please make sure that `varlist' is correctly labelled, and that"
			dis as txt "the sequence of population values matches `varlist' "
			dis _n
			}
* Normalise information on population to proportions

* Calculate sum
		local popvalsum = subinstr("`popvalues'"," "," + ",.)
* Force evaluation of sum	
		local popvalsum = `popvalsum'
* Normalise
		local normpopvalues = ""
		foreach v of local popvalues {
			local normedval =  `v' / `popvalsum'
			local normpopvalues `normpopvalues' `normedval'
			}

* Is there a significant difference between the distribution in the sample and the known distribution in the population?
* Calculate empirical proportions.
* Run a chi-square test further down the line
		tempname observed
		if "`weight'" == "pweight" {
			qui tab `varlist' if `touse' ,matcell(`observed')
			}
		else {
			qui tab `varlist' `weighting' if `touse' ,matcell(`observed')
			}
		local cases = r(N)
		matrix `observed' = `observed''

* Collect labels/numbers in macro
			qui levelsof `varlist',local(surveycats)
			foreach s of local surveycats {
			local valueorlabel : label (`varlist') `s' 12
* Limit length of label and sanitize (chomp spaces)
			local valueorlabel : subinstr local valueorlabel " " "", all
			local surveycatlabels `surveycatlabels' A':`valueorlabel'
				}


/*****************************************************************/
/* Major branch ahead:														  */
/* Analytical solution (new in version 1.2 and now default) vs	  */
/* prop/logit solution (default in previous versions)	     */
/*****************************************************************/

* Share code over both branches
* Build vector of v-components (v/(1-v))
			tempname vcomponent pcomponent v p missmat 
			matrix `vcomponent' = J(1,`listnum',0)
			matrix `v' = J(1,`listnum',0)
			forvalues index = 1/`listnum' {
				local thispopval : word `index' of `normpopvalues'
				matrix `vcomponent'[1,`index'] = (1- `thispopval') / `thispopval'
* Keep untransformed population proportions as vector v
				matrix `v'[1,`index'] = `thispopval'
				}
* Create vector for As
			matrix aprimes = J(1,`listnum',0)

		
		if "`numerical'" == "" {
* Not numerical: Direct estimation starts here
* Create vector of p-components (p/1-p)
			matrix `p' = `observed' / `cases'
			matrix `pcomponent' = `p'
			forvalues i = 1 / `listnum' {
				matrix `pcomponent'[1,`i'] = `pcomponent'[1,`i'] / (1- `pcomponent'[1,`i'])
				matrix aprimes[1,`i'] = ln(`pcomponent'[1,`i'] * `vcomponent'[1,`i'])
				}		

* Build the Jacobian (k*k)
			matrix Jac = J(`listnum',`listnum',0)
			forvalues i =  1/`listnum'  {
			matrix Jac[`i',`i'] = 1 / (`p'[1,`i'] * ( 1 - `p'[1,`i']))
				}


* Next: analytical derivation of variance-covariance matrix
* Sigma_theta for variation of Parameters in theta space (proportions in multinomial distribution)
* Matrix is k*k where k = number of categories = `listnum'
			matrix sigmatheta = J(`listnum',`listnum',0)
* Calculate diagonal elements
			forvalues i =  1/`listnum'  {
				matrix sigmatheta[`i',`i'] = `p'[1,`i']  * (1 -  `p'[1,`i']) / `cases'
				}
* Do off-diagonal elements
			forvalues r = 1/`listnum' {
				local rstart = `r' + 1
				forvalues c = `rstart' / `listnum' {
					matrix sigmatheta[`r',`c'] = -1 * `p'[1,`r'] * `p'[1,`c'] / `cases'
					* symmetric
					matrix sigmatheta[`c',`r']  = sigmatheta[`r',`c'] 
					}
				}
			matrix covs = Jac * sigmatheta * Jac'
			matrix rownames aprimes=A'

			}

		else {

* prop/mlogit procedure starts here

* Estimate proportions (to get SEs)
		 capture `setsurveyprefix' proportion `varlist' `weighting' if `touse',  `cluster' `vce'
* Save number of complete cases
		local cases = e(N)
* proportion gives names to equations if var has val-label, so we cannot use surveycats
   	local propnames = e(namelist)	
* Create vector of p-components (p/1-p)
			matrix `p' = e(b)
			matrix `pcomponent' = `p'
			forvalues i = 1 / `listnum' {
				matrix `pcomponent'[1,`i'] = `pcomponent'[1,`i'] / (1- `pcomponent'[1,`i'])
				matrix aprimes[1,`i'] = ln(`pcomponent'[1,`i'] * `vcomponent'[1,`i'])
				}		

* Build the Jacobian (k*k)
			matrix Jac = J(`listnum',`listnum',0)
			forvalues i =  1/`listnum'  {
			matrix Jac[`i',`i'] = 1 / (`p'[1,`i'] * ( 1 - `p'[1,`i']))
				}

			
* Categories need not be consecutive, so we cannot simply loop from 1 to n but need an index
	
		qui levelsof `varlist',local(surveycats)
			matrix covs = e(V) 
			matrix covs = Jac * covs * Jac'


			}

* Relabel matrices
		local surveycatlabels `surveycatlabels' B:B
		local surveycatlabels `surveycatlabels' B:B_w
	   mata: calcbandbw()
		matrix rownames aprimes=A'
		matrix colnames aprimes=`surveycatlabels'
* Expand covariance matrix to hold missing values for B, B_w
* From v 1.2, we do no longer calculate pseudo SEs for these two
		matrix missmat = J(`listnum',2,0)
		matrix covs = covs , missmat
		matrix missmat = J(2, `listnum'+2,0)
		matrix covs = covs \ missmat 
		matrix rownames covs=`surveycatlabels'
		matrix colnames covs=`surveycatlabels'
		
* Complete chi-square calculations (matrix with observed frequencies calculated above)
* This is only valid for simple variance estimator/no weights
* local complex is a switch - if it's triggered, perform Wald Test instead		
		tempname df
		scalar `df' = `nunique' - 1
		if "`complex'" == "" {
			tempname chisqp
			tempname chisqlr
			tempname pp
			tempname plr
			mata: calcchisquare()
			scalar `pp' = chi2tail(`df',`chisqp')
			scalar `plr' = chi2tail(`df',`chisqlr')
			}
		else {
			}		


* Prepare for posting

		ereturn post aprimes covs,depname(`varlist') esample(`touse') obs(`cases')


* Once more, branch: Chi-Square or Wald?
		if "`complex'" == "" {
* Pearson Chi-2
			ereturn scalar chi2p =  `chisqp'
			ereturn scalar pp = `pp'
* LR Chi-2
			ereturn scalar chi2lr =  `chisqlr'
			ereturn scalar plr = `plr'
			}
*Wald-Land
		else {
			qui test [A']
			tempname chisqwald
			tempname pwald
			scalar `chisqwald' = r(chi2)
			scalar `pwald' = r(p)
			ereturn scalar chi2wald = `chisqwald'
			ereturn scalar pwald = `pwald'
			}
		ereturn scalar df = `df'
		ereturn local cmdline `commandline'
		ereturn local cmd "surveybias"
		}
	else { //replay
		syntax [, Level(cilevel)]
		local cases = e(N)
		}
	ereturn display, level(`level')
	display as text " "
	display as text _col(5) "Ho: no bias"

	display as text _continue _col(5) "Degrees of freedom: "
	display as result  _col(25) e(df)

	if "`complex'" == "" {
		display as text _continue _col(5) "Chi-square (Pearson) = "
		display as result  _col(26) e(chi2p)
		display as text _continue _col(5) "Pr (Pearson) = "
		display as result _col(18) e(pp)

		display as text _continue _col(5) "Chi-square (LR) = "
		display as result   _col(20) e(chi2lr)
		display as text _continue _col(5) "Pr (LR) = "
		display as result _col(13) e(plr)
		}
	else {
		display as text _continue _col(5) "Chi-square (Wald) = "
		display as result   _col(20) e(chi2wald)
		display as text _continue _col(5) "Pr (Wald) = "
		display as result _col(13) e(pwald)
		}

* Small sample size?

	if `cases' < 100 {
		display _n "Warning: The effective sample size is very small (n=`cases')."
		display "Chi square values may be unreliable."
		display "Consider using mgof (ssc describe mgof) for exact tests."
		}

end
mata:
	void calcchisquare()
	{
		myvalidcases = strtoreal(tokens(st_local("cases")))
		expected = strtoreal(tokens(st_local("normpopvalues")))*myvalidcases
		tempstring = st_local("observed")
		/* get observed values */
		observed = st_matrix(tempstring)
		/* calculate Pearson chisquare and LR chisquare */
		chisqp = sum(((observed :- expected):^2) :/ expected)
		chisqlr = 2*sum(ln(observed :/ expected):*observed)
		/* return chisq in temp macro */
		chitempstringp = st_local("chisqp")
		chitempstringlr = st_local("chisqlr")
		st_numscalar(chitempstringp,chisqp)
		st_numscalar(chitempstringlr,chisqlr)
		}
	
	end

* Calculate B and B_w from vector aprimes, append them 	

	mata:
		void calcbandbw()
		{
		/* Get Matrix/Vector auf As from Stata */
		myaprimes = st_matrix("aprimes")
		/* Take absolute values			 */
		myabsprimes = abs(myaprimes)
		/* Get number of categories */
		k = strtoreal(tokens(st_local("listnum")))
		/* Define vectors fuer simple/weighted average */
		simple=J(1,k,1/k)
		/* Calculate b and append */
		b= simple * myabsprimes'
		myaprimes = myaprimes , b
		/* Convert list of normalised proportions in the population to vector of weights */
		v = strtoreal(tokens(st_local("normpopvalues")))
		/* Calculate bw and append  */
		bw = v * myabsprimes'
		myaprimes = myaprimes, bw
		/* Return matrix to Stata-space */
		st_matrix("aprimes",myaprimes)
		}

	end

/* surveybias.ado ends here */