*! version 2, 13Oct2004, John_Hendrickx@yahoo.com
/*
Direct comments to: John Hendrickx <John_Hendrickx@yahoo.com>

Called by perturb.ado to calculate a matrix of reclassification
probabilities.

Version 2, September 30, 2004
Added option -pcnttab- to specify reclassification percentages directly
option -noadjust- to skip making the row and column marginals of the
expected table equal
option -nobestmod- to skip finding a model for misclassification
option -dist- to use a distance model for the association
Version 1, July 11, 2004
Intital version
*/


program define reclass, rclass
	version 7
	syntax varname , [Q(real 1)  U(real 1) Dist(real 1) ASsoc(string) /*
	*/ Pcnttab(string) noADjust noBestmod /*
	*/ Misclass(numlist >0 <100 max=1) Format(string) Verbose]
	preserve

	if "`verbose'" == "" {
		local how "quietly"
	}
	if `"`format'"' == "" {
		local format "%8.3f"
	}
	else {
		capt local tmp : display `format' 1
		if _rc {
			di as err `"invalid %fmt in format(): `format'"'
			exit 120
		}
	}

	`how' tab `varlist', matcell(mrgn)
	local ncat=rowsof(mrgn)
	drop _all
	`how' svmat mrgn, names(artdat)
	`how' expand (`ncat')

	* generate an artificial table with frequencies on the diagonal
	* so it has the same marginals as `varname'
	gen orig=group(`ncat')
	gen dest=1+mod(_n-1,`ncat')
	label variable orig "original variable `varlist'"
	label variable dest "reclassifed variable"
	gen frq=0
	quietly replace frq=mrgn[orig,1] if orig==dest

	* option misclass is maintained for compatiblity
	if "`misclass'" ~= "" {
		tempname inmat
		matrix `inmat'=J(1,`ncat',100-`misclass')
		local pcnttab `inmat'
		local adjust="noadjust"
	}

	if "`pcnttab'" ~= "" {
		local ptype=0
		capture local nrow=rowsof(`pcnttab')
		if _rc ~= 0 {
			tempname inmat
			capture matrix `inmat'=J(1,`ncat',`pcnttab')
			if _rc ~= 0 {
				display as error "Argument `pcnttab' for option {hi:pcnttab} is invalid"
				exit
			}
			local nrow 1
			local ptype=1
			local pcnttab `inmat'
		}
		local ncol=colsof(`pcnttab')
		tempname targtab parmat

		if `nrow' == `ncat' & `ncol' == `ncat' {
			local ptype=3
			mat `targtab'=`pcnttab'
		}
		else if (`nrow' == `ncat' & `ncol' == 1) | (`nrow' == 1 & `ncol' == `ncat') {
			if `ptype' == 0 {local ptype=2}
			tempname inmat
			mat `inmat'=diag(`pcnttab')
			mat `targtab'=`inmat'
			forval i=1/`ncat' {
				mat `targtab'[`i',1]=J(1,`ncat',(100-`inmat'[`i',`i'])/(`ncat'-1))
				mat `targtab'[`i',`i']=`inmat'[`i',`i']
			}
		}
		else {
			display as error "The dimensions of matrix `pcnttab' [`nrow',`ncol'] are invalid for variable `varlist' with `ncat' categories"
			exit
		}
		* create column proportions
		mat `targtab'=inv(diag(`targtab'*J(`ncat',1,1)))*`targtab'
		gen paras=`targtab'[orig,dest]
		display as text _newline "Reclassification probabilities for `varlist':"
		table orig dest [iw=paras], format(`format') col

		mat `parmat'=diag(mrgn)*`targtab'
		quietly replace paras=`parmat'[orig,dest]
		display as text _newline "Initial expected table based on the reclassification probabilities:"
		table orig dest [iw=paras], format(`format') row col

		if "`adjust'" == "noadjust" {
			* don't adjust the margins of the expected table
			* to make them equal to the frequency distribution of the variable
			* just return the cumulative frequencies
			display as text _newline "The reclassification probabilities will used as is and will not be adjusted to let"
			display as text "the expected frequencies of the reclassified variable be equal to those of `varlist'"
			cumul_f `targtab' `ncat'
			return matrix gentab `parmat'
			return matrix margin mrgn
			return matrix classprob rcdcum
			restore
			exit
		}
		else if "`bestmod'" == "nobestmod" {
			* don't try to determine the best model
			* margins will be equal but the association between original/reclassified
			* may be arbitrary
			display as text _newline "The reclassification probabilities will be adjusted to let"
			display as text "the expected frequencies of the reclassified variable be equal to those of `varlist'"
			trg_assoc `parmat'
		}
		else {
			display as text _newline"The reclassification probabilities will be adjusted to let"
			display as text "the expected frequencies of the reclassified variable be equal to those of `varlist'"
			if `ptype' == 3 {
				display as text _newline "A distance model will be determined for the initial expected table"
				get_dist `how'
			}
			else {
				display as text _newline "The expected table will be quasi-independent"
				get_q `ptype' `how'
			}
		}
	}
	else if "`assoc'" ~= "" {
		capture `assoc'
		if _rc ~= 0 {
			local rc=_rc
			display as error "Program `assoc' produced an error (rc `rc')"
			exit
		}
		tempvar checkit
		capture gen `checkit' = sum(missing(paras))
		if _rc ~= 0 {
			local rc=_rc
			display as error "Variable paras was not defined by program `assoc' (rc `rc')"
			exit
		}
		local nmiss=`checkit'[_N]
		if `nmiss' ~= 0 {
			display as error "Variable paras contains `nmiss' missing values!"
			exit
		}

		display as text "Pattern of association defined by program `assoc' will be used for variable `varlist'"
		table orig dest [iw=paras], format(`format')
	}
	else if `q' ~= 1 | `u' ~= 1 | `dist' ~= 1 {
		* parameters for a constrained quasi-uniform association model
		display "Variable `varlist':"
		display "Odds of correct versus misclassification: `q'"
		gen paras=(orig==dest)*ln(`q')
		if `u' ~= 1 {
			display "Uniform association coefficient: `u'"
			quietly replace paras=paras+orig*dest*ln(`u')
		}
		else if `dist' ~= 1 {
			display ", Distance parametr: `dist'"
			quietly replace paras=paras+abs(orig-dest)*ln(`dist')
		}
	}
	else {
		display as error "You must specify either {hi:pcnttab} or {hi:assoc} or one of {hi:q}, {hi:u}, or {hi:dist}"
		exit
	}

	* variable -paras- has been defined above as the log association pattern
	* now estimate the adjusted table using -paras- as an offset variable

	* calculate dummies for 'halfway' effects, i.e. equal main effects
	`how' tabulate orig, generate(rw)
	`how' tabulate dest, generate(cl)
	forval i=1/`ncat' {
		gen hlf`i'=rw`i'+cl`i'
	}
	local nc_1=`ncat'-1

	* estimate a loglinear model with parameters as an offset
	* the predicted values have the same marginals as the input table
	* and the same pattern of association as the offset
	`how' poisson frq hlf1-hlf`nc_1', offset(paras)
	`how' predict pred, n

	display _newline "Adjusted expected table:"
	table orig dest [iw=pred], format(`format') row col

	`how' tabulate orig dest [iw=pred], row nofreq matcell(rcd)
	tempvar rcdprob
	mat `rcdprob'=inv(diag(rcd*J(`ncat',1,1)))*rcd
	quietly replace pred=`rcdprob'[orig,dest]
	display _newline "Final reclassification probabilities:"
	table orig dest [iw=pred], format(`format') col

	* calculate cumulative frequencies
	cumul_f `rcdprob' `ncat'
	return matrix gentab rcd
	return matrix margin mrgn
	return matrix classprob rcdcum
	restore
end

program define trg_assoc
	version 7
	* the input matrix `parmat' has been defined by the pncttab of misclass options
	* as the expected table of original by reclassified
	* next: make `parmat' symmetric, use its log values as an offset
	* to generate the adjusted table
	args parmat
	local nrow=rowsof(`parmat')
	local ncol=colsof(`parmat')
	forval i=1/`nrow' {
		forval j=`i'/`ncol' {
			if `parmat'[`i',`j'] == 0 {
				mat `parmat'[`i',`j'] = 1e-12
			}
			if `parmat'[`j',`i'] == 0 {
				mat `parmat'[`j',`i'] = 1e-12
			}
			mat `parmat'[`i',`j']=( ln(`parmat'[`i',`j'])+ln(`parmat'[`j',`i']) )/2
			mat `parmat'[`j',`i']=`parmat'[`i',`j']
		}
	}
	quietly replace paras=`parmat'[orig,dest]
end

program define get_q
	version 7
	args ptype how
	gen d=(orig==dest)
	if `ptype' ~= 1 {
		quietly replace d=d*orig
	}
	`how' xi: poisson paras i.orig i.dest i.d
	quietly replace paras=0
	foreach var of varlist _Id_* {
		quietly replace paras=paras+`var'*_b[`var']
		display as text "ln(q)=: " as result %13.3f _b[`var']
	}
end

program define get_dist
	version 7
	args how
	gen diag=(orig==dest)
	gen u=orig*dest
	gen dist=abs(orig-dest)
	`how' xi: poisson paras i.orig i.dest diag u
	`how' poisgof
	scalar dev1=r(chi2)
	scalar df1=r(df)
	estimates hold qu
	`how' xi: poisson paras i.orig i.dest diag dist
	`how' poisgof
	if r(chi2) < dev1 {
		quietly replace paras=diag*_b[diag]+dist*_b[dist]
		display as text "using a quasi distance model with ln(q)=" /*
		*/ as result %8.3f _b[diag] "{text: and ln(dist)=}" as result  %8.3f _b[dist]
		display as text "deviance: " as result %12.3f r(chi2) "{text: with }" as result %2.0f r(df) "{text: df}"
		estimates drop qu
	}
	else {
		estimates unhold qu
		quietly replace paras=diag*_b[diag]+u*_b[u]
		display as text "using a quasi uniform association model with ln(q)=" /*
		*/ as result %8.3f _b[diag] "{text: and ln(u)=}" as result  %8.3f _b[u]
		display as text "deviance: " as result %12.3f dev1 "{text: with }" as result %2.0f df1 "{text: df}"
	}
end

program define cumul_f
	version 7
	args inmat ncat
	mat rcdcum=`inmat'
	forval j=2/`ncat' {
		matrix rcdcum[1,`j']=rcdcum[1...,`j']+ rcdcum[1...,`j'-1]
	}
end