*! chaidforest - version 2.0 - 9/28/2015 - Joseph N. Luchman
program define chaidforest, eclass //CHAID-based random forest - program history and notes at end of file
version 12.1
syntax varlist [fw] [if] [in], [NTree(integer 100) NVuse(integer -1) ORDered(varlist) ///
UNOrdered(varlist) DVOrdered Alpha(real .5) MINSplit(integer 0) MINNode(integer 0) ///
NOIsily PROos(real -1) XTile(string) noSamp MISSing]
local full_cmdline "`0'" //save the contents of the cmdline for ereturn-ing later
/*exit and warning conditions*/
if (`:list sizeof unordered' == 0) & (`:list sizeof ordered' == 0) & ///
(`:list sizeof xtile' == 0) { //exit if no splitting variables are entered
display "{err}No splitting variables entered."
exit 111
}
if `minsplit' < `minnode' { //exit if you tell chaidforest that the minimum node/cluster size is larger than the minimum sample size to make a split - which is an infeasible situation for clustering
display "{err}{opt minsplit()} cannot be smaller than {opt minnode()}."
exit 198
}
if (`alpha' <= 0) | (`alpha' >= 1) { //ensure that the alpha probabilities are in the possible range of 0-1
display "{err}{opt alpha()} must range between (but not include) 0 and 1."
exit 198
}
if `minnode' < 0 { //ensure that minimum node/cluster size is a non-negative integer (i.e., a possible sample size)
display "{err}{opt minnode()} must be a non-negative integer value."
exit 198
}
capture which lmoremata.mlib //is moremata present? - chaidforest needs moremata
if _rc != 0 { //if moremata cannot be found, tell user to install it
display "{err}Module {cmd:moremata} not found. Install {cmd:moremata} here " ///
"{stata ssc install moremata}."
exit 198
}
/*begin data processing*/
tempvar keep touse stop cluster useob //tempvar declarations - begin using "touse"; add "keep" markout variables in unordered() and ordered()
tempname cellcount fit nvars N_tree obs //tempname declaration for matrices
quietly generate byte `keep' = 1 `if' `in' //allows keeping of missings as valid category, but still adjusting estimation sample for "if"s and "in"s
if strlen("`xtile'") { //parse and check xtile
gettoken xtile xtile_opt: xtile, parse(",") //separate out the "to xtile" varlist from the options to pass to xtile
quietly summarize `xtile' //summarize the xtile varlist - if it can't be summarized, then its not a variable
}
mark `touse' //mark estimation sample
if strlen("`missing'") markout `touse' `keep' //if missing invoked only markout "if" and "in"...
else markout `touse' `varlist' `unordered' `ordered' `keep' `xtile' //...otherwise markout "if", "in", and missing values
if strlen("`xtile'") { //implement convenience option for creating binned continuous predictors in chaidforest and adding to "ordered" list
foreach var of varlist `xtile' { //loop through all the variables in the xtile list...
capture inspect xt`var' //...see if a variable with the name "xt`varname'" already exists...
if !_rc quietly drop xt`var' //...if it does, drop the variable "xt`varname'" - users are warned of this behavior in the help file
quietly xtile xt`var' = `var' if `touse' [`weight'`exp'] `xtile_opt' //conduct the xtile with options as applicable - with estimation sample
local ordered "`ordered' xt`var'" //add the xtile'd variable to the ordered list
}
}
quietly inspect `varlist' //inspect response variable to obtain values that are 0 and positive (if they don't equal the total - something's negative and that's not allowed)
if r(N) != (r(N_0) + r(N_posint)) { //chaidforest only works with non-negative integer responses - disallow others using the results from the "inspect" command
display "{err}Levels of response variable must be non-negative integer valued."
exit 198
}
quietly levelsof `varlist' if `touse', local(dvlevs) `missing' //obtain levels of response variable
local dvlevs_count "`dvlevs'" //default the level count to all levels (bug fix: previously this macro was empty when no missing codes were present, so the >20 check below was silently skipped); overwritten below when missings are present
if regexm("`dvlevs'", "\.") mata: st_local("dvlevs_count", /// //removes multiple missings from consideration in the levels count for response
invtokens((select(tokens(st_local("dvlevs")), ///
editmissing(strtoreal(tokens(st_local("dvlevs"))), .):!=.), ".")))
if `:list sizeof dvlevs_count' > 20 { //if there are too many levels of the response variable...
display "{err}Number of distinct values in {cmd:`varlist'} is larger than allowed. " ///
_newline "Consider collapsing across similar categories for unordered or using " ///
_newline "{opt xtile()} option/command for ordered variables to reduce the number of " ///
"levels."
exit 198
}
local dvlist "`varlist'" //produce a local macro that is updated with tempvar names for use in Mata
local dvdisp "`varlist'" //produce a local macro that is updated with displayed names of response variable
local type "`type' d" //note type of variable for response variable - type is "d" for "dependent"
if `:list sizeof unordered' > 0 { //process unordered splitting variables when present just as was done with the response variable above...
foreach var of varlist `unordered' { //macros created for parsing and different purposes - unordered
quietly inspect `var' //inspect unordered splitting variable to obtain values that are 0 and positive (if they don't equal the total - something's negative and that's not allowed)
if r(N) != (r(N_0) + r(N_posint)) { //chaidforest only works with non-negative integer splitters - disallow others using the results from the "inspect" command
display "{err}Levels of each splitting variable must be non-negative integer " ///
"valued. Variable {cmd:`var'} has negative or non-integer values."
exit 198
}
quietly levelsof `var' if `touse', local(`var'levs) `missing' //obtain levels of unordered splitting variable
local `var'levs_count "``var'levs'" //default the level count to all levels (bug fix: previously empty with no missing codes, skipping the >20 check); overwritten below when missings are present
if regexm("``var'levs'", "\.") mata: st_local("`var'levs_count", /// //removes multiple missings from consideration in the levels count for unordered splitting variable
invtokens((select(tokens(st_local("`var'levs")), ///
editmissing(strtoreal(tokens(st_local("`var'levs"))), .):!=.), ".")))
if `:list sizeof `var'levs_count' > 20 {
display "{err}Number of distinct values in {cmd:`var'} is larger than " ///
"allowed." _newline "Consider collapsing across similar categories for " ///
"unordered or using " _newline "{opt xtile()} option/command for ordered " ///
"variables to reduce the number of levels."
exit 198
}
foreach val of numlist ``var'levs' { //for each level of splitters, register binary variable...
if `:list posof "`val'" in `var'levs' == 1 { //leading text for locals to be fed to chaidforest()...
local type "`type' ("
local unorddisp "`unorddisp' ("
local unorderedlist "`unorderedlist' ("
}
if !missing(`val') { //...if not a missing value...
tempvar `var'`val' //create tempvars with name of level of unordered, binary variable
quietly generate byte ``var'`val'' = `var' == `val' //generate binaries for each level of unordered variable
local unorderedlist "`unorderedlist' ``var'`val''" //include name of unordered binary variable in list of tempvar names
local unorddisp "`unorddisp' `var'_`val'" //include displayed name of unordered binary variable in list of tempvar names
local type "`type' u" //include variable type in type macro
}
else { //...if any kind of missing value...
mata: st_local("miss_name", strofreal(st_isname("``var'ms'"))) //determine if the missing binary variable is already present
if !`miss_name' { //if the missing binary variable is not found...
tempvar `var'ms //create tempvar for unordered variable indicating missing
quietly generate byte ``var'ms' = `var' == `val' //generate indicator for missing category
local unorderedlist "`unorderedlist' ``var'ms'" //include missing tempvar name in unordered list
local unorddisp "`unorddisp' `var'_ms" //include missing display name in unordered display list
local type "`type' u" //include type still as unordered
}
else quietly replace ``var'ms' = ``var'ms' + (`var' == `val') //...else add in the other missing code... only valid with multiple missing codes (., .a, .b, etc.)
}
if (`: list posof "`val'" in `var'levs' == `: list sizeof `var'levs') { //trailing text for all the locals to be fed to chaidforest()...
local type "`type')"
local unorddisp "`unorddisp')"
local unorderedlist "`unorderedlist')"
}
}
}
}
if `:list sizeof ordered' > 0 { //process ordered splitting variables when present just as was done with the response/unordered variable above...
foreach var of varlist `ordered' { //macros created for parsing and different purposes - ordered
quietly inspect `var' //inspect ordered splitting variable to obtain values that are 0 and positive (if they don't equal the total - something's negative and that's not allowed)
if r(N) != (r(N_0) + r(N_posint)) { //chaidforest only works with non-negative integer splitters - disallow others using the results from the "inspect" command
display "{err}Levels of each splitting variable must be non-negative integer " ///
"valued. Variable {cmd:`var'} has negative or non-integer values."
exit 198
}
quietly levelsof `var' if `touse', local(`var'levs) `missing' //put levels of ordered splitting variable into local macro
local `var'levs_count "``var'levs'" //default the level count to all levels (bug fix: previously empty with no missing codes, skipping the >20 check); overwritten below when missings are present
if regexm("``var'levs'", "\.") mata: st_local("`var'levs_count", /// //removes multiple missings from consideration in the levels count for ordered splitting variable
invtokens((select(tokens(st_local("`var'levs")), ///
editmissing(strtoreal(tokens(st_local("`var'levs"))), .):!=.), ".")))
if `:list sizeof `var'levs_count' > 20 {
display "{err}Number of distinct values in {cmd:`var'} is larger than " ///
"allowed." _newline "Consider collapsing across similar categories for " ///
"unordered or using " _newline "{opt xtile()} option/command for ordered " ///
"variables to reduce the number of levels."
exit 198
}
foreach val of numlist ``var'levs' { //for each level of splitters, register binary variable...
if `: list posof "`val'" in `var'levs' == 1 { //leading text for locals to be fed to chaidforest()...
local type "`type' ("
local orddisp "`orddisp' ("
local orderedlist "`orderedlist' ("
}
if !missing(`val') { //...if not a missing value...
tempvar `var'`val' //declare tempvar for each value of ordered variable
quietly generate byte ``var'`val'' = `var' == `val' //generate binary for each level of ordered variable
local orderedlist "`orderedlist' ``var'`val''" //include ordered tempvar in orderedlist
local orddisp "`orddisp' `var'_`val'" //include display name of variable in ordered display macro
local type "`type' o" //include variable type in type macro
}
else { //...if any kind of missing value...
mata: st_local("miss_name", strofreal(st_isname("``var'ms'"))) //determine if the missing binary variable is already present
if !`miss_name' { //if the missing binary variable is not found...
tempvar `var'ms //create tempvar for ordered variable indicating missing
quietly generate byte ``var'ms' = `var' == `val' //generate indicator for missing category
local orderedlist "`orderedlist' ``var'ms'" //include missing tempvar name in ordered list
local orddisp "`orddisp' `var'_ms" //include missing display name in ordered display list
local type "`type' f" //include type as floating
}
else quietly replace ``var'ms' = ``var'ms' + (`var' == `val') //...else add in the other missing code... only valid with multiple missing codes (., .a, .b, etc.)
}
if (`: list posof "`val'" in `var'levs' == `: list sizeof `var'levs') { //trailing text for all the locals to be fed to chaidforest()...
local type "`type')"
local orddisp "`orddisp')"
local orderedlist "`orderedlist')"
}
}
}
}
/*Process number of variables to select in random forest*/
if `nvuse' <= -1 scalar `nvars' = /// //by default use rounded square root of number of splitting variables...
round(sqrt(`=`:list sizeof ordered' + `:list sizeof unordered''))
else if (`nvuse' > `=`:list sizeof ordered' + `:list sizeof unordered'') & (`nvuse' > 0) { //...exit if too many splitting variables indicated in macro nvuse...
display "{err}{opt nvuse()} must be less than or equal to the number of splitting " ///
"variables" _newline "included in options {opt xtile()}, {opt ordered()} and/or " ///
"{opt unordered()}"
exit 198
}
else scalar `nvars' = `nvuse' //...otherwise use user-specified number of splitting variables per tree; NOTE(review): nvuse(0) falls through to here yielding 0 variables per tree - confirm whether 0 should instead be rejected
if strlen("`weight'") local chaid_exp = subinstr("`exp'", "=", "", 1) //if there is a frequency weight, parse the "exp" to obtain only the weight variable
if strlen("`weight'") { //obtain number of observations with frequency weights...
quietly summarize `chaid_exp' if `touse', meanonly
scalar `obs' = r(sum)
}
else { //...otherwise obtain number of observations without frequency weights
quietly count if `touse'
scalar `obs' = r(N)
}
if `minsplit' == 0 local minsplit = `=ceil(`obs'*.01)' //default minsplit is 1% of sample size
if `minnode' == 0 local minnode = `=ceil(`obs'*.005)' //default minnode is .5% of sample size
/*Pull data into Mata for processing*/
mata: results = chaidforest(`ntree', `=`nvars'', "`type'", /// //run chaidforest() function in Mata - returns "results" object
"`varlist' `unorderedlist' `orderedlist'", ///
"`varlist' `unordered' `ordered'", `alpha', `minsplit', `minnode', ///
"`touse'", "`dvordered'", "`varlist' `unorddisp' `orddisp'", "`noisily'", ///
"`samp'", `proos', "`chaid_exp'")
if `proos' > -1 scalar `N_tree' = `=round(`obs'*(1 - `proos'))' //obtain observations per tree if proos() used...
else scalar `N_tree' = `obs' //...otherwise observations per tree is equivalent to total number of observations (i.e., sample size due to bootstrapping)
/*Return basic results*/
ereturn post , depname(`varlist') esample(`touse')
ereturn scalar ntree = `ntree'
ereturn scalar nvuse = `nvars'
ereturn scalar N_tree = `N_tree'
ereturn scalar minsplit = `minsplit'
ereturn scalar minnode = `minnode'
ereturn local splitvars = "`unordered' `ordered'"
ereturn local predict = "chaidforest_pr"
if strlen("`weight'") {
ereturn local wtype = "fweight"
ereturn local wexp "`exp'"
}
ereturn local estat_cmd = "chaidforest_estat"
ereturn local title = "CHAID-based Random Forest"
ereturn local cmd = "chaidforest"
ereturn local cmdline = "chaidforest `full_cmdline'"
if strlen("`missing'") ereturn hidden scalar validmiss = 1
else ereturn hidden scalar validmiss = 0
if strlen("`dvordered'") ereturn hidden scalar dvorder = 1
else ereturn hidden scalar dvorder = 0
if `proos' == -1 ereturn hidden scalar w_o_replace = 0
else ereturn hidden scalar w_o_replace = 1
end
/*Object for chaidforest's results storage*/
version 12.1
mata:
mata set matastrict on
class chaidtree { //result container for one CHAID tree in the forest
string scalar used_vars //names of the splitting variables this tree used
string matrix CHAID_rules //if/then partitioning statements defining this tree's splits
real colvector clusters //terminal cluster assignment retained for each observation
real colvector bootstrap_weight //per-observation frequency weights from the bootstrap, or e(sample)-like indicators when prop_oos is used
}
end
/*Mata-version of CHAID based on Goodman association models*/
version 12.1
mata:
mata set matastrict on
function chaid_g(real matrix usedata, string rowvector type, ///
string rowvector names, string rowvector vars, real scalar alpha, ///
real scalar minimum_2_split, real scalar minimum_node_size, string scalar ordered_response, ///
string scalar noisily, real colvector weight)
{
/*declarations*/
class chaidtree scalar current_tree
real matrix select2variables, check_table, compare_table, data4analysis, ///
data4merging
real rowvector result, merger_pvalues, select_merge_categories, randomly_resolve, ///
branches, merger_r2s
real colvector proceed, cluster, use_obs, select_obs, comare_var1, comare_var2, ///
wgt4analysis
real scalar category, split, token, position, moresplit, compare_pvalue, ///
current_pvalue, more_merge, cluster_count, current_cluster, num_combos, ///
possible_combos, floating, compare_aic, current_aic
string matrix splits, CHAID_splits
string rowvector select_type, collapse_variables, select_names, compare_names, ///
select2names, select2vars
transmorphic cvp_setup, cvp_template
/*set-up and notification; CHAID is a noisy program to find bugs primarily*/
if (strlen(noisily)) "CHAID Begins Execution" //
if (strlen(noisily)) vars //
position = 2 //updatable position of focal splitting variable; begins at 2 because response is variable 1
moresplit = 1 //scalar used to by chaid_g() to indicate "please continue splitting"; when this turns 0, chaid_g() is finished looking for any and all splits in the data
compare_pvalue = 1 //used as the "comparison" lowest p-value; if the current p-value from an analysis is smaller/larger, variable is split/merged
compare_aic = . //used as the "comparison" lowest AIC value; only used for splitting purposes - if the current AIC from an analysis is larger, variable is split
cluster_count = 1 //the number of clusters to this point - starts at 1 indicating that everyone is in the same cluster (i.e., cluster #1)
current_cluster = 1 //the cluster currently being processed to find more splits - starts at cluster #1
compare_names = ("") //holds the variable names of the associated with the variables splits currently best (as chosen by p-values or AIC); changes when another splitting variable is chosen as best
branches = (0) //the number of branches the current tree has by cluster/node - starts at 0; is updated as more splits are found
splits = ("") //the current data implemented/chosen partitions/splits associated with the current chaid_g() run
result = J(1, 5, .) //Goodman model results - chi-square, DF, log-likelihood, and number of parameters
if (strlen(noisily)) names //
if (strlen(noisily)) type //
proceed = J(rows(usedata), 1, 1) //vector to use to indicate continue splitting on these observations; ; a kind of updatable "touse" that focuses on whether clustering has been attempted and failed to find further clusters
cluster = J(rows(usedata), 1, 1) //cluster number associated with each observation; starts at cluster #1 for all
use_obs = J(rows(usedata), 1, 0) //vector to indicate use the set of observations for splitting in current splitting run; a kind of updatable "touse" which accounts for "proceed" above as well as the combination of "current_cluster" and "cluster"
CHAID_splits = ("Splits") //matrix to use to return partitioning rules and splits
/*begin looking for splits in the data*/
while (moresplit) { //do following process at least once - continue if between-variable splitting process is to continue (i.e., moresplit == 1)
use_obs = (use_obs:*0):+((proceed:+(cluster:==current_cluster)):==2) //update observations to use - ones that can be "proceed"ed on and that are in the current cluster
select_names = names[position] //pull out names for first splitting variable
if (strlen(noisily)) select_names //
select_type = type[position] //pull out types for first splitting variable using same procedure as above
if (strlen(noisily)) select_type //
more_merge = 1 //updatable scalar to indicate continute merging levels of a splitting variable - here merging always proceeds until only 2 levels remain
data4analysis = select(usedata, use_obs) //subset data to the relevent set of observations
wgt4analysis = select(weight, use_obs) //subset weights to the relevant set of observations
check_table = mm_freq(data4analysis[., 1], wgt4analysis) //determine frequency of values on response variable
if (strlen(noisily)) check_table //
if (!sum(proceed)) { //if proceeds are all 0, and thus there are no observations to proceed on...
moresplit = 0 //don't split anymore - chaid is done
more_merge = 0 //don't merge anymore either
}
else if ((rows(check_table) == 1) | (colsum(check_table) < minimum_2_split)) { //...or if there is only one response value left in a cluster (i.e., it's pure)/the remaining sample size is below the minimum for continuing looking for splits...
more_merge = 0 //stop merging on this variable
proceed = proceed:-use_obs //stop clustering on these observations; use_obs subtracts 1 from proceed thus making previously "proceed"ed observations, no longer "proceed"able
current_cluster++ //increment current cluster - start using the next cluster up and look for splits there; so long as there is another cluster
position = 2 //restart "position" used in merging at 2 for the new cluster
}
else { //...otherwise the response variable looks good - begin splitting variable category merging
check_table = colsum(select(data4analysis, /// //how many levels of the splitting variable remain?
colsum(J(cols(tokens(select_names)), 1, ///
tokens(invtokens(names))):==(tokens(select_names)'))))
if (strlen(noisily)) check_table //
select_names = select(tokens(select_names), sign(check_table)) //separate out the binary variables associted with the focal splitting variable
if (strlen(noisily)) select_names //
select_type = select(tokens(select_type), sign(check_table)) //separate out the types associated with the splitting variable
data4merging = select(data4analysis, colsum(J(cols(select_names), /// //only keep the focal splitting variable and response variable
1, tokens(invtokens(names))):==(select_names')))
while (more_merge == 1) { //so long as this is indicated, keep merging levels...
collapse_variables = ("") //replace/generate collapse_variables including names of variables to be collapsed into single variable
merger_pvalues = (.) //vector of p-values to search through to determine which categories should be merged
merger_r2s = (.) //vector of pseudo-R2 values to search through to determine which categories should be merged
floating = 0 //does this splitting variable have a floating missing? (i.e., missing with an splitting ordered variable)
if (cols(select_names) > 2) { //if there are more than 2 categories of the splitting variable remaining? If yes, attempt mergers
if (select_type[1] == "u") { //if the variable is unordered...
cvp_template = (J(2, 1, 1) \ J(cols(select_names) - 2, 1, 0)) //setup vector for use in cvpermute() to select all sets of 2 variables
if (strlen(noisily)) cvp_template //
cvp_setup = cvpermutesetup(cvp_template) //register "cvp_template" vector with cvpermute()
possible_combos = comb(cols(select_names), 2) //number of possible combinations - unordered
}
else if ( //... or if the variable is "floating"...
(select_type[cols(select_type)] == "f") & ///
(colsum(regexm(select_names[cols(select_names)], "ms$")) == 1) & ///
(colsum(cols(tokens(select_names[cols(select_names)]))) == 1)) {
possible_combos = (cols(select_names) - 1)*2 - 1 //number of possible combinations - floating
floating = 1 //indicate that there's a floating variable
cvp_template = (J(1, 1, 1) \ J(cols(select_names) - 2, 1, 0)) //setup vector for use in cvpermute() to select all sets of 2 variables
if (strlen(noisily)) cvp_template //
cvp_setup = cvpermutesetup(cvp_template) //register "cvp_template" vector with cvpermute()
}
else possible_combos = cols(select_names) - 1 //...otherwise the other possibility is ordered...
num_combos = 1 //scalar to keep track of how many combinations have been looped through to this point...
while (num_combos <= possible_combos) { //go through all different combinations of predictor levels to find best levels to collapse
if (select_type[1] == "u") select2variables = cvpermute(cvp_setup)' //invoke instance of cvpermute() for selector vector - unordered
else if (((select_type[1] == "o") & (floating == 0)) | ///
((floating == 1) & (sign(num_combos*2 - possible_combos) != 1))) /// //invoke instance of cvpermute() for selector vector - ordered or floating when the floating category has been merged into another category
select2variables = (J(num_combos - 1, 1, 0) \ J(2, 1, 1) \ ///
J(cols(select_names) - num_combos - 1, 1, 0))'
else if ((floating == 1) & /// //invoke instance of cvpermute() for selector vector - floating, when the floating category has not been merged yet
(sign(num_combos*2 - possible_combos) == 1)) ///
select2variables = (cvpermute(cvp_setup)', 1)
if (strlen(noisily)) select2variables //
collapse_variables = (collapse_variables, /// //create string vector indicating position of category, binary variables to obtain a chi-square on among all the categories representing the focal splitting variable
invtokens(strofreal(select2variables)))
if (strlen(noisily)) collapse_variables //
select2names = select(select_names, select2variables) //proceed to select 2 columns of splitting variable to evaluate for potentially collapsing
if (strlen(noisily)) select2names //
if (strlen(noisily)) select2names[1] //
select2vars = tokens(select2names[1]) //separate out binary variables represeing a category (sometimes will already have been collapsed - this step is necessary to ensure previously collapsed binaries are appropriately used) - category 1
comare_var1 = rowsum(select(data4merging, /// //implement the previously collapsed binaries into single category - category 1
colsum(J(cols(select2vars), 1, ///
tokens(invtokens(select_names))):==(select2vars)')))
if (strlen(noisily)) sum(comare_var1) //
if (strlen(noisily)) select2names[2] //
select2vars = tokens(select2names[2]) //separate out binary variables represeing a category (sometimes will already have been collapsed - again, this step is necessary to ensure previously collapsed binaries are appropriately used) - category 2
comare_var2 = rowsum(select(data4merging, /// //implement the previously collapsed binaries into single category - category 2
colsum(J(cols(select2vars), 1, ///
tokens(invtokens(select_names))):==(select2vars)')))
if (strlen(noisily)) sum(comare_var2) //
if (strlen(noisily)) mean((comare_var1, comare_var2)) //
if (rows(uniqrows(select(data4analysis[., 1], /// //invoke Goodman association model - only runs so long as 0's and 1's exist for both categories 1 and 2 of the splitting variable given the previous mergers - contingency tables need 4 cells!
(comare_var1:+comare_var2)))) > 1) ///
result = goodman(select((data4analysis[., 1], comare_var1), ///
(comare_var1:+comare_var2)), sign(strlen(ordered_response)), ///
sign(strlen(noisily)), select(wgt4analysis, ///
(comare_var1:+comare_var2)))
else result = J(1, 5, .) //if goodman() doesn't run due to too few categories, ensure this result is merged (or likely to be merged)
if (strlen(noisily)) result //
if (result[1] != .) { //if all went well...
merger_pvalues = ///
(merger_pvalues, chi2tail(result[2], result[1])) //record p-value
merger_r2s = ///
(merger_r2s, 1 - (result[3]/result[5])) //record pseudo-r2
}
else { //...otherwise, something's wrong
merger_pvalues = (merger_pvalues, 1) //bad comparison, probably merge them; pvalue
merger_r2s = (merger_r2s, 0) //bad comparison, probably merge them; r2
}
num_combos++ //increment num_combos - make the next comaprison
if (strlen(noisily)) num_combos //
}
merger_pvalues = strmatch(strofreal(merger_pvalues), /// //find the position of the largest p-value criterion
strofreal(max(merger_pvalues)))
if (strlen(noisily)) merger_pvalues //
if (rowsum(merger_pvalues) > 1) { //if there are multiple, identical, large p-values pick one using r2 (pvalues might just be too big...) AIC is invalid given the change in N per merger comparison...
merger_pvalues = strmatch(strofreal(merger_r2s), /// //find the position of the smallest r2
strofreal(min(merger_r2s)))
if (strlen(noisily)) merger_pvalues //
if (rowsum(merger_pvalues) > 1) { //if there are multiple, identical, small r2's pick one randomly - chaid_g() can't tell the difference...
randomly_resolve = runiform(1, cols(merger_pvalues)) //random number with a number of columns equal to the number of identical p-values
merger_pvalues = strmatch(strofreal(merger_pvalues), /// //find the p-values that match - again
strofreal(max(merger_pvalues)))
merger_pvalues = randomly_resolve:*merger_pvalues //keep only the random numbers which are associated with the matching p-values
merger_pvalues = strmatch(strofreal(merger_pvalues), /// //select the biggest random p-value
strofreal(max(merger_pvalues)))
if (strlen(noisily)) merger_pvalues //
}
}
select_merge_categories = strtoreal(tokens(select(collapse_variables, /// //pull out binary variables associated with largest p-value
merger_pvalues)))
if (strlen(noisily)) select_merge_categories //
if (select_type[1] == "u") select_names = /// //merge together chaid_g() identified categories associated with unordered - stick the newly merged category on the end
(invtokens(select(select_names, select_merge_categories)), ///
select(select_names, abs(select_merge_categories:-1)))
else if (select_type[1] == "o") { //merge together chaid_g() identified categories associated with ordered - order matters now, requires more syntax
num_combos = (strpos(invtokens(strofreal(select_merge_categories)), /// //use num_combos as indicator of "where" the merger needs to occur in the vector of category names
"1") + 1)/2
if (strlen(noisily)) num_combos //
if (select_merge_categories[num_combos] == ///
select_merge_categories[num_combos + 1]) { //when the merged categories are next to one another/most circumstances...
if (num_combos == 1) select_names = /// //if the merger is at the beginning of the vector/list...
(invtokens(select(select_names, select_merge_categories)), ///
select(select_names[num_combos + ///
2..cols(select_merge_categories)], ///
abs(select_merge_categories[num_combos + ///
2..cols(select_merge_categories)]:-1)))
else if (num_combos == cols(select_merge_categories) - 1) /// //...or if the merger is at the middle of the vector/list...
select_names = ///
(select(select_names[1..num_combos - 1], ///
abs(select_merge_categories[1..num_combos - 1]:-1)), ///
invtokens(select(select_names, select_merge_categories)))
else select_names = ///
(select(select_names[1..num_combos - 1], /// //...otherwise the merger is at the end of the vector/list
abs(select_merge_categories[1..num_combos - 1]:-1)), ///
invtokens(select(select_names, select_merge_categories)), ///
select(select_names[num_combos + ///
2..cols(select_merge_categories)], ///
abs(select_merge_categories[num_combos + ///
2..cols(select_merge_categories)]:-1)))
}
else { //...otherwise it's a floating category which will be considered a member of the ordered list now
if (num_combos == 1) select_names = /// //if the merger is at the beginning of the vector/list...
(invtokens(select(select_names, select_merge_categories)), ///
select(select_names[num_combos + ///
1..cols(select_merge_categories) - 1], ///
abs(select_merge_categories[num_combos + ///
1..cols(select_merge_categories) - 1]:-1)))
else if (num_combos == cols(select_merge_categories) - 1) /// //...or if the merger is at the middle of the vector/list...
select_names = ///
(select(select_names[1..num_combos - 1], ///
abs(select_merge_categories[1..num_combos - 1]:-1)), ///
invtokens(select(select_names, select_merge_categories)))
else select_names = ///
(select(select_names[1..num_combos - 1], /// //...otherwise the merger is at the end of the vector/list
abs(select_merge_categories[1..num_combos - 1]:-1)), ///
invtokens(select(select_names, select_merge_categories)), ///
select(select_names[num_combos + ///
1..cols(select_merge_categories) - 1], ///
abs(select_merge_categories[num_combos + ///
1..cols(select_merge_categories) - 1]:-1)))
}
}
if (strlen(noisily)) select_names //
}
else more_merge = 0 //if only 2 categories of splitting variable remain - stop merging
if (strlen(noisily)) "Completed One Merging Session" //
}
data4merging = data4analysis[., 1] //refresh data to use - set it up for goodman() to decide on splitting - start by just using the response variable
if (select_type[1] == "o") data4merging = ///
(data4merging, J(rows(data4merging), 1, 0)) //when the focal splitting variable is ordered, add a variable which can be used to sum the binaries into - goodman() requires ordered splitting variables to be a single variable, not multiple
for (category = 1; category <= cols(select_names); category++) { //for all the categories of the splitting variable that are optimally merged... though, there should only be 2, which makes the below unnecessary - relic of allowing > 2 categories previously, now disallowed
select2vars = tokens(select_names[category]) //pull out all specific merged categories to turn into single binary
if (strlen(noisily)) select2vars //
if (select_type[1] == "u") data4merging = /// //if unordered, just put each merged category-variable in as its own dummy code
(data4merging, rowsum(select(data4analysis, colsum(J(cols(select2vars), 1, ///
tokens(invtokens(names))):==(select2vars')))))
else if (select_type[1] == "o") data4merging[., 2] = data4merging[., 2] + /// //... or if ordered, make that merged category-variable have it's own "category" that's summed into the vector of 0's constructed above - preserves ordering
rowsum(category:*select(data4analysis, colsum(J(cols(select2vars), 1, ///
tokens(invtokens(names))):==(select2vars'))))
}
if (select_type[1] == "u") check_table = /// //if unordered how many levels of the splitting variable remain?...
mm_freq(rowsum(data4merging[., ///
2..cols(data4merging)]:*(1..cols(data4merging)-1)), ///
wgt4analysis)
else if (select_type[1] == "o") check_table = /// //...otherwise ordered - how many levels of the splitting variable remain?
mm_freq(data4merging[., 2], ///
wgt4analysis)
if (strlen(noisily)) check_table //
if (rows(check_table) > 1) { //so long as there is > 1 category of the splitting variable remaining (wouldn't necessarily be caught previously)
result = goodman(data4merging, sign(strlen(ordered_response)), /// //invoke goodman() to check on split-ability of the splitting variable as merged
sign(strlen(noisily)), wgt4analysis)
if (strlen(noisily)) result //
if (result[1] != .) { //if all went well - record p-value & AIC...
current_pvalue = chi2tail(result[2], result[1]) //compute p-value
current_aic = -2*result[3] + 2*result[4] //compute AIC
}
else { //...otherwise, something's wrong - probably shouldn't split on this variable
current_pvalue = 1 //assign p-value of 1; assures no splitting
current_aic = . //won't split but for completeness, AIC is missing
}
}
else { //...otherwise, no splitting possible - only single category on splitting variable
current_pvalue = 1 //assign p-value of 1; assures no splitting
current_aic = . //won't split but for completeness, AIC is missing
}
if (strlen(noisily)) {
current_pvalue //
compare_pvalue //
current_aic //
compare_aic //
}
if (current_pvalue < compare_pvalue) { //this splitting variable did better than the previous! - record its p-value and AIC
if (strlen(noisily)) "better!" //
compare_pvalue = current_pvalue //update lowest p-value
if (strlen(noisily)) compare_pvalue //
compare_names = select_names //record merged categories of splitting variable
if (strlen(noisily)) compare_names //
compare_table = check_table //record number of people in each cell of splitting variable (ensure that they're not too small)
if (strlen(noisily)) compare_table //
compare_aic = current_aic //update AIC
}
else if (current_pvalue == compare_pvalue) { //identical p-values - decide based on AIC
if (current_aic < compare_aic) { //this splitting variable did better than the previous! - record its p-value and AIC
if (strlen(noisily)) "better!" //
compare_aic = current_aic //update lowest AIC-value
if (strlen(noisily)) compare_aic //
compare_names = select_names //record merged categories of splitting variable
if (strlen(noisily)) compare_names //
compare_table = check_table //record number of people in each cell of splitting variable (ensure that they're not too small)
if (strlen(noisily)) compare_table //
compare_pvalue = current_pvalue //update p-value
}
else if (current_aic == compare_aic) { //identical AIC-values - randomly decide who gets kept
randomly_resolve = runiform(1, 2) //generate random uniform matrix size 2
if (randomly_resolve[1] < randomly_resolve[2]) { //resolve the improvement in association randomly
if (strlen(noisily)) "better!" //
compare_pvalue = current_pvalue //update p-value
if (strlen(noisily)) compare_pvalue //
compare_names = select_names //record merged categories of splitting variable
if (strlen(noisily)) compare_names //
compare_table = check_table //record number of people in each cell of splitting variable (ensure that they're not too small)
if (strlen(noisily)) compare_table //
compare_aic = current_aic //update AIC-value
}
}
}
position = position + 1 //increment position - try next splitting variable
if (strlen(noisily)) position //
if (strlen(noisily)) "Completed Merging Run" //
if (strlen(noisily)) compare_names //
if (position == cols(vars) + 1) { //if all splitting variables have been attempted...
if ((compare_pvalue < alpha) & (min(compare_table) >= minimum_node_size)) { //if the best p-value meets alpha p-value criterion, and smallest cluster produced is above minimum cluster/node size restriction
if (strlen(noisily)) "split!" //
branches[current_cluster] = branches[current_cluster] + 1 //Another branch is grown for the focal cluster - add it to the number currently
if (strlen(noisily)) branches //
select_names = (" ") //reset select_names for use in creating a path for the focal cluster
for (split = 1; split <= cols(compare_names); split++) { //generate the "split" to put in the splits row of the CHAID_splits matrix
if (strlen(noisily)) compare_names[split] //
select_names = (select_names, compare_names[split], ",") //separate the different merged categories of the splitting variable with commas
}
if (strlen(noisily)) select_names //
if (branches[current_cluster] > cols(splits)) splits = /// //make a new row in splits matrix if needed to represent the new cluster's "path"
(splits, J(rows(splits), 1, ""))
splits[current_cluster, branches[current_cluster]] = compare_names[1] //names of the binaries (varname with catrgories) of first category of optimally merged variable into first row in the cluster's "path"
if (strlen(noisily)) splits //
CHAID_splits = (CHAID_splits, invtokens(select_names)) //add the new split rules to the rules matrix - forms the first row
for (split = 2; split <= cols(compare_names); split++) { //for the remaining levels of the optimally merged splitting variable...
cluster_count++ //increment cluster_count as there's another cluster obtained
if (strlen(noisily)) cluster_count //
branches = (branches, branches[current_cluster]) //update branches for the next cluster - adding in that cluster too
if (strlen(noisily)) branches //
splits = (splits \ splits[current_cluster, .]) //add row to splits matrix representing that clusters path - all elements before the current one in the path will be the same as the first split and will be copied
splits[cluster_count, branches[cluster_count]] = compare_names[split] //add specific, new split information into the cluster's path
if (strlen(noisily)) splits //
select_names = tokens(compare_names[split]) //tokenize the elements associated with the binaries so that the cluster numbers can be updated in the data
for (token = 1; token <= cols(select_names); token++) { //for each element just tokenized above...
select_obs = (use_obs:+select(usedata, /// //find where cluster number needs updating in the observations
tokens(invtokens(names)):==select_names[token])):==2
cluster = ///
(cluster:*abs(select_obs:-1)):+(select_obs:*cluster_count) //change cluster number on the observations that need updating
if (strlen(noisily)) mm_freq(cluster, weight) //
}
}
position = 2 //restart position at the first splitting variable now that there's been a partition
compare_names = ("") //restart best split so far
compare_pvalue = 1 //restart best p-value so far
compare_aic = . //restart best AIC so far
}
else { //...otherwise, don't meet splitting criteria were not met
position = 2 //restart splitting variable search
proceed = proceed:*(cluster:!=current_cluster) //make these observations "stop" splitting - removed from the "touse"-like vector
current_cluster++ //increment cluster - the current cluster cannot be split anymore; look for splits in others
if (max(cluster) < current_cluster) moresplit = 0 //if this is the last cluster (i.e., current cluster just incremented past where the current max is) - chaid ends/turn of moresplit
if (strlen(noisily)) current_cluster //
compare_pvalue = 1 //reset best p-value so far
compare_aic = . //reset best AIC so far
compare_names = ("") //reset best splits so far
}
}
}
}
for (split = 1; split <= rows(splits); split++) { //for all the rows in the splits matrix...
if (cols(CHAID_splits) > 1) CHAID_splits = /// //add data to "rules" matrix - specifically, the splits
(CHAID_splits \ ("path" + strofreal(split), ///
splits[split, .], J(1, cols(CHAID_splits)-cols(splits)-1, "")))
}
if (strlen(noisily)) "CHAID Finished Execution" //
current_tree.clusters = cluster //add clusters to object to be returned
current_tree.CHAID_rules = CHAID_splits //add rules to object to be returned
return(current_tree) //return a chaidtree object
}
end
/*Mata function to implement the CHAID-based random forest algorithm*/
version 12.1
mata:
mata set matastrict on
// chaidforest(): grows a forest of CHAID trees by repeatedly sampling observations
// and splitting variables and calling the chaid_g() learner on each sample.
// Arguments (all passed in from the Stata wrapper program):
//   number_o_trees    - number of trees to grow
//   num_vars_used     - number of splitting variables sampled (without replacement) per tree
//   type              - parenthesis-grouped list of variable types ("o"/"u")
//   names             - parenthesis-grouped list of tempvar binary-indicator names
//   vars              - overall (display) splitting-variable names; element 1 is the response
//   alpha             - splitting p-value criterion, passed through to chaid_g()
//   minimum_2_split   - minimum sample size needed to attempt a split, passed through
//   minimum_node_size - minimum node/cluster size, passed through
//   touse             - name of the estimation-sample marker variable in Stata
//   ordered_response  - nonempty string when the response is to be treated as ordered
//   displays          - parenthesis-grouped display names for the binary indicators
//   noisily           - nonempty string to echo debugging output
//   sampling          - nonempty string to suppress bootstrap sampling (nosamp option)
//   prop_oos          - proportion held out-of-sample; > -1 selects without replacement
//   weight_name       - fweight variable name ("" when unweighted)
// Returns: a vector of chaidtree objects, one element per tree grown.
function chaidforest(real scalar number_o_trees, real scalar num_vars_used, ///
string scalar type, string scalar names, string scalar vars, real scalar alpha, ///
real scalar minimum_2_split, real scalar minimum_node_size, string scalar touse, ///
string scalar ordered_response, string rowvector displays, string scalar noisily, ///
string scalar sampling, real scalar prop_oos, string scalar weight_name) {
/*declarations*/
class chaidtree vector current_tree
real matrix data, usedata, chaid_weight
real colvector clusters, weight
real rowvector rowsel, varsel
real scalar tree
string rowvector select_names, select_type, select_vars, names_list, type_list, ///
displayvec
string scalar wchars, pchars, qchars
transmorphic token_setup
/*begin processing input from Stata*/
current_tree = chaidtree(number_o_trees) //produce vector of chaidtree objects - its length will be the number of trees; efficient way to store tree information
if (strlen(noisily)) names //
token_setup = tokeninit(wchars = (" "), pchars = (" "), qchars = ("()")) //set up the tempvar binary names for parsing
tokenset(token_setup, names) //apply the above rules to tempvar binary names
names_list = subinstr(subinstr(tokengetall(token_setup), ")", ""), "(", "") //obtain all tokens of tempvar binary names, remove binding parentheses
if (strlen(noisily)) names_list //
token_setup = tokeninit(wchars = (" "), pchars = (" "), qchars = ("()")) //set up the types for parsing
tokenset(token_setup, type) //apply the above rules to types
type_list = subinstr(subinstr(tokengetall(token_setup), ")", ""), "(", "") //obtain all tokens of types, remove binding parentheses
if (strlen(noisily)) type_list //
token_setup = tokeninit(wchars = (" "), pchars = (" "), qchars = ("()")) //set up the binary display names for parsing
tokenset(token_setup, displays) //apply the above rules to binary display names
displayvec = subinstr(subinstr(tokengetall(token_setup), ")", ""), "(", "") //obtain all tokens of binary display names, remove binding parentheses
if (strlen(noisily)) displayvec //
data = /// //pull data out of Stata; specifically, the tempvar binaries and the response - removing the binding parentheses and filtering by e(sample)/touse
st_data(., st_varindex(tokens(subinstr(subinstr(names, ")", ""), "(", ""))), ///
st_varindex(touse))
data = editmissing(data, .) //change all missing values to "." - really only applies to the response
if (strlen(weight_name)) weight = st_data(., weight_name, st_varindex(touse)) //if fweight-ed, pull the weight into Mata...
else weight = J(rows(data), 1, 1) //...otherwise, no fweight and thus equal weights - all obs in sample get a 1
if (number_o_trees > 19) { //keep track of progress - header/display; only shown when enough trees are requested to merit a progress bar
printf("\n{txt}Progress in growing all " + ///
"{res}%10.0g {txt}CHAID trees\n{res}0%%{txt}" + ///
"{hline 6}{res}50%%{txt}{hline 6}{res}100%%\n", number_o_trees)
printf("{txt}.")
displayflush()
}
/*grow and store the chaidforest*/
for (tree = 1; tree <= number_o_trees; tree++) { //for the number of desired trees...
if (number_o_trees > 19) { //keep track of progress - dots
if (sum(strmatch(strofreal(tree), ///
strofreal(floor(mm_quantile(1::number_o_trees, 1, (1::20):/20)))))) { //only want ~ 20 dots... this ensures there will only be ~ 20
printf("{txt}.")
displayflush()
}
}
if (strlen(sampling)) rowsel = mm_expand(1::rows(data), weight, 0) //if no bootstrap "bagging" is desired expand the observations using weight for collapsing later...
else if (prop_oos > -1) /// //...or if sampling without replacement is desired, use 1-prop_oos as proportion sampled...
rowsel = mm_sample(round(sum(weight)*(1-prop_oos)), rows(data), ., weight, 1, 0)
else rowsel = mm_sample(sum(weight), rows(data), ., weight, 0, 0) //...otherwise default to select observations with replacement as a bootstrap sample
varsel = mm_sample(num_vars_used, cols(tokens(vars))-1, ., ., 1, 1) //selection vector for splitting variables obtained without replacement - need to obtain their dummy codes and types too
if (strlen(noisily)) varsel //
select_vars = (tokens(vars)[1], select(tokens(vars)[2..cols(tokens(vars))], varsel')) //implement the selection of splitting variables to include by overall variable name (i.e., not by associated binaries); element 1 is always the response
if (strlen(noisily)) select_vars //
select_names = (displayvec[1], select(displayvec[2..cols(names_list)], varsel')) //pull binary indicators associated with splitting variables selected
if (strlen(noisily)) select_names //
select_type = (type_list[1], select(type_list[2..cols(type_list)], varsel')) //pull types associated with splitting variables selected
if (strlen(noisily)) select_type //
usedata = select(data[rowsel, .], /// //select observations and splitting variables from the overall dataset to pass to chaid_g()
colsum(J(rows(tokens(invtokens(select_names))'), 1, ///
tokens(subinstr(subinstr(displays, ")", ""), ///
"(", ""))):==(tokens(invtokens(select_names))')))
usedata = mm_collapse(usedata, 1, rowsel) //collapse dataset on bootstrapped observations
chaid_weight = mm_collapse(J(rows(rowsel), 1, 1), 1, rowsel, &dosum()) //collapse bootstrapped observations into a frequency weight chaid_g() can use
current_tree[tree] = /// //pass arguments and data to chaid_g(), the goodman association model-based chaid learner
chaid_g(usedata[., 2..cols(usedata)], select_type, select_names, select_vars, ///
alpha, minimum_2_split, minimum_node_size, ordered_response, noisily, ///
chaid_weight[., 2])
if (strlen(noisily)) current_tree[tree].CHAID_rules //
clusters = exp(ln(rowsum((J(1, rows(chaid_weight), /// //this "merge"s in clusters to the data in a form that's conformable to how it's represented in the Stata dataset - non-sampled observations get missing value (via exp(ln(0)))
1::rows(data)):==(chaid_weight[., 1]')):*current_tree[tree].clusters')))
current_tree[tree].clusters = clusters //update chaidtree clusters entry
current_tree[tree].used_vars = select_vars //update chaidtree used variables entry
current_tree[tree].bootstrap_weight = mm_freq(rowsel, 1, (1::rows(data))) //update chaidtree bootstrap frequency weights entry
}
return(current_tree) //return vector of chaidtrees to Mata
}
end
/*Goodman association model based on loglinear/Poisson regression*/
version 12.1
mata:
mata set matastrict on
// goodman(): computes the Goodman association model for one response-by-splitting
// variable contingency table.
//   data             - column 1 is the response; remaining column(s) are the splitting
//                      variable (binary dummies when unordered, one ordered variable when ordered)
//   ordered_response - 1 to fit the ordered loglinear/Poisson model by ML;
//                      0 for the unstructured product-of-margins chi-square
//   noisily          - 1 to echo intermediate results for debugging
//   weight           - frequency weights, one element per row of data
// Returns (chi2, df, ll, number_of_estimates, base_ll). chi2 and ll are set to
// missing when estimation fails or no contingency table can be formed.
// NOTE(review): when converged == 0, df/number_of_estimates/base_ll are never
// assigned before the return -- the caller only inspects result[1] before using
// the rest, but confirm Mata tolerates returning the unassigned declarations.
real rowvector goodman(real matrix data, real scalar ordered_response, ///
real scalar noisily, real colvector weight) {
/*declarations*/
real matrix IVs, DV
real colvector observed_count, expected_count, combined_row_col
real rowvector b, rescaled_b, xb
real scalar converged, categories_IV, chi2, df, mean_count, responses_missing, ///
ll, number_of_estimates, base_ll
transmorphic row, col, estimation_object
/*process inputs to goodman to estimate as poisson/loglinear model*/
if (noisily) "Goodman/loglinear modeling Begins" //
if (sum(rowmissing(uniqrows(data[., 1])))) { //are there missing values on response? Change them to -1's and activate "responses_missing"
data[., 1] = /// //changes all missings to -1's
editmissing(data[., 1], -1)
responses_missing = 1 //responses_missing is activated
}
else responses_missing = 0 //otherwise deactivate responses_missing
row = rowsum((J(1, rows(uniqrows(data[., 1])), ///
data[., 1]):==(uniqrows(data[., 1])')):*(1..rows(uniqrows(data[., 1])))) //set up response into column vector of category indices
col = data[., 2]:*100 //set up binary splitting variable/IV into column vector that can be "added to" response (x100 keeps row and column codes from colliding)
combined_row_col = (row + col) //combines unique row/column cross-tab combinations in a vector
if (noisily) uniqrows(combined_row_col)' //
observed_count = mm_freq(combined_row_col, weight, J(rows(uniqrows(col)), 1, /// //obtain observed counts for all unique combinations in the cross-tabulation
uniqrows(row)):+(uniqrows(col)#J(rows(uniqrows(row)), 1, 1)))
if (noisily) observed_count //
/*row and/or column Goodman model - needs ml estimation*/
if ((ordered_response) & (rows(observed_count) > 3)) { //if there is a ordered response variable and there is a contingency table worth of data (at least 4 rows)...
col = uniqrows(data[., 2])#J(rows(uniqrows(data[., 1])), 1, 1) //set up columns of the cross-tabulation appropriately for analysis (i.e., spread them across levels of the response as binary dummies - kind of irrelevant as only binaries are ever applied in chaidforest...)
categories_IV = rows(uniqrows(data[., 2])) //how many unique splitting variable levels are there?
row = J(categories_IV, 1, uniqrows(data[., 1])) //ordered response - treat row variable as ordered IV "predicting" counts as a loglinear model
if (responses_missing) row = (editvalue(row, -1, 1), row:==min(row)) //when there is an ordered response, separate out the floating missing category into its own dummy (missing effect) but make it equal to the smallest value to reduce collinearity in the ordered variable
if (noisily) row //
if (noisily) col //
estimation_object = moptimize_init() //initialize ml estimation object
//obtain starting values - based on poisson.ado's routine
mean_count = mean(observed_count) //obtains a mean of the counts as an input to starting values
IVs = cross((row, col), 1 , (row, col), 1) //cross-products of IV's (for a linear regression)
DV = cross((row, col), 1 , observed_count:-mean_count, 0) //cross-products of IV's with DV (for a linear regression)
b = invsym(IVs)*DV //linear regression estimation by least squares
rescaled_b = b:/mean_count //rescale all estimates by mean number of observations which are used as initial/start values for poisson model
/*set up ml estimation*/
moptimize_init_evaluator(estimation_object, &loglin()) //tell ml evaluator what function to optimize
moptimize_init_evaluatortype(estimation_object, "lf2") //tell ml evaluator the optimization scheme (likelihood function with known 1st and 2nd derivatives of likelihood)
moptimize_init_depvar(estimation_object, 1, observed_count) //tell ml evaluator what the dependent variable is
moptimize_init_eq_indepvars(estimation_object, 1, (row, col)) //tell ml evaluator what the independent variables are
moptimize_init_search_rescale(estimation_object, "off") //tell ml evaluator not to spend time rescaling estimates to improve estimation - takes time to do
moptimize_init_search(estimation_object, "off") //tell ml evaluator not to spend time searching for better start values to improve estimation - takes time to do
moptimize_init_eq_coefs(estimation_object, 1, rescaled_b') //tell ml evaluator what the starting values of the parameters are
moptimize_init_conv_maxiter(estimation_object, 20) //tell ml evaluator to stop after 20 iterations - really should converge or have pretty reasonable values by then; keeps speed manageable
if (noisily) moptimize_init_tracelevel(estimation_object, "value") //tell ml evaluator to show iterations if "noisily" is asked for by user...
else moptimize_init_tracelevel(estimation_object, "none") //...otherwise show nothing
moptimize_init_conv_warning(estimation_object, "off") //tell ml evaluator not to warn when convergence is not reached
moptimize(estimation_object) //tell ml evaluator to proceed with estimation
xb = moptimize_result_coefs(estimation_object) //pull estimates from results
if (noisily) xb //
converged = moptimize_result_converged(estimation_object) //is everything well? That is, did estimation converge?
if (sum(rowmissing(xb)) == 0) converged = 1 //keep estimates if they're usable in computation, even if convergence wasn't reached - we're data mining here
expected_count = exp((row, col, J(rows(observed_count), 1, 1))*xb') //compute expected frequencies
if (noisily) expected_count //
df = (rows(uniqrows(data[., 2])))*(rows(uniqrows(data[., 1]))) - cols(xb) //obtain DF
number_of_estimates = cols(xb) //obtain number of parameters estimated
}
/*unstructured model - traditional contingency table/product of margins approach*/
else if (rows(observed_count) > 3) { //if a contingency table can be made...
row = mm_freq(data[., 1], weight) //row/response variable margins
if (noisily) row //
col = mm_freq(rowsum(data[., 2..cols(data)]:*(1..cols(data)-1)), weight) //column/splitting variable margins
if (noisily) col //
expected_count = (col#row):/sum(row) //compute expected frequencies under independence
if (noisily) expected_count //
df = (rows(uniqrows(data[., 2])) - 1)*(rows(uniqrows(data[., 1])) - 1) //obtain DF
number_of_estimates = (rows(uniqrows(data[., 2])))*(rows(uniqrows(data[., 1]))) - ///
(rows(uniqrows(data[., 2])) - 1)*(rows(uniqrows(data[., 1])) - 1)
converged = 1 //is everything well? (answer is always yes here)
}
else converged = 0 //...otherwise, no contingency table - all's not well...
if (converged) { //if all's well...
chi2 = sum(((observed_count - expected_count):^2):/expected_count) //obtain chi-square value
row = mm_freq(data[., 1], weight) //row/response variable margins to infer the ll from the lr test stat
expected_count = select(expected_count, observed_count:>0) //remove expected counts that will affect a sum
observed_count = select(observed_count, observed_count:>0) //remove 0's that will affect a sum
ll = sum(observed_count:*ln(observed_count:/expected_count)) + /// //take the likelihood ratio chi-square and use the constant-only multinomial distribution to obtain model log-likelihood by working backward
sum((ln((row:/sum(row)))):*row)
base_ll = sum((ln((row:/sum(row)))):*row) //baseline/constant-only log-likelihood
}
else { //...otherwise return nothing
chi2 = .
ll = .
}
return((chi2, df, ll, number_of_estimates, base_ll)) //chaid needs the chi-square, degrees of freedom, base and model log-likelihood, and number of estimates - pass it to chaid
}
end
/*Loglinear model for moptimize()*/
version 12.1
mata:
mata set matastrict on
// loglin(): lf2-type evaluator for moptimize() implementing the Poisson/loglinear
// likelihood used by goodman().
//   ll - observation-level log-likelihood: -exp(xb) + xb*y - lngamma(y+1)
//   S  - score d(ll)/d(xb): y - exp(xb)
//   H  - Hessian built from -exp(xb) via moptimize_util_matsum()
// NOTE(review): y and xb are declared rowvector, but moptimize_util_depvar()/
// moptimize_util_xb() conventionally return colvectors -- confirm Mata accepts
// this orgtype mismatch. todo is ignored; S and H are computed unconditionally.
function loglin(transmorphic estimation_object, todo, real rowvector b, ll, S, H) {
real rowvector inter, y, xb
y = moptimize_util_depvar(estimation_object, 1) //observed cell counts (dependent variable)
xb = moptimize_util_xb(estimation_object, b, 1) //linear predictor at current coefficients
ll = -exp(xb) + xb:*y - lngamma(y:+1) //Poisson log-likelihood by observation
S = -exp(xb) + y //first derivative of ll with respect to xb
inter = -exp(xb) //second derivative of ll with respect to xb (per observation)
H = moptimize_util_matsum(estimation_object, 1, 1, inter, moptimize_util_sum(estimation_object, ll))
}
end
/*sum function for mm_collapse()*/
version 12.1
mata:
mata set matastrict on
// dosum(): returns the sum of a data column. The second argument is unused; it
// exists only to satisfy the user-function interface expected by mm_collapse().
function dosum(real colvector data, real scalar nouse) {
	return(sum(data))
}
end
/* programming notes and history
- chaidforest version 1.0 - date - October 27, 2014
Basic version
-----
- chaidforest version 2.0 - date - September 28, 2015
a] added version #'s to Stata and Mata commands/functions
b] checked all declarations in Mata to ensure functionality
c] if/in added to syntax to allow conditional - chaidforest - runs
d] fixed inaccurate unordered splitter chi-square computations with goodman() (row and column frequencies reversed - added more random element)
e] predicted values moved to predict command postestimation
f] fixed progress dots that extend beyond 100% w/ small # of trees
g] collapsing (by bootstrap weight) to improve run-time speed in internal chaid_g() function
h] incorporate weights into chaid by default to speed run time/fixed fweight incorporation
i] splitting AIC; merging McFadden's pseudo-R2
j] saves variables used and bootstrap data in chaidtree object, nixes predictions
k] fixed extended missing value compatibility with - missing - option - ensure extended missings follow through to predict and estat