*! chaidforest - version 2.0 - 9/28/2015 - Joseph N. Luchman program define chaidforest, eclass //CHAID-based random forest - program history and notes at end of file version 12.1 syntax varlist [fw] [if] [in], [NTree(integer 100) NVuse(integer -1) ORDered(varlist) /// UNOrdered(varlist) DVOrdered Alpha(real .5) MINSplit(integer 0) MINNode(integer 0) /// NOIsily PROos(real -1) XTile(string) noSamp MISSing] local full_cmdline "`0'" //save the contents of the cmdline for ereturn-ing later /*exit and warning conditions*/ if (`:list sizeof unordered' == 0) & (`:list sizeof ordered' == 0) & /// (`:list sizeof xtile' == 0) { //exit if no splitting variables are entered display "{err}No splitting variables entered." exit 111 } if `minsplit' < `minnode' { //exit if you tell chaidforest that the minimum node/cluster size is larger than the minimum sample size to make a split - which is an infeasible situation for clustering display "{err}{opt minsplit()} cannot be smaller than {opt minnode()}." exit 198 } if (`alpha' <= 0) | (`alpha' >= 1) { //ensure that the alpha probabilities are in the possible range of 0-1 display "{err}{opt alpha()} must range betweeen (but not include) 0 and 1." exit 198 } if `minnode' < 0 { //ensure that minimum node/cluster size is a non-negative integer (i.e., a possible sample size) display "{err}{opt minnode()} must be a non-negative integer value." exit 198 } capture which lmoremata.mlib //is moremata present? - chaidforest needs moremata if _rc != 0 { //if moremata cannot be found, tell user to install it display "{err}Module {cmd:moremata} not found. Install {cmd:moremata} here " /// "{stata ssc install moremata}." exit 198 } /*begin data processing*/ tempvar keep touse stop cluster useob //tempvar declarations - begin using "touse"; add "keep" markout variables in unordered() and ordered() tempname cellcount fit nvars N_tree obs //tempname declaration for matrices quietly generate byte `keep' = 1 `if' `in' //allows keeping of missings as valid category, but still adjusting estimation sample for "if"s and "in"s if strlen("`xtile'") { //parse and check xtile gettoken xtile xtile_opt: xtile, parse(",") //separate out the "to xtile" varlist from the options to pass to xtile quietly summarize `xtile' //summarize the xtile varlist - if it can't be summarized, then its not a variable } mark `touse' //mark estimation sample if strlen("`missing'") markout `touse' `keep' //if missing invoked only markout "if" and "in"... else markout `touse' `varlist' `unordered' `ordered' `keep' `xtile' //...otherwise markout "if", "in", and missing values if strlen("`xtile'") { //implement convenience option for creating binned continuous predictors in chaidforest and adding to "ordered" list foreach var of varlist `xtile' { //loop through all the variables in the xtile list... capture inspect xt`var' //...see if a variable with the name "xt`varname'" already exists... if !_rc quietly drop xt`var' //...if it does, drop the variable "xt`varname'" - users are warned of this behavior in the help file quietly xtile xt`var' = `var' if `touse' [`weight'`exp'] `xtile_opt' //conduct the xtile with options as applicable - with estimation sample local ordered "`ordered' xt`var'" //add the xtile'd variable to the ordered list } } quietly inspect `varlist' //inspect response variable to obtain values that are 0 and positive (if they don't equal the total - something's negative and that's not allowed) if r(N) != (r(N_0) + r(N_posint)) { //chaidforest doesn't work with non-negarive integers - disallow them using the results from the "inspect" command display "{err}Levels of response variable must be non-negative integer valued." exit 198 } quietly levelsof `varlist' if `touse', local(dvlevs) `missing' //obtain levels of response variable if regexm("`dvlevs'", "\.") mata: st_local("dvlevs_count", /// //removes multiple missings from consideration in the levels count for response invtokens((select(tokens(st_local("dvlevs")), /// editmissing(strtoreal(tokens(st_local("dvlevs"))), .):!=.), "."))) if `:list sizeof dvlevs_count' > 20 { //if there are too many levels of the response variable... display "{err}Number of distinct values in {cmd:`varlist'} is larger than allowed. " /// _newline "Consider collapsing across similar categories for unordered or using " /// _newline "{opt xtile()} option/command for ordered variables to reduce the number of " /// "levels." exit 198 } local dvlist "`varlist'" //produce a local macro that is updated with tempvar names for use in Mata local dvdisp "`varlist'" //produce a local macro that is updated with displayed names of response variable local type "`type' d" //note type of variable for response variable - type is "d" for "dependent" if `:list sizeof unordered' > 0 { //process unordered splitting variables when present just as was done with the response variable above... foreach var of varlist `unordered' { //macros created for parsing and different puropses - unordered quietly inspect `var' //inspect unordered splitting variable to obtain values that are 0 and positive (if they don't equal the total - something's negative and that's not allowed) if r(N) != (r(N_0) + r(N_posint)) { //chaidforest doesn't work with non-negarive integers - disallow them using the results from the "inspect" command display "{err}Levels of each splitting variable must be non-negative integer " /// "valued. Variable {cmd:`var'} has non-negative integer values." exit 198 } quietly levelsof `var' if `touse', local(`var'levs) `missing' //obtain levels of unordered splitting variable if regexm("``var'levs'", "\.") mata: st_local("`var'levs_count", /// //removes multiple missings from consideration in the levels count for unordered splitting variable invtokens((select(tokens(st_local("`var'levs")), /// editmissing(strtoreal(tokens(st_local("`var'levs"))), .):!=.), "."))) if `:list sizeof `var'levs_count' > 20 { display "{err}Number of distinct values in {cmd:`var'} is larger than " /// "allowed." _newline "Consider collapsing across similar categories for " /// "unordered or using " _newline "{opt xtile()} option/command for ordered " /// "variables to reduce the number of levels." exit 198 } foreach val of numlist ``var'levs' { //for each level of splitters, register binary variable... if `:list posof "`val'" in `var'levs' == 1 { //leading text for locals to be fed to chaidforest()... local type "`type' (" local unorddisp "`unorddisp' (" local unorderedlist "`unorderedlist' (" } if !missing(`val') { //...if not a missing value... tempvar `var'`val' //create tempvars with name of level of unordered, binary variable quietly generate byte ``var'`val'' = `var' == `val' //generate binaries for each level of unordered variable local unorderedlist "`unorderedlist' ``var'`val''" //include name of unordered binary variable in list of tempvar names local unorddisp "`unorddisp' `var'_`val'" //include displayed name of unordered binary variable in list of tempvar names local type "`type' u" //include variable type in type macro } else { //...if any kind of missing value... mata: st_local("miss_name", strofreal(st_isname("``var'ms'"))) //determine if the missing binary variable is already present if !`miss_name' { //if the missing binary variable is not found... tempvar `var'ms //create tempvar for unordered varible indicating missing quietly generate byte ``var'ms' = `var' == `val' //generate indicator for missing category local unorderedlist "`unorderedlist' ``var'ms'" //include missing tempvar name in unordered list local unorddisp "`unorddisp' `var'_ms" //include missing display name in unordered display list local type "`type' u" //include type still as unordered" } else quietly replace ``var'ms' = ``var'ms' + (`var' == `val') //...else add in the other missing code... only valid with multiple missing codes (., .a, .b, etc.) } if (`: list posof "`val'" in `var'levs' == `: list sizeof `var'levs') { //trailing text for all the locals to be fed to chaidforest()... local type "`type')" local unorddisp "`unorddisp')" local unorderedlist "`unorderedlist')" } } } } if `:list sizeof ordered' > 0 { //process ordered splitting variables when present just as was done with the response/unordered variable above... foreach var of varlist `ordered' { //macros created for parsing and different puropses - ordered quietly inspect `var' //inspect ordered splitting variable to obtain values that are 0 and positive (if they don't equal the total - something's negative and that's not allowed) if r(N) != (r(N_0) + r(N_posint)) { //chaidforest doesn't work with non-negarive integers - disallow them using the results from the "inspect" command display "{err}Levels of each splitting variable must be non-negative integer " /// "valued. Variable {cmd:`var'} has non-negative integer values." exit 198 } quietly levelsof `var' if `touse', local(`var'levs) `missing' //put levels of ordered splitting variable into local macro if regexm("``var'levs'", "\.") mata: st_local("`var'levs_count", /// //removes multiple missings from consideration in the levels count for ordered splitting variable invtokens((select(tokens(st_local("`var'levs")), /// editmissing(strtoreal(tokens(st_local("`var'levs"))), .):!=.), "."))) if `:list sizeof `var'levs_count' > 20 { display "{err}Number of distinct values in {cmd:`var'} is larger than " /// "allowed." _newline "Consider collapsing across similar categories for " /// "unordered or using " _newline "{opt xtile()} option/command for ordered " /// "variables to reduce the number of levels." exit 198 } foreach val of numlist ``var'levs' { //for each level of splitters, register binary variable... if `: list posof "`val'" in `var'levs' == 1 { //leading text for locals to be fed to chaidforest()... local type "`type' (" local orddisp "`orddisp' (" local orderedlist "`orderedlist' (" } if !missing(`val') { //...if not a missing value... tempvar `var'`val' //declare tempvar for each value of ordered variable quietly generate byte ``var'`val'' = `var' == `val' //generate binary for each level of ordered variable local orderedlist "`orderedlist' ``var'`val''" //include ordered tempvar in orderedlist local orddisp "`orddisp' `var'_`val'" //include display name of variable in ordered display macro local type "`type' o" //include variable type in type macro } else { //...if any kind of missing value... mata: st_local("miss_name", strofreal(st_isname("``var'ms'"))) //determine if the missing binary variable is already present if !`miss_name' { //if the missing binary variable is not found... tempvar `var'ms //create tempvar for ordered varible indicating missing quietly generate byte ``var'ms' = `var' == `val' //generate indicator for missing category local orderedlist "`orderedlist' ``var'ms'" //include missing tempvar name in ordered list local orddisp "`orddisp' `var'_ms" //include missing display name in ordered display list local type "`type' f" //include type as floating" } else quietly replace ``var'ms' = ``var'ms' + (`var' == `val') //...else add in the other missing code... only valid with multiple missing codes (., .a, .b, etc.) } if (`: list posof "`val'" in `var'levs' == `: list sizeof `var'levs') { //trailing text for all the locals to be fed to chaidforest()... local type "`type')" local orddisp "`orddisp')" local orderedlist "`orderedlist')" } } } } /*Process number of variables to select in random forest*/ if `nvuse' <= -1 scalar `nvars' = /// //by default use rounded square root of number of splitting variables... round(sqrt(`=`:list sizeof ordered' + `:list sizeof unordered'')) else if (`nvuse' > `=`:list sizeof ordered' + `:list sizeof unordered'') & (`nvuse' > 0) { //...exit if too many splitting variables indicated in macro nvuse... display "{err}{opt nvuse()} must be less than or equal to the number of splitting " /// "variables" _newline "included in options {opt xtile()}, {opt ordered()} and/or " /// "{opt unordered()}" exit 198 } else scalar `nvars' = `nvuse' //...otherwise use user-specified number of splitting variables per tree if strlen("`weight'") local chaid_exp = subinstr("`exp'", "=", "", 1) //if there is a frequency weight, parse the "exp" to obtain only the weight variable if strlen("`weight'") { //obtain number of obsevations with frequency weights... quietly summarize `chaid_exp' if `touse', meanonly scalar `obs' = r(sum) } else { //...otherwise obtain number of observations without frequency weights quietly count if `touse' scalar `obs' = r(N) } if `minsplit' == 0 local minsplit = `=ceil(`obs'*.01)' //default minsplit is 1% of sample size if `minnode' == 0 local minnode = `=ceil(`obs'*.005)' //default minnode is .5% of sample size /*Pull data into Mata for processing*/ mata: results = chaidforest(`ntree', `=`nvars'', "`type'", /// //run chaidforest() function in Mata - returns "results" object "`varlist' `unorderedlist' `orderedlist'", /// "`varlist' `unordered' `ordered'", `alpha', `minsplit', `minnode', /// "`touse'", "`dvordered'", "`varlist' `unorddisp' `orddisp'", "`noisily'", /// "`samp'", `proos', "`chaid_exp'") if `proos' > -1 scalar `N_tree' = `=round(`obs'*(1 - `proos'))' //obtain observations per tree if proos() used... else scalar `N_tree' = `obs' //...otherwise observations per tree is equivalent to total number of observations (i.e., sample size due to bootstrapping) /*Return basic results*/ ereturn post , depname(`varlist') esample(`touse') ereturn scalar ntree = `ntree' ereturn scalar nvuse = `nvars' ereturn scalar N_tree = `N_tree' ereturn scalar minsplit = `minsplit' ereturn scalar minnode = `minnode' ereturn local splitvars = "`unordered' `ordered'" ereturn local predict = "chaidforest_pr" if strlen("`weight'") { ereturn local wtype = "fweight" ereturn local wexp "`exp'" } ereturn local estat_cmd = "chaidforest_estat" ereturn local title = "CHAID-based Random Forest" ereturn local cmd = "chaidforest" ereturn local cmdline = "chaidforest `full_cmdline'" if strlen("`missing'") ereturn hidden scalar validmiss = 1 else ereturn hidden scalar validmiss = 0 if strlen("`dvordered'") ereturn hidden scalar dvorder = 1 else ereturn hidden scalar dvorder = 0 if `proos' == -1 ereturn hidden scalar w_o_replace = 0 else ereturn hidden scalar w_o_replace = 1 end /*Object for chaidforest's results storage*/ version 12.1 mata: mata set matastrict on class chaidtree { real colvector clusters //each tree retains clusters string matrix CHAID_rules //each tree retains if/then statements string scalar used_vars //each tree retains variables used real colvector bootstrap_weight //each tree retains frequency weights from bootstrap or e(sample)-like data from prop_oos } end /*Mata-version of CHAID based on Goodman association models*/ version 12.1 mata: mata set matastrict on function chaid_g(real matrix usedata, string rowvector type, /// string rowvector names, string rowvector vars, real scalar alpha, /// real scalar minimum_2_split, real scalar minimum_node_size, string scalar ordered_response, /// string scalar noisily, real colvector weight) { /*declarations*/ class chaidtree scalar current_tree real matrix select2variables, check_table, compare_table, data4analysis, /// data4merging real rowvector result, merger_pvalues, select_merge_categories, randomly_resolve, /// branches, merger_r2s real colvector proceed, cluster, use_obs, select_obs, comare_var1, comare_var2, /// wgt4analysis real scalar category, split, token, position, moresplit, compare_pvalue, /// current_pvalue, more_merge, cluster_count, current_cluster, num_combos, /// possible_combos, floating, compare_aic, current_aic string matrix splits, CHAID_splits string rowvector select_type, collapse_variables, select_names, compare_names, /// select2names, select2vars transmorphic cvp_setup, cvp_template /*set-up and notification; CHAID is a noisy program to find bugs primarily*/ if (strlen(noisily)) "CHAID Begins Execution" // if (strlen(noisily)) vars // position = 2 //updatable position of focal splitting variable; begins at 2 because response is variable 1 moresplit = 1 //scalar used to by chaid_g() to indicate "please continue splitting"; when this turns 0, chaid_g() is finished looking for any and all splits in the data compare_pvalue = 1 //used as the "comparison" lowest p-value; if the current p-value from an analysis is smaller/larger, variable is split/merged compare_aic = . //used as the "comparison" lowest AIC value; only used for splitting purposes - if the current AIC from an analysis is larger, variable is split cluster_count = 1 //the number of clusters to this point - starts at 1 indicating that everyone is in the same cluster (i.e., cluster #1) current_cluster = 1 //the cluster currently being processed to find more splits - starts at cluster #1 compare_names = ("") //holds the variable names of the associated with the variables splits currently best (as chosen by p-values or AIC); changes when another splitting variable is chosen as best branches = (0) //the number of branches the current tree has by cluster/node - starts at 0; is updated as more splits are found splits = ("") //the current data implemented/chosen partitions/splits associated with the current chaid_g() run result = J(1, 5, .) //Goodman model results - chi-square, DF, log-likelihood, and number of parameters if (strlen(noisily)) names // if (strlen(noisily)) type // proceed = J(rows(usedata), 1, 1) //vector to use to indicate continue splitting on these observations; ; a kind of updatable "touse" that focuses on whether clustering has been attempted and failed to find further clusters cluster = J(rows(usedata), 1, 1) //cluster number associated with each observation; starts at cluster #1 for all use_obs = J(rows(usedata), 1, 0) //vector to indicate use the set of observations for splitting in current splitting run; a kind of updatable "touse" which accounts for "proceed" above as well as the combination of "current_cluster" and "cluster" CHAID_splits = ("Splits") //matrix to use to return partitioning rules and splits /*begin looking for splits in the data*/ while (moresplit) { //do following process at least once - continue if between-variable splitting process is to continue (i.e., moresplit == 1) use_obs = (use_obs:*0):+((proceed:+(cluster:==current_cluster)):==2) //update observations to use - ones that can be "proceed"ed on and that are in the current cluster select_names = names[position] //pull out names for first splitting variable if (strlen(noisily)) select_names // select_type = type[position] //pull out types for first splitting variable using same procedure as above if (strlen(noisily)) select_type // more_merge = 1 //updatable scalar to indicate continute merging levels of a splitting variable - here merging always proceeds until only 2 levels remain data4analysis = select(usedata, use_obs) //subset data to the relevent set of observations wgt4analysis = select(weight, use_obs) //subset weights to the relevant set of observations check_table = mm_freq(data4analysis[., 1], wgt4analysis) //determine frequency of values on response variable if (strlen(noisily)) check_table // if (!sum(proceed)) { //if proceeds are all 0, and thus there are no observations to proceed on... moresplit = 0 //don't split anymore - chaid is done more_merge = 0 //don't merge anymore either } else if ((rows(check_table) == 1) | (colsum(check_table) < minimum_2_split)) { //...or if there is only one response value left in a cluster (i.e., it's pure)/the remaining sample size is below the minimum for continuing looking for splits... more_merge = 0 //stop merging on this variable proceed = proceed:-use_obs //stop clustering on these observations; use_obs subtracts 1 from proceed thus making previously "proceed"ed observations, no longer "proceed"able current_cluster++ //increment current cluster - start using the next cluster up and look for splits there; so long as there is another cluster position = 2 //restart "position" used in merging at 2 for the new cluster } else { //...otherwise the response variable looks good - begin splitting variable category merging check_table = colsum(select(data4analysis, /// //how many levels of the splitting variable remain? colsum(J(cols(tokens(select_names)), 1, /// tokens(invtokens(names))):==(tokens(select_names)')))) if (strlen(noisily)) check_table // select_names = select(tokens(select_names), sign(check_table)) //separate out the binary variables associted with the focal splitting variable if (strlen(noisily)) select_names // select_type = select(tokens(select_type), sign(check_table)) //separate out the types associated with the splitting variable data4merging = select(data4analysis, colsum(J(cols(select_names), /// //only keep the focal splitting variable and response variable 1, tokens(invtokens(names))):==(select_names'))) while (more_merge == 1) { //so long as this is indicated, keep merging levels... collapse_variables = ("") //replace/generate collapse_variables including names of variables to be collapsed into single variable merger_pvalues = (.) //vector of p-values to search through to determine which categories should be merged merger_r2s = (.) //vector of pseudo-R2 values to search through to determine which categories should be merged floating = 0 //does this splitting variable have a floating missing? (i.e., missing with an splitting ordered variable) if (cols(select_names) > 2) { //if there are more than 2 categories of the splitting variable remaining? If yes, attempt mergers if (select_type[1] == "u") { //if the variable is unordered... cvp_template = (J(2, 1, 1) \ J(cols(select_names) - 2, 1, 0)) //setup vector for use in cvpermute() to select all sets of 2 variables if (strlen(noisily)) cvp_template // cvp_setup = cvpermutesetup(cvp_template) //register "cvp_template" vector with cvpermute() possible_combos = comb(cols(select_names), 2) //number of possible combinations - unordered } else if ( //... or if the variable is "floating"... (select_type[cols(select_type)] == "f") & /// (colsum(regexm(select_names[cols(select_names)], "ms$")) == 1) & /// (colsum(cols(tokens(select_names[cols(select_names)]))) == 1)) { possible_combos = (cols(select_names) - 1)*2 - 1 //number of possible combinations - floating floating = 1 //indicate that there's a floating variable cvp_template = (J(1, 1, 1) \ J(cols(select_names) - 2, 1, 0)) //setup vector for use in cvpermute() to select all sets of 2 variables if (strlen(noisily)) cvp_template // cvp_setup = cvpermutesetup(cvp_template) //register "cvp_template" vector with cvpermute() } else possible_combos = cols(select_names) - 1 //...otherwise the other possibility is ordered... num_combos = 1 //scalar to keep track of how many combinations have been looped through to this point... while (num_combos <= possible_combos) { //go through all different combinations of predictor levels to find best levels to collapse if (select_type[1] == "u") select2variables = cvpermute(cvp_setup)' //invoke instance of cvpermute() for selector vector - unordered else if (((select_type[1] == "o") & (floating == 0)) | /// ((floating == 1) & (sign(num_combos*2 - possible_combos) != 1))) /// //invoke instance of cvpermute() for selector vector - ordered or floating when the floating category has been merged into another category select2variables = (J(num_combos - 1, 1, 0) \ J(2, 1, 1) \ /// J(cols(select_names) - num_combos - 1, 1, 0))' else if ((floating == 1) & /// //invoke instance of cvpermute() for selector vector - floating, when the floating category has not been merged yet (sign(num_combos*2 - possible_combos) == 1)) /// select2variables = (cvpermute(cvp_setup)', 1) if (strlen(noisily)) select2variables // collapse_variables = (collapse_variables, /// //create string vector indicating position of category, binary variables to obtain a chi-square on among all the categories representing the focal splitting variable invtokens(strofreal(select2variables))) if (strlen(noisily)) collapse_variables // select2names = select(select_names, select2variables) //proceed to select 2 columns of splitting variable to evaluate for potentially collapsing if (strlen(noisily)) select2names // if (strlen(noisily)) select2names[1] // select2vars = tokens(select2names[1]) //separate out binary variables represeing a category (sometimes will already have been collapsed - this step is necessary to ensure previously collapsed binaries are appropriately used) - category 1 comare_var1 = rowsum(select(data4merging, /// //implement the previously collapsed binaries into single category - category 1 colsum(J(cols(select2vars), 1, /// tokens(invtokens(select_names))):==(select2vars)'))) if (strlen(noisily)) sum(comare_var1) // if (strlen(noisily)) select2names[2] // select2vars = tokens(select2names[2]) //separate out binary variables represeing a category (sometimes will already have been collapsed - again, this step is necessary to ensure previously collapsed binaries are appropriately used) - category 2 comare_var2 = rowsum(select(data4merging, /// //implement the previously collapsed binaries into single category - category 2 colsum(J(cols(select2vars), 1, /// tokens(invtokens(select_names))):==(select2vars)'))) if (strlen(noisily)) sum(comare_var2) // if (strlen(noisily)) mean((comare_var1, comare_var2)) // if (rows(uniqrows(select(data4analysis[., 1], /// //invoke Goodman association model - only runs so long as 0's and 1's exist for both categories 1 and 2 of the splitting variable given the previous mergers - contingency tables need 4 cells! (comare_var1:+comare_var2)))) > 1) /// result = goodman(select((data4analysis[., 1], comare_var1), /// (comare_var1:+comare_var2)), sign(strlen(ordered_response)), /// sign(strlen(noisily)), select(wgt4analysis, /// (comare_var1:+comare_var2))) else result = J(1, 5, .) //if goodman() doesn't run due to too few categories, ensure this result is merged (or likely to be merged) if (strlen(noisily)) result // if (result[1] != .) { //if all went well... merger_pvalues = /// (merger_pvalues, chi2tail(result[2], result[1])) //record p-value merger_r2s = /// (merger_r2s, 1 - (result[3]/result[5])) //record pseudo-r2 } else { //...otherwise, something's wrong merger_pvalues = (merger_pvalues, 1) //bad comparison, probably merge them; pvalue merger_r2s = (merger_r2s, 0) //bad comparison, probably merge them; r2 } num_combos++ //increment num_combos - make the next comaprison if (strlen(noisily)) num_combos // } merger_pvalues = strmatch(strofreal(merger_pvalues), /// //find the position of the largest p-value criterion strofreal(max(merger_pvalues))) if (strlen(noisily)) merger_pvalues // if (rowsum(merger_pvalues) > 1) { //if there are multiple, identical, large p-values pick one using r2 (pvalues might just be too big...) AIC is invalid given the change in N per merger comparison... merger_pvalues = strmatch(strofreal(merger_r2s), /// //find the position of the smallest r2 strofreal(min(merger_r2s))) if (strlen(noisily)) merger_pvalues // if (rowsum(merger_pvalues) > 1) { //if there are multiple, identical, small r2's pick one randomly - chaid_g() can't tell the difference... randomly_resolve = runiform(1, cols(merger_pvalues)) //random number with a number of columns equal to the number of identical p-values merger_pvalues = strmatch(strofreal(merger_pvalues), /// //find the p-values that match - again strofreal(max(merger_pvalues))) merger_pvalues = randomly_resolve:*merger_pvalues //keep only the random numbers which are associated with the matching p-values merger_pvalues = strmatch(strofreal(merger_pvalues), /// //select the biggest random p-value strofreal(max(merger_pvalues))) if (strlen(noisily)) merger_pvalues // } } select_merge_categories = strtoreal(tokens(select(collapse_variables, /// //pull out binary variables associated with largest p-value merger_pvalues))) if (strlen(noisily)) select_merge_categories // if (select_type[1] == "u") select_names = /// //merge together chaid_g() identified categories associated with unordered - stick the newly merged category on the end (invtokens(select(select_names, select_merge_categories)), /// select(select_names, abs(select_merge_categories:-1))) else if (select_type[1] == "o") { //merge together chaid_g() identified categories associated with ordered - order matters now, requires more syntax num_combos = (strpos(invtokens(strofreal(select_merge_categories)), /// //use num_combos as indicator of "where" the merger needs to occur in the vector of category names "1") + 1)/2 if (strlen(noisily)) num_combos // if (select_merge_categories[num_combos] == /// select_merge_categories[num_combos + 1]) { //when the merged categories are next to one another/most circumstances... if (num_combos == 1) select_names = /// //if the merger is at the beginning of the vector/list... (invtokens(select(select_names, select_merge_categories)), /// select(select_names[num_combos + /// 2..cols(select_merge_categories)], /// abs(select_merge_categories[num_combos + /// 2..cols(select_merge_categories)]:-1))) else if (num_combos == cols(select_merge_categories) - 1) /// //...or if the merger is at the middle of the vector/list... select_names = /// (select(select_names[1..num_combos - 1], /// abs(select_merge_categories[1..num_combos - 1]:-1)), /// invtokens(select(select_names, select_merge_categories))) else select_names = /// (select(select_names[1..num_combos - 1], /// //...otherwise the merger is at the end of the vector/list abs(select_merge_categories[1..num_combos - 1]:-1)), /// invtokens(select(select_names, select_merge_categories)), /// select(select_names[num_combos + /// 2..cols(select_merge_categories)], /// abs(select_merge_categories[num_combos + /// 2..cols(select_merge_categories)]:-1))) } else { //...otherwise it's a floating category which will be considered a member of the ordered list now if (num_combos == 1) select_names = /// //if the merger is at the beginning of the vector/list... (invtokens(select(select_names, select_merge_categories)), /// select(select_names[num_combos + /// 1..cols(select_merge_categories) - 1], /// abs(select_merge_categories[num_combos + /// 1..cols(select_merge_categories) - 1]:-1))) else if (num_combos == cols(select_merge_categories) - 1) /// //...or if the merger is at the middle of the vector/list... select_names = /// (select(select_names[1..num_combos - 1], /// abs(select_merge_categories[1..num_combos - 1]:-1)), /// invtokens(select(select_names, select_merge_categories))) else select_names = /// (select(select_names[1..num_combos - 1], /// //...otherwise the merger is at the end of the vector/list abs(select_merge_categories[1..num_combos - 1]:-1)), /// invtokens(select(select_names, select_merge_categories)), /// select(select_names[num_combos + /// 1..cols(select_merge_categories) - 1], /// abs(select_merge_categories[num_combos + /// 1..cols(select_merge_categories) - 1]:-1))) } } if (strlen(noisily)) select_names // } else more_merge = 0 //if only 2 categories of splitting variable remain - stop merging if (strlen(noisily)) "Completed One Merging Session" // } data4merging = data4analysis[., 1] //refresh data to use - set it up for goodman() to decide on splitting - start by just using the response variable if (select_type[1] == "o") data4merging = /// (data4merging, J(rows(data4merging), 1, 0)) //when the focal splitting variable is ordered, add a variable which can be used to sum the binaries into - goodman() requires ordered splitting variables to be a single variable, not multiple for (category = 1; category <= cols(select_names); category++) { //for all the categories of the splitting variable that are optimally merged... though, there should only be 2, which makes the below unnecessary - relic of allowing > 2 categories previously, now disallowed select2vars = tokens(select_names[category]) //pull out all specific merged categories to turn into single binary if (strlen(noisily)) select2vars // if (select_type[1] == "u") data4merging = /// //if unordered, just put each merged category-variable in as its own dummy code (data4merging, rowsum(select(data4analysis, colsum(J(cols(select2vars), 1, /// tokens(invtokens(names))):==(select2vars'))))) else if (select_type[1] == "o") data4merging[., 2] = data4merging[., 2] + /// //... or if ordered, make that merged category-variable have it's own "category" that's summed into the vector of 0's constructed above - preserves ordering rowsum(category:*select(data4analysis, colsum(J(cols(select2vars), 1, /// tokens(invtokens(names))):==(select2vars')))) } if (select_type[1] == "u") check_table = /// //if unordered how many levels of the splitting variable remain?... mm_freq(rowsum(data4merging[., /// 2..cols(data4merging)]:*(1..cols(data4merging)-1)), /// wgt4analysis) else if (select_type[1] == "o") check_table = /// //...otherwise ordered - how many levels of the splitting variable remain? mm_freq(data4merging[., 2], /// wgt4analysis) if (strlen(noisily)) check_table // if (rows(check_table) > 1) { //so long as there is > 1 category of the splitting variable remaining (wouldn't necessarily be caught previously) result = goodman(data4merging, sign(strlen(ordered_response)), /// //invoke goodman() to check on split-ability of the splitting variable as merged sign(strlen(noisily)), wgt4analysis) if (strlen(noisily)) result // if (result[1] != .) { //if all went well - record p-value & AIC... current_pvalue = chi2tail(result[2], result[1]) //compute p-value current_aic = -2*result[3] + 2*result[4] //compute AIC } else { //...otherwise, something's wrong - probably shouldn't split on this variable current_pvalue = 1 //assign p-value of 1; assures no splitting current_aic = . //won't split but for completeness, AIC is missing } } else { //...otherwise, no splitting possible - only single category on splitting variable current_pvalue = 1 //assign p-value of 1; assures no splitting current_aic = . //won't split but for completeness, AIC is missing } if (strlen(noisily)) { current_pvalue // compare_pvalue // current_aic // compare_aic // } if (current_pvalue < compare_pvalue) { //this splitting variable did better than the previous! - record its p-value and AIC if (strlen(noisily)) "better!" // compare_pvalue = current_pvalue //update lowest p-value if (strlen(noisily)) compare_pvalue // compare_names = select_names //record merged categories of splitting variable if (strlen(noisily)) compare_names // compare_table = check_table //record number of people in each cell of splitting variable (ensure that they're not too small) if (strlen(noisily)) compare_table // compare_aic = current_aic //update AIC } else if (current_pvalue == compare_pvalue) { //identical p-values - decide based on AIC if (current_aic < compare_aic) { //this splitting variable did better than the previous! - record its p-value and AIC if (strlen(noisily)) "better!" // compare_aic = current_aic //update lowest AIC-value if (strlen(noisily)) compare_aic // compare_names = select_names //record merged categories of splitting variable if (strlen(noisily)) compare_names // compare_table = check_table //record number of people in each cell of splitting variable (ensure that they're not too small) if (strlen(noisily)) compare_table // compare_pvalue = current_pvalue //update p-value } else if (current_aic == compare_aic) { //identical AIC-values - randomly decide who gets kept randomly_resolve = runiform(1, 2) //generate random uniform matrix size 2 if (randomly_resolve[1] < randomly_resolve[2]) { //resolve the improvement in association randomly if (strlen(noisily)) "better!" // compare_pvalue = current_pvalue //update p-value if (strlen(noisily)) compare_pvalue // compare_names = select_names //record merged categories of splitting variable if (strlen(noisily)) compare_names // compare_table = check_table //record number of people in each cell of splitting variable (ensure that they're not too small) if (strlen(noisily)) compare_table // compare_aic = current_aic //update AIC-value } } } position = position + 1 //increment position - try next splitting variable if (strlen(noisily)) position // if (strlen(noisily)) "Completed Merging Run" // if (strlen(noisily)) compare_names // if (position == cols(vars) + 1) { //if all splitting variables have been attempted... if ((compare_pvalue < alpha) & (min(compare_table) >= minimum_node_size)) { //if the best p-value meets alpha p-value criterion, and smallest cluster produced is above minimum cluster/node size restriction if (strlen(noisily)) "split!" // branches[current_cluster] = branches[current_cluster] + 1 //Another branch is grown for the focal cluster - add it to the number currently if (strlen(noisily)) branches // select_names = (" ") //reset select_names for use in creating a path for the focal cluster for (split = 1; split <= cols(compare_names); split++) { //generate the "split" to put in the splits row of the CHAID_splits matrix if (strlen(noisily)) compare_names[split] // select_names = (select_names, compare_names[split], ",") //separate the different merged categories of the splitting variable with commas } if (strlen(noisily)) select_names // if (branches[current_cluster] > cols(splits)) splits = /// //make a new row in splits matrix if needed to represent the new cluster's "path" (splits, J(rows(splits), 1, "")) splits[current_cluster, branches[current_cluster]] = compare_names[1] //names of the binaries (varname with catrgories) of first category of optimally merged variable into first row in the cluster's "path" if (strlen(noisily)) splits // CHAID_splits = (CHAID_splits, invtokens(select_names)) //add the new split rules to the rules matrix - forms the first row for (split = 2; split <= cols(compare_names); split++) { //for the remaining levels of the optimally merged splitting variable... cluster_count++ //increment cluster_count as there's another cluster obtained if (strlen(noisily)) cluster_count // branches = (branches, branches[current_cluster]) //update branches for the next cluster - adding in that cluster too if (strlen(noisily)) branches // splits = (splits \ splits[current_cluster, .]) //add row to splits matrix representing that clusters path - all elements before the current one in the path will be the same as the first split and will be copied splits[cluster_count, branches[cluster_count]] = compare_names[split] //add specific, new split information into the cluster's path if (strlen(noisily)) splits // select_names = tokens(compare_names[split]) //tokenize the elements associated with the binaries so that the cluster numbers can be updated in the data for (token = 1; token <= cols(select_names); token++) { //for each element just tokenized above... select_obs = (use_obs:+select(usedata, /// //find where cluster number needs updating in the observations tokens(invtokens(names)):==select_names[token])):==2 cluster = /// (cluster:*abs(select_obs:-1)):+(select_obs:*cluster_count) //change cluster number on the observations that need updating if (strlen(noisily)) mm_freq(cluster, weight) // } } position = 2 //restart position at the first splitting variable now that there's been a partition compare_names = ("") //restart best split so far compare_pvalue = 1 //restart best p-value so far compare_aic = . //restart best AIC so far } else { //...otherwise, don't meet splitting criteria were not met position = 2 //restart splitting variable search proceed = proceed:*(cluster:!=current_cluster) //make these observations "stop" splitting - removed from the "touse"-like vector current_cluster++ //increment cluster - the current cluster cannot be split anymore; look for splits in others if (max(cluster) < current_cluster) moresplit = 0 //if this is the last cluster (i.e., current cluster just incremented past where the current max is) - chaid ends/turn of moresplit if (strlen(noisily)) current_cluster // compare_pvalue = 1 //reset best p-value so far compare_aic = . //reset best AIC so far compare_names = ("") //reset best splits so far } } } } for (split = 1; split <= rows(splits); split++) { //for all the rows in the splits matrix... if (cols(CHAID_splits) > 1) CHAID_splits = /// //add data to "rules" matrix - specifically, the splits (CHAID_splits \ ("path" + strofreal(split), /// splits[split, .], J(1, cols(CHAID_splits)-cols(splits)-1, ""))) } if (strlen(noisily)) "CHAID Finished Execution" // current_tree.clusters = cluster //add clusters to object to be returned current_tree.CHAID_rules = CHAID_splits //add rules to object to be returned return(current_tree) //return a chaidtree object } end /*Mata function to implement the CHAID-based random forest algorithm*/ version 12.1 mata: mata set matastrict on function chaidforest(real scalar number_o_trees, real scalar num_vars_used, /// string scalar type, string scalar names, string scalar vars, real scalar alpha, /// real scalar minimum_2_split, real scalar minimum_node_size, string scalar touse, /// string scalar ordered_response, string rowvector displays, string scalar noisily, /// string scalar sampling, real scalar prop_oos, string scalar weight_name) { /*declarations*/ class chaidtree vector current_tree real matrix data, usedata, chaid_weight real colvector clusters, weight real rowvector rowsel, varsel real scalar tree string rowvector select_names, select_type, select_vars, names_list, type_list, /// displayvec string scalar wchars, pchars, qchars transmorphic token_setup /*begin processing input from Stata*/ current_tree = chaidtree(number_o_trees) //produce vector of chaidtree objects - its length will be the number of trees; efficient way to store tree information if (strlen(noisily)) names // token_setup = tokeninit(wchars = (" "), pchars = (" "), qchars = ("()")) //set up the tempvar binary names for parsing tokenset(token_setup, names) //apply the above rules to tempvar binary names names_list = subinstr(subinstr(tokengetall(token_setup), ")", ""), "(", "") //obtain all tokens of tempvar binary names, remove binding parentheses if (strlen(noisily)) names_list // token_setup = tokeninit(wchars = (" "), pchars = (" "), qchars = ("()")) //set up the types for parsing tokenset(token_setup, type) //apply the above rules to types type_list = subinstr(subinstr(tokengetall(token_setup), ")", ""), "(", "") //obtain all tokens of types, remove binding parentheses if (strlen(noisily)) type_list // token_setup = tokeninit(wchars = (" "), pchars = (" "), qchars = ("()")) //set up the binary display names for parsing tokenset(token_setup, displays) //apply the above rules to binary display names displayvec = subinstr(subinstr(tokengetall(token_setup), ")", ""), "(", "") //obtain all tokens of binary display names, remove binding parentheses if (strlen(noisily)) displayvec // data = /// //pull data out of Stata; specifically, the tempvar binaries and the response - removing the binding parentheses and filtering by e(sample)/touse st_data(., st_varindex(tokens(subinstr(subinstr(names, ")", ""), "(", ""))), /// st_varindex(touse)) data = editmissing(data, .) //change all missing values to "." - really only applies to the response if (strlen(weight_name)) weight = st_data(., weight_name, st_varindex(touse)) //if fweight-ed, pull the weight into Mata... else weight = J(rows(data), 1, 1) //...otherwise, no fweight and thus equal weights - all obs in sample get a 1 if (number_o_trees > 19) { //keep track of progress - header/display printf("\n{txt}Progress in growing all " + /// "{res}%10.0g {txt}CHAID trees\n{res}0%%{txt}" + /// "{hline 6}{res}50%%{txt}{hline 6}{res}100%%\n", number_o_trees) printf("{txt}.") displayflush() } /*grow and store the chaidforest*/ for (tree = 1; tree <= number_o_trees; tree++) { //for the number of desired trees... if (number_o_trees > 19) { //keep track of progress - dots if (sum(strmatch(strofreal(tree), /// strofreal(floor(mm_quantile(1::number_o_trees, 1, (1::20):/20)))))) { //only want ~ 20 dots... this ensures there will only be ~ 20 printf("{txt}.") displayflush() } } if (strlen(sampling)) rowsel = mm_expand(1::rows(data), weight, 0) //if no bootstrap "bagging" is desired expand the observations using weight for collapsing later... else if (prop_oos > -1) /// //...or if sampling without replacement is desired, use 1-prop_oos as proportion sampled... rowsel = mm_sample(round(sum(weight)*(1-prop_oos)), rows(data), ., weight, 1, 0) else rowsel = mm_sample(sum(weight), rows(data), ., weight, 0, 0) //...otherwise default to select observations with replacement as a bootstrap sample varsel = mm_sample(num_vars_used, cols(tokens(vars))-1, ., ., 1, 1) //selection vector for splitting variables obtained without replacement - need to obtain their dummy codes and types too if (strlen(noisily)) varsel // select_vars = (tokens(vars)[1], select(tokens(vars)[2..cols(tokens(vars))], varsel')) //implement the selection of splitting variables to include by overall variable name (i.e., not by associated binaries) if (strlen(noisily)) select_vars // select_names = (displayvec[1], select(displayvec[2..cols(names_list)], varsel')) //pull binary indicators associated with splitting variables selected if (strlen(noisily)) select_names // select_type = (type_list[1], select(type_list[2..cols(type_list)], varsel')) //pull types associated with splitting variables selected if (strlen(noisily)) select_type // usedata = select(data[rowsel, .], /// //select observations and splitting variables from the overall dataset to pass to chaid_g() colsum(J(rows(tokens(invtokens(select_names))'), 1, /// tokens(subinstr(subinstr(displays, ")", ""), /// "(", ""))):==(tokens(invtokens(select_names))'))) usedata = mm_collapse(usedata, 1, rowsel) //collapse dataset on bootstrapped observations chaid_weight = mm_collapse(J(rows(rowsel), 1, 1), 1, rowsel, &dosum()) //collapse boostrapped observations into a frequency weight chaid_g() can use current_tree[tree] = /// //pass arguments and data to chaid_g(), the goodman association model-based chaid learner chaid_g(usedata[., 2..cols(usedata)], select_type, select_names, select_vars, /// alpha, minimum_2_split, minimum_node_size, ordered_response, noisily, /// chaid_weight[., 2]) if (strlen(noisily)) current_tree[tree].CHAID_rules // clusters = exp(ln(rowsum((J(1, rows(chaid_weight), /// //this "merge"s in clusters to the data in a form that's conformable to how its represented in the Stata dataset - non-sampled observations get missing value 1::rows(data)):==(chaid_weight[., 1]')):*current_tree[tree].clusters'))) current_tree[tree].clusters = clusters //update chaidtree clusters entry current_tree[tree].used_vars = select_vars //update chaidtree used variables entry current_tree[tree].bootstrap_weight = mm_freq(rowsel, 1, (1::rows(data))) //update chaidtree bootstrap frequency weights entry } return(current_tree) //return vector of chaidtrees to Mata } end /*Goodman association model based on loglinear/Poisson regression*/ version 12.1 mata: mata set matastrict on real rowvector goodman(real matrix data, real scalar ordered_response, /// real scalar noisily, real colvector weight) { /*declarations*/ real matrix IVs, DV real colvector observed_count, expected_count, combined_row_col real rowvector b, rescaled_b, xb real scalar converged, categories_IV, chi2, df, mean_count, responses_missing, /// ll, number_of_estimates, base_ll transmorphic row, col, estimation_object /*process inputs to goodman to estimate as poisson/loglinear model*/ if (noisily) "Goodman/loglinear modeling Begins" // if (sum(rowmissing(uniqrows(data[., 1])))) { //are there missing values on response? Change them to -1's and activate "responses_missing" data[., 1] = /// //changes all missings to -1's editmissing(data[., 1], -1) responses_missing = 1 //responses_missing is activated } else responses_missing = 0 //otherwise deactivate responses_missing row = rowsum((J(1, rows(uniqrows(data[., 1])), /// data[., 1]):==(uniqrows(data[., 1])')):*(1..rows(uniqrows(data[., 1])))) //set up response into column vector col = data[., 2]:*100 //set up binary splitting variable/IV into column vector that can be "added to" response combined_row_col = (row + col) //combines unique row/column cross-tab combinations in a vector if (noisily) uniqrows(combined_row_col)' // observed_count = mm_freq(combined_row_col, weight, J(rows(uniqrows(col)), 1, /// //obtain observed counts for all unique combinations in the cross-tabulation uniqrows(row)):+(uniqrows(col)#J(rows(uniqrows(row)), 1, 1))) if (noisily) observed_count // /*row and/or column Goodman model - needs ml estimation*/ if ((ordered_response) & (rows(observed_count) > 3)) { //if there is a ordered response variable and there is a contingency table worth of data (at least 4 rows)... col = uniqrows(data[., 2])#J(rows(uniqrows(data[., 1])), 1, 1) //set up columns of the cross-tabulation approproately for analysis (i.e., spread them across levels of the response as binary dummies - kind of irrelevant as only binaries are ever applied in chaidforest...) categories_IV = rows(uniqrows(data[., 2])) //how many unique splitting variable levels are there? row = J(categories_IV, 1, uniqrows(data[., 1])) //ordered response - treat row variable as ordered IV "predicing" counts as a loglinear model if (responses_missing) row = (editvalue(row, -1, 1), row:==min(row)) //when there is an ordered response, separate out the floating missing category into its own dummy (missing effect) but make it equal to the smallest value to reduce collinearity in the ordered variable if (noisily) row // if (noisily) col // estimation_object = moptimize_init() //initalize ml estimation object //obtain starting values - based on poisson.ado's routine mean_count = mean(observed_count) //obtains a mean of the counts as an imput to starting values IVs = cross((row, col), 1 , (row, col), 1) //cross-products of IV's (for a linear regression) DV = cross((row, col), 1 , observed_count:-mean_count, 0) //cross-products of IV's with DV (for a linear regression) b = invsym(IVs)*DV //linear regression estimation by least squares rescaled_b = b:/mean_count //rescale all estimates by mean number of observations which are used as initial/start values for poisson model /*set up ml estimation*/ moptimize_init_evaluator(estimation_object, &loglin()) //tell ml evaluator what function to optimize moptimize_init_evaluatortype(estimation_object, "lf2") //tell ml evaluator the optimization scheme (likelihood function with known 1st and 2nd derivatives of likelihood) moptimize_init_depvar(estimation_object, 1, observed_count) //tell ml evaluator what is the dependent variable is moptimize_init_eq_indepvars(estimation_object, 1, (row, col)) //tell ml evaluator what is the independent variables are moptimize_init_search_rescale(estimation_object, "off") //tell ml evaluator not to spend time rescaling estimates to improve estimation - takes time to do moptimize_init_search(estimation_object, "off") //tell ml evaluator not to spend time searching for better start values to improve estimation - takes time to do moptimize_init_eq_coefs(estimation_object, 1, rescaled_b') //tell ml evaluator what the starting values of the parameters are moptimize_init_conv_maxiter(estimation_object, 20) //tell ml evaluator to stop after 20 iterations - really should converge or have pretty reasonable values by then; keeps speed managable if (noisily) moptimize_init_tracelevel(estimation_object, "value") //tell ml evaluator to show iterations if "noisily" is asked for by user... else moptimize_init_tracelevel(estimation_object, "none") //...otherwise show nothing moptimize_init_conv_warning(estimation_object, "off") //tell ml evaluator not to warn when convergence is not reached moptimize(estimation_object) //tell ml evaluator to proceed with estimation xb = moptimize_result_coefs(estimation_object) //pull estimates from results if (noisily) xb // converged = moptimize_result_converged(estimation_object) //is everyting well? That is, did estimation converge? if (sum(rowmissing(xb)) == 0) converged = 1 //keep estimates if they're usable in computation, even if convergence wasn't reached - we're data mining here expected_count = exp((row, col, J(rows(observed_count), 1, 1))*xb') //compute expected frequencies if (noisily) expected_count // df = (rows(uniqrows(data[., 2])))*(rows(uniqrows(data[., 1]))) - cols(xb) //obtain DF number_of_estimates = cols(xb) //obtain number of parameters estimated } /*unstructured model - traditional contingency table/product of margins approach*/ else if (rows(observed_count) > 3) { //if a contingency table can be made... row = mm_freq(data[., 1], weight) //row/response variable margins if (noisily) row // col = mm_freq(rowsum(data[., 2..cols(data)]:*(1..cols(data)-1)), weight) //column/splitting variable margins if (noisily) col // expected_count = (col#row):/sum(row) //compute expected frequencies if (noisily) expected_count // df = (rows(uniqrows(data[., 2])) - 1)*(rows(uniqrows(data[., 1])) - 1) //obtain DF number_of_estimates = (rows(uniqrows(data[., 2])))*(rows(uniqrows(data[., 1]))) - /// (rows(uniqrows(data[., 2])) - 1)*(rows(uniqrows(data[., 1])) - 1) converged = 1 //is everything well? (answer is always yes here) } else converged = 0 //...otherwise, no contingency table - all's not well... if (converged) { //if all's well... chi2 = sum(((observed_count - expected_count):^2):/expected_count) //obtain chi-square value row = mm_freq(data[., 1], weight) //row/response variable margins to infer the ll from the lr test stat expected_count = select(expected_count, observed_count:>0) //remove expected counts that will affect a sum observed_count = select(observed_count, observed_count:>0) //remove 0's that will affect a sum ll = sum(observed_count:*ln(observed_count:/expected_count)) + /// //take the liklihood ratio chi-square and use the constant-only multinomial distribution to obtain model log-likelihood by working backward sum((ln((row:/sum(row)))):*row) base_ll = sum((ln((row:/sum(row)))):*row) //baseline/constant-only log-likilihood } else { //...otherwise return nothing chi2 = . ll = . } return((chi2, df, ll, number_of_estimates, base_ll)) //chaid needs the chi-square, degrees of freedom, base and model log-likelihood, and number of estimates - pass it to chaid } end /*Loglinear model for moptimize()*/ version 12.1 mata: mata set matastrict on function loglin(transmorphic estimation_object, todo, real rowvector b, ll, S, H) { real rowvector inter, y, xb y = moptimize_util_depvar(estimation_object, 1) xb = moptimize_util_xb(estimation_object, b, 1) ll = -exp(xb) + xb:*y - lngamma(y:+1) S = -exp(xb) + y inter = -exp(xb) H = moptimize_util_matsum(estimation_object, 1, 1, inter, moptimize_util_sum(estimation_object, ll)) } end /*sum function for mm_collapse()*/ version 12.1 mata: mata set matastrict on function dosum(real colvector data, real scalar nouse) { real scalar result result = sum(data) return(result) } end /* programming notes and history - chaidforest version 1.0 - date - October 27, 2014 Basic version ----- - chaidforest version 2.0 - date - September 28, 2015 a] added version #'s to Stata and Mata commands/functions b] checked all declarations in Mata to ensure functionality c] if/in added to syntax to allow conditional - chaidforest - runs d] fixed inaccurate unordered splitter chi-square computations with goodman() (row and column frequencies reversed - added more random element) e] predicted values moved to predict command postestimation f] fixed progress dots that extend beyond 100% w/ small # of trees g] collapsing (by bootstrap weight) to improve run-time speed in internal chaid_g() function h] incorporate weights into chaid by default to speed run time/fixed fweight incorporation i] splitting AIC; merging McFadden's pseudo-R2 j] saves variables used and bootstrap data in chaidtree object, nixes predictions k] fixed extended missing value compatability with - missing - option - ensure extended missings follow through to predict and estat