*!chaid version 2.2 2/12/2015 - Joseph N. Luchman program define chaid, eclass //version history at end of file //program declarations version 12.1 //version declaration if (replay()) { //replay syntax if ("`e(cmd)'" != "chaid") error 301 if (_by()) error 190 Display exit 0 } syntax varlist(numeric min=1 max=1) [in] [if] [fw], /// //only one dv and max 1 dv - allows if and in [MINNode(integer 100) /// //minimum number of observations in a terminal node MINSplit(integer 200) /// //minimum number of observations to allow splitting a node MAXBranch(integer -1) /// //maximum number of splits/interactions stemming from the root node UNOrdered(varlist numeric) /// //variable list for unordered splitting treatment in CHAID ORDered(varlist numeric) /// //variable list for ordered splitting treatment in CHAID MISsing /// //treat missing values as floating category MERGAlpha(real .95) /// //alpha level for merging 2 allowable categories RESPAlpha(real -1) /// //alpha level for respliting 2 allowable, merged categories SPLTAlpha(real .05) /// //alpha level for implementing a split in the data NOIsily /// //shows a lot of output - if the user is interested DVOrdered /// //declares that the outcome is ordered and uses ologit instead of mlogit noadj /// //no Bonferroni adjustment - more permissive splitting noDisplay /// //don't display the results PREDicted /// //obtain predicted values - modal value for category IMPortance /// //obtain permutation importance for input splitting variables Permute /// //use permute to obtain p-values svy /// //incorporate svy design to obtain p-values ///mi(string) /// //incorporate multiple imputation to obatain p-values (next relaease?) XTile(string) /// //convenience command for obtaining ordered quantiles of predictors EXhaust] //use exhaustive CHAID (binary split) instead of default multi-way splits /*exit and warning conditions*/ if ((`:list sizeof unordered' == 0) & (`:list sizeof ordered' == 0) & /// (`:list sizeof xtile' == 0)) { //exit if no splitting vars entered display "{err}No independent variables entered." exit 111 } if (`minsplit' < `minnode') { //exit if you tell chaid that the minnode/cluster size is larger than the min size to make a split display "{err}{opt minsplit()} cannot be smaller than " /// "{opt minnode()}." exit 198 } if ((`mergalpha' <= 0) | (`mergalpha' >= 1) | (`spltalpha' <= 0) | /// (`spltalpha' >= 1)) { //ensure that the alphas are possible display "{err}{opt mergalpha()} and {opt spltalpha()} must range " /// "betweeen (but not include) 0 and 1." exit 198 } if (`minnode' <= 0) { //ensure that min cluster size is a possible number display "{err}{opt minnode()} must be a positive integer value." exit 198 } if ((`respalpha' >= 1) | (`respalpha' > 1 - `mergalpha') | (`respalpha' == 0)) { //ensure that alphas are possible values that don't induce splitting forever display "{err}{opt resplapha()}, when invoked, must range between (but not include)" _newline /// "0 and 1 and must be less than 1 - {opt mergalpha()} to prevent infinite merging and resplitting." exit 198 } if (("`weight'" != "") & (("`permute'" != "") | ("`svy'" != ""))) { display "{err}{cmd:fweight}s not allowed with {opt permute} or {opt svy}." exit 198 } if (("`permute'" != "") & ("`svy'" != "")) { display "{err}{opt permute} and {opt svy} not allowed together." exit 198 } //1) no fw and permute, 2) no fw and svy, 3) no permute and svy capture which lmoremata.mlib //is moremata present? if (_rc != 0) { //if moremata cannot be found, tell user to install it. display "{err}Module {cmd:moremata} not found. Install {cmd:moremata} here {stata ssc install moremata}." exit 198 } if strlen("`permute'") local noadj "noadj" /*begin data processing*/ //mark the estimation sample tempvar keep touse stop cluster useob dv //tempvar declarations - begin using "touse"; add "keep" markout variables in unordered() and ordered() tempname cellcount fit //tempname declaration for matrix quietly generate `keep' = 1 `if' `in' //allows keeping of missings as valid category, but still adjusting for "if"s and "in"s if strlen("`xtile'") { gettoken xtile xtile_opt: xtile, parse(",") foreach x of varlist `xtile' { //convenience command for creating binned continuous predictors in CHAID capture inspect xt`x' if !_rc quietly drop xt`x' quietly xtile xt`x' = `x' if `keep' [`weight'`exp'] `xtile_opt' local ordered "`ordered' xt`x'" } } mark `touse' //mark estimation sample if ("`missing'" == "") markout `touse' `varlist' `unordered' `ordered' `keep' //markout all variables when missings as category is not desired else if ("`missing'" != "") & ("`dvordered'" == "") markout `touse' `keep' //markout only "if" and "in" when missings are a valid category else if ("`missing'" != "") & ("`dvordered'" != "") markout `touse' `varlist' `keep' //markout only "if" and "in" and on dv when missings are valid category and dv is declared ordered drop `keep' //"keep" tempvar is dropped so that it does not find its way into mata foreach x of varlist `varlist' `unordered' `ordered' { //exit if too many categories (20 is max) or categories are negative integers quietly inspect `x' if `touse' if (r(N_unique) > 20) { display "{err}Number of distinct values in {cmd:`x'} is larger than allowed. " _newline /// "Consider collapsing across similar categories for unordered or using " _newline /// "{opt xtile()} option/command for ordered variables to reduce the number of levels." exit 198 } else if (r(N) != r(N_0) + r(N_posint)) { display "{err}Levels of each variable must be non-negative integer values." _newline /// " Variable {cmd:`x'} contains negative or decimal values." exit 198 } } //identify and process variables to use quietly tab `varlist' [`weight'`exp'], matrow(`cellcount') //identify levels of dependent/outcome variable to create inidicator varaibles mata: st_local("dvlevs", invtokens(strofreal(st_matrix("`cellcount'")'))) //put levels into a macro for use local dvlist "`varlist'" //produce local macro that's updated with tempvar names for use in mata local dvdisp "`varlist'" //produce local macro that's updated with displayed names of outcome and levels local type "`type' d" //note "type" of variable for dv quietly generate `dv' = `varlist' //local macro to denote variable type for splitting purposes - here "d" denotes dependent variable that will not be split if ("`missing'" != "") quietly count if missing(`dv') //discern whether outcome is missing values if missings are of interest if (("`missing'" != "") & (r(N) > 0) & ("`dvordered'" == "")) { //if "missing" invoked and dv is not ordered, turn missing into last category quietly summarize `dv' [`weight'`exp'] //find out what the larget category is quietly recode `dv' (missing = `=r(max)+1') //make all missings one bigger than the biggest category } if (`:list sizeof unordered' > 0) { //process unordered splitting variables when present foreach x of varlist `unordered' { quietly tabulate `x' if `touse' [`weight'`exp'], matrow(`cellcount') `missing' //discern levels of each unordered variable for creating binary variables mata: st_local("`x'levs", invtokens(strofreal(st_matrix("`cellcount'")'))) //put levels of unordered variable into local macro foreach y of numlist ``x'levs' { if (`:list posof "`y'" in `x'levs' == 1) { local type "`type' (" local unorddisp "`unorddisp' (" } if ("`y'" != ".") { tempvar `x'`y' //create tempvars with name of level of unordered, binary variable quietly generate byte ``x'`y'' = `x' == `y' //generate binaries for each level of unordered variable label variable ``x'`y'' "`x' @ `y'" //label the tempvar for later processing local unorderedlist "`unorderedlist' ``x'`y''" //include name of unordered binary variable in list of tempvar names local unorddisp "`unorddisp' `x'`y'" //include displayed name of unordered binary variable in list of tempvar names local type "`type' u" //include variable type in type macro } else { //if missing option invoked count missings to see whether following syntax needs to be run tempvar `x'ms //assuming missings are present, create tempvar for unordered varible indicating missing quietly generate byte ``x'ms' = cond(missing(`x'), 1, 0) //generate indicator for missing category label variable ``x'ms' "`x' @ missing" //label the tempvar for later processing local unorderedlist "`unorderedlist' ``x'ms'" //include missing tempvar name in unordered list local unorddisp "`unorddisp' `x'ms" //include missing display name in unordered display list local type "`type' f" //include type "floating" } if (`: list posof "`y'" in `x'levs' == `: list sizeof `x'levs') { local type "`type')" local unorddisp "`unorddisp')" } } } } if (`:list sizeof ordered' > 0) { foreach x of varlist `ordered' { quietly tab `x' if `touse' [`weight'`exp'], matrow(`cellcount') `missing' //if ordered variables present, discern their levels/values mata: st_local("`x'levs", invtokens(strofreal(st_matrix("`cellcount'")'))) //put levels of ordered variable into local macro foreach y of numlist ``x'levs' { if (`: list posof "`y'" in `x'levs' == 1) { local type "`type' (" local orddisp "`orddisp' (" } if ("`y'" != ".") { tempvar `x'`y' //declare tempvar for each value of ordered variable quietly generate byte ``x'`y'' = `x' == `y' //generate binary for each level of ordered variable label variable ``x'`y'' "`x' @ `y'" //label the tempvar for later processing local orderedlist "`orderedlist' ``x'`y''" //include ordered tempvar in orderedlist local orddisp "`orddisp' `x'`y'" //include display name of variable in ordered display macro local type "`type' o" //include variable type in type macro } else { //if missing option invoked count missings to see whether following syntax needs to be run tempvar `x'ms //assuming missings are present, create tempvar for ordered varible indicating missing quietly generate byte ``x'ms' = cond(missing(`x'), 1, 0) //generate indicator for missing category label variable ``x'ms' "`x' @ missing" //label the tempvar for later processing local orderedlist "`orderedlist' ``x'ms'" //include missing tempvar name in ordered list local orddisp "`orddisp' `x'ms" //include missing display name in unordered display list local type "`type' f" //include type "floating" } if (`: list posof "`y'" in `x'levs' == `: list sizeof `x'levs') { local type "`type')" local orddisp "`orddisp')" } } } } /*process estimation options*/ if ("`permute'" != "") local permute "permute `dv' e(chi2) e(ll) e(df_m) e(N):" local test "e(chi2)" if ("`svy'" != "") local test "e(F)" local quietly "quietly" //is the chaid process going to display everything? if ("`noisily'" != "") local quietly "" //make it non-quiet quietly generate byte `stop' = 1 if `touse' //indicator telling chaid to not look for splits on this observation quietly generate byte `cluster' = 1 if `touse' //cluster membership - everyone starts in same cluster #1 quietly generate byte `useob' = 0 if `touse' //indicator for "use this ob" in current analysis `quietly' summarize `stop' `cluster' [`weight'`exp'] //look at data if non-quiet local reg "mlogit" //default analysis is mlogit if ("`dvordered'" ! = "") local reg "ologit" //if dvordered then ologit /*begin chaid algorithm in Mata*/ `quietly' mata: CHAIDsplit = chaid("`unorderedlist'", "`orderedlist'", "`type'", /// "`dvdisp' `unorddisp' `orddisp'", "`dv' `unordered' `ordered'", `mergalpha', /// `spltalpha', `minsplit', `minnode', `maxbranch', `respalpha', "`adj'", "`svy'", /// "`exhaust'") //pass arguments to Mata /*process chaid results*/ capture xi: count //nix xi: used in chaid - cleans up dataset capture inspect _CHAID //is there already a _CHAID variable? if (_rc == 0) drop _CHAID //if there is a _CHAID variable, drop it rename `cluster' _CHAID //rename the current tempvar "cluster" _CHAID label variable _CHAID "CHAID-defined cluster" //label the clustering variable perm_import if `touse', dv("`varlist'") list(0) wexp("`exp'") `svy' scalar `fit' = r(fit) if ("`importance'" != "") mata: ifstmnts = J(rows(CHAIDsplit)-1, cols(CHAIDsplit), "") mata: st_local("clusters", strofreal(rows(CHAIDsplit)-1)) if (`clusters' > 0) { forvalues x = 1/`clusters' { if ("`noisily'" == "noisily") mata: CHAIDsplit[`=`x'+1', .] local path`x' "" mata: st_local("search", strofreal(cols(CHAIDsplit))) forvalues z = 2/`search' { mata: st_local("currentname", CHAIDsplit[`=`x'+1', `z']) local vallist "" if ("`currentname'" != "") { foreach y of local currentname { local name "`:variable label ``y'''" gettoken varnam level: name, parse("@") gettoken atsign level: level, parse("@") if ("`level'" != "") local vallist "`vallist'`level'" } local varnam = trim("`varnam'") local vallist = trim("`vallist'") local path`x' "`path`x''`varnam'@`vallist';" if ("`importance'" != "") { if regexm("`vallist'", "missing") local vallist = subinword("`vallist'", "missing", ".", .) local ifcomma = subinstr("`vallist'", " ", ",", .) if (`z' == 2) mata: ifstmnts[`x', `=`z'-1'] = " inlist(" + st_local("varnam") + "," + st_local("ifcomma") + ")" else mata: ifstmnts[`x', `=`z'-1'] = " & inlist(" + st_local("varnam") + "," + st_local("ifcomma") + ")" } } } } } if ("`importance'" != "") & (`clusters' > 0) { tempname imp_v mata: imp_n = "" mata: imp_v = . foreach x of varlist `ordered' `unordered' { perm_import if `touse', permvar("`x'") dv("`varlist'") list(`clusters') /// predlist(`ordered' `unordered') wexp("`exp'") `missing' `svy' mata: imp_n = (imp_n, "`x'") mata: imp_v = (imp_v, st_numscalar("r(fit)")) } mata: imp_v = st_numscalar("`fit'"):-imp_v mata: imp_v = imp_v:/rowsum(imp_v) mata: imp_v = (imp_v \ (., mm_ranks((imp_v[1, 2..cols(imp_v)]:*-1)',1,3)')) mata: st_local("imp_n", invtokens(imp_n)) mata: st_matrix("`imp_v'", imp_v[., 2..cols(imp_v)]) matrix colnames `imp_v' = `imp_n' matrix rownames `imp_v' = "raw" "rank" } ereturn post , depname(`varlist') esample(`touse') //clear ereturned values, post dv and obs mata: st_local("search", strofreal(cols(CHAIDsplit))) local splitcount = 1 forvalues x = 2/`search' { mata: t = tokeninit("", pchars=(",")) //set up processing CHAIDsplit matrix mata: tokenset(t, CHAIDsplit[1, `x']) mata: nameslist = tokengetall(t) mata: st_local("search_name", strofreal(cols(nameslist))) forvalues y = 1/`search_name' { local levels "" mata: st_local("currentname", nameslist[`y']) if !regexm("`currentname'", ",") { foreach z of local currentname { local name "`:variable label ``z'''" gettoken name level : name, parse("@") local level = subinstr("`level'", "@", "", .) local levels = rtrim("`levels' `level'") } local alllevels "`alllevels' (`levels')" if ("`quietly'" == "") { display "`name'" // display "`alllevels'" // } } } ereturn local split`splitcount++' = itrim("`name' `alllevels'") local alllevels "" } ereturn matrix branches = `branchmat' quietly summarize _CHAID if e(sample) [`weight'`exp'], meanonly ereturn scalar N = r(N) quietly inspect _CHAID ereturn scalar N_clusters = r(N_unique) forvalues x = 1/`=e(N_clusters)' { ereturn local path`x' "`path`x''" } quietly tabulate _CHAID, matcell(`cellcount') matrix `cellcount' = `cellcount'' ereturn matrix sizes = `cellcount' if ("`importance'" != "") & (`clusters' > 0) ereturn matrix importance = `imp_v' if ("`predicted'" == "predicted") { tempvar order tempname maxcell maxval capture inspect CHAID_predict if (_rc == 0) drop CHAID_predict quietly inspect _CHAID quietly generate long CHAID_predict = . forvalues x = 1/`=r(N_unique)' { quietly tabulate `varlist' [`weight'`exp'] if _CHAID == `x', matcell(`maxcell') matrow(`maxval') `missing' mata: predicted = select(st_matrix("`maxval'"), st_matrix("`maxcell'"):==max(st_matrix("`maxcell'"))) mata: st_numscalar("`maxcell'", rows(predicted)) if (`maxcell' == 1) mata: st_numscalar("`maxval'", predicted) else mata: st_numscalar("`maxval'", min(predicted)) quietly replace CHAID_predict = `maxval' if _CHAID == `x' } label variable CHAID_predict "predicted `varlist' values by CHAID" } ereturn scalar fit = `fit' ereturn local title = "Chi-Square Automated Interaction Detection (CHAID)" ereturn local cmd = "chaid" ereturn local cmdline = "chaid `0'" if ("`display'" != "nodisplay") Display end /*model fit program*/ program perm_import, rclass syntax if, dv(string) list(integer) [permvar(varlist) predlist(string) wexp(string) missing svy] tempvar temp_chaid rand_perm tempfile chaid_perm preserve if (!strlen("`svy'")) quietly keep `if' if strlen("`wexp'") quietly expand `wexp' quietly generate int `temp_chaid' = . if (`list' == 0) { quietly replace `temp_chaid' = _CHAID } else { local notpermvar: list predlist-permvar if (!strlen("`svy'")) quietly keep `predlist' `dv' `temp_chaid' quietly save `chaid_perm' quietly keep `permvar' quietly generate `rand_perm' = runiform() quietly sort `rand_perm' if (strlen("`svy'")) quietly merge 1:1 _n using `chaid_perm', nogenerate else quietly merge 1:1 _n using `chaid_perm', nogenerate keepusing(`notpermvar' `dv' `temp_chaid') forvalues x = 1/`list' { mata: stata("quietly replace \`temp_chaid' = \`x' if" + invtokens(ifstmnts[`x', .])) } } if (strlen("`svy'")) { tempvar svy tempname best_F quietly clonevar `svy' = `dv' quietly svy: tabulate `dv' `svy', `missing' scalar `best_F' = e(F_Full) quietly svy: tabulate `dv' `temp_chaid', `missing' capture scalar `best_F' = e(F_Full)/`best_F' if (!_rc) return scalar fit = `best_F' else return scalar fit = 0 } else { quietly tabulate `dv' `temp_chaid', V `missing' capture assert r(CramersV) != . if (!_rc) return scalar fit = abs(r(CramersV)) else return scalar fit = 0 } restore end /*replayable display*/ program Display display _newline "{res}Chi-Square Automated Interaction Detection (CHAID)" /// " Tree Branching Results" _newline "{txt}{hline 80}" _newline if (e(N_clusters) == 1) display "{res}No clusters uncovered. Cluster #1 is null." else { preserve clear mata: st_addobs(st_numscalar("e(N_clusters)")) quietly mata: labels = st_tempname(max(st_matrix("e(branches)")) + 1) quietly mata: st_addvar("str100", labels) forvalues x = 1/`=e(N_clusters)' { local parse "`e(path`x')'" gettoken first second: parse, parse(";") local count = 0 while ("`second'" != "") { mata: st_local("labvar", labels[1, `++count']) quietly replace `labvar' = "`first'" in `x' gettoken first second: second, parse(";") gettoken first second: second, parse(";") } mata: st_local("labvar", labels[1, cols(labels)]) quietly replace `labvar' = "Cluster #`x'" in `x' } mata: st_local("all", invtokens(labels)) sort `all' quietly putmata namemat = (`all'), replace mata: namemat' mata: namemat = namemat[., 1..cols(namemat)-1]' mata: namemat = (J(1, cols(namemat), "root") \ namemat) clear mata: count = 0 mata: column = 1 local column = 1 mata: st_local("totcol", strofreal(cols(namemat))) quietly { generate long x = . generate long y = . generate int col = . generate byte down = . generate str100 label = "" } local justup = 0 forvalues y = 1/`totcol' { mata: currentvec = namemat[., column] if (`y' == 1) { mata: currentvec = select(currentvec, currentvec:!="") mata: st_local("row", strofreal(rows(currentvec))) mata: row = 1 forvalues x = 1/`row' { mata: st_addobs(1) mata: st_store(++count, st_varindex("x"), column) mata: st_store(count, st_varindex("y"), row) mata: st_sstore(count, st_varindex("label"), namemat[row++, column]) } quietly replace down = 1 if missing(down) mata: countback = st_data(2::count-1, (st_varindex("x"), st_varindex("y"))) mata: countback = sort(((rows(countback)::1), countback), 1) mata: st_addobs(rows(countback)) mata: st_store(count+1::count+rows(countback), (st_varindex("x"), st_varindex("y")), countback[., 2..cols(countback)]) mata: count = count + rows(countback) quietly replace down = 0 if missing(down) quietly replace col = `y' if missing(col) mata: column++ } else if (`y' == `totcol') { mata: prevvec = namemat[., column-1] mata: retrace1 = currentvec:==prevvec mata: retrace2 = currentvec:!="" mata: retrace = retrace1:*retrace2 mata: obsvd = st_data(., (st_varindex("x"), st_varindex("y"), st_varindex("col"), st_varindex("down"))) mata: obsvd = select(obsvd[., (1..2,4)], obsvd[.,3]:==column-1) mata: obsvd = select(obsvd[., 1..2], obsvd[.,3]:==1) mata: obsvd = select((obsvd \ J(rows(retrace)-rows(obsvd), 2, .)), retrace) mata: st_addobs(rows(obsvd)) mata: st_store(count+1::count+rows(obsvd), (st_varindex("x"), st_varindex("y")), obsvd) mata: count = count + rows(obsvd) mata: currentvec = select(currentvec, abs(retrace:-1)) mata: currentvec = select(currentvec, currentvec:!="") mata: st_local("row", strofreal(rows(currentvec))) mata: row = rows(obsvd)+1 forvalues x = 1/`row' { mata: st_addobs(1) mata: st_store(++count, st_varindex("x"), column) mata: st_store(count, st_varindex("y"), row) mata: st_sstore(count, st_varindex("label"), namemat[row++, column]) } quietly replace col = `y' if missing(col) quietly replace down = 1 if missing(down) } else { mata: prevvec = namemat[., column-1] mata: retrace1 = currentvec:==prevvec mata: retrace2 = currentvec:!="" mata: retrace = retrace1:*retrace2 mata: obsvd = st_data(., (st_varindex("x"), st_varindex("y"), st_varindex("col"), st_varindex("down"))) mata: obsvd = select(obsvd[., (1..2,4)], obsvd[.,3]:==column-1) mata: obsvd = select(obsvd[., 1..2], obsvd[.,3]:==1) mata: obsvd = select((obsvd \ J(rows(retrace)-rows(obsvd), 2, .)), retrace) mata: st_addobs(rows(obsvd)) mata: st_store(count+1::count+rows(obsvd), (st_varindex("x"), st_varindex("y")), obsvd) mata: count = count + rows(obsvd) mata: currentvec = select(currentvec, abs(retrace:-1)) mata: currentvec = select(currentvec, currentvec:!="") mata: st_local("row", strofreal(rows(currentvec))) mata: row = rows(obsvd)+1 forvalues x = 1/`row' { mata: st_addobs(1) mata: st_store(++count, st_varindex("x"), column) mata: st_store(count, st_varindex("y"), row) mata: st_sstore(count, st_varindex("label"), namemat[row++, column]) } quietly replace col = `y' if missing(col) quietly replace down = 1 if missing(down) mata: countback = st_data(., (st_varindex("x"), st_varindex("y"), st_varindex("col"), st_varindex("down"))) mata: countback = select(countback[., (1..2,4)], countback[.,3]:==column) mata: countback = select(countback[., 1..2], countback[.,3]:==1) mata: countback = sort(((rows(countback)::1), countback), 1) mata: st_addobs(rows(countback)) mata: st_store(count+1::count+rows(countback), (st_varindex("x"), st_varindex("y")), countback[., 2..cols(countback)]) mata: count = count + rows(countback) quietly replace down = 0 if missing(down) quietly replace col = `y' if missing(col) mata: column++ } } quietly summarize y quietly replace y = (y - r(min))*-1 + r(max) tempname max scalar `max' = r(max) quietly summarize x quietly replace x = `=r(max)/2+r(min)' if y == `max' quietly generate int group = . forvalues x = `=`max'-1'(-1)1 { quietly replace group = y == `x' & label != "" quietly replace group = group + group[_n-1] in 2/`=_N' quietly summarize group forvalues y = `=r(min)'/`=r(max)' { quietly summarize x if group == `y' & y <= `x' quietly replace x = `=r(max)/2+r(min)' if group == `y' & y == `x' } quietly replace group = . } graph twoway scatter y x, mlabel(lab) connect(direct) capture matrix list e(importance) if !_rc { display _newline "{res}Splitting variable permutation importance" matrix list e(importance), noheader } restore } end /*mata function to recursively partition data based on CHAID algorithm*/ version 12.1 mata: mata set matastrict on string matrix chaid(string rowvector unordvar, string rowvector ordvar, string scalar type, /// string scalar names, string scalar vars, real scalar alpha1, real scalar alpha3, /// real scalar minsplit, real scalar minnode, real scalar maxbranch, real scalar alpha2, /// string scalar bfadj, string scalar svy, string scalar exhaust) { real matrix selectvar, select2var, countcheck, comparecount real rowvector var1, var2, merger_ps, merger_aic, withinselect, randresolve, /// branches real colvector unordpcomp real scalar position, moresplit, compare_p, current_p, compare_aic, current_aic, /// moremerge, clustercount, currentcluster, adj_p, countmerge, temploc1, temploc2, /// mergeruns, floating, match_p string matrix splits, CHAIDsplit string rowvector varsvec, namesvec, typevec, selecttype, collapsevec, selectnames, /// clusnames, comparenames, select2names, select2vec string scalar temploc1s, temploc2s transmorphic basis tokens(vars) // position = 2 //updatable position of data to find in varsvec varsvec = tokens(vars) //vectorize the tempvars varsvec[position] // moresplit = 1 //scalar to indicate continue splitting process between-variable process compare_p = 1 //currently lowest p-value for deciding which variable will be splitting variable compare_aic = . //currently lowesr AIC value for deciding which variable will be splitting variable clustercount = 1 currentcluster = 1 comparenames = ("") branches = (0) splits = ("") q = tokeninit(wchars = (" "), pchars = (" "), qchars = ("()")) //vectorize the display names tokenset(q, names) namesvec = tokengetall(q) namesvec // q = tokeninit(wchars = (" "), pchars = (" "), qchars = ("()")) //vectorize the type inputs tokenset(q, type) typevec = tokengetall(q) typevec // CHAIDsplit = ("Splits") while (moresplit == 1) { //do following process at least once - continue if between-variable splitting process is to continue (i.e., moresplit == 1) stata("replace \`useob' = \`stop' & \`cluster' == " + strofreal(currentcluster) + " if \`touse'") selectnames = tokens(subinstr(subinstr(namesvec[position], ")", ""), "(", "")) //pull out names for first independent variable (i.e., in position = 2] binaries using same procedure as with dependent variable - select() to pull out data - strpos() to create vector of 1's, substr() to truncate names, strlen() to truncate substr() statement at length of focal independent variable to match selectnames // selecttype = tokens(subinstr(subinstr(typevec[position], ")", ""), "(", "")) //pull out types for first independent variable using same procedure as with dependent variable - select() to pull out data - strpos() to create vector of 1's, substr() to truncate names, strlen() to truncate substr() statement at length of focal independent variable to match selecttype // moremerge = 1 //updatable scalar to indicate continute merging within-variable process stata("tabulate \`dv' \`useob' if \`touse' [\`weight'\`exp'], matcell(\`cellcount')") //tabulate dv with the current cluster set, keep the resulting matrix stata("ereturn matrix cellcount = \`cellcount'") //ereturn the matrix to pull it into Mata - probably unnecessary, but it works countcheck = st_matrix("e(cellcount)")' //pull matrix of counts into Mata, transposed countcheck // if (rows(countcheck) > 1) countcheck = countcheck[2,] //keep only the rows corresponding to the "useob" values if there are some that got a 0 on useob countcheck = select(countcheck, sign(countcheck)) //keep only the rows with some value (omit thosw with 0's) countcheck // stata("summarize \`stop' [\`weight'\`exp']") //quick summarize on stopping indicator if (st_numscalar("r(mean)") == 0) { //if no-one has a 1 on stop, then end chaid as the process is now over moresplit = 0 //don't split anymore - chaid is done moremerge = 0 //don't merge anymore on this variable } else if ((rowsum(sign(countcheck)) == 1) | (rowsum(countcheck) <= minsplit)) { //if there is only one value left in a cluster (i.e., it's pure) or it's size is below the minimum splitting size moremerge = 0 //stop merging on this variable stata("replace \`stop' = 0 if \`useob' & \`touse'") //update cluster to stop using this variable currentcluster++ //increment current cluster - start using the next cluster up, look for splits there position = 2 //restart "position" used in merging } /*begin splitting variable category merging*/ else { stata("tabulate " + varsvec[position] + /// " \`useob' if \`touse' [\`weight'\`exp'], \`missing' matcell(\`cellcount')") stata("ereturn matrix cellcount = \`cellcount'") countcheck = st_matrix("e(cellcount)")' countcheck // if (rows(countcheck) > 1) countcheck = countcheck[2,] selectnames = select(selectnames, sign(countcheck)) selectnames // while (moremerge == 1) { collapsevec = ("") //replace/generate collapsevec including names of variables to be collapsed into single variable merger_ps = (.) merger_aic = (.) floating = 0 if (cols(selectnames) > 2) { //if there are more than 2 categories and the type of variabel is "unordered" then proceed /*Unordered merging*/ if (selecttype[1] == "u") { base = (J(2, 1, 1) \ J(cols(selectnames) - 2, 1, 0)) //setup vector for use in cvpermute() to select all sets of 2 variables base // basis = cvpermutesetup(base) //register "base" vector with cvpermute() mergeruns = comb(cols(selectnames), 2) } else if ( (selecttype[cols(selecttype)] == "f") & /// (colsum(regexm(selectnames[cols(selectnames)], "ms$")) == 1) & /// (colsum(cols(tokens(selectnames[cols(selectnames)]))) == 1)) { mergeruns = (cols(selectnames) - 1)*2 - 1 floating = 1 base = (J(1, 1, 1) \ J(cols(selectnames) - 2, 1, 0)) //setup vector for use in cvpermute() to select all sets of 2 variables base // basis = cvpermutesetup(base) } else { mergeruns = cols(selectnames) - 1 } countmerge = 1 while (countmerge <= mergeruns) { //go through all different combinations of predictor levels to find best levels to collapse if (selecttype[1] == "u") select2var = cvpermute(basis)' //invoke cvpermute for selector vector else if (((selecttype[1] == "o") & (floating == 0)) | /// ((floating == 1) & (sign(countmerge*2 - mergeruns) != 1))) { select2var = (J(countmerge - 1, 1, 0) \ J(2, 1, 1) \ /// J(cols(selectnames) - countmerge - 1, 1, 0))' } else if ((floating == 1) & (sign(countmerge*2 - mergeruns) == 1)) { select2var = (cvpermute(basis)', 1) } select2var // collapsevec = (collapsevec, invtokens(strofreal(select2var))) //create string vector indicating position of variabels to test collapsevec // select2names = select(selectnames, select2var) //proceed to select 2 columns of predictor to evaluate for potentially collapsing select2names // select2names[1] // select2vec = tokens(select2names[1]) temploc1 = st_addvar("byte", st_tempname()) temploc1s = st_varname(temploc1) stata("replace " + temploc1s + "=0") for (y = 1; y <= cols(tokens(select2names[1])); y++) { stata("replace " + temploc1s + "=`" /// + select2vec[y] + "' + " + temploc1s) } select2names[2] // select2vec = tokens(select2names[2]) temploc2 = st_addvar("byte", st_tempname()) temploc2s = st_varname(temploc2) stata("replace " + temploc2s + "=0") for (y = 1; y <= cols(tokens(select2names[2])); y++) { stata("replace " + temploc2s + "=`" /// + select2vec[y] + "' + " + temploc2s) } stata("summarize " + temploc1s + " " + temploc2s + " if \`touse'" /// + " & \`useob' [\`weight'\`exp']") // st_macroexpand("\`permute'version 10: \`reg' \`dv' " + temploc1s + /// " if \`touse' & \`useob' & (" + temploc1s + " | " + temploc2s + ") [\`weight'\`exp']") // if (strlen(svy)) stata("capture version 10: svy, subpop(if \`touse' & \`useob' & (" + temploc1s + /// " | " + temploc2s + ")): \`reg' \`dv' " + temploc1s) else stata("capture \`permute'version 10: \`reg' \`dv' " + temploc1s + /// " if \`touse' & \`useob' & (" + temploc1s + " | " + temploc2s + ") [\`weight'\`exp']") if ((st_numscalar("c(rc)") == 0)) { if ((strlen(svy)) & !((st_numscalar(st_local("test")) == .) | (st_numscalar("df_m") == 0))) { stata("\`reg'") // merger_ps = (merger_ps, Ftail(st_numscalar("e(df_m)"), /// st_numscalar("e(df_r)"), st_numscalar(st_local("test")))) stata("getic, type(aicw) ll(1) parm(1) obs(1)") merger_aic = (merger_aic, st_numscalar("r(ic)")) } else if (!(strlen(svy)) & !((st_numscalar(st_local("test")) == .) | (st_numscalar("df_m") == 0)) & /// !(rows(st_matrix("r(p)")))) { stata("\`reg'") // merger_ps = (merger_ps, chi2tail(st_numscalar("e(df_m)"), /// st_numscalar(st_local("test")))) stata("getic, type(aicc) ll(\`=e(ll)') parm(\`=e(df_m)') obs(\`=e(N)')") merger_aic = (merger_aic, st_numscalar("r(ic)")) } else if (!(strlen(svy)) & (rows(st_matrix("r(p)")))) { merger_ps = (merger_ps, st_matrix("r(p)")[1,1]) stata("getic, type(aicc) ll(" + strofreal(st_matrix("r(b)")[1, 2]) + /// ") parm(" + strofreal(st_matrix("r(b)")[1, 3]) + ") obs(" /// + strofreal(st_matrix("r(b)")[1, 4]) + ")") merger_aic = (merger_aic, st_numscalar("r(ic)")) } else { merger_ps = (merger_ps, 1) merger_aic = (merger_aic, .) } } else { merger_ps = (merger_ps, 1) merger_aic = (merger_aic, .) } countmerge++ countmerge // st_dropvar((temploc1s, temploc2s)) } if ((max(merger_ps) > 1 - alpha1) | (strlen(exhaust))) { //if at least one p-value doesn't the "respalpha" limit offered by the user, collapse the levels as they appear sufficiently homogeneous - also ensures that, when the DV is a constant, the algorithm stops merger_ps = strmatch(strofreal(merger_ps), strofreal(max(merger_ps))) //find the position of the largest p-value that passed the spltalpha criterion merger_ps // if (rowsum(merger_ps) > 1) { merger_ps = (0, strmatch(strofreal(merger_aic[2..cols(merger_aic)]), /// strofreal(max(merger_aic[2..cols(merger_aic)])))) //find the position of the largest aic value merger_ps // if (rowsum(merger_ps) > 1) { randresolve = runiform(1, cols(merger_ps)) merger_ps = strmatch(strofreal(merger_ps), strofreal(max(merger_ps))) merger_ps = randresolve:*merger_ps merger_ps = strmatch(strofreal(merger_ps), strofreal(max(merger_ps))) merger_ps // } } withinselect = strtoreal(tokens(select(collapsevec, merger_ps))) //pull out variables associated with largest p-value withinselect // if (selecttype[1] == "u") selectnames = (invtokens(select(selectnames, withinselect)), /// select(selectnames, abs(withinselect:-1))) else if (selecttype[1] == "o") { countmerge = (strpos(invtokens(strofreal(withinselect)), "1") + 1)/2 countmerge // if (withinselect[countmerge] == withinselect[countmerge + 1]) { if (countmerge == 1) selectnames = (invtokens(select(selectnames, withinselect)), /// select(selectnames[countmerge + 2..cols(withinselect)], /// abs(withinselect[countmerge + 2..cols(withinselect)]:-1))) else if (countmerge == cols(withinselect) - 1) selectnames = /// (select(selectnames[1..countmerge - 1], abs(withinselect[1..countmerge - 1]:-1)), /// invtokens(select(selectnames, withinselect))) else selectnames = (select(selectnames[1..countmerge - 1], abs(withinselect[1..countmerge - 1]:-1)), /// invtokens(select(selectnames, withinselect)), /// select(selectnames[countmerge + 2..cols(withinselect)], /// abs(withinselect[countmerge + 2..cols(withinselect)]:-1))) } else { if (countmerge == 1) selectnames = (invtokens(select(selectnames, withinselect)), /// select(selectnames[countmerge + 1..cols(withinselect) - 1], /// abs(withinselect[countmerge + 1..cols(withinselect) - 1]:-1))) else if (countmerge == cols(withinselect) - 1) selectnames = /// (select(selectnames[1..countmerge - 1], abs(withinselect[1..countmerge - 1]:-1)), /// invtokens(select(selectnames, withinselect))) else selectnames = (select(selectnames[1..countmerge - 1], abs(withinselect[1..countmerge - 1]:-1)), /// invtokens(select(selectnames, withinselect)), /// select(selectnames[countmerge + 1..cols(withinselect) - 1], /// abs(withinselect[countmerge + 1..cols(withinselect) - 1]:-1))) } } selectnames // if (alpha2 > 0) { if (strlen(exhaust)) match_p = 1 - max(merger_ps) else match_p = -1 selectnames = chaidsplit(selectnames, withinselect, selecttype[1], alpha2, svy, match_p) selectnames // } } else moremerge = 0 } else moremerge = 0 adj_p = 0 if (!strlen(bfadj)) { if (selecttype[1] == "u") { if (strlen(exhaust)) adj_p = cols(selectnames)*(cols(selectnames) - 1)/2 else { unordpcomp = (0::cols(selectnames)) for (y = 1; y <= cols(selectnames) - 1; y++) { adj_p = adj_p + (-1)^unordpcomp[y]*((cols(selectnames) - /// unordpcomp[y])^cols(selecttype)/(factorial(unordpcomp[y])* /// factorial(cols(selectnames) - unordpcomp[y]))) } } } else if (selecttype[1] == "o") { if (strlen(exhaust)) adj_p = cols(selectnames)*(cols(selectnames)^2 - 1)/2 else adj_p = comb(cols(selecttype) - 1, cols(selectnames) - 1) } else if (selecttype[cols(selecttype)] == "f") { if (strlen(exhaust)) adj_p = cols(selectnames)*(cols(selectnames) - 1)/2 else { comb(cols(selecttype) - 1, cols(selectnames) - 1)* /// ((cols(selectnames) - 1) + cols(selectnames)*(cols(selecttype) - /// cols(selectnames)))/(cols(selecttype) - 1) } } } else adj_p = 1 adj_p // "Completed One Merging Session" // } /*Compute chi-square used for splitting*/ countmerge = 1 temploc1 = st_addvar("byte", st_tempname()) temploc1s = st_varname(temploc1) stata("replace " + temploc1s + "=0 if \`touse' & \`useob'") for (x = 1; x <= cols(selectnames); x++) { select2vec = tokens(selectnames[x]) for (y = 1; y <= cols(select2vec); y++) { st_macroexpand("replace " + temploc1s + "=`" /// + select2vec[y] + "'*" + strofreal(countmerge) + " if `" /// + select2vec[y] + "' & \`touse' & \`useob'") stata("replace " + temploc1s + "=`" /// + select2vec[y] + "'*" + strofreal(countmerge) + " if `" /// + select2vec[y] + "' & \`touse' & \`useob'") } countmerge++ stata("summarize " + temploc1s + " if \`touse' & \`useob' [\`weight'\`exp']") // } stata("tabulate " + temploc1s + /// " \`useob' if \`touse' [\`weight'\`exp'], \`missing' matcell(\`cellcount')") stata("ereturn matrix cellcount = \`cellcount'") countcheck = st_matrix("e(cellcount)")' countcheck // if (selecttype[1] == "u") { st_local("xi", "xi:") st_local("i", "i.") } if (selecttype[1] == "o") { st_local("xi", "") st_local("i", "") } if (strlen(svy)) stata("capture version 10:svy, subpop(if \`touse' & \`useob'):\`xi' \`reg' \`dv' \`i'" + temploc1s) else stata("capture \`permute'version 10:\`xi' \`reg' \`dv' \`i'" + temploc1s + " if \`touse' & \`useob' [\`weight'\`exp']") if ((st_numscalar("c(rc)") == 0) & (cols(countcheck) > 1)) { if ((strlen(svy)) & !((st_numscalar(st_local("test")) == .) | (st_numscalar("df_m") == 0))) { stata("\`reg'") // current_p = Ftail(st_numscalar("e(df_m)"), st_numscalar("e(df_r)"), st_numscalar(st_local("test"))) stata("getic, type(aicw) ll(1) parm(1) obs(1)") current_aic = st_numscalar("r(ic)") } else if (!(strlen(svy)) & !((st_numscalar(st_local("test")) == .) | (st_numscalar("df_m") == 0)) /// & !(rows(st_matrix("r(p)")))) { stata("\`reg'") // current_p = chi2tail(st_numscalar("e(df_m)"), st_numscalar(st_local("test"))) stata("getic, type(aicc) ll(\`=e(ll)') parm(\`=e(df_m)') obs(\`=e(N)')") current_aic = st_numscalar("r(ic)") } else if (!(strlen(svy)) & (rows(st_matrix("r(p)")))) { current_p = st_matrix("r(p)")[1,1] stata("getic, type(aicc) ll(" + strofreal(st_matrix("r(b)")[1, 2]) + /// ") parm(" + strofreal(st_matrix("r(b)")[1, 3]) + ") obs(" /// + strofreal(st_matrix("r(b)")[1, 4]) + ")") current_aic = st_numscalar("r(ic)") } else { current_p = 1 current_aic = . } } else { current_p = 1 current_aic = . } st_dropvar(temploc1s) current_p = current_p*adj_p current_p // current_aic // compare_p // compare_aic // if (current_p < compare_p) { "better!" // compare_p = current_p compare_p // compare_aic = current_aic current_aic // comparenames = selectnames comparenames // comparecount = countcheck comparecount // } else if (current_p == compare_p) { if (current_aic < compare_aic) { "better!" // compare_p = current_p compare_p // compare_aic = current_aic current_aic // comparenames = selectnames comparenames // comparecount = countcheck comparecount // } else { randresolve = runiform(1, 2) if (randresolve[1] < randresolve[2]) { "better!" // compare_p = current_p compare_p // compare_aic = current_aic current_aic // comparenames = selectnames comparenames // comparecount = countcheck comparecount // } } } position = position + 1 position // "Completed Merging Run" // comparenames // if (position == cols(varsvec) + 1) { if ((cols(tokens(invtokens(comparenames))) > 1) & (compare_p < alpha3) & /// (min(comparecount) >= minnode) & /// ((branches[currentcluster] < maxbranch) | (maxbranch < 0))){ "split!" // branches[currentcluster] = branches[currentcluster] + 1 branches // selectnames = (" ") for (x = 1; x <= cols(comparenames); x++) { comparenames[x] // selectnames = (selectnames, comparenames[x], ",") } selectnames // if (branches[currentcluster] > cols(splits)) splits = (splits, J(rows(splits), 1, "")) splits[currentcluster, branches[currentcluster]] = comparenames[1] splits // CHAIDsplit = (CHAIDsplit, invtokens(selectnames)) for (x = 2; x <= cols(comparenames); x++) { clustercount++ clustercount // branches = (branches, branches[currentcluster]) branches // splits = (splits \ splits[currentcluster, .]) splits[clustercount, branches[clustercount]] = comparenames[x] splits // selectnames = tokens(comparenames[x]) for (y = 1; y <= cols(selectnames); y++) { st_macroexpand("replace \`cluster' = " + strofreal(clustercount)+ " if `" + /// selectnames[y] + "' & \`touse' & \`useob'") // stata("replace \`cluster' = " + strofreal(clustercount)+ " if `" + /// selectnames[y] + "' & \`touse' & \`useob'") // } } position = 2 comparenames = ("") compare_p = 1 compare_aic = . } else { position = 2 stata("replace \`stop'=0 if \`cluster' == " + strofreal(currentcluster)) currentcluster++ stata("summarize \`cluster'") if (st_numscalar("r(max)") < currentcluster) moresplit = 0 currentcluster // compare_p = 1 compare_aic = . comparenames = ("") } } } } currentcluster = 0 for (x = 1; x <= rows(splits); x++) { currentcluster++ if (cols(CHAIDsplit) > 1) CHAIDsplit = (CHAIDsplit \ ("path" + strofreal(currentcluster), splits[x, .], J(1, cols(CHAIDsplit)-cols(splits)-1, ""))) } branchmat = st_tempname() st_matrix(branchmat, branches) st_local("branchmat", branchmat) "Finished Execution" // return(CHAIDsplit) } end version 12.1 mata: mata set matastrict on string rowvector chaidsplit(string rowvector selectnames, real rowvector withinselect, string scalar type, /// real scalar cutalpha, string scalar svy, real scalar match_p) { string scalar trysplit, tploc1s, tploc2s real scalar count, float_miss real rowvector selectsplit, tploc1, tploc2, mg_ps, mg_aic, repeats real colvector base, combin string rowvector splitvec, namesplit, comparevec real matrix unordmat transmorphic basis "chaidsplit begins" // if (type == "o") { trysplit = select(selectnames, withinselect[1..cols(withinselect) - 1]) trysplit = trysplit[1] float_miss = rowsum(strmatch(substr(tokens(trysplit), -2), "ms")) } else if (type == "u") trysplit = selectnames[1] trysplit // type // splitvec = ("") mg_ps = (.) mg_aic = (.) if (cols(tokens(trysplit)) > 2) { namesplit = tokens(invtokens(trysplit)) namesplit // if (float_miss == 1) { namesplit = (select(namesplit, abs(strmatch(substr(namesplit, -2), "ms")):-1), /// select(namesplit, strmatch(substr(namesplit, -2), "ms"))) namesplit // } if (type == "u") { repeats = 2^(cols(tokens(trysplit))) - 2 unordmat = J(1, cols(tokens(trysplit)), .) for (x = 1; x <= cols(tokens(trysplit)); x++) { base = J(x, 1, 1) base = (base \ J(cols(tokens(trysplit)) - x, 1, 0)) basis = cvpermutesetup(base) for (y = 1; y <= comb(cols(tokens(trysplit)), x); y++) { combin = cvpermute(basis) unordmat = (unordmat \ combin') } } unordmat = unordmat[2..rows(unordmat)-1, .] unordmat // } else if (type == "o") { repeats = cols(tokens(trysplit)) - 1 if (float_miss == 1) repeats = repeats*2 - 1 } repeats // count = 1 while (count <= repeats) { if (type == "u") { selectsplit = unordmat[count, .] } else if (type == "o") { if (count <= cols(namesplit) - 1) selectsplit = (J(1, count, 1), J(1, cols(namesplit) - count, 0)) else if (count > cols(namesplit) - 1) { selectsplit = (J(1, count - (cols(namesplit) - 1), 1), /// J(1, cols(namesplit) - 1 - (count - (cols(namesplit) - 1)), 0), 1) } } selectsplit // splitvec = (splitvec, invtokens(strofreal(selectsplit))) //create string vector indicating position of variables to test splitvec // comparevec = select(namesplit, selectsplit) //proceed to select 2 columns of predictor to evaluate for potentially collapsing comparevec // tploc1 = st_addvar("byte", st_tempname()) tploc1s = st_varname(tploc1) stata("replace " + tploc1s + "=0") for (y = 1; y <= cols(comparevec); y++) { stata("replace " + tploc1s + "=`" /// + comparevec[y] + "' + " + tploc1s) } comparevec = select(namesplit, abs(selectsplit:-1)) comparevec // tploc2 = st_addvar("byte", st_tempname()) tploc2s = st_varname(tploc2) stata("replace " + tploc2s + "=0") for (y = 1; y <= cols(comparevec); y++) { stata("replace " + tploc2s + "=`" /// + comparevec[y] + "' + " + tploc2s) } stata("summarize " + tploc1s + " " + tploc2s + " if \`touse'" /// + " & \`useob' [\`weight'\`exp']") // st_macroexpand("\`permute'version 10: \`reg' \`dv' " + tploc1s + /// " if \`touse' & \`useob' & (" + tploc1s + " | " + tploc2s + ") [\`weight'\`exp']") // if (strlen(svy)) stata("capture version 10: svy, subpop(if \`touse' & \`useob' & (" + tploc1s + " | " + /// tploc2s + "): \`reg' \`dv' " + tploc1s) else stata("capture \`permute'version 10: \`reg' \`dv' " + tploc1s + /// " if \`touse' & \`useob' & (" + tploc1s + " | " + tploc2s + ") [\`weight'\`exp']") if ((st_numscalar("c(rc)") == 0)) { if ((strlen(svy)) & !((st_numscalar(st_local("test")) == .) | (st_numscalar("df_m") == 0))) { stata("\`reg'") // mg_ps = (mg_ps, Ftail(st_numscalar("e(df_m)"), /// st_numscalar("e(df_r)"), st_numscalar(st_local("test")))) stata("getic, type(aicw) ll(1) parm(1) obs(1)") mg_aic = (mg_aic, st_numscalar("r(ic)")) } else if (!(strlen(svy)) & !((st_numscalar(st_local("test")) == .) | (st_numscalar("df_m") == 0)) & /// !(rows(st_matrix("r(p)")))) { stata("\`reg'") // mg_ps = (mg_ps, chi2tail(st_numscalar("e(df_m)"), /// st_numscalar(st_local("test")))) stata("getic, type(aicc) ll(\`=e(ll)') parm(\`=e(df_m)') obs(\`=e(N)')") mg_aic = (mg_aic, st_numscalar("r(ic)")) } else if (!(strlen(svy)) & (rows(st_matrix("r(p)")))) { mg_ps = (mg_ps, st_matrix("r(p)")[1,1]) stata("getic, type(aicc) ll(" + strofreal(st_matrix("r(b)")[1, 2]) + /// ") parm(" + strofreal(st_matrix("r(b)")[1, 3]) + ") obs(" /// + strofreal(st_matrix("r(b)")[1, 4]) + ")") mg_aic = (mg_aic, st_numscalar("r(ic)")) } else { mg_ps = (mg_ps, 1) mg_aic = (mg_aic, .) } } else { mg_ps = (mg_ps, 1) mg_aic = (mg_aic, .) } st_dropvar((tploc1s, tploc2s)) count++ } if ((min(mg_ps) < cutalpha) & (min(mg_ps) < match_p)) { mg_ps = strpos(strofreal(min(mg_ps[2..cols(mg_ps)])), strofreal(mg_ps[2..cols(mg_ps)])) mg_ps // if (rowsum(mg_ps) > 1) { mg_ps = strpos(strofreal(min(mg_aic[2..cols(mg_aic)])), strofreal(mg_aic[2..cols(mg_aic)])) mg_ps // if (rowsum(mg_ps) > 1) { randresolve = runiform(1, cols(mg_ps)) mg_ps = strmatch(strofreal(mg_ps), strofreal(max(mg_ps))) mg_ps = randresolve:*mg_ps mg_ps = strmatch(strofreal(mg_ps), strofreal(max(mg_ps))) mg_ps // } } trysplit = (invtokens(select(namesplit, strtoreal(tokens(select(splitvec[2..cols(splitvec)], mg_ps))))), /// invtokens(select(namesplit, abs(strtoreal(tokens(select(splitvec[2..cols(splitvec)], mg_ps))):-1)))) trysplit // if (type == "u") selectnames = (trysplit, selectnames[2..cols(selectnames)]) else if (type == "o") { if (withinselect[1] == 1) selectnames = (trysplit, selectnames[2..cols(selectnames)]) else if (withinselect[cols(withinselect)-1] == 1) selectnames = (selectnames[1..cols(selectnames)-1], trysplit) else { count = (strpos(invtokens(strofreal(withinselect)), "1") + 1)/2 count // selectnames = (selectnames[1..count-1], trysplit, selectnames[count..cols(selectnames)]) } if (rowsum(strmatch(substr(selectnames, -2), "ms")) == 1) { if (cols(tokens((select(selectnames, strmatch(substr(selectnames, -2), "ms")))))) { selectnames = (select(selectnames, abs(strmatch(substr(selectnames, -2), "ms")):-1), /// select(selectnames, strmatch(substr(selectnames, -2), "ms"))) } } } } } "end chaidsplit" // return(selectnames) } end //program to obtain information criteria program define getic, rclass syntax, type(string) ll(real) parm(integer) obs(integer) if ("`type'" == "aicc") return scalar ic = -2*`ll'+2*`parm'+(2*`parm'*`parm' + 1)/(`obs' - `parm' - 1) else if ("`type'" == "aicw") { tempname ilog gdeff matrix `ilog' = e(ilog) mata: ilog = select(st_matrix("`ilog'"), abs(sign(st_matrix("`ilog'")))) mata: st_matrix("`ilog'", ilog[1, cols(ilog)]) matrix `gdeff' = trace(invsym(e(V_srs))*e(V)) return scalar ic = -2*`ilog'[1,1]*(e(N)/e(N_pop))+2*`gdeff'[1,1] } else exit 198 end /* programming notes and history - chaid version 1.0 - date - November 18, 2013 Basic version ----- - chaid version 1.1 - date - March 20, 2014 //notable changes\\ a] added frequency weights b] updated display to highlight hierarchical tree-structure using Mata matrix and fix errors with original display c] implemented the remerge process as an option d] allowed a no Bonferroni adjustment option e] fixed merging inaccuracy that excluded categories arbitrarily f] error returning an extra "root" fixed g] predicted value offered based on mode of node h] additional returned values i] updated helpfile - going over collapse option to expedite chaid fitting ----- - chaid version 2.0 - date - August 22, 2014 //notable changes\\ a] added permutation-based p-values b] added svy prefix compatability c] added graphical depiction to display output d] added accessible mata matrices e] now can "replay" chaid results again as with other Stata programs f] added permutation importance metric g] fixed N-obs issue w/ fweights h] added fit metric based on cramer's V (and svy compatible version with similar properties) i] fixed Q1 Q11 nested variable selection issue - also allows to predict itself (dv and iv, though non-sensical, no more error) j] added exhaustive CHAID algorithm k] convenience command allows user to "xtile" for continuous predictors ----- - chaid version 2.1 - date - December 16, 2014 //notable changes\\ a] fixed missing colon for svy with respalpha b] moved location of bonferroni adjustment (doesn't require "exhaust" or there to have been a merge implemented to compute it) to ensure no missing adjusted p-value c] use aic for between model comparisons to decide on splitting (p-value has numerical precision issues... just use it to meet the p<.05 threshold and that's it) d] fix importance w/ missing (extra space added to "ifstmnts" for missings erroneously that was turned into comma e] fix DV levels check that occurs before markout ----- - chaid version 2.2 - date - February 12, 2015 //notable changes\\ a] fixed Bonferroni adjustment (was not invoked unless noadj option was used) - had opposite behavior as intended b] fixed but which prevented splits on splitting variables with a single level v 3? - allow custom xtile() for each variable - save info for all splits (as obj) - postestimation for predict & importance - make interactive fitting - graphing? - add mi?