* This line loads the plugin if not already loaded program boost_plugin, plugin *********************************************************************************************** program define boost, eclass *! version 1.3.5 Oct 16, 2020 Matthias Schonlau version 8.1 syntax [varlist] [if] [in], DISTribution(str) [ INfluence maxiter(int 20000) /* */ PREDict(str) TRAINfraction(real 0.8) /* */ shrink(real 0.01) bag(real 0.5) INTERaction(int 5) /* */ seed(int 0) ] version 16: marksample touse , novarlist local k : word count `varlist' if `k'<2 { di as error " Error: need at least 2 variables" exit } local y : word 1 of `varlist' local xvarlist: list varlist -y local kgroups=1 if ("`distribution'"=="multinomial") { qui tab `y' `if' `in' // only training data local kgroups=r(r) multinom_check_categories `y' `if' `in' } *prepare matrix to transport `kgroups' values per variable back from c++ program tempname influence_mat tempname A matrix `A'=(-1) while (colsof(`A')<`kgroups') { matrix `A'= `A' , -1 } matrix `influence_mat'= `A' while (rowsof(`influence_mat')< `k'-1) { matrix `influence_mat' = `influence_mat' \ `A' } matrix rownames `influence_mat' = `xvarlist' *prepare matrix to transport one value per iteration back from c++ program tempname lltrain_mat tempname lltest_mat tempname B matrix `B'=(-1) matrix `lltrain_mat'= `B' matrix `lltest_mat'= `B' * 40 is the default maximum matrix size while (rowsof(`lltrain_mat')< 40) { matrix `lltrain_mat' = `lltrain_mat' \ `B' matrix `lltest_mat' = `lltest_mat' \ `B' } if (length("`distribution'")==0) { local distribution="normal" } if ("`distribution'"!="poisson" & "`distribution'"!="logistic" & "`distribution'"!="normal" & "`distribution'"!="multinomial" & "`distribution'" != "bernoulli") { di as error "Error: Distribution must be one of: normal, logistic/bernoulli, poisson, multinomial" exit } * prepare prediction; compute number of prediction variables needed (kgroups) if (length("`predict'")==0) { local predict="nopredict" } foreach i of numlist 1/`kgroups' { tempvar temp_predict`i' qui gen `temp_predict`i''=. } local influence = "`influence'" != "" if (`influence'==1) { di "influence" } di "Distribution=`distribution'" if ("`distribution'"=="multinomial") { di " Number of categories (in training data): `kgroups'" } if ("`predict'"!="nopredict") di "predict=`predict'" di "Trainfraction=`trainfraction' Shrink=`shrink' Bag=`bag' maxiter=`maxiter' Interaction=`interaction'" local nleafs=`interaction'+1 if ("`distribution'" == "bernoulli") { local distribution = "logistic" } // to enable the predict statement after boost, the predict needs to be set always // the "predict" option of boost is preserved and re-activated just below the plugin call local preserve_predict = "`predict'" local predict = "predict" plugin call boost_plugin `varlist' `temp_predict1'-`temp_predict`kgroups'' `if' `in', /* */ `maxiter' `nleafs' `shrink' `bag' `distribution' /* */ `trainfraction' `influence' `predict' `influence_mat' `seed' /* */ `lltrain_mat' `lltest_mat' `kgroups' local predict = "`preserve_predict'" ereturn clear di as res "bestiter= " iter ereturn scalar bestiter = iter *di as res "test ll0= " test_ll0 ereturn scalar test_ll0 = test_ll0 *di as res "test ll1= " test_ll1 ereturn scalar test_ll1 = test_ll1 di as res "Test R2= " test_R2 ereturn scalar test_R2 = test_R2 di as res "trainn= " trainn ereturn scalar trainn = trainn *di as res "train ll0= " train_ll0 ereturn scalar train_ll0 = train_ll0 *di as res "train ll1= " train_ll1 ereturn scalar train_ll1 = train_ll1 di as res "Train R2= " train_R2 ereturn scalar train_R2 = train_R2 if (`influence') { if ("`distribution'"=="multinomial") { matrix colnames `influence_mat' = `colnames' } if (wordcount("`varlist'") < 22) { display as result "Influence of each variable (Percent):" matrix list `influence_mat', noblank noheader } ereturn matrix influence = `influence_mat' } ///////////////////////////////////////////////// //Add at July 2018. Seperate the predict file. local temppredlist = "" foreach i of numlist 1/`kgroups' { local temppredlist = "`temppredlist' `temp_predict`i''" } mata: pred_matrix("`temppredlist'", "pmatrix") ereturn matrix predmat = pmatrix ereturn local predict "boost_predict" ///////////////////////////////////////////////// if ("`predict'"!="nopredict") { * for regular predictions use the variable name supplied by user if `kgroups'==1 { qui gen `predict'=`temp_predict1' } else { * for multinomial use append 1..kgroups to name; label variable foreach i of numlist 1/`kgroups' { qui gen `predict'`i'=`temp_predict`i'' label var `predict'`i' "`predlabel`i''" } } } //store the predict labels names to ereturn for the separate predict variables local predictlabels = "" foreach i of numlist 1/`kgroups' { local predictlabels = "`predictlabels' `predlabel`i''" } ereturn local predictlabels = "`predictlabels'" ///////////////////////////////////////////////////// // compute mse (normal, poisson), accuracy (bernoulli, multinomial) for test and train data // the c++ program copies all data that satisfy [if][in] into a new data matrix. The first trainn observations of the new data matrix are used for fitting. // i.e. take the first trainn among those that are `touse' if ("`predict'"!="nopredict") { // id predict statement was specified // create a counter like _n , but only for `touse' tempvar _n_touse order qui gen `_n_touse'=. gen `order' = _n sort `touse', stable qui by `touse': replace `_n_touse'= _n qui replace `_n_touse'=. if !`touse' sort `order' // restore original sort order ; stable not needed (no duplicates) if ("`distribution'"=="normal" || "`distribution'"=="poisson") { // `_n_touse'<=scalar(trainn) excludes obs for which `_n_touse'==. compute_mse if (`_n_touse'<=scalar(trainn) & `touse'), pred(`predict') true(`y') ereturn scalar train_mse = r(mse) if (`trainfraction'<1) { compute_mse if (`_n_touse'>scalar(trainn) & `touse'), pred(`predict') true(`y') ereturn scalar test_mse = r(mse) } } else if ("`distribution'"=="bernoulli" | "`distribution'"=="logistic") { // predicted values are probabilities; needs rounding compute_acc if (`_n_touse'<=scalar(trainn) & `touse'), pred(round(`predict')) true(`y') ereturn scalar train_accuracy = r(accuracy) if (`trainfraction'<1) { compute_acc if (`_n_touse'>scalar(trainn) & `touse'), pred(round(`predict')) true(`y') ereturn scalar test_accuracy = r(accuracy) } } else { // multinomial with `kgroups' // need to create a variable that predicts the group; not just probabilities qui gen `predict'_class=. egen `predict'=rowmax(`predict'1-`predict'`kgroups') foreach var of varlist `predict'1-`predict'`kgroups' { qui replace `predict'_`class'=`: var label `var'' if `predict'==`var' } compute_acc if (`_n_touse'<=scalar(trainn) & `touse'), pred(`predict'_`class') true(`y') ereturn scalar train_accuracy = r(accuracy) if (`trainfraction'<1) { compute_acc if (`_n_touse'>scalar(trainn) & `touse'), pred(`predict'_`class') true(`y') ereturn scalar test_accuracy = r(accuracy) } } } end ********************************************************************************************** * if distribution=multinomial, check whether the training data and the test data have the same * number of categories * training data are data restricted through if/in * test data are the remaining data ignoring the if/in program define multinom_check_categories version 13.0 syntax varlist(min=1 max=1) [if] [in] qui tab `varlist' `if' `in' // not all codes may be in the training data local n_codes=r(r) qui tab `varlist' // all data local n_testcodes=r(r) if (`n_codes'!=`n_testcodes') { di as error "Warning: The number of categories in training data (restricted to [if][in])and full data differ" di as error " Training data `n_codes'" di as error " Full data `n_testcodes'" di as res "" // switches back to black output } end *********************************************************************************************** // create a matrix of varlist in Mata and copy to Stata // Mata can have matrices of size larger than 11,000; a limitation of stata matrices. // Interestingly, a mata matrix of size (e.g.) 12,000 in Mata can be copied successfully to Stata (version 15). version 15.0 mata: void pred_matrix(string varlist, string mname) { V = st_varindex(tokens(varlist)) Data = st_data(.,V) st_matrix(mname, Data) } end *********************************************************************************************** // computes accuracy of predictions program compute_acc, rclass version 16.0 syntax [if] [in] , pred(str) true(str) // obs excluded by [if] [in] marksample touse , novarlist qui count if `touse' if r(N) == 0 { error 2000 } preserve qui drop if !`touse' tempvar correct qui gen `correct'= `pred' == `true' qui sum `correct' local acc= r(mean) restore return scalar accuracy = `acc' end *********************************************************************************************** // computes mse of predictions program compute_mse, rclass version 16 syntax [if] [in] , pred(str) true(str) // obs excluded by [if] [in] marksample touse , novarlist qui count if `touse' if r(N) == 0 { error 2000 } preserve qui drop if !`touse' tempvar res_sqr qui gen `res_sqr'= (`pred' - `true')^2 qui sum `res_sqr' local mse= r(mean) restore return scalar mse = `mse' end *********************************************************************************************** /* Revision history Version date 1.3.5 Oct 16, 2020 if predict="nopredict", don't "display predict=`predict'" 1.3.4 Oct 16, 2020 bug: if (`trainfraction'<1) don't compute accuracy/mse for prediction 1.3.3 Oct 2, 2020 added "qui" in front of 3 replace/gen *=. commands 1.3.2 May 9, 2020 boost_predict now generates var with predicted class in addition to vars with predicted probabilites 1.3.1 Dec 1, 2019 Added e(mse*),e(acc*). For multinomial distribution, new var `predict'_class Improved output in influence_delete 1.3.0 Jul 4, 2018 Enabled predict after the boost command (in addition to predict option) 1.2.0 May 16, 2018 Added Linux and Mac, also influence_delete and influence_barchart 1.1.2 Jun 28, 2016 Allow partial trees (Three need not be fully grown) 1.1.1 Aug 16, 2013 add multinom_check_categories 1.1.0 May 27, 2013 add multinomial boosting (also predvars, kgroups, prediction and influence labels) 1.0.0 Feb 16,2012 cosmetic changes 0.0.9 Aug18,2005 added rownames to influence_mat 0.0.8 Jun23,2005 number of iterations =min(maxiter,bestiter+100) 0.0.7 May 2,2005 remove crossvalidate, add trainfraction 0.0.6 Jun20,2004 pass random generator into boosting model (affects bagging only), bagging tested 0.0.5 Jun2,2004 pseudoR2 defined as 1-L1/L0, affects crossvalidation non normal data 0.0.4 Mar8,2004 bugfix: graceful exit when cross validation and <5 obs bugfix: graceful exit when fitting fails because of tree fully extended within Cross validation 0.0.3 Mar3,2004 bugfix: graceful exit when cross validation with missing x-values def of interaction changed (now 1=main effect, 2=2 way interact) 0.0.2 Feb6,2004 cosmetic changes 0.0.1 Feb5,2004 option bag not yet tested, specify bag(1) always */