/******************************************************************************* * * * Leave-One-Out Cross-Validation for Stata Estimation Commands * * * *******************************************************************************/ *! xvloo *! v 0.0.15 *! 28mar2024 // Drop program from memory if already loaded cap prog drop xvloo // Defines the program; properties lists the applicable options for this prefix // The tpoint option is only valid for panel/time-series cross-validation prog def xvloo, eclass properties(prefix xv) sortpreserve // Stata version statement, can check for backwards compatibility later version 15 // Set the prefix name for Stata to recognize it set prefix xvloo // Check to see if mata library is compiled cap: findfile libxv.mlib // call libxv in case mata library requires recompilation if _rc != 0 { // Look for the sourcecode file cap: findfile crossvalidate.mata // Look for the mata function used by libxv cap: mata: mata which distdate() // If that function is already defined in Mata, call libxv to compile // everything if _rc == 0 qui: libxv // Otherwise run the mata file else run `"`r(fn)'"' } // End IF Block for unfound mata library // Check to see if the data are survey set if !mi(`"`: char _dta[_svy_version]'"') { // Add a warning message di as res "WARNING: {help xvloo} does not account for survey " /// "sample designs when splitting the data and does not use the " /// "{help svy:subpop} option when fitting the model." } // End IF Block to display survey data warning // Allocate a tempvars for the unique identifier variable and for other // options to use a default tempvar uuid xvtouse xvpred xvsplit // Tokenize the input string gettoken cv cmd : 0, parse(":") bind // Remove leading colon from the estimation command loc cmd `= substr(`"`cmd'"', 2, .)' // Check for if/in conditions mata: getifin(`"`cmd'"') // If there is an if/in expression if ustrregexm(`"`ifin'"', "\s?in\s+") { // Create an indicator that can be used to generate an if expression in // the estimation command instead qui: g byte `xvtouse' = 1 `ifin' // Replaces the cmd macro with an updated version that uses an if // expression instead of an in expression mata: st_local("cmd", subinstr(`"`cmd'"', `"`ifin'"', " if `xvtouse' == 1")) } // End IF Block for in expression handling // Get any if expressions mata: getifin(`"`cmd'"') // Parse the prefix on the comma. `props' will contain split proportions gettoken props xvopts : cv, parse(",") bind // Remove the leading comma from the options for xv. loc xvopts `"`= substr(`"`xvopts'"', 2, .)'"' // Then parse the options from the remainder of the macro mata: cvparse(`"`xvopts'"') // Test to see if replay option is invoked if !mi("`replay'") { // If there are macros around that would tell us what to replay and // the user is using a later version of Stata if !mi(`"`e(fitnm)'`e(valnm)'"') & `c(stata_version)' >= 17 { // Test whether or not there are values in the fit macro if !mi(`"`e(fitnm)'"') collect preview, name(`e(fitnm)') // Otherwise try to estimates replay them else if !mi(`"`e(estresnames)'"') estimates replay `e(estresnames)' // Test if there is a value in the validation macro if !mi(`"`e(valnm)'"') collect preview, name(`e(valnm)') // Otherwise try to display the xv matrix else { // Test if there is a matrix to list cap: qui: mat li e(xv) // If the matrix is there display it if _rc == 0 mat li e(xv) } // End ELSE block for missing validation name for collection // Exit the program exit } // End IF Block for replay contents // For older Stata else if !mi(`"`e(estresnames)'"') & `c(stata_version)' < 17 { // Display the estimation results estimates replay `e(estresnames)' // Test if there is a matrix to list cap: qui: mat li e(xv) // If the matrix is there display it if _rc == 0 mat li e(xv) } // End ELSEIF Block for older Stata // If there aren't results we can find: else { // Display an error message di as err "Unable to find necessary returned values. We're " /// "confused about what we should replay if we don't find them. " /// "Try refitting the models using {help xv} again." // Throw an error message err 119 } // End ELSE Block for no detected collection names } // End IF Block to replay results and exit // Get any argument passed to fitnm mata: getarg("`fitnm'", "fitnm") // Get any argument passed to valnm mata: getarg("`valnm'", "valnm") // Assign default collection name if the user doesn't pass one for fitit if mi(`"`fitnm'"') loc fitnm xvfit // Assign default collection name if the user doesn't pass one for validateit if mi(`"`valnm'"') loc valnm xvval // Get the value of classes mata: getarg("`classes'") // If missing or the default downstream set the value to 1 if (mi("`argval'") | "`argval'" == "0") loc c 1 // Otherwise set it to the number of classes being predicted else loc c `argval' // Get the number of folds mata: getarg("`kfold'", "k") // Determine if this is TT or TVT if `: word count `props'' == 1 { // Test if the proportion is unity set the noall option on if `props' == 1 loc noall noall // Count the number of observations that would be used for the model qui: count `ifin' // Store the number of observations loc N `r(N)' // Test the number of variables that need to be created vs allowed if (`props' * `N' + `c' + `c(k)' + 2) >= `c(max_k_theory)' { // Display error message di as err "Currently, your Stata supports `c(max_k_theory)' " /// "variables, but `= `props' * `c(N)' + `c(k)' + 2' variables " /// "are needed for LOO cross-validation. Reduce your training " /// "set proportion or increase the maximum number of variables " /// "(see {help memory}) in order to use LOO cross-validation." // Return error code and exit err 1002 } // End IF Block for insufficient max variable // Test for number of estimation results that would need to be stored if ceil(`props' * `N') >= 298 { // Display an error message di as err "The maximum number of estimation results that can " /// "be stored is 300 but the minimum number of results generated " /// "by your use of {help xvloo} is `= ceil(`props' * `c(N)')'. " /// "You can try using a smaller training set split. See " /// "{help limits} for additional information on system limits." // Return error code and exit err 1000 } // End IF block for too many potential estimation results } // End IF Block for TT split case // For TVT cases else { // Get the proportion for the training set loc trp `: word 1 of `props'' // Test if the proportion is unity set the noall option on if `trp' == 1 loc noall noall // Count the number of observations that would be used for the model qui: count `ifin' // Store the number of observations loc N `r(N)' // Test the number of variables that need to be created vs allowed if (`trp' * `N' * `c' + `c(k)' + 2) >= `c(max_k_theory)' { // Display error message di as err "Currently, your Stata supports `c(max_k_theory)' " /// "variables, but `= `trp' * `c(N)' + `c(k)' + 2' variables " /// "are needed for LOO cross-validation. Reduce your training " /// "set proportion or increase the maximum number of variables " /// "(see {help memory}) in order to use LOO cross-validation." // Return error code and exit err 1002 } // End IF Block for insufficient max variable // Test for number of estimation results that would need to be stored if ceil(`trp' * `N') >= 298 { // Display an error message di as err "The maximum number of estimation results that can " /// "be stored is 300 but the minimum number of results generated " /// "by your use of {help xvloo} is `= ceil(`props' * `c(N)')'. " /// "You can try using a smaller training set split. See " /// "{help limits} for additional information on system limits." // Return error code and exit err 1000 } // End IF block for too many potential estimation results } // End ELSE Block for TVT split case // If there is anything in the missing local throw an error message if mi(`"`metric'"') { // Display the error message di as err `"You must supply a valid argument to the metric option "' /// `"to use the {help xvloo} prefix."' // Throw an error code to exit err 198 } // End IF Block for missing required parameters // Check for uid variable. If none, create a unique ID as _n in a tempvar // and pass that as uid to splitit if mi(`"`uid'"') { // Generate the unique identifier if the user is not using clusters for // the LOO CV qui: g long `uuid' = _n // Set the uid local to use this variable loc uid "uid(`uuid')" // Set a macro for the correct flavor loc flav "Simple Random Sample" } // End IF Block for missing // Test if the user passed a K-fold option if !mi("`kfold'") { // Display an error message di as err "The kfold() option is invalid with Leave-One-Out cross-" /// "validation. The {opt:noall} option may still be used with Leave-" /// "One-Out cross-validation." // Throw an error message err 184 } // End IF Block for invalid kfold argument // Otherwise else { // Allocate a tempname for the scalar tempname xvn // Parses the argument(s) passed to the uid option (or set above) mata: getarg("`uid'") // Gets the number of clusters/records in the dataset mata: st_numscalar("`xvn'", /// rows(uniqrows(st_data(., "(" + subinstr("`argval'", " ", ", ") + ")")))) // Gets the number of records that will need to be sampled for the // clusters or individual records referenced by `uid' loc k = int(`: word 1 of `props'' * `xvn') // Populates the kfold macro with the number of clusters to split the // sample into loc kfold kfold(`k') } // End ELSE Block for xvloo set kfold value // If the user passes a split or pstub argument if !mi(`"`split'`pstub'`results'"') { // set the retain option on automatically loc retain retain } // End IF Block for non-missing split or pstub // Test if results is missing a value if mi(`"`results'"') { // Set a default to use for the results loc results "results(xvres)" // Set a macro to automatically clean this up at the end if mi("`retain'") loc dropresults "estimates drop xvres*" } // End IF Block to set default results values // If missing the split option if mi(`"`split'"') { // Set the default split variable name loc spvar _xvsplit // Check for default name cap confirm new v `spvar' // If the variable already exists if _rc != 0 { // Set do split to 0 to prevent splitting again loc dosplit 0 // Reassign the split macro to use the existing default splitvar loc split "split(`spvar')" } // End of IF Block when default split variable already exists // If it doesn't exist else { // Set do split to 1 to force splitting the data loc dosplit 1 // And use the tempvar to assign the splits loc split "split(`xvsplit')" } // End ELSE Block for non-existent default split variable } // End IF Block for the split variable name // If not missing the split option else { // Parses the split option mata: getarg("`split'") // Assigns the argument value to spvar loc spvar `argval' // Now set the split variable to use the tempvar loc split "split(`xvsplit')" // Check to see if the split variable already exists cap confirm new v `spvar' // If the variable already exists set the do split local to 0 if _rc != 0 loc dosplit 0 // If it doesn't exist set do split to 1 else loc dosplit 1 } // End ELSE Block for present split option // Check for a non-missing pstub argument if !mi(`"`pstub'"') { // Parses the pstub option mata: getarg("`pstub'") // Store the pstubn loc prvar `argval' // Check to see if predict stub variable is present cap confirm new v `argval'all // If the variable exists if _rc != 0 { // Display an error message di as err "The variable `argval'all already exists. You " /// "can drop the variable, or specify a new predict value stubname." // Throw an error and exit err 110 } // End IF Block for existing `pstub'all variable // Check to see if the predicted variable is present cap confirm new v `argval' // If the variable exists if _rc != 0 { // Display an error message di as err "The variable `argval' already exists. You can drop " /// "the variable, or specify a new predict value stubname." // Throw an error and exit err 110 } // End IF Block for existing `pstub'all variable } // End IF Block for non-missing pstub argument // If pstub is missing else { // If the retain option is triggered if !mi(`"`retain'"') { // Confirm whether or not xvpred already exists cap confirm new v _xvpred _xvpredall // If these variables don't already exist if _rc == 0 { // Use xvpred as the default name loc prvar _xvpred } // End IF Block for default predicted value variable name // Otherwise else { // Get the current date/time stamp loc cdt `= tc(`"`c(current_date)' `c(current_time)'"')' // Add the current date time as a suffix to make the default // predicted variable name unique loc prvar _xvpred`: di substr(strofreal(`cdt', "%15.0g"), 1, 12)' } // End ELSE Block when the default predicted variable name is used } // End IF Block for non-missing retain } // End ELSE Block for missing pstub // Set the predict stub to use the tempvar loc pstub "pstub(`xvpred')" // If the seed option is populated set the seed value to the seed that the // user specified if !mi(`"`seed'"') { // Parse the seed option mata: getarg("`seed'") // Set the seed to the user specified value set seed `argval' } // End IF Block to set the pseudo-random number generator seed. // Gets any estimates that already exist qui: estimates dir // Stores the existing estimate names in a global for predictit glo xvstartest `r(names)' // Check to see if the user passed the state option if !mi(`"`state'"') { // Call the state command `state' // Capture all of the returned values in locals loc rng `r(rng)' loc rngcurrent `r(rngcurrent)' loc rngstate `r(rngstate)' loc rngseed `r(rngseed)' loc rngstream `r(rngstream)' loc filename `r(filename)' loc filedate `r(filedate)' loc version `r(version)' loc currentdate `r(currentdate)' loc currenttime `r(currenttime)' loc stflavor `r(stflavor)' loc processors `r(processors)' loc hostname `r(hostname)' loc machinetype `r(machinetype)' } // End IF Block to call the state command // If the split variable doesn't exist if `dosplit' { // Split the dataset into train/test or train/validation/test splits splitit `props' `ifin', `uid' `tpoint' `kfold' `split' loo // Capture the returned values so they can be returned at the end loc splitter `r(splitter)' loc training `r(training)' loc validation `r(validation)' loc testing `r(testing)' loc stype `r(stype)' loc flavor `r(flavor)' loc forecastset `r(forecastset)' } // End IF Block to create split variable // Call the command to fit the model to the data fitit `"`cmd'"', `split' `results' `kfold' `noall' `display' na(`fitnm') // Capture the macros that get returned loc estresnames `e(estresnames)' loc estresall `e(estresall)' // Predict the outcomes using the model fits predictit, `pstub' `split' `classes' `kfold' `threshold' `noall' /// `pmethod' `popts' // Compute the validation metrics for the LOO sample validateit, `metric' `pstub' `split' `monitors' `display' `kfold' /// loo na(`valnm') `noall' // Loops over the names of the scalars created by validate it foreach i in `r(allnames)' { // Returns all of the scalars in e() eret sca `i' = r(`i') } // End Loop over the returned scalars // Need to assign returned matrix to a new matrix mat xv = r(xv) // If the user doesn't want to retain the results if mi(`"`retain'"') { // Drop the stored estimation results `dropresults' // Drop the variables created by xvloo drop `dropvars' // Clears all of the characteristics that may have been set char _dta[rng] char _dta[rngcurrent] char _dta[rngstate] char _dta[rngseed] char _dta[rngstream] char _dta[filename] char _dta[filedate] char _dta[version] char _dta[currentdate] char _dta[currenttime] char _dta[stflavor] char _dta[processors] char _dta[hostname] char _dta[machinetype] char _dta[predifin] char _dta[kfpredifin] char _dta[modcmd] char _dta[kfmodcmd] } // End IF Block remove results generated by the program // If the user wants to retain the results else { // Reassign the temp splitvar to the user requested or default only when // we are already splitting the data. if `dosplit' qui: clonevar `spvar' = `xvsplit' // Reassign the temp pstub to the user requested name qui: clonevar `prvar' = `xvpred' // If the all option is missing if mi(`"`noall'"') qui: clonevar `prvar'all = `xvpred'all // Return all of the macros from the state command if invoked eret loc rng = "`rng'" eret loc rngcurrent = "`rngcurrent'" eret loc rngstate = "`rngstate'" eret loc rngseed = "`rngseed'" eret loc rngstream = "`rngstream'" eret loc filename = "`filename'" eret loc filedate = "`filedate'" eret loc version = "`version'" eret loc currentdate = "`currentdate'" eret loc currenttime = "`currenttime'" eret loc stflavor = "`stflavor'" eret loc processors = "`processors'" eret loc hostname = "`hostname'" eret loc machinetype = "`machinetype'" // Return the macros from splitit if `dosplit' eret loc splitter = "`spvar'" else eret loc splitter = "`splitter'" eret loc training = "`training'" eret loc validation = "`validation'" eret loc testing = "`testing'" eret loc stype = "Leave One Out" if mi("`flav'") eret loc flavor = "`flavor'" else eret loc flavor = "`flav'" eret loc forecastset = "`forecastset'" // Then return the macros from fitit eret loc estresnames = "`estres'" eret loc estresall = "`estresall'" eret loc fitnm = "`fitnm'" // Return macros related to validation eret loc valnm = "`valnm'" } // End ELSE Block to return a few extra macros related to stored results // Remember to repost results ereturn repost // Returns the matrix containing all of the validation/test metrics and // monitors eret mat xv = xv // Check to see if the data are survey set if !mi(`"`: char _dta[_svy_version]'"') { // Add a warning message di as res "WARNING: {help xvloo} does not account for survey " /// "sample designs when splitting the data and does not use the " /// "{help svy:subpop} option when fitting the model." } // End IF Block to display survey data warning // End definition of ttsplit prefix command end