/*******************************************************************************
*                                                                              *
*              Handles splitting the data set into train/test,                 *
*              train/validation/test, or K-Fold samples.                       *
*                                                                              *
*******************************************************************************/

*! splitit
*! v 0.0.11
*! 28FEB2024

// Drop program from memory if already loaded
cap prog drop splitit

/*******************************************************************************
* splitit <train proportion> [<validation proportion>] [if] [in] [, options]   *
*                                                                              *
* Options:                                                                     *
*   uid(varlist)  - cluster identifiers; whole clusters are assigned to a      *
*                   single split.                                              *
*   tpoint(#)     - time point; records with a time value after it are moved   *
*                   to a forecast-sample variable named `split'xv4.            *
*                   Requires xt/tsset data.                                    *
*   kfold(#)      - number of folds in the training set (default 1).           *
*   split(name)   - name of the new split variable (default _xvsplit).         *
*   loo           - with kfold() > 1, assign one tagged unit per fold.         *
*                                                                              *
* Returns in r():                                                              *
*   splitter, training, validation, testing, stype, flavor, forecastset        *
*                                                                              *
* Side effects: creates the variable named in split() (and `split'xv4 for      *
* panel cases) and defines/modifies the value label _splitvar.                 *
*******************************************************************************/
prog def splitit, rclass sortpreserve

	// Version statement
	version 15

	// Syntax for the splitit subroutine
	syntax anything(name = props id = "Split proportion(s)") [if] [in] [,  ///
		   Uid(varlist) TPoint(string asis) KFold(integer 1)               ///
		   SPLit(string asis) loo ]

	// Test for invalid KFold option
	if `kfold' < 1 {

		// Display an error message
		di as err "There must always be at least 1 K-Fold.  This would be " ///
		"the training set in a simple train/test split.  You specified "    ///
		"`kfold' K-Folds."

		// Return error code and exit
		error 198

	} // End IF Block for invalid K-Fold argument

	// Mark the sample to handle any if/in arguments (can now pass if `touse')
	// for the downstream work to handle user specified if/in conditions.
	marksample touse

	// First we'll check/verify that appropriate arguments are passed to the
	// parameters and handle as much defensive stuff up front as possible.

	// Tokenize the first argument; anything after the first token lands in
	// validate (with its leading white space)
	gettoken train validate: props

	// Validate that the train value is numeric
	if !ustrregexm("`train'", "^[\d\.]+[\d]*\$") {

		// Display an error message
		di as err "Only numeric values can be passed for split proportions."

		// Throw an error code
		error 121

	} // End IF Block for invalid training split value

	// Check validation split value
	if !mi("`validate'") & !ustrregexm("`validate'", "^\s*[\d\.]+[\d]*\$") {

		// Display an error message
		di as err "Only numeric values can be passed for split proportions."

		// Throw an error code
		error 121

	} // End IF Block for invalid validation split value

	// Set a macro for label use later to define the type of splitting
	if `: word count `props'' == 1 {

		// Define this as K-Fold if they want multiple cross-validation folds
		if `kfold' > 1 loc stype "K-Fold Train/Test Split"

		// Otherwise, plain train/test split
		else loc stype "Train/Test Split"

	} // End IF Block for train/test split types

	// If there are two thresholds it is tvt
	if `: word count `props'' == 2 {

		// Define the split type macro to include K-Fold if the user wants that
		if `kfold' > 1 loc stype "K-Fold Train/Validate/Test Split"

		// Set the split type macro to indicate train, validation, test split
		else loc stype "Train/Validate/Test Split"

		// Replace the validate macro with the sum of train and validate, so it
		// becomes the cumulative upper threshold on the random uniform
		loc validate `= `train' + `validate''

	} // End IF Block for train, validation, test split

	// Test if the user is requesting assigning all the data to the training set
	// without using K-Fold cv (effectively not splitting the data at all)
	if `: word 1 of `props'' == 1 & `kfold' == 1 {

		// Display error message to the screen
		di as err "You cannot assign all of the data to a single training split."

		// Return error code and exit
		error 198

	} // End IF Block for invalid training proportion for non-K-Fold case

	// Define the flavor of the splits based on how the units are allocated
	if !mi(`"`uid'"') & !mi("`tpoint'") loc flavor "Clustered & Panel Sample"
	else if !mi(`"`uid'"') & mi("`tpoint'") loc flavor "Clustered Random Sample"
	else if mi(`"`uid'"') & !mi("`tpoint'") loc flavor "Panel Unit Sample"
	else if mi(`"`uid'"') & mi("`tpoint'") loc flavor "Simple Random Sample"

	// Allocate tempvars for xt/group splitting
	tempvar tag sgrp sgrp2 uni

	// Determine if the values are not proportions
	if `train' > 1 {

		// If not a proportion issue an error message
		di as err "Splits must be specified as proportions of the sample."

		// Return error code
		error 198

	} // End of IF Block for non-proportion splits

	// Now test for invalid combination of splits
	if !mi("`validate'") {

		// Test if the sum of the split proportions is greater than unity
		if `validate' > 1 {

			// Print error message
			di as err "Invalid validation/test split.  The proportion is > 1."

			// Return error code
			error 198

		} // End IF Block for invalid validation proportion

	} // End IF Block for proportions that sum to greater than unity

	// Require an argument for split if the user wants a validation and test
	// split
	if !mi("`validate'") & mi(`"`split'"') {

		// Check to see if _xvsplit is already defined
		cap confirm v _xvsplit

		// If the variable exists
		if _rc == 0 {

			// If no varname is passed to split
			di as err "New varname required for validation/test splits if _xvsplit already exists."

			// Return error code
			error 100

		} // End IF Block for existing split variable defined

	} // End IF Block for new varname requirement for tvt splits

	// If no variable name is passed to split use _xvsplit
	if mi("`split'") loc split _xvsplit

	// If tpoint is used expect that the data are xt/tsset
	if !mi(`"`tpoint'"') {

		// If not xt/ts set
		if mi(`"`: char _dta[tis]'"') {

			// Display an error message
			di as err "Data required to be xt/tsset when using tpoint."

			// Return an error code
			error 459

		} // End IF Block for non-xt/tsset data with panel data arguments

		// Store the panel variable in ivar
		loc ivar `: char _dta[iis]'

		// Store the time variable in tvar
		loc tvar `: char _dta[tis]'

		// Test if the `uid' parameter has an argument and if so if it includes
		// the panel variable, when there is a panel variable
		if !mi(`"`uid'"') & !`: list ivar in uid' & !mi(`"`ivar'"') {

			// Test to see if the panel variable is nested within the clusters
			mata: st_local("nested", strofreal(isnested("`uid' `ivar'", "`touse'")))

			// If the panel variable is not nested within the user defined
			// clusters
			if `nested' == 0 {

				// Return an error message
				di as err "The panel variable must be nested within the"     ///
				" clusters identified in: `uid'."

				// Return an error code and exit
				error 459

			} // End IF Block for non-nested panel vars within clusters

			// If the panel variable is nested, add it to the cluster ID macro
			else loc uid `uid' `ivar'

		} // End IF Block for missing panel var in uid

	} // End IF Block to check for the time point option

	/***************************************************************************
	* This is the section where we create a marker to identify how we will     *
	* split the records.  For hierarchical/panel data, we need to assign whole *
	* clusters of observations, while cross-sectional data can split all of    *
	* the records.  The temporary variable `tag' is used to mark the obs in    *
	* conjunction with any if/in expressions passed by the user.               *
	***************************************************************************/

	// Test for presence of sampling unit id if provided
	if !mi(`"`uid'"') {

		// Confirm the variable exists and let this handle returning the error
		confirm v `uid'

		// Tag one record per cluster.  The tagging is the same whether or not
		// a time point was supplied, so a single statement covers both the
		// hierarchical and the clustered panel cases.
		qui: egen byte `tag' = tag(`uid') if `touse'

	} // End IF Block to verify variable in uid if specified

	// Handle the case where we use the xtset info for the xt case
	else if mi(`"`uid'"') & !mi("`tpoint'") {

		// If a panel ID variable is defined by xtset
		if !mi(`"`ivar'"') {

			// If the panel variable exists, flag an individual case per panel
			// unit
			qui: egen byte `tag' = tag(`ivar') if `touse'

		} // End IF Block for panel data

		// If this is a timeseries instead of a panel data set:
		else {

			// Create the tag for the timeseries including all obs
			// NOTE(review): this uses a strict < while the forecast samples in
			// the panel branches use tvar > tpoint, so records at exactly
			// tpoint are left untagged here -- confirm this is intended.
			qui: g byte `tag' = 1 if `touse' & `tvar' < `tpoint'

		} // End ELSE Block for time series

	} // End IF block for xtset based splits

	// Create the tag variable for non xt/hierarchical cases
	else {

		// Create the tag for cases that don't involve clustering or panels
		qui: g byte `tag' = 1 if `touse'

	} // End ELSE Block for non-clustered/panel/timeseries sampling

	// Generate a random uniform in [0, 1] for the tagged observations
	qui: g double `uni' = runiform() if `touse' & `tag' == 1

	/***************************************************************************
	* This is the section where the splits get defined now that we've ID'd the *
	* way we will allocate the observations/clusters.                          *
	***************************************************************************/

	// For the kfold case, we'll use xtile on the random uniform to create the
	// groups
	if `kfold' != 1 & mi("`loo'") {

		// Generate the split group tempvar to create `kfold' equal groups
		xtile `sgrp' = `uni' if `touse' & `tag' == 1 & `uni' <= `train', 	 ///
		n(`kfold')

		// Define the training splits
		mata: st_local("trainsplit", invtokens(strofreal(1..`kfold')))

		// Set number of levels for the splits
		deflabs, val(`trainsplit') t(Training)

		// If there is no validation split
		if mi("`validate'") {

			// Define the test split
			loc testsplit `= `kfold' + 1'

			// Add the testsplit ID to the variable for the test cases
			qui: replace `sgrp' = `testsplit' if `touse' & `tag' == 1 & 	 ///
								  `uni' > `train' & !mi(`uni')

			// Generate the value label for the test split
			deflabs, val(`testsplit') t(Test)

		} // End IF Block for KFold CV train/test split

		// If the user also wants to use kfold for a validation set as well:
		else {

			// Create a macro with the validation splits
			loc validsplit `= `kfold' + 1'

			// Set the value for the test set
			loc testsplit `= `validsplit' + 1'

			// Add the validation group to the existing variable
			qui: replace `sgrp' = `validsplit' if `touse' & `tag' == 1 & 	 ///
								  (`uni' > `train' & `uni' <= `validate')

			// Create the test split in a similar fashion
			qui: replace `sgrp' = `testsplit' if `touse' & `tag' == 1 & 	 ///
								  (`uni' > `validate')

			// Generate value labels for the validation set
			deflabs, val(`validsplit') t(Validation)

			// Generate value labels for the test set
			deflabs, val(`testsplit') t(Test)

		} // End ELSE Block for kfold CV with validation and test splits

	} // End IF block to handle splitting the training set

	// For Leave-One-Out cross-validation splits
	else if `kfold' > 1 & !mi("`loo'") {

		// Sort the data so all of the tagged cases appear first and the random
		// uniform is sorted in ascending order
		qui: gsort -`tag' -`touse' +`uni'

		// Now the _n should correspond with the order of the random uniform
		// value and won't produce duplicates.  We'll use a long here just to
		// be safe but will compress before returning from the command.
		qui: g long `sgrp' = _n if `touse' & `tag' == 1 & `uni' <= `train' & ///
								  _n <= `kfold'

		// Define the training splits
		mata: st_local("trainsplit", invtokens(strofreal(1..`kfold')))

		// Set number of levels for the splits
		deflabs, val(`trainsplit') t(Training)

		// If there is no validation split
		if mi("`validate'") {

			// Define the test split
			loc testsplit `= `kfold' + 1'

			// Add the testsplit ID to the variable for the test cases.  There
			// is no `uni' > `train' condition here: every tagged unit that was
			// not given its own fold above goes to the test split.
			qui: replace `sgrp' = `testsplit' if `touse' & `tag' == 1 & 	 ///
								  mi(`sgrp') & !mi(`uni')

			// Generate the value label for the test split
			deflabs, val(`testsplit') t(Test)

		} // End IF Block for KFold CV train/test split

		// If the user also wants to use kfold for a validation set as well:
		else {

			// Create a macro with the validation splits
			loc validsplit `= `kfold' + 1'

			// Set the value for the test set
			loc testsplit `= `validsplit' + 1'

			// Add the validation group to the existing variable
			qui: replace `sgrp' = `validsplit' if `touse' & `tag' == 1 & 	 ///
					mi(`sgrp') & (`uni' > `train' & `uni' <= `validate')

			// Create the test split in a similar fashion
			qui: replace `sgrp' = `testsplit' if `touse' & `tag' == 1 & 	 ///
					mi(`sgrp') & (`uni' > `validate')

			// Generate value labels for the validation set
			deflabs, val(`validsplit') t(Validation)

			// Generate value labels for the test set
			deflabs, val(`testsplit') t(Test)

		} // End ELSE Block for kfold CV with validation and test splits

		// Compress the group identifier ONLY.  The macro name must be spelled
		// correctly: an undefined macro expands to nothing, and a bare
		// -compress- would recast every variable in the user's data set.
		qui: compress `sgrp'

	} // End ELSEIF block for the LOO-CV case

	// For the other cases we can generate the train and validation splits
	// in a single step
	else {

		// For train, validate, test splits:
		if !mi("`validate'") {

			// Create the split indicator for the training, validation, and
			// test set
			qui: g byte `sgrp' = 											 ///
				cond(`touse' & `tag' == 1 & `uni' <= `train', 1,			 ///
				cond(`touse' & `tag' == 1 & `uni' > `train' &				 ///
					 `uni' <= `validate' & !mi(`uni'), 2, 					 ///
				cond(`touse' & `tag' == 1 & `uni' > `validate' & 			 ///
					 !mi(`uni'), 3, .)))

			// Generate value labels for the training set ID
			deflabs, val(1) t(Training)

			// Generate value labels for the validation set ID
			deflabs, val(2) t(Validation)

			// Generate value labels for the test set ID
			deflabs, val(3) t(Test)

			// Stores the values of the split variable that identify the
			// training split
			loc trainsplit 1

			// Stores the value of the split variable for the validation split
			loc validsplit 2

			// Stores the value of the split variable for the test split
			loc testsplit 3

		} // End IF Block for TVT Split

		// Otherwise:
		else {

			// Create the split indicator for training and test sets
			qui: g byte `sgrp' = 											 ///
				cond(`touse' & `tag' == 1 & `uni' <= `train', 1, 			 ///
				cond(`touse' & `tag' == 1 & `uni' > `train' & 				 ///
					 !mi(`uni'), 2, .))

			// Generate value labels for the training set ID
			deflabs, val(1) t(Training)

			// Generate value labels for the test set ID
			deflabs, val(2) t(Test)

			// Stores the values of the split variable that identify the
			// training split
			loc trainsplit 1

			// Stores the value of the split variable for the test split
			loc testsplit 2

		} // End ELSE Block for TT split

	} // End IF block for train/validation/test splits

	/***************************************************************************
	* This is the section where we will handle populating the split ID record  *
	* for cases involving hierarchical/clustered sampling, panel/timeseries, & *
	* combinations of the two cases, since we only assigned split IDs to a     *
	* single record per cluster/group above.                                   *
	***************************************************************************/

	// Handle populating the split ID for hierarchical cases/clustered splits
	if !mi("`uid'") {

		// This should fill in the split group ID assignment for the case of
		// hierarchical splitting.  Sorting on `sgrp' within cluster puts the
		// tagged (non-missing) record first so it can be carried forward.
		qui: bys `uid' (`sgrp'): replace `sgrp' = `sgrp'[_n - 1] if `touse'  ///
								 & mi(`sgrp'[_n]) & !mi(`sgrp'[_n - 1])

		// For clustered sampling with panel/timeseries data
		if !mi("`tpoint'") {

			// Create a new variable to identify the corresponding forecast
			// sample.  A long is used because fold IDs can exceed the largest
			// value a byte can store (100) when K is large or with LOO.
			qui: g long `split'xv4 = `sgrp' if `touse' & `tvar' > `tpoint'

			// Compress the forecast identifier to the smallest safe type
			qui: compress `split'xv4

			// Label the variable
			la var `split'xv4 "Forecasting sample for the corresponding split"

			// Then unflag those records from the main sample
			qui: replace `sgrp' = . if `touse' & `tvar' > `tpoint'

		} // End IF Block for timeseries/panel cases

	} // End IF Block to fill things in for hierarchical splits

	// Handle timeseries/panel case without additional hierarchy specified
	else if mi("`uid'") & !mi("`tpoint'") & !mi(`"`ivar'"') {

		// This should fill in the split group ID assignment for the panel
		// units, carrying the tagged record's ID to the rest of the panel
		qui: bys `ivar' (`sgrp'): replace `sgrp' = `sgrp'[_n - 1] if `touse' ///
								  & mi(`sgrp'[_n]) & !mi(`sgrp'[_n - 1])

		// Create the forecast identifier
		qui: g long `split'xv4 = `sgrp' if `touse' & `tvar' > `tpoint'

		// Compress the forecast identifier
		qui: compress `split'xv4

		// Label the variable
		la var `split'xv4 "Forecasting sample for the corresponding split"

		// And now replace the split variables with missings for the forecast
		// sample
		qui: replace `sgrp' = . if `touse' & `tvar' > `tpoint'

	} // End ELSEIF Block for panel/timeseries data with a specified panel var

	// Create a variable label for the split IDs
	la var `sgrp' `"`stype' Identifiers"'

	// For the last step we'll move the values from the tempvar into the
	// permanent variable (which could have happened earlier)
	clonevar `split' = `sgrp' if `touse'

	// Apply the value label to the split group variable
	la val `split' _splitvar

	// Set an r macro with the variable name with the split variable to make
	// sure it can be cleaned up by the calling command later in the process
	ret loc splitter = "`split'"

	// Return the IDs that identify the training splits
	ret loc training = "`trainsplit'"

	// Return the ID that identifies the validation split
	if !mi("`validsplit'") ret loc validation = `validsplit'

	// Return the ID that identifies the test split
	if !mi("`testsplit'") ret loc testing = `testsplit'

	// Return the type of split
	ret loc stype = `"`stype'"'

	// Return the flavor of the split
	ret loc flavor = `"`flavor'"'

	// If using for panel/timeseries return the forecast variable name
	// NOTE(review): in the pure timeseries case (no panel variable and no uid)
	// no forecast variable is generated above, yet the name is still returned
	// here -- confirm callers check that the variable exists.
	if !mi("`tpoint'") ret loc forecastset = "`split'xv4"

// End of program definitions
end

/*******************************************************************************
* Subroutine to define value labels for the split identifier.                  *
*                                                                              *
*   values(numlist) - one or more positive integer split IDs to label.         *
*   type(string)    - name of the split type (e.g., Training, Test).           *
*                                                                              *
* Adds to/modifies the value label named _splitvar.  A single ID is labeled    *
* "<type> Split"; multiple IDs are labeled "<type> Split #<id>".               *
*******************************************************************************/
prog def deflabs

	// Declares the syntax for this subroutine
	syntax, VALues(numlist integer min = 1 > 0) Type(string asis)

	// If there is only a single ID passed to the command generate this style of
	// value label for that split type
	if `: word count `values'' == 1 la def _splitvar `values' "`type' Split", modify

	// If multiple ID values are passed loop over them and construct the split
	// labels like this
	else {

		// Loop over the values in the numlist
		foreach i in `values' {

			// Generate a new value label with the split IDs
			la def _splitvar `i' "`type' Split #`i'", modify

		} // End Loop over the range

	} // End ELSE Block for multiple values

// End sub-sub-routine for other label types
end