********************************************************************************
*! "r_ml_stata_cv"
*! Author: Giovanni Cerulli
*! Version: 9
*! Date: 13 February 2023 
********************************************************************************

********************************************************************************
* The program 'numlist_to_matrix' puts a Stata "numlist" into a one-row Stata
* "matrix". We use this program to pass hyper-parameter grids to Python.
********************************************************************************
cap prog drop numlist_to_matrix
* Convert a Stata numlist into a 1 x K row matrix, returned as r(M).
* Used to hand hyper-parameter grids over to the Python routines.
program numlist_to_matrix , rclass
syntax , num_list(numlist min=1)
local n_elements : word count `num_list'
tempname ROW
* Start from a row vector of zeros, then fill each cell in turn
matrix `ROW' = J(1,`n_elements',0)
local j = 1
foreach element of local num_list {
	matrix `ROW'[1,`j'] = `element'
	local j = `j'+1
}
return matrix M = `ROW'
end
********************************************************************************

*

********************************************************************************
* Display output for "r_ols"
********************************************************************************
cap program drop display_output_r_ols
* Display cross-validation results for the "ols" (least squares) learner.
* Reads e() results presumably posted by the Python estimation step
* (posting code is not in this chunk -- verify against caller):
*   e(dep_var), e(N_features), e(N_train_all), e(N_test_all),
*   e(N_train_used), e(N_test_used), e(N_FOLDS),
*   e(TRAIN_ACCURACY), e(TEST_ACCURACY), e(SE_TEST_ACCURACY).
* Option 'prediction': when set, also prints the out-of-sample validation
* metrics e(Train_mse), e(Test_mse), e(Train_mape), e(Test_mape).
program display_output_r_ols
syntax , [prediction] 

noi di "{hline 80}"
noi di in gr "{bf:Learner: Least Squares regression}"
di " "

* Dataset information: sample sizes and feature count, two columns per row
noi di in gr "{ul:Dataset information}"
di " "
noi di "Target variable = " `""`e(dep_var)'""'    _continue
noi di _col(45) "Number of features  = " e(N_features)
noi di "N. of training units = " e(N_train_all) _continue
noi di _col(45) "N. of testing units = " e(N_test_all)
noi di "N. of used training units = " e(N_train_used) _continue
noi di _col(45) "N. of used testing units = " e(N_test_used)
noi di "{hline 80}"
di " "

* Cross-validation accuracy (explained variance) over e(N_FOLDS) folds
noi di "{ul:Cross-validation results}"
di " "
noi di "Accuracy measure = explained variance" _continue
noi di _col(45) "Number of folds = " e(N_FOLDS)
noi di "Training accuracy = " e(TRAIN_ACCURACY) _continue
noi di _col(45) "Testing accuracy = " e(TEST_ACCURACY)
noi di "Std. err. test accuracy = " e(SE_TEST_ACCURACY)
noi di "{hline 80}"

* Optional validation metrics, printed only when 'prediction' was requested
if "`prediction'"!=""{
di " "
noi di in gr "{ul:Validation results}"
di " "
noi di "MSE = mean squared error" _continue
noi di _col(45) "MAPE = mean absolute percentage error"
noi di "Training MSE = " e(Train_mse) _continue
noi di _col(45) "Testing MSE = " e(Test_mse)
noi di "Training MAPE % = " e(Train_mape) _continue
noi di _col(45) "Testing MAPE % = " e(Test_mape)
noi di " "
noi di "{hline 80}"
}
end 
********************************************************************************

********************************************************************************
* Display output for "r_tree"
********************************************************************************
cap program drop display_output_r_tree
* Display cross-validation results for the "tree" (regression tree) learner.
* Reads e() results presumably posted by the Python estimation step
* (posting code is not in this chunk -- verify against caller); in addition
* to the common results it reads e(BEST_INDEX) and e(OPT_DEPTH), the
* cross-validated optimal tree depth.
* Option 'prediction': when set, also prints the out-of-sample validation
* metrics e(Train_mse), e(Test_mse), e(Train_mape), e(Test_mape).
program display_output_r_tree
syntax , [prediction] 

noi di "{hline 80}"
noi di in gr "{bf:Learner: Tree regression}"
di " "

* Dataset information: sample sizes and feature count, two columns per row
noi di in gr "{ul:Dataset information}"
di " "
noi di "Target variable = " `""`e(dep_var)'""'    _continue
noi di _col(45) "Number of features  = " e(N_features)
noi di "N. of training units = " e(N_train_all) _continue
noi di _col(45) "N. of testing units = " e(N_test_all)
noi di "N. of used training units = " e(N_train_used) _continue
noi di _col(45) "N. of used testing units = " e(N_test_used)
noi di "{hline 80}"
di " "

* Cross-validation accuracy and the hyper-parameter selected on the grid
noi di "{ul:Cross-validation results}"
di " "
noi di "Accuracy measure = explained variance" _continue
noi di _col(45) "Number of folds = " e(N_FOLDS)
noi di "Best grid index = " e(BEST_INDEX) _continue
noi di _col(45) "Optimal tree depth = " e(OPT_DEPTH)
noi di "Training accuracy = " e(TRAIN_ACCURACY) _continue
noi di _col(45) "Testing accuracy = " e(TEST_ACCURACY)
noi di "Std. err. test accuracy = " e(SE_TEST_ACCURACY)
noi di "{hline 80}"

* Optional validation metrics, printed only when 'prediction' was requested
if "`prediction'"!=""{
di " "
noi di in gr "{ul:Validation results}"
di " "
noi di "MSE = mean squared error" _continue
noi di _col(45) "MAPE = mean absolute percentage error"
noi di "Training MSE = " e(Train_mse) _continue
noi di _col(45) "Testing MSE = " e(Test_mse)
noi di "Training MAPE % = " e(Train_mape) _continue
noi di _col(45) "Testing MAPE % = " e(Test_mape)
noi di " "
noi di "{hline 80}"
}
end 
********************************************************************************
*
********************************************************************************
* Display output for "r_elasticnet"
********************************************************************************
cap program drop display_output_r_elasticnet
* Display cross-validation results for the "elasticnet" learner.
* Reads e() results presumably posted by the Python estimation step
* (posting code is not in this chunk -- verify against caller); in addition
* to the common results it reads e(BEST_INDEX), e(OPT_ALPHA) (optimal
* penalization) and e(OPT_L1_RATIO) (optimal elastic-net mixing parameter).
* Option 'prediction': when set, also prints the out-of-sample validation
* metrics e(Train_mse), e(Test_mse), e(Train_mape), e(Test_mape).
program display_output_r_elasticnet
syntax , [prediction] 

noi di "{hline 80}"
noi di in gr "{bf:Learner: Elastic Net regression}"
di " "

* Dataset information: sample sizes and feature count, two columns per row
noi di in gr "{ul:Dataset information}"
di " "
noi di "Target variable = " `""`e(dep_var)'""'    _continue
noi di _col(45) "Number of features  = " e(N_features)
noi di "N. of training units = " e(N_train_all) _continue
noi di _col(45) "N. of testing units = " e(N_test_all)
noi di "N. of used training units = " e(N_train_used) _continue
noi di _col(45) "N. of used testing units = " e(N_test_used)
noi di "{hline 80}"
di " "

* Cross-validation accuracy and the hyper-parameters selected on the grid
noi di "{ul:Cross-validation results}"
di " "
noi di "Accuracy measure = explained variance" _continue
noi di _col(45) "Number of folds = " e(N_FOLDS)
noi di "Best grid index = " e(BEST_INDEX) _continue
noi di _col(45) "Optimal penalization parameter = " e(OPT_ALPHA)
noi di "Optimal elastic parameter = " e(OPT_L1_RATIO) _continue
noi di _col(45) "Training accuracy = " e(TRAIN_ACCURACY)
noi di "Testing accuracy = " e(TEST_ACCURACY) _continue
noi di _col(45) "Std. err. test accuracy = " e(SE_TEST_ACCURACY)
noi di "{hline 80}"

* Optional validation metrics, printed only when 'prediction' was requested
if "`prediction'"!=""{
di " "
noi di in gr "{ul:Validation results}"
di " "
noi di "MSE = mean squared error" _continue
noi di _col(45) "MAPE = mean absolute percentage error"
noi di "Training MSE = " e(Train_mse) _continue
noi di _col(45) "Testing MSE = " e(Test_mse)
noi di "Training MAPE % = " e(Train_mape) _continue
noi di _col(45) "Testing MAPE % = " e(Test_mape)
noi di " "
noi di "{hline 80}"
}
end 
********************************************************************************
*
********************************************************************************
* Display output for "r_svm"
********************************************************************************
cap program drop display_output_r_svm
* Display cross-validation results for the "svm" (support vector machine)
* learner. Reads e() results presumably posted by the Python estimation step
* (posting code is not in this chunk -- verify against caller); in addition
* to the common results it reads e(BEST_INDEX), e(OPT_C) and e(OPT_GAMMA),
* the cross-validated optimal C and gamma parameters.
* Option 'prediction': when set, also prints the out-of-sample validation
* metrics e(Train_mse), e(Test_mse), e(Train_mape), e(Test_mape).
program display_output_r_svm
syntax , [prediction] 

noi di "{hline 80}"
noi di in gr "{bf:Learner: Support Vector Machine regression}"
di " "

* Dataset information: sample sizes and feature count, two columns per row
noi di in gr "{ul:Dataset information}"
di " "
noi di "Target variable = " `""`e(dep_var)'""'    _continue
noi di _col(45) "Number of features  = " e(N_features)
noi di "N. of training units = " e(N_train_all) _continue
noi di _col(45) "N. of testing units = " e(N_test_all)
noi di "N. of used training units = " e(N_train_used) _continue
noi di _col(45) "N. of used testing units = " e(N_test_used)
noi di "{hline 80}"
di " "

* Cross-validation accuracy and the hyper-parameters selected on the grid
noi di "{ul:Cross-validation results}"
di " "
noi di "Accuracy measure = explained variance" _continue
noi di _col(45) "Number of folds = " e(N_FOLDS)
noi di "Best grid index = " e(BEST_INDEX) _continue
noi di _col(45) "Optimal C parameter = " e(OPT_C)
noi di "Optimal GAMMA parameter = " e(OPT_GAMMA) _continue
noi di _col(45) "Training accuracy = " e(TRAIN_ACCURACY)
noi di "Testing accuracy = " e(TEST_ACCURACY) _continue
noi di _col(45) "Std. err. test accuracy = " e(SE_TEST_ACCURACY)
noi di "{hline 80}"

* Optional validation metrics, printed only when 'prediction' was requested
if "`prediction'"!=""{
di " "
noi di in gr "{ul:Validation results}"
di " "
noi di "MSE = mean squared error" _continue
noi di _col(45) "MAPE = mean absolute percentage error"
noi di "Training MSE = " e(Train_mse) _continue
noi di _col(45) "Testing MSE = " e(Test_mse)
noi di "Training MAPE % = " e(Train_mape) _continue
noi di _col(45) "Testing MAPE % = " e(Test_mape)
noi di " "
noi di "{hline 80}"
}
end 
********************************************************************************
*
********************************************************************************
* Display output for "r_randomforest"
********************************************************************************
cap program drop display_output_r_randomforest
* Display cross-validation results for the "randomforest" learner.
* Reads e() results presumably posted by the Python estimation step
* (posting code is not in this chunk -- verify against caller); in addition
* to the common results it reads e(BEST_INDEX), e(OPT_MAX_DEPTH),
* e(OPT_MAX_FEATURES) and e(OPT_N_ESTIMATORS), the cross-validated
* optimal tree depth, number of splitting features, and number of trees.
* Option 'prediction': when set, also prints the out-of-sample validation
* metrics e(Train_mse), e(Test_mse), e(Train_mape), e(Test_mape).
program display_output_r_randomforest
syntax , [prediction] 

noi di "{hline 80}"
noi di in gr "{bf:Learner: Random Forest regression}"
di " "

* Dataset information: sample sizes and feature count, two columns per row
noi di in gr "{ul:Dataset information}"
di " "
noi di "Target variable = " `""`e(dep_var)'""'    _continue
noi di _col(45) "Number of features  = " e(N_features)
noi di "N. of training units = " e(N_train_all) _continue
noi di _col(45) "N. of testing units = " e(N_test_all)
noi di "N. of used training units = " e(N_train_used) _continue
noi di _col(45) "N. of used testing units = " e(N_test_used)
noi di "{hline 80}"
di " "

* Cross-validation accuracy and the hyper-parameters selected on the grid
noi di "{ul:Cross-validation results}"
di " "
noi di "Accuracy measure = explained variance" _continue
noi di _col(45) "Number of folds = " e(N_FOLDS)
noi di "Best grid index = " e(BEST_INDEX) _continue
noi di _col(45) "Optimal tree depth = " e(OPT_MAX_DEPTH)
noi di "Optimal n. of splitting features = " e(OPT_MAX_FEATURES) _continue
noi di _col(45) "Optimal n. of trees = " e(OPT_N_ESTIMATORS) 
noi di "Training accuracy = " e(TRAIN_ACCURACY) _continue
noi di _col(45) "Testing accuracy = " e(TEST_ACCURACY)
noi di "Std. err. test accuracy = " e(SE_TEST_ACCURACY)
noi di "{hline 80}"

* Optional validation metrics, printed only when 'prediction' was requested
if "`prediction'"!=""{
di " "
noi di in gr "{ul:Validation results}"
di " "
noi di "MSE = mean squared error" _continue
noi di _col(45) "MAPE = mean absolute percentage error"
noi di "Training MSE = " e(Train_mse) _continue
noi di _col(45) "Testing MSE = " e(Test_mse)
noi di "Training MAPE % = " e(Train_mape) _continue
noi di _col(45) "Testing MAPE % = " e(Test_mape)
noi di " "
noi di "{hline 80}"
}
end 
********************************************************************************
*
********************************************************************************
* Display output for "r_neuralnet"
********************************************************************************
cap program drop display_output_r_neuralnet
* Display cross-validation results for the "neuralnet" learner.
* Reads e() results presumably posted by the Python estimation step
* (posting code is not in this chunk -- verify against caller); in addition
* to the common results it reads e(BEST_INDEX), e(OPT_NEURONS_L_1),
* e(OPT_NEURONS_L_2) and e(OPT_ALPHA), the cross-validated optimal number
* of neurons in layers 1 and 2 and the optimal L2 penalization.
* Option 'prediction': when set, also prints the out-of-sample validation
* metrics e(Train_mse), e(Test_mse), e(Train_mape), e(Test_mape).
program display_output_r_neuralnet
syntax , [prediction] 

noi di "{hline 80}"
* FIX: corrected displayed learner name ("Neaural" -> "Neural")
noi di in gr "{bf:Learner: Neural Network regression}"
di " "

* Dataset information: sample sizes and feature count, two columns per row
noi di in gr "{ul:Dataset information}"
di " "
noi di "Target variable = " `""`e(dep_var)'""'    _continue
noi di _col(45) "Number of features  = " e(N_features)
noi di "N. of training units = " e(N_train_all) _continue
noi di _col(45) "N. of testing units = " e(N_test_all)
noi di "N. of used training units = " e(N_train_used) _continue
noi di _col(45) "N. of used testing units = " e(N_test_used)
noi di "{hline 80}"
di " "

* Cross-validation accuracy and the hyper-parameters selected on the grid
noi di "{ul:Cross-validation results}"
di " "
noi di "Accuracy measure = explained variance" _continue
noi di _col(45) "Number of folds = " e(N_FOLDS)
noi di "Best grid index = " e(BEST_INDEX) _continue
noi di _col(45) "Optimal n. of neurons in layer 1 = " e(OPT_NEURONS_L_1)
noi di "Optimal n. of neurons in layer 2 = " e(OPT_NEURONS_L_2) _continue
noi di _col(45) "Optimal L2 penalization = " e(OPT_ALPHA) 
noi di "Training accuracy = " e(TRAIN_ACCURACY) _continue
noi di _col(45) "Testing accuracy = " e(TEST_ACCURACY) 
noi di "Std. err. test accuracy = " e(SE_TEST_ACCURACY)
noi di "{hline 80}"

* Optional validation metrics, printed only when 'prediction' was requested
if "`prediction'"!=""{
di " "
noi di in gr "{ul:Validation results}"
di " "
noi di "MSE = mean squared error" _continue
noi di _col(45) "MAPE = mean absolute percentage error"
noi di "Training MSE = " e(Train_mse) _continue
noi di _col(45) "Testing MSE = " e(Test_mse)
noi di "Training MAPE % = " e(Train_mape) _continue
noi di _col(45) "Testing MAPE % = " e(Test_mape)
noi di " "
noi di "{hline 80}"
}
end 
********************************************************************************
*
********************************************************************************
* Display output for "r_nearestneighbor"
********************************************************************************
cap program drop display_output_r_nearestneighbor
* Display cross-validation results for the "nearestneighbor" learner.
* Reads e() results presumably posted by the Python estimation step
* (posting code is not in this chunk -- verify against caller); in addition
* to the common results it reads e(BEST_INDEX), e(OPT_NN) (optimal number
* of nearest neighbors) and the macro e(OPT_WEIGHT) (optimal kernel/weight
* function, a string).
* Option 'prediction': when set, also prints the out-of-sample validation
* metrics e(Train_mse), e(Test_mse), e(Train_mape), e(Test_mape).
program display_output_r_nearestneighbor
syntax , [prediction] 

noi di "{hline 80}"
noi di in gr "{bf:Learner: Nearest Neighbor regression}"
di " "

* Dataset information: sample sizes and feature count, two columns per row
noi di in gr "{ul:Dataset information}"
di " "
noi di "Target variable = " `""`e(dep_var)'""'    _continue
noi di _col(45) "Number of features  = " e(N_features)
noi di "N. of training units = " e(N_train_all) _continue
noi di _col(45) "N. of testing units = " e(N_test_all)
noi di "N. of used training units = " e(N_train_used) _continue
noi di _col(45) "N. of used testing units = " e(N_test_used)
noi di "{hline 80}"
di " "

* Cross-validation accuracy and the hyper-parameters selected on the grid
noi di "{ul:Cross-validation results}"
di " "
noi di "Accuracy measure = explained variance" _continue
noi di _col(45) "Number of folds = " e(N_FOLDS)
noi di "Best grid index = " e(BEST_INDEX) _continue
noi di _col(45) "Optimal n. of nearest neighbors = " e(OPT_NN)
noi di "Optimal kernel function = " "`e(OPT_WEIGHT)'" _continue
noi di _col(45) "Training accuracy = " e(TRAIN_ACCURACY)
noi di "Testing accuracy = " e(TEST_ACCURACY) _continue
noi di _col(45) "Std. err. test accuracy = " e(SE_TEST_ACCURACY)
noi di "{hline 80}"

* Optional validation metrics, printed only when 'prediction' was requested
if "`prediction'"!=""{
di " "
noi di in gr "{ul:Validation results}"
di " "
noi di "MSE = mean squared error" _continue
noi di _col(45) "MAPE = mean absolute percentage error"
noi di "Training MSE = " e(Train_mse) _continue
noi di _col(45) "Testing MSE = " e(Test_mse)
noi di "Training MAPE % = " e(Train_mape) _continue
noi di _col(45) "Testing MAPE % = " e(Test_mape)
noi di " "
noi di "{hline 80}"
}
end 
********************************************************************************
*
********************************************************************************
* Display output for "r_boost"
********************************************************************************
cap program drop display_output_r_boost
* Display cross-validation results for the "boost" (boosting) learner.
* Reads e() results presumably posted by the Python estimation step
* (posting code is not in this chunk -- verify against caller); in addition
* to the common results it reads e(BEST_INDEX), e(OPT_LEARNING_RATE),
* e(OPT_N_ESTIMATORS) and e(OPT_MAX_DEPTH), the cross-validated optimal
* learning rate, number of trees, and tree depth.
* Option 'prediction': when set, also prints the out-of-sample validation
* metrics e(Train_mse), e(Test_mse), e(Train_mape), e(Test_mape).
program display_output_r_boost
syntax , [prediction] 

noi di "{hline 80}"
noi di in gr "{bf:Learner: Boosting regression}"
di " "

* Dataset information: sample sizes and feature count, two columns per row
noi di in gr "{ul:Dataset information}"
di " "
noi di "Target variable = " `""`e(dep_var)'""'    _continue
noi di _col(45) "Number of features  = " e(N_features)
noi di "N. of training units = " e(N_train_all) _continue
noi di _col(45) "N. of testing units = " e(N_test_all)
noi di "N. of used training units = " e(N_train_used) _continue
noi di _col(45) "N. of used testing units = " e(N_test_used)
noi di "{hline 80}"
di " "

* Cross-validation accuracy and the hyper-parameters selected on the grid
noi di "{ul:Cross-validation results}"
di " "
noi di "Accuracy measure = explained variance" _continue
noi di _col(45) "Number of folds = " e(N_FOLDS)
noi di "Best grid index = " e(BEST_INDEX) _continue
noi di _col(45) "Optimal learning rate = " e(OPT_LEARNING_RATE)
noi di "Optimal n. of trees = " e(OPT_N_ESTIMATORS) _continue
noi di _col(45) "Optimal tree depth = " e(OPT_MAX_DEPTH)
noi di "Training accuracy = " e(TRAIN_ACCURACY) _continue
noi di _col(45) "Testing accuracy = " e(TEST_ACCURACY) 
noi di "Std. err. test accuracy = " e(SE_TEST_ACCURACY)
noi di "{hline 80}"

* Optional validation metrics, printed only when 'prediction' was requested
if "`prediction'"!=""{
di " "
noi di in gr "{ul:Validation results}"
di " "
noi di "MSE = mean squared error" _continue
noi di _col(45) "MAPE = mean absolute percentage error"
noi di "Training MSE = " e(Train_mse) _continue
noi di _col(45) "Testing MSE = " e(Test_mse)
noi di "Training MAPE % = " e(Train_mape) _continue
noi di _col(45) "Testing MAPE % = " e(Test_mape)
noi di " "
noi di "{hline 80}"
}
end 
********************************************************************************

*
*
*

********************************************************************************
*! "r_ml_stata_cv"
********************************************************************************
program r_ml_stata_cv , eclass
version 16
syntax varlist(numeric) [if] [in] , ///
mlmodel(string) ///
data_test(name)  ///
seed(numlist max=1 integer) ///
[cross_validation(name)  ///
n_folds(numlist max=1 integer >=2) ///
graph_cv save_graph_cv(name) ///
prediction(name) ///
tree_depth(numlist min=1 integer) ///
n_estimators(numlist min=1 integer) ///
learning_rate(numlist min=1) ///
max_features(numlist min=1 integer) ///
c(numlist min=1) ///
gamma(numlist min=1) ///
alpha(numlist min=1) ///
l1_ratio(numlist min=1) ///
nn(numlist min=1 integer) ///
n_neurons_l1(numlist min=1 integer) ///
n_neurons_l2(numlist min=1 integer) ///
default ///
]
********************************************************************************
if "`mlmodel'"!="ols" & "`mlmodel'"!="elasticnet" & "`mlmodel'"!="tree" & "`mlmodel'"!="randomforest" & "`mlmodel'"!="boost" & "`mlmodel'"!="nearestneighbor" & "`mlmodel'"!="neuralnet" & "`mlmodel'"!="svm" {
	di _newline
	di in red "************************************************************************"
	di in red "WARNING: The argument of option 'mlmodel()' must be one of these:       "
	di in red "'ols', 'elasticnet', 'tree', 'randomforest', 'boost', 'nearestneighbor',"       
	di in red "'neuralnet', 'svm'. 												       "
	di in red "************************************************************************"
	di _newline
break
exit	
}
********************************************************************************
if "`c(os)'" == "MacOSX"{
	cap rm _____in_prediction.dta
	cap rm _____out_prediction.dta
	cap rm _____out_sample_y.dta
	cap rm _____out_sample_x.dta
}
else{
	cap erase _____in_prediction.dta
	cap erase _____out_prediction.dta
	cap erase _____out_sample_y.dta
	cap erase _____out_sample_x.dta
}
********************************************************************************
di ""
di as text "=== Begin Python dependencies ======================================================================="
pylearn , setup
di as text "=== End Python dependencies ========================================================================="
di ""
********************************************************************************
set varabbrev off
********************************************************************************
* DROP ALL THE LABEL VALUES AND PUT THEM IN A TEMPFILE
********************************************************************************
tempfile LABELS
qui: label save _all using `LABELS' , replace
qui: label drop _all
********************************************************************************
* SPLIT "varlist" INTO "target" AND "features"
********************************************************************************
gettoken y X : varlist
********************************************************************************
* MARK THE SAMPLE TO USE
********************************************************************************
marksample touse
********************************************************************************
if "`default'"!=""{
r_ml_stata_default `varlist' if `touse' ,  ///
mlmodel("`mlmodel'")            ///
data_test("`data_test'")        ///
prediction("`prediction'")      ///
seed("`seed'") 
exit
}
if "`default'"=="" & ("`cross_validation'"=="" | "`n_folds'"==""){
	di _newline
	di in red "*************************************************************************"
	di in red "WARNING: It seems you want to run this command not in the 'default' mode."
	di in red "It means that you want to use cross-validation. If it is the case,       "                         	
	di in red "provide options 'cross_validation()' and 'n_folds()', plus               " 
	di in red "all the options required by the specific learner you wish to run.        " 
	di in red "On the contrary, if it is your intention to run the default more,        "	
	di in red "only add the option 'default'.                                           "	
	di in red "*************************************************************************"
	di _newline
break
exit
}
********************************************************************************
* WARNING
********************************************************************************
cap confirm file "`cross_validation'.dta"
if _rc==0 {
	di _newline
	di in red "******************************************************************************"
	di in red "WARNING: A file named `cross_validation'.dta exists in your working directory." 
	di in red "Please, change name to this dataset or delete it before running this command. "   
	di in red "******************************************************************************"
	di _newline
break
exit
}
********************************************************************************
* SAVE THE INITIAL DATASET
********************************************************************************
tempfile data_initial 
qui count
local N_train_all=r(N)
tempvar __id
gen `__id'=_n
qui: save `data_initial' , replace
********************************************************************************
* SELECT THE TRAINING SAMPLE
********************************************************************************
qui: keep if `touse'
preserve
keep `__id'
tempfile data_id
qui save `data_id' , replace
restore
********************************************************************************
* WARNING:
********************************************************************************
cap confirm file "_____out_sample_y.dta"
if _rc==0 {
di _newline
di in red "*****************************************************************************"
di in red "WARNING: A file named '_____out_sample_y.dta' exists in your working directory.   " 
di in red "Please, change name to this dataset or delete it before running this command."   
di in red "*****************************************************************************"
di _newline
break
exit
}
********************************************************************************
* WARNING:
********************************************************************************
cap confirm file "_____out_sample_x.dta"
if _rc==0 {
di _newline
di in red "*****************************************************************************"
di in red "WARNING: A file named '_____out_sample_x.dta' exists in your working directory.   " 
di in red "Please, change name to this dataset or delete it before running this command."   
di in red "*****************************************************************************"
di _newline
break
exit
}
********************************************************************************
* FORM THE TESTING TARGET VARIABLE
********************************************************************************
local _____out_sample_y "_____out_sample_y"
preserve
qui: use `data_test' , clear
qui: keep `y' 
qui save `_____out_sample_y' , replace
restore
*
********************************************************************************
* WARNING:
********************************************************************************
local _____out_sample_x "_____out_sample_x"
preserve
qui: use `data_test' , clear
capture{
	qui: keep `X' 	
}
local rc=_rc 
if `rc'==111{
	di _newline
	di in red "******************************************************************"	 	
	di in red "WARNING: Your testing dataset does not contain the same features  "
	di in red "of your training dataset. It is required that your testing dataset" 
	di in red "owns all the features declared in varlist.                        "
	di in red "******************************************************************"	 	
	break 
	exit    
}
label drop _all
qui save `_____out_sample_x' , replace
restore
********************************************************************************
local k: word count `X'
preserve
qui: use "`_____out_sample_x'" , clear
qui: des 
local p=r(k)
restore
if `p'!=`k'{
	di _newline
	di in red "*******************************************************************"	 	
	di in red "WARNING: Your testing set has `p' features, while your training set"
	di in red "has `k' features (those listed in 'varlist').                      " 
	di in red "The two sets must have the same number of features.                "  
	di in red "*******************************************************************"	 	
	break 
	exit                              
}
else{
	preserve
	qui: keep `X'
	order _all , alpha
	qui: ds
	local X_train_sort `r(varlist)'
	qui use "`_____out_sample_x'" , clear
	order _all , alpha
	qui: ds
	local X_test_sort `r(varlist)'
	order `X_test_sort'
	local i=1
	foreach v of local X_train_sort{
		local h: word `i' of `X_test_sort'
		if "`v'"!="`h'"{
        di _newline
	    di in red "***********************************************************************************"	 	
        di in red "WARNING: The predictors in the testing set do not match those in 'varlist'.        "
        di in red "Please, let the testing set only contain the sole predictors declared in 'varlist'."
	    di in red "***********************************************************************************"
		di _newline
		break 
		exit 1
		}
		local i=`i'+1
	}
	restore
}
********************************************************************************
* WARNING
********************************************************************************
foreach v of local X{
     capture confirm string variable `v'
         if !_rc {di _newline
				  di in red "*********************************************************************************"	 	
                  di in red "WARNING: In your training set of predictors, variable '`v'' is a string variable."
				  di in red "Please, make it numerical. If this variable is categorical,                      "
				  di in red "please generate the categorical binary dummies and use them as predictors        "
				  di in red "in place of variable '`v''.                                                      "  
				  di in red "*********************************************************************************"
				  di _newline
                  break
                  exit				  
				}
        }
********************************************************************************
* WARNING
********************************************************************************
foreach v of local y{
     capture confirm string variable `v'
         if !_rc {di _newline
				  di in red "*****************************************************************************"	 	
                  di in red "WARNING: In your training set the target variable '`v'' is a string variable."
				  di in red "Please, make it numerical.                                                   "
				  di in red "*****************************************************************************"	 	
				  di _newline
                  break
                  exit
                }
        }
********************************************************************************
* WARNING
********************************************************************************
preserve
qui: use `_____out_sample_x', clear
foreach v of local X{
     capture confirm string variable `v'
         if !_rc {
		 	      di _newline
				  di in red "********************************************************************************"
                  di in red "WARNING: In your testing set of predictors, variable '`v'' is a string variable."
				  di in red "Please, make it numerical. If this variable is categorical,                     "
				  di in red "please generate the categorical binary dummies and use them as predictors       "  
				  di in red "in place of variable '`v''.                                                     "  
				  di in red "********************************************************************************"
				  di _newline
                  break
                  exit               		
				}
        }
restore
********************************************************************************
* WARNING
********************************************************************************
preserve
qui: use `_____out_sample_y', clear
foreach v of local y{
     capture confirm string variable `v'
         if !_rc {
		 	      di _newline
				  di in red "****************************************************************************"
                  di in red "WARNING: In your testing set the target variable '`v'' is a string variable."
				  di in red "Please, make it numerical.                                                  "
				  di in red "****************************************************************************"
				  di _newline
                  break
                  exit
                }
        }
restore
********************************************************************************
* COUNT THE USED TESTING OBSERVATIONS AND ELIMINATE MISSINGS FROM "_____out_sample_x"
********************************************************************************
preserve 
qui: use `_____out_sample_x', clear
tempfile _____out_sample_x_initial
qui: save `_____out_sample_x_initial', replace
qui: use `_____out_sample_x_initial' , clear
tempvar __id2
gen `__id2'= _n
qui save `_____out_sample_x_initial', replace
restore
********************************************************************************
preserve
qui: use `_____out_sample_x_initial' , clear
qui reg _all
qui: keep if e(sample)
keep `__id2'
tempfile data_id2
qui save `data_id2', replace
restore
********************************************************************************
preserve
qui: use `_____out_sample_x', clear
qui count 
local N_test_all=r(N)
qui reg _all
qui count if e(sample)
local N_test_used=r(N)
qui: keep if e(sample)
qui: save `_____out_sample_x', replace
restore
********************************************************************************
* WARNING
********************************************************************************
capture confirm variable index
if !_rc {
	di _newline
	di in red "************************************************"
	di in red "WARNING: One of your variables is names 'index'." 
	di in red "Please, change name to this variable.           "   
	di in red "************************************************"
	di _newline
	break
	exit
}
********************************************************************************
* WARNING
********************************************************************************
cap confirm file "_____in_prediction.dta"
if _rc==0 {
	di _newline
	di in red "*****************************************************************************"
	di in red "WARNING: A file named '_____in_prediction.dta' exists in your working directory.  " 
	di in red "Please, change name to this dataset or delete it before running this command."   
	di in red "*****************************************************************************"
	di _newline
break
exit
}
********************************************************************************
cap confirm file "_____out_prediction.dta"
if _rc==0 {
	di _newline
	di in red "*****************************************************************************"
	di in red "WARNING: A file named '_____out_prediction.dta' exists in your working directory. " 
	di in red "Please, change name to this dataset or delete it before running this command."   
	di in red "*****************************************************************************"
	di _newline
	break
	exit
}
********************************************************************************
local _____in_prediction "_____in_prediction"
local _____out_prediction "_____out_prediction"
********************************************************************************
*
********************************************************************************
* COMPUTE THE SAMPLE SIZE OF THE TRAINING DATASET. 
* THEN, MAKE THE ORDERING OF THE "X" IN THE TESTING DATASET THE SAME 
* AS THE ORDERING OF THE "X" IN THE TRAINING DATASET. 
********************************************************************************
preserve
qui count if `touse'
local N_train_used=r(N)
local N_features: word count `X'
qui: use `_____out_sample_x' , clear
order `X'  // same order of the X in the training and testing datasets
qui: save `_____out_sample_x'  , replace
restore
********************************************************************************
* PASS THE STATA DIRECTORY TO PYTHON
********************************************************************************
local dir `c(pwd)'
********************************************************************************
* SELECT AND ORDER THE VARIABLES IN THE TRAINING DATASET
********************************************************************************
keep `y' `X'
order `y' `X'
********************************************************************************
* WARNING
********************************************************************************
qui{
preserve
keep `y' `X'
order `y' `X'
qui count 
local NN=r(N)
qui reg `y' `X'
keep if e(sample)
qui count 
local SS=r(N)
restore
}
********************************************************************************
if `SS'!=`NN'{
	di _newline
	di in red "********************************************************************"
	di in red "WARNING: It seems there are missing values in your training dataset,"
	di in red "either in your target variable, or in your predictors.              "
	di in red "Please, check and remove them. Then, re-run this command.           "
	di in red "********************************************************************"
	di _newline
	exit
}
********************************************************************************
* WARNING 
********************************************************************************
qui{
preserve
qui: use `_____out_sample_x' , clear
qui count 
local NN=r(N)
qui reg _all
keep if e(sample)
qui count 
local SS=r(N)
restore
}
********************************************************************************
if `SS'!=`NN'{
	di _newline
	di in red "***********************************************************************************"
	di in red "WARNING: It seems there are missing values in the features of your testing dataset."
	di in red "Please, check and remove them. Then, re-run this command.                          "
	di in red "***********************************************************************************"
	di _newline
	exit
}
********************************************************************************
* ESTIMATION PROCEDURE
********************************************************************************
* SAVE THE DATASET AS IT IS
********************************************************************************
tempfile data_fitting 
qui: save `data_fitting' , replace
di in red "=== Begin Python warnings ==========================================================================="
********************************************************************************
* PYTHON CODE - BEGIN 
********************************************************************************
if "`mlmodel'"=="ols"{
python: r_ols()
preserve
use `_____in_prediction' , clear
if "`prediction'"!=""{
rename _0 `prediction' 
}
qui: save `_____in_prediction' , replace
restore
ereturn clear
}
********************************************************************************
if "`mlmodel'"=="boost"{
	if "`n_estimators'"!="" & "`tree_depth'"!="" & "`learning_rate'"!=""{
	numlist_to_matrix , num_list(`n_estimators')
	mat mat_n_estmators = r(M)	

	numlist_to_matrix , num_list(`tree_depth')
	mat mat_max_depth = r(M)

	numlist_to_matrix , num_list(`learning_rate')
	mat mat_learning_rate = r(M)		
	}
else{
	di _newline
	di in red "****************************************************************"
	di in red "WARNING: Boosting requires to specify these options:            " 
	di in red "(1) 'n_estimators()'; (2) 'tree_depth()'; (3) 'learning_rate()'."   
	di in red "****************************************************************"
	di _newline
break
exit
}
python: r_boost()
preserve
use `_____in_prediction' , clear
if "`prediction'"!=""{
rename _0 `prediction' 
}
qui: save `_____in_prediction' , replace
restore
ereturn clear
ereturn scalar OPT_LEARNING_RATE=OPT_LEARNING_RATE
ereturn scalar OPT_N_ESTIMATORS=OPT_N_ESTIMATORS
ereturn scalar OPT_MAX_DEPTH=OPT_MAX_DEPTH
}
********************************************************************************
else if "`mlmodel'"=="nearestneighbor"{
	if "`nn'"!=""{ 
	numlist_to_matrix , num_list(`nn')
	mat mat_n_neighbor = r(M)
	}
else{
	di _newline
	di in red "****************************************************************"
	di in red "WARNING: Nearest Neighbor requires to specify this option:      " 
	di in red "(1) 'nn()'.                                                     "   
	di in red "****************************************************************"
	di _newline
break
exit	
}
python: r_nearestneighbor()
preserve
use `_____in_prediction' , clear
if "`prediction'"!=""{
rename _0 `prediction' 
}
qui: save `_____in_prediction' , replace
restore
ereturn clear
ereturn scalar OPT_NN=OPT_NN
ereturn local OPT_WEIGHT "$OPT_WEIGHT"
}
********************************************************************************
else if "`mlmodel'"=="neuralnet"{
	* BUGFIX: the original condition tested 'n_neurons_l1' twice and never
	* checked 'n_neurons_l2', so a missing 'n_neurons_l2()' slipped past this
	* guard and only failed later inside the Python grid construction.
	if "`n_neurons_l1'"!="" & "`n_neurons_l2'"!="" & "`alpha'"!=""{ 
		numlist_to_matrix , num_list(`n_neurons_l1')
		mat mat_n_neurons_l1 = r(M)	

		numlist_to_matrix , num_list(`n_neurons_l2')
		mat mat_n_neurons_l2 = r(M)	
		
		numlist_to_matrix , num_list(`alpha')
		mat mat_alpha = r(M)	
    }
else{
	di _newline
	di in red "**********************************************************"
	di in red "WARNING: Neural Network requires to specify these options:" 
	di in red "(1) 'n_neurons_l1()'; (2) 'n_neurons_l2()'; (3) 'alpha()'."   
	di in red "**********************************************************"
	di _newline
break
exit
}
python: r_neuralnet()
preserve
use `_____in_prediction' , clear
if "`prediction'"!=""{
rename _0 `prediction' 
}
qui: save `_____in_prediction' , replace
restore
ereturn clear
ereturn scalar OPT_NEURONS_L_1=OPT_NEURONS_L_1
ereturn scalar OPT_NEURONS_L_2=OPT_NEURONS_L_2
ereturn scalar OPT_ALPHA=OPT_ALPHA
}
********************************************************************************
else if "`mlmodel'"=="randomforest"{
	if "`n_estimators'"!="" & "`tree_depth'"!="" & "`max_features'"!=""{
	numlist_to_matrix , num_list(`n_estimators')
	mat mat_n_estmators = r(M)	

	numlist_to_matrix , num_list(`tree_depth')
	mat mat_tree_depth = r(M)

	numlist_to_matrix , num_list(`max_features')
	mat mat_max_features = r(M)
	}
else{
	di _newline
	di in red "**************************************************************"
	di in red "WARNING: Random Forest requires to specify these options: " 
	di in red "(1) 'n_estimators()'; (2) 'tree_depth()'; (3) 'max_features()'"   
	di in red "**************************************************************"
	di _newline
break
exit	
}
python: r_randomforest()
preserve
use `_____in_prediction' , clear
if "`prediction'"!=""{
rename _0 `prediction' 
}
qui: save `_____in_prediction' , replace
restore
ereturn clear
ereturn scalar OPT_MAX_DEPTH=OPT_MAX_DEPTH
ereturn scalar OPT_MAX_FEATURES=OPT_MAX_FEATURES
ereturn scalar OPT_N_ESTIMATORS=OPT_N_ESTIMATORS
}
********************************************************************************
else if "`mlmodel'"=="svm"{
	if "`c'"!="" & "`gamma'"!="" {
	numlist_to_matrix , num_list(`c')
	mat mat_c = r(M)	

	numlist_to_matrix , num_list(`gamma')
	mat mat_gamma = r(M)
	}
else{
	di _newline
	di in red "******************************************************************"
	di in red "WARNING: Support Vector Machine requires to specify these options:" 
	di in red "(1) 'c()'; (2) 'gamma()'.                                         "   
	di in red "******************************************************************"
	di _newline
break
exit	
}
python: r_svm()
preserve
use `_____in_prediction' , clear
if "`prediction'"!=""{
rename _0 `prediction' 
}
qui: save `_____in_prediction' , replace
restore
ereturn clear
ereturn scalar OPT_C=OPT_C
ereturn scalar OPT_GAMMA=OPT_GAMMA
}
********************************************************************************
else if "`mlmodel'"=="tree"{
    if "`tree_depth'"!=""{
	numlist_to_matrix , num_list(`tree_depth')
	mat mat_tree_depth = r(M)	
    }
else{
	di _newline
	di in red "*********************************************************"
	di in red "WARNING: Regression tree requires to specify this option:" 
	di in red "(1) 'tree_depth()'                                       "   
	di in red "**********************************************************"
	di _newline
break
exit	
}	
python: r_tree()
preserve
use `_____in_prediction' , clear
if "`prediction'"!=""{
rename _0 `prediction' 
}
qui: save `_____in_prediction' , replace
restore
ereturn clear
ereturn scalar OPT_DEPTH=OPT_LEAVES
}
********************************************************************************
else if "`mlmodel'"=="elasticnet"{
	if "`l1_ratio'"!="" & "`alpha'"!=""{ 
	numlist_to_matrix , num_list(`l1_ratio')
	mat mat_l1_ratio = r(M)		

	numlist_to_matrix , num_list(`alpha')
	mat mat_alpha = r(M)		
	}
else{
	di _newline
	di in red "******************************************************"
	di in red "WARNING: Elasticnet requires to specify these options:" 
	di in red "(1) 'alpha()'; (2) 'l1_ratio()'.                      "   
	di in red "******************************************************"
	di _newline
break
exit	
}
python: r_elasticnet()
preserve
use `_____in_prediction' , clear
if "`prediction'"!=""{
rename _0 `prediction' 
}
qui: save `_____in_prediction' , replace
restore
ereturn clear
ereturn scalar OPT_ALPHA=OPT_ALPHA
ereturn scalar OPT_L1_RATIO=OPT_L1_RATIO
}
********************************************************************************
*
********************************************************************************
di in red "=== End Python warnings ============================================================================="
di _newline
di as text "=== Begin Stata output =============================================================================="
********************************************************************************
* STORE RESULTS
******************************************************************************** 
preserve
********************************************************************************
qui: use `cross_validation' , clear
qui sum mean_test_score
scalar max_score_test=r(max)
ereturn scalar TEST_ACCURACY=max_score_test
********************************************************************************
qui sum mean_train_score if mean_test_score==max_score_test
scalar score_train=r(mean)
ereturn scalar TRAIN_ACCURACY=score_train
********************************************************************************
qui sum index if mean_test_score==max_score_test
scalar max_index=r(mean)
ereturn scalar BEST_INDEX=int(max_index)
ereturn scalar SE_TEST_ACCURACY=std_test_score[max_index+1]
ereturn scalar N_FOLDS=`n_folds'
********************************************************************************
restore
********************************************************************************
preserve
if "`prediction'"!=""{
qui: use `_____out_prediction' , clear
rename _0 `prediction' 
}
qui: save `_____out_prediction' , replace
restore
********************************************************************************
* CROSS-VALIDATION GRAPH
********************************************************************************
qui: use `cross_validation' , clear
local A=int(max_index)
if "`graph_cv'"!=""{
tw ///
(line mean_test_score index  , xline(`A',lp(dash) lw(thick))) ///
(line mean_train_score index ) , ///
legend(order(1 "TEST ACCURACY" 2 "TRAIN ACCURACY")) ///
note("Learner = `mlmodel'" "Optimal index = `A'" "Number of folds = `n_folds'") ///
ytitle(Accuracy) xtitle(Index) ///
graphregion(fcolor(white)) scheme(s2mono)
}
********************************************************************************
if "`save_graph_cv'"!=""{
set graph off
tw ///
(line mean_test_score index  , xline(`A',lp(dash) lw(thick))) ///
(line mean_train_score index ) , ///
legend(order(1 "TEST ACCURACY" 2 "TRAIN ACCURACY")) ///
note("Learner = `mlmodel'" "Optimal index = `A'" "Number of folds = `n_folds'") ///
ytitle(Accuracy) xtitle(Index) ///
graphregion(fcolor(white)) scheme(s2mono)
qui: graph save `save_graph_cv' , replace
set graph on	
}
********************************************************************************
********************************************************************************
if "`prediction'"!=""{
********************************************************************************
* WARNING
********************************************************************************
capture confirm variable _train_index
if !_rc {
	di _newline
	di in red "*****************************************************************************"
	di in red "WARNING: One of your variables in the dataset is named '_train_index'.       " 
	di in red "Please, change name to this variable before running this command,            "
	di in red "as this name is used for the identifier of training and testing observations."     
	di in red "*****************************************************************************"
	di _newline
	break
	exit
}
********************************************************************************	
qui{ // start quietly
tempfile _____out_sample_x_y
qui: use `_____out_sample_x'
qui merge 1:1 _n using `_____out_sample_y'
cap drop _merge
qui merge 1:1 _n using `_____out_prediction'
cap drop _merge
qui merge 1:1 _n using `data_id2'
cap drop _merge
drop if `__id2'==.
qui save `_____out_sample_x_y'
********************************************************************************
preserve
qui: use `_____out_sample_x_initial', clear
merge 1:1 `__id2' using `_____out_sample_x_y'
cap drop _merge
qui save `_____out_sample_x_initial', replace
restore
********************************************************************************
qui: use `data_initial' , clear
preserve
qui: use `_____in_prediction' , clear
merge 1:1 _n using `data_id'
cap drop _merge
qui: save `_____in_prediction' , replace
restore
qui: merge 1:1 `__id' using `_____in_prediction'
cap drop _merge
cap drop _train_index
gen _train_index = "train"
********************************************************************************
preserve
qui use `_____out_sample_x_initial' , clear
keep `prediction' 
merge 1:1 _n using `data_test'
cap drop _merge
qui save `_____out_sample_x_initial' , replace
restore
********************************************************************************
* Append the dataset "`_____out_sample_x_initial'"
********************************************************************************
append using `_____out_sample_x_initial'
replace _train_index = "test" if _train_index==""
********************************************************************************
* Compute validation train-MSE
tempvar v_train_mse 
gen `v_train_mse'= (`y'-`prediction')^2 if _train_index=="train"
qui sum `v_train_mse' 
ereturn scalar Train_mse = r(mean)

* Compute validation train-MAPE
tempvar v_train_mape 
gen `v_train_mape'= abs((`y'-`prediction')/`y')*100 if _train_index=="train"
qui sum `v_train_mape' 
ereturn scalar Train_mape = r(mean)

* Compute validation test-MSE
tempvar v_test_mse 
gen `v_test_mse'= (`y'-`prediction')^2 if _train_index=="test"
qui sum `v_test_mse' 
ereturn scalar Test_mse = r(mean)

* Compute validation test-MAPE
tempvar v_test_mape 
gen `v_test_mape'= abs((`y'-`prediction')/`y')*100 if _train_index=="test"
qui sum `v_test_mape' 
ereturn scalar Test_mape = r(mean)
} // end quietly
}
else{
qui: use `data_initial' , clear
} 
********************************************************************************
* Display output
********************************************************************************
ereturn local dep_var "`y'"
ereturn scalar N_features=`N_features' 
ereturn scalar N_train_all=`N_train_all' 
ereturn scalar N_test_all=`N_test_all'  
ereturn scalar N_train_used=`N_train_used' 
ereturn scalar N_test_used=`N_test_used'  
********************************************************************************
* Tree
********************************************************************************
if "`prediction'"==""{
	if "`mlmodel'"=="tree"{
    display_output_r_tree 
	}
}
else if "`prediction'"!=""{
	if "`mlmodel'"=="tree"{
    display_output_r_tree , prediction	
	}
}
********************************************************************************
* Elasticnet
********************************************************************************
if "`prediction'"==""{
	if "`mlmodel'"=="elasticnet"{
    display_output_r_elasticnet 
	}
}
else if "`prediction'"!=""{
	if "`mlmodel'"=="elasticnet"{
    display_output_r_elasticnet , prediction	
	}
}
********************************************************************************
* SVM
********************************************************************************
if "`prediction'"==""{
	if "`mlmodel'"=="svm"{
    display_output_r_svm 
	}
}
else if "`prediction'"!=""{
	if "`mlmodel'"=="svm"{
    display_output_r_svm , prediction	
	}
}
********************************************************************************
* Random forests
********************************************************************************
if "`prediction'"==""{
	if "`mlmodel'"=="randomforest"{
    display_output_r_randomforest 
	}
}
else if "`prediction'"!=""{
	if "`mlmodel'"=="randomforest"{
    display_output_r_randomforest , prediction	
	}
}
********************************************************************************
* Neural network
********************************************************************************
if "`prediction'"==""{
	if "`mlmodel'"=="neuralnet"{
    display_output_r_neuralnet
	}
}
else if "`prediction'"!=""{
	if "`mlmodel'"=="neuralnet"{
    display_output_r_neuralnet , prediction	
	}
}
********************************************************************************
* Nearest Neighbor
********************************************************************************
if "`prediction'"==""{
	if "`mlmodel'"=="nearestneighbor"{
    display_output_r_nearestneighbor
	}
}
else if "`prediction'"!=""{
	if "`mlmodel'"=="nearestneighbor"{
    display_output_r_nearestneighbor , prediction	
	}
}
********************************************************************************
* Boosting
********************************************************************************
if "`prediction'"==""{
	if "`mlmodel'"=="boost"{
    display_output_r_boost
	}
}
else if "`prediction'"!=""{
	if "`mlmodel'"=="boost"{
    display_output_r_boost , prediction	
	}
}
********************************************************************************
* OLS
********************************************************************************
if "`prediction'"==""{
	if "`mlmodel'"=="ols"{
    display_output_r_ols
	}
}
else if "`prediction'"!=""{
	if "`mlmodel'"=="ols"{
    display_output_r_ols , prediction	
	}
}
********************************************************************************
di as text "=== End Stata output ================================================================================"
********************************************************************************
cap drop index
********************************************************************************
if "`c(os)'" == "MacOSX"{
	cap rm `_____in_prediction'.dta
	cap rm `_____out_prediction'.dta
	cap rm `_____out_sample_y'.dta
	cap rm `_____out_sample_x'.dta
}
else{
	cap erase `_____in_prediction'.dta
	cap erase `_____out_prediction'.dta
	cap erase `_____out_sample_y'.dta
	cap erase `_____out_sample_x'.dta
}
********************************************************************************
set varabbrev on
********************************************************************************
qui: do `LABELS'
********************************************************************************
end 
********************************************************************************

*
*
*

********************************************************************************
* PYTHON FUNCTIONS
********************************************************************************
python:
################################################################################
#! "r_tree()": Tree regression with CV 
#! Author: Giovanni Cerulli
#! Version: 7
#! Date: 02/05/2022
################################################################################
def r_tree():
	"""Tree regression tuned by cross-validated grid search over tree depth.

	Reads from Stata: the working directory ('dir'), the fitting dataset
	('data_fitting', first column = target, rest = features), the seed
	('seed'), the number of CV folds ('n_folds') and the depth grid
	(matrix 'mat_tree_depth').
	Writes back to Stata: the CV results dataset ('cross_validation'),
	in-sample predictions ('_____in_prediction'), out-of-sample
	predictions ('_____out_prediction'), and the scalars OPT_SCORE and
	OPT_LEAVES (the CV-optimal tree depth).
	"""
	# IMPORT NEEDED PACKAGES
	from sklearn.tree import DecisionTreeRegressor
	from sklearn.model_selection import GridSearchCV
	from sfi import Macro, Scalar , Matrix
	from sfi import Data , SFIToolkit
	import numpy as np
	import pandas as pd
	import os
	
	# SET THE DIRECTORY
	dir=Macro.getLocal("dir")
	os.chdir(dir)
	
	# SET THE TRAIN/TEST DATASET AND THE NEW-INSTANCES-DATASET
	dataset=Macro.getLocal("data_fitting")
	
	# LOAD A STATA DATASET LOCATED INTO THE DIRECTORY AS PANDAS DATAFRAME
	df = pd.read_stata(dataset,convert_categoricals=False)
	
	# DEFINE y THE TARGET VARIABLE (first column by construction)
	y=df.iloc[:,0]
	
	# DEFINE X THE FEATURES (all remaining columns)
	X=df.iloc[:,1::]
	
	# READ THE "SEED" FROM STATA
	R=int(Macro.getLocal("seed"))
	
	# BASE ESTIMATOR FOR THE GRID SEARCH (max_depth here is a placeholder;
	# it is overridden by every candidate in the grid)
	model=DecisionTreeRegressor(max_depth=5,random_state=R)
	
	# DEFINE THE PARAMETER VALUES THAT SHOULD BE SEARCHED
	tree_depth = Matrix.get("mat_tree_depth")
	k_range=tree_depth[0]
	# Stata matrices arrive as floats; max_depth must be an integer
	k_range = [int(x) for x in k_range]
	
	# CREATE A PARAMETER GRID: MAP THE PARAMETER NAMES TO THE VALUES THAT SHOULD BE SEARCHED
	param_grid = dict(max_depth=k_range)
	
	# READ THE NUMBER OF CV-FOLDS "n_folds" FROM STATA
	n_folds=int(Macro.getLocal("n_folds"))
	
	# INSTANTIATE THE GRID
	grid = GridSearchCV(model, param_grid, cv=n_folds, scoring='explained_variance', return_train_score=True)	
	
	# FIT THE GRID
	# (the original code called grid.fit twice in a row, doubling the whole
	# cross-validation cost for no benefit; fitting once is sufficient)
	grid.fit(X, y)
	
	# SAVE THE CV RESULTS AS A STATA DATASET
	CV_RES=pd.DataFrame(grid.cv_results_)[['mean_train_score','mean_test_score','std_test_score']]
	D=Macro.getLocal("cross_validation") 
	D=D+".dta"
	CV_RES.to_stata(D)
	
	# EXAMINE THE BEST MODEL: PUT THE BEST CV SCORE INTO A STATA SCALAR
	Scalar.setValue('OPT_SCORE',grid.best_score_,vtype='visible')
	
	# GET THE OPTIMAL DEPTH AND PUT IT INTO THE STATA SCALAR "OPT_LEAVES"
	opt_leaves=grid.best_params_.get('max_depth')
	Scalar.setValue('OPT_LEAVES',opt_leaves,vtype='visible')
	
	# TRAIN YOUR MODEL USING ALL DATA AND THE BEST KNOWN PARAMETERS
	model=DecisionTreeRegressor(max_depth=opt_leaves,random_state=R)
	
	# FIT THE MODEL
	model.fit(X, y)

	# MAKE IN-SAMPLE PREDICTION FOR y
	y_hat = model.predict(X)
	
	# PUT IN-SAMPLE PREDICTION of y IN A DATA FRAME
	in_sample = pd.DataFrame(y_hat)
	
	# PUT IN-SAMPLE PREDICTION INTO STATA
	D=Macro.getLocal("_____in_prediction") 
	D=D+".dta"
	in_sample.to_stata(D)
	
	# MAKE OUT-OF-SAMPLE PREDICTION FOR y USING A PREPARED DATASET
	D=Macro.getLocal("_____out_sample_x") 
	D=D+".dta"
	Xnew = pd.read_stata(D)
	ynew = model.predict(Xnew)

	# PUT OUT-OF-SAMPLE PREDICTION FOR y INTO A DATA FRAME AND SAVE IT
	# AS A STATA DATASET (the first column is the prediction "y_hat")
	OUT = pd.DataFrame(ynew)
	D=Macro.getLocal("_____out_prediction") 
	D=D+".dta"
	OUT.to_stata(D)
	
	# PRINT GRID SEARCH RESULTS
	ALL_CV_RES=pd.DataFrame(grid.cv_results_)[['param_max_depth','mean_train_score','mean_test_score']]
	ALL_CV_RES.columns = ['Tree depth', 'Train accuracy', 'Test accuracy']
	ALL_CV_RES['Index'] = ALL_CV_RES.index
	column_names = ['Index','Tree depth', 'Train accuracy', 'Test accuracy']
	ALL_CV_RES = ALL_CV_RES.reindex(columns=column_names)
	print("================================================")
	print(" Results of cross-validation grid search")
	print("================================================")
	print(ALL_CV_RES.to_string(index=False))
	print("================================================")
################################################################################

################################################################################
#! "r_elasticnet()": Elastic-net with CV 
#! Author: Giovanni Cerulli
#! Version: 7
#! Date: 02/05/2022
################################################################################
def r_elasticnet():
	"""Elastic-net regression tuned by cross-validated grid search.

	Reads from Stata: the working directory ('dir'), the fitting dataset
	('data_fitting', first column = target, rest = features), the number
	of CV folds ('n_folds') and the grids for 'l1_ratio' and 'alpha'
	(matrices 'mat_l1_ratio' and 'mat_alpha').
	Writes back to Stata: the CV results dataset ('cross_validation'),
	in-sample predictions ('_____in_prediction'), out-of-sample
	predictions ('_____out_prediction'), and the scalars OPT_ALPHA and
	OPT_L1_RATIO.
	"""
	# IMPORT NEEDED PACKAGES
	from sklearn.linear_model import ElasticNet
	from sklearn.model_selection import GridSearchCV
	from sfi import Macro, Scalar , Matrix
	from sfi import Data , SFIToolkit
	import numpy as np
	import pandas as pd
	import os

	# SET THE DIRECTORY
	dir=Macro.getLocal("dir")
	os.chdir(dir)

	# SET THE TRAIN/TEST DATASET AND THE NEW-INSTANCES-DATASET
	dataset=Macro.getLocal("data_fitting")

	# LOAD A STATA DATASET LOCATED INTO THE DIRECTORY AS PANDAS DATAFRAME
	df = pd.read_stata(dataset,convert_categoricals=False)

	# DEFINE y THE TARGET VARIABLE (first column by construction)
	y=df.iloc[:,0]

	# DEFINE X THE FEATURES (all remaining columns)
	X=df.iloc[:,1::]

	# INITIALIZE AN ELASTIC NET (parameters are supplied by the grid search)
	model = ElasticNet()

	# CROSS-VALIDATION FOR "L1_RATIO" AND "ALPHA" BY PRODUCING A GRID SEARCH:
	# GET THE TWO PARAMETERS' GRIDS FROM THE STATA MATRICES
	_M = Matrix.get("mat_l1_ratio")
	grid_l1_ratio=_M[0]
	
	_M = Matrix.get("mat_alpha")
	grid_alpha=_M[0]

	# PUT THE GRIDS INTO A PYTHON DICTIONARY 
	param_grid = {'l1_ratio': grid_l1_ratio, 'alpha': grid_alpha}

	# READ THE NUMBER OF CV-FOLDS "n_folds" FROM STATA
	n_folds=int(Macro.getLocal("n_folds"))

	# INSTANTIATE THE GRID
	grid = GridSearchCV(model, param_grid, cv=n_folds, scoring='explained_variance', return_train_score=True)

	# FIT THE GRID
	grid.fit(X, y)

	# SAVE THE CV RESULTS AS A STATA DATASET
	CV_RES=pd.DataFrame(grid.cv_results_)[['mean_train_score','mean_test_score','std_test_score']]
	D=Macro.getLocal("cross_validation") 
	D=D+".dta"
	CV_RES.to_stata(D)

	# EXTRACT THE TUNED PARAMETERS BY NAME AND PUT THEM INTO STATA SCALARS.
	# (The original code indexed positionally into best_params_.values(),
	# which silently depends on the dictionary's key ordering.)
	opt_l1_ratio=grid.best_params_.get('l1_ratio')
	opt_alpha=grid.best_params_.get('alpha')
	Scalar.setValue('OPT_ALPHA',opt_alpha,vtype='visible')
	Scalar.setValue('OPT_L1_RATIO',opt_l1_ratio,vtype='visible')

	# TRAIN YOUR MODEL USING ALL DATA AND THE BEST KNOWN PARAMETERS
	model = ElasticNet(l1_ratio=opt_l1_ratio, alpha=opt_alpha)

	# FIT THE MODEL
	model.fit(X, y)

	# MAKE IN-SAMPLE PREDICTION FOR y
	y_hat = model.predict(X)
	
	# PUT IN-SAMPLE PREDICTION of y IN A DATA FRAME
	in_sample = pd.DataFrame(y_hat)
	
	# PUT IN-SAMPLE PREDICTION INTO STATA
	D=Macro.getLocal("_____in_prediction") 
	D=D+".dta"
	in_sample.to_stata(D)
	
	# MAKE OUT-OF-SAMPLE PREDICTION FOR y USING A PREPARED DATASET
	D=Macro.getLocal("_____out_sample_x") 
	D=D+".dta"
	Xnew = pd.read_stata(D)
	ynew = model.predict(Xnew)

	# PUT OUT-OF-SAMPLE PREDICTION FOR y INTO A DATA FRAME AND SAVE IT
	# AS A STATA DATASET (the first column is the prediction "y_hat")
	OUT = pd.DataFrame(ynew)
	D=Macro.getLocal("_____out_prediction") 
	D=D+".dta"
	OUT.to_stata(D)
	
	# PRINT GRID SEARCH RESULTS
	ALL_CV_RES=pd.DataFrame(grid.cv_results_)[['param_l1_ratio','param_alpha','mean_train_score','mean_test_score']]
	ALL_CV_RES.columns = ['L1 ratio', 'alpha' , 'Train accuracy', 'Test accuracy']
	ALL_CV_RES['Index'] = ALL_CV_RES.index
	column_names = ['Index','L1 ratio', 'alpha' , 'Train accuracy', 'Test accuracy']
	ALL_CV_RES = ALL_CV_RES.reindex(columns=column_names)
	print("====================================================")
	print(" Results of cross-validation grid search")
	print("====================================================")
	print(ALL_CV_RES.to_string(index=False))
	print("====================================================")
################################################################################

################################################################################
#! "r_svm()": SVM regression with CV
#! Author: Giovanni Cerulli
#! Version: 7
#! Date: 02/05/2022
################################################################################
def r_svm():
	"""Support vector regression (RBF kernel) tuned by CV grid search.

	Reads from Stata: the working directory ('dir'), the fitting dataset
	('data_fitting', first column = target, rest = features), the number
	of CV folds ('n_folds') and the grids for 'C' and 'gamma' (matrices
	'mat_c' and 'mat_gamma').
	Writes back to Stata: the CV results dataset ('cross_validation'),
	in-sample predictions ('_____in_prediction'), out-of-sample
	predictions ('_____out_prediction'), and the scalars OPT_C and
	OPT_GAMMA.
	"""
	# IMPORT NEEDED PACKAGES
	from sklearn import svm
	from sklearn.svm import SVR
	from sklearn.model_selection import GridSearchCV
	from sfi import Macro, Scalar , Matrix
	from sfi import Data , SFIToolkit
	import numpy as np
	import pandas as pd
	import os

	# SET THE DIRECTORY
	dir=Macro.getLocal("dir")
	os.chdir(dir)

	# SET THE TRAIN/TEST DATASET AND THE NEW-INSTANCES-DATASET
	dataset=Macro.getLocal("data_fitting")

	# LOAD A STATA DATASET LOCATED INTO THE DIRECTORY AS PANDAS DATAFRAME
	df = pd.read_stata(dataset,convert_categoricals=False)

	# DEFINE y THE TARGET VARIABLE (first column by construction)
	y=df.iloc[:,0]

	# DEFINE X THE FEATURES (all remaining columns)
	X=df.iloc[:,1::]

	# NOTE(review): the original code also read the 'seed' local here, but
	# SVR has no random_state and never used it, so the read was dropped.

	# INITIALIZE A SVM (placeholder parameters; overridden by the grid search)
	model = SVR(kernel='rbf', C = 10.0, gamma=0.1)

	# SVM CROSS-VALIDATION FOR "C" AND "GAMMA" BY PRODUCING A GRID SEARCH:
	# GET THE TWO PARAMETERS' GRIDS FROM THE STATA MATRICES
	_M = Matrix.get("mat_c")
	gridC=_M[0]
	
	_M = Matrix.get("mat_gamma")
	gridG=_M[0]

	# PUT THE GRIDS INTO A PYTHON DICTIONARY 
	param_grid = {'C': gridC, 'gamma': gridG}

	# READ THE NUMBER OF CV-FOLDS "n_folds" FROM STATA
	n_folds=int(Macro.getLocal("n_folds"))

	# INSTANTIATE THE GRID
	grid = GridSearchCV(model, param_grid, cv=n_folds, scoring='explained_variance', return_train_score=True)

	# FIT THE GRID
	grid.fit(X, y)

	# SAVE THE CV RESULTS AS A STATA DATASET
	CV_RES=pd.DataFrame(grid.cv_results_)[['mean_train_score','mean_test_score','std_test_score']]
	D=Macro.getLocal("cross_validation") 
	D=D+".dta"
	CV_RES.to_stata(D)

	# GET THE VALUE "opt_c" AND PUT IT INTO A STATA SCALAR "OPT_C"
	opt_c=grid.best_params_.get('C')
	Scalar.setValue('OPT_C',opt_c,vtype='visible')

	# GET THE VALUE "opt_gamma" AND PUT IT INTO A STATA SCALAR "OPT_GAMMA"
	opt_gamma=grid.best_params_.get('gamma')
	Scalar.setValue('OPT_GAMMA',opt_gamma, vtype='visible')

	# TRAIN YOUR MODEL USING ALL DATA AND THE BEST KNOWN PARAMETERS
	model = SVR(kernel='rbf', C=opt_c, gamma=opt_gamma)

	# FIT THE MODEL
	model.fit(X, y)

	# MAKE IN-SAMPLE PREDICTION FOR y
	y_hat = model.predict(X)
	
	# PUT IN-SAMPLE PREDICTION of y IN A DATA FRAME
	in_sample = pd.DataFrame(y_hat)
	
	# PUT IN-SAMPLE PREDICTION INTO STATA
	D=Macro.getLocal("_____in_prediction") 
	D=D+".dta"
	in_sample.to_stata(D)
	
	# MAKE OUT-OF-SAMPLE PREDICTION FOR y USING A PREPARED DATASET
	D=Macro.getLocal("_____out_sample_x") 
	D=D+".dta"
	Xnew = pd.read_stata(D)
	ynew = model.predict(Xnew)

	# PUT OUT-OF-SAMPLE PREDICTION FOR y INTO A DATA FRAME AND SAVE IT
	# AS A STATA DATASET (the first column is the prediction "y_hat")
	OUT = pd.DataFrame(ynew)
	D=Macro.getLocal("_____out_prediction") 
	D=D+".dta"
	OUT.to_stata(D)
		
	# PRINT GRID SEARCH RESULTS
	ALL_CV_RES=pd.DataFrame(grid.cv_results_)[['param_C','param_gamma','mean_train_score','mean_test_score']]
	ALL_CV_RES.columns = ['c', 'Gamma' , 'Train accuracy', 'Test accuracy']
	ALL_CV_RES['Index'] = ALL_CV_RES.index
	column_names = ['Index','c', 'Gamma' , 'Train accuracy', 'Test accuracy']
	ALL_CV_RES = ALL_CV_RES.reindex(columns=column_names)
	print("===============================================")
	print(" Results of cross-validation grid search")
	print("===============================================")
	print(ALL_CV_RES.to_string(index=False))
	print("===============================================")
################################################################################	

################################################################################
#! "r_randomforest()": Random-forest regression with CV
#! Author: Giovanni Cerulli
#! Version: 7
#! Date: 02/05/2022
################################################################################
def r_randomforest():
	"""Random-forest regression tuned by cross-validated grid search.

	Cross-validates over three parameters: tree depth ('max_depth'),
	number of features considered at each split ('max_features') and
	number of trees ('n_estimators').
	Reads from Stata: the working directory ('dir'), the fitting dataset
	('data_fitting', first column = target, rest = features), the seed
	('seed'), the number of CV folds ('n_folds') and the grids (matrices
	'mat_tree_depth', 'mat_max_features', 'mat_n_estmators').
	Writes back to Stata: the CV results dataset ('cross_validation'),
	in-sample predictions ('_____in_prediction'), out-of-sample
	predictions ('_____out_prediction'), and the scalars OPT_MAX_DEPTH,
	OPT_MAX_FEATURES and OPT_N_ESTIMATORS.
	"""
	# IMPORT NEEDED PACKAGES
	from sklearn.ensemble import RandomForestRegressor
	from sklearn.model_selection import GridSearchCV
	from sfi import Macro, Scalar , Matrix
	from sfi import Data , SFIToolkit
	import numpy as np
	import pandas as pd
	import os

	# SET THE DIRECTORY
	dir=Macro.getLocal("dir")
	os.chdir(dir)

	# SET THE TRAIN/TEST DATASET AND THE NEW-INSTANCES-DATASET
	dataset=Macro.getLocal("data_fitting")

	# LOAD A STATA DATASET LOCATED INTO THE DIRECTORY AS PANDAS DATAFRAME
	df = pd.read_stata(dataset,convert_categoricals=False)

	# DEFINE y THE TARGET VARIABLE (first column by construction)
	y=df.iloc[:,0]

	# DEFINE X THE FEATURES (all remaining columns), AS A NUMPY ARRAY
	X=df.iloc[:,1::]
	X = np.array(X)

	# READ THE "SEED" FROM STATA
	R=int(Macro.getLocal("seed"))

	# INITIALIZE THE FOREST (placeholder parameters; overridden by the grid)
	model = RandomForestRegressor(max_depth=5, n_estimators=3, max_features=2, random_state=R)

	# GET THE THREE PARAMETERS' GRIDS FROM THE STATA MATRICES
	# (Stata matrices arrive as floats; these parameters must be integers)
	_M = Matrix.get("mat_tree_depth")
	gridD=_M[0]
	gridD = [int(x) for x in gridD]
	
	_M = Matrix.get("mat_max_features")
	gridG=_M[0]
	gridG = [int(x) for x in gridG]
	
	_M = Matrix.get("mat_n_estmators")
	gridB=_M[0]
	gridB = [int(x) for x in gridB]

	# PUT THE GRIDS INTO A PYTHON DICTIONARY 
	param_grid = {'max_depth': gridD, 'max_features': gridG, 'n_estimators': gridB}

	# READ THE NUMBER OF CV-FOLDS "n_folds" FROM STATA
	n_folds=int(Macro.getLocal("n_folds"))

	# INSTANTIATE THE GRID
	grid = GridSearchCV(model, param_grid, cv=n_folds, scoring='explained_variance', return_train_score=True)

	# FIT THE GRID
	grid.fit(X, y)

	# SAVE THE CV RESULTS AS A STATA DATASET
	CV_RES=pd.DataFrame(grid.cv_results_)[['mean_train_score','mean_test_score','std_test_score']]
	D=Macro.getLocal("cross_validation") 
	D=D+".dta"
	CV_RES.to_stata(D)

	# GET THE VALUE "opt_max_depth" AND PUT IT INTO A STATA SCALAR "OPT_MAX_DEPTH"
	opt_max_depth=grid.best_params_.get('max_depth')
	Scalar.setValue('OPT_MAX_DEPTH',opt_max_depth,vtype='visible')

	# GET THE VALUE "opt_max_features" AND PUT IT INTO A STATA SCALAR "OPT_MAX_FEATURES"
	opt_max_features=grid.best_params_.get('max_features')
	Scalar.setValue('OPT_MAX_FEATURES',opt_max_features, vtype='visible')
	
	# GET THE VALUE "opt_n_estimators" AND PUT IT INTO A STATA SCALAR "OPT_N_ESTIMATORS"
	opt_n_estimators=grid.best_params_.get('n_estimators')
	Scalar.setValue('OPT_N_ESTIMATORS',opt_n_estimators, vtype='visible')

	# TRAIN YOUR MODEL USING ALL DATA AND THE BEST KNOWN PARAMETERS
	model = RandomForestRegressor(max_depth=opt_max_depth, 
									n_estimators=opt_n_estimators, 
									max_features=opt_max_features, 
									random_state=R)

	# FIT THE MODEL
	model.fit(X, y)

	# MAKE IN-SAMPLE PREDICTION FOR y
	y_hat = model.predict(X)
	
	# PUT IN-SAMPLE PREDICTION of y IN A DATA FRAME
	in_sample = pd.DataFrame(y_hat)
	
	# PUT IN-SAMPLE PREDICTION INTO STATA
	D=Macro.getLocal("_____in_prediction") 
	D=D+".dta"
	in_sample.to_stata(D)
	
	# MAKE OUT-OF-SAMPLE PREDICTION FOR y USING A PREPARED DATASET
	D=Macro.getLocal("_____out_sample_x") 
	D=D+".dta"
	Xnew = pd.read_stata(D)
	ynew = model.predict(Xnew)

	# PUT OUT-OF-SAMPLE PREDICTION FOR y INTO A DATA FRAME AND SAVE IT
	# AS A STATA DATASET (the first column is the prediction "y_hat")
	OUT = pd.DataFrame(ynew)
	D=Macro.getLocal("_____out_prediction") 
	D=D+".dta"
	OUT.to_stata(D)
	
	# PRINT GRID SEARCH RESULTS
	ALL_CV_RES=pd.DataFrame(grid.cv_results_)[['param_max_depth','param_max_features','param_n_estimators','mean_train_score','mean_test_score']]
	ALL_CV_RES.columns = ['Tree depth', 'N. of splitting features', 'N. of trees','Train accuracy', 'Test accuracy']
	ALL_CV_RES['Index'] = ALL_CV_RES.index
	column_names = ['Index','Tree depth', 'N. of splitting features','N. of trees','Train accuracy','Test accuracy']
	ALL_CV_RES = ALL_CV_RES.reindex(columns=column_names)
	print("=====================================================================================")
	print(" Results of cross-validation grid search")
	print("=====================================================================================")
	print(ALL_CV_RES.to_string(index=False))
	print("=====================================================================================")
################################################################################

################################################################################
#! "r_neuralnet()": Neural network regression with CV 
#! Author: Giovanni Cerulli
#! Version: 7
#! Date: 02/05/2022
################################################################################
def r_neuralnet():
	# Neural-network (MLP) regression with cross-validated grid search over
	# the number of neurons in two hidden layers and the L2 penalty "alpha".
	# Reads the fitting dataset, parameter grids, seed and fold count from
	# Stata via sfi; writes CV results and in/out-of-sample predictions back
	# to Stata as .dta files, and the best parameters as Stata scalars.

	# IMPORT NEEDED PACKAGES
	from sklearn.neural_network import MLPRegressor
	from sklearn.model_selection import GridSearchCV
	from sfi import Macro, Scalar , Matrix
	from sfi import Data , SFIToolkit
	import numpy as np
	import pandas as pd
	import os

	# SET THE WORKING DIRECTORY (passed from Stata)
	dir=Macro.getLocal("dir")
	os.chdir(dir)

	# SET THE TRAIN/TEST DATASET AND THE NEW-INSTANCES-DATASET
	dataset=Macro.getLocal("data_fitting")

	# LOAD A STATA DATASET LOCATED INTO THE DIRECTORY AS PANDAS DATAFRAME
	df = pd.read_stata(dataset,convert_categoricals=False)

	# DEFINE y THE TARGET VARIABLE (first column)
	y=df.iloc[:,0]

	# DEFINE X THE FEATURES (all remaining columns)
	X=df.iloc[:,1:]

	# READ THE "SEED" FROM STATA
	R=int(Macro.getLocal("seed"))

	# BASELINE MLP; its tunable parameters are overridden by the grid search
	model = MLPRegressor(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 5), random_state=R)

	# CROSS-VALIDATION PARAMETERS:
	# 1. "NUMBER OF NEURONS FOR LAYER 1"
	# 2. "NUMBER OF NEURONS FOR LAYER 2"
	# 3. "L2 PENALIZIATION PARAMETER" (alpha)
	# 4. "RANDOM SEED"

	# BUILD THE (layer-1, layer-2) NEURON GRID FROM THE TWO STATA MATRICES
	# (the original nested loop mixed tabs and spaces, a TabError hazard)
	gridL1 = [int(x) for x in Matrix.get("mat_n_neurons_l1")[0]]
	gridL2 = [int(x) for x in Matrix.get("mat_n_neurons_l2")[0]]
	hl_grid = [(n1, n2) for n1 in gridL1 for n2 in gridL2]

	# GENERATE THE GRID (JUST ONE NUMBER) FOR THE "RANDOM SEED"
	# NOTE(review): this pins random_state to 1 during CV while the final
	# refit below uses the user-supplied seed R — confirm this asymmetry
	# is intended.
	gridR=[1]

	# GRID FOR "alpha"
	grid_alpha = Matrix.get("mat_alpha")[0]

	# PUT THE GENERATED GRIDS INTO A PYTHON DICTIONARY
	param_grid={'hidden_layer_sizes': hl_grid , 'random_state':gridR, 'alpha': grid_alpha}

	# READ THE NUMBER OF CV-FOLDS "n_folds" FROM STATA
	n_folds=int(Macro.getLocal("n_folds"))

	# INSTANTIATE THE GRID SEARCH
	# (named "grid_search" to avoid shadowing the parameter grid)
	grid_search = GridSearchCV(model, param_grid, cv=n_folds, scoring='explained_variance', return_train_score=True)

	# FIT THE GRID
	grid_search.fit(X, y)

	# SAVE THE CV SUMMARY AS A STATA DATASET
	CV_RES=pd.DataFrame(grid_search.cv_results_)[['mean_train_score','mean_test_score','std_test_score']]
	D=Macro.getLocal("cross_validation") 
	D=D+".dta"
	CV_RES.to_stata(D)

	# PUT THE BEST PARAMETERS INTO STATA SCALARS
	opt_nn=grid_search.best_params_.get('hidden_layer_sizes')
	opt_neurons1=opt_nn[0]
	opt_neurons2=opt_nn[1]
	Scalar.setValue('OPT_NEURONS_L_1',opt_neurons1,vtype='visible')
	Scalar.setValue('OPT_NEURONS_L_2',opt_neurons2,vtype='visible')
	opt_alpha=grid_search.best_params_.get('alpha')
	Scalar.setValue('OPT_ALPHA',opt_alpha,vtype='visible')

	# TRAIN YOUR MODEL USING ALL DATA AND THE BEST KNOWN PARAMETERS
	model=MLPRegressor(solver='lbfgs', hidden_layer_sizes=opt_nn, alpha=opt_alpha , random_state=R)

	# FIT THE MODEL
	model.fit(X, y)

	# MAKE IN-SAMPLE PREDICTION FOR y AND PUT IT INTO STATA
	y_hat = model.predict(X)
	in_sample = pd.DataFrame(y_hat)
	D=Macro.getLocal("_____in_prediction") 
	D=D+".dta"
	in_sample.to_stata(D)

	# MAKE OUT-OF-SAMPLE PREDICTION FOR y USING A PREPARED DATASET
	D=Macro.getLocal("_____out_sample_x") 
	D=D+".dta"
	Xnew = pd.read_stata(D)
	ynew = model.predict(Xnew)

	# PUT "OUT" AS STATA DATASET
	# (NOTE: the first column is the prediction "y_hat")
	OUT = pd.DataFrame(ynew)
	D=Macro.getLocal("_____out_prediction") 
	D=D+".dta"
	OUT.to_stata(D)

	# PRINT GRID SEARCH RESULTS
	ALL_CV_RES=pd.DataFrame(grid_search.cv_results_)[['param_hidden_layer_sizes','param_alpha','mean_train_score','mean_test_score']]
	ALL_CV_RES.columns = ['(N. of neurons in L1, N. of neurons in L2)','Alpha','Train accuracy', 'Test accuracy']
	ALL_CV_RES['Index'] = ALL_CV_RES.index
	column_names = ['Index','(N. of neurons in L1, N. of neurons in L2)','Alpha','Train accuracy','Test accuracy']
	ALL_CV_RES = ALL_CV_RES.reindex(columns=column_names)
	print("======================================================================================")
	print(" Results of cross-validation grid search")
	print("======================================================================================")
	print(ALL_CV_RES.to_string(index=False))
	print("======================================================================================")
################################################################################

################################################################################
#! "r_nearestneighbor()": Nearest neighbor regression with CV 
#! Author: Giovanni Cerulli
#! Version: 7
#! Date: 02/05/2022
################################################################################
def r_nearestneighbor():
	# K-nearest-neighbors regression with cross-validated grid search over
	# the number of neighbors and the weighting scheme (uniform/distance).
	# Reads the fitting dataset, neighbor grid and fold count from Stata via
	# sfi; writes CV results and in/out-of-sample predictions back to Stata
	# as .dta files, and the best parameters as a Stata scalar/global.

	# IMPORT NEEDED PACKAGES
	from sklearn.neighbors import KNeighborsRegressor
	from sklearn.model_selection import GridSearchCV
	from sfi import Macro, Scalar , Matrix
	from sfi import Data , SFIToolkit
	import numpy as np
	import pandas as pd
	import os

	# SET THE WORKING DIRECTORY (passed from Stata)
	dir=Macro.getLocal("dir")
	os.chdir(dir)

	# SET THE TRAIN/TEST DATASET AND THE NEW-INSTANCES-DATASET
	dataset=Macro.getLocal("data_fitting")

	# LOAD A STATA DATASET LOCATED INTO THE DIRECTORY AS PANDAS DATAFRAME
	df = pd.read_stata(dataset,convert_categoricals=False)

	# DEFINE y THE TARGET VARIABLE (first column)
	y=df.iloc[:,0]

	# DEFINE X THE FEATURES (all remaining columns)
	X=df.iloc[:,1:]

	# READ THE "SEED" FROM STATA (KNN itself is deterministic; kept for
	# interface symmetry with the other learners)
	R=int(Macro.getLocal("seed"))

	# INITIALIZE a KNN (the n_neighbors value is overridden by the grid search)
	model = KNeighborsRegressor(n_neighbors=5)

	# DEFINE THE PARAMETER VALUES THAT SHOULD BE SEARCHED:
	# the neighbor grid comes from the Stata matrix "mat_n_neighbor"
	k_range = [int(x) for x in Matrix.get("mat_n_neighbor")[0]]

	weight_options = ['uniform', 'distance']

	# CREATE A PARAMETER GRID: MAP THE PARAMETER NAMES TO THE VALUES THAT SHOULD BE SEARCHED
	param_grid = dict(n_neighbors=k_range, weights=weight_options)

	# READ THE NUMBER OF CV-FOLDS "n_folds" FROM STATA
	n_folds=int(Macro.getLocal("n_folds"))

	# INSTANTIATE THE GRID
	grid = GridSearchCV(model, param_grid, cv=n_folds, scoring='explained_variance', return_train_score=True)

	# FIT THE GRID
	grid.fit(X, y)

	# SAVE THE CV SUMMARY AS A STATA DATASET
	CV_RES=pd.DataFrame(grid.cv_results_)[['mean_train_score','mean_test_score','std_test_score']]
	D=Macro.getLocal("cross_validation") 
	D=D+".dta"
	CV_RES.to_stata(D)

	# STORE THE BEST NUMBER OF NEIGHBORS INTO A STATA SCALAR
	# (the unused positional list(best_params_.values()) was removed)
	opt_nn=grid.best_params_.get('n_neighbors')
	Scalar.setValue('OPT_NN',opt_nn,vtype='visible')

	# STORE THE BEST WEIGHT-TYPE INTO A STATA GLOBAL (it is a string)
	opt_weight=grid.best_params_.get('weights')
	Macro.setGlobal('OPT_WEIGHT',opt_weight, vtype='visible')

	# TRAIN YOUR MODEL USING ALL DATA AND THE BEST KNOWN PARAMETERS
	model=KNeighborsRegressor(n_neighbors=opt_nn, weights=opt_weight)

	# FIT THE MODEL
	model.fit(X, y)

	# MAKE IN-SAMPLE PREDICTION FOR y AND PUT IT INTO STATA
	y_hat = model.predict(X)
	in_sample = pd.DataFrame(y_hat)
	D=Macro.getLocal("_____in_prediction") 
	D=D+".dta"
	in_sample.to_stata(D)

	# MAKE OUT-OF-SAMPLE PREDICTION FOR y USING A PREPARED DATASET
	D=Macro.getLocal("_____out_sample_x") 
	D=D+".dta"
	Xnew = pd.read_stata(D)
	ynew = model.predict(Xnew)

	# PUT "OUT" AS STATA DATASET
	# (NOTE: the first column is the prediction "y_hat")
	OUT = pd.DataFrame(ynew)
	D=Macro.getLocal("_____out_prediction") 
	D=D+".dta"
	OUT.to_stata(D)

	# PRINT GRID SEARCH RESULTS
	ALL_CV_RES=pd.DataFrame(grid.cv_results_)[['param_n_neighbors','param_weights','mean_train_score','mean_test_score']]
	ALL_CV_RES.columns = ['N. of nearest neighbors','Kernel','Train accuracy', 'Test accuracy']
	ALL_CV_RES['Index'] = ALL_CV_RES.index
	column_names = ['Index','N. of nearest neighbors','Kernel','Train accuracy', 'Test accuracy']
	ALL_CV_RES = ALL_CV_RES.reindex(columns=column_names)
	print("=======================================================================")
	print(" Results of cross-validation grid search")
	print("=======================================================================")
	print(ALL_CV_RES.to_string(index=False))
	print("=======================================================================")
################################################################################


################################################################################
#! "r_boost()": Boosting regression with CV
#! Author: Giovanni Cerulli
#! Version: 7
#! Date: 02/05/2022
################################################################################
def r_boost():
	# Gradient-boosting regression with cross-validated grid search over the
	# learning rate, the number of trees (n_estimators) and the tree depth.
	# Reads the fitting dataset, parameter grids, seed and fold count from
	# Stata via sfi; writes CV results and in/out-of-sample predictions back
	# to Stata as .dta files, and the best parameters as Stata scalars.

	# IMPORT NEEDED PACKAGES
	from sklearn.ensemble import GradientBoostingRegressor
	from sklearn.model_selection import GridSearchCV
	from sfi import Macro, Scalar , Matrix
	from sfi import Data , SFIToolkit
	import numpy as np
	import pandas as pd
	import os

	# SET THE WORKING DIRECTORY (passed from Stata)
	dir=Macro.getLocal("dir")
	os.chdir(dir)

	# SET THE TRAIN/TEST DATASET AND THE NEW-INSTANCES-DATASET
	dataset=Macro.getLocal("data_fitting")

	# LOAD A STATA DATASET LOCATED INTO THE DIRECTORY AS PANDAS DATAFRAME
	df = pd.read_stata(dataset,convert_categoricals=False)

	# DEFINE y THE TARGET VARIABLE (first column)
	y=df.iloc[:,0]

	# DEFINE X THE FEATURES (all remaining columns)
	X=df.iloc[:,1:]

	# READ THE "SEED" FROM STATA
	R=int(Macro.getLocal("seed"))

	# BASELINE BOOSTER; its tunable parameters are overridden by the grid search
	model = GradientBoostingRegressor(learning_rate=0.1, n_estimators=100, random_state=R)

	# WE CROSS-VALIDATE OVER THREE PARAMETERS:
	# 1. "learning_rate"
	# 2. "n_estimators" (number of trees)
	# 3. "max_depth"    (tree depth, i.e. number of layers of the tree)

	# BUILD THE PARAMETER GRIDS FROM THE STATA MATRICES
	gridD = Matrix.get("mat_learning_rate")[0]
	# (the Stata-side matrix name below is historical, including its spelling)
	gridG = [int(x) for x in Matrix.get("mat_n_estmators")[0]]
	gridH = [int(x) for x in Matrix.get("mat_max_depth")[0]]

	# PUT THE GENERATED GRIDS INTO A PYTHON DICTIONARY 
	param_grid = {'learning_rate': gridD, 'n_estimators': gridG, 'max_depth': gridH}

	# READ THE NUMBER OF CV-FOLDS "n_folds" FROM STATA
	n_folds=int(Macro.getLocal("n_folds"))

	# INSTANTIATE THE GRID
	grid = GridSearchCV(model, param_grid, cv=n_folds, scoring='explained_variance', return_train_score=True)

	# FIT THE GRID
	grid.fit(X, y)

	# SAVE THE CV SUMMARY AS A STATA DATASET
	CV_RES=pd.DataFrame(grid.cv_results_)[['mean_train_score','mean_test_score','std_test_score']]
	D=Macro.getLocal("cross_validation") 
	D=D+".dta"
	CV_RES.to_stata(D)

	# GET THE BEST PARAMETERS BY NAME AND PUT THEM INTO STATA SCALARS.
	# (The previous positional indexing of list(best_params_.values())
	# silently relied on the alphabetical key ordering of best_params_;
	# name-based lookup stores the same values robustly.)
	opt_learning_rate=grid.best_params_.get('learning_rate')
	opt_n_estimators=grid.best_params_.get('n_estimators')
	opt_max_depth=grid.best_params_.get('max_depth')
	Scalar.setValue('OPT_LEARNING_RATE',opt_learning_rate,vtype='visible')
	Scalar.setValue('OPT_N_ESTIMATORS',opt_n_estimators,vtype='visible')
	Scalar.setValue('OPT_MAX_DEPTH',opt_max_depth,vtype='visible')

	# TRAIN YOUR MODEL USING ALL DATA AND THE BEST KNOWN PARAMETERS
	model = GradientBoostingRegressor(learning_rate=opt_learning_rate, 
									  n_estimators=opt_n_estimators,
									  max_depth=opt_max_depth,
									  random_state=R)

	# FIT THE MODEL
	model.fit(X, y)

	# MAKE IN-SAMPLE PREDICTION FOR y AND PUT IT INTO STATA
	y_hat = model.predict(X)
	in_sample = pd.DataFrame(y_hat)
	D=Macro.getLocal("_____in_prediction") 
	D=D+".dta"
	in_sample.to_stata(D)

	# MAKE OUT-OF-SAMPLE PREDICTION FOR y USING A PREPARED DATASET
	D=Macro.getLocal("_____out_sample_x") 
	D=D+".dta"
	Xnew = pd.read_stata(D)
	ynew = model.predict(Xnew)

	# PUT "OUT" AS STATA DATASET
	# (NOTE: the first column is the prediction "y_hat")
	OUT = pd.DataFrame(ynew)
	D=Macro.getLocal("_____out_prediction") 
	D=D+".dta"
	OUT.to_stata(D)

	# PRINT GRID SEARCH RESULTS
	ALL_CV_RES=pd.DataFrame(grid.cv_results_)[['param_learning_rate','param_n_estimators','param_max_depth','mean_train_score','mean_test_score']]
	ALL_CV_RES.columns = ['Learning rate', 'N. of trees', 'Tree depth','Train accuracy', 'Test accuracy']
	ALL_CV_RES['Index'] = ALL_CV_RES.index
	column_names = ['Index','Learning rate','N. of trees','Tree depth','Train accuracy','Test accuracy']
	ALL_CV_RES = ALL_CV_RES.reindex(columns=column_names)
	print("==========================================================================")
	print(" Results of cross-validation grid search")
	print("==========================================================================")
	print(ALL_CV_RES.to_string(index=False))
	print("==========================================================================")
################################################################################


################################################################################
#! "r_ols()": OLS with CV 
#! Author: Giovanni Cerulli
#! Version: 7
#! Date: 02/05/2022
################################################################################
def r_ols():
	# OLS regression run through the ElasticNet/GridSearchCV machinery so its
	# output (CV dataset, scalars, predictions) matches the other learners.
	# With l1_ratio=1 and alpha=0 the ElasticNet penalty vanishes, so the fit
	# is ordinary least squares; the "grid search" only produces the CV
	# accuracy bookkeeping, not real tuning.

	# IMPORT NEEDED PACKAGES
	from sklearn.linear_model import ElasticNet
	from sklearn.model_selection import GridSearchCV
	from sfi import Macro, Scalar
	from sfi import Data , SFIToolkit
	import numpy as np
	import pandas as pd
	import os

	# SET THE WORKING DIRECTORY (passed from Stata)
	dir=Macro.getLocal("dir")
	os.chdir(dir)

	# SET THE TRAIN/TEST DATASET AND THE NEW-INSTANCES-DATASET
	dataset=Macro.getLocal("data_fitting")

	# LOAD A STATA DATASET LOCATED INTO THE DIRECTORY AS PANDAS DATAFRAME
	df = pd.read_stata(dataset,convert_categoricals=False)

	# DEFINE y THE TARGET VARIABLE (first column)
	y=df.iloc[:,0]

	# DEFINE X THE FEATURES (all remaining columns)
	X=df.iloc[:,1:]

	# INITIALIZE AN ELASTIC NET
	model = ElasticNet()

	# DEGENERATE GRIDS: every point is (l1_ratio=1, alpha=0), i.e. plain OLS.
	# NOTE(review): the 3x3 grid fits 9 identical models; kept as-is because
	# downstream code may expect the resulting cross_validation dataset shape.
	gridC=(1,1,1)
	gridG=(0,0,0)

	# PUT THE GENERATED GRIDS INTO A PYTHON DICTIONARY 
	param_grid = {'l1_ratio': gridC, 'alpha': gridG}

	# READ THE NUMBER OF CV-FOLDS "n_folds" FROM STATA
	n_folds=int(Macro.getLocal("n_folds"))

	# INSTANTIATE THE GRID
	grid = GridSearchCV(model, param_grid, cv=n_folds, scoring='explained_variance', return_train_score=True)

	# FIT THE GRID
	grid.fit(X, y)

	# SAVE THE CV SUMMARY AS A STATA DATASET
	CV_RES=pd.DataFrame(grid.cv_results_)[['mean_train_score','mean_test_score','std_test_score']]
	D=Macro.getLocal("cross_validation") 
	D=D+".dta"
	CV_RES.to_stata(D)

	# GET THE "BEST" PARAMETERS BY NAME AND PUT THEM INTO STATA SCALARS.
	# (The previous positional indexing of list(best_params_.values())
	# silently relied on the alphabetical key ordering 'alpha','l1_ratio';
	# name-based lookup stores the same values robustly.)
	opt_gamma=grid.best_params_.get('alpha')
	opt_c=grid.best_params_.get('l1_ratio')
	Scalar.setValue('OPT_ALPHA',opt_gamma,vtype='visible')
	Scalar.setValue('OPT_L1_RATIO',opt_c,vtype='visible')

	# TRAIN YOUR MODEL USING ALL DATA AND THE BEST KNOWN PARAMETERS
	# (alpha=0 makes this OLS; sklearn may warn and suggest LinearRegression)
	model = ElasticNet(l1_ratio=opt_c, alpha=opt_gamma)

	# FIT THE MODEL
	model.fit(X, y)

	# MAKE IN-SAMPLE PREDICTION FOR y AND PUT IT INTO STATA
	y_hat = model.predict(X)
	in_sample = pd.DataFrame(y_hat)
	D=Macro.getLocal("_____in_prediction") 
	D=D+".dta"
	in_sample.to_stata(D)

	# MAKE OUT-OF-SAMPLE PREDICTION FOR y USING A PREPARED DATASET
	D=Macro.getLocal("_____out_sample_x") 
	D=D+".dta"
	Xnew = pd.read_stata(D)
	ynew = model.predict(Xnew)

	# PUT "OUT" AS STATA DATASET
	# (NOTE: the first column is the prediction "y_hat")
	OUT = pd.DataFrame(ynew)
	D=Macro.getLocal("_____out_prediction") 
	D=D+".dta"
	OUT.to_stata(D)
################################################################################
end