********************************************************************************
*! "c_ml_stata_cv"
*! Author: Giovanni Cerulli
*! Version: 11
*! Date: 15 November 2022
********************************************************************************


********************************************************************************
* The program 'numlist_to_matrix' put a Stata "numlist" into a Stata "matrix" 
* with one row. We use this program to give Python parameters' grids  
********************************************************************************
cap prog drop numlist_to_matrix
program numlist_to_matrix , rclass
syntax , num_list(numlist min=1)
local nel : word count `num_list'
tempname A
matrix `A' = J(1,`nel',0)
forvalues i=1/`nel'{
		local el : word `i' of `num_list'
		mat `A'[1,`i']=`el'
}
return matrix M = `A'
end
********************************************************************************

*

********************************************************************************
* Display output for "c_tree"
********************************************************************************
cap program drop display_output_c_tree
program display_output_c_tree
syntax , [prediction] 

noi di "{hline 80}"
noi di in gr "{bf:Learner: Tree classification}"
di " "

noi di in gr "{ul:Dataset information}"
di " "
noi di "Target variable = " `""`e(dep_var)'""'    _continue
noi di _col(45) "Number of features  = " e(N_features)
noi di "N. of training units = " e(N_train_all) _continue
noi di _col(45) "N. of testing units = " e(N_test_all)
noi di "N. of used training units = " e(N_train_used) _continue
noi di _col(45) "N. of used testing units = " e(N_test_used)
noi di "{hline 80}"
di " "

noi di "{ul:Cross-validation results}"
di " "
noi di "Accuracy measure = rate correct matches" _continue
noi di _col(45) "Number of folds = " e(N_FOLDS)
noi di "Best grid index = " e(BEST_INDEX) _continue
noi di _col(45) "Optimal tree depth = " e(OPT_DEPTH)
noi di "Training accuracy = " e(TRAIN_ACCURACY) _continue
noi di _col(45) "Testing accuracy = " e(TEST_ACCURACY)
noi di "Std. err. test accuracy = " e(SE_TEST_ACCURACY)
noi di "{hline 80}"

if "`prediction'"!=""{
di " "
noi di in gr "{ul:Validation results}"
di " "
noi di in gr "CER = classification error rate" _continue
noi di in gr _col(45) "Training CER = " e(Train_err)
noi di in gr "Testing CER = " e(Test_err)
noi di " "
noi di "{hline 80}"
}
end 
********************************************************************************
*
********************************************************************************
* Display output for "c_svm"
********************************************************************************
cap program drop display_output_c_svm
program display_output_c_svm
syntax , [prediction] 

noi di "{hline 80}"
noi di in gr "{bf:Learner: Support Vector Machine classification}"
di " "

noi di in gr "{ul:Dataset information}"
di " "
noi di "Target variable = " `""`e(dep_var)'""'    _continue
noi di _col(45) "Number of features  = " e(N_features)
noi di "N. of training units = " e(N_train_all) _continue
noi di _col(45) "N. of testing units = " e(N_test_all)
noi di "N. of used training units = " e(N_train_used) _continue
noi di _col(45) "N. of used testing units = " e(N_test_used)
noi di "{hline 80}"
di " "

noi di "{ul:Cross-validation results}"
di " "
noi di "Accuracy measure = rate correct matches" _continue
noi di _col(45) "Number of folds = " e(N_FOLDS)
noi di "Best grid index = " e(BEST_INDEX) _continue
noi di _col(45) "Optimal C parameter = " e(OPT_C)
noi di "Optimal GAMMA parameter = " e(OPT_GAMMA) _continue
noi di _col(45) "Training accuracy = " e(TRAIN_ACCURACY)
noi di "Std. err. test accuracy = " e(SE_TEST_ACCURACY)
noi di "Testing accuracy = " e(TEST_ACCURACY)
noi di "{hline 80}"

if "`prediction'"!=""{
di " "
noi di in gr "{ul:Validation results}"
di " "
noi di in gr "CER = classification error rate" _continue
noi di in gr _col(45) "Training CER = " e(Train_err)
noi di in gr "Testing CER = " e(Test_err)
noi di " "
noi di "{hline 80}"
}
end 
********************************************************************************
*
********************************************************************************
* Display output for "c_randomforest"
********************************************************************************
cap program drop display_output_c_randomforest
program display_output_c_randomforest
syntax , [prediction] 

noi di "{hline 80}"
noi di in gr "{bf:Learner: Random Forest classification}"
di " "

noi di in gr "{ul:Dataset information}"
di " "
noi di "Target variable = " `""`e(dep_var)'""'    _continue
noi di _col(45) "Number of features  = " e(N_features)
noi di "N. of training units = " e(N_train_all) _continue
noi di _col(45) "N. of testing units = " e(N_test_all)
noi di "N. of used training units = " e(N_train_used) _continue
noi di _col(45) "N. of used testing units = " e(N_test_used)
noi di "{hline 80}"
di " "

noi di "{ul:Cross-validation results}"
di " "
noi di "Accuracy measure = rate correct matches" _continue
noi di _col(45) "Number of folds = " e(N_FOLDS)
noi di "Best grid index = " e(BEST_INDEX) _continue
noi di _col(45) "Optimal tree depth = " e(OPT_MAX_DEPTH)
noi di "Optimal n. of splitting features = " e(OPT_MAX_FEATURES) _continue
noi di _col(45) "Optimal n. of trees = " e(OPT_N_ESTIMATORS) 
noi di "Training accuracy = " e(TRAIN_ACCURACY) _continue
noi di _col(45) "Testing accuracy = " e(TEST_ACCURACY)
noi di "Std. err. test accuracy = " e(SE_TEST_ACCURACY)
noi di "{hline 80}"

if "`prediction'"!=""{
di " "
noi di in gr "{ul:Validation results}"
di " "
noi di in gr "CER = classification error rate" _continue
noi di in gr _col(45) "Training CER = " e(Train_err)
noi di in gr "Testing CER = " e(Test_err)
noi di " "
noi di "{hline 80}"
}
end 
********************************************************************************
*
********************************************************************************
* Display output for "c_neuralnet"
********************************************************************************
cap program drop display_output_c_neuralnet
program display_output_c_neuralnet
syntax , [prediction] 

noi di "{hline 80}"
noi di in gr "{bf:Learner: Neural Network classification}"
di " "

noi di in gr "{ul:Dataset information}"
di " "
noi di "Target variable = " `""`e(dep_var)'""'    _continue
noi di _col(45) "Number of features  = " e(N_features)
noi di "N. of training units = " e(N_train_all) _continue
noi di _col(45) "N. of testing units = " e(N_test_all)
noi di "N. of used training units = " e(N_train_used) _continue
noi di _col(45) "N. of used testing units = " e(N_test_used)
noi di "{hline 80}"
di " "

noi di "{ul:Cross-validation results}"
di " "
noi di "Accuracy measure = rate correct matches" _continue
noi di _col(45) "Number of folds = " e(N_FOLDS)
noi di "Best grid index = " e(BEST_INDEX) _continue
noi di _col(45) "Optimal n. of neurons in layer 1 = " e(OPT_NEURONS_L_1)
noi di "Optimal n. of neurons in layer 2 = " e(OPT_NEURONS_L_2) _continue
noi di _col(45) "Optimal L2 penalization = " e(OPT_ALPHA) 
noi di "Training accuracy = " e(TRAIN_ACCURACY) _continue
noi di _col(45) "Testing accuracy = " e(TEST_ACCURACY)
noi di "Std. err. test accuracy = " e(SE_TEST_ACCURACY)
noi di "{hline 80}"

if "`prediction'"!=""{
di " "
noi di in gr "{ul:Validation results}"
di " "
noi di in gr "CER = classification error rate" _continue
noi di in gr _col(45) "Training CER = " e(Train_err)
noi di in gr "Testing CER = " e(Test_err)
noi di " "
noi di "{hline 80}"
}
end 
********************************************************************************
*
********************************************************************************
* Display output for "c_nearestneighbor"
********************************************************************************
cap program drop display_output_c_nearestneighbor
program display_output_c_nearestneighbor
syntax , [prediction] 

noi di "{hline 80}"
noi di in gr "{bf:Learner: Nearest Neighbor classification}"
di " "

noi di in gr "{ul:Dataset information}"
di " "
noi di "Target variable = " `""`e(dep_var)'""'    _continue
noi di _col(45) "Number of features  = " e(N_features)
noi di "N. of training units = " e(N_train_all) _continue
noi di _col(45) "N. of testing units = " e(N_test_all)
noi di "N. of used training units = " e(N_train_used) _continue
noi di _col(45) "N. of used testing units = " e(N_test_used)
noi di "{hline 80}"
di " "

noi di "{ul:Cross-validation results}"
di " "
noi di "Accuracy measure = rate correct matches" _continue
noi di _col(45) "Number of folds = " e(N_FOLDS)
noi di "Best grid index = " e(BEST_INDEX) _continue
noi di _col(45) "Optimal n. of nearest neighbors = " e(OPT_NN)
noi di "Optimal kernel function = " "`e(OPT_WEIGHT)'" _continue
noi di _col(45) "Training accuracy = " e(TRAIN_ACCURACY)
noi di "Testing accuracy = " e(TEST_ACCURACY)
noi di "Std. err. test accuracy = " e(SE_TEST_ACCURACY)
noi di "{hline 80}"

if "`prediction'"!=""{
di " "
noi di in gr "{ul:Validation results}"
di " "
noi di in gr "CER = classification error rate" _continue
noi di in gr _col(45) "Training CER = " e(Train_err)
noi di in gr "Testing CER = " e(Test_err)
noi di " "
noi di "{hline 80}"
}
end 
********************************************************************************
*
********************************************************************************
* Display output for "c_boost"
********************************************************************************
cap program drop display_output_c_boost
program display_output_c_boost
syntax , [prediction] 

noi di "{hline 80}"
noi di in gr "{bf:Learner: Boosting classification}"
di " "

noi di in gr "{ul:Dataset information}"
di " "
noi di "Target variable = " `""`e(dep_var)'""'    _continue
noi di _col(45) "Number of features  = " e(N_features)
noi di "N. of training units = " e(N_train_all) _continue
noi di _col(45) "N. of testing units = " e(N_test_all)
noi di "N. of used training units = " e(N_train_used) _continue
noi di _col(45) "N. of used testing units = " e(N_test_used)
noi di "{hline 80}"
di " "

noi di "{ul:Cross-validation results}"
di " "
noi di "Accuracy measure = rate correct matches" _continue
noi di _col(45) "Number of folds = " e(N_FOLDS)
noi di "Best grid index = " e(BEST_INDEX) _continue
noi di _col(45) "Optimal learning rate = " e(OPT_LEARNING_RATE)
noi di "Optimal n. of trees = " e(OPT_N_ESTIMATORS) _continue
noi di _col(45) "Optimal tree depth = " e(OPT_MAX_DEPTH)
noi di "Training accuracy = " e(TRAIN_ACCURACY) _continue
noi di _col(45) "Testing accuracy = " e(TEST_ACCURACY)
noi di "Std. err. test accuracy = " e(SE_TEST_ACCURACY)
noi di "{hline 80}"

if "`prediction'"!=""{
di " "
noi di in gr "{ul:Validation results}"
di " "
noi di in gr "CER = classification error rate" _continue
noi di in gr _col(45) "Training CER = " e(Train_err)
noi di in gr "Testing CER = " e(Test_err)
noi di " "
noi di "{hline 80}"
}
end 
********************************************************************************
*
********************************************************************************
* Display output for "c_rmn"
********************************************************************************
cap program drop display_output_c_rmn
program display_output_c_rmn
syntax , [prediction] 

noi di "{hline 80}"
noi di in gr "{bf:Learner: Regularized Multinomial classification}"
di " "

noi di in gr "{ul:Dataset information}"
di " "
noi di "Target variable = " `""`e(dep_var)'""'    _continue
noi di _col(45) "Number of features  = " e(N_features)
noi di "N. of training units = " e(N_train_all) _continue
noi di _col(45) "N. of testing units = " e(N_test_all)
noi di "N. of used training units = " e(N_train_used) _continue
noi di _col(45) "N. of used testing units = " e(N_test_used)
noi di "{hline 80}"
di " "

noi di "{ul:Cross-validation results}"
di " "
noi di "Accuracy measure = rate correct matches" _continue
noi di _col(45) "Number of folds = " e(N_FOLDS)
noi di "Best grid index = " e(BEST_INDEX) _continue
noi di _col(45) "Optimal penalization parameter = " e(OPT_PENALIZATION)
noi di "Optimal elastic parameter = " e(OPT_L1_RATIO) _continue
noi di _col(45) "Training accuracy = " e(TRAIN_ACCURACY)
noi di "Testing accuracy = " e(TEST_ACCURACY)
noi di "Std. err. test accuracy = " e(SE_TEST_ACCURACY)
noi di "{hline 80}"

if "`prediction'"!=""{
di " "
noi di in gr "{ul:Validation results}"
di " "
noi di in gr "CER = classification error rate" _continue
noi di in gr _col(45) "Training CER = " e(Train_err)
noi di in gr "Testing CER = " e(Test_err)
noi di " "
noi di "{hline 80}"
}
end 
********************************************************************************
*
********************************************************************************
* Display output for "c_naivebayes"
********************************************************************************
cap program drop display_output_c_naivebayes
program display_output_c_naivebayes
syntax , [prediction] 

noi di "{hline 80}"
noi di in gr "{bf:Learner: Naive Bayes classification}"
di " "

noi di in gr "{ul:Dataset information}"
di " "
noi di "Target variable = " `""`e(dep_var)'""'    _continue
noi di _col(45) "Number of features  = " e(N_features)
noi di "N. of training units = " e(N_train_all) _continue
noi di _col(45) "N. of testing units = " e(N_test_all)
noi di "N. of used training units = " e(N_train_used) _continue
noi di _col(45) "N. of used testing units = " e(N_test_used)
noi di "{hline 80}"
di " "

noi di "{ul:Cross-validation results}"
di " "
noi di "Accuracy measure = rate correct matches" _continue
noi di _col(45) "Number of folds = " e(N_FOLDS)
noi di "Best grid index = " e(BEST_INDEX) _continue
noi di _col(45) "Optimal variance smoothing = " e(OPT_VAR_SMOOTHING)
noi di "Training accuracy = " e(TRAIN_ACCURACY) _continue
noi di _col(45) "Testing accuracy = " e(TEST_ACCURACY)
noi di "Std. err. test accuracy = " e(SE_TEST_ACCURACY)
noi di "{hline 80}"

if "`prediction'"!=""{
di " "
noi di in gr "{ul:Validation results}"
di " "
noi di in gr "CER = classification error rate" _continue
noi di in gr _col(45) "Training CER = " e(Train_err)
noi di in gr "Testing CER = " e(Test_err)
noi di " "
noi di "{hline 80}"
}
end 
********************************************************************************


********************************************************************************
* Display output for "c_multinomial"
********************************************************************************
cap program drop display_output_c_multinomial
program display_output_c_multinomial
syntax , [prediction] 

noi di "{hline 80}"
noi di in gr "{bf:Learner: Multinomial classification}"
di " "

noi di in gr "{ul:Dataset information}"
di " "
noi di "Target variable = " `""`e(dep_var)'""'    _continue
noi di _col(45) "Number of features  = " e(N_features)
noi di "N. of training units = " e(N_train_all) _continue
noi di _col(45) "N. of testing units = " e(N_test_all)
noi di "N. of used training units = " e(N_train_used) _continue
noi di _col(45) "N. of used testing units = " e(N_test_used)
noi di "{hline 80}"
di " "

noi di "{ul:Cross-validation results}"
di " "
noi di "Accuracy measure = rate correct matches" _continue
noi di _col(45) "Number of folds = " e(N_FOLDS)
noi di "Training accuracy = " e(TRAIN_ACCURACY) _continue
noi di _col(45) "Testing accuracy = " e(TEST_ACCURACY)
noi di "Std. err. test accuracy = " e(SE_TEST_ACCURACY)
noi di "{hline 80}"

if "`prediction'"!=""{
di " "
noi di in gr "{ul:Validation results}"
di " "
noi di in gr "CER = classification error rate" _continue
noi di in gr _col(45) "Training CER = " e(Train_err)
noi di in gr "Testing CER = " e(Test_err)
noi di " "
noi di "{hline 80}"
}
end 
********************************************************************************

*
*
*

********************************************************************************
*! "c_ml_stata_cv"
********************************************************************************
program c_ml_stata_cv , eclass
version 16
syntax varlist(numeric) [if] [in] , ///
mlmodel(string) ///
data_test(name)  ///
seed(numlist max=1 integer) ///
[cross_validation(name)  ///
n_folds(numlist max=1 integer >=2) ///
graph_cv save_graph_cv(name) ///
prediction(name) ///
tree_depth(numlist min=1 integer) ///
n_estimators(numlist min=1 integer) ///
learning_rate(numlist min=1) ///
max_features(numlist min=1 integer) ///
c(numlist min=1) ///
gamma(numlist min=1) ///
alpha(numlist min=1) ///
l1_ratio(numlist min=1) ///
nn(numlist min=1 integer) ///
n_neurons_l1(numlist min=1 integer) ///
n_neurons_l2(numlist min=1 integer) ///
default ///
]
if "`default'"!="" & "`prediction'"==""{
	di _newline
	di in red "*************************************************************"
	di in red "WARNING: Option 'prediction()' needed in the 'default' mode."
	di in red "*************************************************************"
	di _newline
break
exit	
}
********************************************************************************
if "`mlmodel'"!="multinomial" & "`mlmodel'"!="regmult" & "`mlmodel'"!="tree" & "`mlmodel'"!="randomforest" & "`mlmodel'"!="boost" & "`mlmodel'"!="nearestneighbor" & "`mlmodel'"!="neuralnet" & "`mlmodel'"!="svm" & "`mlmodel'"!="naivebayes" {
	di _newline
	di in red "*************************************************************************"
	di in red "WARNING: The argument of option 'mlmodel()' must be one of these:        "
	di in red "'multinomial', 'regmult', 'tree', 'randomforest', 'boost',"
	di in red "'nearestneighbor', 'neuralnet', 'svm', 'naivebayes'.                     "                                 
	di in red "*************************************************************************"
	di _newline
break
exit	
}
********************************************************************************
if "`c(os)'" == "MacOSX"{
	cap rm _____in_prediction.dta
	cap rm _____out_prediction.dta
	cap rm _____out_sample_y.dta
	cap rm _____out_sample_x.dta
}
else{
	cap erase _____in_prediction.dta
	cap erase _____out_prediction.dta
	cap erase _____out_sample_y.dta
	cap erase _____out_sample_x.dta
}
********************************************************************************
di ""
di as text "=== Begin Python dependencies ======================================================================="
pylearn , setup
di as text "=== End Python dependencies ========================================================================="
di ""
********************************************************************************
set varabbrev off
********************************************************************************
* DROP ALL THE LABEL VALUES AND PUT THEM IN A TEMPFILE
********************************************************************************
tempfile LABELS
qui: label save _all using `LABELS' , replace
qui: label drop _all
********************************************************************************
* SPLIT "varlist" INTO "target" AND "features"
********************************************************************************
gettoken y X : varlist
********************************************************************************
* CONFIRM THAT "y" IS AN INTEGER VARIABLE (INCLUDING "0")
********************************************************************************
cap confirm int variable `y'
if _rc {
	recast int `y' , force
}
********************************************************************************
* MARK THE SAMPLE TO USE
********************************************************************************
marksample touse
********************************************************************************
if "`default'"!=""{
c_ml_stata_default `varlist' if `touse' ,  ///
mlmodel("`mlmodel'")            ///
data_test("`data_test'")        ///
prediction("`prediction'")      ///
seed("`seed'") 
exit
}
if "`default'"=="" & ("`cross_validation'"=="" | "`n_folds'"==""){
	di _newline
	di in red "*************************************************************************"
	di in red "WARNING: It seems you want to run this command not in the 'default' mode."
	di in red "It means that you want to use cross-validation. If it is the case,       "                         	
	di in red "provide options 'cross_validation()' and 'n_folds()', plus               " 
	di in red "all the options required by the specific learner you wish to run.        " 
	di in red "On the contrary, if it is your intention to run the default more,        "	
	di in red "only add the option 'default'.                                           "	
	di in red "*************************************************************************"
	di _newline
break
exit
}
********************************************************************************
* WARNING
********************************************************************************
cap confirm file "`cross_validation'.dta"
if _rc==0 {
	di _newline
	di in red "******************************************************************************"
	di in red "WARNING: A file named `cross_validation'.dta exists in your working directory." 
	di in red "Please, change name to this dataset or delete it before running this command. "   
	di in red "******************************************************************************"
	di _newline
break
exit
}
********************************************************************************
* SAVE THE INITIAL DATASET
********************************************************************************
tempfile data_initial 
qui count
local N_train_all=r(N)
tempvar __id
gen `__id'=_n
qui: save `data_initial' , replace
********************************************************************************
* SELECT THE TRAINING SAMPLE
********************************************************************************
qui: keep if `touse'
preserve
qui keep `__id'
tempfile data_id
qui save `data_id' , replace
restore
********************************************************************************
* WARNING:
********************************************************************************
cap confirm file "_____out_sample_y.dta"
if _rc==0 {
di _newline
di in red "*****************************************************************************"
di in red "WARNING: A file named '_____out_sample_y.dta' exists in your working directory.   " 
di in red "Please, change name to this dataset or delete it before running this command."   
di in red "*****************************************************************************"
di _newline
break
exit
}
********************************************************************************
* WARNING:
********************************************************************************
cap confirm file "_____out_sample_x.dta"
if _rc==0 {
di _newline
di in red "*****************************************************************************"
di in red "WARNING: A file named '_____out_sample_x.dta' exists in your working directory.   " 
di in red "Please, change name to this dataset or delete it before running this command."   
di in red "*****************************************************************************"
di _newline
break
exit
}
********************************************************************************
* FORM THE TESTING TARGET VARIABLE
********************************************************************************
local _____out_sample_y "_____out_sample_y"
preserve
qui: use `data_test' , clear
qui: keep `y' 
qui save `_____out_sample_y' , replace
restore
*
********************************************************************************
* WARNING:
********************************************************************************
local _____out_sample_x "_____out_sample_x"
preserve
qui: use `data_test' , clear
capture{
	qui: keep `X' 	
}
local rc=_rc 
if `rc'==111{
	di _newline
	di in red "******************************************************************"	 	
	di in red "WARNING: Your testing dataset does not contain the same features  "
	di in red "of your training dataset. It is required that your testing dataset" 
	di in red "owns all the features declared in varlist.                        "
	di in red "******************************************************************"	 	
	break 
	exit    
}
label drop _all
qui save `_____out_sample_x' , replace
restore
********************************************************************************
local k: word count `X'
preserve
qui: use "`_____out_sample_x'" , clear
qui: des 
local p=r(k)
restore
if `p'!=`k'{
	di _newline
	di in red "*******************************************************************"	 	
	di in red "WARNING: Your testing set has `p' features, while your training set"
	di in red "has `k' features (those listed in 'varlist').                      " 
	di in red "The two sets must have the same number of features.                "  
	di in red "*******************************************************************"	 	
	break 
	exit                              
}
else{
	preserve
	qui: keep `X'
	order _all , alpha
	qui: ds
	local X_train_sort `r(varlist)'
	qui use "`_____out_sample_x'" , clear
	order _all , alpha
	qui: ds
	local X_test_sort `r(varlist)'
	order `X_test_sort'
	local i=1
	foreach v of local X_train_sort{
		local h: word `i' of `X_test_sort'
		if "`v'"!="`h'"{
        di _newline
	    di in red "***********************************************************************************"	 	
        di in red "WARNING: The predictors in the testing set do not match those in 'varlist'.        "
        di in red "Please, let the testing set only contain the sole predictors declared in 'varlist'."
	    di in red "***********************************************************************************"
		di _newline
		break 
		exit 1
		}
		local i=`i'+1
	}
	restore
}
********************************************************************************
* WARNING
********************************************************************************
foreach v of local X{
     capture confirm string variable `v'
         if !_rc {di _newline
				  di in red "*********************************************************************************"	 	
                  di in red "WARNING: In your training set of predictors, variable '`v'' is a string variable."
				  di in red "Please, make it numerical. If this variable is categorical,                      "
				  di in red "please generate the categorical binary dummies and use them as predictors        "
				  di in red "in place of variable '`v''.                                                      "  
				  di in red "*********************************************************************************"
				  di _newline
                  break
                  exit				  
				}
        }
********************************************************************************
* WARNING
********************************************************************************
foreach v of local y{
     capture confirm string variable `v'
         if !_rc {di _newline
				  di in red "*******************************************************************************"	 	
                  di in red "WARNING: In your training set the target variable '`v'' is a string variable.  "
				  di in red "Please, make it numerical. As this is a classification setting,                "
				  di in red "please recode this variable so to take values [1,2,...,M] in a M-class setting."  
				  di in red "if it is binary, please recode it so to take values [1,2].                     "
				  di in red "*******************************************************************************"	 	
				  di _newline
                  break
                  exit
                }
        }
********************************************************************************
* WARNING
********************************************************************************
preserve
qui: use `_____out_sample_x', clear
foreach v of local X{
     capture confirm string variable `v'
         if !_rc {
		 	      di _newline
				  di in red "********************************************************************************"
                  di in red "WARNING: In your testing set of predictors, variable '`v'' is a string variable."
				  di in red "Please, make it numerical. If this variable is categorical,                     "
				  di in red "please generate the categorical binary dummies and use them as predictors       "  
				  di in red "in place of variable '`v''.                                                     "  
				  di in red "********************************************************************************"
				  di _newline
                  break
                  exit               		
				}
        }
restore
********************************************************************************
* WARNING
********************************************************************************
preserve
qui: use `_____out_sample_y', clear
foreach v of local y{
     capture confirm string variable `v'
         if !_rc {
		 	      di _newline
				  di in red "*******************************************************************************"
                  di in red "WARNING: In your testing set the target variable '`v'' is a string variable.   "
				  di in red "Please, make it numerical. As this is a classification setting,                "
				  di in red "please recode this variable so to take values [1,2,...,M] in a M-class setting."  
				  di in red "if it is binary, please recode it so to take values [1,2].                     "
				  di in red "*******************************************************************************"
				  di _newline
                  break
                  exit
                }
        }
restore
********************************************************************************
* COUNT THE USED TESTING OBSERVATIONS AND ELIMINATE MISSINGS FROM "_____out_sample_x"
********************************************************************************
preserve 
qui: use `_____out_sample_x', clear
tempfile _____out_sample_x_initial
qui: save `_____out_sample_x_initial', replace
qui: use `_____out_sample_x_initial' , clear
tempvar __id2
qui gen `__id2'= _n
qui save `_____out_sample_x_initial', replace
restore
********************************************************************************
preserve
qui: use `_____out_sample_x_initial' , clear
qui reg _all
qui: keep if e(sample)
keep `__id2'
tempfile data_id2
qui save `data_id2', replace
restore
********************************************************************************
preserve
qui: use `_____out_sample_x', clear
qui count 
local N_test_all=r(N)
qui reg _all
qui count if e(sample)
local N_test_used=r(N)
qui: keep if e(sample)
qui: save `_____out_sample_x', replace
restore
********************************************************************************
* WARNING
********************************************************************************
capture confirm variable index
if !_rc {
	di _newline
	di in red "************************************************"
	di in red "WARNING: One of your variables is names 'index'." 
	di in red "Please, change name to this variable.           "   
	di in red "************************************************"
	di _newline
	break
	exit
}
********************************************************************************
* WARNING
********************************************************************************
cap confirm file "_____in_prediction.dta"
if _rc==0 {
	di _newline
	di in red "*****************************************************************************"
	di in red "WARNING: A file named '_____in_prediction.dta' exists in your working directory.  " 
	di in red "Please, change name to this dataset or delete it before running this command."   
	di in red "*****************************************************************************"
	di _newline
break
exit
}
********************************************************************************
cap confirm file "_____out_prediction.dta"
if _rc==0 {
	di _newline
	di in red "*****************************************************************************"
	di in red "WARNING: A file named '_____out_prediction.dta' exists in your working directory. " 
	di in red "Please, change name to this dataset or delete it before running this command."   
	di in red "*****************************************************************************"
	di _newline
	break
	exit
}
********************************************************************************
local _____in_prediction "_____in_prediction"
local _____out_prediction "_____out_prediction"
********************************************************************************
*
********************************************************************************
* COMPUTE THE SAMPLE SIZE OF THE TRAINING DATASET. 
* THEN, MAKE THE ORDERING OF THE "X" IN THE TESTING DATASET THE SAME 
* AS THE ORDERING OF THE "X" IN THE TRAINING DATASET. 
********************************************************************************
preserve
qui count if `touse'
local N_train_used=r(N)
local N_features: word count `X'
qui: use `_____out_sample_x' , clear
order `X'  // same order of the X in the training and testing datasets
qui: save `_____out_sample_x'  , replace
restore
********************************************************************************
* PASS THE STATA DIRECTORY TO PYTHON
********************************************************************************
local dir `c(pwd)'
********************************************************************************
* SELECT AND ORDER THE VARIABLES IN THE TRAINING DATASET
********************************************************************************
keep `y' `X'
order `y' `X'
********************************************************************************
* WARNING
********************************************************************************
qui{
preserve
keep `y' `X'
order `y' `X'
qui count 
local NN=r(N)
qui reg `y' `X'
keep if e(sample)
qui count 
local SS=r(N)
restore
}
********************************************************************************
if `SS'!=`NN'{
	di _newline
	di in red "********************************************************************"
	di in red "WARNING: It seems there are missing values in your training dataset,"
	di in red "either in your target variable, or in your predictors.              "
	di in red "Please, check and remove them. Then, re-run this command.           "
	di in red "********************************************************************"
	di _newline
	exit
}
********************************************************************************
* WARNING 
********************************************************************************
qui{
preserve
qui: use `_____out_sample_x' , clear
qui count 
local NN=r(N)
qui reg _all
keep if e(sample)
qui count 
local SS=r(N)
restore
}
********************************************************************************
if `SS'!=`NN'{
	di _newline
	di in red "***********************************************************************************"
	di in red "WARNING: It seems there are missing values in the features of your testing dataset."
	di in red "Please, check and remove them. Then, re-run this command.                          "
	di in red "***********************************************************************************"
	di _newline
	exit
}
*
********************************************************************************
* ESTIMATION PROCEDURE
********************************************************************************
* SAVE THE DATASET AS IT IS
tempfile data_fitting 
********************************************************************************
* USE "levelsof" FOR RENAMING PREDICTION PROBABILITIES LATER
********************************************************************************
qui: levelsof `y' , local(ylevels) clean 
********************************************************************************
* SAVE "data_fitting"
********************************************************************************
qui: save `data_fitting' , replace
********************************************************************************

di in red "=== Begin Python warnings ==========================================================================="

********************************************************************************
* PYTHON CODE - BEGIN 
********************************************************************************
else if "`mlmodel'"=="multinomial"{
python: c_multinomial()
preserve
qui: use `_____in_prediction' , clear
if "`prediction'"!=""{
rename _0 `prediction' 
}
qui: save `_____in_prediction' , replace
restore
ereturn clear
}
********************************************************************************
else if "`mlmodel'"=="naivebayes"{
python: c_naivebayes()
preserve
qui: use `_____in_prediction' , clear
if "`prediction'"!=""{
rename _0 `prediction' 
}
qui: save `_____in_prediction' , replace
restore
ereturn clear
ereturn scalar OPT_VAR_SMOOTHING=OPT_VAR_SMOOTHING
}
********************************************************************************
else if "`mlmodel'"=="regmult"{
	
	if "`l1_ratio'"!="" & "`alpha'"!=""{
	numlist_to_matrix , num_list(`l1_ratio')
	mat mat_l1_ratio = r(M)	

	numlist_to_matrix , num_list(`alpha')
	mat mat_alpha = r(M)
		
	}
else{
	di _newline
	di in red "*******************************************************************"
	di in red "WARNING: Regularized Multinomial requires to specify these options:" 
	di in red "(1) 'alpha()'; (2) 'l1_ratio()'.                                   "   
	di in red "*******************************************************************"
	di _newline
break
exit
}
python: c_rmn()
preserve
use `_____in_prediction' , clear
if "`prediction'"!=""{
rename _0 `prediction' 
}
qui: save `_____in_prediction' , replace
restore
ereturn clear
ereturn scalar OPT_PENALIZATION=OPT_PENALIZATION
ereturn scalar OPT_L1_RATIO=OPT_L1_RATIO
}
********************************************************************************
if "`mlmodel'"=="boost"{
	if "`n_estimators'"!="" & "`tree_depth'"!="" & "`learning_rate'"!=""{
	numlist_to_matrix , num_list(`n_estimators')
	mat mat_n_estmators = r(M)	

	numlist_to_matrix , num_list(`tree_depth')
	mat mat_max_depth = r(M)

	numlist_to_matrix , num_list(`learning_rate')
	mat mat_learning_rate = r(M)		
	}
else{
	di _newline
	di in red "****************************************************************"
	di in red "WARNING: Boosting requires to specify these options:            " 
	di in red "(1) 'n_estimators()'; (2) 'tree_depth()'; (3) 'learning_rate()'."   
	di in red "****************************************************************"
	di _newline
break
exit
}
python: c_boost()
preserve
use `_____in_prediction' , clear
if "`prediction'"!=""{
rename _0 `prediction' 
}
qui: save `_____in_prediction' , replace
restore
ereturn clear
ereturn scalar OPT_LEARNING_RATE=OPT_LEARNING_RATE
ereturn scalar OPT_N_ESTIMATORS=OPT_N_ESTIMATORS
ereturn scalar OPT_MAX_DEPTH=OPT_MAX_DEPTH
}
********************************************************************************
else if "`mlmodel'"=="nearestneighbor"{
	if "`nn'"!=""{ 
	numlist_to_matrix , num_list(`nn')
	mat mat_n_neighbor = r(M)
	}
else{
	di _newline
	di in red "****************************************************************"
	di in red "WARNING: Nearest Neighbor requires to specify this option:      " 
	di in red "(1) 'nn()'.                                                     "   
	di in red "****************************************************************"
	di _newline
break
exit	
}
python: c_nearestneighbor()
preserve
use `_____in_prediction' , clear
if "`prediction'"!=""{
rename _0 `prediction' 
}
qui: save `_____in_prediction' , replace
restore
ereturn clear
ereturn scalar OPT_NN=OPT_NN
ereturn local OPT_WEIGHT "$OPT_WEIGHT"
}
********************************************************************************
else if "`mlmodel'"=="neuralnet"{
	if "`n_neurons_l1'"!="" & "`n_neurons_l1'"!="" & "`alpha'"!=""{ 
		numlist_to_matrix , num_list(`n_neurons_l1')
		mat mat_n_neurons_l1 = r(M)	

		numlist_to_matrix , num_list(`n_neurons_l2')
		mat mat_n_neurons_l2 = r(M)	
		
		numlist_to_matrix , num_list(`alpha')
		mat mat_alpha = r(M)	
    }
else{
	di _newline
	di in red "**********************************************************"
	di in red "WARNING: Neural Network requires to specify these options:" 
	di in red "(1) 'n_neurons_l1()'; (2) 'n_neurons_l2()'; (3) 'alpha()'."   
	di in red "**********************************************************"
	di _newline
break
exit
}
python: c_neuralnet()
preserve
use `_____in_prediction' , clear
if "`prediction'"!=""{
rename _0 `prediction' 
}
qui: save `_____in_prediction' , replace
restore
ereturn clear
ereturn scalar OPT_NEURONS_L_1=OPT_NEURONS_L_1
ereturn scalar OPT_NEURONS_L_2=OPT_NEURONS_L_2
ereturn scalar OPT_ALPHA=OPT_ALPHA
}
********************************************************************************
else if "`mlmodel'"=="randomforest"{
	if "`n_estimators'"!="" & "`tree_depth'"!="" & "`max_features'"!=""{
	numlist_to_matrix , num_list(`n_estimators')
	mat mat_n_estmators = r(M)	

	numlist_to_matrix , num_list(`tree_depth')
	mat mat_tree_depth = r(M)

	numlist_to_matrix , num_list(`max_features')
	mat mat_max_features = r(M)
	}
else{
	di _newline
	di in red "**************************************************************"
	di in red "WARNING: Random Forest requires to specify these options:     " 
	di in red "(1) 'n_estimators()'; (2) 'tree_depth()'; (3) 'max_features()'"   
	di in red "**************************************************************"
	di _newline
break
exit	
}
python: c_randomforest()
preserve
use `_____in_prediction' , clear
if "`prediction'"!=""{
rename _0 `prediction' 
}
qui: save `_____in_prediction' , replace
restore
ereturn clear
ereturn scalar OPT_MAX_DEPTH=OPT_MAX_DEPTH
ereturn scalar OPT_MAX_FEATURES=OPT_MAX_FEATURES
ereturn scalar OPT_N_ESTIMATORS=OPT_N_ESTIMATORS
}
********************************************************************************
else if "`mlmodel'"=="svm"{
	if "`c'"!="" & "`gamma'"!="" {
	numlist_to_matrix , num_list(`c')
	mat mat_c = r(M)	

	numlist_to_matrix , num_list(`gamma')
	mat mat_gamma = r(M)
	}
else{
	di _newline
	di in red "******************************************************************"
	di in red "WARNING: Support Vector Machine requires to specify these options:" 
	di in red "(1) 'c()'; (2) 'gamma()'.                                         "   
	di in red "******************************************************************"
	di _newline
break
exit	
}
python: c_svm()
preserve
use `_____in_prediction' , clear
if "`prediction'"!=""{
rename _0 `prediction' 
}
qui: save `_____in_prediction' , replace
restore
ereturn clear
ereturn scalar OPT_C=OPT_C
ereturn scalar OPT_GAMMA=OPT_GAMMA
}
********************************************************************************
else if "`mlmodel'"=="tree"{
	if "`tree_depth'"!=""{
	numlist_to_matrix , num_list(`tree_depth')
	mat mat_tree_depth = r(M)	
    }
else{
	di _newline
	di in red "*********************************************************"
	di in red "WARNING: Regression tree requires to specify this option:" 
	di in red "(1) 'tree_depth()'                                       "   
	di in red "**********************************************************"
	di _newline
break
exit	
}	
python: c_tree()
preserve
use `_____in_prediction' , clear
if "`prediction'"!=""{
rename _0 `prediction' 
}
qui: save `_____in_prediction' , replace
restore
ereturn clear
ereturn scalar OPT_DEPTH=OPT_LEAVES
}
*
********************************************************************************
di in red "=== End Python warnings ============================================================================="
di _newline
di as text "=== Begin Stata output =============================================================================="
********************************************************************************
* STORE RESULTS
******************************************************************************** 
preserve
********************************************************************************
qui: use `cross_validation' , clear
qui sum mean_test_score
scalar max_score_test=r(max)
ereturn scalar TEST_ACCURACY=max_score_test
********************************************************************************
qui sum mean_train_score if mean_test_score==max_score_test
scalar score_train=r(mean)
ereturn scalar TRAIN_ACCURACY=score_train
********************************************************************************
qui sum index if mean_test_score==max_score_test
scalar max_index=r(mean)
ereturn scalar BEST_INDEX=int(max_index)
ereturn scalar SE_TEST_ACCURACY=std_test_score[max_index+1]
ereturn scalar N_FOLDS=`n_folds'
********************************************************************************
restore
********************************************************************************
preserve
if "`prediction'"!=""{
qui: use `_____out_prediction' , clear
rename _0 `prediction' 
}
qui: save `_____out_prediction' , replace
restore
********************************************************************************
* CROSS-VALIDATION GRAPH
********************************************************************************
qui: use `cross_validation' , clear
local A=int(max_index)
if "`graph_cv'"!=""{
tw ///
(line mean_test_score index  , xline(`A',lp(dash) lw(thick))) ///
(line mean_train_score index ) , ///
legend(order(1 "TEST ACCURACY" 2 "TRAIN ACCURACY")) ///
note("Learner = `mlmodel'" "Optimal index = `A'" "Number of folds = `n_folds'") ///
ytitle(Accuracy) xtitle(Index) ///
graphregion(fcolor(white)) scheme(s2mono)
}
********************************************************************************
if "`save_graph_cv'"!=""{
set graph off
tw ///
(line mean_test_score index  , xline(`A',lp(dash) lw(thick))) ///
(line mean_train_score index ) , ///
legend(order(1 "TEST ACCURACY" 2 "TRAIN ACCURACY")) ///
note("Learner = `mlmodel'" "Optimal index = `A'" "Number of folds = `n_folds'") ///
ytitle(Accuracy) xtitle(Index) ///
graphregion(fcolor(white)) scheme(s2mono)
qui: graph save `save_graph_cv' , replace
set graph on	
}
********************************************************************************
if "`prediction'"!=""{
********************************************************************************
* WARNING
********************************************************************************
capture confirm variable _train_index
if !_rc {
di _newline
di in red "*****************************************************************************"
di in red "WARNING: One of your variables in the dataset is named '_train_index'.        " 
di in red "Please, change name to this variable before running this command,            "
di in red "as this name is used for the identifier of training and testing observations."     
di in red "*****************************************************************************"
di _newline
break
exit
}
********************************************************************************	
qui{ // start quietly
tempfile _____out_sample_x_y
qui: use `_____out_sample_x'
qui merge 1:1 _n using `_____out_sample_y'
cap drop _merge
qui merge 1:1 _n using `_____out_prediction'
cap drop _merge
qui merge 1:1 _n using `data_id2'
cap drop _merge
drop if `__id2'==.
qui save `_____out_sample_x_y'
********************************************************************************
preserve
qui: use `_____out_sample_x_initial', clear
merge 1:1 `__id2' using `_____out_sample_x_y'
cap drop _merge
save `_____out_sample_x_initial', replace
restore
********************************************************************************
qui: use `data_initial' , clear
preserve
qui: use `_____in_prediction' , clear
merge 1:1 _n using `data_id'
cap drop _merge
qui: save `_____in_prediction' , replace
restore
qui: merge 1:1 `__id' using `_____in_prediction'
cap drop _merge
cap drop _train_index
gen _train_index = "train"
********************************************************************************
preserve
qui use `_____out_sample_x_initial' , clear
merge 1:1 _n using `data_test'
cap drop _merge
qui save `_____out_sample_x_initial' , replace
restore
********************************************************************************
* Append the dataset "`_____out_sample_x_initial'"
********************************************************************************
append using `_____out_sample_x_initial'
replace _train_index = "test" if _train_index==""
********************************************************************************
* Compute validation train-ERR (Error rate)
tempvar v_train_err 
gen `v_train_err'= (`y'!=`prediction') if (_train_index=="train" & `y'!=. & `prediction'!=.)  
qui sum `v_train_err' 
ereturn scalar Train_err = r(mean)

* Compute validation test-ERR (Error rate)
tempvar v_test_err 
gen `v_test_err'= (`y'!=`prediction') if (_train_index=="test" & `y'!=. & `prediction'!=.)
qui sum `v_test_err' 
ereturn scalar Test_err = r(mean)
} // end quietly
}
else{
qui: use `data_initial' , clear
} 
********************************************************************************
* Display output
********************************************************************************
ereturn local dep_var "`y'"
ereturn scalar N_features=`N_features' 
ereturn scalar N_train_all=`N_train_all' 
ereturn scalar N_test_all=`N_test_all'  
ereturn scalar N_train_used=`N_train_used' 
ereturn scalar N_test_used=`N_test_used' 
********************************************************************************
* Tree
********************************************************************************
if "`prediction'"==""{
	if "`mlmodel'"=="tree"{
    display_output_c_tree 
	}
}
else if "`prediction'"!=""{
	if "`mlmodel'"=="tree"{
    display_output_c_tree , prediction	
	}
}
********************************************************************************
* Elasticnet
********************************************************************************
if "`prediction'"==""{
	if "`mlmodel'"=="elasticnet"{
    display_output_c_elasticnet 
	}
}
else if "`prediction'"!=""{
	if "`mlmodel'"=="elasticnet"{
    display_output_c_elasticnet , prediction	
	}
}
********************************************************************************
* SVM
********************************************************************************
if "`prediction'"==""{
	if "`mlmodel'"=="svm"{
    display_output_c_svm 
	}
}
else if "`prediction'"!=""{
	if "`mlmodel'"=="svm"{
    display_output_c_svm , prediction	
	}
}
********************************************************************************
* Random forests
********************************************************************************
if "`prediction'"==""{
	if "`mlmodel'"=="randomforest"{
    display_output_c_randomforest 
	}
}
else if "`prediction'"!=""{
	if "`mlmodel'"=="randomforest"{
    display_output_c_randomforest , prediction	
	}
}
********************************************************************************
* Neural network
********************************************************************************
if "`prediction'"==""{
	if "`mlmodel'"=="neuralnet"{
    display_output_c_neuralnet
	}
}
else if "`prediction'"!=""{
	if "`mlmodel'"=="neuralnet"{
    display_output_c_neuralnet , prediction	
	}
}
********************************************************************************
* Nearest Neighbor
********************************************************************************
if "`prediction'"==""{
	if "`mlmodel'"=="nearestneighbor"{
    display_output_c_nearestneighbor
	}
}
else if "`prediction'"!=""{
	if "`mlmodel'"=="nearestneighbor"{
    display_output_c_nearestneighbor , prediction	
	}
}
********************************************************************************
* Boosting
********************************************************************************
if "`prediction'"==""{
	if "`mlmodel'"=="boost"{
    display_output_c_boost
	}
}
else if "`prediction'"!=""{
	if "`mlmodel'"=="boost"{
    display_output_c_boost , prediction	
	}
}
********************************************************************************
* Regularized Multinomial
********************************************************************************
if "`prediction'"==""{
	if "`mlmodel'"=="regmult"{
    display_output_c_rmn
	}
}
else if "`prediction'"!=""{
	if "`mlmodel'"=="regmult"{
    display_output_c_rmn , prediction	
	}
}
********************************************************************************
* Naive Bayes
********************************************************************************
if "`prediction'"==""{
	if "`mlmodel'"=="naivebayes"{
    display_output_c_naivebayes
	}
}
else if "`prediction'"!=""{
	if "`mlmodel'"=="naivebayes"{
    display_output_c_naivebayes , prediction	
	}
}
********************************************************************************
* Multinomial
********************************************************************************
if "`prediction'"==""{
	if "`mlmodel'"=="multinomial"{
    display_output_c_multinomial
	}
}
else if "`prediction'"!=""{
	if "`mlmodel'"=="multinomial"{
    display_output_c_multinomial , prediction	
	}
}
********************************************************************************
di as text "=== End Stata output ================================================================================"
********************************************************************************
cap drop index
********************************************************************************
if "`c(os)'" == "MacOSX"{
	cap rm `_____in_prediction'.dta
	cap rm `_____out_prediction'.dta
	cap rm `_____out_sample_y'.dta
	cap rm `_____out_sample_x'.dta
}
else{
	cap erase `_____in_prediction'.dta
	cap erase `_____out_prediction'.dta
	cap erase `_____out_sample_y'.dta
	cap erase `_____out_sample_x'.dta
}
********************************************************************************
set varabbrev on
********************************************************************************
qui: do `LABELS'
********************************************************************************
* RENAME PREDICTION PROBABILITIES
********************************************************************************
local i=1
foreach num of local ylevels{
	rename _`i' `prediction'_prob`num' 
	local i=`i'+1
}
********************************************************************************
end 
********************************************************************************

*
*
*

********************************************************************************
* PYTHON FUNCTIONS
********************************************************************************
*
python:

# ******************************************************************************
# * TITLE:  "DECISION TREE CLASSIFIER USING CROSS-VALIDATION" 
# * DATE:   02/05/2022
# * AUTHOR: GIOVANNI CERULLI
# *****************************************************************************
# * USE THE "scikit-learn" PYTHON PACKAGE, METHOD: "DecisionTreeClassifier()"
# *****************************************************************************
def c_tree():
	# IMPORT THE NEEDED PYTHON PACKAGES
	from sklearn.tree import DecisionTreeClassifier
	from sklearn.model_selection import GridSearchCV
	from sfi import Macro, Scalar , Matrix
	from sfi import Data , SFIToolkit
	import numpy as np
	import pandas as pd
	import os

	# SET THE DIRECTORY
	dir=Macro.getLocal("dir")
	os.chdir(dir)

	# SET THE TRAIN/TEST DATASET AND THE NEW-INSTANCES-DATASET
	dataset=Macro.getLocal("data_fitting")

	# LOAD A STATA DATASET LOCATED INTO THE DIRECTORY AS PANDAS DATAFRAME
	df = pd.read_stata(dataset,convert_categoricals=False)

	# DEFINE y THE TARGET VARIABLE
	y=df.iloc[:,0]

	# DEFINE X THE FEATURES
	X=df.iloc[:,1::]

	# READ THE "SEED" FROM STATA
	R=int(Macro.getLocal("seed"))

	# INITIALIZE a TREE (with the "number of leaves" parameter=5)
	model=DecisionTreeClassifier(max_depth=5,random_state=R)

	# DEFINE THE PARAMETER VALUES THAT SHOULD BE SEARCHED
	tree_depth = Matrix.get("mat_tree_depth")
	k_range=tree_depth[0]
	# k_range = list(range(1,16))
	k_range = [int(x) for x in k_range]

	# CREATE A PARAMETER GRID: 
	# MAP THE PARAMETER NAMES TO THE VALUES THAT SHOULD BE SEARCHED
	param_grid = dict(max_depth=k_range)

	# READ THE NUMBER OF CV-FOLDS "n_folds" FROM STATA
	n_folds=int(Macro.getLocal("n_folds"))

	# INSTANTIATE THE GRID
	grid = GridSearchCV(model, param_grid, cv=n_folds, scoring='accuracy', return_train_score=True)

	# FIT OVER THE GRID
	grid.fit(X, y)

	# VIEW THE RESULTS 
	CV_RES=pd.DataFrame(grid.cv_results_)[['mean_train_score','mean_test_score','std_test_score']]
	D=Macro.getLocal("cross_validation") 
	D=D+".dta"
	CV_RES.to_stata(D)

	# PUT OPTIMAL PARAMETER(S) INTO STATA SCALAR(S)
	params_values=list(grid.best_params_.values()) 

	# STORE THE BEST NUMBER OF NEIGHBORS INTO A STATA SCALAR
	opt_leaves=grid.best_params_.get('max_depth')
	Scalar.setValue('OPT_LEAVES',opt_leaves,vtype='visible')

	# STORE THE BEST PARAMETER INTO A VARIABLE
	opt_tt=grid.best_params_.get('max_depth')

	# USING THE BEST PARAMETER TO MAKE PREDICTIONS
	# TRAIN YOUR MODEL USING ALL DATA AND THE BEST PARAMETERS
	model = DecisionTreeClassifier(max_depth=opt_tt,random_state=R)

	# FIT THE MODEL
	model.fit(X, y)

	# MAKE IN-SAMPLE PREDICTION FOR y and prob, AND PUT IT INTO A DATAFRAME
	y_hat = model.predict(X)
	prob = model.predict_proba(X)

	# STACK THE PREDICTIONS
	in_sample=np.column_stack((y_hat,prob))
	in_sample = pd.DataFrame(in_sample)
					
	# GET RESULTS INTO STATA
	# (NOTE: the first column is the prediction "y_hat")
	
	D=Macro.getLocal("_____in_prediction") 
	D=D+".dta"
	in_sample.to_stata(D)

	# MAKE OUT-OF-SAMPLE "LABEL" PREDICTION FOR y USING A PREPARED DATASET
	D=Macro.getLocal("_____out_sample_x") 
	D=D+".dta"
	Xnew = pd.read_stata(D)
	ynew = model.predict(Xnew)

	# MAKE OUT-OF-SAMPLE "PROBABILITY" PREDICTION FOR y USING A PREPARED DATASET
	prob_new = model.predict_proba(Xnew)
	Prob_new  = pd.DataFrame(prob_new )

	# EXPORT LABEL PREDICTION FOR y INTO AN EXCEL FILE
	Ynew = pd.DataFrame(ynew)

	# MERGE LABEL AND PROBABILITY PREDICTION FOR y INTO AN EXCEL FILE
	# Use "numpy" to stack by column 'ynew' and 'prob_new'
	out=np.column_stack((ynew,prob_new))

	# GENERATE A DATAFRAME 'OUT' FROM THE "out" ARRAY
	OUT = pd.DataFrame(out)
					
	# PUT "OUT" INTO STATA
	# (NOTE: the first column is the prediction "y_hat")
	D=Macro.getLocal("_____out_prediction") 
	D=D+".dta"
	OUT.to_stata(D)
	
	# PRINT GRID SEARCH RESULTS
	ALL_CV_RES=pd.DataFrame(grid.cv_results_)[['param_max_depth','mean_train_score','mean_test_score']]
	ALL_CV_RES.columns = ['Tree depth', 'Train accuracy', 'Test accuracy']
	ALL_CV_RES['Index'] = ALL_CV_RES.index
	column_names = ['Index','Tree depth', 'Train accuracy', 'Test accuracy']
	ALL_CV_RES = ALL_CV_RES.reindex(columns=column_names)
	print("================================================")
	print(" Results of cross-validation grid search")
	print("================================================")
	print(ALL_CV_RES.to_string(index=False))
	print("================================================")
	
#******************************************************************************
# * TITLE:  "SUPPORT VECTOR MACHINE CLASSIFICATION USING CROSS-VALIDATION" 
# * DATE:   02/05/2022
# * AUTHOR: GIOVANNI CERULLI
# *****************************************************************************
# * USE THE "scikit-learn" PYTHON PACKAGE, METHOD: "svm.SVC()"
# *****************************************************************************
def c_svm():

	# IMPORT THE NEEDED PYTHON PACKAGES
	from sklearn import svm
	from sklearn.svm import SVC
	from sklearn.model_selection import GridSearchCV
	from sfi import Macro, Scalar , Matrix
	from sfi import Data , SFIToolkit
	import numpy as np
	import pandas as pd
	import os

	# SET THE DIRECTORY
	dir=Macro.getLocal("dir")
	os.chdir(dir)

	# SET THE TRAIN/TEST DATASET AND THE NEW-INSTANCES-DATASET
	dataset=Macro.getLocal("data_fitting")

	# LOAD A STATA DATASET LOCATED INTO THE DIRECTORY AS PANDAS DATAFRAME
	df = pd.read_stata(dataset,convert_categoricals=False)

	# DEFINE y THE TARGET VARIABLE
	y=df.iloc[:,0]

	# DEFINE X THE FEATURES
	X=df.iloc[:,1::]

	# READ THE "SEED" FROM STATA
	R=int(Macro.getLocal("seed"))

	# INITIALIZE A SVM (with parameters: kernel='rbf', C = 10.0, gamma=0.1)
	model = svm.SVC(kernel='rbf', C = 10.0, gamma=0.1, probability=True,random_state=R)

	# SVMC "CROSS-VALIDATION" FOR "C" AND "GAMMA" BY PRODUCING A "GRID SEARCH"
	# GENERATE THE TWO PARAMETERS' GRID AS A "LIST"
	
	_M = Matrix.get("mat_c")
	gridC=_M[0]
	
	_M = Matrix.get("mat_gamma")
	gridG=_M[0]
	
	#gridC=list(range(1,101,10))
	#gridG=[0.1,0.2,0.35,0.5,0.65,0.8,1,10]

	# PUT THE GENERATED GRIDS INTO A PYTHON DICTIONARY 
	param_grid = {'C': gridC, 'gamma': gridG}

	# READ THE NUMBER OF CV-FOLDS "n_folds" FROM STATA
	n_folds=int(Macro.getLocal("n_folds"))

	# INSTANTIATE THE GRID
	grid = GridSearchCV(model, param_grid, cv=n_folds, scoring='accuracy', return_train_score=True)

	# FIT OVER THE GRID
	grid.fit(X, y)

	# VIEW THE RESULTS 
	CV_RES=pd.DataFrame(grid.cv_results_)[['mean_train_score','mean_test_score','std_test_score']]
	D=Macro.getLocal("cross_validation") 
	D=D+".dta"
	CV_RES.to_stata(D)

	# PUT OPTIMAL PARAMETER(S) INTO STATA SCALAR(S)
	params_values=list(grid.best_params_.values()) 
	
	# GET THE VALUE "opt_c" AND PUT IT INTO A STATA SCALAR "OPT_C"
	opt_c=grid.best_params_.get('C')
	Scalar.setValue('OPT_C',opt_c,vtype='visible')

	# GET THE VALUE "opt_gamma" AND PUT IT INTO A STATA SCALAR "OPT_GAMMA"
	opt_gamma=grid.best_params_.get('gamma')
	Scalar.setValue('OPT_GAMMA',opt_gamma, vtype='visible')

	# TRAIN YOUR MODEL USING ALL DATA AND THE BEST KNOWN PARAMETERS
	model = svm.SVC(kernel='rbf', C=opt_c, gamma=opt_gamma, probability=True, random_state=R)

	# FIT THE MODEL
	model.fit(X, y)

	# MAKE IN-SAMPLE PREDICTION FOR y and prob, AND PUT IT INTO A DATAFRAME
	y_hat = model.predict(X)
	prob = model.predict_proba(X)

	# STACK THE PREDICTIONS
	in_sample=np.column_stack((y_hat,prob))
	in_sample = pd.DataFrame(in_sample)
					
	# GET RESULTS INTO STATA
	# (NOTE: the first column is the prediction "y_hat")
	D=Macro.getLocal("_____in_prediction") 
	D=D+".dta"
	in_sample.to_stata(D)

	# MAKE OUT-OF-SAMPLE "LABEL" PREDICTION FOR y USING A PREPARED DATASET
	D=Macro.getLocal("_____out_sample_x") 
	D=D+".dta"
	Xnew = pd.read_stata(D)
	ynew = model.predict(Xnew)

	# MAKE OUT-OF-SAMPLE "PROBABILITY" PREDICTION FOR y USING A PREPARED DATASET
	prob_new = model.predict_proba(Xnew)
	Prob_new  = pd.DataFrame(prob_new )

	# EXPORT LABEL PREDICTION FOR y INTO AN EXCEL FILE
	Ynew = pd.DataFrame(ynew)

	# MERGE LABEL AND PROBABILITY PREDICTION FOR y INTO AN EXCEL FILE
	# Use "numpy" to stack by column 'ynew' and 'prob_new'
	out=np.column_stack((ynew,prob_new))

	# GENERATE A DATAFRAME 'OUT' FROM THE "out" ARRAY
	OUT = pd.DataFrame(out)
					
	# PUT "OUT" INTO STATA
	# (NOTE: the first column is the prediction "y_hat")
	D=Macro.getLocal("_____out_prediction") 
	D=D+".dta"
	OUT.to_stata(D)
		
	# PRINT GRID SEARCH RESULTS
	ALL_CV_RES=pd.DataFrame(grid.cv_results_)[['param_C','param_gamma','mean_train_score','mean_test_score']]
	ALL_CV_RES.columns = ['c', 'Gamma' , 'Train accuracy', 'Test accuracy']
	ALL_CV_RES['Index'] = ALL_CV_RES.index
	column_names = ['Index','c', 'Gamma' , 'Train accuracy', 'Test accuracy']
	ALL_CV_RES = ALL_CV_RES.reindex(columns=column_names)
	print("===============================================")
	print(" Results of cross-validation grid search")
	print("===============================================")
	print(ALL_CV_RES.to_string(index=False))
	print("===============================================")

#******************************************************************************
# * TITLE:  "RANDOM FOREST CLASSIFICATION USING CROSS-VALIDATION" 
# * DATE:   02/05/2022
# * AUTHOR: GIOVANNI CERULLI
# *****************************************************************************
# * USE THE "scikit-learn" PYTHON PACKAGE, METHOD: "RandomForestClassifier()"
# *****************************************************************************
def c_randomforest():

	# IMPORT THE NEEDED PYTHON PACKAGES
	from sklearn.ensemble import RandomForestClassifier
	from sklearn.model_selection import GridSearchCV
	from sfi import Macro, Scalar , Matrix
	from sfi import Data , SFIToolkit
	import numpy as np
	import pandas as pd
	import os

	# SET THE DIRECTORY
	dir=Macro.getLocal("dir")
	os.chdir(dir)

	# SET THE TRAIN/TEST DATASET AND THE NEW-INSTANCES-DATASET
	dataset=Macro.getLocal("data_fitting")

	# LOAD A STATA DATASET LOCATED INTO THE DIRECTORY AS PANDAS DATAFRAME
	df = pd.read_stata(dataset,convert_categoricals=False)

	# DEFINE y THE TARGET VARIABLE
	y=df.iloc[:,0]

	# DEFINE X THE FEATURES
	X=df.iloc[:,1::]

	# COMPUTE THE NUMBER OF FEATURES 
	X = np.array(X)
	n_features=int(len(X[0]))

	# READ THE "SEED" FROM STATA
	R=int(Macro.getLocal("seed"))

	# INITIALIZE a RF-TREE (with RF parameters)
	model = RandomForestClassifier(max_depth=5, n_estimators=3, max_features=2,random_state=R)

	# RFC "CROSS-VALIDATION":
	# WE CROSS-VALIDATE OVER TWO PARAMETERS:
	# 1. "D = tree depth" (i.e., number of layers of the tree);
	# 2. "G = n. of features to randomly consider at each split"
	# 3. "B = number of bootstraps"

	# GENERATE THE TWO PARAMETERS' GRID AS "LISTS"
	
	_M = Matrix.get("mat_tree_depth")
	gridD=_M[0]
	gridD = [int(x) for x in gridD]
	
	_M = Matrix.get("mat_max_features")
	gridG=_M[0]
	gridG = [int(x) for x in gridG]
	
	_M = Matrix.get("mat_n_estmators")
	gridB=_M[0]
	gridB = [int(x) for x in gridB]

	# GENERATE THE TWO PARAMETERS' GRID AS "LISTS"
	#gridD=list(range(1,31))
	#gridG=list(range(1,n_features+1))
	#gridB=[50,100,150,200]

	# PUT THE GENERATED GRIDS INTO A PYTHON DICTIONARY 
	param_grid = {'max_depth': gridD, 'max_features': gridG, 'n_estimators': gridB}

	# READ THE NUMBER OF CV-FOLDS "n_folds" FROM STATA
	n_folds=int(Macro.getLocal("n_folds"))

	# INSTANTIATE THE GRID
	grid = GridSearchCV(model, param_grid, cv=n_folds, scoring='accuracy', return_train_score=True)

	# FIT OVER THE GRID
	grid.fit(X, y)

	# VIEW THE RESULTS 
	CV_RES=pd.DataFrame(grid.cv_results_)[['mean_train_score','mean_test_score','std_test_score']]
	D=Macro.getLocal("cross_validation") 
	D=D+".dta"
	CV_RES.to_stata(D)

	# PUT OPTIMAL PARAMETER(S) INTO STATA SCALAR(S)
	params_values=list(grid.best_params_.values()) 

	# GET THE VALUE "opt_max_depth" AND PUT IT INTO A STATA SCALAR "OPT_MAX_DEPTH"
	opt_max_depth=grid.best_params_.get('max_depth')
	Scalar.setValue('OPT_MAX_DEPTH',opt_max_depth,vtype='visible')

	# GET THE VALUE "opt_max_features" AND PUT IT INTO A STATA SCALAR "OPT_MAX_FEATURES"
	opt_max_features=grid.best_params_.get('max_features')
	Scalar.setValue('OPT_MAX_FEATURES',opt_max_features, vtype='visible')
	
	# GET THE VALUE "opt_n_estimators" AND PUT IT INTO A STATA SCALAR "OPT_N_ESTIMATORS"
	opt_n_estimators=grid.best_params_.get('n_estimators')
	Scalar.setValue('OPT_N_ESTIMATORS',opt_n_estimators, vtype='visible')

	# STORE THE THREE BEST PARAMETERS INTO THREE VARIABLES 
	opt_max_depth=grid.best_params_.get('max_depth')
	opt_max_features=grid.best_params_.get('max_features')
	opt_n_estimators=grid.best_params_.get('n_estimators')

	# USING THE BEST PARAMETERS TO MAKE PREDICTIONS
	# TRAIN YOUR MODEL USING ALL DATA AND THE BEST KNOWN PARAMETERS
	model = RandomForestClassifier(max_depth=opt_max_depth, n_estimators=opt_n_estimators, max_features=opt_max_features,random_state=R)

	# FIT THE MODEL
	model.fit(X, y)

	# MAKE IN-SAMPLE PREDICTION FOR y and prob, AND PUT IT INTO A DATAFRAME
	y_hat = model.predict(X)
	prob = model.predict_proba(X)

	# STACK THE PREDICTIONS
	in_sample=np.column_stack((y_hat,prob))
	in_sample = pd.DataFrame(in_sample)
					
	# GET RESULTS INTO STATA
	# (NOTE: the first column is the prediction "y_hat")
	D=Macro.getLocal("_____in_prediction") 
	D=D+".dta"
	in_sample.to_stata(D)

	# MAKE OUT-OF-SAMPLE "LABEL" PREDICTION FOR y USING A PREPARED DATASET
	D=Macro.getLocal("_____out_sample_x") 
	D=D+".dta"
	Xnew = pd.read_stata(D)
	ynew = model.predict(Xnew)

	# MAKE OUT-OF-SAMPLE "PROBABILITY" PREDICTION FOR y USING A PREPARED DATASET
	prob_new = model.predict_proba(Xnew)
	Prob_new  = pd.DataFrame(prob_new )

	# EXPORT LABEL PREDICTION FOR y INTO AN EXCEL FILE
	Ynew = pd.DataFrame(ynew)

	# MERGE LABEL AND PROBABILITY PREDICTION FOR y INTO AN EXCEL FILE
	# Use "numpy" to stack by column 'ynew' and 'prob_new'
	out=np.column_stack((ynew,prob_new))

	# GENERATE A DATAFRAME 'OUT' FROM THE "out" ARRAY
	OUT = pd.DataFrame(out)
					
	# PUT "OUT" INTO STATA
	# (NOTE: the first column is the prediction "y_hat")
	D=Macro.getLocal("_____out_prediction") 
	D=D+".dta"
	OUT.to_stata(D)
	
	# PRINT GRID SEARCH RESULTS
	ALL_CV_RES=pd.DataFrame(grid.cv_results_)[['param_max_depth','param_max_features','param_n_estimators','mean_train_score','mean_test_score']]
	ALL_CV_RES.columns = ['Tree depth', 'N. of splitting features', 'N. of trees','Train accuracy', 'Test accuracy']
	ALL_CV_RES['Index'] = ALL_CV_RES.index
	column_names = ['Index','Tree depth', 'N. of splitting features','N. of trees','Train accuracy','Test accuracy']
	ALL_CV_RES = ALL_CV_RES.reindex(columns=column_names)
	print("=====================================================================================")
	print(" Results of cross-validation grid search")
	print("=====================================================================================")
	print(ALL_CV_RES.to_string(index=False))
	print("=====================================================================================")
	
#******************************************************************************
# * TITLE:  "NEURAL-NETWORK CLASSIFICATION USING CROSS-VALIDATION" 
# * DATE:   02/05/2022
# * AUTHOR: GIOVANNI CERULLI
# *****************************************************************************
# * USE THE "scikit-learn" PYTHON PACKAGE, METHOD: "MLPClassifier()"
# *****************************************************************************
def c_neuralnet():

	# IMPORT THE NEEDED PYTHON PACKAGES
	from sklearn.neural_network import MLPClassifier
	from sklearn.model_selection import GridSearchCV
	from sfi import Macro, Scalar , Matrix
	from sfi import Data , SFIToolkit
	import numpy as np
	import pandas as pd
	import os

	# SET THE DIRECTORY
	dir=Macro.getLocal("dir")
	os.chdir(dir)

	# SET THE TRAIN/TEST DATASET AND THE NEW-INSTANCES-DATASET
	dataset=Macro.getLocal("data_fitting")

	# LOAD A STATA DATASET LOCATED INTO THE DIRECTORY AS PANDAS DATAFRAME
	df = pd.read_stata(dataset,convert_categoricals=False)

	# DEFINE y THE TARGET VARIABLE
	y=df.iloc[:,0]

	# DEFINE X THE FEATURES
	X=df.iloc[:,1::]

	# READ THE "SEED" FROM STATA
	R=int(Macro.getLocal("seed"))

	# INITIALIZE a BOOSTING-TREE (with boosting parameters)
	model = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5, 2), random_state=R)

	# ABC "CROSS-VALIDATION"
	# WE CROSS-VALIDATE OVER TWO PARAMETERS:
	# 1. "D = learning_rate"
	# 2. "G = n_estimators"

	# DEFINE THE PARAMETER VALUES THAT SHOULD BE SEARCHED FOR CROSS-VALIDATION
	# 1. "NUMBER OF NEURONS LAYER 1" 
	# 2. "NUMBER OF NEURONS LAYER 2" 
	# 3. "L2 PENALIZIATION PARAMETER"
	# 4. "RANDOM SEED" 

	# CREATE A PARAMETER GRID AS A 2D "LIST" FOR NEURONS-LAYER-1 AND NEURONS-LAYER-2
	_M = Matrix.get("mat_n_neurons_l1")
	gridL1=_M[0]
	gridL1 = [int(x) for x in gridL1]
	
	_M = Matrix.get("mat_n_neurons_l2")
	gridL2=_M[0]
	gridL2 = [int(x) for x in gridL2]
	
	grid=[]
	for i in gridL1:
	   for j in gridL2:
	 	 g=(i,j)
	 	 grid.append(g)

	# GENERATE THE GRID (JUST ONE NUMBER) FOR THE "RANDOM SEED"	
	gridR=[1]
	
	# GRID FOR "alpha"
	_M = Matrix.get("mat_alpha")
	grid_alpha=_M[0]
		
	# PUT THE GENERATED GRIDS INTO A PYTHON DICTIONARY 
	param_grid={'hidden_layer_sizes': grid , 'random_state':gridR, 'alpha': grid_alpha}

	# READ THE NUMBER OF CV-FOLDS "n_folds" FROM STATA
	n_folds=int(Macro.getLocal("n_folds"))

	# INSTANTIATE THE GRID
	grid = GridSearchCV(model, param_grid, cv=n_folds, scoring='accuracy', return_train_score=True)

	# FIT OVER THE GRID
	grid.fit(X, y)

	# VIEW THE RESULTS 
	CV_RES=pd.DataFrame(grid.cv_results_)[['mean_train_score','mean_test_score','std_test_score']]
	D=Macro.getLocal("cross_validation") 
	D=D+".dta"
	CV_RES.to_stata(D)

	# PUT OPTIMAL PARAMETER(S) INTO STATA SCALAR(S)
	params_values=list(grid.best_params_.values()) 

	# STORE THE TWO BEST PARAMETERS INTO TWO VARIABLES 
	opt_nn=grid.best_params_.get('hidden_layer_sizes')
	opt_neurons1=opt_nn[0]
	opt_neurons2=opt_nn[1]
	Scalar.setValue('OPT_NEURONS_L_1',opt_neurons1,vtype='visible')
	Scalar.setValue('OPT_NEURONS_L_2',opt_neurons2,vtype='visible')
	opt_alpha=grid.best_params_.get('alpha')
	Scalar.setValue('OPT_ALPHA',opt_alpha,vtype='visible')

	# USING THE BEST PARAMETERS TO MAKE PREDICTIONS
	# TRAIN YOUR MODEL USING ALL DATA AND THE BEST KNOWN PARAMETERS
	model = MLPClassifier(solver='lbfgs',hidden_layer_sizes=opt_nn, alpha=opt_alpha , random_state=R)

	# FIT THE MODEL
	model.fit(X, y)

	# MAKE IN-SAMPLE PREDICTION FOR y and prob, AND PUT IT INTO A DATAFRAME
	y_hat = model.predict(X)
	prob = model.predict_proba(X)

	# STACK THE PREDICTIONS
	in_sample=np.column_stack((y_hat,prob))
	in_sample = pd.DataFrame(in_sample)
					
	# GET RESULTS INTO STATA
	# (NOTE: the first column is the prediction "y_hat")
	D=Macro.getLocal("_____in_prediction") 
	D=D+".dta"
	in_sample.to_stata(D)

	# MAKE OUT-OF-SAMPLE "LABEL" PREDICTION FOR y USING A PREPARED DATASET
	D=Macro.getLocal("_____out_sample_x") 
	D=D+".dta"
	Xnew = pd.read_stata(D)
	ynew = model.predict(Xnew)

	# MAKE OUT-OF-SAMPLE "PROBABILITY" PREDICTION FOR y USING A PREPARED DATASET
	prob_new = model.predict_proba(Xnew)
	Prob_new  = pd.DataFrame(prob_new )

	# EXPORT LABEL PREDICTION FOR y INTO AN EXCEL FILE
	Ynew = pd.DataFrame(ynew)

	# MERGE LABEL AND PROBABILITY PREDICTION FOR y INTO AN EXCEL FILE
	# Use "numpy" to stack by column 'ynew' and 'prob_new'
	out=np.column_stack((ynew,prob_new))

	# GENERATE A DATAFRAME 'OUT' FROM THE "out" ARRAY
	OUT = pd.DataFrame(out)
					
	# PUT "OUT" INTO STATA
	# (NOTE: the first column is the prediction "y_hat")
	D=Macro.getLocal("_____out_prediction") 
	D=D+".dta"
	OUT.to_stata(D)
		
	# PRINT GRID SEARCH RESULTS
	ALL_CV_RES=pd.DataFrame(grid.cv_results_)[['param_hidden_layer_sizes','param_alpha','mean_train_score','mean_test_score']]
	ALL_CV_RES.columns = ['(N. of neurons in L1, N. of neurons in L2)','Alpha','Train accuracy', 'Test accuracy']
	ALL_CV_RES['Index'] = ALL_CV_RES.index
	column_names = ['Index','(N. of neurons in L1, N. of neurons in L2)','Alpha','Train accuracy','Test accuracy']
	ALL_CV_RES = ALL_CV_RES.reindex(columns=column_names)
	print("======================================================================================")
	print(" Results of cross-validation grid search")
	print("======================================================================================")
	print(ALL_CV_RES.to_string(index=False))
	print("======================================================================================")

#******************************************************************************
# * TITLE:  "NEAREST_NEIGHBOR USING CROSS-VALIDATION" 
# * DATE:   02/05/2022
# * AUTHOR: GIOVANNI CERULLI
# *****************************************************************************
# * USE THE "scikit-learn" PYTHON PACKAGE, METHOD: "KNeighborsClassifier()"
# *****************************************************************************
def c_nearestneighbor():

	# IMPORT THE NEEDED PYTHON PACKAGES
	from sklearn.neighbors import KNeighborsClassifier
	from sklearn.model_selection import GridSearchCV
	from sfi import Macro, Scalar , Matrix
	from sfi import Data , SFIToolkit
	import numpy as np
	import pandas as pd
	import os

	# SET THE DIRECTORY
	dir=Macro.getLocal("dir")
	os.chdir(dir)

	# SET THE TRAIN/TEST DATASET AND THE NEW-INSTANCES-DATASET
	dataset=Macro.getLocal("data_fitting")

	# LOAD A STATA DATASET LOCATED INTO THE DIRECTORY AS PANDAS DATAFRAME
	df = pd.read_stata(dataset,convert_categoricals=False)

	# DEFINE y THE TARGET VARIABLE
	y=df.iloc[:,0]

	# DEFINE X THE FEATURES
	X=df.iloc[:,1::]

	# READ THE "SEED" FROM STATA
	R=int(Macro.getLocal("seed"))

	# INITIALIZE a BOOSTING-TREE (with boosting parameters)
	model = KNeighborsClassifier(n_neighbors=5)

	# DEFINE THE PARAMETER VALUES THAT SHOULD BE SEARCHED
	#k_range = list(range(1, 31))
	nn = Matrix.get("mat_n_neighbor")
	k_range=nn[0]
	k_range = [int(x) for x in k_range]
	weight_options = ['uniform', 'distance']

	# CREATE A PARAMETER GRID: 
	# MAP THE PARAMETER NAMES TO THE VALUES THAT SHOULD BE SEARCHED
	param_grid = dict(n_neighbors=k_range, weights=weight_options)

	# READ THE NUMBER OF CV-FOLDS "n_folds" FROM STATA
	n_folds=int(Macro.getLocal("n_folds"))

	# INSTANTIATE THE GRID
	grid = GridSearchCV(model, param_grid, cv=n_folds, scoring='accuracy', return_train_score=True)

	# FIT OVER THE GRID
	grid.fit(X, y)

	# VIEW THE RESULTS 
	CV_RES=pd.DataFrame(grid.cv_results_)[['mean_train_score','mean_test_score','std_test_score']]
	D=Macro.getLocal("cross_validation") 
	D=D+".dta"
	CV_RES.to_stata(D)

	# PUT OPTIMAL PARAMETER(S) INTO STATA SCALAR(S)
	params_values=list(grid.best_params_.values()) 

	# STORE THE BEST NUMBER OF NEIGHBORS INTO A STATA SCALAR
	opt_nn=grid.best_params_.get('n_neighbors')
	Scalar.setValue('OPT_NN',opt_nn,vtype='visible')

	# STORE THE BEST WEIGHT-TYPE INTO A STATA SCALAR
	opt_weight=grid.best_params_.get('weights')
	Macro.setGlobal('OPT_WEIGHT',opt_weight, vtype='visible')

	# STORE THE TWO BEST PARAMETERS INTO TWO VARIABLES 
	opt_nn = grid.best_params_['n_neighbors']
	opt_weight = grid.best_params_['weights']

	# USING THE BEST PARAMETERS TO MAKE PREDICTIONS
	# TRAIN YOUR MODEL USING ALL DATA AND THE BEST KNOWN PARAMETERS
	model = KNeighborsClassifier(n_neighbors=opt_nn, weights=opt_weight)

	# FIT THE MODEL
	model.fit(X, y)

	# MAKE IN-SAMPLE PREDICTION FOR y and prob, AND PUT IT INTO A DATAFRAME
	y_hat = model.predict(X)
	prob = model.predict_proba(X)

	# STACK THE PREDICTIONS
	in_sample=np.column_stack((y_hat,prob))
	in_sample = pd.DataFrame(in_sample)
					
	# GET RESULTS INTO STATA
	# (NOTE: the first column is the prediction "y_hat")
	D=Macro.getLocal("_____in_prediction") 
	D=D+".dta"
	in_sample.to_stata(D)

	# MAKE OUT-OF-SAMPLE "LABEL" PREDICTION FOR y USING A PREPARED DATASET
	D=Macro.getLocal("_____out_sample_x") 
	D=D+".dta"
	Xnew = pd.read_stata(D)
	ynew = model.predict(Xnew)

	# MAKE OUT-OF-SAMPLE "PROBABILITY" PREDICTION FOR y USING A PREPARED DATASET
	prob_new = model.predict_proba(Xnew)
	Prob_new  = pd.DataFrame(prob_new )

	# EXPORT LABEL PREDICTION FOR y INTO AN EXCEL FILE
	Ynew = pd.DataFrame(ynew)

	# MERGE LABEL AND PROBABILITY PREDICTION FOR y INTO AN EXCEL FILE
	# Use "numpy" to stack by column 'ynew' and 'prob_new'
	out=np.column_stack((ynew,prob_new))

	# GENERATE A DATAFRAME 'OUT' FROM THE "out" ARRAY
	OUT = pd.DataFrame(out)
					
	# PUT "OUT" INTO STATA
	# (NOTE: the first column is the prediction "y_hat")
	D=Macro.getLocal("_____out_prediction") 
	D=D+".dta"
	OUT.to_stata(D)
		
	# PRINT GRID SEARCH RESULTS
	ALL_CV_RES=pd.DataFrame(grid.cv_results_)[['param_n_neighbors','param_weights','mean_train_score','mean_test_score']]
	ALL_CV_RES.columns = ['N. of nearest neighbors','Kernel','Train accuracy', 'Test accuracy']
	ALL_CV_RES['Index'] = ALL_CV_RES.index
	column_names = ['Index','N. of nearest neighbors','Kernel','Train accuracy', 'Test accuracy']
	ALL_CV_RES = ALL_CV_RES.reindex(columns=column_names)
	print("=======================================================================")
	print(" Results of cross-validation grid search")
	print("=======================================================================")
	print(ALL_CV_RES.to_string(index=False))
	print("=======================================================================")
	
#******************************************************************************
# * TITLE:  "BOOSTING-TREE CLASSIFICATION USING CROSS-VALIDATION" 
# * DATE:   02/11/2021
# * AUTHOR: GIOVANNI CERULLI
# *****************************************************************************
# * USE THE "scikit-learn" PYTHON PACKAGE, METHOD: "AdaBoostClassifier()"
# *****************************************************************************
def c_boost():

	# IMPORT THE NEEDED PYTHON PACKAGES
	from sklearn.ensemble import GradientBoostingClassifier
	from sklearn.model_selection import GridSearchCV
	from sfi import Macro, Scalar , Matrix
	from sfi import Data , SFIToolkit
	import numpy as np
	import pandas as pd
	import os

	# SET THE DIRECTORY
	dir=Macro.getLocal("dir")
	os.chdir(dir)

	# SET THE TRAIN/TEST DATASET AND THE NEW-INSTANCES-DATASET
	dataset=Macro.getLocal("data_fitting")

	# LOAD A STATA DATASET LOCATED INTO THE DIRECTORY AS PANDAS DATAFRAME
	df = pd.read_stata(dataset,convert_categoricals=False)

	# DEFINE y THE TARGET VARIABLE
	y=df.iloc[:,0]

	# DEFINE X THE FEATURES
	X=df.iloc[:,1::]

	# READ THE "SEED" FROM STATA
	R=int(Macro.getLocal("seed"))

	# ESTIMATE A "ABC" AT GIVEN PARAMETERS (JUST TO TRY IF IT WORKS)
	model = GradientBoostingClassifier(learning_rate=0.1, n_estimators=100, random_state=R)

	# ABC "CROSS-VALIDATION"
	# WE CROSS-VALIDATE OVER TWO PARAMETERS:
	# 1. "D = learning_rate"
	# 2. "G = n_estimators"
    # 3. "H = tree depth" (i.e., number of layers of the tree)
	
	# GENERATE THE TWO PARAMETERS' GRID AS "LISTS"
	# GRID FOR "learning_rate"
	# gridD=[0.001,0.005,0.01,0.05,0.1,0.20]
	# GRID FOR "n_estimators"
	# gridG=list(range(1,21))
	# GRID FOR "max_depth"
	# gridH=list(range(1,11))
	
	_M = Matrix.get("mat_learning_rate")
	gridD=_M[0]
	
	_M = Matrix.get("mat_n_estmators")
	gridG=_M[0]
	gridG = [int(x) for x in gridG]
	
	_M = Matrix.get("mat_max_depth")
	gridH=_M[0]
	gridH = [int(x) for x in gridH]
	
	# PUT THE GENERATED GRIDS INTO A PYTHON DICTIONARY 
	param_grid = {'learning_rate': gridD, 'n_estimators': gridG, 'max_depth': gridH}

	# READ THE NUMBER OF CV-FOLDS "n_folds" FROM STATA
	n_folds=int(Macro.getLocal("n_folds"))

	# INSTANTIATE THE GRID
	grid = GridSearchCV(model, param_grid, cv=n_folds, scoring='accuracy', return_train_score=True)

	# FIT THE GRID
	grid.fit(X, y)

	# VIEW THE RESULTS 
	CV_RES=pd.DataFrame(grid.cv_results_)[['mean_train_score','mean_test_score','std_test_score']]
	D=Macro.getLocal("cross_validation") 
	D=D+".dta"
	CV_RES.to_stata(D)

	# PUT OPTIMAL PARAMETER(S) INTO STATA SCALAR(S)
	params_values=list(grid.best_params_.values()) 
	Scalar.setValue('OPT_LEARNING_RATE',params_values[0],vtype='visible')
	Scalar.setValue('OPT_N_ESTIMATORS',params_values[2],vtype='visible')
	Scalar.setValue('OPT_MAX_DEPTH',params_values[1],vtype='visible')

	# GET THE VALUE "opt_learning_rate" AND PUT IT INTO A STATA SCALAR "opt_n_estimators"
	opt_learning_rate=grid.best_params_.get('learning_rate')
	opt_n_estimators=grid.best_params_.get('n_estimators')
	opt_max_depth=grid.best_params_.get('max_depth')

	# TRAIN YOUR MODEL USING ALL DATA AND THE BEST KNOWN PARAMETERS
	model = GradientBoostingClassifier(learning_rate=opt_learning_rate, 
									  n_estimators=opt_n_estimators,
									  max_depth=opt_max_depth,
									  random_state=R)

	# FIT THE MODEL
	model.fit(X, y)

	# MAKE IN-SAMPLE PREDICTION FOR y and prob, AND PUT IT INTO A DATAFRAME
	y_hat = model.predict(X)
	prob = model.predict_proba(X)

	# STACK THE PREDICTIONS
	in_sample=np.column_stack((y_hat,prob))
	in_sample = pd.DataFrame(in_sample)
					
	# GET RESULTS INTO STATA
	# (NOTE: the first column is the prediction "y_hat")
	D=Macro.getLocal("_____in_prediction") 
	D=D+".dta"
	in_sample.to_stata(D)

	# MAKE OUT-OF-SAMPLE "LABEL" PREDICTION FOR y USING A PREPARED DATASET
	D=Macro.getLocal("_____out_sample_x") 
	D=D+".dta"
	Xnew = pd.read_stata(D)
	ynew = model.predict(Xnew)

	# MAKE OUT-OF-SAMPLE "PROBABILITY" PREDICTION FOR y USING A PREPARED DATASET
	prob_new = model.predict_proba(Xnew)
	Prob_new  = pd.DataFrame(prob_new )

	# EXPORT LABEL PREDICTION FOR y INTO AN EXCEL FILE
	Ynew = pd.DataFrame(ynew)

	# MERGE LABEL AND PROBABILITY PREDICTION FOR y INTO AN EXCEL FILE
	# Use "numpy" to stack by column 'ynew' and 'prob_new'
	out=np.column_stack((ynew,prob_new))

	# GENERATE A DATAFRAME 'OUT' FROM THE "out" ARRAY
	OUT = pd.DataFrame(out)
					
	# PUT "OUT" INTO STATA
	# (NOTE: the first column is the prediction "y_hat")
	D=Macro.getLocal("_____out_prediction") 
	D=D+".dta"
	OUT.to_stata(D)
		
	# PRINT GRID SEARCH RESULTS
	ALL_CV_RES=pd.DataFrame(grid.cv_results_)[['param_learning_rate','param_n_estimators','param_max_depth','mean_train_score','mean_test_score']]
	ALL_CV_RES.columns = ['Learning rate', 'N. of trees', 'Tree depth','Train accuracy', 'Test accuracy']
	ALL_CV_RES['Index'] = ALL_CV_RES.index
	column_names = ['Index','Learning rate','N. of trees','Tree depth','Train accuracy','Test accuracy']
	ALL_CV_RES = ALL_CV_RES.reindex(columns=column_names)
	print("==========================================================================")
	print(" Results of cross-validation grid search")
	print("==========================================================================")
	print(ALL_CV_RES.to_string(index=False))
	print("==========================================================================")	
	
#******************************************************************************
# * TITLE:  "REGULARIZED MULTINOMIAL CLASSIFICATION USING CROSS-VALIDATION" 
# * DATE:   02/05/2022
# * AUTHOR: GIOVANNI CERULLI
# *****************************************************************************
# * USE THE "scikit-learn" PYTHON PACKAGE, METHOD: "LogisticRegression()"
# *****************************************************************************
def c_rmn():

	# IMPORT THE NEEDED PYTHON PACKAGES
	from sklearn.linear_model import LogisticRegression
	from sklearn.model_selection import GridSearchCV
	from sfi import Macro, Scalar , Matrix
	from sfi import Data , SFIToolkit
	import numpy as np
	import pandas as pd
	import os

	# SET THE DIRECTORY
	dir=Macro.getLocal("dir")
	os.chdir(dir)

	# SET THE TRAIN/TEST DATASET AND THE NEW-INSTANCES-DATASET
	dataset=Macro.getLocal("data_fitting")

	# LOAD A STATA DATASET LOCATED INTO THE DIRECTORY AS PANDAS DATAFRAME
	df = pd.read_stata(dataset,convert_categoricals=False)

	# DEFINE y THE TARGET VARIABLE
	y=df.iloc[:,0]

	# DEFINE X THE FEATURES
	X=df.iloc[:,1::]

	# READ THE "SEED" FROM STATA
	R=int(Macro.getLocal("seed"))

	# INITIALIZE THE MODEL
	model = LogisticRegression(penalty='elasticnet',C=0.5, solver='saga',
									   multi_class='multinomial', l1_ratio = 0.2)

	# MLRC "CROSS-VALIDATION" FOR "C" AND "GAMMA" BY PRODUCING A "GRID SEARCH"
	# GENERATE THE TWO PARAMETERS' GRID AS A "LIST":
	# 1. C = INVERSE OF REGULARIZATION STRENGTH (SMALLER VALUES SPECIFY STRONGER REGULARIZATION)
	# 2. G = "l1_ratio"= THE ELASTIC-NET MIXING PARAMETER (WITH 0 <= l1_ratio <= 1)
	
	_M = Matrix.get("mat_alpha")
	gridC=_M[0]
	
	_M = Matrix.get("mat_l1_ratio")
	gridG=_M[0]

	# Grid for "C"
	#gridC=[1.2,2,5,10,15,20,40,60,80,120,150,180,200,250,300]

	# Grid for "G"
	#gridG=[0,0.2,0.5,0.8,1]

	# PUT THE GENERATED GRIDS INTO A PYTHON DICTIONARY 
	param_grid = {'C': gridC, 'l1_ratio': gridG}

	# READ THE NUMBER OF CV-FOLDS "n_folds" FROM STATA
	n_folds=int(Macro.getLocal("n_folds"))

	# INSTANTIATE THE GRID
	grid = GridSearchCV(model, param_grid, cv=n_folds, scoring='accuracy', return_train_score=True)

	# FIT OVER THE GRID
	grid.fit(X, y)

	# VIEW THE RESULTS 
	CV_RES=pd.DataFrame(grid.cv_results_)[['mean_train_score','mean_test_score','std_test_score']]
	D=Macro.getLocal("cross_validation") 
	D=D+".dta"
	CV_RES.to_stata(D)

	# PUT OPTIMAL PARAMETER(S) INTO STATA SCALAR(S)
	params_values=list(grid.best_params_.values()) 
	Scalar.setValue('OPT_LEARNING_RATE',params_values[0],vtype='visible')
	Scalar.setValue('OPT_N_ESTIMATORS',params_values[1],vtype='visible')

	# STORE THE TWO BEST PARAMETERS INTO TWO VARIABLES 
	opt_c=grid.best_params_.get('C')
	Scalar.setValue('OPT_PENALIZATION',opt_c,vtype='visible')

	opt_gamma=grid.best_params_.get('l1_ratio')
	Scalar.setValue('OPT_L1_RATIO',opt_gamma,vtype='visible')


	# USING THE BEST PARAMETERS TO MAKE PREDICTIONS
	# TRAIN YOUR MODEL USING ALL DATA AND THE BEST PARAMETERS
	model = LogisticRegression(penalty='elasticnet',C=opt_c, solver='saga',
							 multi_class='multinomial',l1_ratio=opt_gamma)

	# FIT THE MODEL
	model.fit(X, y)

	# MAKE IN-SAMPLE PREDICTION FOR y and prob, AND PUT IT INTO A DATAFRAME
	y_hat = model.predict(X)
	prob = model.predict_proba(X)

	# STACK THE PREDICTIONS
	in_sample=np.column_stack((y_hat,prob))
	in_sample = pd.DataFrame(in_sample)
					
	# GET RESULTS INTO STATA
	# (NOTE: the first column is the prediction "y_hat")
	D=Macro.getLocal("_____in_prediction") 
	D=D+".dta"
	in_sample.to_stata(D)

	# MAKE OUT-OF-SAMPLE "LABEL" PREDICTION FOR y USING A PREPARED DATASET
	D=Macro.getLocal("_____out_sample_x") 
	D=D+".dta"
	Xnew = pd.read_stata(D)
	ynew = model.predict(Xnew)

	# MAKE OUT-OF-SAMPLE "PROBABILITY" PREDICTION FOR y USING A PREPARED DATASET
	prob_new = model.predict_proba(Xnew)
	Prob_new  = pd.DataFrame(prob_new )

	# EXPORT LABEL PREDICTION FOR y INTO AN EXCEL FILE
	Ynew = pd.DataFrame(ynew)

	# MERGE LABEL AND PROBABILITY PREDICTION FOR y INTO AN EXCEL FILE
	# Use "numpy" to stack by column 'ynew' and 'prob_new'
	out=np.column_stack((ynew,prob_new))

	# GENERATE A DATAFRAME 'OUT' FROM THE "out" ARRAY
	OUT = pd.DataFrame(out)
					
	# PUT "OUT" INTO STATA
	# (NOTE: the first column is the prediction "y_hat")
	D=Macro.getLocal("_____out_prediction") 
	D=D+".dta"
	OUT.to_stata(D)
	
	# PRINT GRID SEARCH RESULTS
	ALL_CV_RES=pd.DataFrame(grid.cv_results_)[['param_l1_ratio','param_C','mean_train_score','mean_test_score']]
	ALL_CV_RES.columns = ['L1 ratio', 'Alpha' , 'Train accuracy', 'Test accuracy']
	ALL_CV_RES['Index'] = ALL_CV_RES.index
	column_names = ['Index','L1 ratio', 'Alpha' , 'Train accuracy', 'Test accuracy']
	ALL_CV_RES = ALL_CV_RES.reindex(columns=column_names)
	print("====================================================")
	print(" Results of cross-validation grid search")
	print("====================================================")
	print(ALL_CV_RES.to_string(index=False))
	print("====================================================")
	
#******************************************************************************
# * TITLE:  "NAIVE-BAYES CLASSIFICATION USING CROSS-VALIDATION" 
# * DATE:   02/05/2022
# * AUTHOR: GIOVANNI CERULLI
# *****************************************************************************
# * USE THE "scikit-learn" PYTHON PACKAGE, METHOD: "GaussianNB()"
# *****************************************************************************
def c_naivebayes():

	# IMPORT THE NEEDED PYTHON PACKAGES
	from sklearn.naive_bayes import GaussianNB
	from sklearn.model_selection import GridSearchCV
	from sfi import Macro, Scalar
	from sfi import Data , SFIToolkit
	import numpy as np
	import pandas as pd
	import os

	# SET THE DIRECTORY
	dir=Macro.getLocal("dir")
	os.chdir(dir)

	# SET THE TRAIN/TEST DATASET AND THE NEW-INSTANCES-DATASET
	dataset=Macro.getLocal("data_fitting")

	# LOAD A STATA DATASET LOCATED INTO THE DIRECTORY AS PANDAS DATAFRAME
	df = pd.read_stata(dataset,convert_categoricals=False)

	# DEFINE y THE TARGET VARIABLE
	y=df.iloc[:,0]

	# DEFINE X THE FEATURES
	X=df.iloc[:,1::]

	# READ THE "SEED" FROM STATA
	R=int(Macro.getLocal("seed"))

	# INITIALIZE THE FIT
	model=GaussianNB(var_smoothing=1e-9) 

	# DEFINE THE PARAMETER VALUES THAT SHOULD BE SEARCHED
	k_range = [1e-19, 1e-17, 1e-16, 1e-15, 1e-14, 1e-13, 1e-12, 1e-11, 1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3]

	# PUT THE GENERATED GRIDS INTO A PYTHON DICTIONARY 
	param_grid = dict(var_smoothing=k_range)

	# READ THE NUMBER OF CV-FOLDS "n_folds" FROM STATA
	n_folds=int(Macro.getLocal("n_folds"))

	# INSTANTIATE THE GRID
	grid = GridSearchCV(model, param_grid, cv=n_folds, scoring='accuracy', return_train_score=True)

	# FIT OVER THE GRID
	grid.fit(X, y)

	# VIEW THE RESULTS 
	CV_RES=pd.DataFrame(grid.cv_results_)[['mean_train_score','mean_test_score','std_test_score']]
	D=Macro.getLocal("cross_validation") 
	D=D+".dta"
	CV_RES.to_stata(D)

	# PUT OPTIMAL PARAMETER(S) INTO STATA SCALAR(S)
	params_values=list(grid.best_params_.values()) 
	Scalar.setValue('OPT_VAR_SMOOTHING',params_values[0],vtype='visible')

	# STORE THE TWO BEST PARAMETERS INTO TWO VARIABLES 
	opt_index=grid.best_index_+1
	opt_tt=grid.best_params_.get('var_smoothing')

	# USING THE BEST PARAMETERS TO MAKE PREDICTIONS
	# TRAIN YOUR MODEL USING ALL DATA AND THE BEST KNOWN PARAMETERS
	nbc=GaussianNB(var_smoothing=opt_tt)

	# FIT THE MODEL
	model.fit(X, y)

	# MAKE IN-SAMPLE PREDICTION FOR y and prob, AND PUT IT INTO A DATAFRAME
	y_hat = model.predict(X)
	prob = model.predict_proba(X)

	# STACK THE PREDICTIONS
	in_sample=np.column_stack((y_hat,prob))
	in_sample = pd.DataFrame(in_sample)
					
	# GET RESULTS INTO STATA
	# (NOTE: the first column is the prediction "y_hat")
	D=Macro.getLocal("_____in_prediction") 
	D=D+".dta"
	in_sample.to_stata(D)

	# MAKE OUT-OF-SAMPLE "LABEL" PREDICTION FOR y USING A PREPARED DATASET
	D=Macro.getLocal("_____out_sample_x") 
	D=D+".dta"
	Xnew = pd.read_stata(D)
	ynew = model.predict(Xnew)

	# MAKE OUT-OF-SAMPLE "PROBABILITY" PREDICTION FOR y USING A PREPARED DATASET
	prob_new = model.predict_proba(Xnew)
	Prob_new  = pd.DataFrame(prob_new )

	# EXPORT LABEL PREDICTION FOR y INTO AN EXCEL FILE
	Ynew = pd.DataFrame(ynew)

	# MERGE LABEL AND PROBABILITY PREDICTION FOR y INTO AN EXCEL FILE
	# Use "numpy" to stack by column 'ynew' and 'prob_new'
	out=np.column_stack((ynew,prob_new))

	# GENERATE A DATAFRAME 'OUT' FROM THE "out" ARRAY
	OUT = pd.DataFrame(out)
					
	# PUT "OUT" INTO STATA
	# (NOTE: the first column is the prediction "y_hat")
	D=Macro.getLocal("_____out_prediction") 
	D=D+".dta"
	OUT.to_stata(D)
	
#******************************************************************************
# * TITLE:  "MULTINOMIAL CLASSIFICATION" 
# * DATE:   02/05/2022
# * AUTHOR: GIOVANNI CERULLI
# *****************************************************************************
# * USE THE "scikit-learn" PYTHON PACKAGE, METHOD: "LogisticRegression()"
# *****************************************************************************
def c_multinomial():

	# IMPORT THE NEEDED PYTHON PACKAGES
	from sklearn.linear_model import LogisticRegression
	from sklearn.model_selection import GridSearchCV
	from sfi import Macro, Scalar
	from sfi import Data , SFIToolkit
	import numpy as np
	import pandas as pd
	import os

	# SET THE DIRECTORY
	dir=Macro.getLocal("dir")
	os.chdir(dir)

	# SET THE TRAIN/TEST DATASET AND THE NEW-INSTANCES-DATASET
	dataset=Macro.getLocal("data_fitting")

	# LOAD A STATA DATASET LOCATED INTO THE DIRECTORY AS PANDAS DATAFRAME
	df = pd.read_stata(dataset,convert_categoricals=False)

	# DEFINE y THE TARGET VARIABLE
	y=df.iloc[:,0]

	# DEFINE X THE FEATURES
	X=df.iloc[:,1::]

	# READ THE "SEED" FROM STATA
	R=int(Macro.getLocal("seed"))

	# INITIALIZE THE MODEL
	model = LogisticRegression(penalty='none',C=0.5, solver='saga',
									   multi_class='multinomial')

	# MLRC "CROSS-VALIDATION" FOR "C" AND "GAMMA" BY PRODUCING A "GRID SEARCH"
	# GENERATE THE TWO PARAMETERS' GRID AS A "LIST":
	# 1. C = INVERSE OF REGULARIZATION STRENGTH (SMALLER VALUES SPECIFY STRONGER REGULARIZATION)
	# 2. G = "l1_ratio"= THE ELASTIC-NET MIXING PARAMETER (WITH 0 <= l1_ratio <= 1)

	# Grid for "C"
	gridC=[0.5]

	# Grid for "G"
	gridG=list([0.5])

	# PUT THE GENERATED GRIDS INTO A PYTHON DICTIONARY 
	param_grid = {'C': gridC, 'l1_ratio': gridG}

	# READ THE NUMBER OF CV-FOLDS "n_folds" FROM STATA
	n_folds=int(Macro.getLocal("n_folds"))

	# INSTANTIATE THE GRID
	grid = GridSearchCV(model, param_grid, cv=n_folds, scoring='accuracy', return_train_score=True)

	# FIT OVER THE GRID
	grid.fit(X, y)

	# VIEW THE RESULTS 
	CV_RES=pd.DataFrame(grid.cv_results_)[['mean_train_score','mean_test_score','std_test_score']]
	D=Macro.getLocal("cross_validation") 
	D=D+".dta"
	CV_RES.to_stata(D)

	# PUT OPTIMAL PARAMETER(S) INTO STATA SCALAR(S)
	params_values=list(grid.best_params_.values()) 
	Scalar.setValue('OPT_LEARNING_RATE',params_values[0],vtype='visible')
	Scalar.setValue('OPT_N_ESTIMATORS',params_values[1],vtype='visible')

	# STORE THE TWO BEST PARAMETERS INTO TWO VARIABLES 
	# STORE THE TWO BEST PARAMETERS INTO TWO VARIABLES 
	opt_c=grid.best_params_.get('C')
	Scalar.setValue('OPT_PENALIZATION',opt_c,vtype='visible')

	opt_gamma=grid.best_params_.get('l1_ratio')
	Scalar.setValue('OPT_L1_RATIO',opt_gamma,vtype='visible')


	# USING THE BEST PARAMETERS TO MAKE PREDICTIONS
	# TRAIN YOUR MODEL USING ALL DATA AND THE BEST PARAMETERS
	model = LogisticRegression(penalty='elasticnet',C=opt_c, solver='saga',
							 multi_class='multinomial',l1_ratio=opt_gamma)

	# FIT THE MODEL
	model.fit(X, y)

	# MAKE IN-SAMPLE PREDICTION FOR y and prob, AND PUT IT INTO A DATAFRAME
	y_hat = model.predict(X)
	prob = model.predict_proba(X)

	# STACK THE PREDICTIONS
	in_sample=np.column_stack((y_hat,prob))
	in_sample = pd.DataFrame(in_sample)
					
	# GET RESULTS INTO STATA
	# (NOTE: the first column is the prediction "y_hat")
	D=Macro.getLocal("_____in_prediction") 
	D=D+".dta"
	in_sample.to_stata(D)

	# MAKE OUT-OF-SAMPLE "LABEL" PREDICTION FOR y USING A PREPARED DATASET
	D=Macro.getLocal("_____out_sample_x") 
	D=D+".dta"
	Xnew = pd.read_stata(D,convert_categoricals=False)
	ynew = model.predict(Xnew)

	# MAKE OUT-OF-SAMPLE "PROBABILITY" PREDICTION FOR y USING A PREPARED DATASET
	prob_new = model.predict_proba(Xnew)
	Prob_new  = pd.DataFrame(prob_new )

	# EXPORT LABEL PREDICTION FOR y INTO AN EXCEL FILE
	Ynew = pd.DataFrame(ynew)

	# MERGE LABEL AND PROBABILITY PREDICTION FOR y INTO AN EXCEL FILE
	# Use "numpy" to stack by column 'ynew' and 'prob_new'
	out=np.column_stack((ynew,prob_new))

	# GENERATE A DATAFRAME 'OUT' FROM THE "out" ARRAY
	OUT = pd.DataFrame(out)
					
	# PUT "OUT" INTO STATA
	# (NOTE: the first column is the prediction "y_hat")
	D=Macro.getLocal("_____out_prediction") 
	D=D+".dta"
	OUT.to_stata(D)

end