/* svm_predict:  after fitting an SVM model with svm, construct predicted classes/values (depending on the type of the active SVM) */

/* load the C extension */
svm_ensurelib           // check for libsvm
program _svmachines, plugin    // load the wrapper for libsvm

program define svm_predict, eclass
  version 13
  syntax newvarname [if] [in], [PROBability] [scores] [Verbose]
  local target = "`varlist'"
  local _in = "`in'" //these need to be stashed because the hack below will smash them
  local _if = "`if'"

  if("`probability'"!="" & "`scores'"!="") {
    di as err "Error: probability and scores are mutually exclusive options."
    exit 2
  }
  
  // C plugins can only speak to variables mentioned in the varlist they are called with
  // that is, if we are going predict on some vectors, we need to know what X variables we're
  // predicting on in their entirety before we call down to C--and they should match what 
  
  // I haven't discovered how regress and friends manage to know which variables to predict on
  // the only place I see them record what they did is in e(cmdline)
  // but that has cruft in it
  // the easiest way I can think to extract the predictor list is to *reparse* the command line
  // TODO: consider if it's saner to simply pre-store e(indepvars) or e(predictors) or something
  local 0 = "`e(cmdline)'"
  gettoken cmd 0 : 0 /*remove the command which was artificially tacked on by svm_train*/
  syntax varlist [if] [in], * //* puts the remainder in `options' and allows this code to be isolated from svm_train (it's not like we actually could tweak anything, since the svm_model is stored on the plugin's C heap)
  if("`e(svm_type)'"!="ONE_CLASS") {
    gettoken y varlist : varlist  // pop the first variable
    assert "`y'" == "`e(depvar)'" // and check consistency with the svm_train

    // make the target column
    // it is safe to assume that `target' is a valid variable name: "syntax" above enforces that
    //  and it should be safe to assume the same about `e(depvar)': unless the user is messing with us (in which case, more power to them), it should have been created by svm_train and validated at that point
    quietly clone `target' `e(depvar)' if 0 //'if 0' leaves the values as missing, which is important: we don't want a bug in the plugin to translate to source values sitting in the variable (and thus inflating the observed prediction rate)
    local L : variable label `target'
    if("`L'"!="") {
      label variable `target' "Predicted `L'"
    }
  }
  else {
    //ONE_CLASS
    quietly gen int `target' = .
    label variable `target' "Within support"
  }
  
  if("`probability'"!="") {
    // allocate space (we use new variables) to put probability estimates for each class for each prediction

    // ensure model is a classification
    // this duplicates code over in svm_train, but I think this is safest:
    //  svm_import allows you to pull in svm_models created by other libsvm
    //  interfaces, and they mostly don't have this protection. 
    if("`e(svm_type)'" != "SVC" & "`e(svm_type)'" != "NU_SVC") {
      // in svm-predict.c, the equivalent section is:
      /*
       *        if (predict_probability && (svm_type==SVC || svm_type==NU_SVC))
       *                predict_label = svm_predict_probability(model,x,prob_estimates);
       *        else
       *                predict_label = svm_predict(model,x);
       */
      // it is cleaner to error out, rather than silently change the parameters, which is what the command line tools do
      di as error "Error: trained model is a `e(svm_type)'. You can only use the probability option with classification models (SVC, NU_SVC)."
      exit 2
    }
    
    // save the top level description to splay across the stemmed variables
    local D : variable label `target'
    
    // Collect (and create) the probability columns
    // TODO:  get it to generate the columns in the "levelsof" order, but actually use them in the libsvm order
    //         -> right now it is in the libsvm order, which is fine. the results are correct. they're just not as convenient. 
    // BEWARE: the order of iteration here is critical:
    //     it MUST match the order in svm_model->labels or results will silently be permuted
    //     the only way to achieve this is to record the order in svm_model->labels and loop over that explicitly, which is what e(levels) is for
    assert "`e(levels)'" != ""
    foreach l in `e(levels)' {
      // l is the "label" for each class, but it's just an integer (whatever was in the original data table)
      
      // We try to label each column by the appropriate string label, for readability,
      //  but if it doesn't exist we fall back on the integer label.
      //
      // The command to do this is poorly documented. What this line does is
      //  look up the value label for value `l'
      //  *or* give back `l' unchanged if `target' has no labels
      //  which is precisely what we want it to do here.
      local L : label (`e(depvar)') `l'
      
      // compute the full variable name for level `l'
      local stemmed = "`target'_`L'"
      local stemmed = strtoname("`stemmed'") //sanitize the new name; this summarily avoids problems like one of your classes being "1.5"
      
      // finally, allocate it
      // unlike `target' which clones its source, we use doubles
      // because these are meant to hold probabilities
      
      // TODO: what happens if there's a name collision partially through this loop?
      //       what I want to happen is for any name collision or other bug to abort (i.e. rollback) the entire operation
      //       This can be achieved with "snapshot": snapshot; capture {}; if(fail) { rollback to snapshot }"
      quietly generate double `stemmed' = .
      label variable `stemmed' "Pr(`D'==`L')"
      
      // attach the newcomers to the varlist so the plugin is allowed to edit them
      local varlist = "`varlist' `stemmed'"
    }
  }
  else if("`scores'"!="") { // else-if because these options are mutually exclusive (which is enforced above)
    // Allocate space for the decision values
    // This is more complicated because we need to go down a lower triangle of a matrix -- so, a length-changing nested loop.
    // we have to use word("`e(levels)'", i) to extract the ith level
    // which means we have an extra layer of indirection to deal with, so there's x_i the index into e(labels), x the integer label, and X the string (or possibly integer) label
    
    // we need to split the cases of classification and non-classification models
    //  reason i:  non-classification models have model->label == NULL which means e(levels) is missing which breaks this code
    //  reason ii: non-classification models only have one decision value, so the sensible label is just "`target'_score"
    if("`e(svm_type)'" == "ONE_CLASS" | "`e(svm_type)'" == "SVR" | "`e(svm_type)'" == "NU_SVR") {
      // generate the name of the new column.
      // it is, unfortunate, somewhat terse, in hopes of keeping within 32 characters
      local stemmed = "`target'_score"
      local stemmed = strtoname("`stemmed'")  //make it Stata-safe
      
      // allocate the decision value column
      quietly generate double `stemmed' = .
      label variable `stemmed' "`target' svm score"
      
      // attach the newcomers to the varlist so the plugin is allowed to edit them
      local varlist = "`varlist' `stemmed'"
    }
    else if("`e(svm_type)'" == "SVC" | "`e(svm_type)'" == "NU_SVC") {
      local no_levels = `e(N_class)'
      forvalues l_i = 1/`no_levels' {
        //di "l_i = `l_i'"
        local l = word("`e(levels)'", `l_i')
        local L : label (`e(depvar)') `l'
        forvalues r_i = `=`l_i'+1'/`no_levels' {
          //di "r_i = `r_i'"
          local r = word("`e(levels)'", `r_i')  // map the index into the labels
          local R : label (`e(depvar)') `r'
          //di "generating svm score column (`l_i',`r_i') <=> (`l',`r') <=> (`L',`R')"
          
          // generate the name of the new column.
          // it is, unfortunate, somewhat terse, in hopes of keeping within 32 characters
          local stemmed = "`target'_`L'_`R'"
          local stemmed = strtoname("`stemmed'")  //make it Stata-safe
        
          // allocate the decision value column
          quietly generate double `stemmed' = .
          label variable `stemmed' "`target' svm score `L' vs `R'"
          
          // attach the newcomers to the varlist so the plugin is allowed to edit them
          local varlist = "`varlist' `stemmed'"
        }
      }
    }
    else {
      di as error "Unrecognized svm_type `e(svm_type)'; unable to define svm score columns."
      exit 2
    }
  }
  
  // call down into C
  // we indicate "probability" mode by passing a non-empty list of levels
  //  this list implicitly *removes* from the set range of variables to predict from: the trailing variables are instead write locations
  //  (this feels like programming a hardware driver)
  // Subtlety: we don't quote levels, on the assumption that it is always a list of integers;
  //           that way, the levels are pre-tokenized and the count easily available as argc
  
  plugin call _svmachines `target' `varlist' `_if' `_in', `verbose' predict `probability' `scores'

  if("`e(svm_type)'"=="ONE_CLASS") {    
    // libsvm gives {1,-1} for its one-class predictions;
    // normalize these to {1,0}
    qui replace `target' = 0 if `target' == -1
  }
end





/* clone.ado: generate a perfect copy of a variable: type, labels, etc.

 syntax:
  clone newvar oldvar [if] [in]
 
 You can use 'if' and 'in' to control what values; values that don't match will be set to missing.
 If you want to clone a variable's metadata but not values use the idiom ". clone new old if 0".

 NB: The reason the syntax is not "clone newvar = oldvar", even though that would fit the pattern
     set by generate and egen, is that syntax's =/exp option insists on parsing numeric expressions,
     so string variables wouldn't be cloneable.
 */
 

program define clone
  version 13
  
  // parse once to extract the basic pieces of syntax
  syntax namelist [if] [in]
  local _if = "`if'" //save these for later; the other syntax commands will smash them
  local _in = "`in'"
  
  gettoken target source : namelist
  
  // enforce types
  confirm new variable `target'
  confirm variable `source'
  
  // save attributes
  local T : type `source' //the data type
  local N : variable label `source' //the human readable description
  local V : value label `source' // the name of the label map in use, if there is one
                                 // Stata maintains a dictionary of dictionaries, each of which
                                 // maps integers to strings. Multiple variables can share a dictionary,
                                 // though it is rare except for e.g. "boolean"
  
  // make new variable
  generate `T' `target' = `source' `_if' `_in'
  
  // clone attributes if they exist
  // (except for type, which always exists and cannot be reassigned without
  //  another 'generate' doing a whole new malloc())
  if("`N'"!="") {
    label variable `target' "`N'"  //Yes, the setters and getters are...
  }
  if("`V'"!="") {
    label value `target' "`V'"     //...in fact reverses of each other
  }
end