*! version 1.00 - Mike Lacy 15Feb2013
cap prog drop ccmpre
prog ccmpre, rclass
// Calculate proportional-reduction-in-error (PRE) measures after a ccm model,
// based on predicting the raw response data.
//
syntax varlist [if] [in],  key(name) COMPetence(name) [base(string)]
version 11.2  // Might well work on a lower version; I don't know.
//
// competence: N x 1 matrix giving competence scores, one row per respondent
// key: matrix of answer-key probabilities, where key[k,l] gives Prob(response l is the right answer to question k)
// base: string giving the user's choice of baseline guessing model,
//       one of {U I T} (uniform, individual, total)
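//
// Illustrative call (hypothetical names; response variables q1-q2 hold
// categories 1..3, and the key has one column per response category):
//   . matrix comp = (.8 \ .5 \ .9)                // competence scores, N x 1
//   . matrix ckey = (.7, .2, .1 \ .1, .8, .1)     // answer key probs, K x L
//   . ccmpre q1 q2, key(ckey) competence(comp) base(T)
//   . return list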
marksample touse
// validate input
confirm matrix `key'
confirm matrix `competence'
quiet count if `touse'
if (r(N) != rowsof(`competence')) {
   di as error "Error: data given to -ccmpre- has a different N than matrix `competence'."
   exit 198
}
if (colsof(`competence') > 1) {
   di as error "Error: competence matrix must be N x 1."
   exit 198
}
//
unab X: `varlist' 
local K = rowsof(`key')
local L = colsof(`key')
di "Key covers `K' questions with `L' possible responses. "
local N = rowsof(`competence')
// which guessing model; default to U (uniform) if none or an unrecognized value was given
if (!inlist("`base'", "I", "T", "U")) {
   local base "U"
}
// Mata does the work
mata: ResponsePRE("`X'", "`competence'", "`key'", "`base'",  "`touse'")
// Handle the returns
// Here is what Mata returned in r():
//    E2Item   errors expected for each item under the model
//    E2       total of errors under the model
//    E1Item   errors expected for each item under the baseline model
//    E1       total of errors under the baseline model
//    PRE      the PRE measure
return add
return local basemodel = "`base'"
// Put nice rownames onto matrices before return
tempname temp
foreach M in E2Item E1Item {
   mat `temp' = return(`M')
   mat rownames `temp' = `X'
   return matrix `M' = `temp'
}
//
di as text "Proportional Reduction in Error:"
di as text _col(5) "Predicted error under baseline (`base'): " as result return(E1)
di as text _col(5) "Predicted error under the model: " as result return(E2)
di as text _col(5) "PRE = " as result return(PRE)
end
//
cap mata mata drop ResponsePRE()
mata:
void ResponsePRE(string scalar Xvarlist, ///   /* varlist of variables with raw response data */
           string scalar Dmatname, ///      /* name of competence matrix*/
           string scalar Zmatname, ///      /* name of key matrix */
           string scalar base,    ///       /* model for baseline error */
           string scalar touse)             /* name of the touse variable */
{   
// Arguments are the names of the response variables, the competence matrix, and the key matrix.
// Calculates the response-based PRE measure, returning the overall and per-item measures.
  real scalar i, k, l
  real scalar N, nitems, ncat   // n of subjects, items, and response categories
  real scalar observed, E1, E2
  real matrix X      // X[i,k] is the response of person i to item k
  real matrix Z      // Z[k,l] is Prob(l is the right answer to item k)
  real colvector D   // D[i] is the competence of person i
  real colvector E   // E[k] is the expected prediction error for item k
  real colvector pobs   // pobs[i] is the probability of person i's observed
                        // response on a given item, given the key and D[i]
  real matrix pImarg    // pImarg[i,l] is person i's marginal response proportion for category l
  real rowvector pTmarg // pTmarg[l] is the pooled (all-persons) marginal response proportion for category l
  //
   
  // Obtain raw data
  X = st_data(., tokens(Xvarlist), touse) 
  D = st_matrix(Dmatname) 
  Z = st_matrix(Zmatname) 
  N = rows(X)        // n of subjects
  nitems = cols(X)   // n of items
  ncat = cols(Z)     // n of response categories per item
  //
  /*
  // Echo input
  printf("Competence\n")
  round(D, 0.001)
  printf("Key")
  round(Z, 0.01)
  printf("Observed Response matrix\n")
  X
  */
  //
  //
  // Calculation of error under the model, given key and competence
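  // For each person/item pair, the model-predicted probability of the observed
  // response (the pobs[] assignment below) is
  //    D[i]*Z[k,observed] + (1 - D[i])*(1/ncat),
  // i.e., competence times the key probability of the observed category plus a
  // uniform-guessing term; expected errors on an item are then N - sum_i pobs[i].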
  pobs = J(N, 1, 0)
  E = J(nitems, 1, 0) // initialize for sum of errors on each item
                      // E[k,1] = expected errors on item k, 0 < E < N
  for (k = 1; k <= nitems; k++) { // one item at a time
      for ( i = 1; i <= N; i++) {  
        observed = X[i,k] 
        // Model based prob(observed)
        pobs[i] = Z[k,observed]*D[i]  + (1-D[i])/ncat
        //debug printf("pobs %2.0f = %6.3f\n", i, pobs[i])  
      }
      // errors on this item, sum across all persons
      E[k] = N - sum(pobs)  // Error = Total - expected correct
      //debug printf ("C item %2.0f:",k); sum(pobs); printf("\n") 
      //debug printf ("E item %2.0f:",k); E[k]; printf("\n")  
         
  }
  st_matrix("r(E2Item)", E)  // errors expected for each item under the model
  E2 = sum(E)  // total of errors expected under the model
  st_numscalar("r(E2)", E2)
  //
  // Calculate errors under the baseline (guessing) model.
  // First set up for whichever guessing regime is being used.
  // 
  if (base != "U" ) {  
     // Get each individual's marginals for each response category
     // across all items. Used as is for I, or collapsed to T.
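     // For instance (hypothetical numbers): with ncat = 3 and nitems = 4, a person
     // whose responses are (1, 3, 1, 2) gets the pImarg row (0.50, 0.25, 0.25);
     // averaging the rows of pImarg over all N persons gives pTmarg.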
     pImarg = J(N, ncat, 0)
     for (i = 1; i <= N; i++) {
        for (k = 1; k <= nitems; k++) {
           observed = X[i,k]
           pImarg[i, observed] = pImarg[i, observed] + 1
        }
     }
     pImarg = pImarg * (1/nitems)
     // debug: pImarg
     if (base == "T") {   // collapse I marginals to T
        pTmarg = colsum(pImarg) * (1/N)
     }
     
  }
  // Else guessing will just be U uniform, the default.
  // 
  // Count expected errors under the baseline model
  pobs  = J(N, 1, 0)  
  E = J(nitems, 1, 0) // initialize for accumulation
  for (k = 1; k <= nitems; k++) { 
     for ( i = 1; i <= N; i++) {  
        observed = X[i,k] 
        if (base == "I") {
           pobs[i] = pImarg[i,observed] // individual marg prob for the obs cat
           //debug printf("Ebase pobs item %2.0f, person %2.0f = %6.3f\n", k,i, pobs[i]) 
        }
        else if (base == "T") {
           pobs[i] = pTmarg[observed]   // overall marg prob for the obs cat
        }
        else if (base == "U") {
           pobs[i] = 1/ncat             // uniform prob of the observed cat
        }
      }
      E[k] = N - sum(pobs)              // Error on this item = Total - expected correct
  }  
  st_matrix("r(E1Item)", E)          // Errors expected for each item under the baseline model
  E1 = sum(E)
  st_numscalar("r(E1)", E1) 			// total of errors expected under the baseline
  //
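  // Example (hypothetical numbers): E1 = 40 expected errors under the baseline
  // and E2 = 25 under the model give PRE = (40 - 25)/40 = 0.375.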
  st_numscalar("r(PRE)", (E1 - E2)/E1)
  //
  //
}
end // mata
//