/*
mahascores.ado
David Kantor
Began 2-19-2008

This will call mahascore repeatedly to form the full set of Mahalanobis
distances between every pair of observations (possibly limited if -treated()-
is specified).

See mahascore.ado for more background.

There are 3 ways to get the results:
 varprefix()
 genmat()
 genfile()


Changes made around 21mar2008:
 idvar is optional; for defaults, you get...
  obs1, ons2, etc. as row & col names;
  `varprefix'1, `varprefix'2, etc. as varnames;
  values 1, 2 (numeric) in the genfile.

 Changed prime_id to name1; added name2. Defaults depend on whether idvar is
 present.

 Added an option for transposing the matrix.

 Using a tempname for the genfile handle; improves break behavior.


2012feb8: enhanced handling of errors from covariancemat.

*/


*! version 1.0.7 2012feb8
/* prior:
1.0.6 31mar2008 (or apr 1)
*/

prog def mahascores
version 8.2


/*
A set of new variables will be computed that reflects the distance between each
observation and the other observations.

Alternatively, a matrix is generated.
Or a file is written.

We have a -treated- option; it is used in limiting the covariance calculations
to the treated cases.  It also limits the vars (or matrix rows) generated,
unless -all- is specified.

*/

#delimit ;

syntax varlist [aw fw iw pw] ,
 [
 idvar(varname)

 varprefix(string)
 genmat(name)
 genfile(string) name1(name) name2(name) scorevar(name) replace full 

 treated(varname numeric) INVCOVarmat(name) COMPUTE_invcovarmat
 DISPlay(string)

 UNSQuared EUCLidean VERBose float all TRANSpose
 noCOVTRLIMitation
 ]
 ;
#delimit cr

local progname "mahascores"
local maxint = 32740
local maxbyte = 100


if "`varprefix'" == "" & "`genmat'" == "" & "`genfile'" == "" {
 disp as err "`progname': varprefix, genmat, or genfile must be specified"
 exit 198
}



/* Some code adapted from mahapick */
if "`genfile'" ~= "" {

 if "`name1'" == "" {
  if "`idvar'" == "" {
   local name1 "_refobs"
  }
  else {
   local name1 "_refid"
  }
 }

 if "`name2'" == "" {
  if "`idvar'" == "" {
   local name2 "_obs"
  }
  else {
   local name2 "`idvar'"
  }
 }



 if "`scorevar'" == "" {
  local scorevar "_score"
 }

 assert_distinct name1 scorevar `name1' `scorevar' `progname'
 assert_distinct name2 scorevar `name2' `scorevar' `progname'
 assert_distinct name1 name2 `name1' `name2' `progname'


 if "`replace'" == "" {
  confirm new file `genfile'
 }
 else {
  capture confirm new file `genfile'
   local rc1 = _rc
  capture confirm file `genfile'
   local rc2 = _rc
  if `rc1' & `rc2' {
    disp as error "`progname': filespec for genfile invalid: `genfile'"
    exit 198
  }
 }
}
else { // no genfile
 local genfile_options "name1 name2 replace scorevar"
 local x_without_genfile 0

 foreach opt of local genfile_options {
  if "``opt''" ~= "" {
   disp as error "`progname': `opt' invalid without genfile"
   local x_without_genfile 1
  }
 }

 if `x_without_genfile' {
  exit 198
 }

}



/*
Note that we checked that the genfile_options were specified only if genfile
was also specified.  There are certain other options that apply under other
conditions, but we don't bother checking.


It is the user's responsibility to assure that `idvar', if present, has unique
values; and no nulls or embedded blanks.
This is more or less important, depending on which output option is used.


This allows a pre-calculated inverse covariance matrix (invcovarmat) to be
passed in as an option.  The purpose is just to allow the user the option.
See mahascore.ado for other reasons -- though there is a different
emphasis there -- to eliminate repeated calculations.  (And HERE we use that
facility.)

*/

/* Borrow code from mahasore; we need to replicate some of it because of
the INVCOVarmat(name) and COMPUTE_invcovarmat options.
*/

local treated_orig "`treated'"

if "`treated'" ~= "" {
 if "`covtrlimitation'" =="" {
  local iftreated_for_cov "if `treated'"
 }
}
else {
 tempvar treated
 gen byte `treated' = 1
 /* -- needed in loop */
}
/* Note that now, `treated' is certainly non-empty. But `treated_orig' retains
the original value, and may be empty.
*/

if "`compute_invcovarmat'" ~= "" {
 if "`invcovarmat'" ~= "" {
  disp as err "`progname': note: both compute_invcovarmat and invcovarmat were specified; ignoring invcovarmat"
 }
	/* covariancemat is an ado by d.k. */
	tempname cov invcov
	capture covariancemat `varlist' `iftreated_for_cov' [`weight' `exp'], covarmat(`cov')
	if _rc {
		disp as err "error from covariancemat"
		error _rc
	}


 if index("`display'", "covar") {
  disp _new as text "covariance matrix:"
  mat list `cov'
 }

 if "`euclidean'" ~= "" {
  /* -euclidean- option means to zero the off-diagonal elements.  It should
  be equivalent to the behavior of the old mahascore; see notes at top.
  This can only apply under the compute_invcovarmat option, as it needs to be
  done BEFORE the inverting.
  The resulting inverted matrix should be just the reciprocals in the diagonal.
  */

  local crows = rowsof(`cov')
  // local ccols = colsof(`cov') -- should equal crows
  forvalues j = 2 / `crows' {
   forvalues k = 1 / `=`j'-1' {
    mat `cov'[`j', `k'] = 0
    mat `cov'[`k', `j'] = 0
   }
  }

  if index("`display'", "covar") {
   disp _new as text "covariance matrix, after euclidean operation:"
   mat list `cov'
  }

 }


 /* disp "begin inv(cov)" */
 mat `invcov' = inv(`cov')
 /* -- you may be able to do invsym(`cov')
 But there are pitfalls, if `cov' is not positive-definite.
 (It ought to be at least positive-semidefinite.)
 But the most general choice is inv().
 */
 /* disp "end inv(cov)" */
}
else {
 /* compute_invcovarmat not specified */

 if "`invcovarmat'" == "" {
  disp as err "`progname': you must specify compute_invcovarmat or invcovarmat"
  exit 198
 }
 else {
  capture confirm matrix `invcovarmat'
  if _rc ~=0 {
   disp as error "`progname': invcovarmat must be a matrix (inverse covariance)"
   exit 198
  }
 }

 if "`weight'" ~= "" {
  disp as err "`progname': weight will be ignored, since compute_invcovarmat was not specified"
 }
 if "`euclidean'" ~= "" {
  disp as err "`progname': euclidean will be ignored, since compute_invcovarmat was not specified"
 }

 local invcov "`invcovarmat'"
}

if index("`display'", "invcov") {
 disp _new as text "invcovariance matrix:"
 mat list `invcov'
}



local all = "`all'" ~= ""
local full = "`full'" ~= ""

if "`genmat'" ~= "" {


 if "`treated_orig'" == "" | `all' {
  local Nj = _N
 }
 else {
  qui count if `treated'
  local Nj = r(N)
 }

 if "`treated_orig'" == "" | `all' | `full' {
  local Nk = _N
 }
 else {
  qui count if ~`treated'
  local Nk = r(N)
 }

 matrix `genmat' = J(`Nj', `Nk', .)

 forvalues j0 = 1/`=_N' {
  if "`treated_orig'" == "" | `all' | `treated'[`j0'] {
   if "`idvar'" =="" {
    local rownames "`rownames' obs`j0'"
   }
   else {
    local rownames "`rownames' `=`idvar'[`j0']'"
   }
  }
  if "`treated_orig'" == "" | `all' | `full' | ~`treated'[`j0'] {
   if "`idvar'" =="" {
    local colnames "`colnames' obs`j0'"
   }
   else {
    local colnames "`colnames' `=`idvar'[`j0']'"
   }
  }
 }

 matrix colnames `genmat' = `colnames'
 matrix rownames `genmat' = `rownames'
}


if "`genfile'" ~= "" {
 local scoretype "double"
 if "`float'" ~= "" {
  local scoretype "float"
 }



 if "`idvar'" ~= "" {
  local typ1: type `idvar'
  local typ2: type `idvar'
 }
 else {
  tempvar n // to optimize typ1 & typ2
  local max_var1 = _N
  local max_var2 = _N

  if ~("`treated_orig'" == "" | `all') {
   gen long `n' = _n
   summ `n' if `treated', meanonly
   local max_var1 = r(max)
  }

  if ~("`treated_orig'" == "" | `all' | `full') {
   summ `n' if ~`treated', meanonly
   local max_var2 = r(max)
  }
  /* Note that the -summ `n'...- operation is valid in all cases. It is just
  more efficient to use _N when appropriate.  Also note that if `n' is
  needed in the second -summ-, it has already been generated in the first --
  because of the order and because the condition
   ~("`treated_orig'" == "" | `all') 
  is more inclusive than
   ~("`treated_orig'" == "" | `all' | `full')
  */
  /* disp "max_var1= `max_var1'" */
  /* disp "max_var2= `max_var2'" */

  #delimit ;
  local typ1 =
  cond(`max_var1' <= `maxbyte', "byte",
  cond(`max_var1' <= `maxint',  "int",
  "long"));

  local typ2 =
  cond(`max_var2' <= `maxbyte', "byte",
  cond(`max_var2' <= `maxint',  "int",
  "long"));

  #delimit cr

 }

 /* disp "typ1= `typ1'" */
 /* disp "typ2= `typ2'" */

 tempname genfile_handle

 postfile `genfile_handle' `typ1' (`name1') `typ2' (`name2') `scoretype' (`scorevar') using `genfile', `replace'
 disp as text "file `genfile' opened for posting"
}


/* End of preparation */


/* Begin generating results */

local j1 = 1
forvalues j0 = 1/`=_N' {
 if `all' | `treated'[`j0'] {

  if "`varprefix'" == "" {
   tempvar newvar
  }
  else {
   if "`idvar'" == "" {
    local newvar "`varprefix'`j0'"
   }
   else {
    local newvar "`varprefix'`=`idvar'[`j0']'"
   }
  }
  /* actually, "`varprefix'`=`idvar'[`j0']'" works out okay if `idvar' is
  empty, but it's not quite proper to do it that way.
  */


  #delimit ;
  mahascore `varlist' ,
   gen(`newvar')
   invcovarmat(`invcov')
   refobs(`j0')
   `unsquared' `verbose' `float'
  ;
  #delimit cr


  if "`genmat'" ~= "" | "`genfile'" ~= "" {
   local k1 = 1
   forvalues k0 = 1/`=_N' {
    if "`treated_orig'" == "" |  `all' | `full' | ~`treated'[`k0'] {

     if "`genmat'" ~= "" {
      matrix `genmat'[`j1', `k1'] = `newvar'[`k0']
      local ++k1
     }

     if "`genfile'" ~= "" {
      if "`idvar'" == "" {
       post `genfile_handle'  (`j0') (`k0') (`newvar'[`k0'])
      }
      else {
       post `genfile_handle'  (`idvar'[`j0']) (`idvar'[`k0']) (`newvar'[`k0'])
      }
     }
    }
   }
  }

  if "`varprefix'" == "" {
   drop `newvar'
  }

  local ++j1
 }
}


if "`genfile'" ~= "" {
 postclose `genfile_handle'
 disp as text "file `genfile' closed"
}

if "`genmat'" ~= "" & "`transpose'" ~= "" {
 matrix `genmat' = `genmat''
}


/* Note that [`weight' `exp'] is not included in the call to mahascore
not appropriate with invcovarmat(`invcov').

Also, it does not use the compute_invcovarmat option; that would defeat the
purpose of it having the invcovarmat() option -- to eliminate repeated (lengthy)
computations of the same values.

*/

end // mahascores



prog def assert_distinct // borrowed from mahapick
args name1 name2 c1 c2 progname
/*
`name1' and `name2' are allegedly the names of macros.
`c1' and `c2' are supposed to be the corresponding contents.
We need to pass names and contents separately, because they are local to the
calling context.
*/
if "`c1'" == "`c2'" {
 disp as error "`progname': `name1' and `name2' must be distinct"
 exit 198
}
end  // assert_distinct