// Nov 13 2011 14:35:03
// Exploit maketrpr to calculate dynamic Hamming


mata:

// This function is not specific to this adofile and is repeated: should be in a central location
real matrix function expandpwdist(real matrix raw, real matrix seqid, real matrix nd)
  {
  output = mm_expand(raw,nd,nd,1)
  return(output[invorder(seqid), invorder(seqid)])
  }
end


mata:

  // Setup and work through all the pairs, calculating the DH distance
real matrix function dynhamming (string vl, string sm, real scalar nstates, real scalar buzz, real scalar nrefs) {
  real scalar i,j,k
  real matrix subsmat, resmat

  st_view(data = .,.,(tokens(vl)))
  maxwidth = cols(data)
  nobs = rows(data)
  resmat = J(nobs, (nrefs==0 ? nobs : nrefs), 0)

  // (N*T) * N transition probability matrix
  subsmat = st_matrix(sm)

  // add p_{ij} to p_{ji} : would  "x :+ x'" be better
  for (i = 1;i<maxwidth;i++) {
    for (j = 1;j<=nstates;j++) {
      subsmat[j + (i-1)*nstates, j] = 2*subsmat[j + (i-1)*nstates, j] // for buzz, diagonal
      for (k = j+1;k<=nstates;k++) {
        subsmat[j + (i-1)*nstates, k] = subsmat[j + (i-1)*nstates, k] + subsmat[k + (i-1)*nstates, j]
        subsmat[k + (i-1)*nstates, j] = subsmat[j + (i-1)*nstates, k]
      }
    }
  }

  if (nrefs==0) {
    // Get the DH distance for each pair
    for (i = 1;i <= nobs;i++) {
      for (j = i + 1;j <= nobs;j++)
        { // Note: explicitly leave a zero in diagonal
          resmat[i,j] = dynhammingdistance(i, j, data, subsmat, nstates, maxwidth, buzz)
        }
      if (buzz==1) {
        resmat[i,i] = dynhammingdistance(i, i, data, subsmat, nstates, maxwidth, buzz)
      }
    }
    resmat = resmat + transposeonly(resmat)
  } else {
    for (i = 1;i <= nobs;i++) {
      for (j = 1;j <= nrefs;j++)
        {
          resmat[i,j] = dynhammingdistance(i, j, data, subsmat, nstates, maxwidth, buzz)
        }
    }
  }
  return(resmat)
}


// Calculate the DH distance for pair of sequences
function dynhammingdistance (real scalar first,
                             real scalar second,
                             real matrix data,
                             real matrix subsmat, 
                             real scalar nstates, 
                             real scalar maxwidth, 
                             real scalar buzz) {

  real scalar hammingdistance, i, trprob;

  hammingdistance = 0;

      // for 1 to length - 1, apply the t/t+1 transition probabilities
      // The distance is \sum (2 - trprob). This is in contrast to Lesnard
      // who applies the t-1/t and t/t+1 transitions, with 4 - tr(-1) - tr(+1)
      // However, this is not necessary with smoothed transition rates.
      
  for (i = 1;i<maxwidth;i++) {
         trprob = subsmat[ data[first,i] + (i-1)*nstates, data[second,i] ];
         if ((data[first,i]!=data[second,i]) | buzz==1) { // Feb 18 2012 00:49:00: understandable but spoils the logic!
            hammingdistance = hammingdistance + (2-trprob); 
            }
         }
      
      // for the final time period, there is no transition rate: use the most recent one. 
      i = maxwidth;
      trprob = subsmat[data[first,i] + (i-2)*nstates,data[second,i]];
      if ((data[first,i]!=data[second,i]) | buzz==1) { // Feb 18 2012 00:49:00: understandable but spoils the logic!
         hammingdistance = hammingdistance + (2-trprob);
         }
      
      return(hammingdistance/maxwidth);
      
      }
end
   
   
capture program drop dynhamming
program define dynhamming
version 9
syntax varlist , PWDist(string)  [DUps BUzz MAwindow(integer 3) REF(integer 0)]

// Facility to over-ride exclusion of duplicates
// Nov 14 2011 : there is a bug here, in that without
// exclusion of duplicates, matches between duplicates
// (in some cases) do not return zero. When duplicates
// are excluded and then expanded, this doesn't seem to
// happen.

if ("`dups'"=="") {
  local dups 0
}
else {
  local dups 1
}

if (`ref'!=0) {
  local dups 1
}
   
if ("`buzz'"=="") {
  local buzz 0
}
else {
  local buzz 1
}

tempname subsmat

   // Generate the dynamic transition probability matrix
   qui maketrpr `varlist', mat(`subsmat') ma(`mawindow')

   local nstates `r(nstates)'

   tempname ndups
   tempname first

   tempvar idvar

   //matrix `pwdist' = J(_N,_N,0);
   mata: st_matrix("`pwdist'", J(`=_N',`=_N',0));
   
   gen `idvar'=_n

   if (`dups'==0) {   
      preserve
      
      sort `varlist'
      //                   mkmat `idvar';
      mata: st_matrix("`idvar'", st_data(.,"`idvar'"));
      by `varlist': gen `ndups' = _N
      by `varlist': gen `first' = _n==1

      qui count if `first'
      di "`r(N)' unique observations"
      qui keep if `first'
      
      // mkmat `ndups'
      mata: st_matrix("`ndups'", st_data(.,"`ndups'"))
      }
   
   // Apply it to the data, via Mata function
   mata `pwdist' = dynhamming("`varlist'","`subsmat'",`nstates',`buzz',`ref')
   // Save the distances (duplicates unexpanded)
   mata: st_matrix("`pwdist'",`pwdist')

   // Expand the duplicates and re-save
   if (`dups'==0) {
      tempname pwdtemp
      
      capture mata mata which mm_expand()
      if _rc {
         di as error "mm_expand() from -moremata- is required; type -ssc install moremata- to obtain it"
         di as error "Alternatively, use the {cmd:dups} option to treat duplicate sequences"
         exit 499
         }
      mata: `pwdtemp'= expandpwdist(st_matrix("`pwdist'"),st_matrix("`idvar'"),st_matrix("`ndups'"))
      mata: st_matrix("`pwdist'",`pwdtemp')
      
      restore
      }
end