*! Date    : 21 Jul 2009
*! Version : 1.40
*! Author  : Adrian Mander
*! Email   : adrian.mander@mrc-bsu.cam.ac.uk

*! Iterative proportional fitting in contingency tables

/*
 1Nov2006 v1.38 Add in the squared ascii character to the output
15May2009 v1.39 Move to version 10 added extra numeric check on unique varlist
21Jul2009 v1.40 Remove the extra numeric check on unique varlist because hapipf needs strings
*/

program define ipf, rclass
version 10.0
preserve

di "{txt}Deleting all matrices......"
matrix drop _all

tempfile master
qui save "`master'"

/*********************************************************************************
 * parse the syntax fit(x*y + x*z +y*z)
 *
 * NB if the user doesn't do a varlist then the program will calculate it for you 
 * NOTE then you will have a blank varlist and you might use ulist (unique list) or 
 * vlist (variable list from fit() instead throughout
 *********************************************************************************/

syntax [varlist(default=none)] [fweight/] , FIT(string) [ CONstr(string) CONFILE(string) CONVARS(varlist) SAVE(string) EXPect NOLOG ACC(real 0.000001)]
tokenize "`varlist'"

di
di "{txt}Expansion of the various marginal models"
di "{dup 40:{c -}}"

local i 1
local vlist " "
gettoken mmodel fit:fit, parse("+")
while "`mmodel'"~="" {
   while "`mmodel'"=="+" {
     gettoken mmodel fit:fit, parse("+") 
   }

/* now split mmodel into the variables in the interaction terms */

   gettoken left mmodel:mmodel, parse("*")

/*
  Split left into the individual variables
  The resultant varlist is put into marg1-margn
*/

   while "`left'"~="" {
     while "`left'"=="*" {
       gettoken left mmodel:mmodel, parse("*") 
     }
     
     cap confirm variable `left'
     if _rc==111 di "{err}Make sure `left' exists in your dataset"
     confirm variable `left'

     local marg`i' "`marg`i'' `left'"
     gettoken left mmodel:mmodel, parse("*")
   }

/* Put the complete set of variables in vlist WITH replicates */

   local vlist "`vlist' `marg`i++''" 
   gettoken mmodel fit:fit, parse("+")
} 

/****************************************************************************
 * The next bit of the code will be able to remove the
 * multiples in the varlist constructed from the model statement
 * this is held in `vlist'
 *
 * NB: There could be a problem here if a variable in the varlist has the
 * same name as ANY local macro used in the program
 ****************************************************************************/

foreach uvar of varlist `vlist' {
  if "``uvar''"=="" {
    local ulist "`ulist' `uvar'"
    local `uvar' "seen"
  }
}

isnumeric `uvar'

/****************************************************************************
 * Take individual data as the default or else use the frequency variable
 * from the option
 ****************************************************************************/  

tempvar freqwt
if "`weight'"=="" qui gen double `freqwt'=1
else qui gen double `freqwt'=`exp'

/****************************************************************************
   Display the marginal models varlists This is part check part information 
   Make sure you DONT change the local macro i from the previous usage
 ****************************************************************************/
   
local nomargs = `i'-1
forval i=1/`nomargs' {
  di as res "marginal model `i' varlist : `marg`i'' "
}

/********************************************************************************
 * Now I need to expand to dataset to cover all possible cells and give
 * them all frequency 0 before doing a contract statement
 ********************************************************************************/

if "`varlist'"=="" {
   local ind 1
   foreach var of varlist `ulist' {
     _unique `var'
     mat u`ind'=resp
     local `var' "`r(unique)'"
     local ind = `ind'+1
   }
   tempfile temp
   qui save "`temp'",replace
   drop _all

   foreach var of local ulist {
     gen `var'=.
   }


   tokenize "`ulist'"
   
   qui set obs ``1''
   local last ""
   sort `1'
   local ind 1
   while "`1'"~="" {
      local last "`last' `1'"
      sort `last'
      qui by `last': replace `1'=u`ind'[_n,1]
      cap expand ``2''
      sort `1'
      local last "`last' `1'"
      mac shift 1
      local ind = `ind'+1
   }
   qui gen double `freqwt'=0
   sort `ulist'


}

/******************************************************
 * The varlist defines the dimension of the data space 
 * (saturated model)
 ******************************************************/

else {
   local ind 1
   tokenize "`varlist'"
   while "`1'"~="" {
     _unique `1'
     mat u`ind'=resp
     local `1' "`r(unique)'"
     local ind = `ind'+1
     mac shift 1
   }

   tempfile temp
   qui save "`temp'",replace
   drop _all
   tokenize "`varlist'"
   while "`1'"~="" {
     gen `1'=.
     mac shift 1
   }

   tokenize "`varlist'"
   qui set obs ``1''
   local last ""
   sort `1'
   local ind 1
   while "`1'"~="" {
      local last "`last' `1'"
      sort `last'
      qui by `last': replace `1'=u`ind'[_n,1]
      cap expand ``2''
      sort `1'
      local last "`last' `1'"
      mac shift 1
      local ind=`ind'+1
   }
   qui gen double `freqwt'=0
   sort `varlist'
}

/*****************************************************************
 * Calculate the model degrees of freedom. Obviously due to
 * structural zeroes and sampling zeroes the degrees of freedom
 * may be lower.
 *****************************************************************/

/* Put the name of unique variables in the macro tok`tind' */

local tind 1
foreach uvar of varlist `ulist' {
  local tok`tind' "`uvar'"
  local tind=`tind'+1
}

/* di "NOW to make an ordered set of margi's" */

local term ""

forval i=1/`nomargs' {
  local uind 1
  while `uind' <= `tind' {
    tokenize "`marg`i''"
    while "`tok`uind''"~="`1'" & "`1'"~="" {
      mac shift 1
    }
    if "`1'"~="" local term`i' "`term`i'' `1'"
    local uind = `uind'+1
  }
}

forval i=1/`nomargs' {
   local marg`i' "`term`i''"
}

/* So now marg1-margn now contain a list of factors that are in the order the varlist */
    
di "unique varlist `ulist'"
    
local term 1
forval i=1/`nomargs' {
  tokenize "`marg`i''"
  
/*
 Compare `marg`i'' to `ulist' ..... 
 I think put the variable number in the macro u1-un
*/ 

  local mind 1
  while "``mind''"~="" {
    local uind 1
    while "``mind''"~="`tok`uind''" {
      local uind=`uind'+1
    }
    local u`mind' = `uind'
    local mind = `mind'+1
  }

/*
  Take each marginal model 

  Expand out the number of interactions in the binary statement
*/

  tokenize "`marg`i''"
  
  local nterms 1
  while "``nterms''"~="" {
    local nterms=`nterms'+1
  }
  local nterms = `nterms'-1 
    local j 0
    while `j'< 2^`nterms' {
       binary `j' `nterms'
       local it 1
       local newuterms ""
       while `it'<=`nterms' {
         if "`newuterms'"=="" local newuterms "`u`r(s`it')''"
         else if "`u`r(s`it')''"~="" local newuterms "`newuterms'-`u`r(s`it')''" 
         local it = `it'+1
       }
       local nmylist "`nmylist' `newuterms'"
       local j=`j'+1
    }

}

/* Having expanded out all the interactions it is time to remove the repeats */

tokenize "`nmylist'"
local mlind 1
while "``mlind''"~="" {
  local ind 1
  while "``mlind''" ~= "``ind''" {
    local ind=`ind'+1
  }
  if (`mlind' == `ind') local nblist "`nblist' ``ind''"
  local mlind=`mlind'+1
}

/* 
  nblist contains numbered variables with interactions AND it is unique!! 
  1 2 3 4 5 6 7 8 9 10 9-10

*/

local nparms 0
foreach var of local nblist {
  local len=length("`var'")
  local prod 1
  if `len'>1 {
    local adevar "`var'"
    gettoken ade adevar: adevar, parse("-")
    while "`ade'"~="" {
      if "`ade'"~="-" local prod = `prod' * (``tok`ade'''-1) 
      gettoken ade adevar: adevar, parse("-")
   }    
  }
  else {
      local ade "`var'"
      local prod = `prod' * (``tok`ade'''-1)
  }
  local nparms = `nparms'+`prod'
}

local ncells 1
local i 1
tokenize "`ulist'"
while "``i''"~="" {
  local ncells = `ncells'* ``tok`i'''
  local i = `i'+1
}
      
/* add one for the constant term :) */

local nparms =`nparms'+1
local df = `ncells'-`nparms'
di
di "{txt}N.B.  structural/sampling zeroes may lead to an incorrect df"
di as res "{txt}Residual degrees of freedom = {res}`df'"
di as res "{txt}Number of parameters        = {res}`nparms'"
di as res "{txt}Number of cells             = {res}`ncells'"
di

/* Now I am adding in the original data.... */

qui append using "`temp'"
   
/***************************************************************************
 * Do a contracting of the dataset on the varlist since
 * the likelihood will be calculated over this space as
 * the model may be on a smaller space
 *
 * NB: the ulist should be a SUBSET of the varlist
 ***************************************************************************/

local LHOOD 0
if "`varlist'"~="" {
   local LHOOD 1
   di as text "N.B. The likelihood is calculated for the cells spanned by `varlist'"
   global tablist "`varlist'"

   sort `varlist'
   qui by `varlist': replace `freqwt'=sum(`freqwt')
   qui by `varlist': keep if _n==_N
   qui save lhood,replace
}

/*********************************************************
 *      Do a contracting of the dataset!
 * 1) This has to be done on the unique list in the model 
 *********************************************************/

sort `ulist'
qui by `ulist': replace `freqwt'=sum(`freqwt')
qui by `ulist': keep if _n==_N

/***********************************************************************
 * Initialise the Expected Frequencies to either be contained in
 * a file or by using the syntax I have devised.
 *
 *  [D==1.T==1]=2 means
 * when D is 1 AND T is 1 the expected starting value
 * is 2 the values of the D and T must be specified otherwise what will
 * the baseline category be?
 ***********************************************************************/

cap confirm new Efreq
qui gen double Efreq=.
cap confirm new Efreqold
qui gen double Efreqold = 1

if "`constr'"~="" & "`confile'"=="" {

  /* get the first variable */

  while "`constr'"~="" {
    gettoken start constr:constr,parse(",")
    gettoken part start:start,parse("[ ")
    gettoken part start:start,parse("]") 
    local conif " `part'"
    gettoken part start:start,parse("] ") 
    di "{err}replacing all values with the condition `conif' with `start'"
     qui replace Efreqold = `start' if `conif'
     gettoken start constr:constr,parse(",")
  }
}


if "`confile'"~="" {
   if "`convars'"=="" {
     di "{err} You must specify convars option when using CONFILE"
     exit(198)
   }
   drop Efreqold
   sort `convars'
   cap drop _merge
   merge `convars' using `confile'
   cap drop _merge

/*Efreqold should be in `confile' */

   cap confirm variable Efreqold
   if _rc==111 {
     di "{err} the constraints file must include Efreqold"
     exit(111)
   }
   tempvar constr constrm
   gen `constr'=(Efreqold ~=.)
   gen `constrm'=(Efreqold ==.)
   replace Efreqold=1 if Efreqold==.
}
gen double Ofreq = `freqwt'


/*********************************************************
 *      Do a contracting of the dataset!
 * Because of the merging
 *********************************************************

sort `ulist' Efreqold
qui by `ulist': replace `freqwt'=sum(`freqwt')
qui by `ulist': keep if _n==_N

*/


/*******************************************************
 * The algorithm loop that stops when the loglikelihood
 *doesnt change much
 *******************************************************/

local llh 1000
local oldllh 0
local iter 1
cap gen double marg=.
cap gen double nmarg=.
  
while (abs(`llh'-`oldllh')>`acc') {
       
   /* Sum over the marginal models */
  
   forval i=1/`nomargs' {
     
     sort `marg`i''
     qui by `marg`i'' : replace marg=sum(Efreqold)
     qui by `marg`i'' : replace nmarg=sum(Ofreq)
     qui by `marg`i'' : replace Efreq=Efreqold*(nmarg[_N]/marg[_N])
                       
     /* Make a copy of the new estimates */
     qui replace Efreqold=Efreq
   }
     
    /* Sum over the constrained section of the continguency table. */

   if "`convars'"~="" {
     sort `convars'
     qui by `convars' : replace marg=sum(Efreqold) if `constr'==0
     qui by `convars' : replace marg=marg[_N] if `constr'==0
     qui by `convars' : replace nmarg=sum(Ofreq) if `constr'==0
     qui by `convars' : replace nmarg=nmarg[_N] if `constr'==0
     qui by `convars' : replace Efreq=Efreqold*(nmarg[_N]/marg[_N]) if `constr'==0
      
     /* Make a copy of the new estimates */
     qui replace Efreqold=Efreq  if `constr'==0
   }
    
/************************************************************************
 * Calculate the Multinomial/Poisson Likelihood if a varlist else the poisson
 * sampling likelihood
 ************************************************************************/

   if "`varlist'"~="" {
     sort `varlist'
     qui save temp,replace
     qui use lhood,replace
     cap drop _merge
     merge `varlist' using temp
     cap drop _merge

     sort `ulist'
     qui by `ulist': replace Efreq=Efreq[_N]/2
     drop Ofreq
     rename `freqwt' Ofreq
     sort `varlist'
   }


/* This is kernel of likelihood only */
      
   tempvar addall
   gen double `addall' = sum( Ofreq * log(Efreq) - Efreq )

   local oldllh = `llh'
   local llh= `addall'[_N]
  if "`nolog'"=="" di "{txt}Loglikelihood = {res}`llh'"
  drop  `addall'

  local iter=`iter'+1
}


/*********************************************************
 * Pearson's and likelihood ratio tests of goodness of fit
 *********************************************************/

tempvar addmn addm
gen double `addmn'=sum(((Efreq-Ofreq)^2)/Efreq)
gen double `addm'=sum(Ofreq*log(Ofreq/Efreq))
local g2 = 2*`addm'[_N]
local x2 = `addmn'[_N]
local pv_g = chiprob(`df',`g2')
local pv_x = chiprob(`df',`x2')
di
di "{txt}Goodness of Fit Tests"
di "{dup 21:{c -}}"
di "{txt}df = {res}`df'"
di "{txt}Likelihood Ratio Statistic G{c 178} = {res}" %8.4f `g2', "{txt}p-value = {res}" %5.3f `pv_g'
di "{txt}Pearson Statistic          X{c 178} = {res}" %8.4f `x2', "{txt}p-value = {res}" %5.3f `pv_x'

drop `addm' `addmn'

/*********************************************************
 * Calculating the probabilities of each category with
 * groups defined by the varlist
 *********************************************************/

if "`varlist'"~="" {
   sort `varlist'
   qui save temp,replace
   use lhood,replace
   merge `varlist' using temp
   sort `ulist'
   qui by `ulist': replace Efreq=Efreq[_N]
   
   sort `varlist'
   qui gen double prob=sum(Efreq)
   qui replace prob=Efreq/prob[_N]
   sort `varlist'
   qui compress
   if "`expect'"~="" l `varlist' Efreq Ofreq prob, noobs nodisplay 
   if "`save'"~="" {
      keep `varlist' Efreq Ofreq prob
      cap confirm new `save'.dta
      if _rc~=0 save "`save'", replace 
      else save "`save'" 
   }
}
else {
   qui sort `ulist'
   qui gen double prob=sum(Efreq)
   qui replace prob=Efreq/prob[_N]
   qui compress
   if "`expect'"~="" l `ulist' Efreq Ofreq prob, noobs nodisplay 
   if "`save'"~="" {
      keep `ulist' Efreq Ofreq prob
      cap confirm new `save'.dta
      if _rc~=0 save "`save'", replace
      else save "`save'"
   }
}

use "`master'",replace
return local vlist "`ulist'"
return scalar df=`df'
return scalar parms=`nparms'
return scalar ncells=`ncells'
return scalar llh = `llh'
return scalar g2 = `g2'
return scalar x2 = `x2'
return scalar pvg = `pv_g'
return scalar pvx = `pv_x'
restore

end
          
/**********************************************
 * Calculate the unique values in
 **********************************************/

program define _unique, rclass
  syntax [varlist]

preserve
qui bys `varlist': keep if _n==1
qui count if `varlist'==.
if `r(N)'>0 {
  di "{err} `varlist' has missing data and missing data must be coded as an integer value"
  exit(666)
}
return scalar unique=_N
cap mkmat `varlist', matrix(resp)
if _rc~=0 {
 di "{err}Too many unique values to form resp! in _unique"
 di "{err} there are {res}"_N 
 noi list `varlist'
}
restore

end

/**********************************************
 * Work out binary
 **********************************************/

program define binary, rclass
args dec nterms

local two=2^(`nterms'-1)
local ind "`nterms'"

local rem "`dec'"
local str`nterms' bin ""
while `two'>1 {
  if `rem'>=`two' {
    return local s`ind'=`ind'
    local bin "`bin'1"
  }
  else {
    return local s`ind'=""
    local bin  "`bin'0"
  }
  local rem=mod(`rem',`two')
  
  local ind=`ind'-1
  local two=`two'/2
}
local bin "`bin'`rem'"
return local bin="`bin'"
if `rem'==0 return local s1="" 
else  return local s1=`rem' 

end

/**********************************************
 * Work out binary
 **********************************************/
/*
This was to check for string variables but I have dropped this again due to hapipf interaction
*/
pr isnumeric
/*syntax [varlist(numeric)]*/
syntax [varlist]
end