*! VERSION 1.4.1 12/2/92 for STATA version 3.0
*! Slightly cleaned up version 10/8/98.
*! WARNING!  This program is in testing only, and does not incorporate full
*! data protection features
/* Written by:
     D.H. Judson
     Nevada State Demographer's Office/024
     University of Nevada, Reno
     Reno, NV 89557-0100
     (702) 784-6352
     (702) 784-6353 Fax
     email: djudson@unr.edu
Program code enhancements and suggestions are welcome at the above address.
     This program calls ado files:
          centroid.ado
          medoid.ado
This program is based on Anderberg, Michael (1973). Cluster analysis for
applications. NY, NY: Academic press. Chapter 7, Forgy's method, page161.
*/
capture program drop cluster
prog def cluster
version 3.0
local options "Iter(string) Groups(string) Dist(string) Gen(string) Stop(string) MEDoid KEEP STD Random First *"
local varlist "req ex"
local in "opt"
local if "opt"
local weight "opt fweight aweight noprefix"
parse "`*'"
parse "`varlist'", parse(" ")

if "`gen'"=="" {
     di in bl "You must specify a variable name in gen(variable)"
     di in bl "to contain your clustering variable."
     error 9
}

if "`groups'"=="" {
     di in ye "NOTE: Assuming only 2 groups in the data."
}

if "`iter'"=="" {
          local iter=20
}

if "`dist'"=="" {
          local dist=2
}

if ("`groups'"=="")|("`groups'"=="0") {
          local groups=2
}
local sfn "$S_FN"
tempfile user
qui save `user',replace

/* GENERATE ORIGINAL CLUSTERS (RANDOMLY OR SYSTEMATICALLY) */
local CLUSTER `gen'
/* PLACE OBJECTS RANDOMLY INTO CLUSTERS */
if "`random'"~="" {
     gen byte `CLUSTER'=int(uniform()*`groups')+1
     /* CHECK TO MAKE SURE ENOUGH CLUSTERS GENERATED */
     qui tab `CLUSTER' 
     mac def k=_result(2)
          while $k<`groups' {
          replace `CLUSTER'=int(uniform()*`groups')+1
          qui tab `CLUSTER'
          mac def k=_result(2)
          }
}

/* FIRST K OBJECTS DEFINE INITIAL CLUSTERS */
else if "`first'"~="" {
     qui gen byte `CLUSTER'=1
     while "`1'"~="" {
               qui replace `CLUSTER'=. if `1'==.
                    macro shift
     }
     qui replace `CLUSTER'=sum(`CLUSTER')
     qui replace `CLUSTER'=. if `CLUSTER'>`groups'
     parse "`varlist'", parse(" ")
}

/* (APPROXIMATELY) EQUAL NUMBERS IN EACH CLUSTER, TAKEN SYSTEMATICALLY */
else { 
     gen byte `CLUSTER'=int(((_n-1)/_N)*`groups')+1
     }


/* LISTWISE DELETION OF CASES WITH 1 OR MORE MISSING VALUES */

while "`1'"~="" {
      qui replace `CLUSTER'=. if `1'==.
      macro shift
}

/* CHECK IF STOP=0 */
if "`stop'"=="0" {
     stop
}

parse "`varlist'", parse(" ")

/* SEE STATA RELEASE 3.0 REFERENCE MANUAL, [4] PROGRAM_FRAGMENTS, P.305
     THIS SECTION PROTECTS DATA SET FROM -BREAK-, ERRORS, ETC. */

if _N==0 {
      error 2000
}

capture {
/* STANDARDIZE ALL VARIABLES */
      if "`std'"~="" {
           local usrlst "`varlist'"
           local varlist " "
           while "`1'"~="" {
               local 1=substr("`1'",1,length("`1'")-(length("`1'")==8))
               qui egen S`1'=std(`1')
               local varlist "`varlist' S`1'"
               macro shift
           }
           parse "`varlist'", parse(" ")
      }
      mac def iter=0
      mac def conv=0
      while ($iter<$_iter)*($conv~=1) {
      mac def iter=$iter+1

/* DEFINE DISTANCE VARIABLE STRING */
      qui tab `CLUSTER'
      mac def k=_result(2) /*  REPLACE WITH `GROUPS' */
      local it=2
      mac def dvars "DIST1"
           while `it'<=$k {
                mac def dvars "$dvars ,DIST`it'"
           local it=`it'+1
           }
/* CALCULATE CENTROIDS AND DISTANCES */
tempvar MIND
qui gen `MIND'=99999
if "`medoid'"=="" {
     qui centroid `varlist' `if' `in', group(`CLUSTER') dist(`dist') }
else {
     qui medoid `varlist' `if' `in', group(`CLUSTER') dist(`dist') }
qui replace `MIND'=min($dvars)

/* CLASSIFY ACCORDING TO DISTANCES */
     tempvar OLDCL
     qui gen `OLDCL'=`CLUSTER'

     local it=1
     while `it'<=$k {
          qui replace `CLUSTER'=`it' if (DIST`it'==`MIND')*(DIST`it'~=.)
     local it=`it'+1
     }

/* TEST SECTION--COMMENTED OUT SO IT WON'T RUN
     qui gen OLDCL$iter=`CLUSTER'
     qui gen MIN$iter=`MIND'
	local it=1
	while `it'<=$k {
		assert `MIND'<=DIST`it'
	local it=`it'+1
	}
*/


/* CONVERGENCE ACHIEVED (NO CHANGE)? */
     tempvar sumdist
     qui gen `sumdist'=sum(`MIND')
     local sumd=round(`sumdist'[_N],.001)
     cap assert `OLDCL'==`CLUSTER'
     if (_rc==0)|("`stop'"=="$iter") {
           mac def conv=1
           qui tab `CLUSTER' if ((`OLDCL'-`CLUSTER')~=0)
           mac def changes=_result(1)
           noi di "Converged at iteration: $iter" _col(28) /*
           */ "Number of cases changing cluster: $changes"
           noi di _col(28) "Final Sum of Distances: `sumd'"  
           /* qui cap drop `MIND' `OLDCL' */
           qui format `CLUSTER' %2.0f
           qui format DIST* %5.2f

           if "`keep'"~="" {
                qui save `user',replace
           }

           if "`std'"~="" {
               qui cap drop `varlist'
               local varlist "`usrlst'"
           }
           qui sort `CLUSTER'
           noi di in bl "CLUSTER SUMMARY MEASURES:"
           noi by `CLUSTER': summ `varlist'
           qui cap drop DIST*

           if "`keep'"=="" {
               qui save `user',replace
           }
     }
     else if ($iter>=`iter') {
           noi di in red "`iter' iterations exceeded without convergence."
           noi di in red "Please restart or try different start values."
           error 430
      }
     else {
           qui tab `CLUSTER' if ((`OLDCL'-`CLUSTER')~=0)
           mac def changes=_result(1)
           qui cap drop `OLDCL' DIST* `MIND'

     noi di "Iteration number: $iter" _col(23) /*
          */ "Number of cases changing cluster: $changes"
     noi di _col(23) "Sum of distances: `sumd'"   
     }
     }
}
local rc=_rc
qui use `user', clear
mac def S_FN "`sfn'"
erase `user'
error `rc'
end