*! VERSION 1.4.1 12/2/92 for STATA version 3.0 *! Slightly cleaned up version 10/8/98. *! WARNING! This program is in testing only, and does not incorporate full *! data protection features /* Written by: D.H. Judson Nevada State Demographer's Office/024 University of Nevada, Reno Reno, NV 89557-0100 (702) 784-6352 (702) 784-6353 Fax email: djudson@unr.edu Program code enhancements and suggestions are welcome at the above address. This program calls ado files: centroid.ado medoid.ado This program is based on Anderberg, Michael (1973). Cluster analysis for applications. NY, NY: Academic press. Chapter 7, Forgy's method, page161. */ capture program drop cluster prog def cluster version 3.0 local options "Iter(string) Groups(string) Dist(string) Gen(string) Stop(string) MEDoid KEEP STD Random First *" local varlist "req ex" local in "opt" local if "opt" local weight "opt fweight aweight noprefix" parse "`*'" parse "`varlist'", parse(" ") if "`gen'"=="" { di in bl "You must specify a variable name in gen(variable)" di in bl "to contain your clustering variable." error 9 } if "`groups'"=="" { di in ye "NOTE: Assuming only 2 groups in the data." } if "`iter'"=="" { local iter=20 } if "`dist'"=="" { local dist=2 } if ("`groups'"=="")|("`groups'"=="0") { local groups=2 } local sfn "$S_FN" tempfile user qui save `user',replace /* GENERATE ORIGINAL CLUSTERS (RANDOMLY OR SYSTEMATICALLY) */ local CLUSTER `gen' /* PLACE OBJECTS RANDOMLY INTO CLUSTERS */ if "`random'"~="" { gen byte `CLUSTER'=int(uniform()*`groups')+1 /* CHECK TO MAKE SURE ENOUGH CLUSTERS GENERATED */ qui tab `CLUSTER' mac def k=_result(2) while $k<`groups' { replace `CLUSTER'=int(uniform()*`groups')+1 qui tab `CLUSTER' mac def k=_result(2) } } /* FIRST K OBJECTS DEFINE INITIAL CLUSTERS */ else if "`first'"~="" { qui gen byte `CLUSTER'=1 while "`1'"~="" { qui replace `CLUSTER'=. if `1'==. macro shift } qui replace `CLUSTER'=sum(`CLUSTER') qui replace `CLUSTER'=. if `CLUSTER'>`groups' parse "`varlist'", parse(" ") } /* (APPROXIMATELY) EQUAL NUMBERS IN EACH CLUSTER, TAKEN SYSTEMATICALLY */ else { gen byte `CLUSTER'=int(((_n-1)/_N)*`groups')+1 } /* LISTWISE DELETION OF CASES WITH 1 OR MORE MISSING VALUES */ while "`1'"~="" { qui replace `CLUSTER'=. if `1'==. macro shift } /* CHECK IF STOP=0 */ if "`stop'"=="0" { stop } parse "`varlist'", parse(" ") /* SEE STATA RELEASE 3.0 REFERENCE MANUAL, [4] PROGRAM_FRAGMENTS, P.305 THIS SECTION PROTECTS DATA SET FROM -BREAK-, ERRORS, ETC. */ if _N==0 { error 2000 } capture { /* STANDARDIZE ALL VARIABLES */ if "`std'"~="" { local usrlst "`varlist'" local varlist " " while "`1'"~="" { local 1=substr("`1'",1,length("`1'")-(length("`1'")==8)) qui egen S`1'=std(`1') local varlist "`varlist' S`1'" macro shift } parse "`varlist'", parse(" ") } mac def iter=0 mac def conv=0 while ($iter<$_iter)*($conv~=1) { mac def iter=$iter+1 /* DEFINE DISTANCE VARIABLE STRING */ qui tab `CLUSTER' mac def k=_result(2) /* REPLACE WITH `GROUPS' */ local it=2 mac def dvars "DIST1" while `it'<=$k { mac def dvars "$dvars ,DIST`it'" local it=`it'+1 } /* CALCULATE CENTROIDS AND DISTANCES */ tempvar MIND qui gen `MIND'=99999 if "`medoid'"=="" { qui centroid `varlist' `if' `in', group(`CLUSTER') dist(`dist') } else { qui medoid `varlist' `if' `in', group(`CLUSTER') dist(`dist') } qui replace `MIND'=min($dvars) /* CLASSIFY ACCORDING TO DISTANCES */ tempvar OLDCL qui gen `OLDCL'=`CLUSTER' local it=1 while `it'<=$k { qui replace `CLUSTER'=`it' if (DIST`it'==`MIND')*(DIST`it'~=.) local it=`it'+1 } /* TEST SECTION--COMMENTED OUT SO IT WON'T RUN qui gen OLDCL$iter=`CLUSTER' qui gen MIN$iter=`MIND' local it=1 while `it'<=$k { assert `MIND'<=DIST`it' local it=`it'+1 } */ /* CONVERGENCE ACHIEVED (NO CHANGE)? */ tempvar sumdist qui gen `sumdist'=sum(`MIND') local sumd=round(`sumdist'[_N],.001) cap assert `OLDCL'==`CLUSTER' if (_rc==0)|("`stop'"=="$iter") { mac def conv=1 qui tab `CLUSTER' if ((`OLDCL'-`CLUSTER')~=0) mac def changes=_result(1) noi di "Converged at iteration: $iter" _col(28) /* */ "Number of cases changing cluster: $changes" noi di _col(28) "Final Sum of Distances: `sumd'" /* qui cap drop `MIND' `OLDCL' */ qui format `CLUSTER' %2.0f qui format DIST* %5.2f if "`keep'"~="" { qui save `user',replace } if "`std'"~="" { qui cap drop `varlist' local varlist "`usrlst'" } qui sort `CLUSTER' noi di in bl "CLUSTER SUMMARY MEASURES:" noi by `CLUSTER': summ `varlist' qui cap drop DIST* if "`keep'"=="" { qui save `user',replace } } else if ($iter>=`iter') { noi di in red "`iter' iterations exceeded without convergence." noi di in red "Please restart or try different start values." error 430 } else { qui tab `CLUSTER' if ((`OLDCL'-`CLUSTER')~=0) mac def changes=_result(1) qui cap drop `OLDCL' DIST* `MIND' noi di "Iteration number: $iter" _col(23) /* */ "Number of cases changing cluster: $changes" noi di _col(23) "Sum of distances: `sumd'" } } } local rc=_rc qui use `user', clear mac def S_FN "`sfn'" erase `user' error `rc' end