*! version 1.51 P.MILLAR 14Apr2011 *! Copyright 2005-2011 Paul Millar *! This software can be used for non-commercial purposes only. *! The copyright is retained by the developer. *! Version 1.1 changed to allow proper handling of weights *! Version 1.2 changed to fix bugs and to add maxcut and cttype options *! Version 1.5 added new algorithm to max cut, added propcut feature 27 Jan 2009 *! Version 1.51 changed to reflect new versions of Stata program define pre, rclass syntax [anything] , [Cutoff(real 0.5) CTtype(string) Maxcut PROPcut ROUNDmaxcut(integer 4) WEIGHTone(real 1.0)] /* The approach of this procedure has three steps */ /* 1. find the errors when guessing the mean, median or mode = E1 */ /* 2. find the errors when guessing what the model predicts = E2 */ /* 3. PRE = (E1-E2)/E1 */ local ver=c(version) if `ver' < 9.0 { local ver="8.0" version 8.0 local outname="outcomes" } else { local outname="out" local vername=round(`ver',1) version `vername' } if `cutoff'<0 | `cutoff'>1 { display as error "*** pre: Value of Cutoff must be between 0 and 1" exit 198 } if "`maxcut'" == "maxcut" & "`propcut'"=="propcut" { di as error "*** pre: maxcut and propcut cannot be specified together" exit 198 } if `roundmaxcut'<1 | `roundmaxcut'>9 { local roundmaxcut=8 } /* initialize */ local type=e(cmd) if substr("`e(cmd)'",1,2) == "xt" { local type=substr("`e(cmd)'",3,8) local xt="xt" } tempvar yp ybar stot sres sreg sstot ssreg ssres ymode ymedian devscore sdevscore res reg yp2 fmode sfmode sortord yy tempvar totcorrect correct cuts yyp2 /* get the dependent variable */ local y=e(depvar) qui gen `yy'=`y' if e(sample) if ("`type'"=="logistic" | "`type'"=="logit" | "`type'"=="probit" | "`type'"=="cloglog") { qui replace `yy'=1 if `yy'~=0 } local wtype="`e(wtype)'" /* since we are not calculating standard deviations, we can use pweight as if aweight */ if "`e(wtype)'" == "pweight" { local wtype="aweight" } /* get the predicted values (except for mlogit, mprobit, ologit and oprobit) */ if "`type'"!="mlogit" & "`type'"!="mprobit" & "`type'"!="ologit" & "`type'"!="oprobit" & "`type'"!="gologit2" { quietly predict `yp' if e(sample) } if "`type'" == "ereg" { qui drop `yp' quietly predict `yp' if e(sample), `cttype' } if "`e(cmd)'" == "xtlogit" { quietly replace `yp'=exp(`yp')/(1+exp(`yp')) if e(sample) } /* get the predicted values (for mlogit, mprobit, ologit and oprobit) */ if "`type'"=="mlogit" | "`type'"=="mprobit" | "`type'"=="ologit" | "`type'"=="oprobit" | "`type'"=="gologit2" { if "`type'"=="mprobit" { local ncats=colsof(e(outcomes)) mat def tempprobit=e(outcomes) mat temp=J(1,`ncats',0) forvalues i = 1/`ncats' { mat temp[`i',1]=tempprobit[1,`i'] } mat drop tempprobit } else if "`type'"=="ologit" | "`type'"=="oprobit" | "`type'"=="gologit2" { local ncats=colsof(e(cat)) mat def temp=e(cat) } else { local ncats=colsof(e(out)) mat def temp=e(out) } /* Get predicted probabilities for each category of DV */ forvalues i = 1/`ncats' { local catval=temp[1,`i'] tempvar prob`catval' quietly predict `prob`catval'' if e(sample), outcome(`catval') } /* Find the maximum probability for each case */ tempvar maxprob quietly gen `maxprob'=0 forvalues i = 1/`ncats' { local catval=temp[1,`i'] quietly replace `maxprob'=`prob`catval'' if `prob`catval'' > `maxprob' } /* Use the maximum probability to get the prediction of the DV for each case */ qui gen `yp2'=. forvalues i = 1/`ncats' { local catval=temp[1,`i'] quietly replace `yp2'=`catval' if `prob`catval'' == `maxprob' & e(sample) } qui gen `yp'=`yp2' } /* ------------------------------------------- */ /* set the type of central tendency to be used */ /* ------------------------------------------- */ if "`cttype'" == "mean" | "`cttype'" == "median" | "`cttype'" == "mode" { // di " " } else if "`type'"=="logistic" | "`type'"=="logit" | "`type'"=="probit" | "`type'"=="mlogit" { local cttype="mode" } else if "`type'"=="ologit" | "`type'"=="oprobit" | "`type'"=="gologit2" { local cttype="median" } else { local cttype="mean" } /* ---------------------------- */ /* find first case in e(sample) */ /* ---------------------------- */ if "`type'" != "factor" { local firstcase=0 local i=0 while `firstcase'==0 { local i=`i'+1 local yvalue=`yy' in `i' if `yvalue' !=. { local firstcase=`i' } } } else { local firstcase=1 } if `firstcase' < 1 | `firstcase' > _N { di as error "*** error in PRE *** could not find first non-missing case" exit 198 } /* ------------------------------------------------------------------------ */ /* find the maximum cutoff point (if specified) for logistic regression pre */ /* ------------------------------------------------------------------------ */ if ("`type'"=="logistic" | "`type'"=="logit" | "`type'"=="probit"| "`type'"=="cloglog") & "`maxcut'" == "maxcut" { local nobs=_N qui gen `correct'=0 qui gen `sortord'=_n qui gen `cuts'=. sort `yp' local nobs=_N local prevval=0 local maxright=0 local savecut=0 local i=`firstcase'-1 local ncuts=0 local cut0=0 local endit=0 /* Make a first pass, recording all unique values of potential cutpoints */ while `endit'==0 { local i=`i'+1 local curval=`yp' in `i' /* check to see if we have reached the end */ if "`curval'" == "." | `i'==`nobs' { local ncuts=`ncuts'+1 local curcut=(1+`prevval')/2 local cut`ncuts'=`curcut' local morenobs=`nobs'+1 qui set obs `morenobs' qui replace `cuts'=1 in `ncuts' local endit=1 } else if "`curval'" != "`prevval'" { local ncuts=`ncuts'+1 local curcut=(`curval'+`prevval')/2 local cut`ncuts'=`curcut' qui replace `cuts'=`curcut' in `ncuts' } local prevval=`curval' } /* now find the most predictive cutpoint by successive approximations */ local prevcut=0 local decimal=1 local lower=0 local upper=1 local start=1 local end=`ncuts' while `start' < `end' { local maxright=0 forvalues i=`start'/`end' { local rawcut=`cuts' in `i' local curcut=round(`rawcut',10^(-`decimal')) if `curcut' >= `lower' & `curcut' < `upper' { if `prevcut' != `curcut' { local saveend=`i' local prevcut=`curcut' qui replace `correct' = abs(1-abs(`y'-int(`yp'+(1-`curcut')))) if e(sample) if `weightone' != 1 { qui replace `correct'=`correct'*`weightone' if e(sample) & `correct'==1 & `y'==1 } qui egen `totcorrect'=sum(`correct') if e(sample) local numright=`totcorrect' in `firstcase' if `numright' > `maxright' { local maxright=`numright' local savecut=`curcut' local saverawcut=`rawcut' local savestart=`i' } qui drop `totcorrect' } } local prevcut=`curcut' } local lower=`savecut'-(.5/10^(`decimal')) local upper=`savecut'+(.5/10^(`decimal')) local start=`savestart' local end=`saveend' local decimal=`decimal'+1 local maxright=0 } local decimal=`decimal'-1 qui drop in `morenobs' di as text _newline "The cutpoint for maximum predictiveness is " as result %8.`decimal'f `saverawcut' local cutoff=`saverawcut' } /* --------- end of maxcut ----------- */ /* ----------------------------------------------------------------------------- */ /* find the proportional cutoff point (if specified) for logistic regression pre */ /* ----------------------------------------------------------------------------- */ if ("`type'"=="logistic" | "`type'"=="logit" | "`type'"=="probit"| "`type'"=="cloglog") & "`propcut'" == "propcut" { local nobs=_N qui gen `sortord'=_n sort `yp' local ncases=0 local endit=0 local nsuccess=0 /* Make a first pass, recording all unique values of potential cutpoints */ while `endit'==0 { local i=`i'+1 local curyp=`yp' in `i' local cury=`y' in `i' /* check to see if we have reached the end */ if `i'==`nobs' { local ncases=`ncases'+1 local endit=1 local nsuccess=`nsuccess'+`cury' } else if "`curyp'" == "." { local endit=1 } else { local ncases=`ncases'+1 local nsuccess=`nsuccess'+`cury' } } local prop=`nsuccess'/`ncases' if `nsuccess'==0 { local cut1=`yp' in `ncases' local cut2=1 } else if `nsuccess'>=`ncases' { local cut1=0 local cut2=`yp' in 1 } else { local locone=`ncases'-`nsuccess'+1 local loctwo=`locone'+1 local cut1=`yp' in `locone' local cut2=`yp' in `loctwo' } local propcut=(`cut1'+`cut2')/2 qui sort `sortord' local cutoff=`propcut' // di "`ncases' cases found, of which `nsuccess' were successes'" // di "locone=`locone'; loctwo=`loctwo'; cut1=`cut1'; cut2=`cut2';" di as text _newline "Proportional cut is " as result %6.4f `propcut' } /* --------- end of propcut ----------- */ /* ------------ */ /* Calculate E1 */ /* ------------ */ if "`cttype'" == "mode" { quietly summ `yy' if e(sample) [`wtype' `e(wexp)'] local nobs=r(N) qui egen `ymode' = mode(`yy') if e(sample),minmode qui summ `yy' if e(sample) & `yy' == `ymode' [`wtype' `e(wexp)'] local modef=r(N) local e1=`nobs'-`modef' } if "`cttype'" == "median" { qui egen `ymedian'=median(`y') if e(sample) local median = `ymedian' in `firstcase' qui gen `devscore' = abs(`y' - `median') if e(sample) qui egen `sdevscore'=sum(`devscore') if e(sample) local e1=`sdevscore' in `firstcase' } if "`cttype'" == "mean" { qui egen `ybar' = mean(`y') if e(sample) qui gen `stot' = (`y' - `ybar')^2 if e(sample) qui egen `sstot' = sum(`stot') if e(sample) local e1=`sstot' in `firstcase' } if "`e(cmd)'" == "factor" { local e1=`e(df_m)'*`e(f)' } /* ------------ */ /* Calculate E2 */ /* ------------ */ if "`cttype'" == "mode" { if "`type'"=="logistic" | "`type'"=="logit" | "`type'"=="probit" | "`type'"=="cloglog" { qui gen `yyp2'=int(`yp'+(1-`cutoff')) if e(sample) quietly summ `y' [`wtype' `e(wexp)'] if `y' != `yyp2' & e(sample) local e2=r(N) } else if "`type'"=="mlogit" | "`type'"=="ologit" | "`type'"=="oprobit" | "`type'"=="gologit2" { quietly summ `y' [`wtype' `e(wexp)'] if (`y'!= `yp2') & e(sample) qui gen `yyp2'=`yp2' local e2=r(N) mat drop temp } else { tempvar wrong qui gen `wrong'=0 qui replace `wrong'=1 if round(`yp') != round(`y') qui egen `sres' = sum(`wrong') if e(sample) local e2=`sres' in `firstcase' } } if "`cttype'" == "median" { qui gen `res'=abs(`y'-`yp') if e(sample) qui egen `sres'=sum(`res') if e(sample) local e2=`sres' in `firstcase' } if "`cttype'" == "mean" { qui gen `yyp2'=`yp' qui gen `sres' = (`y' - `yp' )^2 if e(sample) qui gen `sreg' = `stot' - `sres' if e(sample) qui egen `ssreg' = sum(`sreg') if e(sample) qui egen `ssres' = sum(`sres') if e(sample) local e2=`ssres' in `firstcase' } if "`e(cmd)'" == "factor" { mat fload=e(L) local k=`e(df_m)' local err=0 forvalues i=1/`k' { local h=fload[`i',1] local U=1-(`h'*`h') local err=`err'+`U' } local e2=`err' } /* --------------------------------- */ /* Now we are ready to calc PRE!!! */ /* --------------------------------- */ local pre=(`e1'-`e2')/`e1' /* cox regression does not produce predicted duration values (only hazard ratios) */ if "`type'" == "cox" { local e1="." local e2="." local pre="." } else if "`type'"=="factor" { di as text _newline "Average Variance Explained of the factor is " as result %6.2f `pre'*100 "%" } else { di as text _newline "Model reduces errors in the prediction of `y' by " as result %6.2f `pre'*100 "%" } /* get matrix of predicted versus actual */ if ("`type'"=="logistic" | "`type'"=="logit" | "`type'"=="probit" | "`type'"=="mlogit" | "`type'"=="mprobit") & "`cttype'"=="mode" { label variable `yyp2' "Prediction of `y'" label variable `yy' "`y'" tab `yy' `yyp2' if e(sample), matcell(temp0) matcol(temp1) matrow(temp2) } if "`cttype'"=="mode" { if "`type'"=="logistic" | "`type'"=="logit" | "`type'"=="probit" | "`type'"=="cloglog" { local p0=temp0[1,1]/(temp0[1,1]+temp0[1,2])*100 local p1=temp0[2,2]/(temp0[2,1]+temp0[2,2])*100 if temp0[1,2]==. { local p0=100 } if temp0[1,1]==. { local p0=0 } if temp0[2,2]==. { local p1=0 } if temp0[2,1]==. { local p1=100 } di as text "Model predicts `y'=0 correctly " as result %2.0f `p0' "% " as text "of the time" di as text "Model predicts `y'=1 correctly " as result %2.0f `p1' "% " as text "of the time" } else if "`type'" != "cox" { local good=`nobs'-`e2' local good=`good'/`nobs'*100 di as text "Model predicts `y' correctly " %2.0f `good' "% of the time" } } return local pre=`pre' if ("`type'"=="logistic" | "`type'"=="logit" | "`type'"=="probit") & "`maxcut'" == "maxcut" { return local cutoff=`cutoff' } return local e1=`e1' return local e2=`e2' end