*! version 1.1.5 24may2012 by austinnichols@gmail.com and nwinter@virginia.edu *! create violin plots /* changelog 1.1.5 add check for N>0 with [if] [in] etc. 1.1.4 fix double `over' -levels- call, changed default label orientation 1.1.3 support weights 1.1.2 minimized memory footprint when -total- option is specified 1.1.1 truncate() option within over() 1.1.0 multiple y variables added over() suboptions -total-, -missing-, -nolabel- added -noylab- option 1.0.2 disabled accidental use of -by- option added -sortpreserve- */ program vioplot, sortpreserve version 9.2 syntax varlist [if] [in] [aw pw fw], /// [ Over1(string) Over2(string) Over3(string) /// noFill /// HORizontal VERTical /// (vertical) is default Kernel(passthru) BWidth(passthru) /// kdensity options DScale(real 100) BARWidth(real 100) /// rescale the densities DENsity(string) /// density plotting options BAR(string) /// bar plotting options (note barw() is percentage) LINE(string) /// line plotting options MEDian(string) /// median plotting options OBs1 OBsalt(string) OBSOPTs(string) /// obs or obs(alt) to add (n) to plot OGap(real 100) YGap(real 100) noYLabel noLabelrotate /// by(string) /// sic * /// overall graph options ] if `"`exp'"'!="" loc wt `"[aw`exp']"' if "`horizontal'"!="" & "`ylabel'"=="" loc y0 "yla(,angle(0))" if "`horizontal'"=="" & "`xlabel'"=="" loc y0 "xla(,angle(90))" if "`horizontal'"!="" & "`ylabel'"=="" & "`labelrotate'"=="" loc y0 "yla(,angle(90))" if "`horizontal'"=="" & "`xlabel'"=="" & "`labelrotate'"=="" loc y0 "xla(,angle(0))" local gropts `options' local yvars `varlist' if `"`by'"'!="" { di as error "by() not allowed; use over() to specify categories" exit 190 } if `"`over1'"'=="" { // no over() specified local no_over_spec 1 tempvar no_over tempname no_over_lab qui gen byte `no_over'=1 local over1 `no_over' la de `no_over_lab' 1 "`varlist'" la val `no_over' `no_over_lab' local gropts xscale(range(0.5 1.5)) `gropts' } else { local no_over_spec 0 } if `"`over3'"'!="" { di as error "too many over() options specified" exit 198 } if "`obsalt'"!="" { if "`obsalt'"=="alt" local obs obs else { di as error "invalid obs()" exit 198 } } if "`obs1'"=="obs1" local obs obs if `: word count `horizontal' `vertical''>1 { di as error "may not combine options horizontal and vertical" exit 198 } else if "`horizontal'`vertical'"=="" local vertical vertical if "`horizontal'"=="horizontal" { local kdorientation vertical /* needs to be reverse of overall */ local cataxis y /* category labels are ylab */ } else { local kdorientation horizontal local cataxis x } marksample touse, novarlist // based on varlist (i.e., yvar(s)) // additional -markout- below after processing of -over- option if "`fill'"=="nofill" { // need to dump categories that have no observations for *any* y-var tempvar marker qui gen `marker'=. foreach var of local varlist { qui replace `marker'=1 if !mi(`var') // set to one for any obs that is avail for any y-var } markout `touse' `marker' // drop obs that don't exist for any y-var // done with `marker' at this point } //deal with over() options local 0 `over1' syntax [varname (default=none)] , [ Missing1 noLABel1 Total1 TRuncate(passthru) ] if "`varlist'"!="" { local m = cond("`missing1'"=="missing1","missing","") if substr("`: type `varlist''",1,3)!="str" & "`label1'"=="nolabel1" { // string variables need -label- option in egen tempvar tmp // qui gen `tmp' = `varlist' // so that egen, label will grab values r/t original labels if any local varlist `tmp' } tempvar over1 sort `varlist' qui egen `over1'=group(`varlist') if `touse', `m' label `truncate' // so we have a numeric variable, labelled if `"`: label (`over1') 1'"'=="" { label define `over1' 1 ".", modify // egen used varname as labelname } capture drop `tmp' // done with `tmp'; need capture in case we didn't ever use it } local 0 `over2' syntax [varname (default=none)] , [ Missing2 noLABel2 Total2 TRuncate(passthru) ] if "`varlist'"!="" { local m = cond("`missing2'"=="missing2","missing","") if substr("`: type `varlist''",1,3)!="str" & "`label2'"=="nolabel2" { // string variables need -label- option in egen tempvar tmp // qui gen `tmp' = `varlist' local varlist `tmp' } tempvar over2 sort `varlist' qui egen `over2'=group(`varlist') if `touse', `m' label `truncate' // so we have a numeric variable, labelled if `"`: label (`over2') 1'"'=="" { // if string with blanks label define `over2' 1 ".", modify // egen used varname as labelname } capture drop `tmp' // done with `tmp'; need capture in case we didn't ever use it } local over `over1' `over2' local nover : word count `over' if "`total1'"!="" & "`total2'"!="" { di as error "May only specify total option for one over() variable" exit 198 } local preserved 0 if `nover'==2 & "`fill'"!="nofill" { preserve local preserved 1 fillin `over' // create missing values of over combinations qui replace `touse'=1 if _fillin==1 // so that new categories exist in group variable } if "`total1'`total2'"!="" { if !`preserved' preserve // preserve if we didn't already do it above local preserved 1 keep `yvars' `touse' `over1' `over2' // will be appending entire dataset, so drop unnecessary vars } * marksample occurs before -fillin- * then -fillin- * then flag new observations as in-sample (even though they are missing on DV(s) * because we want those categories to exist on the category axis * Then we markout the -over- vars *after* the fillin, in order to markout any * that fillin created with missing on the over variables (unless -missing- option specified) if "`missing1'"!="missing1" markout `touse' `over1' // no strok b/c group variables created by egen above if "`missing2'"!="missing2" markout `touse' `over2' * `touse' is all set at this point local 0 , `bar' // pull off the barwidth argument (note it's now a percentage) syntax , [ BARWidth1(string) * ] local bar `options' if "`barwidth1'"!="" { di as error "must specify barwidth() as main option, not suboption for bar()" exit 198 } local n_y : word count `yvars' local barwidth barwidth(`=0.05*(`barwidth'/100)*(1/`n_y')') local overgap 0.5*(`ogap'/100) qui { tempvar overvar // to hold category values 1 2 3 . . . . tempvar xpos // position on "x"-axis for each category // note: val label for each group is [o2level] [o1level] // GenGroup sorts data: `touse' `over2' `over1' GenGroup "`overvar'" "`touse'" "`over1'" "`over2'" "`label1'" "`label2'" "`total1'" "`total2'" "`preserved'" sum `overvar', meanonly local nlev `r(max)' if "`nlev'"=="" error 2000 // work out xpos and labelling for second -over- variable, if any if `nover'==1 { gen `xpos'=`overvar' } else { gen `xpos'=. local prev_o2 : word 1 of `: label (`overvar') 1' // initialize to the first one local cur_o2_min 1 local curxpos 1 forval i=1/`nlev' { local cur_o2 : word 1 of `: label (`overvar') `i'' if `"`cur_o2'"'!=`"`prev_o2'"' { local cur_o2_range = `curxpos' - `cur_o2_min' - 1 // minus 1 is b/c already incremented for this one local ppos = (`curxpos'-1)-(`cur_o2_range')/2 // midpoint of previous group local curxpos = `curxpos' + `overgap' // jump up by the gap local cur_o2_min `curxpos' * generate axis label part for this group if "`cataxis'"=="x" local catlabs2 `catlabs2' `ppos' `"" " "`prev_o2'""' else local catlabs2 `catlabs2' `ppos' `""`prev_o2'" " ""' local prev_o2 `cur_o2' } replace `xpos' = `curxpos' if `overvar'==`i' // for creation of stuff local curxpos = `curxpos' + 1 // increment for next } * do final group local cur_o2_range = `curxpos' - `cur_o2_min' - 1 // minus 1 is b/c already incremented for this one local ppos = (`curxpos'-1)-(`cur_o2_range')/2 // midpoint of previous group if "`cataxis'"=="x" local catlabs2 `catlabs2' `ppos' `"" " "`cur_o2'""' else local catlabs2 `catlabs2' `ppos' `""`cur_o2'" " ""' } local legend_num 1 local cur_y_n 0 local kdmin . local kdmax . foreach cur_y of varlist `yvars' { tempvar inspike p25 p50 p75 first lav uav min max nobs tempvar my_xpos local cur_y_n=`cur_y_n'+1 local offset = (`cur_y_n' - `n_y'/2 - 0.5)*(1/(`n_y'+1))*(`ygap'/100) // creates spacing of ygap between categories gen `my_xpos' = `xpos'+`offset' // generate statistics for bar and spike (i.e., boxplot stats) bys `touse' `over' (`cur_y'): egen `p25'=count(`cur_y') if `touse' // sic gen `nobs' = "(" + string(`p25',"%9.0g") + ")" drop `p25' by `touse' `over': egen `min'=min(`cur_y') if `touse' by `touse' `over': egen `max'=max(`cur_y') if `touse' g double `p25'=. g double `p50'=. g double `p75'=. levels `overvar' if `touse', loc(overlevs) foreach olev of loc overlevs { _pctile `cur_y' `wt' if `touse' & `overvar'==`olev', nq(4) replace `p50'=r(r2) if `touse' & `overvar'==`olev' replace `p25'=r(r1) if `touse' & `overvar'==`olev' replace `p75'=r(r3) if `touse' & `overvar'==`olev' } // flag observations within the spike range gen `inspike'=inrange(`cur_y',`p25' - 1.5*(`p75'-`p25'),`p75' + 1.5*(`p75'-`p25')) if `touse' // flag first observation within each group--this is the one that is plotted for each bar bysort `touse' `over' `inspike' (`cur_y'): gen `first' = (_n==1) if `inspike'==1 & `touse' // flag lav and uav for plotting the spike by `touse' `over' `inspike': gen `lav' = `cur_y'[1] if `inspike'==1 & `touse' by `touse' `over' `inspike': gen `uav' = `cur_y'[_N] if `inspike'==1 & `touse' forval i=1/`nlev' { tempvar x_`i' d_`i' nd_`i' local curlev `i' // : word `i' of `xlevels' capture kdensity `cur_y' `wt' if `overvar'==`curlev' & `touse', gen(`x_`i'' `d_`i'') nograph `kernel' `bwidth' if _rc==2001 { // only one observation; treat as if none for purposes of density gen `x_`i''=. gen `d_`i''=. } else if _rc { di as error "kdensity produced an error:" exit _rc } sum `d_`i'', meanonly local scalefac = (.3/(r(max)-r(min)))*(1/`n_y')*(`dscale'/100) replace `d_`i'' = `scalefac' * (`d_`i''-r(min)) // scales & forces the mirror images to touch gen `nd_`i'' = -`d_`i'' // create mirror if "`obs'"!="" { sum `x_`i'', meanonly if `r(N)' { local kdmin = min(`kdmin',`r(min)') local kdmax = max(`kdmax',`r(max)') } } sum `my_xpos' if `overvar'==`curlev', meanonly replace `d_`i'' = `d_`i'' + `r(min)' // move over replace `nd_`i'' = `nd_`i'' + `r(min)' if `cur_y_n'==1 { // cat axis label; only needed for first y sum `xpos' if `overvar'==`curlev', meanonly // doesn't include offset! local thelabel : label (`overvar') `curlev' local thelab_1 : word `nover' of `thelabel' // word two if two over vars; word one if one local thelab_2 : word 1 of `thelabel' local catlabs `catlabs' `r(min)' `"`thelab_1'"' // add var1 label } local kdplots `kdplots' /// rarea `nd_`i'' `d_`i'' `x_`i'' , `kdorientation' pstyle(p`cur_y_n') finten(35) `density' sort || if `i'==1 { local cylab : variable label `cur_y' if "`cylab'"=="" | "`ylabel'"=="noylabel" local cylab `cur_y' local legend_text `legend_text' `legend_num' `"`cylab'"' } local ++legend_num } local scatter_vars = cond("`vertical'"=="vertical","`p50' `my_xpos'","`my_xpos' `p50'") if "`obs'"=="obs" { tempvar obsy if "`obsalt'"=="" gen `obsy' = `kdmin' - 0.05*(`kdmax'-`kdmin') else gen `obsy' = `kdmax' + 0.05*(`kdmax'-`kdmin') local obs_vars = cond("`vertical'"=="vertical","`obsy' `my_xpos'","`my_xpos' `obsy'") local obsplot || scatter `obs_vars' , /// msym(none) mlab(`nobs') mlabpos(0) mlabsize(vsmall) mlabcolor("scheme foreground") `obsopts' local ++legend_num } local curgraph `kdplots' /// rbar `p25' `p75' `my_xpos' if `first', /// `horizontal'`vertical' pstyle(p`cur_y_n') `barwidth' `bar' || /// scatter `scatter_vars' if `first', /// pstyle(p`cur_y_n') msym(O) mfcolor(white) `median' || /// rspike `p75' `uav' `my_xpos' if `first', /// `horizontal'`vertical' pstyle(p`cur_y_n') `line' || /// rspike `lav' `p25' `my_xpos' if `first', /// `horizontal'`vertical' pstyle(p`cur_y_n') `line' /// `obsplot' local kk `kk' `kdplots' || local oo `oo' || /// rbar `p25' `p75' `my_xpos' if `first', /// `horizontal'`vertical' pstyle(p`cur_y_n') `barwidth' `bar' || /// scatter `scatter_vars' if `first', /// pstyle(p`cur_y_n') msym(O) mfcolor(white) `median' || /// rspike `p75' `uav' `my_xpos' if `first', /// `horizontal'`vertical' pstyle(p`cur_y_n') `line' || /// rspike `lav' `p25' `my_xpos' if `first', /// `horizontal'`vertical' pstyle(p`cur_y_n') `line' /// `obsplot' local wholegraph `wholegraph' || `curgraph' local kdplots } // foreach cur_y of yvars if `no_over_spec' { // ie, no over specified; need to sort out labelling of x-axis local catlabs forval i=1/`n_y' { local cy : word `i' of `yvars' local cylab : variable label `cy' if "`cylab'"=="" | "`ylabel'"=="noylabel" local cylab `cy' local offset = (`i' - `n_y'/2 - 0.5)*(1/(`n_y'+1))*(`ygap'/100) // creates spacing of ygap between categories local catlabs `catlabs' `=1+`offset'' `"`cylab'"' } } if `n_y'>1 & !`no_over_spec' local legend legend(order(`legend_text')) else local legend legend(off) gr twoway `kk' `oo' /// `cataxis'lab(`catlabs' `catlabs2', notick) /// `legend' `y0' `gropts' } // quietly end /* nwviolin */ program GenGroup args overvar touse over1 over2 label1 label2 total1 total2 preserved local sortorder `over2' `over1' if "`total1'`total2'"!="" { if "`total1'"!="" { local tot_var `over1' local tot_other `over2' local sortorder `over2' `orig' `over1' } else { local tot_var `over2' local tot_other `over1' local sortorder `orig' `over2' `over1' } drop if !`touse' // keep things simple: drop unused observations // this will drop or keep missing values of over_vars as necessary // dataset was already -preserve-d when parsing total options before tempfile tmp save `"`tmp'"' tempvar orig gen `orig'=1 append using `"`tmp'"' replace `orig'=2 if mi(`orig') // put at end sum `tot_var', meanonly // this has been egen'd, so no longer string replace `tot_var' = `r(max)'+1 if `orig'==2 // collapse relevant over variable after all other values local lll : value label `tot_var' if "`lll'"=="" { tempname lll } la de `lll' `=`r(max)'+1' "(total)" , add // label new category as "(Total)" la val `tot_var' `lll' } * code stolen from and modified: _ggroup.ado sort `touse' `sortorder' by `touse' `sortorder' : gen `overvar'=1 if _n==1 & `touse' replace `overvar'=sum(`overvar') replace `overvar'=. if `touse'!=1 local lname `overvar' local dfltfmt : set dp local dfltfmt = cond("`dfltfmt'"=="period","%9.0g","%9,0g") count if !`touse' // touse==0 sorted first local j = 1 + r(N) // first observation in the first group sum `overvar', meanonly local max `r(max)' // max number of categories local i 1 while `i' <= 0`max' { // while we are not through all the categories tokenize `over2' `over1' // local vtmp " " local iteration `: word count `over2' `over1'' local x 1 while "`1'"!="" { local vallab : value label `1' local val = `1'[`j'] if "`vallab'" != "" { // & "`label`iteration''"!="nolabel`iteration'" local vtmp2 : label `vallab' `val' // if label, grab it } else { cap confirm numeric var `1' if _rc==0 { local vtmp2 = string(`1'[`j'],"`dfltfmt'") // if not a label, grab the value itself } else { local vtmp2 = trim(`1'[`j']) // if not a label, grab the string itself (sic; all numeric at this point) } } local x = `x' + length("`vtmp2'") + 1 local vtmp `"`vtmp' "`vtmp2'" "' mac shift local --iteration } local val `"`vtmp'"' label def `lname' `i' `"`val'"', modify count if `overvar' == `i' local j = `j' + r(N) local i = `i' + 1 } label val `overvar' `lname' *END Labelling code from _ggroup.ado end exit /* * To Do * -- relabel option for x-axis labelling -- individual heights for densities; widths for bars -- create a scheme for a nice default coloring? */