*! version 7.02 24nov2013 Michael Stepner, stepner@mit.edu /* CC0 license information: To the extent possible under law, the author has dedicated all copyright and related and neighboring rights to this software to the public domain worldwide. This software is distributed without any warranty. This code is licensed under the CC0 1.0 Universal license. The full legal text as well as a human-readable summary can be accessed at http://creativecommons.org/publicdomain/zero/1.0/ */ * Why did I include a formal license? Jeff Atwood gives good reasons: http://www.codinghorror.com/blog/2007/04/pick-a-license-any-license.html program define binscatter, eclass sortpreserve version 12.1 syntax varlist(min=2 numeric) [if] [in] [aweight fweight], [by(varname) /// Nquantiles(integer 20) GENxq(name) discrete xq(varname numeric) MEDians /// CONTROLs(varlist numeric ts fv) absorb(varname) noAddmean /// LINEtype(string) rd(numlist ascending) reportreg /// COLors(string) MColors(string) LColors(string) Msymbols(string) /// savegraph(string) savedata(string) replace /// nofastxtile randvar(varname numeric) randcut(real 1) randn(integer -1) /// /* LEGACY OPTIONS */ nbins(integer 20) create_xq x_q(varname numeric) symbols(string) method(string) unique(string) /// *] set more off * Create convenient weight local if ("`weight'"!="") local wt [`weight'`exp'] ***** Begin legacy option compatibility code if (`nbins'!=20) { if (`nquantiles'!=20) { di as error "Cannot specify both nquantiles() and nbins(): both are the same option, nbins is supported only for backward compatibility." exit } di as text "NOTE: legacy option nbins() has been renamed nquantiles(), and is supported only for backward compatibility." local nquantiles=`nbins' } if ("`create_xq'"!="") { if ("`genxq'"!="") { di as error "Cannot specify both genxq() and create_xq: both are the same option, create_xq is supported only for backward compatibility." exit } di as text "NOTE: legacy option create_xq has been renamed genxq(), and is supported only for backward compatibility." local genxq="q_"+word("`varlist'",-1) } if ("`x_q'"!="") { if ("`xq'"!="") { di as error "Cannot specify both xq() and x_q(): both are the same option, x_q() is supported only for backward compatibility." exit } di as text "NOTE: legacy option x_q() has been renamed xq(), and is supported only for backward compatibility." local xq `x_q' } if ("`symbols'"!="") { if ("`msymbols'"!="") { di as error "Cannot specify both msymbols() and symbols(): both are the same option, symbols() is supported only for backward compatibility." exit } di as text "NOTE: legacy option symbols() has been renamed msymbols(), and is supported only for backward compatibility." local msymbols `symbols' } if ("`linetype'"=="noline") { di as text "NOTE: legacy line type 'noline' has been renamed 'none', and is supported only for backward compatibility." local linetype none } if ("`method'"!="") { di as text "NOTE: method() is no longer a recognized option, and will be ignored. binscatter now always uses the fastest method without a need for two instances" } if ("`unique'"!="") { di as text "NOTE: unique() is no longer a recognized option, and will be ignored. binscatter now considers the x-variable discrete if it has fewer unique values than nquantiles()" } ***** End legacy option capatibility code *** Perform checks * Set default linetype and check valid if ("`linetype'"=="") local linetype lfit else if !inlist("`linetype'","connect","lfit","qfit","none") { di as error "linetype() must either be connect, lfit, qfit, or none" exit } * Check that nofastxtile isn't combined with fastxtile-only options if "`fastxtile'"=="nofastxtile" & ("`randvar'"!="" | `randcut'!=1 | `randn'!=-1) { di as error "Cannot combine randvar, randcut or randn with nofastxtile" exit } * Misc checks if ("`genxq'"!="" & ("`xq'"!="" | "`discrete'"!="")) | ("`xq'"!="" & "`discrete'"!="") { di as error "Cannot specify more than one of genxq(), xq(), and discrete simultaneously." exit } if ("`genxq'"!="") confirm new variable `genxq' if ("`xq'"!="") { capture assert `xq'==int(`xq') & `xq'>0 if _rc!=0 { di as error "xq() must contain only positive integers." exit } if ("`controls'`absorb'"!="") di as text "warning: xq() is specified in combination with controls() or absorb(). note that binning takes places after residualization, so the xq variable should contain bins of the residuals." } if `nquantiles'!=20 & ("`xq'"!="" | "`discrete'"!="") { di as error "Cannot specify nquantiles in combination with discrete or an xq variable." exit } if "`reportreg'"!="" & !inlist("`linetype'","lfit","qfit") { di as error "Cannot specify 'reportreg' when no fit line is being created." exit } if "`replace'"=="" { if `"`savegraph'"'!="" { if regexm(`"`savegraph'"',"\.[a-zA-Z0-9]+$") confirm new file `"`savegraph'"' else confirm new file `"`savegraph'.gph"' } if `"`savedata'"'!="" { confirm new file `"`savedata'.csv"' confirm new file `"`savedata'.do"' } } * Mark sample (reflects the if/in conditions, and includes only nonmissing observations) marksample touse markout `touse' `by' `xq' `controls' `absorb', strok qui count if `touse' local samplesize=r(N) local touse_first=_N-`samplesize'+1 local touse_last=_N * Parse varlist into y-vars and x-var local x_var=word("`varlist'",-1) local y_vars=regexr("`varlist'"," `x_var'$","") local ynum=wordcount("`y_vars'") * Check number of unique byvals & create local storing byvals if "`by'"!="" { local byvarname `by' capture confirm numeric variable `by' if _rc { * by-variable is string => generate a numeric version tempvar by tempname bylabel egen `by'=group(`byvarname'), lname(`bylabel') } local bylabel `:value label `by'' /*catch value labels for numeric by-vars too*/ tempname byvalmatrix qui tab `by' if `touse', nofreq matrow(`byvalmatrix') local bynum=r(r) forvalues i=1/`bynum' { local byvals `byvals' `=`byvalmatrix'[`i',1]' } } else local bynum=1 ****** Create residuals ****** if (`"`controls'`absorb'"'!="") quietly { * Parse absorb to define the type of regression to be used if `"`absorb'"'!="" { local regtype "areg" local absorb "absorb(`absorb')" } else { local regtype "reg" } * Generate residuals local firstloop=1 foreach var of varlist `x_var' `y_vars' { tempvar residvar `regtype' `var' `controls' `wt' if `touse', `absorb' predict `residvar' if e(sample), residuals if ("`addmean'"!="noaddmean") { summarize `var' `wt' if `touse', meanonly replace `residvar'=`residvar'+r(mean) } label variable `residvar' "`var'" if `firstloop'==1 { local x_r `residvar' local firstloop=0 } else local y_vars_r `y_vars_r' `residvar' } } else { /*absorb and controls both empty, no need for regression*/ local x_r `x_var' local y_vars_r `y_vars' } ****** Regressions for fit lines ****** if ("`reportreg'"=="") local reg_verbosity "quietly" if inlist("`linetype'","lfit","qfit") `reg_verbosity' { * If doing a quadratic fit, generate a quadratic term in x if "`linetype'"=="qfit" { tempvar x_r2 gen `x_r2'=`x_r'^2 } * Create matrices to hold regression results tempname e_b_temp forvalues i=1/`ynum' { tempname y`i'_coefs } * LOOP over by-vars local counter_by=1 if ("`by'"=="") local noby="noby" foreach byval in `byvals' `noby' { * LOOP over rd intervals tokenize "`rd'" local counter_rd=1 while ("`1'"!="" | `counter_rd'==1) { * display text headers if "`reportreg'"!="" { di "{txt}{hline}" if ("`by'"!="") { if ("`bylabel'"=="") di "-> `byvarname' = `byval'" else { di "-> `byvarname' = `: label `bylabel' `byval''" } } if ("`rd'"!="") { if (`counter_rd'==1) di "RD: `x_var'<=`1'" else if ("`2'"!="") di "RD: `x_var'>`1' & `x_var'<=`2'" else di "RD: `x_var'>`1'" } } * set conditions on reg local conds `touse' if ("`by'"!="" ) local conds `conds' & `by'==`byval' if ("`rd'"!="") { if (`counter_rd'==1) local conds `conds' & `x_r'<=`1' else if ("`2'"!="") local conds `conds' & `x_r'>`1' & `x_r'<=`2' else local conds `conds' & `x_r'>`1' } * LOOP over y-vars local counter_depvar=1 foreach depvar of varlist `y_vars_r' { * display text headers if (`ynum'>1) { if ("`controls'`absorb'"!="") local depvar_name : var label `depvar' else local depvar_name `depvar' di as text "{bf:y_var = `depvar_name'}" } * perform regression if ("`reg_verbosity'"=="quietly") capture reg `depvar' `x_r2' `x_r' `wt' if `conds' else capture noisily reg `depvar' `x_r2' `x_r' `wt' if `conds' * store results if (_rc==0) matrix e_b_temp=e(b) else if (_rc==2000) { if ("`reg_verbosity'"=="quietly") di as error "no observations for one of the fit lines. add 'reportreg' for more info." if ("`linetype'"=="lfit") matrix e_b_temp=.,. else /*("`linetype'"=="qfit")*/ matrix e_b_temp=.,.,. } else { error _rc exit _rc } * relabel matrix row if ("`by'"!="") matrix roweq e_b_temp = "by`counter_by'" if ("`rd'"!="") matrix rownames e_b_temp = "rd`counter_rd'" else matrix rownames e_b_temp = "=" * save to y_var matrix if (`counter_by'==1 & `counter_rd'==1) matrix `y`counter_depvar'_coefs'=e_b_temp else matrix `y`counter_depvar'_coefs'=`y`counter_depvar'_coefs' \ e_b_temp * increment depvar counter local ++counter_depvar } * increment rd counter if (`counter_rd'!=1) mac shift local ++counter_rd } * increment by counter local ++counter_by } * relabel matrix column names forvalues i=1/`ynum' { if ("`linetype'"=="lfit") matrix colnames `y`i'_coefs' = "`x_var'" "_cons" else if ("`linetype'"=="qfit") matrix colnames `y`i'_coefs' = "`x_var'^2" "`x_var'" "_cons" } } ******* Define the bins ******* * Specify and/or create the xq var, as necessary if "`xq'"=="" { if !(`touse_first'==1 & word("`:sortedby'",1)=="`x_r'") sort `touse' `x_r' if "`discrete'"=="" { /* xq() and discrete are not specified */ * Check whether the number of unique values > nquantiles, or <= nquantiles capture mata: characterize_unique_vals_sorted("`x_r'",`touse_first',`touse_last',`nquantiles') if (_rc==0) { /* number of unique values <= nquantiles, set to discrete */ local discrete discrete if ("`genxq'"!="") di as text `"note: the x-variable has fewer unique values than the number of bins specified (`nquantiles'). It will therefore be treated as discrete, and genxq() will be ignored"' local xq `x_r' local nquantiles=r(r) if ("`by'"=="") { tempname xq_boundaries xq_values matrix `xq_boundaries'=r(boundaries) matrix `xq_values'=r(values) } } else if (_rc==134) { /* number of unique values > nquantiles, perform binning */ if ("`genxq'"!="") local xq `genxq' else tempvar xq if ("`fastxtile'"!="nofastxtile") fastxtile `xq' = `x_r' `wt' in `touse_first'/`touse_last', nq(`nquantiles') randvar(`randvar') randcut(`randcut') randn(`randn') else xtile `xq' = `x_r' `wt' in `touse_first'/`touse_last', nq(`nquantiles') if ("`by'"=="") { mata: characterize_unique_vals_sorted("`xq'",`touse_first',`touse_last',`nquantiles') if (r(r)!=`nquantiles') { di as text "warning: nquantiles(`nquantiles') was specified, but only `r(r)' were generated. see help file under nquantiles() for explanation." local nquantiles=r(r) } tempname xq_boundaries xq_values matrix `xq_boundaries'=r(boundaries) matrix `xq_values'=r(values) } } else { error _rc } } else { /* discrete is specified, xq() & genxq() are not */ if ("`controls'`absorb'"!="") di as text "warning: discrete is specified in combination with controls() or absorb(). note that binning takes places after residualization, so the residualized x-variable may contain many more unique values." capture mata: characterize_unique_vals_sorted("`x_r'",`touse_first',`touse_last',`=`samplesize'/2') if (_rc==0) { local xq `x_r' local nquantiles=r(r) if ("`by'"=="") { tempname xq_boundaries xq_values matrix `xq_boundaries'=r(boundaries) matrix `xq_values'=r(values) } } else if (_rc==134) { di as error "discrete specified, but number of unique values is > (sample size/2)" exit 134 } else { error _rc } } } else { if !(`touse_first'==1 & word("`:sortedby'",1)=="`xq'") sort `touse' `xq' * set nquantiles & boundaries mata: characterize_unique_vals_sorted("`xq'",`touse_first',`touse_last',`=`samplesize'/2') if (_rc==0) { local nquantiles=r(r) if ("`by'"=="") { tempname xq_boundaries xq_values matrix `xq_boundaries'=r(boundaries) matrix `xq_values'=r(values) } } else if (_rc==134) { di as error "discrete specified, but number of unique values is > (sample size/2)" exit 134 } else { error _rc } } ********** Compute scatter points ********** if ("`by'"!="") { sort `touse' `by' `xq' tempname by_boundaries mata: characterize_unique_vals_sorted("`by'",`touse_first',`touse_last',`bynum') matrix `by_boundaries'=r(boundaries) } forvalues b=1/`bynum' { if ("`by'"!="") { mata: characterize_unique_vals_sorted("`xq'",`=`by_boundaries'[`b',1]',`=`by_boundaries'[`b',2]',`nquantiles') tempname xq_boundaries xq_values matrix `xq_boundaries'=r(boundaries) matrix `xq_values'=r(values) } /* otherwise xq_boundaries and xq_values are defined above in the binning code block */ * Define x-means tempname xbin_means if ("`discrete'"=="discrete") { matrix `xbin_means'=`xq_values' } else { means_in_boundaries `x_r' `wt', bounds(`xq_boundaries') `medians' matrix `xbin_means'=r(means) } * LOOP over y-vars to define y-means local counter_depvar=0 foreach depvar of varlist `y_vars_r' { local ++counter_depvar means_in_boundaries `depvar' `wt', bounds(`xq_boundaries') `medians' * store to matrix if (`b'==1) { tempname y`counter_depvar'_scatterpts matrix `y`counter_depvar'_scatterpts' = `xbin_means',r(means) } else { * make matrices conformable before right appending local rowdiff=rowsof(`y`counter_depvar'_scatterpts')-rowsof(`xbin_means') if (`rowdiff'==0) matrix `y`counter_depvar'_scatterpts' = `y`counter_depvar'_scatterpts',`xbin_means',r(means) else if (`rowdiff'>0) matrix `y`counter_depvar'_scatterpts' = `y`counter_depvar'_scatterpts', ( (`xbin_means',r(means)) \ J(`rowdiff',2,.) ) else /*(`rowdiff'<0)*/ matrix `y`counter_depvar'_scatterpts' = ( `y`counter_depvar'_scatterpts' \ J(-`rowdiff',colsof(`y`counter_depvar'_scatterpts'),.) ) ,`xbin_means',r(means) } } } *********** Perform Graphing *********** * If rd is specified, prepare xline parameters if "`rd'"!="" { foreach xval in "`rd'" { local xlines `xlines' xline(`xval', lpattern(dash) lcolor(gs8)) } } * Fill colors if missing if `"`colors'"'=="" local colors /// navy maroon forest_green dkorange teal cranberry lavender /// khaki sienna emidblue emerald brown erose gold bluishgray /// /* lime magenta cyan pink blue */ if `"`mcolors'"'=="" { if (`ynum'==1 & `bynum'==1 & "`linetype'"!="connect") local mcolors `: word 1 of `colors'' else local mcolors `colors' } if `"`lcolors'"'=="" { if (`ynum'==1 & `bynum'==1 & "`linetype'"!="connect") local lcolors `: word 2 of `colors'' else local lcolors `colors' } local num_mcolor=wordcount(`"`mcolors'"') local num_lcolor=wordcount(`"`lcolors'"') * Prepare connect & msymbol options if ("`linetype'"=="connect") local connect "c(l)" if "`msymbols'"!="" { local symbol_prefix "msymbol(" local symbol_suffix ")" } *** Prepare scatters * c indexes which color is to be used local c=0 local counter_series=0 * LOOP over by-vars local counter_by=0 if ("`by'"=="") local noby="noby" foreach byval in `byvals' `noby' { local ++counter_by local xind=`counter_by'*2-1 local yind=`counter_by'*2 * LOOP over y-vars local counter_depvar=0 foreach depvar of varlist `y_vars' { local ++counter_depvar local ++c * LOOP over rows (each row contains a coordinate pair) local row=1 local xval=`y`counter_depvar'_scatterpts'[`row',`xind'] local yval=`y`counter_depvar'_scatterpts'[`row',`yind'] if !missing(`xval',`yval') { local ++counter_series local scatters `scatters' (scatteri if ("`savedata'"!="") { if ("`by'"=="") local savedata_scatters `savedata_scatters' (scatter `depvar' `x_var' else local savedata_scatters `savedata_scatters' (scatter `depvar'_by`counter_by' `x_var'_by`counter_by' } } else { * skip the rest of this loop iteration continue } while (`xval'!=. & `yval'!=.) { local scatters `scatters' `yval' `xval' local ++row local xval=`y`counter_depvar'_scatterpts'[`row',`xind'] local yval=`y`counter_depvar'_scatterpts'[`row',`yind'] } * Add options local scatter_options `connect' mcolor(`: word `c' of `mcolors'') lcolor(`: word `c' of `lcolors'') `symbol_prefix'`: word `c' of `msymbols''`symbol_suffix' local scatters `scatters', `scatter_options') if ("`savedata'"!="") local savedata_scatters `savedata_scatters', `scatter_options') * Add legend if "`by'"=="" { if (`ynum'==1) local legend_labels off else local legend_labels `legend_labels' lab(`counter_series' `depvar') } else { if ("`bylabel'"=="") local byvalname=`byval' else { local byvalname `: label `bylabel' `byval'' } if (`ynum'==1) local legend_labels `legend_labels' lab(`counter_series' `byvarname'=`byvalname') else local legend_labels `legend_labels' lab(`counter_series' `depvar': `byvarname'=`byvalname') } if ("`by'"!="" | `ynum'>1) local order `order' `counter_series' } } *** Fit lines if inlist(`"`linetype'"',"lfit","qfit") { * c indexes which color is to be used local c=0 local rdnum=wordcount("`rd'")+1 tempname fitline_bounds if ("`rd'"=="") matrix `fitline_bounds'=.,. else matrix `fitline_bounds'=.,`=subinstr("`rd'"," ",",",.)',. * LOOP over by-vars local counter_by=0 if ("`by'"=="") local noby="noby" foreach byval in `byvals' `noby' { local ++counter_by ** Set the column for the x-coords in the scatterpts matrix local xind=`counter_by'*2-1 * Set the row to start seeking from * note: each time we seek a coeff, it should be from row (rd_num)(counter_by-1)+counter_rd local row0=( `rdnum' ) * (`counter_by' - 1) * LOOP over y-vars local counter_depvar=0 foreach depvar of varlist `y_vars_r' { local ++counter_depvar local ++c * Find lower and upper bounds for the fit line matrix `fitline_bounds'[1,1]=`y`counter_depvar'_scatterpts'[1,`xind'] local fitline_ub_rindex=`nquantiles' local fitline_ub=. while `fitline_ub'==. { local fitline_ub=`y`counter_depvar'_scatterpts'[`fitline_ub_rindex',`xind'] local --fitline_ub_rindex } matrix `fitline_bounds'[1,`rdnum'+1]=`fitline_ub' * LOOP over rd intervals forvalues counter_rd=1/`rdnum' { if (`"`linetype'"'=="lfit") { local coef_quad=0 local coef_lin=`y`counter_depvar'_coefs'[`row0'+`counter_rd',1] local coef_cons=`y`counter_depvar'_coefs'[`row0'+`counter_rd',2] } else if (`"`linetype'"'=="qfit") { local coef_quad=`y`counter_depvar'_coefs'[`row0'+`counter_rd',1] local coef_lin=`y`counter_depvar'_coefs'[`row0'+`counter_rd',2] local coef_cons=`y`counter_depvar'_coefs'[`row0'+`counter_rd',3] } if !missing(`coef_quad',`coef_lin',`coef_cons') { local leftbound=`fitline_bounds'[1,`counter_rd'] local rightbound=`fitline_bounds'[1,`counter_rd'+1] local fits `fits' (function `coef_quad'*x^2+`coef_lin'*x+`coef_cons', range(`leftbound' `rightbound') lcolor(`: word `c' of `lcolors'')) } } } } } * Prepare y-axis title if (`ynum'==1) local ytitle `y_vars' else if (`ynum'==2) local ytitle : subinstr local y_vars " " " and " else local ytitle : subinstr local y_vars " " "; ", all * Display graph local graphcmd twoway `scatters' `fits', graphregion(fcolor(white)) `xlines' xtitle(`x_var') ytitle(`ytitle') legend(`legend_labels' order(`order')) `options' if ("`savedata'"!="") local savedata_graphcmd twoway `savedata_scatters' `fits', graphregion(fcolor(white)) `xlines' xtitle(`x_var') ytitle(`ytitle') legend(`legend_labels' order(`order')) `options' `graphcmd' ****** Save results ****** * Save graph if `"`savegraph'"'!="" { * check file extension using a regular expression if regexm(`"`savegraph'"',"\.[a-zA-Z0-9]+$") local graphextension=regexs(0) if inlist(`"`graphextension'"',".gph","") graph save `"`savegraph'"', `replace' else graph export `"`savegraph'"', `replace' } * Save data if ("`savedata'"!="") { *** Save a CSV containing the scatter points tempname savedatafile file open `savedatafile' using `"`savedata'.csv"', write text `replace' * LOOP over rows forvalues row=0/`nquantiles' { *** Put the x-variable at the left * LOOP over by-vals forvalues counter_by=1/`bynum' { if (`row'==0) { /* write variable names */ if "`by'"!="" local bynlabel _by`counter_by' file write `savedatafile' "`x_var'`bynlabel'," } else { /* write data values */ if (`row'<=`=rowsof(`y1_scatterpts')') file write `savedatafile' (`y1_scatterpts'[`row',`counter_by'*2-1]) "," else file write `savedatafile' ".," } } *** Now y-variables at the right * LOOP over y-vars local counter_depvar=0 foreach depvar of varlist `y_vars' { local ++counter_depvar * LOOP over by-vals forvalues counter_by=1/`bynum' { if (`row'==0) { /* write variable names */ if "`by'"!="" local bynlabel _by`counter_by' file write `savedatafile' "`depvar'`bynlabel'" } else { /* write data values */ if (`row'<=`=rowsof(`y`counter_depvar'_scatterpts')') file write `savedatafile' (`y`counter_depvar'_scatterpts'[`row',`counter_by'*2]) else file write `savedatafile' "." } * unless this is the last variable in the dataset, add a comma if !(`counter_depvar'==`ynum' & `counter_by'==`bynum') file write `savedatafile' "," } /* end by-val loop */ } /* end y-var loop */ file write `savedatafile' _n } /* end row loop */ file close `savedatafile' di as text `"(file `savedata'.csv written containing saved data)"' *** Save a do-file with the commands to generate a nicely labeled dataset and re-create the binscatter graph file open `savedatafile' using `"`savedata'.do"', write text `replace' file write `savedatafile' `"insheet using `savedata'.csv"' _n _n if "`by'"!="" { foreach var of varlist `x_var' `y_vars' { local counter_by=0 foreach byval in `byvals' { local ++counter_by if ("`bylabel'"=="") local byvalname=`byval' else { local byvalname `: label `bylabel' `byval'' } file write `savedatafile' `"label variable `var'_by`counter_by' "`var'; `byvarname'==`byvalname'""' _n } } file write `savedatafile' _n } file write `savedatafile' `"`savedata_graphcmd'"' _n file close `savedatafile' di as text `"(file `savedata'.do written containing commands to process saved data)"' } *** Return items ereturn post, esample(`touse') ereturn scalar N = `samplesize' ereturn local graphcmd `"`graphcmd'"' if inlist("`linetype'","lfit","qfit") { forvalues yi=`ynum'(-1)1 { ereturn matrix y`yi'_coefs=`y`yi'_coefs' } } if ("`rd'"!="") { tempname rdintervals matrix `rdintervals' = (. \ `=subinstr("`rd'"," ","\",.)' ) , ( `=subinstr("`rd'"," ","\",.)' \ .) forvalues i=1/`=rowsof(`rdintervals')' { local rdintervals_labels `rdintervals_labels' rd`i' } matrix rownames `rdintervals' = `rdintervals_labels' matrix colnames `rdintervals' = gt lt_eq ereturn matrix rdintervals=`rdintervals' } if ("`by'"!="" & "`by'"=="`byvarname'") { /* if a numeric by-variable was specified */ forvalues i=1/`=rowsof(`byvalmatrix')' { local byvalmatrix_labels `byvalmatrix_labels' by`i' } matrix rownames `byvalmatrix' = `byvalmatrix_labels' matrix colnames `byvalmatrix' = `by' ereturn matrix byvalues=`byvalmatrix' } end ********************************** * Helper programs program define means_in_boundaries, rclass version 12.1 syntax varname(numeric) [aweight fweight], BOUNDsmat(name) [MEDians] * Create convenient weight local if ("`weight'"!="") local wt [`weight'`exp'] local r=rowsof(`boundsmat') matrix means=J(`r',1,.) if ("`medians'"!="medians") { forvalues i=1/`r' { sum `varlist' in `=`boundsmat'[`i',1]'/`=`boundsmat'[`i',2]' `wt', meanonly matrix means[`i',1]=r(mean) } } else { forvalues i=1/`r' { _pctile `varlist' in `=`boundsmat'[`i',1]'/`=`boundsmat'[`i',2]' `wt', percentiles(50) matrix means[`i',1]=r(r1) } } return clear return matrix means=means end *** copy of: version 1.21 8oct2013 Michael Stepner, stepner@mit.edu program define fastxtile, rclass version 11 * Parse weights, if any _parsewt "aweight fweight pweight" `0' local 0 "`s(newcmd)'" /* command minus weight statement */ local wt "`s(weight)'" /* contains [weight=exp] or nothing */ * Extract parameters syntax newvarname=/exp [if] [in] [,Nquantiles(integer 2) Cutpoints(varname numeric) ALTdef /// CUTValues(numlist ascending) randvar(varname numeric) randcut(real 1) randn(integer -1)] * Mark observations which will be placed in quantiles marksample touse, novarlist markout `touse' `exp' qui count if `touse' local popsize=r(N) if "`cutpoints'"=="" & "`cutvalues'"=="" { /***** NQUANTILES *****/ if `"`wt'"'!="" & "`altdef'"!="" { di as error "altdef option cannot be used with weights" exit 198 } if `randn'!=-1 { if `randcut'!=1 { di as error "cannot specify both randcut() and randn()" exit 198 } else if `randn'<1 { di as error "randn() must be a positive integer" exit 198 } else if `randn'>`popsize' { di as text "randn() is larger than the population. using the full population." local randvar="" } else { local randcut=`randn'/`popsize' if "`randvar'"!="" { qui sum `randvar', meanonly if r(min)<0 | r(max)>1 { di as error "with randn(), the randvar specified must be in [0,1] and ought to be uniformly distributed" exit 198 } } } } * Check if need to gen a temporary uniform random var if "`randvar'"=="" { if (`randcut'<1 & `randcut'>0) { tempvar randvar gen `randvar'=runiform() } * randcut sanity check else if `randcut'!=1 { di as error "if randcut() is specified without randvar(), a uniform r.v. will be generated and randcut() must be in (0,1)" exit 198 } } * Mark observations used to calculate quantile boundaries if ("`randvar'"!="") { tempvar randsample mark `randsample' `wt' if `touse' & `randvar'<=`randcut' } else { local randsample `touse' } * Error checks qui count if `randsample' local samplesize=r(N) if (`nquantiles' > r(N) + 1) { if ("`randvar'"=="") di as error "nquantiles() must be less than or equal to the number of observations [`r(N)'] plus one" else di as error "nquantiles() must be less than or equal to the number of sampled observations [`r(N)'] plus one" exit 198 } else if (`nquantiles' < 2) { di as error "nquantiles() must be greater than or equal to 2" exit 198 } * Compute quantile boundaries _pctile `exp' if `randsample' `wt', nq(`nquantiles') `altdef' * Store quantile boundaries in list forvalues i=1/`=`nquantiles'-1' { local cutvallist `cutvallist' r(r`i') } } else if "`cutpoints'"!="" { /***** CUTPOINTS *****/ * Parameter checks if "`cutvalues'"!="" { di as error "cannot specify both cutpoints() and cutvalues()" exit 198 } if "`wt'"!="" | "`randvar'"!="" | "`ALTdef'"!="" | `randcut'!=1 | `nquantiles'!=2 | `randn'!=-1 { di as error "cutpoints() cannot be used with nquantiles(), altdef, randvar(), randcut(), randn() or weights" exit 198 } tempname cutvals qui tab `cutpoints', matrow(`cutvals') if r(r)==0 { di as error "cutpoints() all missing" exit 2000 } else { local nquantiles = r(r) + 1 forvalues i=1/`r(r)' { local cutvallist `cutvallist' `cutvals'[`i',1] } } } else { /***** CUTVALUES *****/ if "`wt'"!="" | "`randvar'"!="" | "`ALTdef'"!="" | `randcut'!=1 | `nquantiles'!=2 | `randn'!=-1 { di as error "cutvalues() cannot be used with nquantiles(), altdef, randvar(), randcut(), randn() or weights" exit 198 } * parse numlist numlist "`cutvalues'" local cutvallist `"`r(numlist)'"' local nquantiles=wordcount(`"`r(numlist)'"')+1 } * Pick data type for quantile variable if (`nquantiles'<=100) local qtype byte else if (`nquantiles'<=32,740) local qtype int else local qtype long * Create quantile variable local cutvalcommalist : subinstr local cutvallist " " ",", all qui gen `qtype' `varlist'=1+irecode(`exp',`cutvalcommalist') if `touse' label var `varlist' "`nquantiles' quantiles of `exp'" * Return values if ("`samplesize'"!="") return scalar n = `samplesize' else return scalar n = . return scalar N = `popsize' tokenize `"`cutvallist'"' forvalues i=`=`nquantiles'-1'(-1)1 { return scalar r`i' = ``i'' } end version 12.1 set matastrict on mata: void characterize_unique_vals_sorted(string scalar var, real scalar first, real scalar last, real scalar maxuq) { // Inputs: a numeric variable, a starting & ending obs #, and a maximum number of unique values // Requires: the data to be sorted on the specified variable within the observation boundaries given // (no check is made that this requirement is satisfied) // Returns: the number of unique values found // the unique values found // the observation boundaries of each unique value in the dataset // initialize returned results real scalar Nunique Nunique=0 real matrix values values=J(maxuq,1,.) real matrix boundaries boundaries=J(maxuq,2,.) // initialize computations real scalar var_index var_index=st_varindex(var) real scalar curvalue real scalar prevvalue // perform computations real scalar obs for (obs=first; obs<=last; obs++) { curvalue=_st_data(obs,var_index) if (curvalue!=prevvalue) { Nunique++ if (Nunique<=maxuq) { prevvalue=curvalue values[Nunique,1]=curvalue boundaries[Nunique,1]=obs if (Nunique>1) boundaries[Nunique-1,2]=obs-1 } else { exit(error(134)) } } } boundaries[Nunique,2]=last // return results stata("return clear") st_numscalar("r(r)",Nunique) st_matrix("r(values)",values[1..Nunique,.]) st_matrix("r(boundaries)",boundaries[1..Nunique,.]) } end