*! percentile as estimation procedure * v.1.2 by Stas Kolenikov, skolenik at gmail dot com program define epctile, eclass sortpreserve properties(svyb svyj) * survey properties of the program? version 11 if !replay() { syntax varname [if] [in] [pw fw iw/], Percentiles(numlist) /// [ svy over(varlist) Level(cilevel) subpop(string) VALuemask( string ) SPEClabel noSE ] * the main study variable local y `varlist' * sample conditions marksample touse sort `touse' `over' `y' * parse the subpop option if "`subpop'"!="" { ParseSubPop `subpop', touse(`touse') local subpop subpop( `subpop' ) } * parse the weights tempvar wgt if "`weight'" == "" { * no weight specified qui gen byte `wgt' = `touse' } else { * something was put down as weights qui gen double `wgt' = `exp' * `touse' local wexp [pw=`wgt'] } * parse the survey option if "`svy'"~="" { * are there survey settings to fall back on? qui svyset if "`r(wvar)'" != "" { * if there are any weights found in svy settings, replace the weight var qui replace `wgt' = `r(wvar)' } if "`r(settings)'" != ", clear" { * if there are any svy settings at all local svy svy , `subpop' : } else { * svy option was specified, but no svy settings found di as err "Warning: no svy settings found" local wexp [pw=`wgt'] } if "`if'`in'"!="" { * if/in or subpop? qui count if `touse' if r(N) != _N di as err "Warning: if and in conditions should be implemented via svy, subpop option" } } qui sum `wgt' if `touse' local sumw = r(sum) * check if percentiles are within bounds tokenize `percentiles' local np : word count `percentiles' forvalues j=1/`np' { local p`j' : word `j' of `percentiles' if `p`j'' < 10 local p`j' 0`p`j'' if `p`j'' <= 0 & `p`j''>=100 { di as err "percentiles must be between 0 and 100" exit 198 } } qui count if `touse' local nobs = r(N) tempname b bb V * 1. get percentiles via _pctile if "`over'" == "" { _pctile `y' [pw=`wgt'] if `touse', percentiles( `percentiles' ) mat `b' = J(1,`np',.) forvalues j=1/`np' { mat `b'[1,`j'] = r(r`j') local cnames `cnames' `=subinstr("p`p`j''",".","d",.)' } } else { * handle the OVER option here! unab overlist : `over' * number of variables local novervar : word count `overlist' FormXLevels `overlist', valuemask( "`valuemask'" ) mat `b' = J(1,`np'*`s(ncombos)', .) * the parameter estimates should run by percentile, then by `over' variables * to match the estimation output of -mean- forvalues k=1/`s(ncombos)' { _pctile `y' [pw=`wgt'] if `touse' `s(if`k')', percentiles( `percentiles' ) forvalues j=1/`np' { mat `b'[1,(`j'-1)*`s(ncombos)' + `k'] = r(r`j') } } forvalues j=1/`np' { forvalues k=1/`s(ncombos)' { if "`speclabel'" == "" local cnames `cnames' `=subinstr("p`p`j''",".","d",.)':`s(vmlab`k')' else local cnames `cnames' `=subinstr("p`p`j''",".","d",.)'_`s(vmlab`k')' } } } if "`se'" != "nose" { * 2. compute the linearization pieces if "`over'" == "" { forvalues k=1/`np' { tempvar d`k' qui gen `d`k'' = ( (`y'<`b'[1,`k']) - 0.01*`p`k'' ) local dlist `dlist' `d`k'' } } else { qui forvalues k=1/`np' { tempvar d`k' gen `d`k'' = . forvalues j=1/`s(ncombos)' { replace `d`k'' = ( (`y'<`b'[1,(`k'-1)*`s(ncombos)' + `j']) - 0.01*`p`k'' ) if `touse' `s(if`j')' } local dlist `dlist' `d`k'' } } * 3. compute the covariance with -svy: mean- if "`over'" != "" local overopt over(`over', nolabel) qui `svy' mean `dlist' if `touse' `wexp', `overopt' local vcetype `e(vcetype)' local vce `e(vce)' mat `V' = e(V) * sreturn values are lost by now if "`over'"!= "" FormXLevels `overlist', valuemask( "`valuemask'" ) * 4. compute the Woodruff estimates of variance tempname scale mat `scale' = J(1,`np'*`e(N_over)',.) forvalues j=1/`np' { if "`over'" != "" { forvalues k=1/`s(ncombos)' { local pplus = `p`j'' + 200*_se[ `d`j'':`s(lab`k')' ] if `pplus' >= 100 local pplus `p`j'' * have to use one-sided derivative local pminus = `p`j'' - 200*_se[ `d`j'':`s(lab`k')' ] if `pminus' <= 0 local pminus `p`j'' * have to use one-sided derivative _pctile `y' [pw=`wgt'] if `touse' `s(if`k')', percentiles( `pminus' `pplus' ) mat `scale'[1,(`j'-1)*`s(ncombos)'+`k'] = 100*( r(r2) - r(r1) )/( `pplus'-`pminus' ) /* mat `scale'[1,(`j'-1)*`s(ncombos)'+`k'] = ( r(r2) - r(r1) )/( 4 * _se[ `d`j'':`s(lab`k')' ] ) if `pplus' == `p`j'' | `pminus' == `p`j'' { mat `scale'[1,(`j'-1)*`s(ncombos)'+`k'] = 2 * `scale'[1,(`j'-1)*`s(ncombos)'+`k'] } */ } } else { local pplus = `p`j'' + 200*_se[ `d`j'' ] if `pplus' >= 100 local pplus `p`j'' * have to use one-sided derivative local pminus = `p`j'' - 200*_se[ `d`j'' ] if `pminus' <= 0 local pminus `p`j'' * have to use one-sided derivative _pctile `y' [pw=`wgt'] if `touse', percentiles( `pminus' `pplus' ) mat `scale'[1,`j'] = 100*( r(r2) - r(r1) )/( `pplus'-`pminus' ) /* mat `scale'[1,`j'] = ( r(r2) - r(r1) )/( 4 * _se[ `d`j'' ] ) if `pplus' == `p`j'' | `pminus' == `p`j'' mat `scale'[1,`j'] = 2 * `scale'[1,`j'] */ } } * 5. combine into the covariance matrix of the percentiles mat `V' = diag( `scale' ) * `V' * diag( `scale' ) } else { * no standard errors mat `V' = J( colsof(`b'), colsof(`b'), 0 ) } * grand finale: nice labels mat rownames `V' = `cnames' mat colnames `V' = `cnames' mat rownames `b' = `y' mat colnames `b' = `cnames' mat colnames `scale' = `cnames' mat rownames `scale' = `y' ereturn post `b' `V', obs(`nobs') depname(`y') esample(`touse') ereturn matrix scale = `scale' ereturn scalar N_over = `np' ereturn local over `over' ereturn local depvar `y' ereturn local vce `vce' ereturn local vcetype `vcetype' ereturn local cmd epctile * set e(cmd) last } else { // replay if "`e(cmd)'"!="epctile" error 301 syntax [, Level(cilevel)] } * output any header above the coefficient table di _n as text "Percentile estimation" ereturn display, level(`level') di _n end program define ParseSubPop * Jeff Pitblado's suggestion: undocumented _svy_subpop marking tool syntax [varlist(max=1 default=none)] [if/] , touse(varname) * subpop is [varname] [if] * target: zero out observations outside of the subpopulation * the input variable is typically `touse' confirm variable `touse' if "`varlist'`if'" == "" { di as err "undefined subpopulation" exit 198 } qui { if "`varlist'" == "" local varlist 1 if "`if'" == "" local if 1 replace `touse' = `touse' & (`varlist') & (`if') } end exit History: v. 1.0 -- March 05, 2010. Basic features: percentiles for i.i.d. data and -svy- via the option v. 1.1 -- March 06, 2010. Options: over, valuemask, speclabel, subpop, v. 1.2 -- May 25, 2010. Fixed an issue with standard errors and -over- option Simulations: * i.i.d. data set seed 10101 pro def epctsim drop _all set obs 1000 gen y = uniform() epctile y , p(25 50 75) end simulate _b _se , reps(1000) saving( epctile_iid, replace) : epctsim sum * stratified single stage clear all set seed 10101 set obs 500000 gen long id = _n gen byte h = (_n-1)/10000+1 bysort h (id) : gen m = 2+uniform() if _n == 1 bysort h (id) : gen s = 0.5+0.1*uniform() if _n == 1 bysort h (id) : replace m = m[1] bysort h (id) : replace s = s[1] gen y = exp( m + rnormal()*s ) sum y, d scalar pop05 = r(p5) foreach k in 10 25 50 75 90 95 { scalar pop`k' = r(p`k') } sort s id egen strata = group( h ) save strpop, replace pro def epctsvysim use strpop, clear gen r1 = uniform() gen r2 = uniform() sort strata r1 r2 bysort strata (r1 r2) : keep if _n < 20+strata gen samplw = 10000/(20+strata) svyset [pw=samplw], strata(strata) epctile y, p(5 10 25 50 75 90 95) svy end simulate _b _se , reps(1000) saving( epctile_svy, replace) : epctsvysim foreach k in 05 10 25 50 75 90 95 { gen byte reject_l`k' = ( p`k'_b_cons + invnorm( 0.05 )*p`k'_se_cons > scalar( pop`k') ) gen byte reject_u`k' = ( p`k'_b_cons + invnorm( 0.95 )*p`k'_se_cons < scalar( pop`k') ) } sum foreach x of varlist reject* { bitest `x' == 0.05 } * stratified two-stage clear all set seed 10201 set obs 50 gen double m = 2+uniform() gen double s = 0.5+0.2*uniform() sort s gen byte strata = _n gen int Nh = 100 + rnbinomial( 10, 0.2 ) gen byte nh = 2 + (uniform()<0.4) + (uniform()<0.1) tab nh expand Nh gen double y0 = exp( m + s*rnormal() ) gen int PSU = _n gen int Mhi = 20 + rnbinomial( 10, 0.1 ) sum Mhi expand Mhi gen double y = y0 - ( ln(uniform()) + ln(uniform()) + ln(uniform()) + 3 ) bysort strata PSU : gen byte first = (_n==1) sum y, d scalar pop05 = r(p5) foreach k in 10 25 50 75 90 95 { scalar pop`k' = r(p`k') } compress save strpop2, replace pro def epctsvy2sim use strpop2, clear gen float r1 = uniform() if first gen float r2 = uniform() if first bysort strata PSU (first) : replace r1 = r1[_N] bysort strata PSU (first) : replace r2 = r2[_N] bysort strata (r1 r2 first) : gen iPSU = sum( first ) bysort strata r1 r2 (first) : replace iPSU = iPSU[_N] * sample of PSUs keep if iPSU <= nh * sample of observations gen float r3 = uniform() gen float r4 = uniform() bysort strata iPSU (r3 r4) : keep if _n <= 10 gen samplw = Mhi/10 * Nh/nh svyset iPSU [pw=samplw], strata(strata) epctile y, p(5 10 25 50 75 90 95) svy end clear set mem 80m simulate _b _se , reps(1000) saving( epctile_svy2, replace) : epctsvy2sim foreach k in 05 10 25 50 75 90 95 { gen byte reject_l`k' = ( p`k'_b_cons + invnorm( 0.05 )*p`k'_se_cons > scalar( pop`k') ) gen byte reject_u`k' = ( p`k'_b_cons + invnorm( 0.95 )*p`k'_se_cons < scalar( pop`k') ) } sca li sum foreach x of varlist reject* { bitest `x' == 0.05 }