*! $Rev$  $Sep 09 2011$
// gsum calculates the mean, standard deviation, and two versions of a set of quantiles for grouped data.

program gsum, rclass byable(recall)
	version 11.1
	syntax varlist [if] [fweight aweight pweight iweight], [g0(string)] [g1(string)] ///
	[g2(string)] [g3(string)] [g4(string)] [g5(string)] [g6(string)] ///
	[g7(string)] [g8(string)] [g9(string)] [g10(string)] [g11(string)] ///
	[g12(string)] [g13(string)] [g14(string)] [g15(string)] [g16(string)] ///
	[g17(string)] [g18(string)] [g19(string)] [g20(string)] [g21(string)] ///
	[g22(string)] [g23(string)] [g24(string)] [g25(string)] ///
	[Table] [Save(string)] [GENerate(namelist)] [Quantiles(numlist)]
	preserve
	*mark sample
	marksample touse
	*get categorical list, assure they are all integers
	quietly : levelsof `varlist' if `touse', local(gvarlist)
	local gcount = 0
	foreach l in `gvarlist' {
		if "`lowestg'" == "" {
			local lowestg = `l'
		}
		local highestg = `l'
		capture confirm integer number `l'
		if _rc == 7 {
			display as error "non-integer value found in `varlist'"
			exit 7
		}
		local ++gcount
	}
	capture assert `gcount' > 1
	if _rc == 9 {
		display "only 1 category of `varlist'"
		exit 9
	}
	*figure out the quantiles
	capture confirm existence `quantiles'
	if _rc == 6 {
		local qlist "25 50 75"
	}
	else {
		foreach n in `quantiles' {
			capture assert `n' > 0 & `n' < 1
			if _rc == 9 {
				display as error "quantiles need to be between 0 and 1"
				exit 9
			}
			local p = regexr("`n'","\.","")
			foreach k in 1 2 3 4 5 6 7 8 9 {
				if "`p'" == "`k'" {
					local p "`p'0"
				}
			}
			local qlist "`qlist' `p' "
		}
	}
	*specifiy the temporary scalars
	tempname count totw mean var sd mn mx
	foreach q in `qlist' {
		tempname qi`q' qm`q'
	}
	forvalue i = 0/25 {
		tempname lower`i' upper`i' midpt`i' range`i'
	}
	*specify the temporary variables
	tempvar w fw sfw upper lower midpt range w_midpt dev2 quantile
	*assure that there is only one variable to get stats of
	local k = 0
	foreach v in `varlist' {
		local ++k
	}
	if `k' > 1 {
		display as error  ///
		"gsum only calcuates point estimates for a single variable"
		exit 9
	}
	*assure that the categorical variable only has values between 0 and 25
	quietly : sum `gvar' if `touse'
	capture assert r(min) > 0 & r(max) < 26
	if _rc == 9 {
		display as error  ///
		"your group variable needs numeric categories that range from 0-25"
		exit 9
	}
	*capture ranges from value labels if they exist
	forvalues l = 0/25 {
		local checklist "`checklist'`g`l''"
	}
	capture confirm existence `checklist'
	if _rc == 6 {
		local labelist : value label `varlist'
		capture confirm existence `labelist'
		if _rc == 6 {
			display as error "need to specify ranges in either the options or as value labels"
		}
		else {
			foreach l in `gvarlist' {
				local g`l' : label `labelist' `l'
			}
		}
	}
	*create category macros
	foreach l in `gvarlist' {
		tokenize `g`l'', parse(-)
		if "`2'" == "-" {
			*parse ok
		}
		else {
			display as error "please provide a range for group `l'"
			exit 9
		}
		capture confirm number `1'
		if _rc == 7 {
			display as error  ///
			"`1' not a number, please provide numeric quantity for group `l' lower bound"
			exit 9
		}
		capture confirm number `3'
		if _rc == 7 {
			display as error  ///
			"`3' not a number, please provide numeric quantity for group `l' higher bound"
			exit 9
		}
		if `3' > `1' {
			*range ok
		}
		else if `3' < `1' {
			display as error  ///
			"lower bound is higher than upper bound for group `l'"
			exit 9
		}
		else if `3' == `1' {
			display as error ///
			"lower bound is equal to upper bound for group `l'"
			exit 9
		}
		scalar `lower`l'' = `1'
		scalar `upper`l'' = `3'
		scalar `range`l'' = `3' - `1'
		scalar `midpt`l'' = ((`3' - `1')/2) + `1'
		if `l' == `lowestg' {
			scalar `mn' = `1'
		}
		if `l' == `highestg' {
			scalar `mx' = `3'
		}
	}
	*assure that categories don't overlap
	foreach l in `gvarlist' {
		foreach k in `gvarlist' {
			if `l' == `k' {
				*skip
			}
			else if `l' < `k' {
				capture assert `upper`l'' <= `lower`k''
				if _rc == 9 {
					display as error "group definitions `l' and `k' overlap"
					exit 9
				}
			}
			else if `k' < `l' {
				capture assert `upper`k'' <= `lower`l''
				if _rc == 9 {
					display as error "group definitions `k' and `l' overlap"
					exit 9
				}
			}
		}
	}
	*drop missing categories of varlist
	quietly : drop if `varlist' == .
	*create or use the weight variable
	if "`weight'" == "" {
		quietly : gen `w' = 1 if `touse'
	}
	else {
		quietly : gen `w' `exp' if `touse'
	}
	*gen casecount variable
	quietly : gen n = 1 if `touse'
	quietly : sum n
	scalar `count' = r(sum)
	*get the sum of the weights for each value of varlist
	collapse (sum) `w' n if `touse', by(`varlist')
	sort `varlist'
	*calculate CDF
	quietly : sum `w'
	scalar `totw' = r(sum)
	quietly : gen `fw' = `w'/`totw'
	quietly : gen `sfw' = sum(`fw')
	*initialize necessary variables
	quietly : gen `lower' = .
	quietly : gen `upper' = .
	quietly : gen `midpt' = .
	quietly : gen `range' = .
	*fill in values
	foreach l in `gvarlist' {
		foreach stat in lower upper midpt range {
			quietly : replace ``stat'' = ``stat'`l'' if `varlist' == `l'
		}
	}
	*calculation of the mean
	quietly : gen `w_midpt' = `midpt' * `w'
	quietly : sum `w_midpt'
	scalar `mean' = r(sum)/`totw'
	*calculation of variance and standard deviation
	quietly : gen `dev2' = ((`midpt' - `mean')^2)*`w'
	quietly : sum `dev2'
	scalar `var' =  r(sum) / (`totw' - 1)
	scalar `sd' = `var'^.5
	*calculate quantiles
	foreach q in `qlist'  {
		*interpolation
		quietly : gen `quantile' =  ///
		`lower' + ((((.`q') - `sfw'[_n-1])/`fw') * `range')  ///
		if `sfw' >= (.`q') & `sfw'[_n-1] < (.`q')
		quietly : sum `quantile'
		if r(N) == 1 {
			scalar `qi`q'' = r(mean)
		}
		else if r(N) == 0 {
			quietly : replace `quantile' =  ///
			`upper' - ((( `sfw' - (.`q'))/`fw') * `range')  ///
			if (`sfw' >= (.`q') & `sfw'[_n-1] < (.`q')) | _n == 1
			quietly : sum `quantile'
			scalar `qi`q'' = r(mean)
		}
		drop `quantile'
		*midpoint of the category at or above quantile
		quietly : gen `quantile' =  ///
		`midpt' if (`sfw' >= (.`q') & `sfw'[_n-1] < (.`q') )
		quietly : sum `quantile'
		if r(N) == 1 {
			scalar `qm`q'' = r(mean)
		}
		else if r(N) == 0 {
			quietly : replace `quantile' =  ///
			`midpt' if _n == 1
			quietly : sum `quantile'
			scalar `qm`q'' = r(mean)
		}
		drop `quantile'
	}
	*post results to r
	return scalar N = `count'
	else if "`weight'" != "" {
		return scalar sum_W = `totw'
	}
	foreach stat in mean var sd mn mx {
		return scalar `stat' = ``stat''
	}
	foreach q in `qlist' {
		return scalar qm`q' = `qm`q''
		return scalar qi`q' = `qi`q''
	}
	*display results
	display _newline as text "Grouped Data Summary Statistics" _newline
	display as text %12s abbrev("Variable",12) _col(14) "{c |}"  ///
	_col(22) "N" _col(29) "Mean" _col(40) "Std. Dev." _col(55) "Min" _col(67) "Max"
	display as text "{hline 13}" "{c +}" "{hline 60}"
	if "`weight'" == "" {
		display as text as text %12s abbrev("`varlist'",12) _col(14) "{c |}" ///
		_col(15) as result %8.0fc `count'  ///
		_col(22) as result %10.3f `mean' _col(38) as result %10.3f `sd' ///
		_col(45) as result %10.3f `mn' _col(60) as result %10.3f `mx'
	}
	else if "`weight'" != "" {
		display as text as text %12s abbrev("`varlist'",12) _col(14) "{c |}" ///
		_col(15) as result %8.0fc `totw'  ///
		_col(22) as result %10.3f `mean' _col(38) as result %10.3f `sd' ///
		_col(45) as result %10.3f `mn' _col(60) as result %10.3f `mx'
	}
	display _newline as text "Quantiles" _newline
	display as text %12s abbrev("Quantile",12) _col(14) "{c |}"  ///
	_col(17) "Lowest Midpoint at Quantile" _col(50) "Linear Interpolation"
	display as text "{hline 13}" "{c +}" "{hline 60}"
	foreach q in `qlist' {
		if `qm`q'' != . & `qi`q'' != . {
			display as text  %12s abbrev("0.`q'",12) _col(14) "{c |}"  ///
			_col(23) as result %10.3f `qm`q'' _col(53) as result %10.3f `qi`q''
		}
		else {
			display as text  %12s abbrev("0.`q'",12) _col(14) "{c |}"  ///
			_col(23) as result "not attainable" _col(53) as result %10.3f "not attainable"
		}
	}
	display as text "{hline 13}" "{c BT}" "{hline 60}"
	*rename for value table and renaming
	quietly : tostring `lower' `upper', force replace format(%10.0f)
	quietly : gen Range = `lower' + "-" + `upper'
	rename `midpt' Midpoint
	rename `w' Weight
	rename `fw' pWeight
	rename `sfw' CDF
	format Weight pWeight CDF %9.3f
	label val `varlist'
	if "`table'" == "table" {
		display _newline as text "Value Table"
		list `varlist' Range Midpoint n Weight pWeight CDF,   ///
		noobs  sep(35) sum(n Weight pWeight)
	}
	if "`save'" != "" {
		keep `varlist' Range Midpoint n Weight pWeight CDF
		order `varlist' Range Midpoint n Weight pWeight CDF
 		quietly : save "`save'", replace
	}
	*restore data
	restore
	if "`generate'" != "" {
		display _newline "variable `generate' created with value midpoints"
		quietly : gen `generate' = .
		foreach l in `gvarlist' {
			quietly : replace `generate' = `midpt`l'' if `varlist' == `l'
		}
	}
end