******************************************************************
*												 				 *
*  PROGRAM TO PRODUCE FLEXIBLE CALIBRATION PLOTS 				 *
*  24/05/17 									 				 *
*  Updated: 24/05/2017											 *
*  	- added cutpoints option					 				 *
*  Updated: 21/10/2018 											 *
*	- added survival model observed data		 				 *
*  Updated: 27/10/2018 											 *
*	- added linear model capabilities			 				 *
*  Updated: 2/4/2019 											 *
*	- removed width default in histograms for cont outcomes		 *
*  Updated: 23/12/2019											 *
*	- updated range() option to ensure plotting of smoother 	 *
*		& spike plot inside range only							 *
*	- updated range() option to ensure correct scaling of spike  *
*		plot at any range										 *
*	- updated graph axes defaults when using range()			 *
*	- added 'zoom' option, which automatically scales the plot 	 *
*		to fit all groupings & CI's (smoother & spikes plotted	 *
*		within this zoomed range only)							 *
*	- updated survival plot: displays groupings at 1 if all 	 *
*		patients had an event before the time point of interest  *
*	- updated survival plot to allow spike plot					 *
*	Updated: 17/12/2021											 *
*	- fixing bug in the spike plots								 *
*	Updated: 13/01/2022											 *
*	- CI for smoother for binary outcomes						 *
*   Updated: 04/05/2022											 *
*	- uses pmcstat to calc c-stat now for binary outcomes 		 *
*	 giving dramatic speed increase with large datasets 		 *
*  Updated: 12/06/2023											 *
*	ADDED:													  	 *
*	- fix to R2 calculation for cont. outcomes using corr 		 *
*	- switch smoother to running, allowing CIs (only bin.)		 *
*	- addplot option (only added to cont. for now)				 *
*	- nosmootherci option (to turn off the CI for the smoother)	 *
*	- smootherciopts option 									 *
*   Updated: 14/06/2023											 *
*	- added smoother & CI for survival outcomes 				 *
*	- added option to alter rendition of survival smoother 		 *
*	- added ttesmoothergroups() to define bins for smoother      *
*	- updated "lowess" language to "smoother" in general		 *	
*	Updated: 06/03/2024											 *
*	- bug fix for smoother package dependency					 *
*																 *
*  2.2.2 J. Ensor								 				 *
******************************************************************

*! 2.2.2 J.Ensor 06Mar2024

program define pmcalplot, rclass

version 12.1						

/* Syntax
	VARLIST = A list of two variables, the predicted probabilities from the model,
				and the event indicator (observed outcome) for binary 
				[not input for survival outcomes]
	Bin = Number of bins used to group patients average observed
					& expected probabilities
	CUTpoints = A numeric list of cutpoints based on risk to be used instead
		of equally sized bins. Real numbers in the interval [0,1].
	NOSMoother = Remove smoother line
	NOSPike = Remove spike plot
	CI = add 95% CI's for the groups (CI's for proportions)
	SCatteropts = twoway options to affect rendition of the groups
	Range = a range for the plot describing the square size of the plot
	SMootheropts = twoway options to affect rendition of the smoother line
	SPikeopts = twoway options to affect rendition of the spikes
	CIopts = twoway options to affect rendition of the confidence intervals
	* = all other twoway options e.g. titles, legends (defaults apply otherwise)
	SURVival = calibration plot for survival models (uses KM estimates 
					for observed data)
	Timepoint = must be provided if survival option is on. A single time point 
					at which calibration is plotted. Units of time are defined by 
					the stset command prior to using pmcalplot. 
					Default = 1 unit of time.
	NOHist = Remove histograms from axes for continuous outcome models
	HIstopts = twoway options to affect rendition of the histograms
	KEEP = return variables for the expected, observed, & risk groups
	ZOOM = produces a plot displaying all data but in a narrower range than [0 1]

*/
	syntax varlist(min=1 max=2 numeric) [if] [in], [Bin(int 10) ///
						CUTpoints(numlist >=0 <=1 sort) noSMoother noSPike ///
						CI noSMOOTHERCI SCatteropts(string asis) ///
						Range(numlist min=2 max=2 sort) ///
						SMootheropts(string asis) SMOOTHERCIopts(string asis) ///
						SPikeopts(string asis) CIopts(string asis) noStatistics SURVival ///
						Timepoint(int 1) KEEP CONTinuous noHist ADDPLOT(string asis) ///
						OBSHIstopts(string asis) EXPHIstopts(string asis) ///
						P(int 0) LP(varname numeric) TTEsmoothergroups(int 5) ZOOM *] 

//SET UP TEMPVARS
tempvar binvar obs obsn exp binvar2 exp2 events nonevents outcomp ///
			lci uci obsse lowvar spikejitter binary_lp thresh lowvar_se ///
			lowvar_lci lowvar_uci histbinvar histeventn histnoneventn histexp ///
			droplinebase pseudo groups

			
// check for packages 
	local package running
	foreach pack of local package {
		capture which `pack'
		if _rc==111 {	
			di as txt "Package 'running' is required for this version on pmcalplot" _n "Installation will begin now ..."
			ssc install `pack', replace
			}
		}
		
// check on the if/in statement
marksample touse
qui count if `touse'
local n = `r(N)'
if `r(N)'==0 { 
	di as err "if statement identifies subgroup with no data?"
	error 2000
	}
	
// check only one of the surv or cont options is specified
if "`continuous'"=="continuous" & "`survival'"=="survival" {
	di as err "Only one outcome type can be selected (default=binary, surv=survival, cont=continuous)."
	error 198
	}
	else if "`survival'"=="survival" {
		di as inp "Survival option selected: Calibration plot for survival prediction model displaying..."
		}
		else if "`continuous'"=="continuous" {
			di as inp "Continuous option selected: Calibration plot for linear prediction model displaying..."
			}
			else {
				di as inp "Binary option selected: Calibration plot for logistic prediction model displaying..."
				}
			
// parse varlist
tokenize `varlist' , parse(" ", ",")

// run checks on user input variables in varlist
// check if user has input both exp and obs (for binary/survival outcomes)
local varcountcheck: word count `varlist'
if "`survival'"=="survival" {
	if `varcountcheck'!=1 {
		di as err "Varlist contains too many variables. For survival outcomes only predicted probabilities (expected values) are required"
		error 103
		}
	}
	else if "`continuous'"=="continuous" {
			if `varcountcheck'!=2 {
					di as err "Varlist must contain two variables. Predicted values (expected values), followed by observed values are required"
					error 102
					}
			}
			else {
				if `varcountcheck'!=2 {
					di as err "Varlist must contain two variables. Predicted probabilities (expected values), followed by observed outcomes (binary variable) are required"
					error 102
					}
					
				// check first var in varlist is a binary outcome (0 1) var
				qui levelsof `2', l(distinct)
				local sum = 0
				local prod = 1
				foreach i in `distinct' {
					local count = `count'+1
					local sum = `sum'+`i'
					local prod = `prod'*`i'
					}
					
				if `count'!=2 {	
						di as err "Event indicator not binary? Check which type of outcome you're using."
						error 450 
						}
						else if (`sum'!=1 & `prod'!=0) {
							di as err "Event indicator must be coded 0 or 1"
							error 450 
							}			
				}
			
// check the first var in varlist is probabilities lying btwn 0-1 (binary/survival)
if "`continuous'"!="continuous" {
	qui su `1'
	if `r(min)'>=0 & `r(max)'<=1 { 
		}
		else {
			di as err "1st element of varlist must be probabilities in the interval [0,1]"
			error 459
			}
		}
		
******************** BINARY OUTCOMES
if "`survival'"!="survival" & "`continuous'"!="continuous" {
	// check if user specified cutpoints
	if "`cutpoints'"!="" {
		qui gen `thresh' = .
		local q = 1
		foreach i in `cutpoints' {
			qui replace `thresh' = `i' in `q'
			local ++q
			}	
		
		* warning message for transparent reporting 
		di as err _n "WARNING: Do not use the cut-point option unless the model has prespecified clinically relevant cut-points."
		
		* create risk groups based on cutpoints
		xtile `binvar' = `1' if `touse', cutpoints(`thresh')
		}
		else {
			// create equally sized risk groups by number of bins
			xtile `binvar' = `1' if `touse', n(`bin')
			}
			
	// average the observed & expected over the bins 
	qui egen `obs' = mean(`2') if `touse', by(`binvar')
	qui egen `obsn' = count(`2') if `touse', by(`binvar')
	qui egen `exp' = mean(`1') if `touse', by(`binvar')

	// CIs for scatter points
	if "`ci'"=="ci" {
		qui gen `obsse' = ((`obs'*(1-`obs'))/`obsn')^.5
		qui gen `lci' = max(0,(`obs' - (1.96*`obsse')))
		qui gen `uci' = min(1,(`obs' + (1.96*`obsse')))
		}
		
	// create spike plot
	qui gen byte `events' = 0
	qui replace `events' = 1 if `2'==1 
	qui replace `events'=. if `1'==.
	qui gen `outcomp'= 1-`2'
	qui gen byte `nonevents' = 0
	qui replace `nonevents' = -1 if `outcomp'==1 
	qui replace `nonevents'=. if `1'==.
	}

******************** SURVIVAL OUTCOMES
if "`survival'"=="survival" & "`continuous'"!="continuous" {
	
	********************
	// check if user specified cutpoints
	if "`cutpoints'"!="" {
		qui gen `thresh' = .
		local q = 1
		foreach i in `cutpoints' {
			qui replace `thresh' = `i' in `q'
			local ++q
			}	
	
		* warning message for transparent reporting 
		di as err _n "WARNING: Do not use the cut-point option unless the model has prespecified clinically relevant cut-points."
		
		* create risk groups based on cutpoints
		xtile `binvar' = `1' if `touse', cutpoints(`thresh')
		}
		else {
			// create equally sized risk groups by number of bins
			xtile `binvar' = `1' if `touse', n(`bin')
			}
			
	// identify the bin numbers in binvar (particularly when using cutpoints)
	qui levelsof `binvar', l(newbins)
		
	// average the observed over the bins - survival outcomes
	*Slightly more complicated for observed
	qui gen `obs'=.
	qui gen `lci'=.
	qui gen `uci'=.
	tempfile temp
		foreach i in `newbins' {
			qui sts list if `binvar'==`i' , at(0 `timepoint') saving(`temp', replace)
			preserve
				qui use `temp', clear
				qui drop if time==0
				local cal_obs=1-survivor
				local cal_obs_lb=1-lb
				local cal_obs_ub=1-ub

			restore
			if `cal_obs'==. {
				qui replace `obs'=1 if `binvar'==`i'
				qui replace `lci'=. if `binvar'==`i'
				qui replace `uci'=. if `binvar'==`i'
				}
				else {
					qui replace `obs'=`cal_obs' if `binvar'==`i'
					qui replace `lci'=`cal_obs_lb' if `binvar'==`i'
					qui replace `uci'=`cal_obs_ub' if `binvar'==`i'
					}
		}

	// average the expected over the bins - survival outcomes
	qui egen `exp' = mean(`1') if `touse', by(`binvar')
		
	
	// create spike plot
	qui gen byte `events' = 0
	qui replace `events' = 1 if _d==1 & _t<=`timepoint'
	qui replace `events'=. if `1'==. 
	//qui gen `outcomp'= 1-_d
	qui gen byte `nonevents' = 0
	qui replace `nonevents' = -1 if `events'!=1 
	qui replace `nonevents'=. if `1'==. 
	}

******************** CONTINUOUS OUTCOMES
if "`survival'"!="survival" & "`continuous'"=="continuous" {
	// average the observed & expected over the bins 
	qui gen `obs' = `2' if `touse'
	qui gen `exp' = `1' if `touse'
	
	// turn spike plot off as continuous outcomes uses histograms
	local spike = "nospike"
	
	// turn off ci for continuous outcomes
	local ci = ""
	}

********************
************************************
// calculate range locals from user input (or default 0 1)
if "`range'"!="" {
		gettoken first second : range
		local minr = `first'
		local maxr = `second'
		di as err _n "WARNING: Plot range has been manually restricted. Be aware that information may lie outside of this range." _n "Groupings & CI's will not be displayed if they lie outside of the specified range" _n "Further, smoother values outside of the specified range will not be plotted"
		
		local range_diff = abs(`maxr'-`minr')
		local adj = `minr' -(.05*`range_diff')
		local adjdown = `minr' -(.04*`range_diff')
		local adjup = `minr' -(.06*`range_diff')
		
		// scale the spike plot to fit along the bottom of the cal plot
		if "`continuous'"!="continuous" {
			qui replace `events' = (`events'/30)*(`range_diff') + `adj'
			qui replace `nonevents' = (`nonevents'/30)*(`range_diff') + `adj' 
			qui gen `spikejitter' = `1'+(runiform()*0.00001)
			}
		
		local sp1 = cond("`spike'"=="nospike","",`"|| rspike `events' `nonevents' `spikejitter' if (`spikejitter'<`maxr') & (`spikejitter'>`minr') & (`touse'), yline(`adj') text(`adjdown' `maxr' "1", place(n)) text(`adjup' `maxr' "0", place(s)) lw(thin) lcol(maroon) `spikeopts'"')
		local ci1 = cond("`ci'"=="ci","|| rspike `uci' `lci' `exp' if (`exp'<=`maxr') & (`exp'>=`minr') & (`uci'<=`maxr') & (`uci'>=`minr') & (`lci'<=`maxr') & (`lci'>=`minr'), lcol(forest_green) `ciopts'","")
		}
		else if "`range'"=="" & "`continuous'"=="continuous" {
			qui su `2'
			
			local minr = r(min)
			local maxr = r(max)
			}
			else if "`zoom'"=="zoom" {
				if "`survival'"=="survival" {
					qui su `uci'
					local ci_minr = r(min)
					qui su `lci'
					local ci_maxr = r(max) 
					qui su `obs'
					local obs_minr = r(min)
					local obs_maxr = r(max)
					qui su `exp'
					local exp_minr = r(min)
					local exp_maxr = r(max)
					local minr = min(`exp_minr', `obs_minr', `ci_minr')
					local maxr = max(`exp_maxr', `obs_maxr', `ci_maxr')
					}
					else {
						qui su `uci'
						local ci_maxr = r(max)
						qui su `lci'
						local ci_minr = r(min)
						qui su `obs'
						local obs_minr = r(min)
						local obs_maxr = r(max)
						qui su `exp'
						local exp_minr = r(min)
						local exp_maxr = r(max)
						local minr = min(`exp_minr', `obs_minr', `ci_minr')
						local maxr = max(`exp_maxr', `obs_maxr', `ci_maxr')
						}
				
				local ci1 = cond("`ci'"=="ci","|| rspike `uci' `lci' `exp' if (`exp'<=`maxr') & (`exp'>=`minr'), lcol(forest_green) `ciopts'","") 
				di as err _n "WARNING: Plot range has been manually restricted. Be aware that information may lie outside of this range." _n "Smoother & Spike plot values may lie outside of the plot range when using zoom option"
		
				local range_diff = abs(`maxr'-`minr')
				local adj = `minr' -(.05*`range_diff')
				local adjdown = `minr' -(.04*`range_diff')
				local adjup = `minr' -(.06*`range_diff')
				
				// scale the spike plot to fit along the bottom of the cal plot
				if "`continuous'"!="continuous" {
					qui replace `events' = (`events'/30)*(`range_diff') + `adj'
					qui replace `nonevents' = (`nonevents'/30)*(`range_diff') + `adj' 
					qui gen `spikejitter' = `1'+(runiform()*0.00001)
					}
		
				local sp1 = cond("`spike'"=="nospike","",`"|| rspike `events' `nonevents' `spikejitter' if (`spikejitter'<`maxr') & (`spikejitter'>`minr') & (`touse'), yline(`adj') text(`adjdown' `maxr' "1", place(n)) text(`adjup' `maxr' "0", place(s)) lw(thin) lcol(maroon) `spikeopts'"')
				}
				else {
					local minr = 0
					local maxr = 1
					
					// scale the spike plot to fit along the bottom of the cal plot
					if "`continuous'"!="continuous" {
						qui replace `events' = (`events'/30)-.05
						qui replace `nonevents' = (`nonevents'/30)-.05 
						qui gen `spikejitter' = `1'+(runiform()*0.00001)
						}
					
					local sp1 = cond("`spike'"=="nospike","",`"|| rspike `events' `nonevents' `spikejitter' if (`spikejitter'<`maxr') & (`spikejitter'>`minr') & (`touse'), yline(-.05) text(-.04 `maxr' "1", place(n)) text(-.06 `maxr' "0", place(s)) lw(thin) lcol(maroon) `spikeopts'"')
					local ci1 = cond("`ci'"=="ci","|| rspike `uci' `lci' `exp' if (`exp'<=`maxr') & (`exp'>=`minr') & (`uci'<=`maxr') & (`uci'>=`minr') & (`lci'<=`maxr') & (`lci'>=`minr'), lcol(forest_green) `ciopts'","")
					}

					
// derive and save smoother variable
if "`smoother'"!="nosmoother" {
	if "`survival'"!="survival" & "`continuous'"!="continuous" {
		qui running `2' `1' if `touse', span(1) ci generate(`lowvar') gense(`lowvar_se') replace nog
		qui gen `lowvar_lci' = `lowvar' - (1.96*`lowvar_se')
		qui replace `lowvar_lci'=0 if `lowvar_lci'<0
		qui gen `lowvar_uci' = `lowvar' + (1.96*`lowvar_se')
		qui replace `lowvar_uci'=1 if `lowvar_uci'>1
	}
	if "`survival'"=="survival" {
		if "`lp'"!="" {
			qui egen `groups' = cut(`lp') if `touse', group(`ttesmoothergroups')
			qui replace `groups' = `groups' + 1
			
			forvalues i = 1/`ttesmoothergroups' {
				* generate pseudo values at t years
				tempvar pseudo`i'
				qui capture stpci if `groups' == `i', at(`timepoint') gen(`pseudo`i'') 
				if _rc {
					di as err "Sample size is too small to allow estimation of the smoother using `ttesmoothergroups' groups and may lead to inappropriate smoothers/CIs. Try reducing the number of groups using ttesmoothergroups()"
					}
			}

			* merge pseudo observations into a single variable
			qui gen `pseudo' = .
			forvalues i = 1/`ttesmoothergroups' {
				qui replace `pseudo' = `pseudo`i'' if `groups' == `i'
			}

			qui running `pseudo' `1' if `touse', span(1) ci generate(`lowvar') gense(`lowvar_se') replace nog
			qui gen `lowvar_lci' = `lowvar' - (1.96*`lowvar_se')
			qui replace `lowvar_lci'=0 if `lowvar_lci'<0
			qui gen `lowvar_uci' = `lowvar' + (1.96*`lowvar_se')
			qui replace `lowvar_uci'=1 if `lowvar_uci'>1
		}
		else {
			local smoother = "nosmoother"
			di as inp "NB: LP variable not provided therefore smoother cannot be calculated/displayed"
		}
	}

	if "`continuous'"=="continuous" {
		qui running `2' `1' if `touse', span(1) ci generate(`lowvar') gense(`lowvar_se') replace nog
		qui gen `lowvar_lci' = `lowvar' - (1.96*`lowvar_se')
		*qui replace `lowvar_lci'=0 if `lowvar_lci'<0
		qui gen `lowvar_uci' = `lowvar' + (1.96*`lowvar_se')
		*qui replace `lowvar_uci'=1 if `lowvar_uci'>1
	}
	
	}

// graph command locals (all combinations of hist/smoother or not)
if "`smootherci'"!="nosmootherci" {
local lo1 = cond("`smoother'"=="nosmoother","","|| line `lowvar' `1' if (`lowvar'<=`maxr') & (`lowvar'>=`minr') & (`1'<=`maxr') & (`1'>=`minr') & (`touse'), sort lcol(midblue) `smootheropts' || rarea `lowvar_uci' `lowvar_lci' `1' if `touse', sort fc(midblue%20) lc(midblue%20) `smootherciopts'")
}
else {
local lo1 = cond("`smoother'"=="nosmoother","","|| line `lowvar' `1' if (`lowvar'<=`maxr') & (`lowvar'>=`minr') & (`1'<=`maxr') & (`1'>=`minr') & (`touse'), sort lcol(midblue) `smootheropts'")
}

// create local to manage the legend ordering
if ("`smoother'"=="nosmoother") & ("`ci'"=="ci") {
	local leglaborder "1 2 3"
	local labs "lab(1 Reference) lab(2 Groups) lab(3 95% CIs) "
	}
	else if ("`smoother'"=="") & ("`ci'"=="") {
		local leglaborder "1 2 3"
		local labs "lab(1 Reference) lab(2 Groups) lab(3 Smoother)"
		}
		else if ("`smoother'"=="nosmoother") & ("`ci'"=="")  {
			local leglaborder "1 2"
			local labs "lab(1 Reference) lab(2 Groups)"
			}
			else {
				local leglaborder "1 2 3 4"
				local labs "lab(1 Reference) lab(2 Groups) lab(3 95% CIs) lab(4 Smoother)"
				} 

				 
// Calibration performance statistics
if "`statistics'"!="nostatistics"  {
	// stats for binary outcomes
	if "`survival'"!="survival" & "`continuous'"!="continuous" {
		* calculating linear predictor
		qui gen `binary_lp' = ln(`1'/(1-`1')) if `touse'

		* calculating c-slope
		qui logistic `2' `binary_lp' if `touse', coef 
		local cslope = _b[`binary_lp']
		local cslope : di %4.3f `cslope'
		return scalar cslope = `cslope'

		* calculating calibration-in-the-large (CITL)
		qui logistic `2' if `touse', offset(`binary_lp') coef
		local citl = _b[_cons]
		local citl : di %4.3f `citl'
		return scalar citl = `citl'

		* exp/obs ratio
		qui su `2' if `touse'
		local o = r(mean)
		qui su `1' if `touse'
		local e = r(mean)
		local oe = `o'/`e'
		local oe : di %4.3f `oe'
		return scalar oe_ratio = `oe'

		* c-index
		qui pmcstat `binary_lp' `2' if `touse'
		local cstat = r(cstat) 
		local cstat : di %4.3f `cstat'
		return scalar cstat = `cstat'
		
		local st1 = `" text(`maxr' `minr' "O:E = `oe'" "CITL = `citl'" "Slope = `cslope'" "AUC = `cstat'", size(small) place(se) just(left))"'
		}
		
		// stats for survival outcomes
	if "`survival'"=="survival" {
		* calculating linear predictor
		if "`lp'"!="" {
			* calculating c-slope
			qui stcox `lp' if `touse', nohr
			local cslope = _b[`lp']
			local cslope : di %4.3f `cslope'
			return scalar cslope = `cslope'
			
			* c-index
			qui estat concordance
			local cstat = r(C) 
			local cstat : di %4.3f `cstat'
			return scalar cstat = `cstat'
			
			
			local st1 = `" text(`maxr' `minr' "Slope = `cslope'" "C-statistic = `cstat'", size(small) place(se) just(left))"'
			}
		}
		
		// stats for continuous outcomes
	if "`continuous'"=="continuous" {
		* calculating c-slope
		qui reg `2' `1' if `touse' 
		local cslope = _b[`1']
		local cslope : di %4.3f `cslope'
		return scalar cslope = `cslope'
		
		* r-sq
		qui corr `2' `1' if `touse' 
		local r2 = r(rho)^2
		if `p'!=0 {
			* r-sq adjusted
			local r2a = (((`n'-1)*`r2')-`p')/(`n'-`p'-1)
			local r2a : di %4.3f `r2a'
			return scalar r2a = `r2a'
			}
			
		local r2 : di %4.3f `r2'
		return scalar r2 = `r2'
		
		
		* calculating calibration-in-the-large (CITL)
		qui constraint 1 `1'=1
		qui cnsreg `2' `1' if `touse', constraint(1)
		local citl = _b[_cons]
		local citl : di %4.3f `citl'
		return scalar citl = `citl'
		
		if `n'!=0 & `p'!=0 {
			local st1 = `" text(`maxr' `minr' "R-squared = `r2'" "Adj R-squared = `r2a'" "CITL = `citl'" "Slope = `cslope'", size(small) place(se) just(left))"'
			}
			else {
				local st1 = `" text(`maxr' `minr' "R-squared = `r2'" "CITL = `citl'" "Slope = `cslope'", size(small) place(se) just(left))"'
				}
		
		}
	}


// Graph command combining user selected features (binary & survival)
if "`continuous'"!="continuous" {
	graph twoway function y=x, range(`minr' `maxr') lp(-) || ///
		scatter `obs' `exp' if (`exp'<=`maxr') & (`exp'>=`minr') & (`obs'<=`maxr') & (`obs'>=`minr'), mcol(dkgreen) ///
		msym(Oh)  graphr(col(white)) ///
		xlab(#5, angle(h) format(%3.1f)) ylab(#5, angle(h) format(%3.1f)) ///
		ytitle("Observed") xtitle("Expected") aspect(1) ///
				legend(pos(3) order(`leglaborder') ///
						`labs' col(1) size(small)) ///
						`scatteropts' `options' `ci1' `lo1' `sp1' `st1' 
	}

******************* 	

// Graph commands for continuous outcomes
if "`continuous'"=="continuous" {
// if histogram is on then produce and combine graphs
	if "`hist'"!="nohist" {
		* set up tempmnames for graphs
		tempname obs_graph sc_graph exp_graph
		
		* graphs
		qui graph twoway function y=x, range(`minr' `maxr') lp(-) || ///
			scatter `obs' `exp' if (`1'<`maxr') & (`1'>`minr'), mcol(dkgreen) ///
			msym(Oh) graphr(col(white)) ///
			xlab(#5, angle(h)) ylab(#5, angle(h)) nodraw ///
			ytitle("") xtitle("") aspect(1)	legend(off) saving(`sc_graph', replace) ///
							`scatteropts' `options' `ci1' `st1' `addplot'  `lo1'
		
		qui twoway histogram `obs', graphr(col(white))  xlab(minmax, angle(v) ///
			format(%3.2f)) xsca(reverse) ylab(#5) ysca() ytitle("Observed") xtitle("") ///
			horiz fxsize(15) saving(`obs_graph', replace) col(maroon) nodraw `obshistopts'

		qui twoway histogram `exp', graphr(col(white))  ylab(minmax, angle(h) ///
			format(%3.2f)) xlab(#5) ytitle("") xtitle("Expected") ///
			fysize(15) saving(`exp_graph', replace) col(maroon) nodraw `exphistopts'

		* Combine the above plots in one plot. 
		graph combine `obs_graph'.gph `sc_graph'.gph `exp_graph'.gph, hole(3) imargin(1 0 1 0) ///
		graphregion(margin(l=1 r=3)) xsize(4) ysize(4) graphr(col(white)) `options' 
		}
		else {
			graph twoway function y=x, range(`minr' `maxr') lp(-) || ///
			scatter `obs' `exp' if (`1'<`maxr') & (`1'>`minr'), mcol(dkgreen) ///
			msym(Oh) graphr(col(white)) ///
			xlab(#5, angle(h)) ylab(#5, angle(h)) ///
			ytitle("Observed") xtitle("Expected") aspect(1) ///
					legend(pos(3) order(`leglaborder') ///
							`labs' col(1) size(small)) ///
							`scatteropts' `options' `ci1' `st1' `addplot' `lo1' 
			}
		}
		
****************************************************************************
/* give user the option to leave behind the cutpoints/bins variable
	so they can see if there were no patients in a specific risk group */
if "`keep'"=="keep" {
	capture confirm variable obs_pmcalplot
	if !_rc {
		di as err "Variable with name 'obs_pmcalplot' already exists, pmcalplot cannot generate required variables"
		}
		else {
			rename `obs' obs_pmcalplot
		    }
	
	capture confirm variable exp_pmcalplot
	if !_rc {
		di as err "Variable with name 'exp_pmcalplot' already exists, pmcalplot cannot generate required variables"
		}
		else {
			rename `exp' exp_pmcalplot
		    }
	
	if "`continuous'"!="continuous" {
	capture confirm variable groups_pmcalplot
		if !_rc {
			di as err "Variable with name 'groups_pmcalplot' already exists, pmcalplot cannot generate required variables"
			}
			else {
				rename `binvar' groups_pmcalplot
				}	
			}
	}

end 	

// END OF PROGRAM
*********************************************************************************


**************************************************
*												 *
*  PROGRAM TO CALCULATE C-STAT					 *
*  16/06/21 									 *
*  			 									 *
*												 *
*  1.0.0 J. Ensor								 *
**************************************************

*! 1.0.0 J.Ensor 16Jun2021

program define pmcstat, rclass

/* Syntax
	VARLIST = A list of two variables, the linear predictor for the model,
			and the event indicator (observed outcome)
	NOPRINT = suppress the onscreen output of performance stats
	MATRIX = specify the name of a matrix storing the performance stats 
*/

syntax varlist(min=1 max=2 numeric) [if] [in], [noPRINT  ///
				MATrix(name local) HANley FASTER]

*********************************************** SETUP/CHECKS
*SET UP TEMPs
tempvar p rank_disc rank2_disc diff_disc inv_outcome rank_cord rank2_cord diff_cord

// check on the if/in statement 
marksample touse
qui count if `touse'
local samp=r(N)
if `r(N)'==0 { 
	di as err "if statement identifies subgroup with no data?"
	error 2000
	}
	
// parse varlist
tokenize `varlist' , parse(" ", ",")
local lp = `"`1'"'
local outcome = `"`2'"'

// generate probabilities
qui gen `p' = exp(`lp')/(1+exp(`lp'))

// run checks on user input variables in varlist
// check if user has input both LP and obs (for binary outcome)
local varcountcheck: word count `varlist'

if `varcountcheck'!=2 {
	di as err "Varlist must contain two variables. Linear predictor values, followed by observed outcomes (binary variable) are required"
	error 102
	}

// check outcome is binary
cap assert `outcome'==0 | `outcome'==1 if `touse'
        if _rc~=0 {
                noi di as err "Event indicator `outcome' must be coded 0 or 1"
                error 450
        }

// preserve data & keep only the touse sample
preserve
keep if `touse'

*********************************************** C-STAT

if "`faster'"=="faster" {
	// check for packages 
	local packs gtools
	foreach pkg of local packs {
		capture which `pkg'
		if _rc==111 {	
			ssc install `pkg'
			}
		}
		
	// discordant pairs
	hashsort `p' `outcome' 		
	qui gen `rank_disc' = _n if `touse'

	hashsort `outcome' `p' `rank_disc'
	qui gen `rank2_disc' = _n if `touse'

	qui gen `diff_disc' = (`rank_disc' - `rank2_disc') if (`outcome'==0) & (`touse')

	// concordant pairs
	qui gen `inv_outcome' = (`outcome'==0)
	hashsort `p' `inv_outcome'
	qui gen `rank_cord' = _n if `touse'

	hashsort `inv_outcome' `p' `rank_cord'
	qui gen `rank2_cord' = _n if `touse'

	qui gen `diff_cord' = (`rank_cord' - `rank2_cord') if `inv_outcome'==0

	// total possible pairs
	qui gstats sum `outcome' if (`outcome'!=.) & (`touse'), meanonly
	local obs = r(N)
	local prev = r(mean)
	local events = r(sum)
	local nonevents = r(N) - r(sum)
	local pairs = `events'*`nonevents'  

	// compute c-stat (allowing for ties)
	qui gstats sum `diff_disc' if `touse'
	local disc = r(sum)
	qui gstats sum `diff_cord' if `touse'
	local cord = r(sum)

	local ties = `pairs'-`disc'-`cord'

	local cstat = (`cord'+(0.5*`ties'))/(`pairs')
	}
	else {
		// discordant pairs
		sort `p' `outcome' 		
		qui gen `rank_disc' = _n if `touse'

		sort `outcome' `p' `rank_disc' 
		qui gen `rank2_disc' = _n if `touse'

		qui gen `diff_disc' = (`rank_disc' - `rank2_disc') if (`outcome'==0) & (`touse')

		// concordant pairs
		qui gen `inv_outcome' = (`outcome'==0) if `touse'
		sort `p' `inv_outcome' 
		qui gen `rank_cord' = _n if `touse'

		sort `inv_outcome' `p' `rank_cord' 
		qui gen `rank2_cord' = _n if `touse'

		qui gen `diff_cord' = (`rank_cord' - `rank2_cord') if (`inv_outcome'==0) & (`touse')

		// total possible pairs
		qui su `outcome' if (`outcome'!=.) & (`touse'), meanonly
		local obs = r(N)
		local prev = r(mean)
		local events = r(sum)
		local nonevents = r(N) - r(sum)
		local pairs = `events'*`nonevents'  

		// compute c-stat (allowing for ties)
		qui su `diff_disc' if `touse'
		local disc = r(sum)
		qui su `diff_cord' if `touse'
		local cord = r(sum)

		local ties = `pairs'-`disc'-`cord'

		local cstat = (`cord'+(0.5*`ties'))/(`pairs')
	}
	
***************************************** CI

if "`hanley'"=="" {
	// default use necombe SE formula
	local newcombe_c = `cstat'
	local cstat_se = ((`cstat'*(1-`cstat'))*(1+(((`obs'/2)-1)*((1-`cstat')/(2-`cstat'))) ///
	+((((`obs'/2)-1)*`cstat')/(1+`cstat')))/((`obs'^2)*`prev'*(1-`prev')))^.5
	local cstat_lb = `cstat' - (1.96*`cstat_se')
	local cstat_ub = `cstat' + (1.96*`cstat_se')
}
else {
	// if hanley option set then use hanley SE formula 
	local Q1 = `cstat' / (2 - `cstat')
	local Q2 = 2 * `cstat'^2 / (1 + `cstat')
	local hanley_c = `cstat'
	local cstat_se = sqrt((`cstat' * (1 - `cstat') + (`nonevents' - 1) * (`Q1' - `cstat'^2) + (`events' - 1) * (`Q2' - `cstat'^2)) / (`nonevents' * `events'))
	local cstat_lb = `cstat' - (1.96*`cstat_se')
	local cstat_ub = `cstat' + (1.96*`cstat_se')
}


***************************************** OUTPUT

// Creating matrix of results
local res cstat 

	tempname rmat
	matrix `rmat' = J(1,5,.)
	local i=0
	foreach r of local res {
		local ++i
		matrix `rmat'[`i',1] = `obs'
		matrix `rmat'[`i',2] = ``r''
		matrix `rmat'[`i',3] = ``r'_se'
		matrix `rmat'[`i',4] = ``r'_lb'
		matrix `rmat'[`i',5] = ``r'_ub'

		//local rown "`rown' `r'"
		}
		mat colnames `rmat' = Obs Estimate SE Lower_CI Upper_CI
		mat rownames `rmat' = "C-Statistic" //`rown'

		
// print matrix 
if "`matrix'"!="" {
			matrix `matrix' = `rmat'
			
			//return matrix `matrix' = `rmat' 
			if "`print'"!="noprint" {
				//di as res _n "Discrimination statistics ..."
				matlist `matrix', border(all) //format(%9.3f)
							
				}
				*return matrix `matrix' = `rmat'
			}
			else { 
				if "`print'"!="noprint" {
					//di as res _n "Discrimination statistics ..."
					matlist `rmat', border(all) //format(%9.3f)
							
					}
				*return matrix rmat = `rmat'
				}
				
// Return scalars
local res cstat cstat_se cstat_lb cstat_ub cord disc ties  obs
 
		foreach r of local res {
			return scalar `r' = ``r''
			}
			
		if "`matrix'"!="" {
		    matrix `matrix' = `rmat'
			return matrix `matrix' = `rmat'
		}
		else {
		    return matrix rmat = `rmat'
		}
		
restore

end