**************************************************
*												 *
*  PROGRAM TO CALCULATE SAMPLE SIZE FOR MODEL    * 
*	VALIDATION BASED ON PRECISION OF PERFORMANCE *
*  31/01/22 									 *
*  												 *
*	Updated: 10/11/23							 *
*	- added net benefit as an optional criteria  *	
*												 *
*  1.0.1 J. Ensor								 *
**************************************************

*! 1.0.1 J.Ensor 10Nov2023

program define pmvalsampsize, rclass

version 12.1

/* Syntax
	CONTINUOUS = use pmvalsampsize for continuous outcome model sample size
	SURVIVAL = use pmvalsampsize for survival outcome model sample size
	BINARY = use pmvalsampsize for binary outcome model sample size
	RSQUARED = anticipated value of the (Cox-Snell) R-squared in the validation sample, based on R-sq adjusted value from model development study
	VAROBS = anticipated variance of observed values in the validation sample 
	RSQCIwidth = target precision in terms of CI width for Rsq
	CSlope = assumed C-slope performance in validation sample
	CSCIwidth = target precision in terms of CI width for C-slope
	CITL = assumed CITL performance in validation sample
	CITLCIwidth = target precision in terms of CI width for CITL
	MMOE = set MMOE threshold for acceptable precision of residual variance of CITL & C-slope
	PREVALENCE = prevalence of outcome
	SEED = set seed for simulation based calclulations 	
	NOPRINT = Suppress criteria descriptions in output 
	TRACE = Output a trace of iteration procedure for obtaining non-event mean under lpcstat option
*/

syntax ,   TYPE(string) ///
			[RSQuared(real 0) RSQCIwidth(real 0.1) /// 
			CSlope(real 1) CSCIwidth(real 0.2) /// 
			CITL(real 0) CITLCIwidth(real 0) VAROBS(real 0) /// 
			PREValence(real 0) SIMOBS(int 1000000) /// 
			CSTATistic(real 0) CSTATCIwidth(real 0.1) ///
			OE(real 1) OECIwidth(real 0.2) ///
			LPNORMal(numlist max=2) LPSKEWednormal(numlist max=4) ///
			LPBETA(numlist max=2) LPCSTAT(numlist max=1) /// 
			TOLerance(real 0.0005) INCrement(real 0.1) OESEINCrement(real 0.0001) ///
			SEED(int 123456) MMOE(real 1.1) TRACE GRAPH CALCURves noPRINT ///
			SENSitivity(real 0) SPECificity(real 0) THRESHold(real 0) ///
			NBCIwidth(real 0.2) NBSEINCrement(real 0.0001)]


***********************************************

if "`type'"=="b" {
	if "`graph'"=="graph" {
			local graph_ind "graph"
	}
	else {
	    local graph_ind ""
	}
	
	if "`trace'"=="trace" {
			local trace_ind "trace"
	}
	else {
	    local trace_ind ""
	}
	
	if "`lpnormal'"!="" {
		
		if "`lpskewednormal'"!="" | "`lpbeta'"!="" | "`lpcstat'"!="" {
			di as err "Only one LP distribution option can be specified"
			error 103
		}
		
		binary_val_samp_size, prev(`prevalence') cstat(`cstatistic') ///
				oe(`oe') oeci(`oeciwidth') oeseinc(`oeseincrement') ///
				cs(`cslope') csciwidth(`csciwidth') ///
				cstatci(`cstatciwidth') simobs(`simobs') ///
				sens(`sensitivity') spec(`specificity') thresh(`threshold') ///
				nbciwidth(`nbciwidth') nbseincrement(`nbseincrement') ///
				tol(`tolerance') inc(`increment') seed(`seed') lpnormal(`lpnormal') `graph_ind' `trace_ind'
		
	}
	else if "`lpskewednormal'"!="" {
	    
		if "`lpbeta'"!="" | "`lpcstat'"!="" {
			di as err "Only one LP distribution option can be specified"
			error 103
		}
		
		binary_val_samp_size, prev(`prevalence') cstat(`cstatistic') ///
				oe(`oe') oeci(`oeciwidth') oeseinc(`oeseincrement') ///
				cs(`cslope') csciwidth(`csciwidth') ///
				cstatci(`cstatciwidth') simobs(`simobs') ///
				sens(`sensitivity') spec(`specificity') thresh(`threshold') ///
				nbciwidth(`nbciwidth') nbseincrement(`nbseincrement') ///
				tol(`tolerance') inc(`increment') seed(`seed') lpskewednormal(`lpskewednormal') `graph_ind' `trace_ind'
		
	}
	else if "`lpbeta'"!="" {
	    
		if "`lpcstat'"!="" {
			di as err "Only one LP distribution option can be specified"
			error 103
		}
		
		binary_val_samp_size, prev(`prevalence') cstat(`cstatistic') ///
				oe(`oe') oeci(`oeciwidth') oeseinc(`oeseincrement') ///
				cs(`cslope') csciwidth(`csciwidth') ///
				cstatci(`cstatciwidth') simobs(`simobs') ///
				sens(`sensitivity') spec(`specificity') thresh(`threshold') ///
				nbciwidth(`nbciwidth') nbseincrement(`nbseincrement') ///
				tol(`tolerance') inc(`increment') seed(`seed') lpbeta(`lpbeta') `graph_ind' `trace_ind'
		
	}
	else if "`lpnormal'"=="" & "`lpskewednormal'"=="" & "`lpbeta'"=="" & "`lpcstat'"=="" {

			di as err "An LP distribution must be specified"
			error 102

	}
	else {
		binary_val_samp_size, prev(`prevalence') cstat(`cstatistic') ///
				oe(`oe') oeci(`oeciwidth') oeseinc(`oeseincrement') ///
				cs(`cslope') csciwidth(`csciwidth') ///
				cstatci(`cstatciwidth') simobs(`simobs') ///
				sens(`sensitivity') spec(`specificity') thresh(`threshold') ///
				nbciwidth(`nbciwidth') nbseincrement(`nbseincrement') ///
				tol(`tolerance') inc(`increment') seed(`seed') lpcstat(`lpcstat') `graph_ind' `trace_ind'
				
		ret sca non_event_mean = r(non_event_mean)
		ret sca event_mean = r(event_mean)
		ret sca common_variance = r(common_variance)
		ret sca tolerance = r(tolerance)
		ret sca increment = r(increment)
		ret sca simulated_data_cstat = r(simulated_data_cstat)
	}
		
	// return list
	return scalar sample_size = r(sample_size)
	return scalar events = r(events)
	return scalar prevalence = r(prevalence)
	local retlist oe se_oe lb_oe ub_oe width_oe cslope se_cslope csciwidth cstatistic se_cstat cstatciwidth sensitivity specificity threshold nb standardised_nb
		foreach s of local retlist {
		ret sca `s' = r(`s')
		}
		
	mat def binary_val_samp_size_results = r(results)
	return mat results = binary_val_samp_size_results
	
	if "`print'"!="noprint" {
		criteria_print, type("`type'") sens(`sensitivity') spec(`specificity') thresh(`threshold')
	}
	}
	else if "`type'"=="c" {
		
***********************************************
	if `rsquared'==0 {
				di as err "rsquared() must be specified"
				error 103
			}

	continuous_val_samp_size , rsq(`rsquared') varobs(`varobs') rsqci(`rsqciwidth') ///
		cs(`cslope') csci(`csciwidth') ///
		citl(`citl') citlci(`citlciwidth') mmoe(`mmoe')
	
	return scalar sample_size = r(sample_size)
	
	local retlist citl se_citl lb_citl ub_citl citlciwidth cslope se_cslope lb_cslope ub_cslope csciwidth rsquared se_rsq lb_rsq ub_rsq rsqciwidth
	foreach s of local retlist {
		ret sca `s' = r(`s')
		}
	
	mat def continuous_val_samp_size_results = r(results)
	return mat results = continuous_val_samp_size_results
	
	if "`print'"!="noprint" {
		criteria_print, type("`type'") 
	}
	}
	else {
		di as err "Model type must be either b or c (binary/continuous)"
		error 499
		}
	
	
***********************************************
		
end

******* start of binary
program define binary_val_samp_size, rclass

version 12.1

/* Syntax
	PREVALENCE = prevalence of outcome 
	CSTATistic = anticipated c statistic at validation
	OE = observed/expected ratio
	OECIwidth = target CI width for OE performance
	CSlope = anticipated calibration slope at validation
	CSCIwidth = target CI width for calibration slope performance
	CSTATCIwidth = target CI width for c statistic performance
	SIMOBS = sets the number of observations to use for simulated LP calculations
	LPNORMal = defines parameters to simulate LP from normal distribution
	LPSKEWednormal = defines parameters to simulate LP from a skewed normal distribution
	LPBETA  = defines parameters to simulate LP from beta distribution
	LPCSTAT = defines starting value for non-events mean  
	TOLerance = sets tolerance for observed event prop. during iterative procedure for non-events mean 
	INCrement = sets increment by which to iterate when identifying mean for non-events for lpcstat()
	OESEINCrement = sets increment by which to iterate when identifying the SE(ln(OE)) to meet the target CI width for OE 
	SEED = sets seed for calculations based on simulated data
	GRAPH = produces histogram of LP dist for checking
	TRACE = Output a trace of iteration procedure for obtaining non-event mean under lpcstat option
	SENSitivity = sensitivity for net benefit 
	SPECificity = specificity for net benefit
	THRESHold = risk threshold for net benefit 
*/

syntax , PREValence(real) CSTATistic(real) ///
			[OE(real 1) OECIwidth(real 0.2) ///
			 CSlope(real 1) CSCIwidth(real 0.2) ///
			 CSTATCIwidth(real 0.1) SIMOBS(int 1000000) ///
			 LPNORMal(numlist max=2) LPSKEWednormal(numlist max=4) ///
			 LPBETA(numlist max=2) LPCSTAT(numlist max=1) /// 
			 TOLerance(real 0.0005) INCrement(real 0.1) OESEINCrement(real 0.0001) ///
			 SEED(int 123456) GRAPH CALCURves TRACE SENSitivity(real 0) ///
			 SPECificity(real 0) THRESHold(real 0) NBCIwidth(real 0.2) NBSEINCrement(real 0.0001)] 

// CHECK INPUTS

if `prevalence'>=0 & `prevalence'<=1 { 
		}
		else {
			di as err "Prevalence must lie in the interval [0,1]"
			error 459
			}
			
if `cstatistic'>=0 & `cstatistic'<=1 { 
		}
		else {
			di as err "C statistic must lie in the interval [0,1]"
			error 459
			}
			
if "`lpnormal'"!="" {
		
		if "`lpskewednormal'"!="" | "`lpbeta'"!="" | "`lpcstat'"!="" {
			di as err "Only one LP distribution option can be specified"
			error 103
		}
		
	}
	else if "`lpskewednormal'"!="" {
	    
		if "`lpbeta'"!="" | "`lpcstat'"!="" {
			di as err "Only one LP distribution option can be specified"
			error 103
		}
		
	}
	else if "`lpbeta'"!="" {
	    
		if "`lpcstat'"!="" {
			di as err "Only one LP distribution option can be specified"
			error 103
		}
		
	}
	else if "`lpnormal'"=="" & "`lpskewednormal'"=="" & "`lpbeta'"=="" & "`lpcstat'"=="" {

			di as err "An LP distribution must be specified"
			error 102

	}
			
// SET SEED 
set seed `seed'
 
// CALCULATE N FOR EACH PERFORMANCE STAT CRITERIA 

 // CRITERIA 1 - O/E 
 local width_oe = 0 
 local se_oe = 0
 while `width_oe'<`oeciwidth' {
 	local se_oe = `se_oe'+`oeseincrement'
	local ub_oe = exp(ln(`oe') + (1.96*`se_oe'))
	local lb_oe = exp(ln(`oe') - (1.96*`se_oe'))
	local width_oe = `ub_oe'-`lb_oe'
	//nois di "width=`width_oe' -----lb=`lb_oe'-----ub=`ub_oe'-------se=`se_oe'"
 }

	local n1 = ceil((1-`prevalence')/(`prevalence'*`se_oe'^2))

	local E1 = `n1'*`prevalence'
	
// CRITERIA 2 - CSLOPE
	preserve
	qui drop _all  
	qui set obs `simobs'
	
// parse distribution parameters & generate LP values 
if "`lpskewednormal'"!="" {
		local lpdist "skewednormal"
		tokenize `lpskewednormal' , parse(" ", ",")
		local mean = `1'
		local var = `2'
		local skew = `3'
		local kurtosis = `4'
		
		di as txt "Skewed normal LP distribution with parameters - mean=`mean', variance=`var', skew=`skew', kurtosis=`kurtosis'" _n "Working ..."
		
		qui sknor `simobs' `seed' `lpskewednormal'
		
		qui gen LP = skewnormal
		}
		else if "`lpnormal'"!="" {
		    local lpdist "normal"
			tokenize `lpnormal' , parse(" ", ",")
			local mean = `1'
			local sd = `2'
						
			di as txt "Normal LP distribution with parameters - mean=`mean', standard deviation=`sd'" _n
			
			qui gen LP = rnormal(`mean',`sd')
		}
		else if "`lpbeta'"!="" {
		    local lpdist "beta"
			tokenize `lpbeta' , parse(" ", ",")
			local a = `1'
			local b = `2'
						
			di as txt "Beta P distribution with parameters - alpha=`a', beta=`b'" _n
			
			qui gen P = rbeta(`a',`b')
			qui gen LP = logit(P)
		}
		else if "`lpcstat'"!="" {
		 
			local lpdist "cstat"
			local m2 = `lpcstat'
			local var = 2*(invnorm(`cstatistic')^2)
				
			qui gen outcome = rbinomial(1,`prevalence')
			* non-events group
			qui gen LP = rnormal(`m2', sqrt(`var'))
			* events group mean is non-events group mean + S2 
			qui replace LP = rnormal(`m2' + `var', sqrt(`var')) if outcome == 1

			qui gen P = exp(LP)/(1+exp(LP)) 
			qui gen outcome_test = rbinomial(1,P)
			qui summ outcome_test
						
			local diff = abs(`prevalence' - r(mean))
			
			if "`trace'"=="trace" {
			if `diff'>`tolerance' {
			    di as txt "Proportion of observed outcome events does not match input prevalence" _n "Beginning iterative approach ..." _n
				
				di as txt "-------------------- TRACE ON ---------------------" _n
				
				local n=1
				local diff`n' = `diff'
				local m2 = `m2'+`increment'
																											
				qui replace outcome = rbinomial(1,`prevalence')
				* non-events group
				qui replace LP = rnormal(`m2', sqrt(`var'))
				* events group mean is non-events group mean + S2 
				qui replace LP = rnormal(`m2' + `var', sqrt(`var')) if outcome == 1
	
				qui replace P = exp(LP)/(1+exp(LP)) 
				qui replace outcome_test = rbinomial(1,P)
				qui summ outcome_test
								
				local n = `n'+1
				local diff`n' = abs(`prevalence' - r(mean))
	
				if `diff`n''<`diff' {
					
					while `diff`n''>`tolerance' {
					    local m2 = `m2'+`increment'
						//di as txt _c "." _c 
				
						qui replace outcome = rbinomial(1,`prevalence')
						* non-events group
						qui replace LP = rnormal(`m2', sqrt(`var'))
						* events group mean is non-events group mean + S2 
						qui replace LP = rnormal(`m2' + `var', sqrt(`var')) if outcome == 1
			
						qui replace P = exp(LP)/(1+exp(LP)) 
						qui replace outcome_test = rbinomial(1,P)
						qui summ outcome_test
												
						local n = `n'+1
						local diff`n' = abs(`prevalence' - r(mean))
						
						di as txt "Proportion of outcome events under simulation = `r(mean)' + Target prevalence = `prevalence' + Mean in non-event group=`m2'" _n
				
					}
					
				}
				else {
					
					while `diff`n''>`tolerance' {
					    local m2 = `m2'-`increment'
						//di as txt _c "." _c 
				
						qui replace outcome = rbinomial(1,`prevalence')
						* non-events group
						qui replace LP = rnormal(`m2', sqrt(`var'))
						* events group mean is non-events group mean + S2 
						qui replace LP = rnormal(`m2' + `var', sqrt(`var')) if outcome == 1
			
						qui replace P = exp(LP)/(1+exp(LP)) 
						qui replace outcome_test = rbinomial(1,P)
						qui summ outcome_test
											
						local n = `n'+1
						local diff`n' = abs(`prevalence' - r(mean))
						
						di as txt "Proportion of outcome events under simulation = `r(mean)' + Target prevalence = `prevalence' + Mean in non-event group=`m2'" _n
					}
					
				}
				
				di as txt "-------------------- TRACE END ---------------------" _n
				di as txt _n "Proportion of observed outcome events is within tolerance" _n "Proportion of outcome events under simulation = `r(mean)' + Target prevalence = `prevalence'" _n "Mean in non-event group=`m2'"
				local m2 = `m2'
				}
				else {
				    di as txt "Proportion of observed outcome events is within tolerance" _n "Proportion of outcome events under simulation = `r(mean)' + Target prevalence = `prevalence'" _n "Mean in non-event group=`m2'" 
					local m2 = `m2'
				}
			}
			else {
				if `diff'>`tolerance' {
			    di as txt "Proportion of observed outcome events does not match input prevalence" _n "Beginning iterative approach ..." _c
				
				local n=1
				local diff`n' = `diff'
				local m2 = `m2'+`increment'
																											
				qui replace outcome = rbinomial(1,`prevalence')
				* non-events group
				qui replace LP = rnormal(`m2', sqrt(`var'))
				* events group mean is non-events group mean + S2 
				qui replace LP = rnormal(`m2' + `var', sqrt(`var')) if outcome == 1
	
				qui replace P = exp(LP)/(1+exp(LP)) 
				qui replace outcome_test = rbinomial(1,P)
				qui summ outcome_test
								
				local n = `n'+1
				local diff`n' = abs(`prevalence' - r(mean))
	
				if `diff`n''<`diff' {
					
					while `diff`n''>`tolerance' {
					    local m2 = `m2'+`increment'
						di as txt _c "." _c 
				
						qui replace outcome = rbinomial(1,`prevalence')
						* non-events group
						qui replace LP = rnormal(`m2', sqrt(`var'))
						* events group mean is non-events group mean + S2 
						qui replace LP = rnormal(`m2' + `var', sqrt(`var')) if outcome == 1
			
						qui replace P = exp(LP)/(1+exp(LP)) 
						qui replace outcome_test = rbinomial(1,P)
						qui summ outcome_test
												
						local n = `n'+1
						local diff`n' = abs(`prevalence' - r(mean))
					}
					
				}
				else {
					
					while `diff`n''>`tolerance' {
					    local m2 = `m2'-`increment'
						di as txt _c "." _c 
				
						qui replace outcome = rbinomial(1,`prevalence')
						* non-events group
						qui replace LP = rnormal(`m2', sqrt(`var'))
						* events group mean is non-events group mean + S2 
						qui replace LP = rnormal(`m2' + `var', sqrt(`var')) if outcome == 1
			
						qui replace P = exp(LP)/(1+exp(LP)) 
						qui replace outcome_test = rbinomial(1,P)
						qui summ outcome_test
											
						local n = `n'+1
						local diff`n' = abs(`prevalence' - r(mean))
					}
					
				}
				di as txt _n "Proportion of observed outcome events is within tolerance" _n "Proportion of outcome events under simulation = `r(mean)' + Target prevalence = `prevalence'" _n "Mean in non-event group=`m2'"
				local m2 = `m2'
				}
				else {
				    di as txt "Proportion of observed outcome events is within tolerance" _n "Proportion of outcome events under simulation = `r(mean)' + Target prevalence = `prevalence'" _n "Mean in non-event group=`m2'" 
					local m2 = `m2'
				}
			}
			
			// check C-statistic is correct
			qui pmcstat LP outcome_test
			local simulated_data_cstat = r(cstat)
			
			ret sca non_event_mean = `m2'
			ret sca event_mean = `m2' + `var'
			ret sca common_variance = `var'
			ret sca tolerance = `tolerance'
			ret sca increment = `increment'
			ret sca simulated_data_cstat = `simulated_data_cstat'
			}
		
	// graph option to output a histogram of the LP distribution for checking
	if "`graph'"=="graph" {
		qui su LP, det
		local stats mean sd min max p50 p25 p75 skewness kurtosis 
		foreach r of local stats {
			local `r' : di %4.3f r(`r')
		}
		
		qui hist LP , text(0.01 `min' "Mean = `mean'" "SD = `sd'" "Median = `p50'" "LQ = `p25'" "UQ = `p75'" "Min = `min'" "Max = `max'" "Skewness = `skewness'" "Kurtosis = `kurtosis'", size(small) place(ne) just(left)) name(pmvalss_lp_dist_`lpdist', replace)
	}
	
	* input assumed parameters of calibration model (in future vr these could be options)
	local beta0 = 0
	local beta1 = 1
	
	* caclulate elements of I matrix
	qui gen Borenstein_00 = exp(`beta0' + (`beta1'*LP))/((1+ exp(`beta0' + (`beta1'*LP)))^2)
	qui gen Borenstein_01 = LP*exp(`beta0' + (`beta1'*LP))/((1+ exp(`beta0' + (`beta1'*LP)))^2)
	qui gen Borenstein_11 = LP*LP*exp(`beta0' + (`beta1'*LP))/((1+ exp(`beta0' + (`beta1'*LP)))^2)
	
	qui summ Borenstein_00
	local I_00 = r(mean)

	qui summ Borenstein_01
	local I_01 = r(mean)

	qui summ Borenstein_11
	local I_11 = r(mean)

	
	* calculate SE from input target CI width
	local se_cslope = round(`csciwidth'/(2*1.96),.00001) // SS very sensitive to rounding 
	
	* calculate sample size
	local n2 = ceil(`I_00'/(`se_cslope'*`se_cslope'*((`I_00'*`I_11')-(`I_01'*`I_01'))))
	
	local E2 = `n2'*`prevalence'
	
// CRITERIA 4 - NET BENEFIT
local estimate_nb = `sensitivity'+`specificity'

if `sensitivity'!=0 & `specificity'!=0 & `threshold'!=0 {
	local nb = (`sensitivity'*`prevalence') - ((1-`specificity')*(1-`prevalence')*(`threshold'/(1-`threshold')))
	local standardised_nb = `nb'/`prevalence'

	local w = ((1-`prevalence')/`prevalence')*(`threshold'/(1-`threshold'))
	* target CI width for sNB of 0.2, which corresponds to SE of 0.051

	local width_nb = 0 
	 local se_nb = 0
	 while `width_nb'<`nbciwidth' {
		local se_nb = `se_nb'+`nbseincrement'
		local ub_nb = (`standardised_nb') + (1.96*`se_nb')
		local lb_nb = (`standardised_nb') - (1.96*`se_nb')
		local width_nb = `ub_nb'-`lb_nb'
	// 	nois di "width=`width_nb' -----lb=`lb_nb'-----ub=`ub_nb'-------se=`se_nb'"
	 }
	 
	 local se_nb = round(`se_nb',0.001)

	 * calculate sample size
	local n4 = ceil((1/(`se_nb'^2))*(((`sensitivity'*(1-`sensitivity'))/`prevalence')+(`w'*`w'*`specificity'*(1-`specificity')/(1-`prevalence'))+(`w'*`w'*(1-`specificity')*(1-`specificity')/(`prevalence'*(1-`prevalence')))))

	local no_nb = 0
}
else if `threshold'!=0 {
			qui gen nb_p = exp(LP)/(1+exp(LP)) 
			qui gen nb_outcome = rbinomial(1,nb_p)
			qui gen classification = 0 if nb_p<`threshold'
			qui replace classification = 1 if classification==.
			qui count if classification==1 & nb_outcome==1
			local pos = r(N)
			qui count if classification==0 & nb_outcome==0
			local neg = r(N)
			qui count if nb_outcome==1
			local e = r(N)
			qui count if nb_outcome==0
			local ne = r(N)
			local sensitivity = round(`pos'/`e',0.01)
			local specificity = round(`neg'/`ne',0.01)
			
			local nb = (`sensitivity'*`prevalence') - ((1-`specificity')*(1-`prevalence')*(`threshold'/(1-`threshold')))
	local standardised_nb = `nb'/`prevalence'

	local w = ((1-`prevalence')/`prevalence')*(`threshold'/(1-`threshold'))
	* target CI width for sNB of 0.2, which corresponds to SE of 0.051

	local width_nb = 0 
	 local se_nb = 0
	 while `width_nb'<`nbciwidth' {
		local se_nb = `se_nb'+`nbseincrement'
		local ub_nb = (`standardised_nb') + (1.96*`se_nb')
		local lb_nb = (`standardised_nb') - (1.96*`se_nb')
		local width_nb = `ub_nb'-`lb_nb'
	// 	nois di "width=`width_nb' -----lb=`lb_nb'-----ub=`ub_nb'-------se=`se_nb'"
	 }
	 
	 local se_nb = round(`se_nb',0.001)

	 * calculate sample size
	local n4 = ceil((1/(`se_nb'^2))*(((`sensitivity'*(1-`sensitivity'))/`prevalence')+(`w'*`w'*`specificity'*(1-`specificity')/(1-`prevalence'))+(`w'*`w'*(1-`specificity')*(1-`specificity')/(`prevalence'*(1-`prevalence')))))

	local no_nb = 0
}
else {
	local no_nb = 1
	local nb = 0
	local standardised_nb = 0
	}

** end of criteria 4

// restore following simulation for criteria 2 - c-slope
	
	restore

// CRITERIA 3 - CSTATISTIC
	preserve
	
	qui drop _all  
	qui set obs 1000000
	
	qui gen size = _n
	qui gen se_cstatsq = `cstatistic'*(1-`cstatistic')*(1+(((size/2)-1)*((1-`cstatistic')/(2-`cstatistic'))) +((((size/2)-1)*`cstatistic')/(1+`cstatistic')))/(size*size*`prevalence'*(1-`prevalence'))
	qui gen se_cstat = sqrt(se_cstatsq)
	qui gen CIwidth = 2*1.96*se_cstat 
	* Now identify the sample sizes that give a CIwidth no wider than the desired value
	qui drop if CIwidth>`cstatciwidth'
	* Finally identify the minimum of all these sample sizes
	qui su size
	local n3 = r(min)
	local se_cstat = sqrt(`cstatistic'*(1-`cstatistic')*(1+(((`n3'/2)-1)*((1-`cstatistic')/(2-`cstatistic'))) +((((`n3'/2)-1)*`cstatistic')/(1+`cstatistic')))/(`n3'*`n3'*`prevalence'*(1-`prevalence')))

	local E3 = `n3'*`prevalence'

	restore 
	

 
	
****************************************************** REPORTING

if `no_nb'==1 {
	// MINIMUM N 
local nfinal = max(`n1',`n2',`n3')
local di_perf_1 : di %4.3f `oe'
local di_perf_2 : di %4.3f `cslope'
local di_perf_3 : di %4.3f `cstatistic'
// local di_perf_4 : di %4.3f `standardised_nb'

local di_se_1 : di %4.3f `se_oe'
local di_se_2 : di %4.3f `se_cslope'
local di_se_3 : di %4.3f `se_cstat'
// local di_se_4 : di %4.3f `se_nb'

local di_ci_1 : di %4.3f  `width_oe' //`oeciwidth'
local di_ci_2 : di %4.3f `csciwidth'
local di_ci_3 : di %4.3f `cstatciwidth'
// local di_ci_4 : di %4.3f `nbciwidth'

local di_lb_1 : di %4.3f `lb_oe'
//local di_lb_2 : di %4.3f `lb_cslope'
//local di_lb_3 : di %4.3f `lb_cstat'
local di_ub_1 : di %4.3f `ub_oe'
//local di_ub_2 : di %4.3f `ub_cslope'
//local di_ub_3 : di %4.3f `ub_cstat'


local E_final = `nfinal'*`prevalence'

// RETURN LIST
return scalar sample_size = `nfinal'
return scalar events = `E_final'
return scalar prevalence = `prevalence'
local retlist oe se_oe lb_oe ub_oe width_oe cslope se_cslope csciwidth cstatistic se_cstat cstatciwidth sensitivity specificity threshold standardised_nb nb
foreach s of local retlist {
ret sca `s' = ``s''
}

// OUTPUT TABLE & ASSUMPTIONS
*di as txt _n "NB: Assuming E/O=`di_perf_1' & c-slope=`di_perf_2'"

local res 1 2 3   
matrix Results = J(4,4,.)
local i=0
foreach r of local res {
	local ++i
	matrix Results[`i',1] = `n`r''
	matrix Results[`i',2] = `di_perf_`r''
	matrix Results[`i',3] = `di_se_`r''
	matrix Results[`i',4] = `di_ci_`r''
	}
	mat colnames Results = "Samp_size" "Perf" "SE" "CI width" 
	mat rownames Results = "Criteria 1 - O/E" "Criteria 2 - C-slope" "Criteria 3 - C statistic" "Final SS"
	// alternative more informative criteria names for output table 
// 	mat rownames Results = "1.O/E" "2.C-slope" "3.C statistic" "Final SS"

preserve
clear
qui svmat Results, names(col)
sort Samp_size
local i = 4
matrix Results[`i',1] = Samp_size[3]
matrix Results[`i',2] = Perf[3]
matrix Results[`i',3] = SE[3]
matrix Results[`i',4] = CI[3]
restore

matlist Results,  aligncolnames(r) twidth(30) lines(rowtotal)

return mat results = Results

local di_E_final = ceil(`E_final')
di _n "Minimum sample size required for model validation based on user inputs = `nfinal',"
di "with `di_E_final' events (assuming an outcome prevalence = `prevalence')"

}
else {
	// MINIMUM N 
	local nfinal = max(`n1',`n2',`n3',`n4')
	local di_perf_1 : di %4.3f `oe'
	local di_perf_2 : di %4.3f `cslope'
	local di_perf_3 : di %4.3f `cstatistic'
	local di_perf_4 : di %4.3f `standardised_nb'

	local di_se_1 : di %4.3f `se_oe'
	local di_se_2 : di %4.3f `se_cslope'
	local di_se_3 : di %4.3f `se_cstat'
	local di_se_4 : di %4.3f `se_nb'

	local di_ci_1 : di %4.3f  `width_oe' //`oeciwidth'
	local di_ci_2 : di %4.3f `csciwidth'
	local di_ci_3 : di %4.3f `cstatciwidth'
	local di_ci_4 : di %4.3f `nbciwidth'

	local di_lb_1 : di %4.3f `lb_oe'
	//local di_lb_2 : di %4.3f `lb_cslope'
	//local di_lb_3 : di %4.3f `lb_cstat'
	local di_ub_1 : di %4.3f `ub_oe'
	//local di_ub_2 : di %4.3f `ub_cslope'
	//local di_ub_3 : di %4.3f `ub_cstat'


	local E_final = `nfinal'*`prevalence'

	// RETURN LIST
	return scalar sample_size = `nfinal'
	return scalar events = `E_final'
	return scalar prevalence = `prevalence'
	local retlist oe se_oe lb_oe ub_oe width_oe cslope se_cslope csciwidth cstatistic se_cstat cstatciwidth sensitivity specificity threshold standardised_nb nb
	foreach s of local retlist {
	ret sca `s' = ``s''
	}

	// OUTPUT TABLE & ASSUMPTIONS
	*di as txt _n "NB: Assuming E/O=`di_perf_1' & c-slope=`di_perf_2'"

	local res 1 2 3 4  
	matrix Results = J(5,4,.)
	local i=0
	foreach r of local res {
		local ++i
		matrix Results[`i',1] = `n`r''
		matrix Results[`i',2] = `di_perf_`r''
		matrix Results[`i',3] = `di_se_`r''
		matrix Results[`i',4] = `di_ci_`r''
		}
		mat colnames Results = "Samp_size" "Perf" "SE" "CI width" 
		mat rownames Results = "Criteria 1 - O/E" "Criteria 2 - C-slope" "Criteria 3 - C statistic" "Criteria 4 - St Net Benefit" "Final SS"
		// alternative more informative criteria names for output table 
	// 	mat rownames Results = "1.O/E" "2.C-slope" "3.C statistic" "Final SS"

	preserve
	clear
	qui svmat Results, names(col)
	sort Samp_size
	local i = 5
	matrix Results[`i',1] = Samp_size[4]
	matrix Results[`i',2] = Perf[4]
	matrix Results[`i',3] = SE[4]
	matrix Results[`i',4] = CI[4]
	restore

	matlist Results,  aligncolnames(r) twidth(30) lines(rowtotal)

	return mat results = Results

	local di_E_final = ceil(`E_final')
	di _n "Minimum sample size required for model validation based on user inputs = `nfinal',"
	di "with `di_E_final' events (assuming an outcome prevalence = `prevalence')"
}

end 	

******* end of binary

******* start of continuous
program define continuous_val_samp_size, rclass

version 12.1

/* Syntax
	RSQuared = assumed Rsq performance in validation sample [use adjusted Rsq from previous model development study - must be adjusted]
	VAROBS = anticipated variance of observed values in the validation sample 
	RSQCIwidth = target precision in terms of CI width for Rsq
	CSlope = assumed C-slope performance in validation sample
	CSCIwidth = target precision in terms of CI width for C-slope
	CITL = assumed CITL performance in validation sample
	CITLCIwidth = target precision in terms of CI width for CITL
	MMOE = set MMOE threshold for acceptable precision of residual variance of CITL & C-slope
*/

syntax , RSQuared(real) VAROBS(real) ///
			[RSQCIwidth(real 0.1) /// RSQSE(real 0.0255) /// 
			CSlope(real 1) CSCIwidth(real 0.2) /// CSSE(real 0.051) ///
			CITL(real 0) CITLCIwidth(real 0) /// CITLSE(real 0) ///
			MMOE(real 1.1)]
			
			// RSQVAL(real 0) RSQCAL(real 0) RSQCITL(real 0)] 
// these rsq's would allow users to enter different rsq for each criteria but this should not be the case 

// CHECK INPUTS
	if `rsquared'>=0 & `rsquared'<=1 { 
		}
		else {
			di as err "R-sq must lie in the interval [0,1]"
			error 459
			}
			


	// CRITERIA 1 - PRECISE ESTIMATION OF R2 VALIDATION
	local r2val = `rsquared' 
	local DIr2val : di %4.3f `r2val'
	
	local lb_rsq = `r2val'-(`rsqciwidth'/2)
	local ub_rsq = `r2val'+(`rsqciwidth'/2)
	local se_rsq = round(((`ub_rsq'-`lb_rsq')/3.92),.0001)
		
	local n1 = ceil((4*`r2val'*(1-`r2val')^2)/`se_rsq'^2)

	
	// CRITERIA 2 - PRECISE ESTIMATION OF CITL
	local r2citl = `rsquared'
	
	local lb_citl = `citl'-(`citlciwidth'/2)
	local ub_citl = `citl'+(`citlciwidth'/2)
	local se_citl = round(((`ub_citl'-`lb_citl')/3.92),.0001)
	
	local n2 = ceil((`varobs'*(1-`r2citl'))/`se_citl'^2)

		
	// CRITERIA 3 - PRECISE ESTIMATE OF CALIBRATION SLOPE 
	local r2cal = `rsquared'
	
	local lb_cslope = `cslope'-(`csciwidth'/2)
	local ub_cslope = `cslope'+(`csciwidth'/2)
	local se_cslope = round(((`ub_cslope'-`lb_cslope')/3.92),.0001)
	
	local n3 = ceil(((`cslope'^2*(1-`r2cal'))/(`r2cal'*`se_cslope'^2))+1)

	
	// CRITERIA 4 - PRECISE ESTIMATION OF RESIDUAL VARIANCES
	local n4 = 235
	// delete after here? as N=235 is the necessary N for mmoe=1.1 
	local df = `n4'-1
	local chilow = `df'/(invchi2(`df',0.025))
	local chiupp = (invchi2(`df',0.975))/`df'
	local max = max(`chilow',`chiupp')
	local resvar_mmoe = `max'^.5
	if `resvar_mmoe'>`mmoe' {
		while `resvar_mmoe'>`mmoe' {
			local ++n4
			local df = `n4'-1
			local chilow = `df'/(invchi2(`df',0.025))
			local chiupp = (invchi2(`df',0.975))/`df'
			local max = max(`chilow',`chiupp')
			local resvar_mmoe = `max'^.5
			
				if `resvar_mmoe'<=`mmoe' { 
					local n4 = `n4'
					continue, break
					}
			}
		}
		else {
			local n4 = `n4'
			}
			
// MINIMUM N 
local nfinal = max(`n1',`n2',`n3',`n4')	
local di_perf_1 : di %4.3f `rsquared'
local di_perf_2 : di %4.3f `citl'
local di_perf_3 : di %4.3f `cslope'
local di_perf_4 = .
local di_se_1 : di %4.3f `se_rsq'
local di_se_2 : di %4.3f `se_citl'
local di_se_3 : di %4.3f `se_cslope'
local di_se_4  = .
local di_ci_1 : di %4.3f `rsqciwidth'
local di_ci_2 : di %4.3f `citlciwidth'
local di_ci_3 : di %4.3f `csciwidth'
local di_ci_4  = .
		
// RETURN LIST
return scalar sample_size = `nfinal'
local retlist citl se_citl lb_citl ub_citl citlciwidth cslope se_cslope lb_cslope ub_cslope csciwidth rsquared se_rsq lb_rsq ub_rsq rsqciwidth
foreach s of local retlist {
ret sca `s' = ``s''
}


// OUTPUT TABLE & ASSUMPTIONS
* add lb & ub to output? 
di as txt "NB: Assuming MMOE<=`mmoe' in estimation of residual variances (Criteria 4)"

local res 1 2 3 4 //final 
matrix Results = J(5,4,.)
local i=0
foreach r of local res {
	local ++i
	matrix Results[`i',1] = `n`r''
	matrix Results[`i',2] = `di_perf_`r''
	matrix Results[`i',3] = `di_se_`r''
	matrix Results[`i',4] = `di_ci_`r''
	}
	mat colnames Results = "Samp_size" "Perf" "SE" "CI width" 
	mat rownames Results = "Criteria 1 - Rsq" "Criteria 2 - CITL" "Criteria 3 - C-slope" "Criteria 4 - Residual var" "Final SS"

preserve
clear
qui svmat Results, names(col)
sort Samp_size
local i = 5
matrix Results[`i',1] = Samp_size[4]
matrix Results[`i',2] = Perf[4]
matrix Results[`i',3] = SE[4]
matrix Results[`i',4] = CI[4]
restore

matlist Results,  aligncolnames(r) twidth(30) lines(rowtotal)

return mat results = Results

di _n "Minimum sample size required for model validation based on user inputs = `nfinal'"

end 	

******* end of continuous

******* start of survival

******* end of survival


******* start of program to print criteria info 
program define criteria_print, rclass

version 12.1

/* Syntax
	TYPE = model type 
*/

syntax ,  TYPE(string) [SENSitivity(real 0) SPECificity(real 0) THRESHold(real 0)]

if "`type'"!="c" {
	
	
	if `sensitivity'!=0 & `specificity'!=0 & `threshold'!=0 {
		di _n "Criteria 1 - precise estimation of O/E performance in the validation sample"
			di "Criteria 2 - precise estimation of the calibration slope in the validation sample"
			di "Criteria 3 - precise estimation of the C statistic in the validation sample"
			di "Criteria 4 - precise estimation of the standardised net-benefit in the validation sample"
	}
	else if `threshold'!=0 {
		di _n "Criteria 1 - precise estimation of O/E performance in the validation sample"
			di "Criteria 2 - precise estimation of the calibration slope in the validation sample"
			di "Criteria 3 - precise estimation of the C statistic in the validation sample"
			di "Criteria 4 - precise estimation of the standardised net-benefit in the validation sample"
	}
	else {
		di _n "Criteria 1 - precise estimation of O/E performance in the validation sample"
		di "Criteria 2 - precise estimation of the calibration slope in the validation sample"
		di "Criteria 3 - precise estimation of the C statistic in the validation sample"
	}
			
}
	else {
		di _n "Criteria 1 - precise estimation of R-squared performance in the validation sample"
		di "Criteria 2 - precise estimation of CITL performance in the validation sample"
		di "Criteria 3 - precise estimation of the calibration slope in the validation sample"
		di "Criteria 4 - precise estimation of the residual variances in the validation sample"
	}

end

******* end of criteria_print program



program define pmcstat, rclass

/* Syntax
	VARLIST = A list of two variables, the linear predictor for the model,
			and the event indicator (observed outcome)
	NOPRINT = suppress the onscreen output of performance stats
	MATRIX = specify the name of a matrix storing the performance stats 
*/

syntax varlist(min=1 max=2 numeric) [if] [in], [noPRINT  ///
				MATrix(name local) HANley FASTER]

*********************************************** SETUP/CHECKS
*SET UP TEMPs
tempvar p rank_disc rank2_disc diff_disc inv_outcome rank_cord rank2_cord diff_cord

// check on the if/in statement 
marksample touse
qui count if `touse'
local samp=r(N)
if `r(N)'==0 { 
	di as err "if statement identifies subgroup with no data?"
	error 2000
	}
	
// parse varlist
tokenize `varlist' , parse(" ", ",")
local lp = `"`1'"'
local outcome = `"`2'"'

// generate probabilities
qui gen `p' = exp(`lp')/(1+exp(`lp'))

// run checks on user input variables in varlist
// check if user has input both LP and obs (for binary outcome)
local varcountcheck: word count `varlist'

if `varcountcheck'!=2 {
	di as err "Varlist must contain two variables. Linear predictor values, followed by observed outcomes (binary variable) are required"
	error 102
	}

// check outcome is binary
cap assert `outcome'==0 | `outcome'==1 if `touse'
        if _rc~=0 {
                noi di as err "Event indicator `outcome' must be coded 0 or 1"
                error 450
        }


*********************************************** C-STAT

if "`faster'"=="faster" {
	// check for packages 
	local packs gtools
	foreach pkg of local packs {
		capture which `pkg'
		if _rc==111 {	
			ssc install `pkg'
			}
		}
		
	// discordant pairs
	hashsort `p' `outcome' 		
	qui gen `rank_disc' = _n if `touse'

	hashsort `outcome' `p' `rank_disc'
	qui gen `rank2_disc' = _n if `touse'

	qui gen `diff_disc' = (`rank_disc' - `rank2_disc') if (`outcome'==0) & (`touse')

	// concordant pairs
	qui gen `inv_outcome' = (`outcome'==0)
	hashsort `p' `inv_outcome'
	qui gen `rank_cord' = _n if `touse'

	hashsort `inv_outcome' `p' `rank_cord'
	qui gen `rank2_cord' = _n if `touse'

	qui gen `diff_cord' = (`rank_cord' - `rank2_cord') if `inv_outcome'==0

	// total possible pairs
	qui gstats sum `outcome' if (`outcome'!=.) & (`touse'), meanonly
	local obs = r(N)
	local prev = r(mean)
	local events = r(sum)
	local nonevents = r(N) - r(sum)
	local pairs = `events'*`nonevents'  

	// compute c-stat (allowing for ties)
	qui gstats sum `diff_disc' if `touse'
	local disc = r(sum)
	qui gstats sum `diff_cord' if `touse'
	local cord = r(sum)

	local ties = `pairs'-`disc'-`cord'

	local cstat = (`cord'+(0.5*`ties'))/(`pairs')
	}
	else {
		// discordant pairs
		sort `p' `outcome' 		
		qui gen `rank_disc' = _n if `touse'

		sort `outcome' `p' `rank_disc' 
		qui gen `rank2_disc' = _n if `touse'

		qui gen `diff_disc' = (`rank_disc' - `rank2_disc') if (`outcome'==0) & (`touse')

		// concordant pairs
		qui gen `inv_outcome' = (`outcome'==0)
		sort `p' `inv_outcome' 
		qui gen `rank_cord' = _n if `touse'

		sort `inv_outcome' `p' `rank_cord' 
		qui gen `rank2_cord' = _n if `touse'

		qui gen `diff_cord' = (`rank_cord' - `rank2_cord') if `inv_outcome'==0

		// total possible pairs
		qui su `outcome' if (`outcome'!=.) & (`touse'), meanonly
		local obs = r(N)
		local prev = r(mean)
		local events = r(sum)
		local nonevents = r(N) - r(sum)
		local pairs = `events'*`nonevents'  

		// compute c-stat (allowing for ties)
		qui su `diff_disc' if `touse'
		local disc = r(sum)
		qui su `diff_cord' if `touse'
		local cord = r(sum)

		local ties = `pairs'-`disc'-`cord'

		local cstat = (`cord'+(0.5*`ties'))/(`pairs')
	}
	
***************************************** CI


if "`hanley'"=="" {
	// default use necombe SE formula
	local newcombe_c = `cstat'
	local cstat_se = ((`cstat'*(1-`cstat'))*(1+(((`obs'/2)-1)*((1-`cstat')/(2-`cstat'))) ///
	+((((`obs'/2)-1)*`cstat')/(1+`cstat')))/((`obs'^2)*`prev'*(1-`prev')))^.5
	local cstat_lb = `cstat' - (1.96*`cstat_se')
	local cstat_ub = `cstat' + (1.96*`cstat_se')
}
else {
	// if hanley option set then use hanley SE formula 
	local Q1 = `cstat' / (2 - `cstat')
	local Q2 = 2 * `cstat'^2 / (1 + `cstat')
	local hanley_c = `cstat'
	local cstat_se = sqrt((`cstat' * (1 - `cstat') + (`nonevents' - 1) * (`Q1' - `cstat'^2) + (`events' - 1) * (`Q2' - `cstat'^2)) / (`nonevents' * `events'))
	local cstat_lb = `cstat' - (1.96*`cstat_se')
	local cstat_ub = `cstat' + (1.96*`cstat_se')
}


***************************************** OUTPUT

// Creating matrix of results
local res cstat 

	tempname rmat
	matrix `rmat' = J(1,5,.)
	local i=0
	foreach r of local res {
		local ++i
		matrix `rmat'[`i',1] = `obs'
		matrix `rmat'[`i',2] = ``r''
		matrix `rmat'[`i',3] = ``r'_se'
		matrix `rmat'[`i',4] = ``r'_lb'
		matrix `rmat'[`i',5] = ``r'_ub'

		//local rown "`rown' `r'"
		}
		mat colnames `rmat' = Obs Estimate SE Lower_CI Upper_CI
		mat rownames `rmat' = "C-Statistic" //`rown'

		
// print matrix 
if "`matrix'"!="" {
			matrix `matrix' = `rmat'
			
			//return matrix `matrix' = `rmat' 
			if "`print'"!="noprint" {
				//di as res _n "Discrimination statistics ..."
				matlist `matrix', border(all) //format(%9.3f)
							
				}
				*return matrix `matrix' = `rmat'
			}
			else { 
				if "`print'"!="noprint" {
					//di as res _n "Discrimination statistics ..."
					matlist `rmat', border(all) //format(%9.3f)
							
					}
				*return matrix rmat = `rmat'
				}
				
// Return scalars
local res cstat cord disc ties  obs
 
		foreach r of local res {
			return scalar `r' = ``r''
			}
			
		if "`matrix'"!="" {
		    matrix `matrix' = `rmat'
			return matrix `matrix' = `rmat'
		}
		else {
		    return matrix rmat = `rmat'
		}
		

end

********************************************