*! version 4.0.9 15mar2022 MJC

/*
History
15mar2022 version 4.0.9 - bug fix, when survsim_msm called with all exponential, NI fails - now fixed by doing integration 
                          analytically for that special case of all CR hazards constant
12jan2022 version 4.0.8 - build error; now fixed
02dec2021 version 4.0.7 - bug fix with msm simulation, would occur if directory of installed files had a space in it
                        - missing .mata files in pkg file
13oct2021 version 4.0.6 - bug fix with varnames in chazard() or logchazard() not parsed correctly with survsim user; now fixed
                        - error check for negative hazard() added
                        - Now tidies up after itself if it errors out
02sep2021 version 4.0.5 - bug fix; mixture + tde() would error out, now fixed
24aug2020 version 4.0.4 - re-syncing with predictms for merlin model simulation
- bug fix in missing value warning text; now fixed
25may2020 version 4.0.3 - efficiency of _msm improved
                        - bug fix: Stata variables in user-defined hazard functions not parsed correctly; now fixed
30mar2020 version 4.0.2 - bug fix; missing value issue when t = 0 in chq function, now fixed
                        - bug fix; indexing issue occurred in varied circumstances, now fixed
29mar2020 version 4.0.1 - bug fix; error in distribution(gompertz) when specified in a hazard#() -> now fixed
                        - help file edits
26mar2020 version 4.0.0 - computation time reduced by about 70% for multi-state model simulation
                        - reset added to hazard#() for semi-Markov models
                        - syntax changed for user-defined functions, #t now {t} for more robust parsing
                        - support for general multiple timescales, including:
                                - {t0} can be used in user() and tdefunction() for time of entry into initial state for associated transition
                        - cr subclass removed, now part of general msm syntax
                        - bug fix; variables used directly in user() with a multi-state model were not passed to Mata -> now fixed
                        - bug fix; if the same variable was used in multiple hazard#()s, parsing would only pick up the first -> now fixed
24mar2020 version 3.1.0 - general Markov multi-state simulation now added
                        - transmatrix() option added
                        - tolerance changed to 0 from 1e-08 in mm_root()
21mar2020 version 3.0.0 - complete re-write of core code
                        - can now simulate from a fitted merlin survival model
                        - can now simulate from cause-specific hazards competing risks models, with either parametric or user-defined hazard functions, with a new syntax
                        - distribution() now required for parameteric models
                        - ltruncated() added, can be number or varname, synced with all except merlin model simulation
                        - maxtime() can now be a number or a varname
                        - help files re-written, hugely improved
                        - added error report on missing values when general algorithm used
                        - parsing made more robust
18dec2013 version 2.0.3 - tde() with logcumh() or cumh() caused an error. Now fixed.
10oct2013 version 2.0.2 - bug fix -> exact option added to confirm vars
08jul2013 version 2.0.1 - minor bug fix in mixture models
04jan2013 version 2.0.0 - maxtime() added to specify maximum generated survival time and event indicator
                        - n() removed so must set obs
                        - loghazard() and hazard() added for user-defined hazard functions, simulated using quadrature and root finding
                        - default centol() changed to 1E-08
                        - varnames can now appear in loghazard()/hazard() allowing time-dependent covariates
                        - mixture models now use Brent method -> much more reliable than NR, and allows tdes
                        - cumhazard() and logcumhazard() now added which just use root finding
15Nov2011 version 1.1.2 - Fixed bug when generating covariate tempvars with competing risks.
20sep2011 version 1.1.1 - Exponential distribution added.
10sep2011 version 1.1.0 - Added Gompertz distribution. Time-dependent effects available for all models except mixture. showerror option added.
09sep2011 version 1.0.1 - Time dependent effects now allowed for standard Weibull.
*/

program define survsim
	version 14.2

	CheckObs
	
	local opts1											/// -fitted merlin model-
                        MODel(passthru)				/// 
														//
													
	local opts2											///	-user-defined-
                        LOGHazard(passthru)		///	
                        Hazard(passthru)		///	
                        LOGCHazard(passthru)		/// 
                        CHazard(passthru)		/// 
                        NODES(passthru)			///	-# nodes-
                        TDEFUNCtion(passthru)		///	-function of time to interact with time-dependent effects-
                        MIXture				///	-two-component mixture-
                        PMix(passthru)			///	-mixture parameter-
                        CR				///	-simulate competing risks-
                        NCR(string)			///	-number of competing risks-
                        Distribution(passthru)		/// 
                        Lambdas(passthru)		///	
                        Gammas(passthru)		///	
                        COVariates(passthru)		///	-baseline covariates, e.g, (sex 0.5 race -0.4)-
                        TDE(passthru)			///	-time dependent effects-
                        MARGinal			///
                                                        //

	local commonopts	 			///
                        MAXTime(string)			///	-right censoring time-
                        LTruncated(string)		///	-left truncation/delayed entry-
							//
													
	//==============================================================================================================//
	//parse opts that are common to all three settings
		
		syntax newvarname(min=1 max=3) , [`commonopts' *]
		local newvars `varlist'
		local stime : word 1 of `newvars'
		local died  : word 2 of `newvars'
		local hasmaxtime = "`maxtime'"!=""
		
		if `hasmaxtime' {	
			
			if "`died'"=="" {
				di as error "2 new variable names required"
				exit 198
			}
			
			cap confirm number `maxtime'
			if _rc {
				cap confirm numeric variable `maxtime'
				if _rc {
					di as error "Invalid maxtime()"
					exit 198
				}
			}
			else {
				if `maxtime'<0 {
					di as error "maxtime() must be >0"
					exit 198
				}
			}
			
			tempvar maxtvar 
			gen `maxtvar' = `maxtime'
			local maxtopt maxtime(`maxtvar')
		}
		
		if "`ltruncated'"!="" {
			
			cap confirm number `ltruncated'
			if _rc {
				cap confirm numeric variable `ltruncated'
				if _rc {
					di as error "Invalid ltruncated()"
					exit 198
				}
			}
			else {
				if `ltruncated'<0 {
					di as error "ltruncated() must be >0"
					exit 198
				}
			}
			
			tempvar ltvar 
			gen `ltvar' = `ltruncated'
			local ltopt ltruncated(`ltvar')
		}
		
		local 0 , `options'
	
	//==============================================================================================================//	
	//survsim_model
	
		syntax , [`opts1' *]
		
		if "`model'"!="" {
			capture survsim_model `newvars', 	`model' 		///
												`maxtopt'		//
			local rc = c(rc)
			if `rc' {
				cap drop _survsim_rc
				cap drop `stime'
				cap drop `died'
                                exit `rc'
			}
			else {
				RCREPORT 	_survsim_rc
				MISSREPORT 	`stime'
				GENEVENT 	`stime' `died' `maxtime'
                                exit
			}
		}	
	
		local 0 , `options'
	
	//==============================================================================================================//	
	//survsim_user
	
		syntax , [`opts2' *]
		
		local useropt "`loghazard'`hazard'`logchazard'`chazard'`mixture'"
		
		if "`useropt'"!="" {
			capture survsim_user `newvars', 	`useropt' 		///
												`nodes'			///
												`maxtopt' 		///
												`ltopt'			///
												`covariates'	///
												`tde'			///
												`tdefunction'	///
												`distribution'	///	-for mixture-
												`lambdas'		///
												`gammas'		///
												`pmix'			///
												`marginal'		//
            local rc = c(rc)
			if `rc' {
				cap drop _survsim_rc
				cap drop `stime'
				cap drop `died'
                                exit `rc'
			}
			else {
				RCREPORT 	_survsim_rc
				MISSREPORT 	`stime'
				GENEVENT 	`stime' `died' `maxtime'
                                exit
			}
		}
		
		local 0 , `options'
		
	//==============================================================================================================//	
	//survsim_msm
	
		syntax , [HAZARD1(passthru) HAZARD2(passthru) TRANSMATrix(passthru) STARTSTATE(passthru) *]
		
		if "`hazard1'"!="" {
		
			capture program drop survsim_msm			// refreshes Mata functions
			
			survsim_msm `newvars', 		`transmatrix'	///
										`hazard1'		///
										`hazard2'		///
										`maxtopt' 		///
										`ltopt'			///
										`startstate'	///
										`nodes'			///
										`options' 		//
			exit
			
		}
		
		local 0 , `options' `distribution' `lambdas' `gammas' `covariates' `tde'
		
	//==============================================================================================================//	
	//parametric distribution	
		
		syntax , 	Distribution(string) 		///
					Lambdas(string) 			///
				[								///
					Gammas(string) 				///
					COVariates(string)			///
					TDE(string)					///
					`opts3'						///
				]								//
		
		local ld = length("`distribution'")
		if 		substr("exponential",1,max(1,`ld'))=="`distribution'" {
			local dist "exp"
		}
		else if substr("gompertz",1,max(3,`ld'))=="`distribution'" {
			local dist "gompertz"
		}
		else if substr("weibull",1,max(1,`ld'))=="`distribution'" {
			local dist "weibull"
		}
		else {
			di as error "Unknown distribution"
			exit 198
		}
		
		foreach l of numlist `lambdas' `gammas' {
			if `l'<0 {
				di as error "lambdas()/gammas() must be > 0"
				exit 198
			}
		}
		
		if "`dist'"=="exp" & "`gammas'"!="" {
			di as error "gammas cannot be specified with distribution(exponential)"
			exit 198
		}
				
	//==============================================================================================================//	
	//baseline covariates and time-dependent effects
	
		if "`covariates'"!="" {
		
			tokenize `covariates'
			local ncovlist : word count `covariates'
			local ncovvars = `ncovlist'/2
			cap confirm integer number `ncovvars'
			if _rc>0 {
				di as error "Variable/number missing in covariates"
				exit 198
			}
			local ind = 1
			local error = 0
			forvalues i=1/`ncovvars' {	
				cap confirm var ``ind'', exact
				if _rc {
					local errortxt "invalid covariates(... ``ind'' ``=`ind'+1'' ...)"
					local error = 1
				}
				cap confirm num ``=`ind'+1''
				if _rc {
					local errortxt "invalid covariates(... ``ind'' ``=`ind'+1'' ...)"
					local error = 1
				}
				tempvar vareffect`i'
				gen double `vareffect`i'' = ``ind''*``=`ind'+1'' 
	
				local ind = `ind' + 2
			}
			if `error' {
				di as error "`errortxt'"
				exit 198
			}
			local cov_linpred "`vareffect1'"
			if `ncovvars'>1 {
				forvalues k=2/`ncovvars' {
					local cov_linpred "`cov_linpred' + `vareffect`k''"
				}
			}
			local cov_linpred "* exp(`cov_linpred')"
			
		}
		
		if "`tde'"!="" {
		
			tokenize `tde'
			local ntde : word count `tde'	
			local ntdevars = `ntde'/2
			cap confirm integer number `ntdevars'
			if _rc>0 {
				di as error "Variable/number missing in tde"
				exit 198
			}

			local ind = 1
			local error = 0
			forvalues i=1/`ntdevars' {
				cap confirm var ``ind'', exact
				if _rc {
					local errortxt "invalid tde(... ``ind'' ``=`ind'+1'' ...)"
					local error = 1
				}
				cap confirm num ``=`ind'+1''
				if _rc {
					local errortxt "invalid tde(... ``ind'' ``=`ind'+1'' ...)"
					local error = 1
				}
				tempvar tdeeffect`i'
				gen double `tdeeffect`i'' = ``ind''*``=`ind'+1'' 

				local ind = `ind' + 2
			}
			if `error' {
				di as error "`errortxt'"
				exit 198
			}
			local tde_linpred "`tdeeffect1'"
			if `ntdevars'>1 {
				forvalues k=2/`ntdevars' {
					local tde_linpred "`tde_linpred' + `tdeeffect`k''"
				}
			}
			local tde_linpred "+ `tde_linpred'"

		}
		
	//==============================================================================================================//	
	//stime and died
		
		tempvar u
		qui gen double `u' = runiform() 
		
		if 		"`dist'"=="exp" {
			if "`ltruncated'"!="" {
				local ltcontrib "* exp(-`lambdas' `cov_linpred' * `ltvar' ^ (1 `tde_linpred') / (1 `tde_linpred'))"
			}
			qui gen double `stime' 	= (-ln(`u' `ltcontrib')*(1 `tde_linpred')/(`lambdas' `cov_linpred'))^(1/(1 `tde_linpred'))
		}
		else if "`dist'"=="weibull" {
			if "`ltruncated'"!="" {
				local ltcontrib "* exp(-`lambdas' `cov_linpred' * `gammas' * `ltvar' ^ (`gammas' `tde_linpred') / (`gammas' `tde_linpred'))"
			}
			qui gen double `stime' 	= (-ln(`u' `ltcontrib')*(`gammas' `tde_linpred')/(`lambdas'*`gammas' `cov_linpred'))^(1/(`gammas' `tde_linpred')) 
		}
		else{
			if "`ltruncated'"!="" {
				local ltcontrib "* exp(- `lambdas' `cov_linpred' / (`gammas' `tde_linpred') * (exp(`ltvar' * (`gammas' `tde_linpred') ) - 1))"
			}
			qui gen double `stime' 	= (1/(`gammas' `tde_linpred'))*log(1-(((`gammas' `tde_linpred')*log(`u' `ltcontrib'))/(`lambdas' `cov_linpred'))) 
		}
			
		MISSREPORT `stime'
		if "`maxtime'"!="" {
			GENEVENT `stime' `died' `maxtvar'
			qui replace `stime' = `maxtvar' if `died'==0
		}
	
end

program CheckObs
	if _N==0 {
		di as error "No observations"
		exit 198
	}
end	

program GENEVENT
	args stime died maxtime
	qui gen byte `died' = `stime'<`maxtime' if `stime'!=.
	qui replace `stime' = `maxtime' if `stime'>`maxtime' & `stime'!=.
end

program RCREPORT
	args rcvar
	qui su `rcvar' if `rcvar'==1, meanonly
	if r(N)>0 {
		di in yellow "Warning: `r(N)' survival times did not converge"
		di in yellow "         They have been set to the final iteration value"
		di in yellow "         You can identify them by _survsim_rc = 1"
	}
	qui su `rcvar' if `rcvar'==2, meanonly
	if r(N)>0 {
		di in yellow "Warning: `r(N)' survival times were below the lower limit"
		di in yellow "         You can identify them by _survsim_rc = 2"
	}
	qui su `rcvar' if `rcvar'==3, meanonly
	if r(N)>0 {
		di in yellow "Warning: `r(N)' survival times were above the upper limit of maxtime()"
		di in yellow "         They have been set to maxtime()"
		di in yellow "         You can identify them by _survsim_rc = 3"
	}	
end

program MISSREPORT
	args time
	qui count if `time'==.
	if `r(N)'>0 {
		di in yellow "`r(N)' missing values generated in simulated survival times"		
	}
end