*! stkerhaz 2.0.4 EC 28NOV2005
*! Baseline Hazard Estimates via Kernel Smoother and plots

program define stkerhaz, sortpreserve
        version 7.0
	st_is 2 analysis

        syntax [if] [in] , Bwidth(numlist max=4 >0 ascending) [ Kercode(int 2) /*
                 */ NPoint(int 150) BASECHa(varname) STRATA(varname numeric)  /*
                 */ TMAX OUTfile(string) PER(integer 1) CI BY(varname numeric) /*
                 */ Level(integer $S_level) L1title(string) B1title(string) /*
                 */ Connect(string) Symbol(string)  * ]


******* General frame of the command *****************************************
/*
  Command needs a previous estimate of cumulative baseline hazard via stcox
  or sts gen or otherwise. Next increments of this function is computed at
  times where a failure occurs. This quantity is smoothed using a Kernel
  method.
  Bandwidth is the only fundamental parameter the user must specify. The bwidth
  option must contain a number, but can contain up to four numbers. More
  bandwidths look inappropriate because they can blur the plot and are useless.
  Kercode is set at Epanechnicow function because it is a general use.
  Npoint default is 150. Maybe this is the number of points in corresponding
  plot obtained using EGRET. Anyway varying the number of points usually does not
  cause valuable differences in the smoothed curve.
  In Basecha user can specify the variable storing Baseline Cumulative Hazard.
  It is an option because, if not specified, stkerhaz searches for the estimate
  of this function obtained in a previous stcox.
*/
******************************************************************************


*1 - Process options because they cannot be used at the same time
        if "`by'" != "" {
        	local strata `by'
        }
        local bnd: word count `bwidth'
        if `bnd'>1 & "`strata'"!="" {
            di as err "Multiple bandwidths and Strata are alternative options."
                  exit 198
        }
        if `bnd'>1 | "`strata'"!="" {
                if "`ci'"!=""{
                        di as err "CI option is not allowed with" /*
                          */ " Multiple bandwidth or Strata options."
                  exit 198
                }
                local pen /* no pen control if two or more lines */
        }

*2 - Get baseline cumulative hazard
        if "`basecha'" == "" {
                if "`e(cmd2)'" != "stcox" {
                  error 301
                }
                if "`e(cmd2)'" == "stcox" & "`e(basech)'" == ""  {
                        noi di as err "stcox did not save baseline cumulative hazard." _n /*
                        */ "basech(newvar) option must have been specified on the stcox" /*
                        */ " command" _n "prior to using this command"
                        exit 198
                }
                local H  `e(basech)'
	}
        else { local H `basecha' }

*3 - Process further options
        if `kercode' < 1 | `kercode' > 4 {
                di as err "kernel code should be between 1 and 4"
                exit 198 
        }
        if `level'<50 | `level'>99 {
		di in red "level() invalid"
		exit 198
	}

*4 - Put in local macros basic quantities
        local xvar _t   /* xvar = Analysis time */
        local d _d
        local intime _t0
        local kc `kercode'
        local hv `bwidth'
        local np `npoint'
        local by `strata'

*5 - Label for the plot. If e(offset) is defined stkerhaz guesses
*    Baseline SMR is plotted, but it cannot be always appropriate. 
        if `kc' == 1 {
                local kernel "Uniform"
        }
        else if `kc' == 2 {
                local kernel "Epanechnikov" /* It is different from Stata weigths */
        }
        else {
                local kernel "Biweight"
        }
        if "`e(offset)'" != "" { local lbly "Smoothed Baseline SMR" }
        else { local lbly "Smoothed Baseline Hazard" }

        preserve

/* 6 - Keep only records needed for the estimation.
       So records where Baseline Cumulative Hazard is missing will be dropped
       restricting the compute to e(sample), records where no failure occurs
       and records tied are dropped too. One record is added to preserve the lowest _t0.
*/
        marksample touse  
        qui replace `touse' = 0 if _st==0 | `H'==.   
        qui count if `touse'
        if r(N) == 0 { error 2000 }
        tempvar tmin yvar tie gridp dcum
        quietly {
                keep if `touse'
                egen `tmin' = min(`intime'), by(`by' `touse')
                if "`tmax'" == "" {
                        bysort `by' `d' (`xvar') : keep if _n==1 | `d'
                        bysort `by' `touse' :  drop if _N==1 & !`d'
                        replace `xvar' = `tmin' if `d'==0
                        replace `H' = 0 if `d'==0
                }
                else { keep if `d' }
                bysort `by' `xvar': gen long `tie' = (_n==1)
                egen `dcum' = sum(`d'), by(`by' `xvar')
                keep if `tie' /* keep one observation by time */
		if `np' >_N { set obs `np' }

*7 - Define the range of points where estimates will be done.
               	su `xvar', meanonly
               	local maxval = r(max)  
               	local minval = r(min) 
               	local range = `maxval' - `minval'

*8 - Define equally spaced points - gridpoint 
               	local inter = `range' / (`np' - 1)
               	gen double `gridp' = sum(`inter') - `inter' + `minval' if _n <= `np'
               	label var `gridp' "Gridpoint on the Analysis Time"

*9 Compute smoothed hazard if by   
		if "`by'" == "" {
                	foreach X of local hv {
                        	if `range'/ 2 < `X' {
                                	di as err "The time range must be at least " /*
                              		*/ "twice the bandwidth you specify."
                              		exit 121
                        	}
                	}

*10 - Compute increments between successive cumulative hazard - yvar
                	sort `xvar'
                	gen double `yvar' = cond(_n==1,`H',`H'-`H'[_n-1])

*11 - Do estimate when no strata option - More bandwidths can be specified
*   - Initialize variable for standard error(sterh) and baseline hazard(mhX)
                        tempvar sterh
                        gen double `sterh' = 0
                        foreach X of local hv {
                                local x = subinstr("`X'",".","_",.) /* to manage decimal bandwidth */ 
                                tempvar mh`x' hi lo
                                gen double `mh`x''=0
*12 - kerhaz is the subroutine doing the estimates
                               	kerhaz `yvar' `xvar' `gridp' `dcum' `mh`x'' , np(`np') /* 
                                       	*/ ic(`sterh') max(`maxval') hv(`X') kc(`kc') 

/*13 - Confidence bounds are allowed when just one bandwidth and no strata
       option is specified. They are correct if Baseline Cumulative Hazard
       derives from an unadjusted model.
       When baseline hazard estimates approximate 0 standard errors should
       be missing. Unfortunately graph connects points with a straight line,
       even if they are not consecutive in x axis. So missing values are not
       correctly displayed.
       At present when baseline hazard approximates 0 high confidence bound
       can be a trouble to the plot.
*/
                                if "`ci'"!="" {
                                 /* CI are better on log scale */
*                                        replace `sterh' = `sterh'/ `mh`x'' if _n <= `np'
					local level = invnorm((1-`level'/100)/2 + `level'/100)
					gen double `hi' = `per'* (`mh`x'' * exp( `level'*`sterh'/`mh`x'')) if _n <= `np' 
					gen double `lo' = `per'* (`mh`x'' * exp(-`level'*`sterh'/`mh`x'')) if _n <= `np'
				}       
                                replace `mh`x'' = `per'* `mh`x'' if _n <= `np'
                                local n_hv "`n_hv'`x' "
                                local lbl "bw=`X'"
                                label var `mh`x'' "`lbl'"
                                local mh "`mh' `mh`x''"
                                local tograph "`tograph' `mh`x''"
                        }
                }

*14 - Do estimate when strata option is used.
                else {
                        bysort `by' (`xvar') : gen double `yvar' = /*
                                */ cond(_n==1,`H',`H'-`H'[_n-1])
                        local i = 1
			levels7 `by', local(bylist)
                	foreach X of local bylist {
			*7BIS - Define the range for each by
               			su `xvar' if `by'==`X', meanonly
               			local maxval = r(max)  
               			local minval = r(min) 
               			local range = `maxval' - `minval'
                        	if `range'/ 2 < `hv' {
                                	di as err "The time range must be at least " /*
                              		*/ "twice the bandwidth you specify."
                              		exit 121
                        	}
                                tempvar mh`X'
                                gen double `mh`X''= 0 if `by'==float(`X') 
                                kerhaz `yvar' `xvar' `gridp' `dcum' `mh`X'' if !`mh`X'', /*
                                        */ np(`np') max(`maxval') hv(`hv') kc(`kc') 
                                replace `mh`X'' = `per'* `mh`X'' if _n <= `np'
                                local x = subinstr("`X'",".","_",.) /* to manage decimal bandwidth */ 
                                local n_hv "`n_hv'`x' "

 /* Actually in each case of bylist the number of points where the estimates are computed can
    be different from np, but this should not be a practical concern. */
				replace `mh`X'' = . if `gridp' < `minval' 
				replace `mh`X'' = . if `gridp' > `maxval' 
                                
                                count if `mh`X''!= .
                                if r(N) == 0 {
                                        drop `mh`X''
                                }
                                else {
                                        local lbl "`by'=`X'"
                                        label var `mh`X'' "`lbl'"
                                        local mh "`mh' `mh`X''"
                                        if `i' < 5 {
                                                local tograph "`tograph' `mh`X''"
                                        }
                                        local i = `i' + 1
                                 }
                	}
               }
        }

*15 - Do graph
        if "`b1title'" =="" {
               local b1title "Kernel function `kernel'"
        }
        if "`l1title'" =="" {
               local l1title "`lbly'"
        }

        if "`by'"!="" { local hv `bylist' }
                                             /* now hv contents is bandwith
                                              or values of stratavar */
        local bw bw
        if "`by'"!="" { local bw `by' }
                                             /* now bw contents is "bw"
                                              or name of stratavar */
        local i = 0
        local p = 2
        tokenize `mh'
        while "`1'" != "" {
                local i = `i' + 1
                if `i' < 5 {
                        if "`symbol'" =="" { local sym "`sym'." }
                        if "`connect'" == "" { local con "`con'l" }
                        local b: word `i' of `hv'
                        local k`i' key`i'(c(l) p(`p') "`bw'=`b'")
                        local p = `p' + 1
                        mac shift
                }
                else { mac shift }
        }
        if `i' > 4 {
                di as text "Warning: no more than 4 lines are plotted" _n /*
            */  "To plot more lines you must save smoothed estimates and then plot them."
        }
        if "`symbol'" =="" { local symbol "`sym'" }
        if "`connect'" == "" { local connect "`con'" }
        if "`ci'"!="" {
                   local pp "33"
                   if "`symbol'" =="." { local sym  "`sym'ii" }
                        else  { local sym  "`symbol'" }
                   if "`connect'"=="l" { local con "ll[.]l[.]" }
                        else  { local con "`connect'" }
                   if "`pen'"=="" {local pen "2`pp'" }
                   graph `mh' `hi' `lo' `gridp', `options' pen(`pen') /*
                        */ b1("`b1title'") l1("`l1title'") s(`sym') c(`con') 
        }
        else {
        	   graph `tograph' `gridp', `k1' `k2' `k3' `k4' s(`symbol') /* 
                      */ c(`connect') b1("`b1title'") l1("`l1title'") `options' 
        }

*16 - Save results
        if "`outfile'" ~= "" {
                if "`ci'"!="" {
                	keep `mh' `gridp' `sterh' `hi' `lo'
                        rename `sterh' KS_SE_bw_`n_hv'
                        label var KS_SE_bw_`n_hv' "Standard error"
                        qui replace `hi' = `hi' / `per'
                        qui replace `lo' = `lo' / `per'
                        rename `hi' KS_HI_`n_hv'
                        label var KS_HI_`n_hv' "High CI `bw' `n_hv'"
                        rename `lo' KS_LO_`n_hv'
                        label var KS_LO_`n_hv' "Low CI `bw' `n_hv'"
                }
                else { keep `mh' `gridp' }
		qui keep if `gridp' < .
                local i = 1
* thanks to Margaret May ->  foreach X of local mh{    must be substituted in  
                foreach X of varlist `mh'{
                        local l: word `i' of `n_hv'
                        local bw = substr("`bw'",1,6)
                        rename `X' KS_`bw'_`l'
                        qui replace KS_`bw'_`l' = KS_`bw'_`l'/ `per'
                        label var KS_`bw'_`l' "KSm Haz `bw' `l'"
                        local i = `i' + 1
                }
                rename `gridp' Gridpoint
		tokenize "`outfile'", parse(",")
		qui save "`1'" `2' `3'
        }
end


*17 - kerhaz do estimates
program define kerhaz
        version 7.0
*      	kerhaz `yvar' `xvar' `gridp' `dcum' `mh`x'' , np(`np') 
*	       	 ic(`sterh') max(`maxval') hv(`X') kc(`kc') 

        args yvar xvar gridp d mh
        syntax varlist [if] , np(numlist) max(numlist) hv(numlist) kc(numlist) [ ic(varname) ]
        if "`if'" != "" {
                local if "if !`mh'"
        }

*18 - Initialize var to compute estimates
        tempvar z kz kzy rh kse
        gen double `z' = 0
        gen double `kz' = 0
        gen double `kzy' = 0
        gen double `kse' = 0
        gen double `rh' = 0
	
*19 - Local function contains kernel function
        if `kc' == 1 {
                local function "0.5"
        }
        else if `kc' == 2 {
                local function `"0.75*(1-`z'^2)"' 
		
        }
        else {
                local function `".9375*(1-`z'^2)^2"'
        }

        forvalues i = 1(1)`np' {
                local xo = `gridp'[`i']
		local ql  = `xo' / `hv'
		local qr = (`max' - `xo') / `hv'
/*20 - z measures distance between target point (xo) and observed values
       in bandwidth units. kz define wheigths to assign to yvar values
       according to this distance.
*/
                replace `z' = (`xo' - `xvar') / `hv'
                if `ql'<1 {
                	if `kc'==1 { replace `kz' = 4*(1+`ql'^3)/(1+`ql')^4  /*
                		*/ + 6*(1-`ql')* `z' / (1+`ql')^3 if abs(`z')<1 
                	}
			else if `kc'==2 {
				local alpha = 64*(2-4*`ql'+6*`ql'^2-3*`ql'^3) / /*
					*/ ((1+`ql')^4*(19-18*`ql'+3*`ql'^2))
               			local beta = 240*(1-`ql')^2 / ((1+`ql')^4*(19-18*`ql'+3*`ql'^2))
               			replace `kz'= `function'*(`alpha' + `beta'*`z') if abs(`z')<1
               		}
			else {
				local alpha = 64*(8-24*`ql'+48*`ql'^2-45*`ql'^3+15*`ql'^4) / /*
					*/ ((1+`ql')^5*(81-168*`ql'+126*`ql'^2-40*`ql'^3+5*`ql'^4))
               			local beta = 1120*(1-`ql')^3 / ((1+`ql')^5 * /*
               				*/ (81-168*`ql'+126*`ql'^2-40*`ql'^3+5*`ql'^4))
               			replace `kz'= `function'*(`alpha' + `beta'*`z') if abs(`z')<1
               		}
               	}
                else if `qr'<1{
               		if `kc' == 1 { 
               			replace `kz' = 4*(1+`qr'^3) / /*
               			*/ (1+`qr')^4 + 6*(1-`qr')*(-`z')/(1+`qr')^3 if abs(`z')<1
               		}
			else if `kc'==2 {
				local alpha = 64*(2-4*`qr'+6*`qr'^2-3*`qr'^3) / /*
					*/ ((1+`qr')^4*(19-18*`qr'+3*`qr'^2))
               			local beta = 240*(1-`qr')^2 / ((1+`qr')^4*(19-18*`qr'+3*`qr'^2))
               			replace `kz'= `function'*(`alpha' + `beta'*(-`z')) if abs(`z')<1
               		}
               		else {
				local alpha = 64*(8-24*`qr'+48*`qr'^2-45*`qr'^3+15*`qr'^4) / /*
					*/ ((1+`qr')^5*(81-168*`qr'+126*`qr'^2-40*`qr'^3+5*`qr'^4))
               			local beta = 1120*(1-`qr')^3 / ((1+`qr')^5 * /*
               				*/ (81-168*`qr'+126*`qr'^2-40*`qr'^3+5*`qr'^4))
               			replace `kz'= `function'*(`alpha' + `beta'*(-`z')) if abs(`z')<1
               		}
               	}

		else { replace `kz' = `function' if abs(`z')<1 }
               	replace `kzy' = `kz' * `yvar' `if'
                su `kzy', meanonly
                replace `rh' = `r(sum)' in `i'

*21 - Standard error - not if strata option is specified
                replace `kse' = `kz'^2 * `yvar'^2 / `d'
                su `kse', meanonly
                if "`ic'"!= "" {
                        replace `ic' =sqrt(`r(sum)') / `hv' in `i'
                }
                replace `z' = 0
                replace `kz' = 0 
                replace `kzy' = 0 
                replace `kse' = 0 
                local i = `i' + 1
        }

*22 - wheigthed sum has to be divided by bandwidth
        replace `mh' = `rh' / `hv' if _n <= `np'
end