*! version 1.2.2  --- 18nov2008
program define stcascoh
*! Case Cohort Sampling
*! Syntax: [varlist] (var to keep) [if ] [in],
*!         Alpha(numlist max=1 >=0 <100) (sampling fraction)
*!         [ GRoup(groupvarlist)(to preserve proportions by groupvarlist)
*!         GENerate(string)(to specify up to three var marking subcohort
*!             member, Barlow and Self-Prentice weights. Default
*!             names are _subco _wBarlow _wSelPre )
*!         EPs(real 0.001)(default click of time if fail happens)
*!         SEed(numlist integer max=1) ]
*  Enzo Coviello (enzo.coviello@tin.it)
        
        version 6.0
        st_is 2 analysis
        syntax [varlist(default=none)] [if] [in], Alpha(numlist max=1 >=0 <100) /*
                */ [ GRoup(varlist) GENerate(string) EPs(real 0.001) /*
                */ SEed(numlist integer max=1) noSHow ]
        tempvar touse
        st_smpl `touse' `"`if'"' `"`in'"' `"`group'"'  `""'
        st_show `show'
        local id : char _dta[st_id]
        if "`id'" == "" {
                di in re _n "stcascoh" /*
                */ " requires you have previously stset an id() variable"
		exit 198
	}
        local event _d
        local timeto _t
        local id : char _dta[st_id]
	local w  : char _dta[st_w]
	local wt : char _dta[st_wt]
        if "`w'" ~=  "" | "`wt'" ~= ""{
                di in re _n "st weights not allowed"
                exit 198
        }
        preserve

        /* warning for exclusions */

        quietly {
                count if _st==0
                if r(N) > 0 {
                        noi di _n in bl r(N) " records with _st==0 are dropped"
                        keep if _st==1
                }
                if `"`if'"' != "" | "`in'" != "" {
                        local n = _N
                        keep `if' `in'
                        if `n' > _N {
                                noi di _n in bl `n'-_N `" records dropped via if/in clauses"'
                        }
                }
                if `"`group'"' != "" {
                        count if `touse' == 0 
                        if r(N) > 0 {
                                noi di _n in bl r(N) " records dropped" /*
                                */ " with missings in variables (`group')"
			        keep if `touse'
                        }
                }
        }

        if "`generate'" == "" {
                cap confirm new var _subco _wBarlow _wSelPre
                if _rc {
                   di in re _n "Some of default new variables exists." /*
                    */ " gen() must be defined"
                   exit
                }
                local gen "_subco _wBarlow _wSelPre"
        }
           else {
                local i : word count `generate'
                if `i' > 3 {
                        di in re "more than 3 variables specified in generate()" _n /*
                        */ " only 3 variables are generated"
                        exit 198
                }
                if `i'==2 {
                        local gen `"`generate' _wSelPre"'
                }
                if `i'==1 {
                        local gen `"`generate' _wBarlow _wSelPre"'
                }
                else local gen `generate'
        }

        if `alpha' >= 1 {
                local alpha = `alpha' / 100
        }
        /* if alpha = 0 only failures are sampled */

        gettoken _subco gen : gen
        gen byte `_subco' = 0
        label var `_subco' "Subcohort member"
        gettoken wBarl gen : gen

        /* if no seed, time adds randomness to seed */

        if "`seed'" == "" {
                tempname in
                sca `in' = 1 + int(10^9 *uniform())
                local a = substr("$S_TIME",1,2)
                local b = substr("$S_TIME",4,2)
                local c = substr("$S_TIME",7,2)
                local seed = `in' + (`a' + 1) * (`b' + 1) * (`c' + 1)
        }
        set seed `seed'

        quietly {
                if "`group'" != "" {
                        tempvar grp subuse
                        egen `grp' = group(`group')
                        su `grp'
                        local n_gr = r(max)
                        local i = 1

                        /* sampling by groups */
                        while `i' <= `n_gr' {
                                gen byte `subuse' = `grp'==`i'
                                DoSam `id' `subuse' `_subco' `alpha' 
                                local i = `i' + 1
                                drop `subuse'
                        }
                }

                /* random sampling */
                else  DoSam `id' `touse' `_subco' `alpha'
        }

        /* Sample description */

        di in gr _n(2) "Sample composition" _n
        DiSam `_subco' `id' `event' `timeto' 

        /* Expand if event */
           
        quietly {
                expand 2 if `event'
                sort `id' `event'
                by `id': replace `timeto' = `timeto' - `eps' if `event' & _n == _N - 1
                by `id': replace _t0 = `timeto' - `eps' if `event' & _n == _N 

        /* adjust user variables - copied from stsplit */

                IsVar `_dta[st_bd]'
                if `s(exists)' {
                        local old_ev : char _dta[st_bd]
                        by `id': replace `old_ev' = . if /*
                                */ `event' & _n == _N-1
                        char _dta[Oldev] `old_ev'
                }
                IsVar `_dta[st_bt]'
                if `s(exists)' {
                        local old_ti : char _dta[st_bt]
                        by `id': replace `old_ti' = `_dta[st_bt]' - `eps' /*
                                */ if `event' & _n == _N - 1
                        char _dta[Oldti] `old_ti'
                }
                IsVar `_dta[st_bt0]'
                if `s(exists)' {
                        by `id': replace `_dta[st_bt0]' = `_dta[st_bt0]' - `eps' /*
                                */ if `event' & _n == _N
                }

        /* adjust event */

                by `id': replace `event' = 0 if `event' & _n==_N-1

        /* Now I can keep subcohort members and failures */

                keep  if `_subco' | `event'

        /* Barlow weights */
                gen `wBarl' = ln(1 / `alpha')
                replace `wBarl' = 0 if `event' 
                label var `wBarl' "Barlow weight"

        /* Self and Prentice weights */
                gen byte `gen' = 0
                replace `gen' = -100 if ~`_subco' & `event'
                label var `gen' "Self Prentice weight"

        /* recoding _subco */
                replace `_subco' = 2 if ~`_subco'
                by `id': replace `_subco' = 0 if `_subco'==1 & ~`event'[_N]
                label define sublbl 0 "sub-member with no failure" /*
                                 */ 1 "sub-member who failed" /*
                                 */ 2 "non subcohort cases", modify
        }
        /* saving if varlist */

        if `"`varlist'"' != "" {
                local need "_st _t _t0 _d `id' `_subco' `wBarl' `gen'"
                qui keep `varlist' `need'
        }

        restore,not

        nobreak {
                /* Displaying risk sets with few controls */

                DiFew `alpha'

       /*  Non sub-cohort cases can no more rely on previous stset.
           Entry time and exit time must correspond to the current _t0 and _t. */

                di in gr _n(3) "New stset definition" _n
                stset _t,f(_d) enter(_t0) time0(_t0) id(`id')
        }
        /* char for stselpre */
        char _dta[Alpha] `alpha'
        char _dta[wBarlow] `wBarl'
        char _dta[wSelPre] `gen'
        char _dta[Subco] `_subco'
end

program define DoSam
        args id subuse subco alfa 
        tempvar subj u subcu
        gsort `id' -`subuse'
        by `id': gen `subj' = (_n==1 & `subuse')
        gen `u' = uniform() if `subj'
	gen `subcu' = 1 - `subco' if `u'!=.
        sort `subcu' `u'
        count if `subj'
        local N_sam = round(r(N)*`alfa',1)
        if `N_sam' == 0 { exit }
        replace `subco' = 1 in 1/`N_sam'
        sort `id' `subco'
        by `id': replace `subco' = 1 if `subco'[_N] == 1
end

program define DiSam
	args _subco id event timeto
        tempvar ever_ev subj
        tempname evlbl 
        label define `evlbl' 0 "Censored" 1 "Failure"
        label values `event' `evlbl'
        label define sublbl 0 "No" 1 "Yes"
        label values `_subco' sublbl
        sort `id' `timeto'
        qui by `id': gen byte `ever_ev' = (`event'[_N] == 1)
        label var `ever_ev' " "
        label values `ever_ev' `evlbl'
        qui by `id' : gen byte `subj' = (_n==_N)
        tabulate `_subco' `ever_ev' if `subj'
        qui count if `ever_ev' & `subj'
        local N_ev = r(N)
        qui count if `_subco' & ~`ever_ev' & `subj'
        di _n(2) in gr "Total sample =  " in ye r(N) + `N_ev'
end

program define IsVar, sclass
	nobreak {
		capture confirm new var `1'
		if _rc {
			capture confirm var `1'
			if _rc==0 { 
				sret local exists 1
				exit 
			}
		}
	}
	sret local exists 0
end

program define DiFew
        preserve
        tempvar t pop die control order
        st_ct "" -> `t' `pop' `die'
        gen long `control' = `pop' - `die'
        label var `t' "failure time"
        label var `pop' "Total"
        label var `die' "failure"
        label var `control' "controls"
        qui keep if `die' > 0
        gen long `order' = _n
        label var `order' "risk set"
        qui count if `control' < 4
        if r(N) >0 {
                if `1'>0 {   /* if the file consists of only non-subcohort cases this check is meaningless */
			di _n(2) in gr "Risk set with less than 4 controls" _n
				tabdisp `t' if `control' < 4, c(`order' `die' `control' `pop') /*
					*/ center cellw(9)
		}
	}
        else di in bl _n(2) "No risk set with less than 4 controls"
end
exit


. stcascoh, alpha(20) group(race)

         failure _d:  tumnas
   analysis time _t:  (dataout-origin)
             origin:  time dataass
  enter on or after:  time datain
                 id:  id


Sample composition


 Subcohort |           
    member |  Censored    Failure |     Total
-----------+----------------------+----------
        No |        51          5 |        56 
       Yes |        13          1 |        14 
-----------+----------------------+----------
     Total |        64          6 |        70 



Total sample =  19


Risk set with less than 4 controls


----------+---------------------------------------
failure   |
time      | risk set   failure  controls    Total 
----------+---------------------------------------
  15.2245 |    1         1         3         4    
----------+---------------------------------------



New stset definition


                id:  id
     failure event:  _d ~= 0 & _d ~= .
obs. time interval:  (_t0, _t]
 enter on or after:  time _t0
 exit on or before:  failure

------------------------------------------------------------------------------
       20  total obs.
        0  exclusions
------------------------------------------------------------------------------
       20  obs. remaining, representing
       19  subjects
        6  failures in single failure-per-subject data
 337.8655  total analysis time at risk, at risk from t =         0
                             earliest observed entry t =  9.849487
                                  last observed exit t =  75.50708