*! 1.0.2 jmendelson 20dec14
*Chromy PPS sequential random sampling
/*
Changelog

v1.0.2:
fixed bug that occurred when first case has prob of selection equal to 1 due to fractionpart being 0

v1.0.1:
-reprogrammed core logic as to fix floating point rounding issue that could occur with pmr and epsilon probabilities of selection

v1.0.0:
-strata optional
-allows PMR
-allows "if"
-additional testing/debugging prior to public release
-allows replace
-note: "pre" option for debugging/testing purposes and not for public use, therefore not necessarily fully implemented

To-do:
-Examine possible speed enhancements (i.e., avoiding egen)
-Clean up syntactical structure of subroutines
-Consider requiring random seed
-Consider adding additional safeguards against misuse
-Consider 'rescale' option to modify MOS so that it if it's too big, it is scaled down as appropriate
-Possibly create as output vars 1st- and/or 2nd-order inclusion probabilities
*/
program ppschromy, sortpreserve
version 12.1
syntax [if] [in], SAMPSIZE(varname numeric) OUTname(string) SORT(varlist min=1 numeric) [STRATA(varname numeric)] [PREfix(string)] [MOS(varname numeric)] [PMR] [REPlace]
marksample touse
tempvar one numInStrata tempStrata ID badTarget
qui {
	if("`replace'"!="") cap drop `outname'		//if "replace" is on, drop previous sampled variable

	//First, make sure we have complete data
	if("`mos'"!="") {
		//If there's an MOS, it should be complete and should be positive for all observations
		tempvar zeros

		count if `touse'
		loc num1 = r(N)

		markout `touse' `mos'
		count if `touse'
		loc num2a = r(N)
		if("`num1'"!="`num2a'") di as err "Warning: there are some cases with missing MOS, which will be ignored."

		gen byte `zeros' = 1 if(`touse' & `mos'!=0)
		markout `touse' `zeros'
		count if `touse'
		loc num2b = r(N)
		if("`num2a'"!="`num2b'") di as err "Warning: there are some cases with MOS equal to zero, which will be ignored."
	}
	if("`strata'"!="") {
		//If stratification is used, variable should be complete
		count if `touse'
		loc num1 = r(N)
		
		markout `touse' `strata'
		count if `touse'
		loc num3 = r(N)
		if("`num1'"!="`num3'") di as err "Warning: there are some cases with missing strata, which will be ignored."
	}

	//The target sample size for all strata should be positive
	count if `touse'
	loc num1 = r(N)
	gen byte `badTarget' = 1 if(`touse'==1 & `sampsize'>0)
	markout `touse' `badTarget'
	count if `touse'
	loc num2 = r(N)
	if("`num1'"!="`num2'") di as err "Warning: there are some cases in strata with target sample size of 0 or less, which will be ignored."
	
	cap assert `touse'==1
	if(_rc!=0) {
		//If not all cases are to be included in sampling, then save dataset, drop observations that aren't relevant, and run the program on only those valid cases, and merge back in
		tempfile tempdata_full tempdata_sampled

		gen double `ID' = _n
		cap assert `ID'[_n]!=`ID'[_n+1]
		if(_rc!=0) {
			di as err _n "{error}Program error. Too many observations to uniquely identify."
		}
		save `tempdata_full'
		drop if(`touse'!=1)
		cap noi ppschromy, sampsize(`sampsize') outname(`outname') sort(`sort') `=cond("`strata'"=="","","strata(`strata')")' `=cond("`mos'"=="","","mos(`mos')")' `pmr'
		if (_rc==0) {
			//Sampling was successful, so merge output variable into original dataset
			keep `ID' `outname'		//Keep ID and sampling indicator variable
			save `tempdata_sampled'
			
			drop _all
			use `tempdata_full'
			merge 1:1 `ID' using `tempdata_sampled', nogenerate
		}
		else {
			//sampling was unsuccessful, so restore dataset to previous state and exit
			loc error = _rc
			drop _all
			use `tempdata_full'
			exit `error'
		}
	}
	else {
		//All cases are to be included as observations

		//If sampling WOR is used then make sure there are enough available cases
		if("`pmr'"!="pmr") {
			gen byte `one' = 1
			egen `numInStrata' = total(`one'), by(`strata')
			cap assert `numInStrata'>=`sampsize'
			if(_rc!=0) {
				display as err _n "{error}Some strata have fewer cases available than you are attempting to sample."
				error 459
			}
		}

		//Ensure MOS positive, finite
		if("`mos'"!="") {
			cap assert `mos'>0 & `mos'<.
			if(_rc!=0) {
				display as err _n "{error}MOS is not in the range (0,infinity)."
				error 459
			}
		}
		
		if("`mos'"=="") local sizestring ""
		if("`mos'"!="") local sizestring "size(`mos')"							//create a string that can be used to pass on the optional argument to subroutines
		
		if("`prefix'"=="") local prefixstring ""
		if("`prefix'"!="") local prefixstring "prefix(`prefix')"
		
		//If no strata variable is specified, assign all cases to the same stratum
		if("`strata'"=="") {
			gen `tempStrata' = 1
			loc strataString "`tempStrata'"
		}
		else {
			loc strataString "`strata'"
		}

		//Apply serpentine sorting and then draw sample
		serpsort `sort', strata(`strataString') `prefixstring'
		if("`prefix'"!="") gen double `prefix'_serporder = _n
		chromysample `sampsize', outname(`outname') strata(`strataString') `sizestring' `pmr' `prefixstring'
		if("`prefix'"!="") gen double `prefix'_finalorder = _n
	}
}
end





program serpsort
*Program assumes existence of a strata variable
syntax varlist (min=1), STRATA(varname numeric) [PREfix(string)] 
if("`prefix'"=="") serpsort_main `varlist', strata(`strata')
if("`prefix'"!="") serpsort_main `varlist', strata(`strata') prefix(`prefix')
end





program serpsort_main
tempvar randnum
gen double `randnum' = runiform()
syntax varlist (min=1), STRATA(varname numeric) [PREfix(string)] 
*Only the last variable can be a string; the other vars must be numeric
*"Prefix" optionally saves all the intermediate variables (albeit with slightly confusing prefixes)
/*
My general recursive algorithm:
1. Count the number of vars
2. If one var, sort by strata v1 rand#.  then done
3. If 2+ vars, then do serpsort on all but the last var to determine the ordering for all but the last var.
    Then do two sorts: one where the last var is ascending, and one where it is descending.  Determine if 
	the ordering should be ascending or descending, then assign the final ordering accordingly, and reorder.
*/
local nvar: word count `varlist'
display "`nvar'"
 
if (`nvar'==1) sort `strata' `varlist' `randnum'
if `nvar'>=2 {
	tempvar ordergroup newOrderGroup totOrderGroup minStrataGroup flipGroup temporder1 temporder2 finalorder
	tokenize `varlist'
	local tempcount: word count `varlist'
	local lastitem = "``tempcount''"
	local tempcount = `tempcount' - 1
	local firstitems ""
	forval i=1/`tempcount' {
		local firstitems = "`firstitems'" + " " + "``i''"
	}
	*After these few lines of code, firstitems has all but the last variable; and lastitem is the last variable
	
	sort `randnum'
	if("`prefix'"=="") serpsort_main `firstitems', strata(`strata')
	if("`prefix'"!="") serpsort_main `firstitems', strata(`strata') prefix(`prefix'v`nvar'_)
	
	egen `ordergroup' = group(`strata' `firstitems')
	gen `newOrderGroup' = `ordergroup'[_n]!=`ordergroup'[_n-1]
	gen `totOrderGroup' = 0
	replace `totOrderGroup' = `totOrderGroup'[_n-1]+`newOrderGroup'[_n] if(_n>1)
	egen `minStrataGroup' = min(`totOrderGroup'), by(`strata')
	gen `flipGroup' = mod(`totOrderGroup'+`minStrataGroup',2)

	sort `totOrderGroup' `lastitem' `randnum'
	gen double `temporder1' = _n

	gsort `totOrderGroup' -`lastitem' -`randnum'
	gen double `temporder2' = _n
	
	gen double `finalorder' = `temporder1'
	replace `finalorder' = `temporder2' if(`flipGroup')
	sort `finalorder'
	
	if("`prefix'"!="") {
		foreach v in ordergroup newOrderGroup totOrderGroup minStrataGroup flipGroup temporder1 temporder2 finalorder {
			rename ``v'' `prefix'_serp_`v'
		}
	}

}

if("`prefix'"!="") rename `randnum' `prefix'randnum

end





program chromysample
syntax varlist (min=1 max=1), OUTNAME(string) [PREFIX(string)] STRATA(varname numeric) [SIZE(varlist min=1 max=1)] [PMR]
*Program assumes existence of a strata variable
*The input variable is the number of desired cases per stratum
*Outname is the output variable for whether someone was sampled
*Prefix, optionally, will save the intermediate variables, using that prefix

tempvar nInStrata rand totInStrata one minInStrata strataStart firstInStrata lastInStrata
tempvar obsInStrata temp numOfStrataStart obsInStrataReordered sampled probOfSelection cumuExpectedHits integerPart fractionPart 
tempvar newrand formula1 formula2 comparison1 comparison2 cumuSampled mos totMosInStrata numCertaintyHits totSamp diffFromIntPart
tempname numStrata numStarts

if("`size'"!="") clonevar `mos' = `size'		//if MOS specified, copy to mos
if("`size'"=="") gen byte `mos' = 1				//if MOS not specified, create new one equal to 1

qui: duplicates report `strata'
scalar `numStrata' = r(unique_value)

gen byte `one' = 1
gen `nInStrata' = `varlist'
scalar `numStarts' = 0

//Check to make sure target sample size is uniform within strata
cap assert (`strata'[_n]!=`strata'[_n+1] | `nInStrata'[_n]==`nInStrata'[_n+1] | _n==_N)
if(_rc!=0) {
	display as err _n "{error}Target sample size (`varlist') in not uniform within stratum"
	error 459
}

while `numStrata'!=`numStarts' {
	*This section of code most likely will only run once. It randomly generates a number, then the smallest numbers are the starts within strata.
	*In the unlikely event that two minimum numbers are equal, it will re-run.
	qui: cap drop `rand' `totInStrata' `minInStrata' `strataStart'
	gen `rand' = runiform()
	egen `totInStrata' = total(`one'), by(`strata')
	egen `minInStrata' = min(`rand'), by(`strata')
	gen `strataStart' = (`minInStrata'==`rand')

	count if `strataStart'
	scalar `numStarts' = r(N)
}

gen byte `firstInStrata' = `strata'[_n]!=`strata'[_n-1]
gen byte `lastInStrata' = `strata'[_n]!=`strata'[_n+1]
gen `obsInStrata' = `firstInStrata'
replace `obsInStrata' = `obsInStrata'[_n-1]+1 if(`obsInStrata'!=1)
gen `temp' = `obsInStrata' if(`strataStart')
egen `numOfStrataStart' = min(`temp'), by(`strata')
gen `obsInStrataReordered' = mod(`obsInStrata'-`numOfStrataStart',`totInStrata')

sort `strata' `obsInStrataReordered'

egen `totMosInStrata' = total(`mos'), by(`strata')
gen double `probOfSelection' = `nInStrata'*`mos'/`totMosInStrata'

if("`pmr'"=="") {
	//If probability minimum replacement is not specified (i.e., probabability non-replacement is used), make sure prob of selection is <=100%
	cap assert inrange(`probOfSelection',0,1) if(`probOfSelection'!=.)
	if(_rc!=0) {
		display as err _n "{error}Some cases have a probability of selection higher than 1."
		error 459
	}
}
if("`pmr'"!="") {
	//If probability w/ minimum replacement has been selected, select the int number of hits first
	gen double `numCertaintyHits' = floor(`probOfSelection')
	replace `probOfSelection' = `probOfSelection' - `numCertaintyHits'
	assert inrange(`probOfSelection',0,1) if(`probOfSelection'!=.)
}

gen double `cumuExpectedHits' = `probOfSelection' if(`strataStart'==1)
replace `cumuExpectedHits' = `cumuExpectedHits'[_n-1]+`probOfSelection'[_n] if(!`strataStart')

gen `integerPart' = floor(`cumuExpectedHits')
gen double `fractionPart' = `cumuExpectedHits' - `integerPart'

gen double `newrand' = runiform()

//If Tot[i-1]==Int[i-1], use formula 1
//	If Frac[i]==0 or Frac[i-1]>Frac[i] then Tot[i]=Int[i]
//	Otherwise, Tot[i] = I[i]+1 with adjusted probability: (F[i]-F[i-1])/1-F[i-1]
//
//In other words, this formula is used when case has not yet been picked for the implicit stratum through case i-1
//So, since we've already selected the certainty hits:
//	If strata start then we choose with probability probOfSelection
//	If not strata start
//		If we've hit a boundary or crossed a boundary this round, pick with probability 1
//		If we haven't crossed a boundary, pick with adjusted probability (Frac[i]-Frac[i-1])/(1-Frac(i-1))
gen double `formula1' = cond(`strataStart'==1, ///
								`probOfSelection',	///
								cond(`integerPart'[_n]>`integerPart'[_n-1], ///
									1, ///
									(`fractionPart'[_n]-`fractionPart'[_n-1])/(1-`fractionPart'[_n-1]) ///
									) ///
							)

//If T[i-1]==Int[i-1]+1, use formula 2
//	If Frac[i]=0, then T[i]==I[i]
//	If Frac[i]>Frac[i-1], then T[i]=Int[i]+1
//	Otherwise, Total[i] = Int[i]+1 with probability Frac[i]/Frac[i-1]
//
//Formula 2 is used when when a case has been picked for previous implicit stratum for case i-1
//doesn't matter what it is for strata start, since this always leads to formula1
//If cumu expected hits has not increased, set to 0 (this is a fix for floating point rounding issues for very tiny probabilities of selection relative to MOS)
//	Else, if fraction is now 0, then it is cumu expected hits minus int part
//		If fraction is greater than prev fraction, then it is 0 (since case has been picked for implicit stratum)
//			Otherwise (eg fraction is less than prev fraction, ie case not yet picked for implicit stratum), then adjusted probability - Frac[i]/Frac[i-1]
gen double `formula2' = cond(`cumuExpectedHits'[_n]==`cumuExpectedHits'[_n-1], 0, ///
						cond(`fractionPart'[_n]==0,	///
							`cumuExpectedHits'[_n]-`integerPart'[_n], ///
							cond(`fractionPart'[_n]>`fractionPart'[_n-1], 0, (`fractionPart'[_n]/`fractionPart'[_n-1])) ) )

gen byte `comparison1' = `newrand'<`formula1'
gen byte `comparison2' = `newrand'<`formula2'

//Now, determine whether to use formula1 or formula2 for a cumulative sampled var, depending on whether we're at the integer part or not
//	Use comparison1 if strata start
//	Else, if we're at the prev int part, then add comparison 1
//		else, use formula 2

gen long `cumuSampled' = `comparison1' if (`strataStart')
replace `cumuSampled' = cond(`cumuSampled'[_n-1]==`integerPart'[_n-1],`cumuSampled'[_n-1]+`comparison1'[_n],`cumuSampled'[_n-1]+`comparison2'[_n]) if(!`strataStart')

//Check to make sure cumu sample count is within acceptable range of algorithm
gen byte `diffFromIntPart' = `cumuSampled'-`integerPart'
cap assert inrange(`diffFromIntPart',0,1)
if (_rc!=0) {
	//This shouldn't ever occur, but provides an extra safeguard in case of floating-point arithmetic issues.
	display as err _n "{error}Warning: properties of algorithm violated, probably due to floating-point rounding issues. Please contact program's author with the information necessary to replicate issue if possible and/or if the problem persists."
	error 459
}

gen double `sampled' = `cumuSampled' if(`strataStart')
replace `sampled' = (`cumuSampled'[_n]!=`cumuSampled'[_n-1]) if(!`strataStart')

if("`pmr'"!="") replace `sampled' = `sampled' + `numCertaintyHits'

rename `sampled' `outname'
if("`pmr'"!="") la var `outname' "`outname': Number of case selections via Chromy algorithm with PMR"
if("`pmr'"=="") la var `outname' "`outname': Case sampled via Chromy algorithm with PNR"

//Check to see if right # of cases has been selected
egen `totSamp' = total(`outname'), by(`strata')
cap assert `totSamp'==`nInStrata'
if (_rc!=0) {
	//This shouldn't ever occur, but provides an extra safeguard in case of floating-point arithmetic issues.
	display as err _n "{error}Warning: incorrect number of cases sampled; probably due to floating-point rounding issues. Please contact program's author with the information necessary to replicate issue if possible and/or if the problem persists."
	error 459
}

if("`prefix'"!="") {
	*if a prefix is supplied, saves the intermediate variables
	foreach v in nInStrata rand totInStrata one minInStrata strataStart firstInStrata lastInStrata obsInStrata temp numOfStrataStart obsInStrataReordered probOfSelection cumuExpectedHits integerPart fractionPart newrand formula1 formula2 comparison1 comparison2 cumuSampled {
		rename ``v'' `prefix'_samp_`v'
	}
	if("`pmr'"!="") rename `numCertaintyHits' `prefix'_samp_`v'
}
end